75 lines
2.3 KiB
TypeScript
75 lines
2.3 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
|
||
|
|
* Deep dive into Paróquia da Paz parsing bug
|
||
|
|
*/
|
||
|
|
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
/**
 * Scrapes the Paróquia da Paz home page with GenericScraper and prints
 * diagnostics that help explain how its schedule text is laid out:
 * the scrape outcome, the context around the first mention of each weekday,
 * every "8h" / "8h30" style time found, and the text following common
 * schedule keywords.
 *
 * The scraper is closed even when scraping or the diagnostics throw, so a
 * failed run does not leave the scraper's resources open.
 *
 * @returns {Promise<void>} Resolves once the diagnostics have been printed
 *   and the scraper has been closed; rejects with whatever the scraper threw.
 */
async function debugPaz() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  // Everything after a successful init() runs under try/finally so that
  // close() is always reached, including when scrape() rejects.
  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Reduce the page to lower-cased plain text: drop script/style bodies
      // first (their contents are not visible text), then strip remaining
      // tags and collapse whitespace runs.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find where days appear
      console.log('=== Finding day + time patterns ===\n');

      // Accented and unaccented spellings are both listed because the text
      // is matched literally.
      const days = ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'];

      for (const day of days) {
        // Only the first occurrence of each day name is reported.
        const dayIndex = text.indexOf(day);
        if (dayIndex !== -1) {
          // Show context around the day (100 chars before and 200 after)
          const before = Math.max(0, dayIndex - 100);
          const after = Math.min(text.length, dayIndex + 200);
          const snippet = text.substring(before, after);

          console.log(`${day.toUpperCase()}:`);
          console.log(`  Position: ${dayIndex}`);
          console.log(`  Context: ...${snippet}...`);
          console.log('');
        }
      }

      // Check for "h" time format specifically
      console.log('\n=== Checking "h" time format ===');
      const hTimeRegex = /(\d{1,2})h(\d{2})?/g;
      const hTimes = text.match(hTimeRegex);
      if (hTimes) {
        console.log(`Found ${hTimes.length} "h" format times:`);
        // Cap the listing at 30 entries to keep the output readable.
        console.log(hTimes.slice(0, 30).join(', '));
      }

      // Look for schedule structure
      console.log('\n=== Looking for schedule structure ===');
      const scheduleKeywords = ['horário', 'horario', 'missa', 'missas', 'santa missa'];
      for (const keyword of scheduleKeywords) {
        const index = text.indexOf(keyword);
        if (index !== -1) {
          // Print the 300 characters starting at the keyword's first occurrence.
          console.log(`\nFound "${keyword}" at position ${index}:`);
          console.log(text.substring(index, index + 300));
        }
      }
    }
  } finally {
    await scraper.close();
  }
}
|
||
|
|
|
||
|
|
// Script entry point: run the diagnostics, report any failure on stderr, and
// mark the process as failed so shells and CI see a non-zero exit status.
debugPaz().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|