Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
99 lines
3.1 KiB
TypeScript
99 lines
3.1 KiB
TypeScript
#!/usr/bin/env tsx
|
|
/**
|
|
* Debug the full parsing flow with section detection
|
|
*/
|
|
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
|
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
|
|
|
/** Escapes every RegExp metacharacter in `value` so that it is matched literally. */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Searches `text` for a section introduced by `dayName` and returns the text
 * captured after it.
 *
 * The section must start at the beginning of the text or after whitespace or
 * one of `,;:`, and the day name must be followed by at least one colon or
 * whitespace character. The capture is lazy and stops right before the first
 * alternative in `stopAlternatives`, or at the end of the text.
 *
 * @param text - Text to search (matching is case-insensitive).
 * @param dayName - Literal day name that opens the section.
 * @param stopAlternatives - Already-escaped RegExp alternation (`a|b|c`) that
 *   terminates the section; may be empty.
 * @returns The captured section body, or `null` when the day name is not found.
 */
function matchDaySection(text: string, dayName: string, stopAlternatives: string): string | null {
  // An empty alternation would turn the lookahead into `(?=|$)`, which matches
  // at every position and would always yield an empty capture.
  const lookahead = stopAlternatives.length > 0 ? `${stopAlternatives}|$` : '$';
  const sectionRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapeRegExp(dayName)}[:\\s]+([^]*?)(?=${lookahead})`,
    'i'
  );
  const match = sectionRegex.exec(text);
  if (match === null) {
    return null;
  }
  return match[1] ?? '';
}

/**
 * Debugs the full parsing flow for one hard-coded parish website.
 *
 * Steps:
 *  1. Scrapes the URL with `GenericScraper` configured for country `BR`.
 *  2. Reduces the raw HTML to lower-cased plain text and prints the 500
 *     characters starting at the literal text `segundas, terças`.
 *  3. Tests the day-section regex for `sábados`, `sabados` and `domingos`
 *     against that snippet and prints the first 50 captured characters.
 *  4. Prints how many parsed schedules the scraper returned per day of week.
 *
 * The scraper is closed on every exit path, including when an error is thrown
 * after initialisation.
 */
async function debugFullFlow(): Promise<void> {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  // NOTE(review): init() stays outside the try block because it is not visible
  // here whether close() is safe to call on a scraper that failed to initialise.
  await scraper.init();

  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    if (!result.rawHtml) {
      console.log('No HTML received');
      return;
    }

    // Drop script/style blocks first so their contents do not leak into the
    // text, then strip the remaining tags and collapse whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find the schedule section
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      console.log('Schedule text not found!');
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
    console.log('Schedule snippet from actual HTML:');
    console.log(snippet);
    console.log('\n');

    // Now test section matching on actual text
    const dayConfigs = getDayNamesForCountry('BR');
    const dayPatterns = buildDayPatterns(dayConfigs);
    // Longest names first so that a longer day name wins over any shorter
    // name that is a prefix of it inside the alternation.
    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
    const allDayNamesPattern = sortedDayNames.map(escapeRegExp).join('|');

    console.log('=== Testing sábados and domingos matches ===\n');

    // Accented, unaccented and a second day name, in the original order.
    for (const dayName of ['sábados', 'sabados', 'domingos']) {
      const captured = matchDaySection(snippet, dayName, allDayNamesPattern);
      console.log(
        `${dayName} match:`,
        captured !== null ? `Found: "${captured.substring(0, 50)}"` : 'Not found'
      );
    }

    console.log('\n=== Final parsed schedules ===\n');
    console.log(`Total: ${result.schedules.length}`);

    // Group the parsed schedules by their dayOfWeek value.
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      const bucket = byDay[sched.dayOfWeek] ?? (byDay[sched.dayOfWeek] = []);
      bucket.push(sched);
    }

    // Index i of this list labels dayOfWeek === i.
    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    dayNames.forEach((label, day) => {
      const schedulesForDay = byDay[day];
      if (schedulesForDay) {
        console.log(`${label}: ${schedulesForDay.length} schedules`);
      } else {
        console.log(`${label}: 0 schedules ❌`);
      }
    });
  } finally {
    // Release the scraper even when scraping or the analysis above throws.
    await scraper.close();
  }
}
|
|
|
|
// Script entry point: run the debug flow and print any rejection to stderr.
void debugFullFlow().catch((error: unknown) => {
  console.error(error);
});
|