#!/usr/bin/env tsx
/**
 * Debug the full parsing flow with section detection.
 *
 * Scrapes one hard-coded parish site, prints the part of the page text that
 * contains the schedule, tests per-day section regexes against that snippet,
 * and finally prints how many parsed schedules the scraper produced per day.
 */

import { GenericScraper } from '../../src/scrapers/strategies/generic';
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';

/** Number of characters of page text shown/tested after the schedule marker. */
const SNIPPET_LENGTH = 500;

/** Maximum number of captured characters echoed for a section match. */
const PREVIEW_LENGTH = 50;

/** Escapes regex metacharacters so `value` can be embedded literally in a RegExp. */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Reduces an HTML document to lowercase plain text with single spaces.
 *
 * @param html - Raw HTML markup.
 * @returns The text with script/style elements and all tags removed.
 */
function htmlToSearchText(html: string): string {
  return (
    html
      // Drop <script>/<style> elements together with their contents first, so
      // inline JS/CSS does not end up in the text once the tags are stripped.
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase()
  );
}

/**
 * Tests whether a section introduced by `dayName` can be found in `snippet`.
 *
 * The section body is captured lazily up to the next occurrence of any known
 * day name (or the end of the snippet).
 *
 * @param snippet - Text to search.
 * @param dayName - Literal day name that starts the section (e.g. `sábados`).
 * @param allDayNamesPattern - Regex alternation of all (escaped) day names.
 * @returns A human-readable result: the first characters of the capture, or `Not found`.
 */
function describeDayMatch(snippet: string, dayName: string, allDayNamesPattern: string): string {
  const sectionRegex = new RegExp(
    `(?:^|\\s|[,;:])${dayName}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );
  const match = snippet.match(sectionRegex);
  return match ? `Found: "${match[1].substring(0, PREVIEW_LENGTH)}"` : 'Not found';
}

/**
 * Runs the end-to-end debug session and logs every intermediate result.
 *
 * @returns A promise that resolves once the scraper has been closed.
 */
async function debugFullFlow(): Promise<void> {
  const url = 'https://www.paroquiadapaz.org.br/';

  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  // Everything after init runs under try/finally so the scraper is closed
  // even when scraping or parsing throws.
  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    if (!result.rawHtml) {
      console.log('No HTML received');
      return;
    }

    const text = htmlToSearchText(result.rawHtml);

    // Find the schedule section
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      console.log('Schedule text not found!');
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + SNIPPET_LENGTH);
    console.log('Schedule snippet from actual HTML:');
    console.log(snippet);
    console.log('\n');

    // Now test section matching on actual text
    const dayConfigs = getDayNamesForCountry('BR');
    const dayPatterns = buildDayPatterns(dayConfigs);
    // Longest names first so a longer day name wins over any shorter name it contains.
    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
    const allDayNamesPattern = sortedDayNames.map(escapeRegExp).join('|');

    console.log('=== Testing sábados and domingos matches ===\n');

    // Accented, unaccented, and Sunday variants are each tested separately.
    for (const dayName of ['sábados', 'sabados', 'domingos']) {
      console.log(`${dayName} match:`, describeDayMatch(snippet, dayName, allDayNamesPattern));
    }

    console.log('\n=== Final parsed schedules ===\n');
    console.log(`Total: ${result.schedules.length}`);

    // Group the parsed schedules by their numeric dayOfWeek.
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    // Index in this array is used as the dayOfWeek key (0 = Domingo … 6 = Sábado).
    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    dayNames.forEach((dayName, dayIndex) => {
      const schedulesForDay = byDay[dayIndex];
      if (schedulesForDay) {
        console.log(`${dayName}: ${schedulesForDay.length} schedules`);
      } else {
        console.log(`${dayName}: 0 schedules ❌`);
      }
    });
  } finally {
    await scraper.close();
  }
}

debugFullFlow().catch(console.error);