Files
ScraperControl/scripts/debug/debug-paz-full-flow.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

99 lines
3.1 KiB
TypeScript

#!/usr/bin/env tsx
/**
* Debug the full parsing flow with section detection
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
/**
 * Reduces raw HTML to lower-cased plain text for substring/regex probing:
 * drops <script> and <style> elements (including their contents), replaces
 * every remaining tag with a space, and collapses whitespace runs to a
 * single space.
 */
function htmlToLowerCaseText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * Probes `snippet` for a section introduced by `dayHeading` and describes the
 * outcome as a printable string.
 *
 * The heading must sit at the start of the text or follow whitespace or one
 * of `, ; :`, and must be followed by at least one colon/whitespace character.
 * The captured section is the shortest run of text up to the next day name
 * (any alternative of `allDayNamesPattern`) or the end of the snippet.
 *
 * @param snippet            Text to search.
 * @param dayHeading         Heading to look for; inserted into the regex
 *                           verbatim, so it must not contain regex
 *                           metacharacters.
 * @param allDayNamesPattern Regex alternation of already-escaped day names.
 * @returns `Found: "<first 50 chars of the section>"` or `Not found`.
 */
function describeDaySectionMatch(
  snippet: string,
  dayHeading: string,
  allDayNamesPattern: string
): string {
  const sectionRegex = new RegExp(
    `(?:^|\\s|[,;:])${dayHeading}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );
  const match = snippet.match(sectionRegex);
  return match ? `Found: "${match[1].substring(0, 50)}"` : 'Not found';
}

/**
 * Scrapes the Paróquia da Paz home page with the generic scraper and prints
 * diagnostics for day-section detection:
 *
 * 1. the 500-character snippet starting at the text "segundas, terças";
 * 2. whether section regexes for "sábados", "sabados" and "domingos" match
 *    that snippet;
 * 3. how many parsed schedules the scraper returned for each day of the week.
 *
 * The scraper is closed on every exit path once `init()` has succeeded,
 * including when scraping or analysis throws.
 */
async function debugFullFlow(): Promise<void> {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  // init() stays outside the try block: if it throws, close() is not called,
  // exactly as before (whether close() is safe on an uninitialised scraper is
  // not visible from this file).
  await scraper.init();

  try {
    scraper.setCountry('BR');
    const result = await scraper.scrape(url);

    if (!result.rawHtml) {
      console.log('No HTML received');
      return;
    }

    const text = htmlToLowerCaseText(result.rawHtml);

    // Find the schedule section
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      console.log('Schedule text not found!');
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
    console.log('Schedule snippet from actual HTML:');
    console.log(snippet);
    console.log('\n');

    // Now test section matching on actual text
    const dayConfigs = getDayNamesForCountry('BR');
    const dayPatterns = buildDayPatterns(dayConfigs);
    // Longest names first so that, within the alternation, a longer day name
    // is tried before any shorter name that is a prefix of it.
    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
    const allDayNamesPattern = sortedDayNames
      .map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('|');

    console.log('=== Testing sábados and domingos matches ===\n');

    // "sabados" is the unaccented spelling of "sábados"; both are probed.
    for (const dayHeading of ['sábados', 'sabados', 'domingos']) {
      console.log(
        `${dayHeading} match:`,
        describeDaySectionMatch(snippet, dayHeading, allDayNamesPattern)
      );
    }

    console.log('\n=== Final parsed schedules ===\n');
    console.log(`Total: ${result.schedules.length}`);

    // Only the per-day counts are reported, so tally instead of grouping.
    const countsByDay: Record<number, number> = {};
    for (const sched of result.schedules) {
      countsByDay[sched.dayOfWeek] = (countsByDay[sched.dayOfWeek] ?? 0) + 1;
    }

    // Labels are indexed by dayOfWeek; this presumes 0 = Sunday ("Domingo")
    // through 6 = Saturday ("Sábado") — confirm against the scraper's schema.
    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    for (let i = 0; i < 7; i++) {
      const count = countsByDay[i] ?? 0;
      if (count > 0) {
        console.log(`${dayNames[i]}: ${count} schedules`);
      } else {
        console.log(`${dayNames[i]}: 0 schedules ❌`);
      }
    }
  } finally {
    // Always release the scraper, even when scraping or analysis failed.
    await scraper.close();
  }
}
// Run the script. On failure, log the error and mark the process as failed so
// that shells and CI see a non-zero exit status instead of a silent 0.
debugFullFlow().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});