Files
ScraperControl/scripts/debug/debug-paz-sections.ts

57 lines
1.8 KiB
TypeScript
Raw Permalink Normal View History

#!/usr/bin/env tsx
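/**
 * Debug script: prints which day-group and single-day sections the schedule
 * regexes find in a sample Mass schedule, using the day names for 'BR'.
 */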
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Sample input: a copy of the schedule text as it appears on the page,
// lower-cased before any matching is attempted.
const scheduleText = `
horário das missas igreja matriz de santo antônio
segundas, terças, quartas e sextas-feiras: 16h e 18h.
quintas-feiras: 16h e 19h (adoração ao santíssimo 18h).
sábados: 8h, 16h e 18h.
domingos: 8h, 11h, 16h, 18h e 20h.
`.toLowerCase();

// Echo the input so the matches printed later can be compared against it.
for (const line of ['Text to parse:', scheduleText, '']) {
  console.log(line);
}

// Day-name lookup for 'BR'; its keys are the day-name strings to search for.
const dayConfigs = getDayNamesForCountry('BR');
const dayPatterns = buildDayPatterns(dayConfigs);

// Longest names first, so the alternation prefers a longer name over any
// shorter name that is a prefix of it.
const sortedDayNames = Object.keys(dayPatterns).sort(
  (left, right) => right.length - left.length
);

/** Backslash-escapes every regex metacharacter in `text`. */
const escapeRegExp = (text: string): string =>
  text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// Single alternation ("name1|name2|...") of all escaped day names.
const allDayNamesPattern = sortedDayNames.map((name) => escapeRegExp(name)).join('|');
console.log('=== COMMA-SEPARATED GROUP MATCHING ===\n');

// Capture 1: two or more day names joined by commas/whitespace, with an
//            optional conjunction (e|and|et|und|y) before a name.
// Then one or more colons/whitespace characters.
// Capture 2: the following text, matched lazily up to the next day name or
//            the end of the input ("[^]" also matches newlines).
// Flags: global (to iterate over every group) and case-insensitive.
const dayGroupRegex = new RegExp(
  `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
  'gi'
);

// Each exec() call resumes from the regex's lastIndex, so repeating it walks
// through every match until it returns null.
let matchCount = 0;
for (
  let groupMatch = dayGroupRegex.exec(scheduleText);
  groupMatch !== null;
  groupMatch = dayGroupRegex.exec(scheduleText)
) {
  matchCount += 1;
  const [, dayGroup, timeText] = groupMatch;
  console.log(`Match #${matchCount}:`);
  console.log(` Day group: "${dayGroup}"`);
  console.log(` Time text: "${timeText}"`);
  console.log('');
}
console.log('=== INDIVIDUAL DAY MATCHING ===\n');

// Try every known day name on its own and report the first place it occurs.
Object.entries(dayPatterns).forEach(([dayName, dayIndex]) => {
  const escapedName = dayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // The name must be preceded by the start of the input, whitespace, or one
  // of ",;:" and followed by colons/whitespace. The capture then runs lazily
  // up to the next day name or the end of the input. There is no "m" flag, so
  // ^ and $ anchor to the whole text rather than to individual lines.
  const singleDayRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapedName}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

  const found = singleDayRegex.exec(scheduleText);
  if (found === null) {
    return;
  }

  console.log(`Found ${dayName} (day ${dayIndex}):`);
  // Only the first 100 characters of the captured text are shown.
  console.log(` Time text: "${found[1].substring(0, 100)}"`);
});