Files
ScraperControl/scripts/debug/debug-paz-sections.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

57 lines
1.8 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env tsx
/**
* Debug which sections are being found
*/
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Sample schedule text reproduced from the page being debugged, lower-cased before matching.
const scheduleText = `
horário das missas igreja matriz de santo antônio
segundas, terças, quartas e sextas-feiras: 16h e 18h.
quintas-feiras: 16h e 19h (adoração ao santíssimo 18h).
sábados: 8h, 16h e 18h.
domingos: 8h, 11h, 16h, 18h e 20h.
`.toLowerCase();

// Echo the input first so the matches printed later can be compared against it.
for (const outputLine of ['Text to parse:', scheduleText, '']) {
  console.log(outputLine);
}
// Look up the day-name configuration for country code 'BR' and derive the
// pattern table from it; the table's keys are the day names matched below.
const dayConfigs = getDayNamesForCountry('BR');
const dayPatterns = buildDayPatterns(dayConfigs);

// Order the names longest-first: regex alternation tries branches left to
// right, so a longer name is attempted before any shorter name it contains.
const sortedDayNames = Object.keys(dayPatterns).sort(
  (left, right) => right.length - left.length
);

// Escape regex metacharacters in every name, then join into one alternation.
const allDayNamesPattern = sortedDayNames
  .map((name) => name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
  .join('|');
console.log('=== COMMA-SEPARATED GROUP MATCHING ===\n');

// The pattern is assembled from three pieces:
//   anyDay      - a non-capturing alternation of every known day name
//   dayListPart - capture 1: a day name followed by one or more further day
//                 names, each separated by commas/whitespace and an optional
//                 conjunction (e, and, et, und, y)
//   timePart    - a ':'/whitespace separator, then capture 2: the shortest run
//                 of text (`[^]` also matches newlines) up to the next day
//                 name or the end of the input
const anyDay = `(?:${allDayNamesPattern})`;
const dayListPart = `(${anyDay}(?:[,\\s]+(?:e|and|et|und|y)?\\s*${anyDay})+)`;
const timePart = `[:\\s]+([^]*?)(?=${anyDay}|$)`;
const dayGroupRegex = new RegExp(dayListPart + timePart, 'gi');

// With the 'g' flag each exec() call resumes from the previous match's end,
// so stepping until it returns null visits every match in order.
let matchCount = 0;
for (
  let groupMatch = dayGroupRegex.exec(scheduleText);
  groupMatch !== null;
  groupMatch = dayGroupRegex.exec(scheduleText)
) {
  matchCount += 1;
  console.log(`Match #${matchCount}:`);
  console.log(` Day group: "${groupMatch[1]}"`);
  console.log(` Time text: "${groupMatch[2]}"`);
  console.log('');
}
console.log('=== INDIVIDUAL DAY MATCHING ===\n');

// The part of the pattern after the day name does not depend on the day, so
// build it once: a ':'/whitespace separator, then a capture of the shortest
// run of text (`[^]` also matches newlines) up to the next day name or the end
// of the input.
const timeTextTail = `[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`;

// For every known day name, report the first place it appears at the start of
// the text or right after whitespace / ',', ';' or ':'.
for (const [dayName, dayIndex] of Object.entries(dayPatterns)) {
  const escapedName = dayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const dayRegex = new RegExp(`(?:^|\\s|[,;:])${escapedName}${timeTextTail}`, 'i');

  // No 'g' flag, so exec() returns the first match only (or null).
  const found = dayRegex.exec(scheduleText);
  if (found === null) {
    continue;
  }

  console.log(`Found ${dayName} (day ${dayIndex}):`);
  // Cap the preview at 100 characters to keep the output readable.
  console.log(` Time text: "${found[1].substring(0, 100)}"`);
}