chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
56
scripts/debug/debug-paz-sections.ts
Normal file
56
scripts/debug/debug-paz-sections.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
 * Debug helper: prints which day-group and single-day sections the schedule regexes find in a sample text
|
||||
*/
|
||||
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
// Sample input mimicking the text found on the page, lowercased before any matching.
const scheduleText = `
horário das missas igreja matriz de santo antônio
segundas, terças, quartas e sextas-feiras: 16h e 18h.
quintas-feiras: 16h e 19h (adoração ao santíssimo – 18h).
sábados: 8h, 16h e 18h.
domingos: 8h, 11h, 16h, 18h e 20h.
`.toLowerCase();

// Echo the input, followed by an empty separator line.
['Text to parse:', scheduleText, ''].forEach((line) => {
  console.log(line);
});

// Day-name data for country code 'BR', turned into a lookup keyed by day name.
const dayConfigs = getDayNamesForCountry('BR');
const dayPatterns = buildDayPatterns(dayConfigs);

// Order the names longest-first: regex alternation tries branches left to right,
// so a longer name is attempted before any shorter name listed after it.
const sortedDayNames = Object.keys(dayPatterns);
sortedDayNames.sort((first, second) => second.length - first.length);

// Escape regex metacharacters in every name and join them into one alternation source.
const allDayNamesPattern = sortedDayNames
  .map((name) => name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
  .join('|');
|
||||
|
||||
console.log('=== COMMA-SEPARATED GROUP MATCHING ===\n');

// Capture 1: a run of two or more day names separated by commas/whitespace and an
//            optional conjunction (e | and | et | und | y).
// Capture 2: the text after the following colon/whitespace, taken lazily up to the
//            next day name or the end of the input (`[^]` also matches newlines).
const dayGroupRegex = new RegExp(
  `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
  'gi'
);

// The global flag makes each exec() call resume after the previous match,
// so the loop reports every group in order of appearance.
let matchCount = 0;
for (
  let groupMatch = dayGroupRegex.exec(scheduleText);
  groupMatch !== null;
  groupMatch = dayGroupRegex.exec(scheduleText)
) {
  matchCount += 1;
  const [, dayGroup, timeText] = groupMatch;

  console.log(`Match #${matchCount}:`);
  console.log(` Day group: "${dayGroup}"`);
  console.log(` Time text: "${timeText}"`);
  console.log('');
}
|
||||
|
||||
console.log('=== INDIVIDUAL DAY MATCHING ===\n');

// For each known day name, report the first place it appears on its own
// together with a preview of the text that follows it.
for (const [dayName, dayIndex] of Object.entries(dayPatterns)) {
  const escaped = dayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // The name must sit at the start of the input or right after whitespace / , ; :
  // and be followed by colon/whitespace. The captured text runs lazily up to the
  // next day name or the end of the input.
  const regex = new RegExp(
    `(?:^|\\s|[,;:])${escaped}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

  // No global flag, so only the first occurrence is reported.
  const match = regex.exec(scheduleText);
  if (match === null) {
    continue;
  }

  // Keep the output short: show at most the first 100 characters.
  const preview = match[1].slice(0, 100);
  console.log(`Found ${dayName} (day ${dayIndex}):`);
  console.log(` Time text: "${preview}"`);
}
|
||||
Reference in New Issue
Block a user