Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
80 lines
2.8 KiB
TypeScript
80 lines
2.8 KiB
TypeScript
#!/usr/bin/env tsx
|
|
/**
 * Debug why Sunday and Monday aren't parsing for the Polish church schedule.
 */
|
|
|
|
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
|
|
|
// Exact schedule text from the website, lower-cased before any matching.
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

console.log('Text to parse:');
console.log(text);
console.log('\n');

// Characters that must be escaped before a day name is spliced into a RegExp.
const REGEX_SPECIALS = /[.*+?^${}()|[\]\\]/g;

// Times written as "H MM" / "HH MM" (hour and minutes separated by whitespace).
const SPACE_SEPARATED_TIME = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;

const dayConfigs = getDayNamesForCountry('PL');
const dayPatterns = buildDayPatterns(dayConfigs);

// Longest names first so that, inside the alternation, a full name is tried
// before any shorter name that happens to be its prefix.
const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
const allDayNamesPattern = sortedDayNames
  .map((dayName) => dayName.replace(REGEX_SPECIALS, '\\$&'))
  .join('|');

if (sortedDayNames.length === 0) {
  // With no names the lookahead below degenerates to `(?=|$)`, which matches
  // everywhere, so every captured section would be empty.
  console.warn('No day names were returned for PL; captured sections will be empty.');
}

// Days whose section extraction is being investigated.
const dayTests = [
  { name: 'niedziela', english: 'Sunday' },
  { name: 'poniedziałek', english: 'Monday' },
];

dayTests.forEach(({ name, english }, position) => {
  // Only the sections after the first one are preceded by a blank line.
  const leadingBreak = position === 0 ? '' : '\n';
  console.log(`${leadingBreak}=== Testing ${name} (${english}) ===\n`);

  // Current regex pattern: day name, then either "<up to 50 chars>:" or
  // whitespace, then a lazy capture that stops at the next day name or at
  // the end of the text.
  const escapedName = name.replace(REGEX_SPECIALS, '\\$&');
  const sectionRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapedName}(?:(?:[^:]{1,50})?:|\\s+)([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

  const match = text.match(sectionRegex);
  if (!match) {
    console.log(`✗ NOT matched`);
    return;
  }

  console.log(`✓ Matched!`);
  console.log(` Full match: "${match[0].substring(0, 100)}"`);
  console.log(` Captured text: "${match[1].substring(0, 100)}"`);
  console.log('');

  // Check if times can be extracted from the captured section.
  const times = match[1].match(SPACE_SEPARATED_TIME);
  console.log(` Times found: ${times ? times.join(', ') : 'none'}`);
});

console.log('\n=== Analyzing why niedziela might fail ===\n');

// The issue might be "niedziela i uroczystości:" - the phrase is long.
// Check if the lookahead is hitting a day name (e.g. inside "uroczystości")
// before getting to the times.
const niedzielaIndex = text.indexOf('niedziela');

if (niedzielaIndex === -1) {
  console.log('niedziela position: not found');
} else {
  // Search strictly after the word itself; otherwise any configured alias
  // that is a prefix of "niedziela" would be reported at the same position.
  const searchFrom = niedzielaIndex + 'niedziela'.length;

  // `text` is lower-cased and the regex above is case-insensitive, so the
  // names are lower-cased here to keep the plain-string search consistent.
  const laterDayHits = sortedDayNames
    .filter((dayName) => dayName !== 'niedziela')
    .map((dayName) => ({ dayName, index: text.indexOf(dayName.toLowerCase(), searchFrom) }))
    .filter((hit) => hit.index !== -1)
    .sort((a, b) => a.index - b.index);

  const nearestHit = laterDayHits[0];
  // When no other day name follows, the section runs to the end of the text.
  const sectionEnd = nearestHit ? nearestHit.index : text.length;

  console.log(`niedziela position: ${niedzielaIndex}`);
  console.log(`Next day name position: ${nearestHit ? nearestHit.index : 'none'}`);
  if (nearestHit) {
    console.log(`Next day name matched: "${nearestHit.dayName}"`);
  }
  console.log(`Text between: "${text.substring(niedzielaIndex, sectionEnd)}"`);
}
|