chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
79
scripts/debug/debug-polish-sunday-monday.ts
Normal file
79
scripts/debug/debug-polish-sunday-monday.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug why Sunday and Monday aren't parsing for Polish church
|
||||
*/
|
||||
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
// Schedule text copied verbatim from the website. It is lower-cased once here
// so every later match in this script runs against the same normalized string.
const rawScheduleText = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`;
const text = rawScheduleText.toLowerCase();

// Echo the input first so the debug output is self-contained.
for (const line of ['Text to parse:', text, '\n']) {
  console.log(line);
}

// Day-name configuration for Poland and the patterns built from it.
const dayConfigs = getDayNamesForCountry('PL');
const dayPatterns = buildDayPatterns(dayConfigs);

// Keys of the pattern table, longest first — presumably so that longer names
// are tried before shorter ones in the alternation built below.
const sortedDayNames = Object.keys(dayPatterns);
sortedDayNames.sort((left, right) => right.length - left.length);

// Escape regex metacharacters in each key, then join them into one alternation
// that the section regexes use as their "next day name" lookahead.
const escapedDayNames = sortedDayNames.map((dayName) =>
  dayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
);
const allDayNamesPattern = escapedDayNames.join('|');
|
||||
|
||||
// Day sections to probe. Each entry carries the heading printed before the
// check and the day name whose section should be located in `text`.
const dayChecks = [
  { heading: '=== Testing niedziela (Sunday) ===\n', dayName: 'niedziela' },
  { heading: '\n=== Testing poniedziałek (Monday) ===\n', dayName: 'poniedziałek' },
];

// Space-separated times such as "8 00": one or two digits, whitespace, then
// exactly two digits that are not followed by another digit. Only used with
// String.prototype.match, which resets lastIndex, so sharing the /g regex is safe.
const spaceSeparatedTimePattern = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;

// Run the same check for every day so both go through identical logic
// (previously the two sections were copy-pasted and only one escaped its name).
for (const { heading, dayName } of dayChecks) {
  console.log(heading);

  // Escape regex metacharacters so the day name is always matched literally.
  const escapedDayName = dayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Current regex pattern: the day name (at the start of the text or after
  // whitespace / , ; :), then either an optional label of up to 50 non-colon
  // characters ending in ":" or plain whitespace, then a lazy capture that
  // stops in front of the next day name or at the end of the text.
  const sectionRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapedDayName}(?:(?:[^:]{1,50})?:|\\s+)([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

  const sectionMatch = text.match(sectionRegex);
  if (!sectionMatch) {
    console.log(`✗ NOT matched`);
    continue;
  }

  // Truncate to 100 characters to keep the debug output readable.
  console.log(`✓ Matched!`);
  console.log(` Full match: "${sectionMatch[0].substring(0, 100)}"`);
  console.log(` Captured text: "${sectionMatch[1].substring(0, 100)}"`);
  console.log('');

  // Check if times can be extracted from the captured section.
  const times = sectionMatch[1].match(spaceSeparatedTimePattern);
  console.log(` Times found: ${times ? times.join(', ') : 'none'}`);
}
|
||||
|
||||
console.log('\n=== Analyzing why niedziela might fail ===\n');

// The issue might be "niedziela i uroczystości:" - the phrase is long.
// Check whether another day name appears (and so ends the lazy capture via the
// lookahead) before the times that follow the label are reached.
const sundayName = 'niedziela';
const niedzielaIndex = text.indexOf(sundayName);

console.log(`niedziela position: ${niedzielaIndex}`);

if (niedzielaIndex === -1) {
  // Nothing to analyze: without this guard the search below would silently
  // start from the beginning of the text.
  console.log('niedziela not found in text');
} else {
  // Start looking *after* the word itself. The section regex only applies its
  // lookahead after the day name, and searching from the word's own position
  // would let any key that occurs inside "niedziela" (e.g. an abbreviation
  // that is a prefix of it) match at that same position and report an empty
  // gap.
  const searchStart = niedzielaIndex + sundayName.length;

  const laterDayIndexes = sortedDayNames
    .filter((dayName) => dayName !== sundayName)
    .map((dayName) => text.indexOf(dayName, searchStart))
    // indexOf signals "not found" with -1; every other value is a real hit.
    .filter((index) => index !== -1);

  // Math.min() of an empty list is Infinity, so handle "no later day name"
  // explicitly and fall back to the end of the text.
  const hasNextDay = laterDayIndexes.length > 0;
  const nextDayIndex = hasNextDay ? Math.min(...laterDayIndexes) : text.length;

  console.log(`Next day name position: ${hasNextDay ? nextDayIndex : 'none'}`);
  console.log(`Text between: "${text.substring(niedzielaIndex, nextDayIndex)}"`);
}
|
||||
Reference in New Issue
Block a user