Files
ScraperControl/scripts/debug/debug-polish-sunday-monday.ts

80 lines
2.8 KiB
TypeScript
Raw Permalink Normal View History

#!/usr/bin/env tsx
/**
* Debug why Sunday and Monday aren't parsing for Polish church
*/
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Schedule text exactly as it appears on the website; lowercased before any matching.
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

// Echo the input, followed by a blank separator.
for (const line of ['Text to parse:', text, '\n']) {
  console.log(line);
}

/** Backslash-escapes regex metacharacters so that `literal` matches itself verbatim. */
const escapeRegexLiteral = (literal: string): string =>
  literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

/** Comparator that orders strings from longest to shortest. */
const byLengthDescending = (a: string, b: string): number => b.length - a.length;

const polishDayPatterns = buildDayPatterns(getDayNamesForCountry('PL'));

// Longest names first, so that in the joined alternation a longer day name is
// tried before any shorter one.
const sortedDayNames = Object.keys(polishDayPatterns).sort(byLengthDescending);
const allDayNamesPattern = sortedDayNames.map(escapeRegexLiteral).join('|');
console.log('=== Testing niedziela (Sunday) ===\n');

// Regex under test, assembled piece by piece (case-insensitive).
const sundayName = 'niedziela';
const sundayRegex = new RegExp(
  [
    // The day name must sit at the start of the text or follow whitespace / , ; :
    '(?:^|\\s|[,;:])',
    // The day name itself, with regex metacharacters escaped.
    sundayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'),
    // Separator: an optional run of 1-50 non-colon characters ending in ':', or whitespace.
    '(?:(?:[^:]{1,50})?:|\\s+)',
    // Lazily captured schedule text (any character, newlines included).
    '([^]*?)',
    // Capture stops right before the next day name, or at the end of the input.
    `(?=${allDayNamesPattern}|$)`,
  ].join(''),
  'i'
);
const sundayMatch = sundayRegex.exec(text);

if (sundayMatch === null) {
  console.log(`✗ NOT matched`);
} else {
  const [sundayWhole, sundayCaptured] = sundayMatch;
  console.log(`✓ Matched!`);
  console.log(` Full match: "${sundayWhole.slice(0, 100)}"`);
  console.log(` Captured text: "${sundayCaptured.slice(0, 100)}"`);
  console.log('');
  // Look for times written with a space between hour and minutes, e.g. "8 00":
  // 1-2 digits, whitespace, then exactly two digits not followed by another digit.
  const sundayTimes = sundayCaptured.match(/\b(\d{1,2})\s+(\d{2})(?!\d)/g);
  console.log(` Times found: ${sundayTimes === null ? 'none' : sundayTimes.join(', ')}`);
}
console.log('\n=== Testing poniedziałek (Monday) ===\n');

// Monday probe: the day name (preceded by start-of-text, whitespace or , ; :),
// then a separator (optional label of up to 50 non-colon characters ending in
// ':', or plain whitespace), then a lazy capture that stops right before the
// next day name or at the end of the input.
const mondaySource =
  '(?:^|\\s|[,;:])poniedziałek' +
  '(?:(?:[^:]{1,50})?:|\\s+)' +
  '([^]*?)' +
  `(?=${allDayNamesPattern}|$)`;
const mondayMatch = new RegExp(mondaySource, 'i').exec(text);

if (mondayMatch === null) {
  console.log(`✗ NOT matched`);
} else {
  // Only the first 100 characters of each piece are shown.
  const preview = (piece: string): string => piece.substring(0, 100);
  const mondayCaptured = mondayMatch[1];

  console.log(`✓ Matched!`);
  console.log(` Full match: "${preview(mondayMatch[0])}"`);
  console.log(` Captured text: "${preview(mondayCaptured)}"`);
  console.log('');

  // Space-separated times such as "8 00": 1-2 digits, whitespace, two digits.
  const mondayTimes = mondayCaptured.match(/\b(\d{1,2})\s+(\d{2})(?!\d)/g);
  const mondaySummary = mondayTimes === null ? 'none' : mondayTimes.join(', ');
  console.log(` Times found: ${mondaySummary}`);
}
console.log('\n=== Analyzing why niedziela might fail ===\n');

// Hypothesis: the label "niedziela i uroczystości:" is long, and the regex
// lookahead may be hitting another day name before it ever reaches the times.
// To check, locate the first *other* day name that occurs after "niedziela".
const sundayWord = 'niedziela';
const sundayIndex = text.indexOf(sundayWord);

console.log(`niedziela position: ${sundayIndex}`);

if (sundayIndex === -1) {
  // Without an anchor there is no "text between" to show.
  console.log('niedziela not found in text; nothing to analyze');
} else {
  // Start searching *after* the word itself. Searching from its first character
  // would let any day-name key that is a prefix/abbreviation of "niedziela"
  // match at the very same position and report an empty span.
  const searchFrom = sundayIndex + sundayWord.length;
  const laterDayIndexes = sortedDayNames
    .filter(dayName => dayName !== sundayWord)
    .map(dayName => text.indexOf(dayName, searchFrom))
    // indexOf signals "absent" with -1; every other value is a real position.
    .filter(index => index !== -1);

  if (laterDayIndexes.length === 0) {
    // Math.min() of an empty list is Infinity, so report the absence explicitly
    // and show everything up to the end of the text.
    console.log('Next day name position: none');
    console.log(`Text between: "${text.substring(sundayIndex)}"`);
  } else {
    const nextDayIndex = Math.min(...laterDayIndexes);
    console.log(`Next day name position: ${nextDayIndex}`);
    console.log(`Text between: "${text.substring(sundayIndex, nextDayIndex)}"`);
  }
}