#!/usr/bin/env tsx
/**
 * Debug why Sunday and Monday aren't parsing for Polish church
 *
 * Runs the day-section regex against a fixed schedule string for
 * "niedziela" and "poniedziałek", prints what was matched and which
 * space-separated times ("8 00") were found, then reports where the next
 * day name occurs after "niedziela".
 */

import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';

// Exact schedule text from website
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

console.log('Text to parse:');
console.log(text);
console.log('\n');

// Characters that must be escaped before a day name is embedded in a RegExp source.
const REGEX_SPECIALS = /[.*+?^${}()|[\]\\]/g;

// Times written as "H MM" / "HH MM" (space instead of a colon or dot).
const SPACE_TIME_PATTERN = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;

const dayConfigs = getDayNamesForCountry('PL');
const dayPatterns = buildDayPatterns(dayConfigs);

// Longest names first so that, in the alternation below, a longer name wins
// over a shorter name that is its prefix.
// NOTE(review): assumes the keys of dayPatterns are the day-name strings — confirm in day-names.
const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
const allDayNamesPattern = sortedDayNames.map(d => d.replace(REGEX_SPECIALS, '\\$&')).join('|');

// One entry per day under test; `header` is printed verbatim before the result.
const dayTests = [
  { header: '=== Testing niedziela (Sunday) ===\n', dayName: 'niedziela' },
  { header: '\n=== Testing poniedziałek (Monday) ===\n', dayName: 'poniedziałek' },
];

for (const { header, dayName } of dayTests) {
  console.log(header);

  // Current regex pattern: the day name, then either "<up to 50 non-colon chars>:"
  // or whitespace, then a lazy capture that stops at the next day name or end of text.
  const escaped = dayName.replace(REGEX_SPECIALS, '\\$&');
  const regex = new RegExp(
    `(?:^|\\s|[,;:])${escaped}(?:(?:[^:]{1,50})?:|\\s+)([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

  const match = text.match(regex);
  if (!match) {
    console.log(`✗ NOT matched`);
    continue;
  }

  console.log(`✓ Matched!`);
  console.log(` Full match: "${match[0].substring(0, 100)}"`);
  console.log(` Captured text: "${match[1].substring(0, 100)}"`);
  console.log('');

  // Check if times can be extracted
  const times = match[1].match(SPACE_TIME_PATTERN);
  console.log(` Times found: ${times ? times.join(', ') : 'none'}`);
}

console.log('\n=== Analyzing why niedziela might fail ===\n');

// The issue might be "niedziela i uroczystości:" - the phrase is long
// Check if the lookahead is hitting "uroczystości" before getting to the times
const targetDay = 'niedziela';
const niedzielaIndex = text.indexOf(targetDay);
console.log(`niedziela position: ${niedzielaIndex}`);

if (niedzielaIndex === -1) {
  // Without this guard, substring(-1, …) would silently report from index 0.
  console.log('niedziela not found in text');
} else {
  // Start searching after the word itself: a configured variant that is a
  // prefix/substring of "niedziela" would otherwise "match" at the same
  // position and make the text between the two look empty.
  const searchFrom = niedzielaIndex + targetDay.length;
  const laterDayIndices = sortedDayNames
    .filter(d => d !== targetDay)
    .map(d => text.indexOf(d, searchFrom))
    .filter(i => i !== -1);

  // Math.min() of an empty list is Infinity, so handle "no later day name" explicitly.
  const hasLaterDay = laterDayIndices.length > 0;
  const nextDayIndex = hasLaterDay ? Math.min(...laterDayIndices) : text.length;

  console.log(`Next day name position: ${hasLaterDay ? nextDayIndex : 'none (end of text)'}`);
  console.log(`Text between: "${text.substring(niedzielaIndex, nextDayIndex)}"`);
}