80 lines
2.8 KiB
TypeScript
80 lines
2.8 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
 * Debug script: investigates why the Sunday (niedziela) and Monday
 * (poniedziałek) entries are not being parsed from a Polish church's
 * Mass schedule text.
 */
|
||
|
|
|
||
|
|
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||
|
|
|
||
|
|
// Schedule text copied verbatim from the website, lower-cased before matching.
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

// Echo the input first so the probe output below can be read against it.
console.log('Text to parse:');
console.log(text);
console.log('\n');

const dayConfigs = getDayNamesForCountry('PL');
const dayPatterns = buildDayPatterns(dayConfigs);

// Longest names first, so that in the alternation built below a longer day
// name is tried before any shorter name.
const sortedDayNames = Object.keys(dayPatterns);
sortedDayNames.sort((left, right) => right.length - left.length);

// Alternation of every day name, each with its regex metacharacters escaped so
// it is matched literally.
const allDayNamesPattern = sortedDayNames
  .map((dayName) => dayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
  .join('|');
|
||
|
|
|
||
|
|
// Day names to probe, each paired with the exact header line printed before
// its probe. Both days go through the same code path below, so they are
// escaped, matched and reported identically.
const dayProbes = [
  { dayName: 'niedziela', header: '=== Testing niedziela (Sunday) ===\n' },
  { dayName: 'poniedziałek', header: '\n=== Testing poniedziałek (Monday) ===\n' },
];

// Times written with a space between hour and minutes, e.g. "8 00" or "18 00".
// Sharing this global regex across iterations is safe: String.prototype.match
// resets lastIndex before scanning.
const spacedTimePattern = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;

for (const { dayName, header } of dayProbes) {
  console.log(header);

  // Escape regex metacharacters so the day name is always matched literally.
  const escapedDayName = dayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Current regex pattern:
  //   - the day name must follow start-of-text, whitespace or one of , ; :
  //   - then either an optional label of up to 50 non-colon characters ending
  //     in ":", or plain whitespace
  //   - then a lazy capture of everything up to the next day name or the end
  const dayRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapedDayName}(?:(?:[^:]{1,50})?:|\\s+)([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

  const dayMatch = text.match(dayRegex);
  if (!dayMatch) {
    console.log(`✗ NOT matched`);
    continue;
  }

  console.log(`✓ Matched!`);
  console.log(` Full match: "${dayMatch[0].substring(0, 100)}"`);
  console.log(` Captured text: "${dayMatch[1].substring(0, 100)}"`);
  console.log('');

  // Check if times can be extracted from the captured segment.
  const times = dayMatch[1].match(spacedTimePattern);
  console.log(` Times found: ${times ? times.join(', ') : 'none'}`);
}
|
||
|
|
|
||
|
|
console.log('\n=== Analyzing why niedziela might fail ===\n');

// The issue might be "niedziela i uroczystości:" - the phrase is long.
// Check whether another day name shows up after "niedziela" (where the
// lookahead would stop the capture) before the times are reached.
const sundayName = 'niedziela';
const niedzielaIndex = text.indexOf(sundayName);
console.log(`niedziela position: ${niedzielaIndex}`);

if (niedzielaIndex === -1) {
  // Without an anchor there is no meaningful "text between" to show.
  console.log('niedziela not found in text; nothing to analyze');
} else {
  // Begin just past the word itself so that an alias overlapping "niedziela"
  // (e.g. an abbreviation that is a prefix of it) is not reported as the
  // "next" day at the very same position. Because the word itself can no
  // longer match, no day name needs to be excluded from the search.
  const searchStart = niedzielaIndex + sundayName.length;

  // Day names are lower-cased because `text` is lower-cased and the probe
  // regexes match case-insensitively. Compare against -1 (rather than > 0) so
  // that no valid position is ever discarded.
  const laterDayPositions = sortedDayNames
    .map((dayName) => text.indexOf(dayName.toLowerCase(), searchStart))
    .filter((position) => position !== -1);

  // Math.min() with no arguments yields Infinity, so handle "no later day
  // name" explicitly and fall back to the end of the text.
  const hasLaterDay = laterDayPositions.length > 0;
  const nextDayIndex = hasLaterDay ? Math.min(...laterDayPositions) : text.length;

  console.log(`Next day name position: ${hasLaterDay ? nextDayIndex : 'none (end of text)'}`);
  console.log(`Text between: "${text.substring(niedzielaIndex, nextDayIndex)}"`);
}
|