Files
ScraperControl/scripts/debug/test-polish-sections.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

71 lines
2.3 KiB
TypeScript

#!/usr/bin/env tsx
/**
* Test which day sections are being created for a Polish church's Mass-schedule text
*/
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Exact text from the page
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();
console.log('Text:');
console.log(text);
console.log('\n');
const dayConfigs = getDayNamesForCountry('PL');
const dayPatterns = buildDayPatterns(dayConfigs);
const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');
console.log('=== Testing individual day matching ===\n');
// Test niedziela specifically
const niedziela = 'niedziela';
const escaped = niedziela.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const regex = new RegExp(
`(?:^|\\s|[,;:])${escaped}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
'i'
);
const match = text.match(regex);
if (match) {
console.log(`✓ niedziela matched!`);
console.log(` Full match: "${match[0].substring(0, 100)}"`);
console.log(` Captured text: "${match[1].substring(0, 100)}"`);
console.log('');
// Test if times can be extracted from captured text
const spacePattern = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;
const times = match[1].match(spacePattern);
console.log(` Times in captured text: ${times ? times.join(', ') : 'none'}`);
} else {
console.log(`✗ niedziela NOT matched`);
console.log('');
// Try simpler regex
const simpleRegex = /niedziela[:\s]+(.{0,100})/i;
const simpleMatch = text.match(simpleRegex);
if (simpleMatch) {
console.log(`Simple regex matched: "${simpleMatch[1]}"`);
}
}
// Test poniedziałek
console.log('\n=== Testing poniedziałek ===\n');
const ponieRegex = new RegExp(
`(?:^|\\s|[,;:])poniedziałek[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
'i'
);
const ponieMatch = text.match(ponieRegex);
if (ponieMatch) {
console.log(`✓ poniedziałek matched!`);
console.log(` Captured text: "${ponieMatch[1].substring(0, 100)}"`);
const times = ponieMatch[1].match(/\b(\d{1,2})\s+(\d{2})(?!\d)/g);
console.log(` Times: ${times ? times.join(', ') : 'none'}`);
} else {
console.log(`✗ poniedziałek NOT matched`);
}