chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
85
scripts/debug/debug-polish-church.ts
Normal file
85
scripts/debug/debug-polish-church.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug Polish church in detail
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
/**
 * Scrapes one hard-coded Polish parish website and prints diagnostics to the
 * console: the scrape outcome, a 500-character text snippet starting at the
 * "msze święte" heading, the time-like and day-name tokens found in that
 * snippet, and finally the parsed schedules grouped by day of week.
 *
 * @returns A promise that resolves once all diagnostics are printed and the
 *   scraper has been closed. Rejects if the scraper throws.
 */
async function debugPolish(): Promise<void> {
  const url = 'http://parafialubojna.pl';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  // init() stays outside the try block so close() is only attempted on a
  // scraper that initialised successfully.
  await scraper.init();

  try {
    scraper.setCountry('PL');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Reduce the HTML to lowercase plain text: drop <script>/<style> blocks,
      // replace remaining tags with spaces, then collapse whitespace runs.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find the schedule section.
      // indexOf() reports "not found" as -1, which is truthy, and a match at
      // position 0 is falsy, so `a || b` cannot be used to chain lookups.
      // Fall back to the diacritic-free spelling only on an explicit miss.
      let scheduleIndex = text.indexOf('msze święte');
      if (scheduleIndex === -1) {
        scheduleIndex = text.indexOf('msze swiete');
      }

      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
        console.log('Schedule section:');
        console.log(snippet);
        console.log('\n');

        // Test all time pattern matches
        console.log('=== Testing time pattern matches ===\n');

        // Space separator pattern: 1-2 digits, whitespace, exactly 2 digits
        // not followed by another digit.
        const spacePattern = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;
        const spaceMatches = snippet.match(spacePattern);
        console.log('Space-separated times (8 00, 9 30):');
        console.log(spaceMatches ? spaceMatches.join(', ') : 'none');
        console.log('');

        // Colon pattern
        const colonPattern = /\d{1,2}:\d{2}/g;
        const colonMatches = snippet.match(colonPattern);
        console.log('Colon times (8:00, 9:30):');
        console.log(colonMatches ? colonMatches.join(', ') : 'none');
        console.log('');

        // Polish day names
        console.log('=== Polish day names in snippet ===\n');
        const dayConfigs = getDayNamesForCountry('PL');
        // NOTE(review): presumably a map of lowercase day-name string to day
        // number — confirm against src/scrapers/i18n/day-names.
        const dayPatterns = buildDayPatterns(dayConfigs);

        for (const [dayName, dayNum] of Object.entries(dayPatterns)) {
          if (snippet.includes(dayName)) {
            console.log(`Found: ${dayName} (day ${dayNum})`);
          }
        }
      }
    }

    console.log('\n=== Parsed schedules ===\n');

    // Group the parsed schedules by their dayOfWeek value.
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    // Labels are looked up by dayOfWeek; index 0 is 'Niedziela' (Sunday).
    const dayNames = ['Niedziela', 'Poniedziałek', 'Wtorek', 'Środa', 'Czwartek', 'Piątek', 'Sobota'];
    for (let i = 0; i < 7; i++) {
      const daySchedules = byDay[i];
      if (daySchedules) {
        console.log(`${dayNames[i]}: ${daySchedules.map(s => s.time).join(', ')}`);
      }
    }
  } finally {
    // Release the scraper even when scraping or reporting throws.
    await scraper.close();
  }
}
|
||||
|
||||
// Script entry point: run the debug routine and log any rejection instead of
// leaving it unhandled.
debugPolish().catch((error) => console.error(error));
|
||||
Reference in New Issue
Block a user