chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
210
scripts/debug/test-international-sample.ts
Normal file
210
scripts/debug/test-international-sample.ts
Normal file
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test scraper on a diverse sample of international churches
|
||||
* to identify edge cases across different languages and formats
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * One entry in the international sample: a church website to run the scraper
 * against, plus the metadata printed alongside its results.
 */
interface TestChurch {
  /** Display name printed in the per-church report header. */
  name: string;
  /** Page URL handed to the scraper. */
  url: string;
  /** Country code passed to the scraper before scraping (e.g. "FR", "DE"). */
  country: string;
  /** Language label shown in the report header. */
  language: string;
  /** Optional free-form description of the expected days, e.g. "Sun-Sat" or "Sun, Wed, Sat". */
  expectedDays?: string;
  /** Optional remark printed under the header when present. */
  notes?: string;
}
|
||||
|
||||
/**
 * Sample churches from different countries/languages.
 *
 * Entries are grouped by language; the `notes` field records the formatting
 * quirk or reason each site was picked for the sample.
 */
const testChurches: TestChurch[] = [
  // FRENCH
  {
    name: 'Saint-Étienne du Mont, Paris',
    url: 'https://www.saintetiennedumontparis.fr/',
    country: 'FR',
    language: 'French',
    notes: 'French format with "du lundi au vendredi"',
  },
  {
    name: 'Notre-Dame de la Garde, Marseille',
    url: 'https://www.notredamedelagarde.fr/',
    country: 'FR',
    language: 'French',
    notes: 'Major pilgrimage site',
  },

  // GERMAN
  {
    name: 'St. Peter, Munich',
    url: 'https://www.alterpeter.de/',
    country: 'DE',
    language: 'German',
    notes: 'German format with "bis" for ranges',
  },
  {
    name: 'Kölner Dom, Cologne',
    url: 'https://www.koelner-dom.de/',
    country: 'DE',
    language: 'German',
    notes: 'Cathedral with Uhr time format',
  },

  // SPANISH
  {
    name: 'Sagrada Família, Barcelona',
    url: 'https://sagradafamilia.org/',
    country: 'ES',
    language: 'Spanish',
    notes: 'Major tourist site, may have complex schedule',
  },
  {
    name: 'Parroquia San Miguel, Madrid',
    url: 'https://www.parroquiasanmiguel.es/',
    country: 'ES',
    language: 'Spanish',
    notes: 'Spanish format with "de lunes a viernes"',
  },

  // PORTUGUESE
  {
    name: 'Basílica da Estrela, Lisbon',
    url: 'https://www.basilicadaestrela.com/',
    country: 'PT',
    language: 'Portuguese',
    notes: 'Portuguese format',
  },

  // ITALIAN
  {
    name: 'Santa Maria Maggiore, Rome',
    url: 'https://www.vatican.va/various/basiliche/sm_maggiore/index_it.htm',
    country: 'IT',
    language: 'Italian',
    notes: 'Major basilica',
  },
  {
    name: 'Duomo di Milano',
    url: 'https://www.duomomilano.it/',
    country: 'IT',
    language: 'Italian',
    notes: 'Cathedral with Italian format',
  },

  // DUTCH
  {
    name: 'Basiliek van de H. Nicolaas, Amsterdam',
    url: 'https://www.nicolaas-parochie.nl/',
    country: 'NL',
    language: 'Dutch',
    notes: 'Dutch format with "tot" for ranges',
  },

  // CZECH
  {
    name: 'Chrám sv. Víta, Prague',
    url: 'https://www.katedralasvatehovita.cz/',
    country: 'CZ',
    language: 'Czech',
    notes: 'Czech format',
  },

  // HUNGARIAN
  {
    name: 'Szent István Bazilika, Budapest',
    url: 'https://www.bazilika.biz/',
    country: 'HU',
    language: 'Hungarian',
    notes: 'Hungarian format',
  },

  // More complex cases
  {
    name: 'Cathédrale Notre-Dame, Strasbourg',
    url: 'https://www.cathedrale-strasbourg.fr/',
    country: 'FR',
    language: 'French',
    notes: 'Bilingual region (French/German)',
  },
];
|
||||
|
||||
/** How a single church scrape ended; used as the key when tallying the summary. */
type ScrapeOutcome = 'success' | 'noSchedules' | 'failed';

/** English weekday names indexed by the schedule's `dayOfWeek` value (index 0 is Sunday). */
const DAY_NAMES = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];

/** Pause between consecutive site requests, in milliseconds. */
const REQUEST_DELAY_MS = 2000;

/**
 * Scrapes one church website and prints a per-day report of what was found.
 *
 * Never rejects: any error thrown while scraping is logged and reported as
 * `'failed'` so that one bad site does not abort the whole run.
 *
 * @param church - The sample entry to test.
 * @param scraper - An already-initialised scraper, shared across churches.
 * @returns `'success'` when at least one schedule was found, `'noSchedules'`
 *   when the scrape succeeded but returned none, `'failed'` otherwise.
 */
async function testChurch(church: TestChurch, scraper: GenericScraper): Promise<ScrapeOutcome> {
  const rule = '='.repeat(80);
  console.log(`\n${rule}`);
  console.log(`📍 ${church.name}`);
  console.log(` ${church.url}`);
  console.log(` Language: ${church.language} | Country: ${church.country}`);
  if (church.notes) console.log(` Notes: ${church.notes}`);
  console.log(rule);

  try {
    scraper.setCountry(church.country);
    const result = await scraper.scrape(church.url);

    if (!result.success) {
      console.log(`❌ FAILED: ${result.error || 'Unknown error'}`);
      return 'failed';
    }

    if (result.schedules.length === 0) {
      console.log(`⚠️ SUCCESS but NO SCHEDULES found`);
      return 'noSchedules';
    }

    // Group by day
    const byDay = new Map<number, typeof result.schedules>();
    for (const sched of result.schedules) {
      const bucket = byDay.get(sched.dayOfWeek);
      if (bucket) {
        bucket.push(sched);
      } else {
        byDay.set(sched.dayOfWeek, [sched]);
      }
    }

    console.log(`\n✅ Found ${result.schedules.length} schedules:\n`);

    // Walk the weekday table rather than the map so days print Sunday-first;
    // schedules whose dayOfWeek has no entry in the table are not printed.
    DAY_NAMES.forEach((dayName, dayIndex) => {
      const entries = byDay.get(dayIndex);
      if (!entries) return;

      const times = entries
        .map((s) => {
          let str = s.time;
          if (s.massType) str += ` (${s.massType})`;
          // English is treated as the default and left untagged.
          if (s.language && s.language !== 'English') str += ` [${s.language}]`;
          return str;
        })
        .join(', ');
      console.log(` ${dayName}: ${times}`);
    });

    return 'success';
  } catch (error) {
    console.log(`❌ ERROR: ${error instanceof Error ? error.message : String(error)}`);
    return 'failed';
  }
}

/**
 * Runs the scraper against every entry in `testChurches`, one at a time, and
 * prints a summary of how many succeeded, returned no schedules, or failed.
 *
 * The scraper is closed even if the run is interrupted by an unexpected error.
 */
async function main(): Promise<void> {
  const scraper = new GenericScraper();
  await scraper.init();

  const results: Record<ScrapeOutcome, number> = {
    success: 0,
    failed: 0,
    noSchedules: 0,
  };

  try {
    console.log('🌍 INTERNATIONAL CHURCH SCRAPER TEST');
    console.log(`Testing ${testChurches.length} churches across ${new Set(testChurches.map(c => c.country)).size} countries`);

    let isFirst = true;
    for (const church of testChurches) {
      // Brief delay between requests to be respectful; nothing to wait for
      // before the first request or after the last one.
      if (!isFirst) {
        await new Promise(resolve => setTimeout(resolve, REQUEST_DELAY_MS));
      }
      isFirst = false;

      const outcome = await testChurch(church, scraper);
      results[outcome] += 1;
    }
  } finally {
    await scraper.close();
  }

  console.log(`\n${'='.repeat(80)}`);
  console.log('📊 SUMMARY');
  console.log(`${'='.repeat(80)}`);
  console.log(`Total tested: ${testChurches.length}`);
  console.log(`✅ Success with schedules: ${results.success}`);
  console.log(`⚠️ Success but no schedules: ${results.noSchedules}`);
  console.log(`❌ Failed: ${results.failed}`);
}
|
||||
|
||||
// Script entry point: report an unexpected failure and make the process exit
// non-zero so shells and CI can tell a crashed run from a completed one.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user