#!/usr/bin/env tsx
/**
 * Test scraper on a diverse sample of international churches
 * to identify edge cases across different languages and formats
 */

import { GenericScraper } from '../../src/scrapers/strategies/generic';

/** One entry of the manual test sample. */
interface TestChurch {
  /** Display name printed in the report header. */
  name: string;
  /** Page handed to `scraper.scrape`. */
  url: string;
  /** Country code handed to `scraper.setCountry` before scraping. */
  country: string;
  /** Language label; only printed, never used to drive the scraper here. */
  language: string;
  expectedDays?: string; // e.g., "Sun-Sat" or "Sun, Wed, Sat"
  notes?: string;
}

/** Result category of a single church run; mirrors the keys of the summary tally. */
type TestOutcome = 'success' | 'failed' | 'noSchedules';

/** Horizontal rule used around report sections. */
const SEPARATOR = '='.repeat(80);

/** Labels for day indices 0..6 as used when printing grouped schedules. */
const DAY_NAMES = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];

/** Pause between two consecutive requests, in milliseconds. */
const REQUEST_DELAY_MS = 2000;

// Sample churches from different countries/languages
const testChurches: TestChurch[] = [
  // FRENCH
  {
    name: 'Saint-Étienne du Mont, Paris',
    url: 'https://www.saintetiennedumontparis.fr/',
    country: 'FR',
    language: 'French',
    notes: 'French format with "du lundi au vendredi"',
  },
  {
    name: 'Notre-Dame de la Garde, Marseille',
    url: 'https://www.notredamedelagarde.fr/',
    country: 'FR',
    language: 'French',
    notes: 'Major pilgrimage site',
  },
  // GERMAN
  {
    name: 'St. Peter, Munich',
    url: 'https://www.alterpeter.de/',
    country: 'DE',
    language: 'German',
    notes: 'German format with "bis" for ranges',
  },
  {
    name: 'Kölner Dom, Cologne',
    url: 'https://www.koelner-dom.de/',
    country: 'DE',
    language: 'German',
    notes: 'Cathedral with Uhr time format',
  },
  // SPANISH
  {
    name: 'Sagrada Família, Barcelona',
    url: 'https://sagradafamilia.org/',
    country: 'ES',
    language: 'Spanish',
    notes: 'Major tourist site, may have complex schedule',
  },
  {
    name: 'Parroquia San Miguel, Madrid',
    url: 'https://www.parroquiasanmiguel.es/',
    country: 'ES',
    language: 'Spanish',
    notes: 'Spanish format with "de lunes a viernes"',
  },
  // PORTUGUESE
  {
    name: 'Basílica da Estrela, Lisbon',
    url: 'https://www.basilicadaestrela.com/',
    country: 'PT',
    language: 'Portuguese',
    notes: 'Portuguese format',
  },
  // ITALIAN
  {
    name: 'Santa Maria Maggiore, Rome',
    url: 'https://www.vatican.va/various/basiliche/sm_maggiore/index_it.htm',
    country: 'IT',
    language: 'Italian',
    notes: 'Major basilica',
  },
  {
    name: 'Duomo di Milano',
    url: 'https://www.duomomilano.it/',
    country: 'IT',
    language: 'Italian',
    notes: 'Cathedral with Italian format',
  },
  // DUTCH
  {
    name: 'Basiliek van de H. Nicolaas, Amsterdam',
    url: 'https://www.nicolaas-parochie.nl/',
    country: 'NL',
    language: 'Dutch',
    notes: 'Dutch format with "tot" for ranges',
  },
  // CZECH
  {
    name: 'Chrám sv. Víta, Prague',
    url: 'https://www.katedralasvatehovita.cz/',
    country: 'CZ',
    language: 'Czech',
    notes: 'Czech format',
  },
  // HUNGARIAN
  {
    name: 'Szent István Bazilika, Budapest',
    url: 'https://www.bazilika.biz/',
    country: 'HU',
    language: 'Hungarian',
    notes: 'Hungarian format',
  },
  // More complex cases
  {
    name: 'Cathédrale Notre-Dame, Strasbourg',
    url: 'https://www.cathedrale-strasbourg.fr/',
    country: 'FR',
    language: 'French',
    notes: 'Bilingual region (French/German)',
  },
];

/**
 * Scrapes one church, prints a per-day report, and classifies the run.
 *
 * Never rejects: any error thrown by the scraper is logged and reported as
 * `'failed'`, so one broken site does not abort the whole sample.
 *
 * @param church - Sample entry to test.
 * @param scraper - Scraper instance shared across the run; its country is
 *   set from `church.country` before scraping.
 * @returns `'success'` when at least one schedule was found, `'noSchedules'`
 *   when the scrape succeeded with an empty list, `'failed'` otherwise.
 */
async function testChurch(church: TestChurch, scraper: GenericScraper): Promise<TestOutcome> {
  console.log(`\n${SEPARATOR}`);
  console.log(`📍 ${church.name}`);
  console.log(` ${church.url}`);
  console.log(` Language: ${church.language} | Country: ${church.country}`);
  if (church.notes) console.log(` Notes: ${church.notes}`);
  console.log(`${SEPARATOR}`);

  try {
    scraper.setCountry(church.country);
    const result = await scraper.scrape(church.url);

    if (!result.success) {
      console.log(`❌ FAILED: ${result.error || 'Unknown error'}`);
      return 'failed';
    }

    if (result.schedules.length === 0) {
      console.log(`⚠️ SUCCESS but NO SCHEDULES found`);
      return 'noSchedules';
    }

    // Group by day. The element type is derived from the scraper result so
    // this script does not have to import (or guess) the schedule type name.
    // NOTE(review): assumes `dayOfWeek` is a 0..6 number with 0 = Sunday,
    // matching DAY_NAMES — confirm against the scraper's schedule type.
    type Schedule = (typeof result.schedules)[number];
    const byDay: Record<number, Schedule[]> = {};
    for (const sched of result.schedules) {
      const bucket = byDay[sched.dayOfWeek] ?? [];
      bucket.push(sched);
      byDay[sched.dayOfWeek] = bucket;
    }

    console.log(`\n✅ Found ${result.schedules.length} schedules:\n`);
    for (let i = 0; i < DAY_NAMES.length; i++) {
      const daySchedules = byDay[i];
      if (!daySchedules) continue;

      const times = daySchedules
        .map(s => {
          let str = s.time;
          if (s.massType) str += ` (${s.massType})`;
          // English is treated as the default and not called out.
          if (s.language && s.language !== 'English') str += ` [${s.language}]`;
          return str;
        })
        .join(', ');
      console.log(` ${DAY_NAMES[i]}: ${times}`);
    }
    return 'success';
  } catch (error) {
    console.log(`❌ ERROR: ${error instanceof Error ? error.message : String(error)}`);
    return 'failed';
  }
}

/**
 * Runs every sample church sequentially through one shared scraper and
 * prints a summary of how many runs fell into each outcome category.
 */
async function main(): Promise<void> {
  const scraper = new GenericScraper();
  await scraper.init();

  const results: Record<TestOutcome, number> = {
    success: 0,
    failed: 0,
    noSchedules: 0,
  };

  try {
    console.log('🌍 INTERNATIONAL CHURCH SCRAPER TEST');
    console.log(
      `Testing ${testChurches.length} churches across ${new Set(testChurches.map(c => c.country)).size} countries`,
    );

    // Sequential on purpose: the scraper instance is shared and the country
    // is set per church, so runs must not interleave.
    for (const [index, church] of testChurches.entries()) {
      const outcome = await testChurch(church, scraper);
      results[outcome]++;

      // Brief delay between requests to be respectful (not needed after the last one)
      if (index < testChurches.length - 1) {
        await new Promise(resolve => setTimeout(resolve, REQUEST_DELAY_MS));
      }
    }
  } finally {
    // Release the scraper even if the loop above throws unexpectedly.
    await scraper.close();
  }

  console.log(`\n${SEPARATOR}`);
  console.log('📊 SUMMARY');
  console.log(`${SEPARATOR}`);
  console.log(`Total tested: ${testChurches.length}`);
  console.log(`✅ Success with schedules: ${results.success}`);
  console.log(`⚠️ Success but no schedules: ${results.noSchedules}`);
  console.log(`❌ Failed: ${results.failed}`);
}

main().catch((error: unknown) => {
  console.error(error);
  // Signal failure to the calling shell/CI instead of exiting with 0.
  process.exitCode = 1;
});