#!/usr/bin/env tsx
/**
 * Test scraper on a diverse sample of international churches
 * to identify edge cases across different languages and formats
 */
|
||
|
|
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
/**
 * One entry of the international sample: a church website together with the
 * metadata that is printed next to its scrape results.
 */
interface TestChurch {
  /** Display name printed in the report header. */
  name: string;

  /** Page URL handed to the scraper. */
  url: string;

  /** Country code passed to the scraper before scraping (e.g. 'FR', 'DE'). */
  country: string;

  /** Human-readable language of the site, printed in the report header. */
  language: string;

  /** Optional free-form hint of the expected days, e.g. "Sun-Sat" or "Sun, Wed, Sat". */
  expectedDays?: string;

  /** Optional remark about why this site is in the sample; printed when present. */
  notes?: string;
}
|
||
|
|
|
||
|
|
/**
 * Hand-picked sample of church websites, grouped by language, used to exercise
 * the scraper against different schedule formats.
 */
const testChurches: TestChurch[] = [
  // ---- French ----
  {
    name: 'Saint-Étienne du Mont, Paris',
    url: 'https://www.saintetiennedumontparis.fr/',
    country: 'FR',
    language: 'French',
    notes: 'French format with "du lundi au vendredi"',
  },
  {
    name: 'Notre-Dame de la Garde, Marseille',
    url: 'https://www.notredamedelagarde.fr/',
    country: 'FR',
    language: 'French',
    notes: 'Major pilgrimage site',
  },

  // ---- German ----
  {
    name: 'St. Peter, Munich',
    url: 'https://www.alterpeter.de/',
    country: 'DE',
    language: 'German',
    notes: 'German format with "bis" for ranges',
  },
  {
    name: 'Kölner Dom, Cologne',
    url: 'https://www.koelner-dom.de/',
    country: 'DE',
    language: 'German',
    notes: 'Cathedral with Uhr time format',
  },

  // ---- Spanish ----
  {
    name: 'Sagrada Família, Barcelona',
    url: 'https://sagradafamilia.org/',
    country: 'ES',
    language: 'Spanish',
    notes: 'Major tourist site, may have complex schedule',
  },
  {
    name: 'Parroquia San Miguel, Madrid',
    url: 'https://www.parroquiasanmiguel.es/',
    country: 'ES',
    language: 'Spanish',
    notes: 'Spanish format with "de lunes a viernes"',
  },

  // ---- Portuguese ----
  {
    name: 'Basílica da Estrela, Lisbon',
    url: 'https://www.basilicadaestrela.com/',
    country: 'PT',
    language: 'Portuguese',
    notes: 'Portuguese format',
  },

  // ---- Italian ----
  {
    name: 'Santa Maria Maggiore, Rome',
    url: 'https://www.vatican.va/various/basiliche/sm_maggiore/index_it.htm',
    country: 'IT',
    language: 'Italian',
    notes: 'Major basilica',
  },
  {
    name: 'Duomo di Milano',
    url: 'https://www.duomomilano.it/',
    country: 'IT',
    language: 'Italian',
    notes: 'Cathedral with Italian format',
  },

  // ---- Dutch ----
  {
    name: 'Basiliek van de H. Nicolaas, Amsterdam',
    url: 'https://www.nicolaas-parochie.nl/',
    country: 'NL',
    language: 'Dutch',
    notes: 'Dutch format with "tot" for ranges',
  },

  // ---- Czech ----
  {
    name: 'Chrám sv. Víta, Prague',
    url: 'https://www.katedralasvatehovita.cz/',
    country: 'CZ',
    language: 'Czech',
    notes: 'Czech format',
  },

  // ---- Hungarian ----
  {
    name: 'Szent István Bazilika, Budapest',
    url: 'https://www.bazilika.biz/',
    country: 'HU',
    language: 'Hungarian',
    notes: 'Hungarian format',
  },

  // ---- More complex cases ----
  {
    name: 'Cathédrale Notre-Dame, Strasbourg',
    url: 'https://www.cathedrale-strasbourg.fr/',
    country: 'FR',
    language: 'French',
    notes: 'Bilingual region (French/German)',
  },
];
|
||
|
|
|
||
|
|
/**
 * Outcome of probing a single church. The values double as the keys of the
 * summary tally kept by `main`.
 */
type TestOutcome = 'success' | 'failed' | 'noSchedules';

/** Width of the `=` rule printed around each report section. */
const RULE_WIDTH = 80;

/** Pause between consecutive requests, in milliseconds. */
const REQUEST_DELAY_MS = 2000;

/** Display names indexed by `dayOfWeek` as this script reads it (0 = Sunday ... 6 = Saturday). */
const DAY_NAMES = [
  'Sunday',
  'Monday',
  'Tuesday',
  'Wednesday',
  'Thursday',
  'Friday',
  'Saturday',
] as const;

/**
 * Buckets schedule entries by their `dayOfWeek`, preserving the input order
 * inside each bucket.
 *
 * @param schedules Entries to group; not mutated.
 * @returns A map from day number to the entries that fall on that day.
 */
function groupByDay<T extends { dayOfWeek: number }>(schedules: readonly T[]): Map<number, T[]> {
  const grouped = new Map<number, T[]>();
  for (const schedule of schedules) {
    const bucket = grouped.get(schedule.dayOfWeek);
    if (bucket) {
      bucket.push(schedule);
    } else {
      grouped.set(schedule.dayOfWeek, [schedule]);
    }
  }
  return grouped;
}

/**
 * Scrapes one church, prints a per-day report to the console and reports how
 * the attempt went.
 *
 * Never throws: exceptions raised by the scraper are logged and counted as
 * `'failed'`.
 *
 * @param church  Sample entry to probe.
 * @param scraper Scraper to use; its country is set from `church.country` before scraping.
 * @returns `'success'` when at least one schedule was found, `'noSchedules'`
 *          when the scrape succeeded but produced none, `'failed'` otherwise.
 */
async function testChurch(church: TestChurch, scraper: GenericScraper): Promise<TestOutcome> {
  const rule = '='.repeat(RULE_WIDTH);

  console.log(`\n${rule}`);
  console.log(`📍 ${church.name}`);
  console.log(` ${church.url}`);
  console.log(` Language: ${church.language} | Country: ${church.country}`);
  if (church.notes) console.log(` Notes: ${church.notes}`);
  console.log(rule);

  try {
    scraper.setCountry(church.country);
    const result = await scraper.scrape(church.url);

    if (!result.success) {
      // `||` on purpose: an empty error string should also fall back to the placeholder.
      console.log(`❌ FAILED: ${result.error || 'Unknown error'}`);
      return 'failed';
    }

    if (result.schedules.length === 0) {
      console.log(`⚠️ SUCCESS but NO SCHEDULES found`);
      return 'noSchedules';
    }

    const byDay = groupByDay(result.schedules);
    console.log(`\n✅ Found ${result.schedules.length} schedules:\n`);

    // Walk the week in calendar order; entries with a day outside 0-6 are not printed.
    DAY_NAMES.forEach((dayName, day) => {
      const entries = byDay.get(day);
      if (!entries) return;

      const times = entries
        .map(s => {
          let str = s.time;
          if (s.massType) str += ` (${s.massType})`;
          if (s.language && s.language !== 'English') str += ` [${s.language}]`;
          return str;
        })
        .join(', ');
      console.log(` ${dayName}: ${times}`);
    });

    return 'success';
  } catch (error: unknown) {
    console.log(`❌ ERROR: ${error instanceof Error ? error.message : String(error)}`);
    return 'failed';
  }
}

/**
 * Runs the scraper against every entry of `testChurches`, one at a time, and
 * prints a summary of how many succeeded, returned no schedules, or failed.
 *
 * The scraper is closed even if the loop aborts unexpectedly.
 */
async function main(): Promise<void> {
  const scraper = new GenericScraper();
  await scraper.init();

  const results: Record<TestOutcome, number> = {
    success: 0,
    failed: 0,
    noSchedules: 0,
  };

  try {
    console.log('🌍 INTERNATIONAL CHURCH SCRAPER TEST');
    console.log(`Testing ${testChurches.length} churches across ${new Set(testChurches.map(c => c.country)).size} countries`);

    for (const [index, church] of testChurches.entries()) {
      const outcome = await testChurch(church, scraper);
      results[outcome] += 1;

      // Brief delay between requests to be respectful; nothing follows the last one.
      if (index < testChurches.length - 1) {
        await new Promise(resolve => setTimeout(resolve, REQUEST_DELAY_MS));
      }
    }
  } finally {
    await scraper.close();
  }

  const rule = '='.repeat(RULE_WIDTH);
  console.log(`\n${rule}`);
  console.log('📊 SUMMARY');
  console.log(rule);
  console.log(`Total tested: ${testChurches.length}`);
  console.log(`✅ Success with schedules: ${results.success}`);
  console.log(`⚠️ Success but no schedules: ${results.noSchedules}`);
  console.log(`❌ Failed: ${results.failed}`);
}
|
||
|
|
|
||
|
|
// Entry point: report a fatal error and make the process exit non-zero so
// callers (shell scripts, CI) can detect that the run did not complete.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|