chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Albert
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions

View File

@@ -0,0 +1,210 @@
#!/usr/bin/env tsx
/**
* Test scraper on a diverse sample of international churches
* to identify edge cases across different languages and formats
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * One church website to run the scraper against.
 */
interface TestChurch {
  /** Display name, printed in the per-church report header. */
  name: string;
  /** Page handed to the scraper's `scrape()` call. */
  url: string;
  /** Country value handed to the scraper's `setCountry()` call (e.g. 'FR'). */
  country: string;
  /** Human-readable site language, printed in the report header. */
  language: string;
  /**
   * Days on which schedules are expected, e.g. "Sun-Sat" or "Sun, Wed, Sat".
   * Informational only: nothing in this script reads it.
   */
  expectedDays?: string;
  /** Free-form remark printed under the header when present. */
  notes?: string;
}
/** One table row, in column order: name, url, country, language, notes. */
type ChurchRow = readonly [name: string, url: string, country: string, language: string, notes: string];

// Sample churches from different countries/languages, one row per church.
const churchRows: readonly ChurchRow[] = [
  // FRENCH
  ['Saint-Étienne du Mont, Paris', 'https://www.saintetiennedumontparis.fr/', 'FR', 'French', 'French format with "du lundi au vendredi"'],
  ['Notre-Dame de la Garde, Marseille', 'https://www.notredamedelagarde.fr/', 'FR', 'French', 'Major pilgrimage site'],
  // GERMAN
  ['St. Peter, Munich', 'https://www.alterpeter.de/', 'DE', 'German', 'German format with "bis" for ranges'],
  ['Kölner Dom, Cologne', 'https://www.koelner-dom.de/', 'DE', 'German', 'Cathedral with Uhr time format'],
  // SPANISH
  ['Sagrada Família, Barcelona', 'https://sagradafamilia.org/', 'ES', 'Spanish', 'Major tourist site, may have complex schedule'],
  ['Parroquia San Miguel, Madrid', 'https://www.parroquiasanmiguel.es/', 'ES', 'Spanish', 'Spanish format with "de lunes a viernes"'],
  // PORTUGUESE
  ['Basílica da Estrela, Lisbon', 'https://www.basilicadaestrela.com/', 'PT', 'Portuguese', 'Portuguese format'],
  // ITALIAN
  ['Santa Maria Maggiore, Rome', 'https://www.vatican.va/various/basiliche/sm_maggiore/index_it.htm', 'IT', 'Italian', 'Major basilica'],
  ['Duomo di Milano', 'https://www.duomomilano.it/', 'IT', 'Italian', 'Cathedral with Italian format'],
  // DUTCH
  ['Basiliek van de H. Nicolaas, Amsterdam', 'https://www.nicolaas-parochie.nl/', 'NL', 'Dutch', 'Dutch format with "tot" for ranges'],
  // CZECH
  ['Chrám sv. Víta, Prague', 'https://www.katedralasvatehovita.cz/', 'CZ', 'Czech', 'Czech format'],
  // HUNGARIAN
  ['Szent István Bazilika, Budapest', 'https://www.bazilika.biz/', 'HU', 'Hungarian', 'Hungarian format'],
  // More complex cases
  ['Cathédrale Notre-Dame, Strasbourg', 'https://www.cathedrale-strasbourg.fr/', 'FR', 'French', 'Bilingual region (French/German)'],
];

/** Churches exercised by this script, expanded from the row table in the same order. */
const testChurches: TestChurch[] = churchRows.map(
  ([name, url, country, language, notes]): TestChurch => ({ name, url, country, language, notes }),
);
/** How a single church scrape turned out; used as the tally key in `main`. */
type ChurchTestOutcome = 'success' | 'noSchedules' | 'failed';

/**
 * Scrapes one church and prints a human-readable report to stdout.
 *
 * Never throws: an exception from the scraper is caught, logged, and reported
 * as a `'failed'` outcome so one bad site cannot abort the whole run.
 *
 * @param church - Church to test; `church.country` is passed to
 *   `scraper.setCountry()` and `church.url` to `scraper.scrape()`.
 * @param scraper - Scraper instance to use for this church.
 * @returns `'failed'` when the scrape result has a falsy `success` or the
 *   scraper throws, `'noSchedules'` when the scrape succeeds with an empty
 *   schedule list, and `'success'` otherwise.
 */
async function testChurch(church: TestChurch, scraper: GenericScraper): Promise<ChurchTestOutcome> {
  const rule = '='.repeat(80);
  console.log(`\n${rule}`);
  console.log(`📍 ${church.name}`);
  console.log(` ${church.url}`);
  console.log(` Language: ${church.language} | Country: ${church.country}`);
  if (church.notes) console.log(` Notes: ${church.notes}`);
  console.log(rule);

  try {
    scraper.setCountry(church.country);
    const result = await scraper.scrape(church.url);

    if (!result.success) {
      console.log(`❌ FAILED: ${result.error || 'Unknown error'}`);
      return 'failed';
    }
    if (result.schedules.length === 0) {
      console.log(`⚠️ SUCCESS but NO SCHEDULES found`);
      return 'noSchedules';
    }

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    // Index i is matched against `dayOfWeek`, so 0 is printed as Sunday.
    // Entries whose `dayOfWeek` falls outside 0-6 are counted in the total
    // below but not listed under any day.
    const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
    console.log(`\n✅ Found ${result.schedules.length} schedules:\n`);
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        const times = byDay[i].map(s => {
          let str = s.time;
          if (s.massType) str += ` (${s.massType})`;
          // The language tag is omitted when it is exactly 'English'.
          if (s.language && s.language !== 'English') str += ` [${s.language}]`;
          return str;
        }).join(', ');
        console.log(` ${dayNames[i]}: ${times}`);
      }
    }
    return 'success';
  } catch (error) {
    console.log(`❌ ERROR: ${error instanceof Error ? error.message : String(error)}`);
    return 'failed';
  }
}

/**
 * Runs the scraper against every entry in `testChurches`, one at a time, and
 * prints a summary of how many succeeded, returned no schedules, or failed.
 *
 * The scraper is closed even if the loop exits with an exception.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  const results: Record<ChurchTestOutcome, number> = {
    success: 0,
    failed: 0,
    noSchedules: 0,
  };

  try {
    console.log('🌍 INTERNATIONAL CHURCH SCRAPER TEST');
    console.log(`Testing ${testChurches.length} churches across ${new Set(testChurches.map(c => c.country)).size} countries`);

    for (const [index, church] of testChurches.entries()) {
      const outcome = await testChurch(church, scraper);
      results[outcome] += 1;

      // Brief delay between requests to be respectful; nothing follows the
      // last church, so there is no point sleeping after it.
      if (index < testChurches.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 2000));
      }
    }
  } finally {
    // Only reached after a successful init(), so there is something to close.
    await scraper.close();
  }

  console.log(`\n${'='.repeat(80)}`);
  console.log('📊 SUMMARY');
  console.log(`${'='.repeat(80)}`);
  console.log(`Total tested: ${testChurches.length}`);
  console.log(`✅ Success with schedules: ${results.success}`);
  console.log(`⚠️ Success but no schedules: ${results.noSchedules}`);
  console.log(`❌ Failed: ${results.failed}`);
}
// Script entry point. Log any failure that escapes main() and mark the process
// as failed, so shells and CI see a non-zero exit status instead of success.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});