#!/usr/bin/env tsx
/**
 * Scrape diocese directories to discover parish URLs and mass schedules
 *
 * Usage:
 *   npx tsx scripts/scrape-diocese-directory.ts --diocese <id>      # Single diocese
 *   npx tsx scripts/scrape-diocese-directory.ts --country DE        # All dioceses in country
 *   npx tsx scripts/scrape-diocese-directory.ts --all               # All active dioceses
 *   npx tsx scripts/scrape-diocese-directory.ts --all --dry-run     # Preview only
 *   npx tsx scripts/scrape-diocese-directory.ts --job-id <id>       # Resume tracked job
 */

import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

/** Write a timestamped informational line to stdout. */
function log(msg: string): void {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}

/** Write a timestamped error line to stderr. */
function logError(msg: string): void {
  console.error(`[${new Date().toISOString()}] ERROR: ${msg}`);
}

/** Extract a printable message from a value caught in a `catch` clause. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Read the value that follows a command-line flag.
 *
 * @param args Command-line arguments (without node/script entries).
 * @param flag Flag to look up, e.g. `--diocese`.
 * @returns The value after the flag, or `undefined` when the flag is absent.
 * @throws Error when the flag is present but no value follows it (end of
 *         arguments, or the next token is another `--flag`).
 */
function getFlagValue(args: string[], flag: string): string | undefined {
  const idx = args.indexOf(flag);
  if (idx === -1) return undefined;
  const value = args[idx + 1];
  if (value === undefined || value.startsWith('--')) {
    throw new Error(`Missing value for ${flag}`);
  }
  return value;
}

/**
 * Haversine great-circle distance in km between two lat/lon points given in
 * degrees (Earth radius taken as 6371 km).
 *
 * NOTE(review): not called anywhere in the visible part of this script.
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const R = 6371;
  const dLat = (lat2 - lat1) * Math.PI / 180;
  const dLon = (lon2 - lon1) * Math.PI / 180;
  const a = Math.sin(dLat / 2) ** 2 +
    Math.cos(lat1 * Math.PI / 180) * Math.cos(lat2 * Math.PI / 180) *
    Math.sin(dLon / 2) ** 2;
  return R * 2 * Math.asin(Math.sqrt(a));
}

/**
 * Normalize a name for fuzzy comparison: lowercase, strip combining
 * diacritics, drop everything except a-z, 0-9 and whitespace, collapse
 * whitespace runs to a single space and trim.
 */
function normalizeForMatch(str: string): string {
  return str.toLowerCase()
    .normalize('NFD').replace(/[\u0300-\u036f]/g, '')
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

interface MatchCandidate {
  id: string;
  name: string;
  latitude: number;
  longitude: number;
  distance: number;
  nameScore: number;
}

/**
 * Find the existing church whose name best overlaps with a scraped parish name.
 *
 * Candidates are churches in `country` (optionally narrowed to those whose
 * city contains `city`, case-insensitively). The score is the fraction of the
 * parish's significant words (>= 3 characters after normalization) that also
 * appear in the church name; candidates below 0.4 are rejected.
 *
 * @param name    Parish name from the directory.
 * @param address Parish address; currently not used for matching.
 * @param city    Optional city used to narrow the candidate query.
 * @param country Country value the church rows must equal.
 * @returns The best-scoring candidate (first one wins on ties), or `null`
 *          when the name has no significant words or nothing reaches 0.4.
 *          `distance` is always 0 because no coordinates are compared.
 */
async function findMatchingChurch(
  name: string,
  address: string | undefined,
  city: string | undefined,
  country: string,
): Promise<MatchCandidate | null> {
  const nameWords = normalizeForMatch(name).split(' ').filter(w => w.length >= 3);
  if (nameWords.length === 0) return null;

  // NOTE(review): the query is capped at 50 rows with no ordering, so when
  // `city` is missing only an arbitrary slice of the country is considered.
  const candidates = await prisma.church.findMany({
    where: {
      country,
      ...(city ? { city: { contains: city, mode: 'insensitive' } } : {}),
    },
    select: { id: true, name: true, latitude: true, longitude: true },
    take: 50,
  });

  let bestMatch: MatchCandidate | null = null;

  for (const church of candidates) {
    const churchWords = new Set(
      normalizeForMatch(church.name).split(' ').filter(w => w.length >= 3),
    );

    let matchingWords = 0;
    for (const w of nameWords) {
      if (churchWords.has(w)) matchingWords++;
    }
    const nameScore = matchingWords / nameWords.length;

    // Require at least 40% word overlap
    if (nameScore < 0.4) continue;

    if (!bestMatch || nameScore > bestMatch.nameScore) {
      bestMatch = {
        id: church.id,
        name: church.name,
        latitude: church.latitude,
        longitude: church.longitude,
        distance: 0,
        nameScore,
      };
      // A perfect score cannot be beaten (only strictly higher scores replace
      // the current best), so stop scanning.
      if (nameScore === 1) break;
    }
  }

  return bestMatch;
}

// --- Job Tracking ---

/**
 * Resume a tracked background job when `--job-id <id>` is present.
 *
 * Marks the job as `running` with a fresh `startedAt`.
 *
 * @param args Command-line arguments.
 * @returns The resumed job id, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` is present without a value; errors from the
 *         database update are propagated.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobId = getFlagValue(args, '--job-id');
  if (jobId === undefined) return null;

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}

/**
 * Scrape one diocese directory and reconcile its parishes with the database.
 *
 * Dioceses without a directory URL or without scrape selectors are skipped.
 * For each discovered parish a matching church is looked up; on a match (and
 * outside dry-run) the church is linked to the diocese, its website is
 * updated and its mass schedules are replaced. Scrape failures are counted in
 * `stats.errors` and recorded on the diocese row instead of being rethrown.
 *
 * @param dioceseId Primary key of the diocese to scrape.
 * @param dryRun    When true, nothing is written to the database.
 * @param stats     Running counters, mutated in place.
 */
async function scrapeDiocese(
  dioceseId: string,
  dryRun: boolean,
  stats: { processed: number; matched: number; created: number; schedules: number; errors: number }
): Promise<void> {
  const diocese = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!diocese) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }

  if (!diocese.directoryUrl) {
    log(`  Skipping ${diocese.name}: no directory URL`);
    return;
  }

  const config = diocese.scrapeConfig as DioceseScrapeConfig | null;
  if (!config?.selectors) {
    log(`  Skipping ${diocese.name}: no scrape config`);
    return;
  }

  log(`Scraping diocese: ${diocese.name} (${diocese.country})`);
  log(`  Directory URL: ${diocese.directoryUrl}`);

  const scraper = new DioceseDirectoryScraper();

  try {
    let parishes;

    if (config.scheduleInDirectory) {
      parishes = await scraper.scrapeDirectoryWithSchedules(
        diocese.directoryUrl,
        config,
        diocese.language
      );
    } else {
      // Directory has no schedules: give every parish an empty schedule list
      // so both branches produce the same shape.
      const discovered = await scraper.scrapeDirectory(diocese.directoryUrl, config);
      parishes = discovered.map(p => ({
        ...p,
        scheduleText: '',
        schedules: [] as Array<{ dayOfWeek: number; time: string; massType?: string; language?: string; notes?: string }>,
      }));
    }

    log(`  Discovered ${parishes.length} parishes`);

    for (const parish of parishes) {
      stats.processed++;

      // Try to match to existing church
      const match = await findMatchingChurch(
        parish.name,
        parish.address,
        parish.city,
        diocese.country,
      );

      if (match) {
        stats.matched++;
        log(`  Match: "${parish.name}" -> "${match.name}" (score: ${match.nameScore.toFixed(2)})`);

        if (!dryRun) {
          // Link the church to the diocese. Only touch the website fields when
          // the directory actually supplied a URL, so a parish without one
          // does not flag the church as having a website.
          await prisma.church.update({
            where: { id: match.id },
            data: {
              dioceseId: diocese.id,
              ...(parish.url ? { website: parish.url, hasWebsite: true } : {}),
            },
          });

          // Save schedules if available
          if ('schedules' in parish && parish.schedules.length > 0) {
            // Replace atomically: if the insert fails, the delete is rolled
            // back and the church keeps its previous schedules.
            await prisma.$transaction([
              prisma.massSchedule.deleteMany({ where: { churchId: match.id } }),
              prisma.massSchedule.createMany({
                data: parish.schedules.map(s => ({
                  churchId: match.id,
                  dayOfWeek: s.dayOfWeek,
                  time: s.time,
                  massType: s.massType,
                  // NOTE(review): default is hard-coded even for non-English
                  // dioceses; confirm whether diocese.language should apply.
                  language: s.language ?? 'English',
                  notes: s.notes,
                })),
              }),
            ]);
            stats.schedules += parish.schedules.length;
          }
        }
      } else {
        log(`  No match: "${parish.name}" (${parish.city || 'no city'})`);
        // NOTE(review): `created` counts unmatched parishes; nothing is
        // actually created here.
        stats.created++;

        // In non-dry-run, we could create new churches, but for safety
        // we only log unmatched parishes for manual review
        // (Creating churches from directory data without coordinates is risky)
      }
    }

    // Update diocese tracking
    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastSuccessAt: new Date(),
          churchCount: parishes.length,
          failureCount: 0,
        },
      });
    }
  } catch (err: unknown) {
    stats.errors++;
    logError(`  Failed to scrape ${diocese.name}: ${errorMessage(err)}`);

    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastFailureAt: new Date(),
          failureCount: { increment: 1 },
        },
      });
    }
  } finally {
    await scraper.close();
  }
}

/**
 * CLI entry point: parse flags, create or resume the tracking job, scrape the
 * selected dioceses one by one, then record the outcome and print a summary.
 *
 * Database connections are released whether the run succeeds or fails.
 *
 * @throws Error on a flag with a missing value, or any fatal error raised
 *         while scraping (after the job has been marked `failed`).
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const all = args.includes('--all');

  try {
    // A flag given without its value is rejected instead of silently widening
    // the run to every active diocese.
    const dioceseId = getFlagValue(args, '--diocese');
    const country = getFlagValue(args, '--country');

    log('============================================================');
    log('Diocese Directory Scraper');
    log('============================================================');
    log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
    log(`Target: ${dioceseId ? `Diocese ${dioceseId}` : country ? `Country ${country}` : 'All active'}`);
    log('============================================================');

    // Job tracking
    let jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      const job = await prisma.backgroundJob.create({
        data: {
          type: 'diocese-directory',
          status: 'running',
          startedAt: new Date(),
          config: { dioceseId, country, all, dryRun },
        },
      });
      jobId = job.id;
      log(`Job ID: ${jobId}`);
    }

    const stats = { processed: 0, matched: 0, created: 0, schedules: 0, errors: 0 };

    try {
      let dioceses: Array<{ id: string }>;

      if (dioceseId) {
        dioceses = [{ id: dioceseId }];
      } else {
        dioceses = await prisma.diocese.findMany({
          where: {
            active: true,
            directoryUrl: { not: null },
            ...(country ? { country } : {}),
          },
          select: { id: true, name: true },
          orderBy: { name: 'asc' },
        });
      }

      log(`Found ${dioceses.length} dioceses to scrape`);

      for (const d of dioceses) {
        await scrapeDiocese(d.id, dryRun, stats);

        // Persist progress and check for job stop
        if (jobId) {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: { processed: stats.processed, succeeded: stats.matched, itemsFound: stats.matched },
          });
          const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
          if (job?.status === 'stopping') {
            log('Job stop requested.');
            break;
          }
        }
      }
    } catch (error: unknown) {
      const message = errorMessage(error);
      logError(`Fatal error: ${message}`);
      if (jobId) {
        await prisma.backgroundJob.update({
          where: { id: jobId },
          data: { status: 'failed', error: message, completedAt: new Date() },
        });
      }
      throw error;
    }

    // Complete job
    // NOTE(review): a job interrupted via 'stopping' is also recorded as
    // 'completed' here; confirm whether a distinct final status is expected.
    if (jobId) {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: 'completed',
          completedAt: new Date(),
          processed: stats.processed,
          succeeded: stats.matched,
          itemsFound: stats.matched,
        },
      });
    }

    log('');
    log('============================================================');
    log('Diocese Directory Scraper Summary');
    log('============================================================');
    log(`Parishes discovered: ${stats.processed}`);
    log(`Matched to DB: ${stats.matched}`);
    log(`Unmatched (new): ${stats.created}`);
    log(`Schedules saved: ${stats.schedules}`);
    log(`Errors: ${stats.errors}`);
    log('============================================================');
  } finally {
    // Release connections on both the success and the failure path.
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((error: unknown) => {
  logError(`Fatal error: ${errorMessage(error)}`);
  process.exit(1);
});