#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from kerknet.be (Flanders, Belgium)
 *
 * Kerknet is the portal of the Catholic Church in Flanders (Dutch-speaking Belgium).
 * It has ~1,200 churches with structured data: name, address, coordinates (GeoJSON),
 * and date-specific celebration entries.
 *
 * Import strategy:
 * 1. Enumerate unique church slugs by paginating the celebration listing
 * 2. Scrape each /kerk/{slug} page for structured data (name, address, coords, nodeId)
 * 3. Fetch celebrations via AJAX endpoint per church
 * 4. Deduce recurring weekly schedules from date-specific celebrations
 * 5. Match against existing Belgian churches via church-matcher
 * 6. Upsert churches and mass schedules
 *
 * Usage:
 *   npx tsx scripts/import-kerknet.ts --all --dry-run
 *   npx tsx scripts/import-kerknet.ts --all
 *   npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run
 *   npx tsx scripts/import-kerknet.ts --all --resume-from 100
 */

import dotenv from 'dotenv';
import path from 'path';

// Load environment files before anything reads process.env (.env.local is loaded first).
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Mask the password portion of the URL before logging it.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);

const pool = new Pool({
  connectionString: dbUrl,
  // Certificate verification is relaxed only when the URL contains "neon".
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';

// ─── Constants ───────────────────────────────────────────────────────────────

const BASE_URL = 'https://www.kerknet.be';
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
const ENUM_DELAY_MS = 2000; // Delay between listing pages (respecting crawl-delay spirit)
const DETAIL_DELAY_MS = 3000; // Delay between church detail page fetches
const CELEBRATION_DELAY_MS = 2000; // Delay between celebration AJAX calls
const MAX_RETRIES = 3;
const RETRY_DELAY_MS = 10000;
const MAX_ENUM_PAGES = 2804; // Total celebration listing pages
const ENUM_SAMPLE_INTERVAL = 5; // Check every Nth page (5 → ~560 pages to check)
const STALE_THRESHOLD = 10; // Stop if N consecutive sampled pages yield no new slugs

// Dutch day abbreviations → dayOfWeek (0=Sun, 1=Mon, ..., 6=Sat)
const DUTCH_DAYS: Record<string, number> = {
  'zo': 0,
  'ma': 1,
  'di': 2,
  'wo': 3,
  'do': 4,
  'vr': 5,
  'za': 6,
};

// ─── Types ───────────────────────────────────────────────────────────────────

/** Structured data scraped from a single /kerk/{slug} page. */
interface ChurchData {
  slug: string;
  nodeId: string;
  name: string;
  address: string | null;
  zip: string | null;
  city: string | null;
  latitude: number;
  longitude: number;
  website: string | null;
}

/** One date-specific celebration as it appears in the AJAX response. */
interface CelebrationEntry {
  dayAbbrev: string;
  date: string; // DD/MM
  time: string; // HH.MM or HH:MM
  type: string; // Eucharistie, Gebedsdienst, etc.
}

/** A recurring weekly mass slot deduced from celebration entries. */
interface ParsedSchedule {
  dayOfWeek: number;
  time: string;
}

/** Counters accumulated over one import run and printed in the summary. */
interface ImportStats {
  slugsEnumerated: number;
  churchesFetched: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  schedulesCreated: number;
  errors: number;
}

/** Parsed command-line options. */
interface CLIArgs {
  all: boolean;
  dryRun: boolean;
  resumeFrom?: number;
  slug?: string;
  jobId?: string;
}

// ─── HTTP Helpers ────────────────────────────────────────────────────────────

// Number of fetchPage() calls made so far (retries of the same call are not counted).
let requestCount = 0;

/** Resolve after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Fetch a URL as text with politeness delay and bounded retries.
 *
 * Every call except the very first one of the process waits `delayMs` before
 * issuing the request. Failed attempts are retried up to MAX_RETRIES times with
 * RETRY_DELAY_MS between attempts.
 *
 * @param url - Absolute URL to fetch.
 * @param delayMs - Milliseconds to wait before the request.
 * @returns The response body, or `null` when the page could not be fetched.
 */
async function fetchPage(url: string, delayMs: number): Promise<string | null> {
  if (requestCount > 0) {
    await delay(delayMs);
  }
  requestCount++;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    const isLastAttempt = attempt === MAX_RETRIES;

    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': USER_AGENT },
      });

      if (response.ok) {
        return await response.text();
      }

      // A missing or removed page will not appear on retry; do not burn
      // two further back-off delays on it.
      if (response.status === 404 || response.status === 410) {
        console.log(` HTTP ${response.status} — not retrying ${url}`);
        return null;
      }

      if (isLastAttempt) {
        console.error(` HTTP ${response.status} after ${MAX_RETRIES} attempts: ${url}`);
        return null;
      }

      if (response.status === 429 || response.status === 503) {
        console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
      }
      await delay(RETRY_DELAY_MS);
    } catch (error) {
      if (isLastAttempt) {
        console.error(` Fetch failed after ${MAX_RETRIES} attempts: ${error instanceof Error ? error.message : error}`);
        return null;
      }
      console.log(` Network error — retrying (attempt ${attempt}/${MAX_RETRIES})`);
      await delay(RETRY_DELAY_MS);
    }
  }

  return null;
}

// ─── Phase 1: Enumerate Church Slugs ─────────────────────────────────────────

/**
 * Collect unique church slugs by sampling the celebration listing pages.
 *
 * Only every ENUM_SAMPLE_INTERVAL-th page is requested. Enumeration stops early
 * once STALE_THRESHOLD consecutive sampled pages either failed to load or
 * contributed no new slug.
 *
 * @returns The unique slugs, sorted.
 */
async function enumerateChurchSlugs(): Promise<string[]> {
  console.log('\nPhase 1: Enumerating church slugs from celebration listings...');

  const slugs = new Set<string>();
  let consecutiveEmpty = 0;

  for (let page = 0; page < MAX_ENUM_PAGES; page += ENUM_SAMPLE_INTERVAL) {
    const url = `${BASE_URL}/zoeken/vieringen/lijst?page=${page}`;
    const html = await fetchPage(url, ENUM_DELAY_MS);

    if (!html) {
      consecutiveEmpty++;
      if (consecutiveEmpty >= STALE_THRESHOLD) {
        console.log(` Stopping enumeration: ${STALE_THRESHOLD} consecutive empty pages`);
        break;
      }
      continue;
    }

    // Extract /kerk/{slug} links (church building pages, NOT org pages like /kerk-jette/artikel/)
    const prevSize = slugs.size;
    for (const link of html.matchAll(/href="\/kerk\/([^"/]+)"/g)) {
      slugs.add(link[1]);
    }
    const newCount = slugs.size - prevSize;

    if (newCount === 0) {
      consecutiveEmpty++;
    } else {
      consecutiveEmpty = 0;
    }

    if (page % 50 === 0 || newCount > 0) {
      console.log(` Page ${page}: ${slugs.size} unique churches found (+${newCount})`);
    }

    if (consecutiveEmpty >= STALE_THRESHOLD) {
      console.log(` Stopping enumeration: ${STALE_THRESHOLD} consecutive sampled pages with no new churches`);
      break;
    }
  }

  console.log(` Enumeration complete: ${slugs.size} unique church slugs found\n`);
  return Array.from(slugs).sort();
}

// ─── Phase 2: Scrape Church Detail Page ──────────────────────────────────────

/**
 * Extract church data from the HTML of a /kerk/{slug} page.
 *
 * @param html - Raw page HTML.
 * @param slug - Slug the page was fetched for; also the fallback for node ID and name.
 * @returns The parsed church, or `null` when no usable coordinates are present.
 */
function parseChurchPage(html: string, slug: string): ChurchData | null {
  // Extract coordinates from GeoJSON in Drupal settings
  const coordMatch = html.match(/"coordinates":\[(-?[\d.]+),(-?[\d.]+)\]/);
  if (!coordMatch) return null; // No coordinates = unusable

  // The first captured number is stored as longitude, the second as latitude.
  const longitude = parseFloat(coordMatch[1]);
  const latitude = parseFloat(coordMatch[2]);
  // The pattern also accepts strings such as "." that do not parse to a number.
  if (!Number.isFinite(latitude) || !Number.isFinite(longitude)) return null;
  if (latitude === 0 && longitude === 0) return null;

  // Extract node ID
  const nidMatch = html.match(/"currentNid":"(\d+)"/);
  const nodeId = nidMatch ? nidMatch[1] : slug;

  // Extract name from GeoJSON description or page title
  let name = slug;
  const descMatch = html.match(/"description":"([^"]+)"/);
  if (descMatch) {
    name = descMatch[1];
  } else {
    // Take the title text up to the first "|" or tag.
    const titleMatch = html.match(/<title>([^|<]+)/);
    if (titleMatch) name = titleMatch[1].trim();
  }

  // Extract address fields
  const streetMatch = html.match(/class="thoroughfare">([^<]+)</);
  const zipMatch = html.match(/class="postal-code">([^<]+)</);
  const cityMatch = html.match(/class="locality">([^<]+)</);

  const address = streetMatch ? streetMatch[1].trim() : null;
  const zip = zipMatch ? zipMatch[1].trim() : null;
  const city = cityMatch ? cityMatch[1].trim() : null;

  // Extract website
  let website: string | null = null;
  const websiteMatch = html.match(/class="website"[^>]*>.*?href="([^"]+)"/s);
  if (websiteMatch) {
    website = websiteMatch[1];
  } else {
    // Try field-name-kn-website pattern
    const knWebsiteMatch = html.match(/field-name-kn-website.*?href="([^"]+)"/s);
    if (knWebsiteMatch) website = knWebsiteMatch[1];
  }

  return { slug, nodeId, name, address, zip, city, latitude, longitude, website };
}

// ─── Phase 3: Parse Celebrations ─────────────────────────────────────────────

/**
 * Parse the celebration blocks out of the AJAX response HTML.
 *
 * Blocks without a day abbreviation or without a time are dropped. A missing
 * date becomes an empty string and a missing type defaults to 'eucharistie'.
 *
 * @param html - HTML returned by the celebrations endpoint.
 * @returns One entry per usable celebration block, in document order.
 */
function parseCelebrations(html: string): CelebrationEntry[] {
  const entries: CelebrationEntry[] = [];

  // Match celebration blocks; the text before the first marker is not a block.
  const celebBlocks = html.split('<div class="celebration">').slice(1);

  for (const block of celebBlocks) {
    // Extract day abbreviation
    const dayMatch = block.match(/celebration__date__day">\s*(\w+)\s*</);
    if (!dayMatch) continue;

    // Extract date (DD/MM)
    const dateMatch = block.match(/celebration__date__date">\s*([\d/]+)\s*</);

    // Extract time (HH.MM)
    const timeMatch = block.match(/celebration__time">\s*([\d.]+)\s*</);
    if (!timeMatch) continue;

    // Extract type
    const typeMatch = block.match(/celebration__info__type">\s*([^<]+)\s*</);

    entries.push({
      dayAbbrev: dayMatch[1].toLowerCase().trim(),
      date: dateMatch ? dateMatch[1].trim() : '',
      time: timeMatch[1].trim(),
      type: typeMatch ? typeMatch[1].trim().toLowerCase() : 'eucharistie',
    });
  }

  return entries;
}

/**
 * Reduce date-specific celebrations to unique (dayOfWeek, time) mass slots.
 *
 * Only entries typed 'eucharistie' or 'eucharistieviering' are considered, and
 * entries whose day abbreviation is not in DUTCH_DAYS are ignored.
 *
 * @param celebrations - Entries produced by parseCelebrations.
 * @returns Unique slots in order of first appearance, with times as "HH:MM".
 */
function deduceSchedules(celebrations: CelebrationEntry[]): ParsedSchedule[] {
  // Only keep Eucharistie (mass) entries
  const masses = celebrations.filter(
    (c) => c.type === 'eucharistie' || c.type === 'eucharistieviering',
  );

  const seen = new Set<string>();
  const schedules: ParsedSchedule[] = [];

  for (const mass of masses) {
    const dayOfWeek = DUTCH_DAYS[mass.dayAbbrev];
    if (dayOfWeek === undefined) continue;

    // Normalize time: "15.00" → "15:00", and left-pad a single-digit hour.
    const time = mass.time.replace('.', ':').replace(/^(\d):/, '0$1:');

    const key = `${dayOfWeek}:${time}`;
    if (!seen.has(key)) {
      seen.add(key);
      schedules.push({ dayOfWeek, time });
    }
  }

  return schedules;
}

// ─── Database Operations ─────────────────────────────────────────────────────

/**
 * Load all churches with country 'BE', selecting the fields passed on to
 * findDuplicateChurch.
 */
async function loadExistingBelgianChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Belgian churches for deduplication...');

  const churches = await prisma.church.findMany({
    where: { country: 'BE' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${churches.length} existing Belgian churches`);
  return churches;
}

// ─── Import Logic ────────────────────────────────────────────────────────────

/**
 * Scrape one church and upsert it together with its deduced mass schedules.
 *
 * In dry-run mode only the counters are updated. Otherwise a matched church gets
 * its kerknetId set (plus address/website when it had none) and its schedules
 * replaced, while an unmatched church is created and appended to
 * `existingChurches` so later candidates can match against it.
 *
 * @param slug - Church slug on kerknet.be.
 * @param existingChurches - Deduplication pool; mutated when a church is created.
 * @param dryRun - When true, no database writes are performed.
 * @param stats - Run counters; mutated in place.
 */
async function processChurch(
  slug: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Fetch church detail page
  const churchHtml = await fetchPage(`${BASE_URL}/kerk/${slug}`, DETAIL_DELAY_MS);
  if (!churchHtml) {
    stats.errors++;
    return;
  }

  const church = parseChurchPage(churchHtml, slug);
  if (!church) {
    stats.churchesSkipped++;
    return;
  }
  stats.churchesFetched++;

  // Fetch celebrations via AJAX; a failed fetch leaves the church without schedules.
  let celebrations: CelebrationEntry[] = [];
  const celebHtml = await fetchPage(
    `${BASE_URL}/kerknet-celebration/churches/ajax/load-more/0/${church.nodeId}`,
    CELEBRATION_DELAY_MS,
  );
  if (celebHtml) {
    celebrations = parseCelebrations(celebHtml);
  }

  const schedules = deduceSchedules(celebrations);
  const kerknetId = `kerknet-${church.nodeId}`;

  const candidate = {
    name: church.name,
    lat: church.latitude,
    lng: church.longitude,
    kerknetId,
  };
  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
    } else {
      stats.churchesCreated++;
    }
    stats.schedulesCreated += schedules.length;
    return;
  }

  if (duplicate) {
    stats.churchesMatched++;

    // Only fill in address/website when the existing record has none.
    const updateData: Record<string, unknown> = { kerknetId };
    if (!duplicate.address && church.address) updateData.address = church.address;
    if (!duplicate.website && church.website) updateData.website = church.website;

    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      // NOTE(review): presumably another row already holds this kerknetId — confirm against the schema.
      if (error instanceof Error && error.message.includes('Unique constraint')) {
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }

    if (schedules.length > 0) {
      try {
        // Replace the church's schedules atomically so a failure leaves the old ones intact.
        await prisma.$transaction(async (tx) => {
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: duplicate.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'Dutch',
            })),
          });
          await tx.church.update({
            where: { id: duplicate.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(` Error saving schedules for ${slug}: ${error instanceof Error ? error.message : error}`);
      }
    }
  } else {
    try {
      const newChurch = await prisma.church.create({
        data: {
          name: church.name,
          latitude: church.latitude,
          longitude: church.longitude,
          address: church.address,
          zip: church.zip,
          city: church.city,
          country: 'BE',
          website: church.website,
          hasWebsite: !!church.website,
          kerknetId,
          source: 'kerknet',
          websiteLanguage: 'nl',
        },
      });
      stats.churchesCreated++;

      // Make the new church visible to deduplication for the rest of this run.
      existingChurches.push({
        id: newChurch.id,
        name: church.name,
        latitude: church.latitude,
        longitude: church.longitude,
        osmId: null,
        baiduId: null,
        masstimesId: null,
        orarimesseId: null,
        massSchedulesPhId: null,
        philmassId: null,
        horariosMisasId: null,
        mszeInfoId: null,
        weekdayMassesId: null,
        messesInfoId: null,
        bohosluzbyId: null,
        miserendId: null,
        kerknetId,
        gottesdienstzeitenId: null,
        discovermassId: null,
        source: 'kerknet',
        website: church.website,
        phone: null,
        address: church.address,
      });

      if (schedules.length > 0) {
        await prisma.massSchedule.createMany({
          data: schedules.map((s) => ({
            churchId: newChurch.id,
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            language: 'Dutch',
          })),
        });
        await prisma.church.update({
          where: { id: newChurch.id },
          data: { lastScrapedAt: new Date() },
        });
        stats.schedulesCreated += schedules.length;
      }
    } catch (error) {
      if (error instanceof Error && error.message.includes('Unique constraint')) {
        stats.churchesSkipped++;
        return;
      }
      stats.errors++;
      console.error(` Error creating ${slug}: ${error instanceof Error ? error.message : error}`);
    }
  }
}

// ─── CLI ─────────────────────────────────────────────────────────────────────

/**
 * Parse process.argv into CLIArgs.
 *
 * Prints usage and exits with status 0 for --help/-h. Exits with status 1 when
 * neither --all nor --slug is given, or when --resume-from is not a
 * non-negative integer.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        const value = Number.parseInt(raw ?? '', 10);
        // Reject bad input loudly instead of silently importing from index 0.
        if (Number.isNaN(value) || value < 0) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw ?? ''}"`);
          process.exit(1);
        }
        result.resumeFrom = value;
        break;
      }
      case '--slug':
        result.slug = args[++i];
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-kerknet.ts [options]

Options:
  --all               Import all churches from kerknet.be
  --slug <slug>       Import a single church (e.g., o-l-vrouw-kerk-scherpenheuvel)
  --dry-run           No database writes, just report what would happen
  --resume-from <n>   Skip first N churches (after enumeration)
  --job-id <uuid>     Background job tracking ID
  --help, -h          Show this help message

Examples:
  npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run
  npx tsx scripts/import-kerknet.ts --all --dry-run
  npx tsx scripts/import-kerknet.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.slug) {
    console.error('Error: specify --all or --slug <slug>');
    process.exit(1);
  }

  return result;
}

/** Format a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs". */
function formatDuration(ms: number): string {
  const seconds = Math.floor(ms / 1000);
  const minutes = Math.floor(seconds / 60);
  const hours = Math.floor(minutes / 60);
  if (hours > 0) return `${hours}h ${minutes % 60}m ${seconds % 60}s`;
  if (minutes > 0) return `${minutes}m ${seconds % 60}s`;
  return `${seconds}s`;
}

// ─── Main ────────────────────────────────────────────────────────────────────

/**
 * Run the import: resolve the slug list, process each church sequentially,
 * print a summary and, when --job-id was given, update the background job row.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('KERKNET.BE (BELGIUM/FLANDERS) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.slug ? `Single: ${args.slug}` : 'All churches'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      /* Job might not exist */
    }
  }

  const stats: ImportStats = {
    slugsEnumerated: 0,
    churchesFetched: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  const existingChurches = await loadExistingBelgianChurches();

  // Get list of church slugs
  let slugs: string[];
  if (args.slug) {
    slugs = [args.slug];
  } else {
    slugs = await enumerateChurchSlugs();
    stats.slugsEnumerated = slugs.length;
  }

  // --resume-from only applies to the enumerated list, not to a single --slug.
  if (args.resumeFrom && !args.slug) {
    slugs = slugs.slice(args.resumeFrom);
    console.log(`Resuming from church index ${args.resumeFrom} (${slugs[0]})\n`);
  }

  console.log(`Processing ${slugs.length} churches\n`);

  for (let i = 0; i < slugs.length; i++) {
    const slug = slugs[i];
    const elapsed = formatDuration(Date.now() - startTime);

    // Log every 50th church, or every church for small runs.
    if (i % 50 === 0 || slugs.length <= 10) {
      console.log(`[${i + 1}/${slugs.length}] ${slug} [${elapsed} elapsed, ${stats.churchesCreated} new, ${stats.churchesMatched} matched]`);
    }

    try {
      await processChurch(slug, existingChurches, args.dryRun, stats);
    } catch (error) {
      // One failing church must not abort the whole run.
      stats.errors++;
      console.error(` ERROR processing ${slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;

  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Slugs enumerated: ${stats.slugsEnumerated}`);
  console.log(`Churches fetched: ${stats.churchesFetched}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          processed: stats.churchesFetched,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch {
      /* Ignore */
    }
  }
}

main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit() so the cleanup
    // below still runs before the process terminates.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });