#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from gottesdienstzeiten.de (Germany)
 *
 * gottesdienstzeiten.de is a German worship service directory with ~6,878 Catholic
 * churches. It runs on WordPress with a fully open REST API at /wp-json/wp/v2/posts.
 *
 * Data includes: church name, address, coordinates (Google Maps embed), diocese,
 * mass schedules (day/type/time table), website, email, phone.
 *
 * Import strategy:
 * 1. Fetch all Catholic diocese category IDs from WP API
 * 2. Paginate through posts per category (100 per page)
 * 3. Parse HTML content for coordinates, address, schedule table, info table
 * 4. Match against existing German churches via church-matcher
 * 5. Upsert churches and mass schedules
 *
 * Usage:
 *   npx tsx scripts/import-gottesdienstzeiten.ts --all --dry-run
 *   npx tsx scripts/import-gottesdienstzeiten.ts --all
 *   npx tsx scripts/import-gottesdienstzeiten.ts --diocese 129 --dry-run  # Köln only
 *   npx tsx scripts/import-gottesdienstzeiten.ts --all --resume-from 5
 */

import dotenv from 'dotenv';
import path from 'path';

// Load environment before the database client is constructed; .env.local is read first.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Mask the password portion of the URL before logging it.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);

const pool = new Pool({
  connectionString: dbUrl,
  // NOTE(review): certificate verification is disabled whenever the URL contains "neon".
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';

// ─── Constants ───────────────────────────────────────────────────────────────

const API_BASE = 'https://gottesdienstzeiten.de/wp-json/wp/v2';
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
const REQUEST_DELAY_MS = 1000;
const RETRY_DELAY_MS = 5000;
const MAX_RETRIES = 3;
const POSTS_PER_PAGE = 100;
const CATHOLIC_PARENT_CATEGORY = 4;

// German day names → dayOfWeek (0=Sun, 1=Mon, ..., 6=Sat)
const GERMAN_DAYS: Record<string, number> = {
  sonntags: 0,
  montags: 1,
  dienstags: 2,
  mittwochs: 3,
  donnerstags: 4,
  freitags: 5,
  samstags: 6,
  // Without -s suffix (some entries use these)
  sonntag: 0,
  montag: 1,
  dienstag: 2,
  mittwoch: 3,
  donnerstag: 4,
  freitag: 5,
  samstag: 6,
};

// Mass-related types (filter out non-mass services)
const MASS_TYPES = new Set<string>([
  'messfeier',
  'vorabendmesse',
  'heilige messe',
  'hl. messe',
  'hochamt',
  'festmesse',
  'familienmesse',
  'kindergottesdienst',
  'jugendmesse',
  'abendmesse',
  'frühmesse',
  'werktagsmesse',
  'sonntagsmesse',
  'messe',
  'eucharistiefeier',
]);

// Named HTML entities decoded by decodeEntities(); numeric entities are handled generically.
const NAMED_ENTITIES: Record<string, string> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  // Mapped to a regular space so names and addresses compare cleanly.
  nbsp: ' ',
  auml: 'ä',
  ouml: 'ö',
  uuml: 'ü',
  Auml: 'Ä',
  Ouml: 'Ö',
  Uuml: 'Ü',
  szlig: 'ß',
};

// ─── Types ───────────────────────────────────────────────────────────────────

interface DioceseCat {
  id: number;
  name: string;
  count: number;
}

/** The subset of a WP REST category object this script reads. */
interface WpCategory {
  id: number;
  name: string;
  count: number;
}

/** The subset of a WP REST post object this script reads. */
interface WpPost {
  id: number;
  slug: string;
  title?: { rendered?: string };
  content?: { rendered?: string };
}

interface ParsedChurch {
  wpId: number;
  slug: string;
  name: string;
  latitude: number;
  longitude: number;
  address: string | null;
  zip: string | null;
  city: string | null;
  diocese: string | null;
  website: string | null;
  email: string | null;
  phone: string | null;
  schedules: ParsedSchedule[];
}

interface ParsedSchedule {
  dayOfWeek: number;
  time: string;
}

interface ImportStats {
  diocesesProcessed: number;
  postsFound: number;
  churchesParsed: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  schedulesCreated: number;
  errors: number;
}

interface CLIArgs {
  all: boolean;
  dryRun: boolean;
  resumeFrom?: number;
  diocese?: number;
  jobId?: string;
}

// ─── HTTP Helpers ────────────────────────────────────────────────────────────

let requestCount = 0;

/** Resolve after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * GET `url` and return the decoded JSON body, or `null` on failure.
 *
 * Every call after the first waits REQUEST_DELAY_MS before sending. HTTP 429/503
 * and thrown fetch errors are retried up to MAX_RETRIES attempts with
 * RETRY_DELAY_MS between them. Any other non-2xx status fails immediately.
 * Failures are logged so a truncated import is visible in the output.
 */
async function fetchJson(url: string): Promise<unknown> {
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': USER_AGENT },
      });

      if (response.status === 429 || response.status === 503) {
        if (attempt < MAX_RETRIES) {
          console.log(`  HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s`);
          await delay(RETRY_DELAY_MS);
          continue;
        }
        console.error(`  HTTP ${response.status} — giving up after ${MAX_RETRIES} attempts: ${url}`);
        return null;
      }

      if (!response.ok) {
        console.error(`  HTTP ${response.status} for ${url}`);
        return null;
      }

      return await response.json();
    } catch (error) {
      if (attempt < MAX_RETRIES) {
        await delay(RETRY_DELAY_MS);
        continue;
      }
      console.error(`  Fetch error: ${error instanceof Error ? error.message : error}`);
      return null;
    }
  }

  return null;
}

// ─── Parsing ─────────────────────────────────────────────────────────────────

/**
 * Decode numeric (`&#038;`, `&#x26;`) and the named entities listed in
 * NAMED_ENTITIES. Unknown or out-of-range entities are left untouched. The
 * replacement is single-pass, so `&amp;lt;` becomes `&lt;`, not `<`.
 */
function decodeEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
    return NAMED_ENTITIES[body] ?? entity;
  });
}

/** Remove HTML tags, decode entities, and trim surrounding whitespace. */
function stripHtml(html: string): string {
  // Tags are removed before decoding so an encoded "&lt;b&gt;" survives as literal text.
  return decodeEntities(html.replace(/<[^>]+>/g, '')).trim();
}

/**
 * Convert one WP post into a ParsedChurch.
 *
 * @param post - Post object from the WP REST API.
 * @param dioceseName - Diocese name copied onto the result unchanged.
 * @returns The parsed church, or `null` when the post has no usable Google
 *   Maps coordinates (missing, non-numeric, or exactly 0,0).
 */
function parsePost(post: WpPost, dioceseName: string | null): ParsedChurch | null {
  const content: string = post.content?.rendered || '';
  const wpId: number = post.id;
  const slug: string = post.slug;

  // Extract name from title — format: "(City) Church Name"
  let name = stripHtml(post.title?.rendered || '');
  // Remove leading "(City)" prefix for cleaner name
  const nameMatch = name.match(/^\([^)]+\)\s*(.+)$/);
  if (nameMatch) name = nameMatch[1];

  // Extract coordinates from Google Maps embed
  const coordMatch = content.match(/maps\?q=([-\d.]+),([-\d.]+)/);
  if (!coordMatch) return null;
  const latitude = parseFloat(coordMatch[1]);
  const longitude = parseFloat(coordMatch[2]);
  if (isNaN(latitude) || isNaN(longitude) || (latitude === 0 && longitude === 0)) return null;

  // Extract address from first <strong> tag (format: "Street, ZIP City")
  const addrMatch = content.match(/<strong>([^<]+)<\/strong>/);
  let address: string | null = null;
  let zip: string | null = null;
  let city: string | null = null;
  if (addrMatch) {
    const fullAddr = decodeEntities(addrMatch[1]).trim();
    address = fullAddr;
    // Parse "Street, ZIP City" format
    const zipCityMatch = fullAddr.match(/,\s*(\d{5})\s+(.+)$/);
    if (zipCityMatch) {
      zip = zipCityMatch[1];
      city = zipCityMatch[2];
      address = fullAddr.replace(/,\s*\d{5}\s+.+$/, '').trim();
    }
  }

  // Parse info table (second table) for website, email, phone
  let website: string | null = null;
  let email: string | null = null;
  let phone: string | null = null;

  const tables = content.match(/<table[^>]*>([\s\S]*?)<\/table>/g) || [];
  if (tables.length >= 2) {
    const infoTable = tables[1];

    // Website — href attribute values are entity-encoded in HTML, so decode them.
    const websiteMatch = infoTable.match(/Website[\s\S]*?<a[^>]*href="([^"]+)"/);
    if (websiteMatch) website = decodeEntities(websiteMatch[1]);

    // Email
    const emailMatch = infoTable.match(/E-Mail[\s\S]*?<td[^>]*>([\s\S]*?)<\/td>/);
    if (emailMatch) {
      const emailText = stripHtml(emailMatch[1]);
      if (emailText.includes('@')) email = emailText;
    }

    // Phone
    const phoneMatch = infoTable.match(/Telefon[\s\S]*?<td[^>]*>([\s\S]*?)<\/td>/);
    if (phoneMatch) {
      const phoneText = stripHtml(phoneMatch[1]);
      if (phoneText.length > 3) phone = phoneText;
    }
  }

  // Parse schedule table (first table)
  const schedules: ParsedSchedule[] = [];
  if (tables.length >= 1) {
    const schedTable = tables[0];
    const rows = schedTable.match(/<tr[^>]*>([\s\S]*?)<\/tr>/g) || [];
    // A day header applies to every following row until the next header.
    let currentDay = -1;
    const seen = new Set<string>();

    for (const row of rows) {
      // Check for day header: first <em> inside a table cell of the row.
      // NOTE(review): the cell tag was reconstructed; both <th> and <td> are accepted.
      // Rows whose first <em> is not a day name leave currentDay unchanged.
      const dayMatch = row.match(/<t[hd][^>]*>[\s\S]*?<em>([^<]*)<\/em>/);
      if (dayMatch && dayMatch[1].trim()) {
        const dayName = dayMatch[1].trim().toLowerCase();
        if (GERMAN_DAYS[dayName] !== undefined) {
          currentDay = GERMAN_DAYS[dayName];
        }
      }

      // Get type and time from <td><em>...</em></td>
      const cells = row.match(/<td[^>]*>[\s\S]*?<em>([^<]*)<\/em>[\s\S]*?<\/td>/g);
      if (!cells || cells.length < 2 || currentDay < 0) continue;

      const typeMatch = cells[0].match(/<em>([^<]*)<\/em>/);
      const timeMatch = cells[1].match(/<em>([^<]*)<\/em>/);
      if (!typeMatch || !timeMatch) continue;

      const massType = typeMatch[1].trim().toLowerCase();
      const timeStr = timeMatch[1].trim();

      // Only include mass-related types
      const isMass =
        MASS_TYPES.has(massType) ||
        massType.includes('messe') ||
        massType.includes('messfeier') ||
        massType.includes('eucharistie');
      if (!isMass) continue;

      // Parse time: "09.00 Uhr" or "18:30 Uhr" → "09:00" or "18:30"
      const parsedTime = timeStr
        .replace(/\s*Uhr\s*/i, '')
        .replace('.', ':')
        .trim();

      const timeParts = parsedTime.match(/^(\d{1,2}):(\d{2})$/);
      if (!timeParts) continue;
      // Reject values that have the right shape but are not a time of day (e.g. "25:99").
      if (Number(timeParts[1]) > 23 || Number(timeParts[2]) > 59) continue;

      const normalizedTime = `${timeParts[1].padStart(2, '0')}:${timeParts[2]}`;

      // De-duplicate identical day/time pairs within one post.
      const key = `${currentDay}:${normalizedTime}`;
      if (!seen.has(key)) {
        seen.add(key);
        schedules.push({ dayOfWeek: currentDay, time: normalizedTime });
      }
    }
  }

  return {
    wpId,
    slug,
    name,
    latitude,
    longitude,
    address,
    zip,
    city,
    diocese: dioceseName,
    website,
    email,
    phone,
    schedules,
  };
}

// ─── Database Operations ─────────────────────────────────────────────────────

/** Load every church with country 'DE', selecting the fields passed to findDuplicateChurch. */
async function loadExistingGermanChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing German churches for deduplication...');
  const churches = await prisma.church.findMany({
    where: { country: 'DE' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });
  console.log(`Loaded ${churches.length} existing German churches`);
  return churches;
}

// ─── Import Logic ────────────────────────────────────────────────────────────

/** Runtime check that a decoded JSON value has the category fields this script reads. */
function isWpCategory(value: unknown): value is WpCategory {
  if (typeof value !== 'object' || value === null) return false;
  const candidate = value as Partial<WpCategory>;
  return (
    typeof candidate.id === 'number' &&
    typeof candidate.name === 'string' &&
    typeof candidate.count === 'number'
  );
}

/** Map a WP category to the internal shape, decoding entities in its name. */
function toDioceseCat(cat: WpCategory): DioceseCat {
  return { id: cat.id, name: decodeEntities(cat.name), count: cat.count };
}

/**
 * Fetch the child categories of CATHOLIC_PARENT_CATEGORY (one per diocese),
 * sorted by post count, largest first. Returns an empty list when the request
 * fails or the response is not an array.
 */
async function fetchDioceseCategories(): Promise<DioceseCat[]> {
  console.log('Fetching Catholic diocese categories...');
  const data = await fetchJson(
    `${API_BASE}/categories?per_page=100&parent=${CATHOLIC_PARENT_CATEGORY}`
  );
  if (!Array.isArray(data)) {
    console.error('Failed to fetch categories');
    return [];
  }

  const cats: DioceseCat[] = data.filter(isWpCategory).map(toDioceseCat);

  const total = cats.reduce((s, c) => s + c.count, 0);
  console.log(`Found ${cats.length} diocese categories with ${total} total posts\n`);
  return cats.sort((a, b) => b.count - a.count);
}

/**
 * Fetch a single category by ID so `--diocese` runs use the real diocese name
 * and post count instead of a placeholder. Returns `null` on failure.
 */
async function fetchDioceseCategory(id: number): Promise<DioceseCat | null> {
  const data = await fetchJson(`${API_BASE}/categories/${id}`);
  return isWpCategory(data) ? toDioceseCat(data) : null;
}

/**
 * Import every post of one diocese category.
 *
 * Pages through the category's posts, parses each one, and matches it against
 * `existingChurches`. In dry-run mode only `stats` is updated. Otherwise a
 * matched church has its missing contact fields filled and its schedules
 * replaced, and an unmatched church is created and appended to
 * `existingChurches` so later posts can match it. Paging stops early when a
 * page request fails or comes back empty.
 *
 * @param cat - Diocese category to import.
 * @param existingChurches - Deduplication pool; mutated when churches are created.
 * @param dryRun - When true, perform no database writes.
 * @param stats - Running counters; mutated in place.
 */
async function processDiocese(
  cat: DioceseCat,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  const totalPages = Math.ceil(cat.count / POSTS_PER_PAGE);

  for (let page = 1; page <= totalPages; page++) {
    const url = `${API_BASE}/posts?categories=${cat.id}&per_page=${POSTS_PER_PAGE}&page=${page}`;
    const posts = await fetchJson(url);
    if (!Array.isArray(posts) || posts.length === 0) break;

    stats.postsFound += posts.length;

    for (const post of posts as WpPost[]) {
      const church = parsePost(post, cat.name);
      if (!church) {
        stats.churchesSkipped++;
        continue;
      }
      stats.churchesParsed++;

      const gdzId = String(church.wpId);
      const candidate = {
        name: church.name,
        lat: church.latitude,
        lng: church.longitude,
        gottesdienstzeitenId: gdzId,
      };
      const duplicate = findDuplicateChurch(candidate, existingChurches);

      if (dryRun) {
        if (duplicate) {
          stats.churchesMatched++;
        } else {
          stats.churchesCreated++;
        }
        stats.schedulesCreated += church.schedules.length;
        continue;
      }

      if (duplicate) {
        stats.churchesMatched++;

        // Always link the source ID; only fill contact fields that are currently empty.
        const updateData = {
          gottesdienstzeitenId: gdzId,
          ...(!duplicate.address && church.address ? { address: church.address } : {}),
          ...(!duplicate.website && church.website
            ? { website: church.website, hasWebsite: true }
            : {}),
          ...(!duplicate.phone && church.phone ? { phone: church.phone } : {}),
        };

        try {
          await prisma.church.update({
            where: { id: duplicate.id },
            data: updateData,
          });
        } catch (error) {
          // A unique-constraint failure is treated as a skip rather than an error.
          if (error instanceof Error && error.message.includes('Unique constraint')) {
            stats.churchesSkipped++;
            continue;
          }
          throw error;
        }

        if (church.schedules.length > 0) {
          try {
            // Replace the church's schedules atomically: delete, insert, stamp.
            await prisma.$transaction(async (tx) => {
              await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
              await tx.massSchedule.createMany({
                data: church.schedules.map((s) => ({
                  churchId: duplicate.id,
                  dayOfWeek: s.dayOfWeek,
                  time: s.time,
                  language: 'German',
                })),
              });
              await tx.church.update({
                where: { id: duplicate.id },
                data: { lastScrapedAt: new Date() },
              });
            });
            stats.schedulesCreated += church.schedules.length;
          } catch (error) {
            stats.errors++;
            console.error(`  Error saving schedules for ${church.slug}: ${error instanceof Error ? error.message : error}`);
          }
        }
      } else {
        try {
          const newChurch = await prisma.church.create({
            data: {
              name: church.name,
              latitude: church.latitude,
              longitude: church.longitude,
              address: church.address,
              zip: church.zip,
              city: church.city,
              country: 'DE',
              diocese: church.diocese || undefined,
              website: church.website,
              hasWebsite: !!church.website,
              email: church.email,
              phone: church.phone,
              gottesdienstzeitenId: gdzId,
              source: 'gottesdienstzeiten',
              websiteLanguage: 'de',
            },
          });
          stats.churchesCreated++;

          // Make the new church visible to duplicate detection for the rest of the run.
          existingChurches.push({
            id: newChurch.id,
            name: church.name,
            latitude: church.latitude,
            longitude: church.longitude,
            osmId: null,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: gdzId,
            source: 'gottesdienstzeiten',
            website: church.website,
            phone: church.phone,
            address: church.address,
          });

          if (church.schedules.length > 0) {
            await prisma.massSchedule.createMany({
              data: church.schedules.map((s) => ({
                churchId: newChurch.id,
                dayOfWeek: s.dayOfWeek,
                time: s.time,
                language: 'German',
              })),
            });
            await prisma.church.update({
              where: { id: newChurch.id },
              data: { lastScrapedAt: new Date() },
            });
            stats.schedulesCreated += church.schedules.length;
          }
        } catch (error) {
          if (error instanceof Error && error.message.includes('Unique constraint')) {
            stats.churchesSkipped++;
            continue;
          }
          stats.errors++;
          console.error(`  Error creating ${church.slug}: ${error instanceof Error ? error.message : error}`);
        }
      }
    }
  }

  stats.diocesesProcessed++;
}

// ─── CLI ─────────────────────────────────────────────────────────────────────

/**
 * Parse the value of a numeric flag. Prints an error and exits with status 1
 * unless `raw` consists solely of decimal digits.
 */
function parseIntArg(flag: string, raw: string | undefined): number {
  if (raw === undefined || !/^\d+$/.test(raw)) {
    console.error(`Error: ${flag} expects a non-negative integer, got "${raw ?? ''}"`);
    process.exit(1);
  }
  return parseInt(raw, 10);
}

/**
 * Parse process.argv into CLIArgs. Exits with status 0 after printing help,
 * and with status 1 when neither --all nor --diocese is given or a numeric
 * flag is malformed. Unrecognised arguments are ignored.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = parseIntArg('--resume-from', args[++i]);
        break;
      case '--diocese':
        result.diocese = parseIntArg('--diocese', args[++i]);
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-gottesdienstzeiten.ts [options]

Options:
  --all               Import all Catholic diocese categories
  --diocese <id>      Import a single diocese category (e.g., 129 for Köln)
  --dry-run           No database writes, just report what would happen
  --resume-from <n>   Skip first N diocese categories
  --job-id <id>       Background job tracking ID
  --help, -h          Show this help message

Examples:
  npx tsx scripts/import-gottesdienstzeiten.ts --diocese 129 --dry-run
  npx tsx scripts/import-gottesdienstzeiten.ts --all --dry-run
  npx tsx scripts/import-gottesdienstzeiten.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.diocese) {
    console.error('Error: specify --all or --diocese <id>');
    process.exit(1);
  }

  return result;
}

/** Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs". */
function formatDuration(ms: number): string {
  const seconds = Math.floor(ms / 1000);
  const minutes = Math.floor(seconds / 60);
  const hours = Math.floor(minutes / 60);
  if (hours > 0) return `${hours}h ${minutes % 60}m ${seconds % 60}s`;
  if (minutes > 0) return `${minutes}m ${seconds % 60}s`;
  return `${seconds}s`;
}

// ─── Main ────────────────────────────────────────────────────────────────────

/**
 * Run the import: resolve the diocese categories to process, import each one,
 * print a summary, and update the background job record when --job-id is given.
 * A failure inside one diocese is counted and logged; the run continues with
 * the next diocese.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('GOTTESDIENSTZEITEN.DE (GERMANY) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.diocese ? `Diocese category ${args.diocese}` : 'All dioceses'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: diocese index ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      /* Job might not exist */
    }
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    postsFound: 0,
    churchesParsed: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  const existingChurches = await loadExistingGermanChurches();

  let categories: DioceseCat[];
  if (args.diocese) {
    // Resolve the real name and post count: the name is stored on created churches
    // and the count drives pagination, so a placeholder would corrupt both.
    const single = await fetchDioceseCategory(args.diocese);
    if (single) {
      categories = [single];
    } else {
      stats.errors++;
      console.error(`Failed to fetch diocese category ${args.diocese}`);
      categories = [];
    }
  } else {
    categories = await fetchDioceseCategories();
  }

  if (args.resumeFrom && !args.diocese) {
    categories = categories.slice(args.resumeFrom);
    console.log(`Resuming from diocese index ${args.resumeFrom} (${categories[0]?.name})\n`);
  }

  console.log(`Processing ${categories.length} diocese categories\n`);

  for (const [i, cat] of categories.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${i + 1}/${categories.length}] ${cat.name} (${cat.count} posts) [${elapsed} elapsed]`);

    try {
      await processDiocese(cat, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(`  ERROR processing ${cat.name}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`WP posts found: ${stats.postsFound}`);
  console.log(`Churches parsed: ${stats.churchesParsed}`);
  console.log(`  Matched (existing): ${stats.churchesMatched}`);
  console.log(`  Created (new): ${stats.churchesCreated}`);
  console.log(`  Skipped (no coords): ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          processed: stats.churchesParsed,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch {
      /* Ignore */
    }
  }
}

main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit() so the cleanup below still runs.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });