Files
ScraperControl/scripts/import-gottesdienstzeiten.ts

685 lines
23 KiB
TypeScript
Raw Permalink Normal View History

#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from gottesdienstzeiten.de (Germany)
*
* gottesdienstzeiten.de is a German worship service directory with ~6,878 Catholic
* churches. It runs on WordPress with a fully open REST API at /wp-json/wp/v2/posts.
*
* Data includes: church name, address, coordinates (Google Maps embed), diocese,
* mass schedules (day/type/time table), website, email, phone.
*
* Import strategy:
* 1. Fetch all Catholic diocese category IDs from WP API
* 2. Paginate through posts per category (100 per page)
* 3. Parse HTML content for coordinates, address, schedule table, info table
* 4. Match against existing German churches via church-matcher
* 5. Upsert churches and mass schedules
*
* Usage:
* npx tsx scripts/import-gottesdienstzeiten.ts --all --dry-run
* npx tsx scripts/import-gottesdienstzeiten.ts --all
* npx tsx scripts/import-gottesdienstzeiten.ts --diocese 129 --dry-run # Köln only
* npx tsx scripts/import-gottesdienstzeiten.ts --all --resume-from 5
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL when set, otherwise a local development default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Mask the ":password@" portion before the URL is written to the log.
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);
// URLs containing "neon" get SSL without certificate verification; all others use the driver default.
const sslOption = dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined;
const pool = new Pool({ connectionString: dbUrl, ssl: sslOption });
// Prisma runs on top of the pg pool through the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Root of the site's WordPress REST API (v2 namespace); endpoint paths are appended to it.
const API_BASE = 'https://gottesdienstzeiten.de/wp-json/wp/v2';
// Sent as the User-Agent header on every request made by fetchJson.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Pause (ms) that fetchJson inserts before every request except the first.
const REQUEST_DELAY_MS = 1000;
// Wait (ms) before fetchJson retries after HTTP 429/503 or a thrown fetch error.
const RETRY_DELAY_MS = 5000;
// Maximum number of attempts fetchJson makes per URL (the first try included).
const MAX_RETRIES = 3;
// Page size requested from the posts endpoint (per_page query parameter).
const POSTS_PER_PAGE = 100;
// WordPress category ID whose child categories are fetched as the Catholic dioceses.
const CATHOLIC_PARENT_CATEGORY = 4;
// Lowercase German weekday names → dayOfWeek (0=Sun, 1=Mon, ..., 6=Sat).
// Two spellings are registered per day: the adverbial "-s" form ("sonntags")
// first, then the bare name ("sonntag"), which some entries use.
const GERMAN_DAYS: Record<string, number> = (() => {
  const weekdayNames = ['sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag'];
  const table: Record<string, number> = {};
  weekdayNames.forEach((weekday, dayOfWeek) => {
    table[`${weekday}s`] = dayOfWeek;
  });
  weekdayNames.forEach((weekday, dayOfWeek) => {
    table[weekday] = dayOfWeek;
  });
  return table;
})();
// Service-type labels (lowercase) that count as a Mass; rows with other types are
// filtered out of the schedule. parsePost lowercases the type cell before the
// lookup and additionally accepts any type containing "messe", "messfeier" or
// "eucharistie", so this set only needs the labels those substrings do not cover.
const MASS_TYPES = new Set([
'messfeier', 'vorabendmesse', 'heilige messe', 'hl. messe',
'hochamt', 'festmesse', 'familienmesse', 'kindergottesdienst',
'jugendmesse', 'abendmesse', 'frühmesse', 'werktagsmesse',
'sonntagsmesse', 'messe', 'eucharistiefeier',
]);
// ─── Types ───────────────────────────────────────────────────────────────────
/** One diocese category to import, as used by processDiocese. */
interface DioceseCat {
// WordPress category ID; sent as the `categories` filter when listing posts.
id: number;
// Category name; passed to parsePost as the diocese of every church in the category.
name: string;
// Post count for the category; processDiocese derives the number of pages to request from it.
count: number;
}
/** Church data extracted from a single WordPress post by parsePost. */
interface ParsedChurch {
// WordPress post ID; its string form is stored as gottesdienstzeitenId.
wpId: number;
// WordPress post slug; used in error log messages.
slug: string;
// Post title with HTML tags removed and a leading "(City)" prefix stripped.
name: string;
// Coordinates taken from the "maps?q=<lat>,<lng>" link in the post content.
latitude: number;
longitude: number;
// Text of the first <strong> element; reduced to the street part when a
// ", <5-digit ZIP> <city>" suffix could be split off into zip/city.
address: string | null;
zip: string | null;
city: string | null;
// Diocese name handed to parsePost by the caller.
diocese: string | null;
// Contact details read from the second table of the post, when present.
website: string | null;
email: string | null;
phone: string | null;
// De-duplicated Mass times read from the first table of the post.
schedules: ParsedSchedule[];
}
/** One weekly Mass time parsed from a schedule table row. */
interface ParsedSchedule {
// 0=Sunday ... 6=Saturday (values of GERMAN_DAYS).
dayOfWeek: number;
// Zero-padded "HH:MM" string.
time: string;
}
/** Running counters for one import run; mutated by processDiocese and printed by main. */
interface ImportStats {
// Categories whose processing finished without throwing.
diocesesProcessed: number;
// Posts returned by the API across all fetched pages.
postsFound: number;
// Posts that parsePost turned into a church.
churchesParsed: number;
// Parsed churches for which findDuplicateChurch returned an existing record.
churchesMatched: number;
// Parsed churches inserted as new records (in dry-run: that would be inserted).
churchesCreated: number;
// Posts parsePost rejected, plus writes abandoned on a unique-constraint conflict.
churchesSkipped: number;
// Schedule rows written (in dry-run: schedule rows parsed).
schedulesCreated: number;
// Failures that were logged and counted.
errors: number;
}
/** Command-line options as produced by parseArgs. */
interface CLIArgs {
// --all: import every diocese category.
all: boolean;
// --dry-run: parse and match only, no database writes.
dryRun: boolean;
// --resume-from <n>: number of leading diocese categories to skip (ignored with --diocese).
resumeFrom?: number;
// --diocese <catId>: import only this WordPress category ID.
diocese?: number;
// --job-id <uuid>: backgroundJob row whose status main updates.
jobId?: string;
}
// ─── HTTP Helpers ────────────────────────────────────────────────────────────
// Number of fetchJson calls started so far. fetchJson uses it to skip the
// throttle delay on the very first request; main prints it in the summary.
let requestCount = 0;

/** Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * GET `url` and return the parsed JSON body, or null when the request fails.
 *
 * - Every call after the first waits REQUEST_DELAY_MS before sending (throttle).
 * - HTTP 429/503 and thrown fetch/JSON errors are retried after RETRY_DELAY_MS,
 *   for at most MAX_RETRIES attempts in total.
 * - Any other non-2xx status fails immediately without a retry.
 *
 * Every failure path logs the reason before returning null, so a caller that
 * stops on null never does so silently.
 *
 * @param url Absolute URL to request.
 * @returns Parsed JSON, or null on any failure.
 */
async function fetchJson(url: string): Promise<any | null> {
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;
  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': USER_AGENT },
      });
      if (response.status === 429 || response.status === 503) {
        if (attempt < MAX_RETRIES) {
          console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s`);
          await delay(RETRY_DELAY_MS);
          continue;
        }
        console.error(` HTTP ${response.status} after ${MAX_RETRIES} attempts — giving up on ${url}`);
        return null;
      }
      if (!response.ok) {
        console.error(` HTTP ${response.status} for ${url}`);
        return null;
      }
      return await response.json();
    } catch (error) {
      if (attempt < MAX_RETRIES) {
        await delay(RETRY_DELAY_MS);
        continue;
      }
      console.error(` Fetch error: ${error instanceof Error ? error.message : error}`);
      return null;
    }
  }
  return null;
}
// ─── Parsing ─────────────────────────────────────────────────────────────────
/**
 * Convert an HTML fragment to plain text.
 *
 * Tags are removed first, then character references are decoded in a single
 * pass (so "&amp;lt;" becomes "&lt;", not "<"): numeric decimal/hex references
 * and a small table of named ones (basic XML entities, German umlauts, ß,
 * en/em dash, nbsp). Unknown or out-of-range references are left untouched.
 * Non-breaking spaces become regular spaces, and the result is trimmed.
 *
 * @param html HTML fragment, e.g. a WordPress `rendered` title or a table cell.
 * @returns Plain text without tags, with entities decoded.
 */
function stripHtml(html: string): string {
  const namedEntities: Record<string, string> = {
    amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: '\u00a0',
    auml: '\u00e4', ouml: '\u00f6', uuml: '\u00fc',
    Auml: '\u00c4', Ouml: '\u00d6', Uuml: '\u00dc', szlig: '\u00df',
    ndash: '\u2013', mdash: '\u2014',
  };
  return html
    .replace(/<[^>]+>/g, '')
    .replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (reference: string, body: string) => {
      if (body[0] === '#') {
        const isHex = body[1] === 'x' || body[1] === 'X';
        const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // String.fromCodePoint throws outside the Unicode range; keep such text as-is.
        return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : reference;
      }
      // Own-property check so names like "constructor" are not resolved via the prototype.
      return Object.prototype.hasOwnProperty.call(namedEntities, body) ? namedEntities[body] : reference;
    })
    .replace(/\u00a0/g, ' ')
    .trim();
}
/**
 * Normalize a schedule time cell such as "09.00 Uhr" or "18:30 Uhr" to "HH:MM".
 * Returns null when the text is not a clock time or lies outside 00:00–23:59.
 */
function normalizeClockTime(raw: string): string | null {
  const cleaned = raw
    .replace(/\s*Uhr\s*/i, '')
    .replace('.', ':')
    .trim();
  const parts = cleaned.match(/^(\d{1,2}):(\d{2})$/);
  if (!parts) return null;
  // The pattern alone would accept values like "25:00" or "12:75".
  if (Number(parts[1]) > 23 || Number(parts[2]) > 59) return null;
  return `${parts[1].padStart(2, '0')}:${parts[2]}`;
}

/** Read website, e-mail and phone from the info table's labelled rows. */
function extractContactInfo(infoTable: string): {
  website: string | null;
  email: string | null;
  phone: string | null;
} {
  let website: string | null = null;
  let email: string | null = null;
  let phone: string | null = null;
  // Website: first link href following the "Website" label.
  const websiteMatch = infoTable.match(/Website[\s\S]*?<a[^>]*href="([^"]+)"/);
  if (websiteMatch) website = websiteMatch[1];
  // E-mail: first cell following the "E-Mail" label, accepted only if it contains "@".
  const emailMatch = infoTable.match(/E-Mail[\s\S]*?<td[^>]*>([\s\S]*?)<\/td>/);
  if (emailMatch) {
    const emailText = stripHtml(emailMatch[1]);
    if (emailText.includes('@')) email = emailText;
  }
  // Phone: first cell following the "Telefon" label, accepted only if longer than 3 characters.
  const phoneMatch = infoTable.match(/Telefon[\s\S]*?<td[^>]*>([\s\S]*?)<\/td>/);
  if (phoneMatch) {
    const phoneText = stripHtml(phoneMatch[1]);
    if (phoneText.length > 3) phone = phoneText;
  }
  return { website, email, phone };
}

/** Collect the distinct (day, time) Mass entries from the schedule table. */
function extractMassSchedules(schedTable: string): ParsedSchedule[] {
  const schedules: ParsedSchedule[] = [];
  const seen = new Set<string>();
  const rows = schedTable.match(/<tr[^>]*>([\s\S]*?)<\/tr>/g) || [];
  // Day headers apply to the following rows until the next recognized header.
  // NOTE(review): a header that is not in GERMAN_DAYS leaves the previous day in
  // effect, so its rows are attributed to that day — confirm this is intended.
  let currentDay = -1;
  for (const row of rows) {
    // Day header: <em> text following a <th>.
    const dayMatch = row.match(/<th[^>]*>[\s\S]*?<em>([^<]*)<\/em>/);
    const dayName = dayMatch ? dayMatch[1].trim().toLowerCase() : '';
    // Own-property check: a plain index lookup would also "find" inherited
    // Object.prototype members for header texts such as "constructor".
    if (dayName && Object.prototype.hasOwnProperty.call(GERMAN_DAYS, dayName)) {
      currentDay = GERMAN_DAYS[dayName];
    }
    // Type and time come from the first two <td> cells that contain an <em>.
    const cells = row.match(/<td[^>]*>[\s\S]*?<em>([^<]*)<\/em>[\s\S]*?<\/td>/g);
    if (!cells || cells.length < 2 || currentDay < 0) continue;
    const typeMatch = cells[0].match(/<em>([^<]*)<\/em>/);
    const timeMatch = cells[1].match(/<em>([^<]*)<\/em>/);
    if (!typeMatch || !timeMatch) continue;
    const massType = typeMatch[1].trim().toLowerCase();
    // Only include mass-related types.
    const isMass = MASS_TYPES.has(massType) ||
      massType.includes('messe') || massType.includes('messfeier') ||
      massType.includes('eucharistie');
    if (!isMass) continue;
    const normalizedTime = normalizeClockTime(timeMatch[1].trim());
    if (!normalizedTime) continue;
    const key = `${currentDay}:${normalizedTime}`;
    if (!seen.has(key)) {
      seen.add(key);
      schedules.push({ dayOfWeek: currentDay, time: normalizedTime });
    }
  }
  return schedules;
}

/**
 * Turn one WordPress post into a ParsedChurch.
 *
 * Reads the title (dropping a leading "(City)" prefix), the coordinates from the
 * "maps?q=<lat>,<lng>" link, the address from the first <strong> element, the
 * Mass schedule from the first table and contact details from the second table.
 *
 * @param post WordPress REST post object (uses id, slug, title.rendered, content.rendered).
 * @param dioceseName Diocese to record on the result.
 * @returns The parsed church, or null when the post has no usable coordinates
 *          (missing, not numeric, or exactly 0,0).
 */
function parsePost(post: any, dioceseName: string | null): ParsedChurch | null {
  const content: string = post.content?.rendered || '';
  const wpId: number = post.id;
  const slug: string = post.slug;

  // Title format: "(City) Church Name" — keep only the church name.
  let name = stripHtml(post.title?.rendered || '');
  const nameMatch = name.match(/^\([^)]+\)\s*(.+)$/);
  if (nameMatch) name = nameMatch[1];

  // Coordinates from the Google Maps embed.
  const coordMatch = content.match(/maps\?q=([-\d.]+),([-\d.]+)/);
  if (!coordMatch) return null;
  const latitude = parseFloat(coordMatch[1]);
  const longitude = parseFloat(coordMatch[2]);
  if (isNaN(latitude) || isNaN(longitude) || (latitude === 0 && longitude === 0)) return null;

  // Address from the first <strong> tag, expected as "Street, ZIP City".
  let address: string | null = null;
  let zip: string | null = null;
  let city: string | null = null;
  const addrMatch = content.match(/<strong>([^<]+)<\/strong>/);
  if (addrMatch) {
    const fullAddr = addrMatch[1].trim();
    address = fullAddr;
    const zipCityMatch = fullAddr.match(/,\s*(\d{5})\s+(.+)$/);
    if (zipCityMatch) {
      zip = zipCityMatch[1];
      city = zipCityMatch[2];
      address = fullAddr.replace(/,\s*\d{5}\s+.+$/, '').trim();
    }
  }

  // First table = schedule, second table = contact info.
  const tables = content.match(/<table[^>]*>([\s\S]*?)<\/table>/g) || [];
  const contact = tables.length >= 2
    ? extractContactInfo(tables[1])
    : { website: null, email: null, phone: null };
  const schedules = tables.length >= 1 ? extractMassSchedules(tables[0]) : [];

  return {
    wpId, slug, name, latitude, longitude,
    address, zip, city, diocese: dioceseName,
    website: contact.website, email: contact.email, phone: contact.phone, schedules,
  };
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Load every church row with country 'DE' as the comparison set for duplicate
 * detection (the result is later passed to findDuplicateChurch).
 *
 * Only the columns listed in `select` are fetched: identity, name, coordinates,
 * the per-source external ID columns, and the contact fields (website, phone,
 * address) that the import fills in when they are empty on a matched row.
 *
 * @returns All existing German churches, restricted to the selected fields.
 */
async function loadExistingGermanChurches(): Promise<ExistingChurch[]> {
console.log('Loading existing German churches for deduplication...');
const churches = await prisma.church.findMany({
where: { country: 'DE' },
select: {
id: true,
name: true,
latitude: true,
longitude: true,
// External IDs of the various import sources.
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing German churches`);
return churches;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Fetch the child categories of CATHOLIC_PARENT_CATEGORY (one request, up to 100
 * entries) and return them as diocese descriptors, largest post count first.
 *
 * @returns The diocese categories, or an empty array when the request fails.
 */
async function fetchDioceseCategories(): Promise<DioceseCat[]> {
  console.log('Fetching Catholic diocese categories...');
  const endpoint = `${API_BASE}/categories?per_page=100&parent=${CATHOLIC_PARENT_CATEGORY}`;
  const payload = await fetchJson(endpoint);
  if (!payload) {
    console.error('Failed to fetch categories');
    return [];
  }
  // Keep only the three fields the importer uses.
  const dioceses: DioceseCat[] = payload.map((entry: any) => {
    return { id: entry.id, name: entry.name, count: entry.count };
  });
  let postTotal = 0;
  for (const diocese of dioceses) {
    postTotal = postTotal + diocese.count;
  }
  console.log(`Found ${dioceses.length} diocese categories with ${postTotal} total posts\n`);
  // Descending by post count.
  dioceses.sort((left, right) => right.count - left.count);
  return dioceses;
}
/**
 * True when `error` is a unique-constraint violation from a Prisma write:
 * either Prisma error code P2002 or a message containing "Unique constraint".
 */
function isUniqueConstraintError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;
  return (error as { code?: unknown }).code === 'P2002' || error.message.includes('Unique constraint');
}

/**
 * Import all posts of one diocese category.
 *
 * Pages through the category's posts, parses each into a church, matches it
 * against `existingChurches`, and then either
 *  - updates the matched row (sets gottesdienstzeitenId, fills empty
 *    address/website/phone, replaces its Mass schedules), or
 *  - creates a new church with its schedules and appends it to
 *    `existingChurches` so later posts can match against it.
 * In dry-run mode nothing is written; only the counters are updated.
 *
 * Paging stops after the last expected page, after an empty page, after a page
 * shorter than POSTS_PER_PAGE (nothing can follow it), or when a page cannot be
 * fetched — the latter is logged and counted as an error instead of silently
 * truncating the category. Failures for a single church are logged and counted;
 * they never abort the rest of the category.
 *
 * @param cat Category to import; `count` determines the maximum page number.
 * @param existingChurches Comparison set for duplicate detection (mutated: new churches are appended).
 * @param dryRun When true, no database writes are performed.
 * @param stats Counters updated in place.
 */
async function processDiocese(
  cat: DioceseCat,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  const totalPages = Math.ceil(cat.count / POSTS_PER_PAGE);
  for (let page = 1; page <= totalPages; page++) {
    const url = `${API_BASE}/posts?categories=${cat.id}&per_page=${POSTS_PER_PAGE}&page=${page}`;
    const posts = await fetchJson(url);
    if (!Array.isArray(posts)) {
      stats.errors++;
      console.error(` Failed to fetch page ${page}/${totalPages} of ${cat.name}; skipping the rest of this category`);
      break;
    }
    if (posts.length === 0) break;
    stats.postsFound += posts.length;

    for (const post of posts) {
      const church = parsePost(post, cat.name);
      if (!church) {
        stats.churchesSkipped++;
        continue;
      }
      stats.churchesParsed++;
      const gdzId = String(church.wpId);
      const candidate = {
        name: church.name,
        lat: church.latitude,
        lng: church.longitude,
        gottesdienstzeitenId: gdzId,
      };
      const duplicate = findDuplicateChurch(candidate, existingChurches);

      if (dryRun) {
        if (duplicate) {
          stats.churchesMatched++;
        } else {
          stats.churchesCreated++;
        }
        stats.schedulesCreated += church.schedules.length;
        continue;
      }

      if (duplicate) {
        // Always link the source ID; only fill contact fields that are empty on the existing row.
        const updateData: Record<string, unknown> = { gottesdienstzeitenId: gdzId };
        if (!duplicate.address && church.address) updateData.address = church.address;
        if (!duplicate.website && church.website) {
          updateData.website = church.website;
          updateData.hasWebsite = true;
        }
        if (!duplicate.phone && church.phone) updateData.phone = church.phone;
        try {
          await prisma.church.update({
            where: { id: duplicate.id },
            data: updateData,
          });
        } catch (error) {
          if (isUniqueConstraintError(error)) {
            // Another row already carries this gottesdienstzeitenId.
            stats.churchesSkipped++;
          } else {
            stats.errors++;
            console.error(` Error updating ${church.slug}: ${error instanceof Error ? error.message : error}`);
          }
          continue;
        }
        // Counted only once the link was actually written, so a skipped
        // conflict is not reported as both matched and skipped.
        stats.churchesMatched++;

        if (church.schedules.length > 0) {
          try {
            // Replace the existing schedules atomically.
            await prisma.$transaction(async (tx) => {
              await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
              await tx.massSchedule.createMany({
                data: church.schedules.map((s) => ({
                  churchId: duplicate.id,
                  dayOfWeek: s.dayOfWeek,
                  time: s.time,
                  language: 'German',
                })),
              });
              await tx.church.update({
                where: { id: duplicate.id },
                data: { lastScrapedAt: new Date() },
              });
            });
            stats.schedulesCreated += church.schedules.length;
          } catch (error) {
            stats.errors++;
            console.error(` Error saving schedules for ${church.slug}: ${error instanceof Error ? error.message : error}`);
          }
        }
      } else {
        try {
          const newChurch = await prisma.church.create({
            data: {
              name: church.name,
              latitude: church.latitude,
              longitude: church.longitude,
              address: church.address,
              zip: church.zip,
              city: church.city,
              country: 'DE',
              diocese: church.diocese || undefined,
              website: church.website,
              hasWebsite: !!church.website,
              email: church.email,
              phone: church.phone,
              gottesdienstzeitenId: gdzId,
              source: 'gottesdienstzeiten',
              websiteLanguage: 'de',
            },
          });
          stats.churchesCreated++;
          // Make the new church visible to duplicate detection for the remaining posts.
          existingChurches.push({
            id: newChurch.id,
            name: church.name,
            latitude: church.latitude,
            longitude: church.longitude,
            osmId: null,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: gdzId,
            source: 'gottesdienstzeiten',
            website: church.website,
            phone: church.phone,
            address: church.address,
          });
          if (church.schedules.length > 0) {
            await prisma.massSchedule.createMany({
              data: church.schedules.map((s) => ({
                churchId: newChurch.id,
                dayOfWeek: s.dayOfWeek,
                time: s.time,
                language: 'German',
              })),
            });
            await prisma.church.update({
              where: { id: newChurch.id },
              data: { lastScrapedAt: new Date() },
            });
            stats.schedulesCreated += church.schedules.length;
          }
        } catch (error) {
          if (isUniqueConstraintError(error)) {
            stats.churchesSkipped++;
            continue;
          }
          stats.errors++;
          console.error(` Error creating ${church.slug}: ${error instanceof Error ? error.message : error}`);
        }
      }
    }

    // A short page is the last one; requesting the next page would only cost a
    // throttled round-trip and an error response.
    if (posts.length < POSTS_PER_PAGE) break;
  }
  stats.diocesesProcessed++;
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse the command-line options from process.argv.
 *
 * Exits with status 1 (after printing an error) when
 *  - a numeric option is missing its value or the value is not a plain
 *    base-10 integer (`--diocese` must be >= 1, `--resume-from` >= 0),
 *  - `--job-id` has no value,
 *  - an unknown argument is given (so a typo such as `--dryrun` cannot
 *    silently turn a dry run into a real import), or
 *  - neither `--all` nor `--diocese` is specified.
 * `--help` / `-h` prints the usage text and exits with status 0.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  function fail(message: string): never {
    console.error(`Error: ${message}`);
    process.exit(1);
  }

  function requireInt(flag: string, raw: string | undefined, min: number): number {
    if (raw === undefined || !/^\d+$/.test(raw) || Number(raw) < min) {
      return fail(`${flag} requires an integer >= ${min}`);
    }
    return Number(raw);
  }

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = requireInt('--resume-from', args[++i], 0);
        break;
      case '--diocese':
        result.diocese = requireInt('--diocese', args[++i], 1);
        break;
      case '--job-id': {
        const jobId = args[++i];
        if (jobId === undefined) fail('--job-id requires a value');
        result.jobId = jobId;
        break;
      }
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-gottesdienstzeiten.ts [options]
Options:
--all Import all Catholic diocese categories
--diocese <catId> Import a single diocese category (e.g., 129 for Köln)
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N diocese categories
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-gottesdienstzeiten.ts --diocese 129 --dry-run
npx tsx scripts/import-gottesdienstzeiten.ts --all --dry-run
npx tsx scripts/import-gottesdienstzeiten.ts --all
`);
        process.exit(0);
      default:
        fail(`unknown argument "${args[i]}" (use --help for usage)`);
    }
  }
  if (!result.all && !result.diocese) {
    fail('specify --all or --diocese <categoryId>');
  }
  return result;
}
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", dropping the
 * leading units that are zero. Fractions of a second are truncated.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  if (totalHours > 0) {
    return [`${totalHours}h`, `${totalMinutes % 60}m`, `${totalSeconds % 60}s`].join(' ');
  }
  if (totalMinutes > 0) {
    return [`${totalMinutes % 60}m`, `${totalSeconds % 60}s`].join(' ');
  }
  return `${totalSeconds}s`;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Entry point: parse the CLI options, print the run banner, mark the optional
 * background job as running, import the selected diocese categories, print the
 * summary and record the outcome on the background job.
 *
 * In `--diocese` mode the category's real name and post count are loaded from
 * the API, so new churches get the actual diocese name and the page count
 * matches the category size. If the category cannot be loaded, or the category
 * list is empty in `--all` mode, the run is reported with an error instead of
 * as a clean success.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  console.log('\n' + '='.repeat(70));
  console.log('GOTTESDIENSTZEITEN.DE (GERMANY) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.diocese ? `Diocese category ${args.diocese}` : 'All dioceses'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: diocese index ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch { /* Job might not exist */ }
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    postsFound: 0,
    churchesParsed: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };
  const existingChurches = await loadExistingGermanChurches();

  let categories: DioceseCat[];
  if (args.diocese) {
    // Look up the real category so its name (stored as the diocese of new
    // churches) and post count (which bounds paging) are accurate.
    const meta = await fetchJson(`${API_BASE}/categories/${args.diocese}`);
    if (meta && typeof meta.name === 'string' && typeof meta.count === 'number') {
      categories = [{ id: args.diocese, name: meta.name, count: meta.count }];
    } else {
      stats.errors++;
      console.error(`Could not load diocese category ${args.diocese}; nothing to import`);
      categories = [];
    }
  } else {
    categories = await fetchDioceseCategories();
    // An empty list means the category request failed or returned nothing.
    if (categories.length === 0) stats.errors++;
  }
  if (args.resumeFrom && !args.diocese) {
    categories = categories.slice(args.resumeFrom);
    console.log(`Resuming from diocese index ${args.resumeFrom} (${categories[0]?.name})\n`);
  }
  console.log(`Processing ${categories.length} diocese categories\n`);

  for (let i = 0; i < categories.length; i++) {
    const cat = categories[i];
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${i + 1}/${categories.length}] ${cat.name} (${cat.count} posts) [${elapsed} elapsed]`);
    try {
      await processDiocese(cat, existingChurches, args.dryRun, stats);
    } catch (error) {
      // One failing category must not stop the remaining ones.
      stats.errors++;
      console.error(` ERROR processing ${cat.name}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`WP posts found: ${stats.postsFound}`);
  console.log(`Churches parsed: ${stats.churchesParsed}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped (no coords): ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          processed: stats.churchesParsed,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch { /* Ignore */ }
  }
}
// Run the importer. On a fatal error the exit status is set through
// process.exitCode rather than process.exit(): exit() would terminate the
// process immediately, before the .finally() handler could disconnect Prisma
// and close the pg pool.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });