698 lines
22 KiB
TypeScript
698 lines
22 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
 * Import Catholic churches and mass schedules from kerknet.be (Flanders, Belgium).
 *
 * Kerknet is the portal of the Catholic Church in Flanders (Dutch-speaking Belgium).
 * It has ~1,200 churches with structured data: name, address, coordinates (GeoJSON),
 * and date-specific celebration entries.
 *
 * Import strategy:
 *   1. Enumerate unique church slugs by paginating the celebration listing
 *   2. Scrape each /kerk/{slug} page for structured data (name, address, coords, nodeId)
 *   3. Fetch celebrations via the AJAX endpoint, once per church
 *   4. Deduce recurring weekly schedules from date-specific celebrations
 *   5. Match against existing Belgian churches via church-matcher
 *   6. Upsert churches and mass schedules
 *
 * Usage:
 *   npx tsx scripts/import-kerknet.ts --all --dry-run
 *   npx tsx scripts/import-kerknet.ts --all
 *   npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run
 *   npx tsx scripts/import-kerknet.ts --all --resume-from 100
 */
|
||
|
|
|
||
|
|
import dotenv from 'dotenv';
|
||
|
|
import path from 'path';
|
||
|
|
|
||
|
|
// Load environment files from the current working directory, `.env.local` first, then `.env`.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
|
||
|
|
// Connection string: DATABASE_URL from the environment, otherwise a local development default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Log the connection target with the password portion of the URL masked.
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);

// SSL (without certificate verification) is enabled only when the URL mentions "neon".
const sslOptions = dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined;

// pg pool → Prisma driver adapter → Prisma client.
const pool = new Pool({ connectionString: dbUrl, ssl: sslOptions });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||
|
|
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||
|
|
|
||
|
|
// ─── Constants ───────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Origin of the kerknet.be site; every scraped URL is built from it. */
const BASE_URL = 'https://www.kerknet.be';

/** User-Agent header sent with every request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Delay between listing pages, in ms (respecting the crawl-delay spirit). */
const ENUM_DELAY_MS = 2_000;

/** Delay between church detail page fetches, in ms. */
const DETAIL_DELAY_MS = 3_000;

/** Delay between celebration AJAX calls, in ms. */
const CELEBRATION_DELAY_MS = 2_000;

/** Maximum number of attempts per request. */
const MAX_RETRIES = 3;

/** Wait between attempts of the same request, in ms. */
const RETRY_DELAY_MS = 10_000;

/** Total number of celebration listing pages. */
const MAX_ENUM_PAGES = 2_804;

/** Only every Nth listing page is checked (5 → ~560 pages to check). */
const ENUM_SAMPLE_INTERVAL = 5;

/** Stop enumerating after this many consecutive sampled pages yield no new slugs. */
const STALE_THRESHOLD = 10;

// Dutch day abbreviations → dayOfWeek (0=Sun, 1=Mon, ..., 6=Sat)
const DUTCH_DAYS: Record<string, number> = {
  zo: 0,
  ma: 1,
  di: 2,
  wo: 3,
  do: 4,
  vr: 5,
  za: 6,
};
|
||
|
|
|
||
|
|
// ─── Types ───────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Structured data scraped from a single `/kerk/{slug}` page. */
interface ChurchData {
  /** The `{slug}` path segment of the church page. */
  slug: string;
  /** Node id read from the page's `currentNid` setting; the slug is used when it is absent. */
  nodeId: string;
  /** Display name: GeoJSON description, else the page title, else the slug. */
  name: string;
  /** Street line (`thoroughfare`), or null when not present on the page. */
  address: string | null;
  /** Postal code, or null when not present on the page. */
  zip: string | null;
  /** Locality, or null when not present on the page. */
  city: string | null;
  /** Latitude (second element of the page's `coordinates` pair). */
  latitude: number;
  /** Longitude (first element of the page's `coordinates` pair). */
  longitude: number;
  /** Website link found on the page, or null. */
  website: string | null;
}
|
||
|
|
|
||
|
|
/** One date-specific celebration as listed for a church. */
interface CelebrationEntry {
  /** Lower-cased Dutch weekday abbreviation as shown in the listing. */
  dayAbbrev: string;
  /** DD/MM (empty string when the listing has no date). */
  date: string;
  /** HH.MM or HH:MM */
  time: string;
  /** Lower-cased celebration type: Eucharistie, Gebedsdienst, etc. */
  type: string;
}
|
||
|
|
|
||
|
|
/** A recurring weekly mass slot deduced from celebration entries. */
interface ParsedSchedule {
  /** 0=Sun, 1=Mon, ..., 6=Sat. */
  dayOfWeek: number;
  /** Normalized time, e.g. "15:00". */
  time: string;
}
|
||
|
|
|
||
|
|
/** Running counters for one import run, printed in the final summary. */
interface ImportStats {
  /** Number of church slugs found during enumeration. */
  slugsEnumerated: number;
  /** Church pages that were fetched and parsed successfully. */
  churchesFetched: number;
  /** Churches matched to an existing database record. */
  churchesMatched: number;
  /** Churches created as new records (or that would be, in a dry run). */
  churchesCreated: number;
  /** Churches skipped (unparseable page or unique-constraint conflict). */
  churchesSkipped: number;
  /** Mass schedule rows written (or that would be, in a dry run). */
  schedulesCreated: number;
  /** Fetch or database failures. */
  errors: number;
}
|
||
|
|
|
||
|
|
/** Parsed command-line options. */
interface CLIArgs {
  /** `--all`: import every enumerated church. */
  all: boolean;
  /** `--dry-run`: report only, no database writes. */
  dryRun: boolean;
  /** `--resume-from <n>`: number of enumerated churches to skip. */
  resumeFrom?: number;
  /** `--slug <slug>`: import a single church. */
  slug?: string;
  /** `--job-id <uuid>`: background job record to update with progress. */
  jobId?: string;
}
|
||
|
|
|
||
|
|
// ─── HTTP Helpers ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Number of page fetches started so far; also reported in the final summary. */
let requestCount = 0;

/**
 * Sleep helper.
 *
 * @param ms Milliseconds to wait.
 * @returns A promise that resolves once the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||
|
|
|
||
|
|
/**
 * Fetch a URL as text with polite throttling and bounded retries.
 *
 * Every call except the very first one made by this process waits `delayMs`
 * before issuing its request. A request is attempted up to MAX_RETRIES times,
 * pausing RETRY_DELAY_MS between attempts, for network errors and for
 * retryable HTTP statuses (5xx, 408, 429). Other 4xx responses are treated as
 * permanent and are not retried, so a missing page does not stall the crawl.
 *
 * @param url Absolute URL to fetch.
 * @param delayMs Milliseconds to wait before the request.
 * @returns The response body, or `null` when the request ultimately failed.
 */
async function fetchPage(url: string, delayMs: number): Promise<string | null> {
  if (requestCount > 0) {
    await delay(delayMs);
  }
  requestCount++;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    const isLastAttempt = attempt === MAX_RETRIES;

    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': USER_AGENT },
      });

      if (response.ok) {
        return await response.text();
      }

      const { status } = response;
      // Client errors other than timeout / rate-limit will not change on retry.
      const isPermanent = status >= 400 && status < 500 && status !== 408 && status !== 429;
      if (isPermanent || isLastAttempt) {
        console.error(` HTTP ${status} for ${url} — giving up`);
        return null;
      }

      console.log(` HTTP ${status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
      await delay(RETRY_DELAY_MS);
    } catch (error) {
      if (isLastAttempt) {
        console.error(` Fetch failed after ${MAX_RETRIES} attempts: ${error instanceof Error ? error.message : error}`);
        return null;
      }
      console.log(` Network error — retrying (attempt ${attempt}/${MAX_RETRIES})`);
      await delay(RETRY_DELAY_MS);
    }
  }

  // Only reachable when MAX_RETRIES is less than 1.
  return null;
}
|
||
|
|
|
||
|
|
// ─── Phase 1: Enumerate Church Slugs ─────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Phase 1: collect the unique church slugs linked from the celebration listing.
 *
 * Only every ENUM_SAMPLE_INTERVAL-th listing page is requested. A counter of
 * consecutive unproductive pages (failed fetch, or no slug not already seen)
 * ends the scan early once it reaches STALE_THRESHOLD.
 *
 * @returns The discovered slugs, sorted.
 */
async function enumerateChurchSlugs(): Promise<string[]> {
  console.log('\nPhase 1: Enumerating church slugs from celebration listings...');

  const slugs = new Set<string>();
  let staleStreak = 0;

  for (let page = 0; page < MAX_ENUM_PAGES; page += ENUM_SAMPLE_INTERVAL) {
    const html = await fetchPage(`${BASE_URL}/zoeken/vieringen/lijst?page=${page}`, ENUM_DELAY_MS);

    if (!html) {
      staleStreak++;
      if (staleStreak >= STALE_THRESHOLD) {
        console.log(` Stopping enumeration: ${STALE_THRESHOLD} consecutive empty pages`);
        break;
      }
      continue;
    }

    // Extract /kerk/{slug} links (church building pages, NOT org pages like /kerk-jette/artikel/)
    const sizeBefore = slugs.size;
    for (const link of html.matchAll(/href="\/kerk\/([^"/]+)"/g)) {
      slugs.add(link[1]);
    }
    const newCount = slugs.size - sizeBefore;

    staleStreak = newCount === 0 ? staleStreak + 1 : 0;

    if (page % 50 === 0 || newCount > 0) {
      console.log(` Page ${page}: ${slugs.size} unique churches found (+${newCount})`);
    }

    if (staleStreak >= STALE_THRESHOLD) {
      console.log(` Stopping enumeration: ${STALE_THRESHOLD} consecutive sampled pages with no new churches`);
      break;
    }
  }

  console.log(` Enumeration complete: ${slugs.size} unique church slugs found\n`);
  return [...slugs].sort();
}
|
||
|
|
|
||
|
|
// ─── Phase 2: Scrape Church Detail Page ──────────────────────────────────────
|
||
|
|
|
||
|
|
/** Named HTML entities decoded by `decodeHtmlEntities`. */
const HTML_NAMED_ENTITIES: Record<string, string> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: '\u00a0',
};

/** Decode numeric and the common named HTML entities; unknown entities are left untouched. */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      const isValid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
      return isValid ? String.fromCodePoint(codePoint) : entity;
    }
    return HTML_NAMED_ENTITIES[body.toLowerCase()] ?? entity;
  });
}

/** Decode the escape sequences of a string captured from inside a JSON document. */
function decodeJsonString(raw: string): string {
  try {
    const decoded: unknown = JSON.parse(`"${raw}"`);
    return typeof decoded === 'string' ? decoded : raw;
  } catch {
    // Not a valid JSON string body (e.g. a raw control character): keep the text as captured.
    return raw;
  }
}

/**
 * Phase 2: extract structured church data from a `/kerk/{slug}` page.
 *
 * @param html Full HTML of the church page.
 * @param slug Slug the page was fetched with; used as the fallback for the
 *   node id and the name.
 * @returns The parsed church, or `null` when the page has no usable
 *   coordinates (missing, non-numeric, or exactly 0/0).
 */
function parseChurchPage(html: string, slug: string): ChurchData | null {
  // Extract coordinates from GeoJSON in Drupal settings (GeoJSON order is [lng, lat])
  const coordMatch = html.match(/"coordinates":\[(-?[\d.]+),(-?[\d.]+)\]/);
  if (!coordMatch) return null; // No coordinates = unusable

  const longitude = parseFloat(coordMatch[1]);
  const latitude = parseFloat(coordMatch[2]);
  // The pattern admits strings such as "1.2.3" or "."; reject anything that is not a real number.
  if (!Number.isFinite(latitude) || !Number.isFinite(longitude)) return null;
  if (latitude === 0 && longitude === 0) return null;

  // Extract node ID
  const nidMatch = html.match(/"currentNid":"(\d+)"/);
  const nodeId = nidMatch ? nidMatch[1] : slug;

  // Extract name from GeoJSON description or page title
  let name = slug;
  // Allow escaped characters (\" \/ \uXXXX) inside the JSON string, then decode them.
  const descMatch = html.match(/"description":"((?:[^"\\]|\\.)+)"/);
  if (descMatch) {
    name = decodeJsonString(descMatch[1]);
  } else {
    const titleMatch = html.match(/<title>([^|<]+)/);
    if (titleMatch) name = decodeHtmlEntities(titleMatch[1]).trim();
  }

  // Extract address fields; the text comes from HTML, so entities such as &#039; are decoded
  const textOf = (match: RegExpMatchArray | null): string | null =>
    match ? decodeHtmlEntities(match[1]).trim() : null;

  const address = textOf(html.match(/class="thoroughfare">([^<]+)</));
  const zip = textOf(html.match(/class="postal-code">([^<]+)</));
  const city = textOf(html.match(/class="locality">([^<]+)</));

  // Extract website, falling back to the field-name-kn-website pattern
  const websiteMatch =
    html.match(/class="website"[^>]*>.*?href="([^"]+)"/s) ??
    html.match(/field-name-kn-website.*?href="([^"]+)"/s);
  // Attribute values are HTML-escaped too (&amp; inside query strings).
  const website = websiteMatch ? decodeHtmlEntities(websiteMatch[1]) : null;

  return { slug, nodeId, name, address, zip, city, latitude, longitude, website };
}
|
||
|
|
|
||
|
|
// ─── Phase 3: Parse Celebrations ─────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Phase 3: parse the celebration entries out of the celebrations HTML fragment.
 *
 * The fragment is split on `<div class="celebration">`; each piece must
 * contain a weekday abbreviation and a time to produce an entry. Times are
 * accepted in both documented notations ("HH.MM" and "HH:MM").
 *
 * @param html HTML returned by the celebrations endpoint.
 * @returns One entry per parseable celebration, in document order. `date` is
 *   `''` when absent; `type` defaults to `'eucharistie'` when absent.
 */
function parseCelebrations(html: string): CelebrationEntry[] {
  const entries: CelebrationEntry[] = [];

  // Everything before the first marker is page chrome, hence slice(1).
  const celebBlocks = html.split('<div class="celebration">').slice(1);

  for (const block of celebBlocks) {
    // Day abbreviation and time are mandatory; skip the block without them.
    const dayMatch = block.match(/celebration__date__day">\s*(\w+)\s*</);
    const timeMatch = block.match(/celebration__time">\s*([\d.:]+)\s*</);
    if (!dayMatch || !timeMatch) continue;

    // Date (DD/MM) and type are optional.
    const dateMatch = block.match(/celebration__date__date">\s*([\d/]+)\s*</);
    const typeMatch = block.match(/celebration__info__type">\s*([^<]+)\s*</);

    entries.push({
      dayAbbrev: dayMatch[1].toLowerCase().trim(),
      date: dateMatch ? dateMatch[1].trim() : '',
      time: timeMatch[1].trim(),
      type: typeMatch ? typeMatch[1].trim().toLowerCase() : 'eucharistie',
    });
  }

  return entries;
}
|
||
|
|
|
||
|
|
/**
 * Collapse date-specific celebrations into unique weekly (day, time) slots.
 *
 * Only entries whose type is `eucharistie` or `eucharistieviering` are
 * considered, and entries whose day abbreviation is not in DUTCH_DAYS are
 * dropped. Times are normalized from "9.30" to "09:30".
 *
 * @param celebrations Entries as returned by `parseCelebrations`.
 * @returns Distinct slots in order of first appearance.
 */
function deduceSchedules(celebrations: CelebrationEntry[]): ParsedSchedule[] {
  // Keyed by "day:time"; a Map keeps first-seen insertion order.
  const slots = new Map<string, ParsedSchedule>();

  for (const { type, dayAbbrev, time: rawTime } of celebrations) {
    // Only keep Eucharistie (mass) entries
    if (type !== 'eucharistie' && type !== 'eucharistieviering') continue;

    const dayOfWeek = DUTCH_DAYS[dayAbbrev];
    if (dayOfWeek === undefined) continue;

    // Normalize time: "15.00" → "15:00", and zero-pad a single-digit hour
    const time = rawTime.replace('.', ':').replace(/^(\d):/, '0$1:');

    const slotKey = `${dayOfWeek}:${time}`;
    if (!slots.has(slotKey)) {
      slots.set(slotKey, { dayOfWeek, time });
    }
  }

  return [...slots.values()];
}
|
||
|
|
|
||
|
|
// ─── Database Operations ─────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Load every church stored with country `BE`, selecting the fields the
 * duplicate matcher works with (name, coordinates, per-source ids, contact data).
 *
 * @returns The existing Belgian churches.
 */
async function loadExistingBelgianChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Belgian churches for deduplication...');

  const belgianChurches = await prisma.church.findMany({
    where: { country: 'BE' },
    select: {
      // Identity and location
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External source identifiers
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${belgianChurches.length} existing Belgian churches`);
  return belgianChurches;
}
|
||
|
|
|
||
|
|
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** True when `error` looks like a Prisma unique-constraint violation (code P2002 or matching message). */
function isUniqueConstraintError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;
  const code = 'code' in error ? (error as Error & { code?: unknown }).code : undefined;
  return code === 'P2002' || error.message.includes('Unique constraint');
}

/**
 * Write the weekly schedules of one church and stamp `lastScrapedAt`, all in a
 * single transaction so a failure never leaves the church half-updated.
 *
 * @param churchId Database id of the church.
 * @param schedules Slots to insert (language is stored as 'Dutch').
 * @param replaceExisting When true, the church's current schedules are deleted first.
 */
async function saveChurchSchedules(
  churchId: ExistingChurch['id'],
  schedules: ParsedSchedule[],
  replaceExisting: boolean,
): Promise<void> {
  await prisma.$transaction(async (tx) => {
    if (replaceExisting) {
      await tx.massSchedule.deleteMany({ where: { churchId } });
    }
    await tx.massSchedule.createMany({
      data: schedules.map((s) => ({
        churchId,
        dayOfWeek: s.dayOfWeek,
        time: s.time,
        language: 'Dutch',
      })),
    });
    await tx.church.update({
      where: { id: churchId },
      data: { lastScrapedAt: new Date() },
    });
  });
}

/**
 * Import a single church: scrape its page and celebrations, match it against
 * the known churches, then update the match or create a new record.
 *
 * In dry-run mode only the counters are updated. For a matched church the
 * kerknet id is attached, a missing address/website is filled in, and its
 * schedules are replaced when new ones were deduced. For a new church the
 * record is created, appended to `existingChurches` (so later candidates can
 * match it), and its schedules are inserted.
 *
 * @param slug Church slug on kerknet.be.
 * @param existingChurches Known churches used for duplicate detection; mutated when a church is created.
 * @param dryRun When true, no database writes are performed.
 * @param stats Counters updated in place.
 * @throws Re-throws any non-unique-constraint error raised while updating a matched church.
 */
async function processChurch(
  slug: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Fetch church detail page
  const churchHtml = await fetchPage(`${BASE_URL}/kerk/${slug}`, DETAIL_DELAY_MS);
  if (!churchHtml) {
    stats.errors++;
    return;
  }

  const church = parseChurchPage(churchHtml, slug);
  if (!church) {
    stats.churchesSkipped++;
    return;
  }

  stats.churchesFetched++;

  // Fetch celebrations via AJAX; a failed fetch simply yields no schedules
  const celebHtml = await fetchPage(
    `${BASE_URL}/kerknet-celebration/churches/ajax/load-more/0/${church.nodeId}`,
    CELEBRATION_DELAY_MS,
  );
  const celebrations: CelebrationEntry[] = celebHtml ? parseCelebrations(celebHtml) : [];
  const schedules = deduceSchedules(celebrations);

  const kerknetId = `kerknet-${church.nodeId}`;
  const candidate = {
    name: church.name,
    lat: church.latitude,
    lng: church.longitude,
    kerknetId,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
    } else {
      stats.churchesCreated++;
    }
    stats.schedulesCreated += schedules.length;
    return;
  }

  // Shared by both branches: store schedules, counting a failure without aborting the run.
  const persistSchedules = async (churchId: ExistingChurch['id'], replaceExisting: boolean): Promise<void> => {
    if (schedules.length === 0) return;
    try {
      await saveChurchSchedules(churchId, schedules, replaceExisting);
      stats.schedulesCreated += schedules.length;
    } catch (error) {
      stats.errors++;
      console.error(` Error saving schedules for ${slug}: ${error instanceof Error ? error.message : error}`);
    }
  };

  if (duplicate) {
    stats.churchesMatched++;

    // Attach the kerknet id and only fill in fields the existing record lacks.
    const updateData: Record<string, unknown> = { kerknetId };
    if (!duplicate.address && church.address) updateData.address = church.address;
    if (!duplicate.website && church.website) updateData.website = church.website;

    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      if (isUniqueConstraintError(error)) {
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }

    await persistSchedules(duplicate.id, true);
    return;
  }

  let newChurchId: ExistingChurch['id'];
  try {
    const newChurch = await prisma.church.create({
      data: {
        name: church.name,
        latitude: church.latitude,
        longitude: church.longitude,
        address: church.address,
        zip: church.zip,
        city: church.city,
        country: 'BE',
        website: church.website,
        hasWebsite: !!church.website,
        kerknetId,
        source: 'kerknet',
        websiteLanguage: 'nl',
      },
    });
    stats.churchesCreated++;
    newChurchId = newChurch.id;

    // Make the new record visible to duplicate detection for the rest of the run.
    existingChurches.push({
      id: newChurch.id,
      name: church.name,
      latitude: church.latitude,
      longitude: church.longitude,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: null,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId: null,
      kerknetId,
      gottesdienstzeitenId: null,
      discovermassId: null,
      source: 'kerknet',
      website: church.website,
      phone: null,
      address: church.address,
    });
  } catch (error) {
    if (isUniqueConstraintError(error)) {
      stats.churchesSkipped++;
      return;
    }
    stats.errors++;
    console.error(` Error creating ${slug}: ${error instanceof Error ? error.message : error}`);
    return;
  }

  // A brand-new church has no schedules to replace.
  await persistSchedules(newChurchId, false);
}
|
||
|
|
|
||
|
|
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Parse the command-line flags of the importer from `process.argv`.
 *
 * Prints usage and exits with code 0 for `--help`/`-h`. Exits with code 1 when
 * a flag is missing its value, when `--resume-from` is not a non-negative
 * integer, or when neither `--all` nor `--slug` was given. Unrecognized
 * arguments are ignored.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Value following a flag; a missing value (or another flag in its place) is a usage error.
  const valueFor = (flag: string, index: number): string => {
    const value = args[index];
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = valueFor('--resume-from', ++i);
        // Digits only: rejects "abc", "-5" and "10.5", which parseInt would mangle into NaN or a wrong index.
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--slug':
        result.slug = valueFor('--slug', ++i);
        break;
      case '--job-id':
        result.jobId = valueFor('--job-id', ++i);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-kerknet.ts [options]

Options:
  --all               Import all churches from kerknet.be
  --slug <slug>       Import a single church (e.g., o-l-vrouw-kerk-scherpenheuvel)
  --dry-run           No database writes, just report what would happen
  --resume-from <n>   Skip first N churches (after enumeration)
  --job-id <uuid>     Background job tracking ID
  --help, -h          Show this help message

Examples:
  npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run
  npx tsx scripts/import-kerknet.ts --all --dry-run
  npx tsx scripts/import-kerknet.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.slug) {
    console.error('Error: specify --all or --slug <slug>');
    process.exit(1);
  }

  return result;
}
|
||
|
|
|
||
|
|
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", using the
 * largest unit that is non-zero.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const wholeHours = Math.floor(totalMinutes / 60);

  const secondsPart = `${totalSeconds % 60}s`;

  if (wholeHours > 0) {
    return [`${wholeHours}h`, `${totalMinutes % 60}m`, secondsPart].join(' ');
  }
  if (totalMinutes > 0) {
    return [`${totalMinutes}m`, secondsPart].join(' ');
  }
  return `${totalSeconds}s`;
}
|
||
|
|
|
||
|
|
// ─── Main ────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Script entry point: parse the CLI options, print the run banner, load the
 * existing Belgian churches, resolve the slugs to process (a single slug or
 * the full enumeration, optionally resumed from an index), process each
 * church, and print the summary. When a job id was supplied, the matching
 * background job record is marked running at the start and completed at the
 * end; failures to update that record are deliberately ignored.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const rule = '='.repeat(70);

  console.log('\n' + rule);
  console.log('KERKNET.BE (BELGIUM/FLANDERS) IMPORTER');
  console.log(rule);
  console.log(`Mode: ${args.slug ? `Single: ${args.slug}` : 'All churches'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(rule + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      /* Job might not exist */
    }
  }

  const stats: ImportStats = {
    slugsEnumerated: 0,
    churchesFetched: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  const existingChurches = await loadExistingBelgianChurches();

  // Get list of church slugs: either the one requested or the full enumeration
  let slugs: string[];
  if (args.slug) {
    slugs = [args.slug];
  } else {
    slugs = await enumerateChurchSlugs();
    stats.slugsEnumerated = slugs.length;
  }

  if (args.resumeFrom && !args.slug) {
    slugs = slugs.slice(args.resumeFrom);
    console.log(`Resuming from church index ${args.resumeFrom} (${slugs[0]})\n`);
  }

  console.log(`Processing ${slugs.length} churches\n`);

  for (const [index, slug] of slugs.entries()) {
    // Progress line every 50 churches, or for every church on short runs.
    if (index % 50 === 0 || slugs.length <= 10) {
      const elapsed = formatDuration(Date.now() - startTime);
      console.log(`[${index + 1}/${slugs.length}] ${slug} [${elapsed} elapsed, ${stats.churchesCreated} new, ${stats.churchesMatched} matched]`);
    }

    try {
      await processChurch(slug, existingChurches, args.dryRun, stats);
    } catch (error) {
      // One failing church must not abort the whole run.
      stats.errors++;
      console.error(` ERROR processing ${slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  const summaryLines = [
    '\n' + rule,
    `IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`,
    rule,
    `Slugs enumerated: ${stats.slugsEnumerated}`,
    `Churches fetched: ${stats.churchesFetched}`,
    ` Matched (existing): ${stats.churchesMatched}`,
    ` Created (new): ${stats.churchesCreated}`,
    ` Skipped: ${stats.churchesSkipped}`,
    `Schedules created: ${stats.schedulesCreated}`,
    `Errors: ${stats.errors}`,
    `Total time: ${formatDuration(totalTime)}`,
    `HTTP requests: ${requestCount}`,
    rule + '\n',
  ];
  for (const line of summaryLines) {
    console.log(line);
  }

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          processed: stats.churchesFetched,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch {
      /* Ignore */
    }
  }
}
|
||
|
|
|
||
|
|
// Run the importer. A fatal error sets the exit code instead of calling
// process.exit(), which would end the process before the cleanup below runs.
main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    // Always release database resources; close the pool even if the Prisma disconnect fails.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  });
|