From a046928ed081f7e800d3082af384b380eeeb16ab Mon Sep 17 00:00:00 2001 From: albertfj114 Date: Wed, 11 Mar 2026 06:52:05 -0400 Subject: [PATCH] feat: add discovermassId to church-matcher ExistingChurch and ChurchCandidate Add discovermassId field to ExistingChurch interface and ChurchCandidate type, insert a dedicated matching pass in findDuplicateChurch, and update all 15 importer push blocks plus 16 loadExistingChurches select queries to include the new field. Co-Authored-By: Claude Sonnet 4.6 --- scripts/import-baidu-churches.ts | 325 ++++++++ scripts/import-bohosluzby.ts | 641 +++++++++++++++ scripts/import-gcatholic.ts | 834 +++++++++++++++++++ scripts/import-gottesdienstzeiten.ts | 686 ++++++++++++++++ scripts/import-horariosmisas.ts | 1028 +++++++++++++++++++++++ scripts/import-kerknet.ts | 697 ++++++++++++++++ scripts/import-mass-schedules-ph.ts | 695 ++++++++++++++++ scripts/import-masstimes-api.ts | 672 +++++++++++++++ scripts/import-messesinfo.ts | 681 ++++++++++++++++ scripts/import-miserend.ts | 579 +++++++++++++ scripts/import-msze-info.ts | 746 +++++++++++++++++ scripts/import-orarimesse.ts | 771 ++++++++++++++++++ scripts/import-osm-churches.ts | 616 ++++++++++++++ scripts/import-osm-region.ts | 346 ++++++++ scripts/import-philmass.ts | 742 +++++++++++++++++ scripts/import-weekdaymasses.ts | 1121 ++++++++++++++++++++++++++ src/lib/church-matcher.ts | 396 +++++++++ 17 files changed, 11576 insertions(+) create mode 100644 scripts/import-baidu-churches.ts create mode 100644 scripts/import-bohosluzby.ts create mode 100644 scripts/import-gcatholic.ts create mode 100644 scripts/import-gottesdienstzeiten.ts create mode 100644 scripts/import-horariosmisas.ts create mode 100644 scripts/import-kerknet.ts create mode 100644 scripts/import-mass-schedules-ph.ts create mode 100644 scripts/import-masstimes-api.ts create mode 100644 scripts/import-messesinfo.ts create mode 100644 scripts/import-miserend.ts create mode 100644 scripts/import-msze-info.ts create mode 
100644 scripts/import-orarimesse.ts create mode 100644 scripts/import-osm-churches.ts create mode 100644 scripts/import-osm-region.ts create mode 100644 scripts/import-philmass.ts create mode 100644 scripts/import-weekdaymasses.ts create mode 100644 src/lib/church-matcher.ts diff --git a/scripts/import-baidu-churches.ts b/scripts/import-baidu-churches.ts new file mode 100644 index 0000000..b62f6c6 --- /dev/null +++ b/scripts/import-baidu-churches.ts @@ -0,0 +1,325 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches from Baidu Maps (China) + * Usage: + * npx tsx scripts/import-baidu-churches.ts + * npx tsx scripts/import-baidu-churches.ts --dry-run + * npx tsx scripts/import-baidu-churches.ts --resume-from-cell 100 + * npx tsx scripts/import-baidu-churches.ts --job-id + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { queryBaiduByGrid, type BaiduChurch } from '../src/lib/baidu-client'; +import { findDuplicateChurch, mergeBaiduData, type ExistingChurch } from '../src/lib/church-matcher'; + +interface ImportStats { + baiduChurchesFound: number; + newChurchesInserted: number; + existingUpdated: number; + existingLinked: number; + errors: number; +} + +function parseArgs(): { dryRun: boolean; resumeFromCell: number; jobId?: string } { + const args = process.argv.slice(2); + const result = { + dryRun: false, + resumeFromCell: 0, + jobId: undefined as string | undefined, + }; + + for (let i = 0; i < args.length; i++) { + if (args[i] === '--dry-run') { + result.dryRun = true; + } else if (args[i] === '--resume-from-cell' && args[i + 1]) { + result.resumeFromCell = parseInt(args[i + 1], 10); + i++; + } else if (args[i] === '--job-id' && args[i + 1]) { + result.jobId = args[i + 1]; + i++; + } + } + + return result; +} + +async function createOrResumeJob(jobId?: string): Promise { + if (jobId) { + await prisma.backgroundJob.update({ + where: { id: jobId }, + data: { status: 'running', startedAt: new Date() }, + }); + return jobId; + } + return null; +} + +async function completeJob(jobId: string | null, error?: string): Promise { + if (!jobId) return; + try { + await prisma.backgroundJob.update({ + where: { id: jobId }, + data: { + status: error ? 
'failed' : 'completed', + error: error || null, + completedAt: new Date(), + }, + }); + } catch (err) { + console.error(`Failed to update job ${jobId}:`, err); + } +} + +async function updateJobProgress(jobId: string | null, stats: ImportStats, totalCells: number, currentCell: number): Promise { + if (!jobId) return; + try { + await prisma.backgroundJob.update({ + where: { id: jobId }, + data: { + totalItems: totalCells, + processed: currentCell, + succeeded: stats.newChurchesInserted + stats.existingUpdated + stats.existingLinked, + failed: stats.errors, + itemsFound: stats.baiduChurchesFound, + }, + }); + } catch (err) { + // Non-fatal — just log it + console.error(`Failed to update job progress:`, err); + } +} + +async function importFromBaidu(dryRun: boolean, resumeFromCell: number, jobId: string | null): Promise { + const stats: ImportStats = { + baiduChurchesFound: 0, + newChurchesInserted: 0, + existingUpdated: 0, + existingLinked: 0, + errors: 0, + }; + + const apiKey = process.env.BAIDU_MAPS_API_KEY; + if (!apiKey) { + throw new Error('Missing BAIDU_MAPS_API_KEY environment variable'); + } + + console.log(`\n${'='.repeat(60)}`); + console.log(`Importing Catholic churches from Baidu Maps (China)`); + console.log(`${'='.repeat(60)}\n`); + + // Step 1: Query Baidu API + console.log('Step 1: Querying Baidu Maps API...'); + const baiduChurches = await queryBaiduByGrid( + apiKey, + (progress) => { + updateJobProgress(jobId, stats, progress.totalCells, progress.cellIndex); + }, + resumeFromCell, + ); + + stats.baiduChurchesFound = baiduChurches.length; + console.log(`\nFound ${baiduChurches.length} churches from Baidu Maps`); + + if (baiduChurches.length === 0) { + console.log('No churches found'); + return stats; + } + + if (dryRun) { + console.log('\n[DRY RUN] Would import the following churches:'); + baiduChurches.slice(0, 20).forEach((church) => { + console.log(` - ${church.name} (${church.city || church.province || 'unknown'})`); + console.log(` Baidu ID: 
${church.baiduId}, Coords: ${church.lat.toFixed(4)}, ${church.lng.toFixed(4)}`); + }); + if (baiduChurches.length > 20) { + console.log(` ... and ${baiduChurches.length - 20} more`); + } + return stats; + } + + // Step 2: Load existing churches in China for deduplication + console.log('\nStep 2: Loading existing churches in China for deduplication...'); + const existingChurches: ExistingChurch[] = await prisma.church.findMany({ + where: { country: 'CN' }, + select: { + id: true, + name: true, + latitude: true, + longitude: true, + osmId: true, + baiduId: true, + masstimesId: true, + orarimesseId: true, + massSchedulesPhId: true, + philmassId: true, + horariosMisasId: true, + mszeInfoId: true, + weekdayMassesId: true, + messesInfoId: true, + bohosluzbyId: true, + miserendId: true, + kerknetId: true, + gottesdienstzeitenId: true, + discovermassId: true, + source: true, + website: true, + phone: true, + address: true, + }, + }); + console.log(`Found ${existingChurches.length} existing churches in China`); + + // Step 3: Process each Baidu church + console.log('\nStep 3: Processing churches...'); + let processed = 0; + + for (const baiduChurch of baiduChurches) { + try { + const candidate = { + name: baiduChurch.name, + lat: baiduChurch.lat, + lng: baiduChurch.lng, + baiduId: baiduChurch.baiduId, + }; + + const duplicate = findDuplicateChurch(candidate, existingChurches); + + if (duplicate && duplicate.baiduId === baiduChurch.baiduId) { + // Existing church with matching baiduId — update it + const mergedData = mergeBaiduData(duplicate, baiduChurch); + await prisma.church.update({ + where: { id: duplicate.id }, + data: mergedData, + }); + stats.existingUpdated++; + } else if (duplicate) { + // Existing church matched by proximity/name — link it with baiduId + const mergedData = mergeBaiduData(duplicate, baiduChurch); + await prisma.church.update({ + where: { id: duplicate.id }, + data: mergedData, + }); + stats.existingLinked++; + } else { + // New church — insert it + 
const newChurch = await prisma.church.create({ + data: { + name: baiduChurch.name, + latitude: baiduChurch.lat, + longitude: baiduChurch.lng, + address: baiduChurch.address, + city: baiduChurch.city, + state: baiduChurch.province, + country: 'CN', + phone: baiduChurch.phone, + website: baiduChurch.website, + source: 'baidu', + baiduId: baiduChurch.baiduId, + baiduLastSyncedAt: new Date(), + hasWebsite: !!baiduChurch.website, + }, + }); + stats.newChurchesInserted++; + + // Add to existing churches list for dedup within this run + existingChurches.push({ + id: newChurch.id, + name: baiduChurch.name, + latitude: baiduChurch.lat, + longitude: baiduChurch.lng, + osmId: null, + baiduId: baiduChurch.baiduId, + masstimesId: null, + orarimesseId: null, + massSchedulesPhId: null, + philmassId: null, + horariosMisasId: null, + mszeInfoId: null, + weekdayMassesId: null, + messesInfoId: null, + bohosluzbyId: null, + miserendId: null, + kerknetId: null, + gottesdienstzeitenId: null, + discovermassId: null, + source: 'baidu', + website: baiduChurch.website || null, + phone: baiduChurch.phone || null, + address: baiduChurch.address || null, + }); + } + + processed++; + if (processed % 500 === 0) { + console.log(`Progress: ${processed}/${baiduChurches.length} churches processed`); + await updateJobProgress(jobId, stats, baiduChurches.length, processed); + } + } catch (error) { + console.error(`Error processing church ${baiduChurch.name} (${baiduChurch.baiduId}):`, error); + stats.errors++; + } + } + + console.log(`\nProcessed all ${baiduChurches.length} churches`); + return stats; +} + +function printSummary(stats: ImportStats, dryRun: boolean) { + console.log(`\n${'='.repeat(60)}`); + console.log(`Baidu Import Summary ${dryRun ? 
'(DRY RUN)' : ''}`); + console.log(`${'='.repeat(60)}`); + console.log(`Baidu churches found: ${stats.baiduChurchesFound}`); + + if (!dryRun) { + console.log(`New churches inserted: ${stats.newChurchesInserted}`); + console.log(`Existing churches updated: ${stats.existingUpdated} (matched by baiduId)`); + console.log(`Existing churches linked: ${stats.existingLinked} (matched by proximity)`); + } + + if (!dryRun && stats.errors > 0) { + console.log(`Errors encountered: ${stats.errors}`); + } + + console.log(`${'='.repeat(60)}\n`); +} + +async function main() { + const { dryRun, resumeFromCell, jobId: argJobId } = parseArgs(); + const jobId = await createOrResumeJob(argJobId); + + if (dryRun) { + console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n'); + } + + try { + const stats = await importFromBaidu(dryRun, resumeFromCell, jobId); + printSummary(stats, dryRun); + await completeJob(jobId); + } catch (error) { + console.error('Fatal error:', error); + await completeJob(jobId, String(error)); + process.exit(1); + } finally { + await prisma.$disconnect(); + } +} + +main(); diff --git a/scripts/import-bohosluzby.ts b/scripts/import-bohosluzby.ts new file mode 100644 index 0000000..f7888bd --- /dev/null +++ b/scripts/import-bohosluzby.ts @@ -0,0 +1,641 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from bohosluzby.cz (Czech Republic) + * + * bohosluzby.cz is the official Czech bishops' conference mass schedule finder. + * It exposes a JSON API with two main endpoints: + * - POST /index.php/apiWeb/allData — returns all churches (clustered by zoom level) + * - GET /index.php/apiWeb/detailById?id={id} — returns mass schedule details + * + * The API requires no authentication. We fetch all churches at zoom=7 (covers + * all of Czech Republic in one request with clustered results), then fetch + * individual detail pages for mass schedules. + * + * Import strategy: + * 1. 
Fetch all churches via allData endpoint (zoom=7, centered on Czech Republic) + * 2. Flatten clustered results to get individual church records + * 3. For each church, fetch detail to get mass schedules + * 4. Match against existing Czech churches via church-matcher + * 5. Upsert churches and mass schedules + * + * Usage: + * npx tsx scripts/import-bohosluzby.ts --all --dry-run + * npx tsx scripts/import-bohosluzby.ts --all + * npx tsx scripts/import-bohosluzby.ts --id 10009 --dry-run # Single church + * npx tsx scripts/import-bohosluzby.ts --all --resume-from 500 + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const BASE_URL = 'https://bohosluzby.cirkev.cz'; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const REQUEST_DELAY_MS = 500; // Be polite — 0.5s between detail requests +const RETRY_DELAY_MS = 5000; +const MAX_RETRIES = 3; + +// Czech Republic center coordinates for the allData request +const CZ_CENTER_LAT = 49.8; +const CZ_CENTER_LNG = 15.5; +const CZ_ZOOM = 7; // Returns all churches clustered into ~7 groups + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface BohosluzbyChurch { + id: string; + name: string; + street: string | null; + city: string | null; + psc: string | null; // zip code + latitude: number; + longitude: number; + type: string; // KOSTEL, KAPLE, etc. +} + +interface BohosluzbySchedule { + dayOfWeek: number; // 0=Sunday, 1=Monday, ... + time: string; // HH:MM + language: string; + type: string; // "mše sv.", "růženec", etc. 
+ note: string | null; +} + +interface ImportStats { + churchesFetched: number; + detailsFetched: number; + churchesMatched: number; + churchesCreated: number; + churchesSkipped: number; + schedulesCreated: number; + errors: number; +} + +interface CLIArgs { + all: boolean; + dryRun: boolean; + resumeFrom?: number; + churchId?: string; + jobId?: string; +} + +// ─── HTTP Client ───────────────────────────────────────────────────────────── + +let requestCount = 0; + +function delay(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchWithRetry(url: string, options: RequestInit = {}): Promise { + if (requestCount > 0) { + await delay(REQUEST_DELAY_MS); + } + requestCount++; + + for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { + try { + const response = await fetch(url, { + ...options, + headers: { + 'User-Agent': USER_AGENT, + ...options.headers, + }, + }); + + if (response.status === 503 || response.status === 429) { + if (attempt < MAX_RETRIES) { + console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`); + await delay(RETRY_DELAY_MS); + continue; + } + console.error(` HTTP ${response.status} after ${MAX_RETRIES} attempts`); + return null; + } + + if (!response.ok) { + console.error(` HTTP ${response.status} from ${url}`); + return null; + } + + return await response.json(); + } catch (error) { + if (attempt < MAX_RETRIES) { + console.log(` Network error — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`); + await delay(RETRY_DELAY_MS); + continue; + } + console.error(` API error after ${MAX_RETRIES} attempts: ${error instanceof Error ? error.message : error}`); + return null; + } + } + return null; +} + +// ─── API Methods ───────────────────────────────────────────────────────────── + +/** + * Fetch all churches from the allData endpoint. 
+ * Returns clustered results at zoom=7 — we flatten the clusters to get + * individual church records with id, name, lat, lng, city, street. + */ +async function fetchAllChurches(): Promise { + console.log('Fetching all churches from allData endpoint...'); + + const params = new URLSearchParams(); + params.append('institutionTypes', "'KOSTEL'"); + params.append('latitude', String(CZ_CENTER_LAT)); + params.append('longitude', String(CZ_CENTER_LNG)); + params.append('zoom', String(CZ_ZOOM)); + + const data = await fetchWithRetry(`${BASE_URL}/index.php/apiWeb/allData`, { + method: 'POST', + body: params, + headers: { 'Content-Type': 'application/x-www-form-urlencoded' }, + }); + + if (!data) { + console.error('Failed to fetch allData'); + return []; + } + + const churches: BohosluzbyChurch[] = []; + const kostelData = data["'KOSTEL'"] || []; + + for (const cluster of kostelData) { + // Add the cluster representative + churches.push({ + id: cluster.id, + name: cluster.name, + street: cluster.street || null, + city: cluster.city || null, + psc: cluster.psc || null, + latitude: parseFloat(cluster.latitude), + longitude: parseFloat(cluster.longitude), + type: cluster.type || 'KOSTEL', + }); + + // Add churches from the indices array (sub-items in the cluster) + if (Array.isArray(cluster.indices)) { + for (const sub of cluster.indices) { + churches.push({ + id: sub.id, + name: sub.name, + street: sub.street || null, + city: sub.city || null, + psc: sub.psc || null, + latitude: parseFloat(sub.latitude), + longitude: parseFloat(sub.longitude), + type: sub.type || 'KOSTEL', + }); + } + } + } + + console.log(`Fetched ${churches.length} churches from allData`); + return churches; +} + +/** + * Fetch mass schedule details for a single church. + * Returns parsed regular mass schedules. 
+ */ +async function fetchChurchDetail(churchId: string): Promise { + const data = await fetchWithRetry(`${BASE_URL}/index.php/apiWeb/detailById?id=${churchId}`); + if (!data || !data.church) return []; + + const schedules: BohosluzbySchedule[] = []; + const regular = data.church.regular || []; + + for (const entry of regular) { + // Only import "mše sv." (Holy Mass) entries + if (entry.chst_name && !entry.chst_name.includes('mše')) continue; + + const time = entry.cas; // Already in HH:MM format + if (!time) continue; + + // Parse periodic_days: "12345" = Mon-Fri, "6" = Sat, "7" = Sun + // Convert to our dayOfWeek: 0=Sun, 1=Mon, ..., 6=Sat + const periodicDays = entry.periodic_days || ''; + for (const dayChar of periodicDays) { + const bohosluzbyDay = parseInt(dayChar); + if (isNaN(bohosluzbyDay)) continue; + + // bohosluzby: 1=Mon, 2=Tue, ..., 6=Sat, 7=Sun + // Our format: 0=Sun, 1=Mon, ..., 6=Sat + const dayOfWeek = bohosluzbyDay === 7 ? 0 : bohosluzbyDay; + + const key = `${dayOfWeek}:${time}`; + // Deduplicate within this church + if (!schedules.some(s => `${s.dayOfWeek}:${s.time}` === key)) { + schedules.push({ + dayOfWeek, + time, + language: entry.chsl_name || 'česky', + type: entry.chst_name || 'mše sv.', + note: entry.note || null, + }); + } + } + } + + return schedules; +} + +// ─── Database Operations ───────────────────────────────────────────────────── + +async function loadExistingCzechChurches(): Promise { + console.log('Loading existing Czech churches for deduplication...'); + const churches = await prisma.church.findMany({ + where: { country: 'CZ' }, + select: { + id: true, + name: true, + latitude: true, + longitude: true, + osmId: true, + baiduId: true, + masstimesId: true, + orarimesseId: true, + massSchedulesPhId: true, + philmassId: true, + horariosMisasId: true, + mszeInfoId: true, + weekdayMassesId: true, + messesInfoId: true, + bohosluzbyId: true, + miserendId: true, + kerknetId: true, + gottesdienstzeitenId: true, + discovermassId: true, 
+ source: true, + website: true, + phone: true, + address: true, + }, + }); + console.log(`Loaded ${churches.length} existing Czech churches`); + return churches; +} + +// ─── Import Logic ──────────────────────────────────────────────────────────── + +async function processChurch( + church: BohosluzbyChurch, + existingChurches: ExistingChurch[], + dryRun: boolean, + stats: ImportStats, +): Promise { + if (church.latitude === 0 && church.longitude === 0) { + stats.churchesSkipped++; + return; + } + + // Fetch mass schedules + let schedules: BohosluzbySchedule[] = []; + if (!dryRun) { + schedules = await fetchChurchDetail(church.id); + stats.detailsFetched++; + } + + const candidate = { + name: church.name, + lat: church.latitude, + lng: church.longitude, + bohosluzbyId: church.id, + }; + + const duplicate = findDuplicateChurch(candidate, existingChurches); + + if (dryRun) { + if (duplicate) { + stats.churchesMatched++; + } else { + stats.churchesCreated++; + } + return; + } + + if (duplicate) { + stats.churchesMatched++; + const updateData: Record = { bohosluzbyId: church.id }; + + if (!duplicate.address && church.street) updateData.address = church.street; + + try { + await prisma.church.update({ + where: { id: duplicate.id }, + data: updateData, + }); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + throw error; + } + + if (schedules.length > 0) { + try { + await prisma.$transaction(async (tx) => { + await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } }); + await tx.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: duplicate.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'Czech', + })), + }); + await tx.church.update({ + where: { id: duplicate.id }, + data: { lastScrapedAt: new Date() }, + }); + }); + stats.schedulesCreated += schedules.length; + } catch (error) { + stats.errors++; + console.error(` Error saving schedules 
for ${church.id}: ${error instanceof Error ? error.message : error}`); + } + } + } else { + try { + const newChurch = await prisma.church.create({ + data: { + name: church.name, + latitude: church.latitude, + longitude: church.longitude, + address: church.street, + zip: church.psc, + city: church.city, + country: 'CZ', + bohosluzbyId: church.id, + source: 'bohosluzby', + websiteLanguage: 'cs', + }, + }); + stats.churchesCreated++; + + existingChurches.push({ + id: newChurch.id, + name: church.name, + latitude: church.latitude, + longitude: church.longitude, + osmId: null, + baiduId: null, + masstimesId: null, + orarimesseId: null, + massSchedulesPhId: null, + philmassId: null, + horariosMisasId: null, + mszeInfoId: null, + weekdayMassesId: null, + messesInfoId: null, + bohosluzbyId: church.id, + miserendId: null, + kerknetId: null, + gottesdienstzeitenId: null, + discovermassId: null, + source: 'bohosluzby', + website: null, + phone: null, + address: church.street, + }); + + if (schedules.length > 0) { + await prisma.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: newChurch.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'Czech', + })), + }); + await prisma.church.update({ + where: { id: newChurch.id }, + data: { lastScrapedAt: new Date() }, + }); + stats.schedulesCreated += schedules.length; + } + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + stats.errors++; + console.error(` Error creating ${church.id}: ${error instanceof Error ? 
error.message : error}`); + } + } +} + +// ─── CLI ───────────────────────────────────────────────────────────────────── + +function parseArgs(): CLIArgs { + const args = process.argv.slice(2); + const result: CLIArgs = { all: false, dryRun: false }; + + for (let i = 0; i < args.length; i++) { + switch (args[i]) { + case '--all': + result.all = true; + break; + case '--dry-run': + result.dryRun = true; + break; + case '--resume-from': + result.resumeFrom = parseInt(args[++i]); + break; + case '--id': + result.churchId = args[++i]; + break; + case '--job-id': + result.jobId = args[++i]; + break; + case '--help': + case '-h': + console.log(` +Usage: npx tsx scripts/import-bohosluzby.ts [options] + +Options: + --all Import all churches + --id Import a single church by bohosluzby ID + --dry-run No database writes, just report what would happen + --resume-from Skip first N churches + --job-id Background job tracking ID + --help, -h Show this help message + +Examples: + npx tsx scripts/import-bohosluzby.ts --id 10009 --dry-run + npx tsx scripts/import-bohosluzby.ts --all --dry-run + npx tsx scripts/import-bohosluzby.ts --all +`); + process.exit(0); + } + } + + if (!result.all && !result.churchId) { + console.error('Error: specify --all or --id '); + process.exit(1); + } + + return result; +} + +function formatDuration(ms: number): string { + const seconds = Math.floor(ms / 1000); + const minutes = Math.floor(seconds / 60); + const hours = Math.floor(minutes / 60); + if (hours > 0) return `${hours}h ${minutes % 60}m ${seconds % 60}s`; + if (minutes > 0) return `${minutes}m ${seconds % 60}s`; + return `${seconds}s`; +} + +// ─── Main ──────────────────────────────────────────────────────────────────── + +async function main() { + const args = parseArgs(); + const startTime = Date.now(); + + console.log('\n' + '='.repeat(70)); + console.log('BOHOSLUZBY.CZ (CZECH REPUBLIC) IMPORTER'); + console.log('='.repeat(70)); + console.log(`Mode: ${args.churchId ? 
`Church ID ${args.churchId}` : 'All churches'}`); + console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`); + if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`); + console.log(`Time: ${new Date().toISOString()}`); + console.log('='.repeat(70) + '\n'); + + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { status: 'running', startedAt: new Date() }, + }); + } catch { /* Job might not exist */ } + } + + const stats: ImportStats = { + churchesFetched: 0, + detailsFetched: 0, + churchesMatched: 0, + churchesCreated: 0, + churchesSkipped: 0, + schedulesCreated: 0, + errors: 0, + }; + + const existingChurches = await loadExistingCzechChurches(); + + let churches: BohosluzbyChurch[]; + + if (args.churchId) { + // Single church mode — create a minimal record and fetch detail + churches = [{ + id: args.churchId, + name: `Church ${args.churchId}`, + street: null, + city: null, + psc: null, + latitude: 0, + longitude: 0, + type: 'KOSTEL', + }]; + // Fetch detail to get actual data + const detail = await fetchWithRetry(`${BASE_URL}/index.php/apiWeb/detailById?id=${args.churchId}`); + if (detail?.church?.institution?.[0]) { + const inst = detail.church.institution[0]; + churches[0].name = inst.name || churches[0].name; + churches[0].street = inst.street || null; + churches[0].city = inst.city || null; + churches[0].latitude = parseFloat(inst.latitude) || 0; + churches[0].longitude = parseFloat(inst.longitude) || 0; + } + } else { + churches = await fetchAllChurches(); + } + + stats.churchesFetched = churches.length; + + if (args.resumeFrom) { + churches = churches.slice(args.resumeFrom); + console.log(`Resuming from index ${args.resumeFrom} (${churches.length} remaining)\n`); + } + + console.log(`Processing ${churches.length} churches\n`); + + for (let i = 0; i < churches.length; i++) { + const church = churches[i]; + if (i % 100 === 0) { + const elapsed = formatDuration(Date.now() - 
startTime); + console.log(`[${i + 1}/${churches.length}] Processing ${church.name} (${church.id}) [${elapsed} elapsed]`); + } + + try { + await processChurch(church, existingChurches, args.dryRun, stats); + } catch (error) { + stats.errors++; + console.error(` ERROR processing church ${church.id}: ${error instanceof Error ? error.message : error}`); + } + } + + const totalTime = Date.now() - startTime; + console.log('\n' + '='.repeat(70)); + console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`); + console.log('='.repeat(70)); + console.log(`Churches fetched: ${stats.churchesFetched}`); + console.log(`Details fetched: ${stats.detailsFetched}`); + console.log(` Matched (existing): ${stats.churchesMatched}`); + console.log(` Created (new): ${stats.churchesCreated}`); + console.log(` Skipped: ${stats.churchesSkipped}`); + console.log(`Schedules created: ${stats.schedulesCreated}`); + console.log(`Errors: ${stats.errors}`); + console.log(`Total time: ${formatDuration(totalTime)}`); + console.log(`HTTP requests: ${requestCount}`); + console.log('='.repeat(70) + '\n'); + + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { + status: stats.errors > 0 ? 
'completed_with_errors' : 'completed', + completedAt: new Date(), + processed: stats.churchesFetched, + succeeded: stats.churchesCreated + stats.churchesMatched, + failed: stats.errors, + itemsFound: stats.schedulesCreated, + }, + }); + } catch { /* Ignore */ } + } +} + +main() + .catch((error) => { + console.error('Fatal error:', error); + process.exit(1); + }) + .finally(async () => { + await prisma.$disconnect(); + await pool.end(); + }); diff --git a/scripts/import-gcatholic.ts b/scripts/import-gcatholic.ts new file mode 100644 index 0000000..8151c16 --- /dev/null +++ b/scripts/import-gcatholic.ts @@ -0,0 +1,834 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches from GCatholic.org + * + * GCatholic is a comprehensive Catholic directory organized by diocese. + * Each church page includes a Google Plus Code (→ lat/lng), address, phone, website, etc. + * This script discovers churches via country → diocese → church page navigation. + * + * Usage: + * npx tsx scripts/import-gcatholic.ts --country CN + * npx tsx scripts/import-gcatholic.ts --country CN --dry-run + * npx tsx scripts/import-gcatholic.ts --diocese peki0 + * npx tsx scripts/import-gcatholic.ts --all + * npx tsx scripts/import-gcatholic.ts --all --limit 100 + * npx tsx scripts/import-gcatholic.ts --all --resume-from PL + */ + +// Load .env for database connection (before importing anything that uses process.env) +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +// Create a fresh Prisma client for this script (don't use cached pool from lib/db) +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool 
= new Pool({
+  connectionString: dbUrl,
+  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
+});
+const adapter = new PrismaPg(pool);
+const prisma = new PrismaClient({ adapter });
+
+import { findDuplicateChurch } from '../src/lib/church-matcher';
+import type { ExistingChurch } from '../src/lib/church-matcher';
+
+// Plus Code decoder
+// eslint-disable-next-line @typescript-eslint/no-require-imports
+const { OpenLocationCode } = require('open-location-code');
+const olc = new OpenLocationCode();
+
+// ─── Constants ───────────────────────────────────────────────────────────────
+
+// Site root; the page URLs requested below are built by prefixing this.
+const BASE_URL = 'https://www.gcatholic.org';
+// Sent as the User-Agent header on every request issued by fetchPage.
+const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
+// Default pause between HTTP requests in milliseconds; parseArgs uses it as the
+// initial value of CLIArgs.delay.
+const DEFAULT_DELAY_MS = 1500;
+
+// ─── Types ───────────────────────────────────────────────────────────────────
+
+/** One church record as returned by parseChurchPage. */
+interface GCatholicChurch {
+  gcatholicId: string;
+  name: string;
+  localName?: string;
+  lat: number;
+  lng: number;
+  address?: string;
+  city?: string;
+  state?: string;
+  country?: string;
+  phone?: string;
+  website?: string;
+  diocese?: string;
+  churchType?: string;
+  plusCode: string;
+  sourceUrl: string;
+}
+
+/** Counters accumulated during an import run and printed by printSummary. */
+interface ImportStats {
+  churchesFound: number;
+  newChurchesCreated: number;
+  existingChurchesMerged: number;
+  skipped: number;
+  errors: number;
+  // One human-readable line per failure (URL plus message).
+  errorDetails: string[];
+}
+
+/** Command-line options as filled in by parseArgs. */
+interface CLIArgs {
+  country?: string;
+  all: boolean;
+  diocese?: string;
+  dryRun: boolean;
+  limit?: number;
+  delay: number;
+  resumeFrom?: string;
+}
+
+// ─── HTTP Fetching ───────────────────────────────────────────────────────────
+
+// Number of requests issued so far. fetchPage uses it to skip the delay before
+// the very first request; main reports it in the overall summary.
+let requestCount = 0;
+
+/**
+ * Fetch a page and return its body as text.
+ *
+ * Waits `delayMs` before every request except the first one made by this
+ * process. Returns null when the response status is not OK (a 404 is treated
+ * as expected and not logged; any other status is logged) or when the fetch
+ * itself throws (the error is logged). It never rejects for those cases.
+ *
+ * NOTE(review): the type argument of the declared `Promise` return type appears
+ * to have been lost when this patch text was extracted — confirm against the
+ * repository before applying.
+ */
+async function fetchPage(url: string, delayMs: number): Promise {
+  // Rate limit
+  if (requestCount > 0) {
+    await new Promise((resolve) => setTimeout(resolve, delayMs));
+  }
+  requestCount++;
+
+  try {
+    const response = await fetch(url, {
+      headers: {
+        'User-Agent': USER_AGENT,
+        'Accept': 'text/html,application/xhtml+xml',
+        'Accept-Language': 'en-US,en;q=0.9',
+      },
+    });
+
+    if (!response.ok) {
+      if (response.status === 404) {
+        return null; // Expected for some pages
+      }
+      console.error(` HTTP ${response.status} for ${url}`);
+      return null;
+    }
+
+    return await response.text();
+  } catch (error) {
+    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
+    return null;
+  }
+}
+
+// ─── HTML Parsing ────────────────────────────────────────────────────────────
+
+/**
+ * Extract all country codes from the GCatholic countries page.
+ * Links follow pattern: country/{ISO2}
+ *
+ * Returns the distinct two-letter codes found, sorted with the default
+ * Array.prototype.sort order, or an empty array when the page cannot be fetched.
+ *
+ * NOTE(review): the type arguments of `Promise` and `Set` below appear to have
+ * been lost in extraction — confirm against the repository.
+ */
+async function discoverCountries(delayMs: number): Promise {
+  console.log('Discovering countries from GCatholic...');
+  const html = await fetchPage(`${BASE_URL}/dioceses/`, delayMs);
+  if (!html) {
+    console.error('Failed to fetch countries page');
+    return [];
+  }
+
+  // A Set so that a country linked several times on the page is listed once.
+  const countryCodes = new Set();
+  // Match links like: href="country/CN" or href="/dioceses/country/CN"
+  const regex = /href="(?:\.\.\/|\/dioceses\/)?country\/([A-Z]{2})(?:\.htm)?"/g;
+  let match;
+  while ((match = regex.exec(html)) !== null) {
+    countryCodes.add(match[1]);
+  }
+
+  const codes = Array.from(countryCodes).sort();
+  console.log(`Found ${codes.length} countries`);
+  return codes;
+}
+
+/**
+ * Extract diocese codes from a country page.
+ * Links follow pattern: ../diocese/{code} or diocese/{code} + */ +async function discoverDioceses(countryCode: string, delayMs: number): Promise<{ code: string; name: string }[]> { + const html = await fetchPage(`${BASE_URL}/dioceses/country/${countryCode}.htm`, delayMs); + if (!html) { + return []; + } + + const dioceses: { code: string; name: string }[] = []; + const seen = new Set(); + + // Match links like: href="../diocese/peki0" or href="../../dioceses/diocese/peki0" + // The text after the link is the diocese name + const regex = /href="(?:\.\.\/)?(?:\.\.\/dioceses\/)?diocese\/([a-z0-9]+)(?:\.htm)?"[^>]*>([^<]+) { + const html = await fetchPage(`${BASE_URL}/dioceses/diocese/${dioceseCode}.htm`, delayMs); + if (!html) { + return []; + } + + const churchUrls = new Set(); + + // Match church links like: href="../../churches/china/46492" or href="../../churches/asia/1893" + const regex = /href="(?:\.\.\/)*churches\/([a-z0-9-]+\/\d+)(?:\.htm)?"/g; + let match; + while ((match = regex.exec(html)) !== null) { + const churchPath = match[1]; + churchUrls.add(`${BASE_URL}/churches/${churchPath}.htm`); + } + + return Array.from(churchUrls); +} + +/** + * Parse a single church page and extract structured data. + */ +function parseChurchPage(html: string, url: string, countryCode?: string): GCatholicChurch | null { + // Extract church name from

+ const h1Match = html.match(/

([^<]+)<\/h1>/); + if (!h1Match) return null; + const name = h1Match[1].trim(); + + // Extract local name from

+ const h2Match = html.match(/

([^<]+)<\/h2>/); + const localName = h2Match ? h2Match[1].trim() : undefined; + + // Extract Plus Code - it's in a link with onclick containing google maps + // Pattern: onclick="window.open('https://www.google.com/maps/search/?api=1&query=PLUSCODE','_blank')" + // The Plus Code text is like: >8PFRW9FF+C2< + let plusCode: string | null = null; + + // Try the onclick pattern first + const plusCodeOnclickMatch = html.match(/onclick="window\.open\('https:\/\/www\.google\.com\/maps\/search\/\?api=1&(?:amp;)?query=([^']+)'/); + if (plusCodeOnclickMatch) { + plusCode = decodeURIComponent(plusCodeOnclickMatch[1]); + } + + // Fallback: look for Plus Code pattern in text (format: XXXX+XX or longer) + if (!plusCode) { + const plusCodeTextMatch = html.match(/title="Plus Code">([A-Z0-9+]+)<\/a>/); + if (plusCodeTextMatch) { + plusCode = plusCodeTextMatch[1]; + } + } + + // Another fallback: look for the code near "Location:" label + if (!plusCode) { + const locationMatch = html.match(/Location:.*?>([2-9A-HJ-NP-Z][2-9A-HJ-NP-Z0-9]{3,7}\+[2-9A-HJ-NP-Z0-9]{2,3}) pattern + const getField = (label: string): string | undefined => { + // Pattern: Label: TEXT or TEXT + const escaped = label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + const regex = new RegExp(`${escaped}:?\\s*\\s*(.+?)(?:

|]+>/g, '').trim() || undefined; + }; + + // Extract address + const address = getField('Address'); + + // Extract phone + const phone = getField('Telephone'); + + // Extract website URL (it's in an tag) + let website: string | undefined; + const websiteMatch = html.match(/Website:?\s*<\/span>\s*Type:?\s*<\/span>.*?class="ch[a-z]">([^<]+)/); + if (typeMatch) { + churchType = typeMatch[1].trim(); + } + + // Extract country from page + let country = countryCode; + if (!country) { + const countryMatch = html.match(/href="[^"]*country\/([A-Z]{2})(?:\.htm)?"/); + if (countryMatch) { + country = countryMatch[1]; + } + } + + // Extract city from

tag: "City, Region, Country" + let city: string | undefined; + let state: string | undefined; + const h3Match = html.match(/

([^<]+?)(?:,\s*([^<]+)<\/span>)?(?:,\s*]*class="zcountry"[^>]*>[^<]+<\/a>)?\s*<\/h3>/); + if (h3Match) { + city = h3Match[1].trim(); + state = h3Match[2]?.trim(); + // Clean up: remove country code suffix if present (e.g., "Beijing 北京") + // Keep as-is since it may contain local language characters + } + + return { + gcatholicId, + name, + localName, + lat, + lng, + address, + city, + state, + country, + phone, + website, + diocese, + churchType, + plusCode, + sourceUrl: url, + }; +} + +// ─── CLI Argument Parsing ──────────────────────────────────────────────────── + +function parseArgs(): CLIArgs { + const args = process.argv.slice(2); + const result: CLIArgs = { + all: false, + dryRun: false, + delay: DEFAULT_DELAY_MS, + }; + + for (let i = 0; i < args.length; i++) { + switch (args[i]) { + case '--country': + result.country = args[++i]?.toUpperCase(); + break; + case '--all': + result.all = true; + break; + case '--diocese': + result.diocese = args[++i]; + break; + case '--dry-run': + result.dryRun = true; + break; + case '--limit': + result.limit = parseInt(args[++i], 10); + break; + case '--delay': + result.delay = parseInt(args[++i], 10); + break; + case '--resume-from': + result.resumeFrom = args[++i]?.toUpperCase(); + break; + } + } + + return result; +} + +// ─── Database Operations ───────────────────────────────────────────────────── + +async function loadExistingChurches(): Promise { + console.log('Loading existing churches for deduplication...'); + const churches = await prisma.church.findMany({ + select: { + id: true, + name: true, + latitude: true, + longitude: true, + osmId: true, + baiduId: true, + masstimesId: true, + orarimesseId: true, + massSchedulesPhId: true, + philmassId: true, + horariosMisasId: true, + mszeInfoId: true, + weekdayMassesId: true, + messesInfoId: true, + bohosluzbyId: true, + miserendId: true, + kerknetId: true, + gottesdienstzeitenId: true, + discovermassId: true, + source: true, + website: true, + phone: true, + address: 
true,
+    },
+  });
+  console.log(`Loaded ${churches.length} existing churches`);
+  return churches;
+}
+
+/**
+ * Import a single parsed GCatholic church.
+ *
+ * The church is first checked against `existingChurches` with
+ * findDuplicateChurch. In dry-run mode the outcome is only logged and counted.
+ * Otherwise:
+ *  - on a match, phone, website and address are written only where the matched
+ *    record has none, and diocese only where the stored row has none; if
+ *    nothing needs writing the church is counted as skipped;
+ *  - with no match, a new row is created with source 'gcatholic' and an entry
+ *    for it is appended to `existingChurches` (the array is mutated).
+ *
+ * Database errors are not caught here; they propagate to the caller.
+ *
+ * NOTE(review): the type arguments of `Promise` and `Record` in this function
+ * appear to have been lost when this patch text was extracted — confirm
+ * against the repository before applying.
+ */
+async function importChurch(
+  church: GCatholicChurch,
+  existingChurches: ExistingChurch[],
+  dryRun: boolean,
+  stats: ImportStats,
+): Promise {
+  // Build a candidate compatible with findDuplicateChurch (expects OSMChurch shape)
+  const candidate = {
+    osmId: `gcatholic-${church.gcatholicId}`,
+    name: church.name,
+    lat: church.lat,
+    lng: church.lng,
+    address: church.address,
+    city: church.city,
+    state: church.state,
+    country: church.country,
+    phone: church.phone,
+    website: church.website,
+    diocese: church.diocese,
+  };
+
+  const duplicate = findDuplicateChurch(candidate, existingChurches);
+
+  // Dry run: report what would happen and return before any database call.
+  // Nothing is appended to existingChurches on this path.
+  if (dryRun) {
+    if (duplicate) {
+      console.log(` [MERGE] ${church.name} → existing: ${duplicate.name} (${duplicate.id})`);
+      stats.existingChurchesMerged++;
+    } else {
+      console.log(` [NEW] ${church.name} (${church.lat.toFixed(4)}, ${church.lng.toFixed(4)})`);
+      stats.newChurchesCreated++;
+    }
+    return;
+  }
+
+  if (duplicate) {
+    // Merge: fill in missing fields only
+    const updateData: Record = {};
+
+    if (!duplicate.phone && church.phone) updateData.phone = church.phone;
+    if (!duplicate.website && church.website) {
+      updateData.website = church.website;
+      updateData.hasWebsite = true;
+    }
+    if (!duplicate.address && church.address) updateData.address = church.address;
+
+    // Always set diocese if missing (GCatholic is great for this)
+    // We need to check diocese on the actual DB record
+    // (the object pushed onto existingChurches below carries no diocese field,
+    // hence the extra lookup by id)
+    const dbRecord = await prisma.church.findUnique({
+      where: { id: duplicate.id },
+      select: { diocese: true },
+    });
+    if (dbRecord && !dbRecord.diocese && church.diocese) {
+      updateData.diocese = church.diocese;
+    }
+
+    // Only a match that actually changes something counts as merged.
+    if (Object.keys(updateData).length > 0) {
+      await prisma.church.update({
+        where: { id: duplicate.id },
+        data: updateData,
+      });
+      stats.existingChurchesMerged++;
+    } else {
+      stats.skipped++;
+    }
+  } else {
+    // Create new church
+    const newChurch = await prisma.church.create({
+      data: {
+        name: church.name,
+        latitude: church.lat,
+        longitude: church.lng,
+        address: church.address,
+        city: church.city,
+        state: church.state,
+        country: church.country,
+        phone: church.phone,
+        website: church.website,
+        hasWebsite: !!church.website,
+        source: 'gcatholic',
+        diocese: church.diocese,
+      },
+    });
+    stats.newChurchesCreated++;
+
+    // Add to existing list for future dedup within this run
+    // (every external-source id is null: the row was created from GCatholic data only)
+    existingChurches.push({
+      id: newChurch.id,
+      name: church.name,
+      latitude: church.lat,
+      longitude: church.lng,
+      osmId: null,
+      baiduId: null,
+      masstimesId: null,
+      orarimesseId: null,
+      massSchedulesPhId: null,
+      philmassId: null,
+      horariosMisasId: null,
+      mszeInfoId: null,
+      weekdayMassesId: null,
+      messesInfoId: null,
+      bohosluzbyId: null,
+      miserendId: null,
+      kerknetId: null,
+      gottesdienstzeitenId: null,
+      discovermassId: null,
+      source: 'gcatholic',
+      website: church.website || null,
+      phone: church.phone || null,
+      address: church.address || null,
+    });
+  }
+}
+
+// ─── Import Logic ────────────────────────────────────────────────────────────
+
+/**
+ * Import every church page linked from one diocese page.
+ *
+ * Each church URL is fetched, parsed and handed to importChurch. Failures for
+ * one URL (fetch returned nothing, or an exception was thrown) are recorded in
+ * `stats` and do not stop the loop. When `globalLimit` is supplied its
+ * `remaining` counter is decremented after each church passed to importChurch,
+ * and the function returns as soon as it reaches zero — in that case the
+ * per-diocese summary line at the end is not printed.
+ *
+ * NOTE(review): the type argument of the declared `Promise` return type appears
+ * to have been lost in extraction — confirm against the repository.
+ */
+async function importDiocese(
+  dioceseCode: string,
+  dioceseName: string,
+  countryCode: string | undefined,
+  existingChurches: ExistingChurch[],
+  args: CLIArgs,
+  stats: ImportStats,
+  globalLimit?: { remaining: number },
+): Promise {
+  const churchUrls = await discoverChurchLinks(dioceseCode, args.delay);
+
+  if (churchUrls.length === 0) {
+    return;
+  }
+
+  console.log(` Diocese ${dioceseName} (${dioceseCode}): ${churchUrls.length} church pages found`);
+
+  // Tallies for this diocese only; `stats` keeps the totals shared with the caller.
+  let dioceseNew = 0;
+  let dioceseMerged = 0;
+  let dioceseSkipped = 0;
+  let dioceseErrors = 0;
+
+  for (const url of churchUrls) {
+    // Check global limit
+    if (globalLimit && globalLimit.remaining <= 0) {
+      console.log(` Limit reached, stopping`);
+      return;
+    }
+
+    try {
+      const html = await fetchPage(url, args.delay);
+      if (!html) {
+        stats.errors++;
+        dioceseErrors++;
+        stats.errorDetails.push(`Failed to fetch: ${url}`);
+        continue;
+      }
+
+      const church = parseChurchPage(html, url, countryCode);
+      if (!church) {
+        stats.skipped++;
+        dioceseSkipped++;
+        continue;
+      }
+
+      stats.churchesFound++;
+
+      // importChurch only updates the shared counters, so the per-diocese
+      // tallies are derived by comparing them before and after the call.
+      const prevNew = stats.newChurchesCreated;
+      const prevMerged = stats.existingChurchesMerged;
+
+      await importChurch(church, existingChurches, args.dryRun, stats);
+
+      if (stats.newChurchesCreated > prevNew) dioceseNew++;
+      if (stats.existingChurchesMerged > prevMerged) dioceseMerged++;
+
+      if (globalLimit) globalLimit.remaining--;
+
+    } catch (error) {
+      stats.errors++;
+      dioceseErrors++;
+      const msg = error instanceof Error ? error.message : String(error);
+      stats.errorDetails.push(`${url}: ${msg}`);
+      console.error(` Error processing ${url}: ${msg}`);
+    }
+  }
+
+  if (churchUrls.length > 0) {
+    const parts = [`${dioceseNew} new`, `${dioceseMerged} merged`];
+    if (dioceseSkipped > 0) parts.push(`${dioceseSkipped} skipped`);
+    if (dioceseErrors > 0) parts.push(`${dioceseErrors} errors`);
+    console.log(` → ${parts.join(', ')}`);
+  }
+}
+
+/**
+ * Import all dioceses of one country and return the counters for that country.
+ *
+ * A fresh ImportStats object is created per call. Dioceses are processed in the
+ * order discoverDioceses returns them, stopping early once `globalLimit` is
+ * exhausted. When no dioceses are found the zeroed stats are returned.
+ *
+ * NOTE(review): the type argument of the declared `Promise` return type appears
+ * to have been lost in extraction — confirm against the repository.
+ */
+async function importCountry(
+  countryCode: string,
+  existingChurches: ExistingChurch[],
+  args: CLIArgs,
+  globalLimit?: { remaining: number },
+): Promise {
+  const stats: ImportStats = {
+    churchesFound: 0,
+    newChurchesCreated: 0,
+    existingChurchesMerged: 0,
+    skipped: 0,
+    errors: 0,
+    errorDetails: [],
+  };
+
+  console.log(`\n${'='.repeat(60)}`);
+  console.log(`Importing from GCatholic: ${countryCode}`);
+  console.log(`${'='.repeat(60)}`);
+
+  // Discover dioceses
+  const dioceses = await discoverDioceses(countryCode, args.delay);
+  if (dioceses.length === 0) {
+    console.log(`No dioceses found for ${countryCode}`);
+    return stats;
+  }
+  console.log(`Found ${dioceses.length} dioceses in ${countryCode}`);
+
+  // Process each diocese
+  for (const diocese of dioceses) {
+    if (globalLimit && globalLimit.remaining <= 0) break;
+
+    await importDiocese(
+      diocese.code,
+      diocese.name,
+      countryCode,
+      existingChurches,
+      args,
+      stats,
+
globalLimit,
+    );
+  }
+
+  return stats;
+}
+
+// ─── Summary Printing ────────────────────────────────────────────────────────
+
+/**
+ * Print the counters of one import scope (a diocese or a country).
+ *
+ * @param label  Heading shown after "Summary:".
+ * @param stats  Counters to print; the error line is omitted when there are none.
+ * @param dryRun Appends "(DRY RUN)" to the heading when true.
+ */
+function printSummary(label: string, stats: ImportStats, dryRun: boolean): void {
+  console.log(`\n${'─'.repeat(60)}`);
+  console.log(`Summary: ${label} ${dryRun ? '(DRY RUN)' : ''}`);
+  console.log(`${'─'.repeat(60)}`);
+  console.log(`Churches found on GCatholic: ${stats.churchesFound}`);
+  console.log(`New churches created: ${stats.newChurchesCreated}`);
+  console.log(`Merged with existing: ${stats.existingChurchesMerged}`);
+  console.log(`Skipped (no data/dup): ${stats.skipped}`);
+  if (stats.errors > 0) {
+    console.log(`Errors: ${stats.errors}`);
+  }
+  console.log(`${'─'.repeat(60)}`);
+}
+
+/** Build a zeroed ImportStats object (fresh `errorDetails` array on every call). */
+function createEmptyStats(): ImportStats {
+  return {
+    churchesFound: 0,
+    newChurchesCreated: 0,
+    existingChurchesMerged: 0,
+    skipped: 0,
+    errors: 0,
+    errorDetails: [],
+  };
+}
+
+// ─── Job Tracking ────────────────────────────────────────────────────────────
+
+/**
+ * If `--job-id <id>` is present in `args`, mark that background job as running
+ * and return its id; otherwise return null (the run is not tracked).
+ *
+ * @throws Error when `--job-id` is given without a value, and whatever the
+ *         database update throws (for example when the job does not exist).
+ */
+async function createOrResumeJob(args: string[]): Promise<string | null> {
+  const jobIdIndex = args.indexOf('--job-id');
+  if (jobIdIndex === -1) {
+    return null;
+  }
+
+  const jobId = args[jobIdIndex + 1];
+  // Without this guard an undefined id (or the next flag) would be sent to the
+  // database as the lookup key.
+  if (!jobId || jobId.startsWith('--')) {
+    throw new Error('--job-id requires a value');
+  }
+
+  await prisma.backgroundJob.update({
+    where: { id: jobId },
+    data: { status: 'running', startedAt: new Date() },
+  });
+  return jobId;
+}
+
+/**
+ * Mark a tracked job as finished: 'failed' when `error` is a non-empty string,
+ * 'completed' otherwise. Does nothing for an untracked run (`jobId` null).
+ * A failure to write the status is logged and swallowed so that it cannot mask
+ * the outcome of the import itself.
+ */
+async function completeJob(jobId: string | null, error?: string): Promise<void> {
+  if (!jobId) return;
+  try {
+    await prisma.backgroundJob.update({
+      where: { id: jobId },
+      data: {
+        status: error ? 'failed' : 'completed',
+        error: error || null,
+        completedAt: new Date(),
+      },
+    });
+  } catch (err) {
+    console.error(`Failed to update job ${jobId}:`, err);
+  }
+}
+
+// ─── Main ────────────────────────────────────────────────────────────────────
+
+/**
+ * Script entry point: parse the CLI flags, run the requested import mode
+ * (single diocese, single country, or all countries) and record the outcome on
+ * the tracked background job, if any.
+ *
+ * Every failure path sets `process.exitCode` instead of calling
+ * `process.exit()`: `process.exit()` terminates immediately, which would skip
+ * the `finally` block and leave the job in the 'running' state with the
+ * database connections still open.
+ */
+async function main(): Promise<void> {
+  const args = parseArgs();
+  let jobId: string | null = null;
+
+  // Record a failure on the tracked job and request a non-zero exit status
+  // while still letting the finally block below run.
+  const fail = async (reason: string): Promise<void> => {
+    await completeJob(jobId, reason);
+    process.exitCode = 1;
+  };
+
+  try {
+    jobId = await createOrResumeJob(process.argv.slice(2));
+
+    if (!args.country && !args.all && !args.diocese) {
+      console.error('Error: Must specify --country <CODE>, --diocese <code>, or --all');
+      console.error('Usage:');
+      console.error(' npx tsx scripts/import-gcatholic.ts --country CN');
+      console.error(' npx tsx scripts/import-gcatholic.ts --country CN --dry-run');
+      console.error(' npx tsx scripts/import-gcatholic.ts --diocese peki0');
+      console.error(' npx tsx scripts/import-gcatholic.ts --all');
+      console.error(' npx tsx scripts/import-gcatholic.ts --all --limit 500');
+      console.error(' npx tsx scripts/import-gcatholic.ts --all --resume-from PL');
+      await fail('Must specify --country, --diocese, or --all');
+      return;
+    }
+
+    if (args.dryRun) {
+      console.log('\n*** DRY RUN MODE — no changes will be written to database ***\n');
+    }
+
+    console.log(`Delay between requests: ${args.delay}ms`);
+    if (args.limit) console.log(`Limit: ${args.limit} churches`);
+
+    const existingChurches = await loadExistingChurches();
+    // Shared mutable budget: importDiocese decrements `remaining` per church.
+    const globalLimit = args.limit ? { remaining: args.limit } : undefined;
+
+    if (args.diocese) {
+      // Single diocese mode
+      const stats = createEmptyStats();
+
+      await importDiocese(args.diocese, args.diocese, args.country, existingChurches, args, stats, globalLimit);
+      printSummary(`Diocese ${args.diocese}`, stats, args.dryRun);
+
+    } else if (args.country) {
+      // Single country mode
+      const stats = await importCountry(args.country, existingChurches, args, globalLimit);
+      printSummary(args.country, stats, args.dryRun);
+
+    } else if (args.all) {
+      // All countries mode — discover from GCatholic
+      let countries = await discoverCountries(args.delay);
+
+      if (countries.length === 0) {
+        console.error('Failed to discover countries');
+        await fail('Failed to discover countries');
+        return;
+      }
+
+      // Handle --resume-from
+      if (args.resumeFrom) {
+        const idx = countries.indexOf(args.resumeFrom);
+        if (idx === -1) {
+          const reason = `Country ${args.resumeFrom} not found in GCatholic listing`;
+          console.error(reason);
+          await fail(reason);
+          return;
+        }
+        console.log(`Resuming from ${args.resumeFrom} (skipping ${idx} countries)\n`);
+        countries = countries.slice(idx);
+      }
+
+      console.log(`Will process ${countries.length} countries\n`);
+
+      const totalStats = createEmptyStats();
+      let countriesProcessed = 0;
+
+      for (const countryCode of countries) {
+        if (globalLimit && globalLimit.remaining <= 0) {
+          console.log(`\nGlobal limit reached, stopping.`);
+          break;
+        }
+
+        const stats = await importCountry(countryCode, existingChurches, args, globalLimit);
+        printSummary(countryCode, stats, args.dryRun);
+
+        // Aggregate
+        totalStats.churchesFound += stats.churchesFound;
+        totalStats.newChurchesCreated += stats.newChurchesCreated;
+        totalStats.existingChurchesMerged += stats.existingChurchesMerged;
+        totalStats.skipped += stats.skipped;
+        totalStats.errors += stats.errors;
+        totalStats.errorDetails.push(...stats.errorDetails);
+        countriesProcessed++;
+
+        // Small extra delay between countries
+        await new Promise((resolve) => setTimeout(resolve, 2000));
+      }
+
+      // Overall summary
+      console.log(`\n${'='.repeat(60)}`);
+      console.log(`OVERALL SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
+      console.log(`${'='.repeat(60)}`);
+      console.log(`Countries processed: ${countriesProcessed}`);
+      console.log(`Total churches found: ${totalStats.churchesFound}`);
+      console.log(`Total new churches created: ${totalStats.newChurchesCreated}`);
+      console.log(`Total merged with existing: ${totalStats.existingChurchesMerged}`);
+      console.log(`Total skipped: ${totalStats.skipped}`);
+      if (totalStats.errors > 0) {
+        console.log(`Total errors: ${totalStats.errors}`);
+      }
+      console.log(`Total HTTP requests made: ${requestCount}`);
+      console.log(`${'='.repeat(60)}\n`);
+
+      // At most 50 error lines are printed; the heading says so when truncating.
+      const details = totalStats.errorDetails;
+      if (details.length > 0) {
+        console.log(
+          details.length <= 50
+            ? '\nError details:'
+            : `\nFirst 50 errors (of ${details.length}):`,
+        );
+        details.slice(0, 50).forEach((e) => console.log(` - ${e}`));
+      }
+    }
+
+    await completeJob(jobId);
+  } catch (error) {
+    console.error('Fatal error:', error);
+    await fail(String(error));
+  } finally {
+    await prisma.$disconnect();
+    await pool.end();
+  }
+}
+
+// Only a failure while closing the connections can still reject here.
+main().catch((error) => {
+  console.error('Fatal error:', error);
+  process.exitCode = 1;
+});
diff --git a/scripts/import-gottesdienstzeiten.ts b/scripts/import-gottesdienstzeiten.ts
new file mode 100644
index 0000000..d9ab654
--- /dev/null
+++ b/scripts/import-gottesdienstzeiten.ts
@@ -0,0 +1,686 @@
+#!/usr/bin/env tsx
+/**
+ * Import Catholic churches and mass schedules from gottesdienstzeiten.de (Germany)
+ *
+ * gottesdienstzeiten.de is a German worship service directory with ~6,878 Catholic
+ * churches.
It runs on WordPress with a fully open REST API at /wp-json/wp/v2/posts. + * + * Data includes: church name, address, coordinates (Google Maps embed), diocese, + * mass schedules (day/type/time table), website, email, phone. + * + * Import strategy: + * 1. Fetch all Catholic diocese category IDs from WP API + * 2. Paginate through posts per category (100 per page) + * 3. Parse HTML content for coordinates, address, schedule table, info table + * 4. Match against existing German churches via church-matcher + * 5. Upsert churches and mass schedules + * + * Usage: + * npx tsx scripts/import-gottesdienstzeiten.ts --all --dry-run + * npx tsx scripts/import-gottesdienstzeiten.ts --all + * npx tsx scripts/import-gottesdienstzeiten.ts --diocese 129 --dry-run # Köln only + * npx tsx scripts/import-gottesdienstzeiten.ts --all --resume-from 5 + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const API_BASE = 'https://gottesdienstzeiten.de/wp-json/wp/v2'; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const REQUEST_DELAY_MS = 1000; +const RETRY_DELAY_MS = 5000; +const MAX_RETRIES = 3; +const POSTS_PER_PAGE = 100; +const CATHOLIC_PARENT_CATEGORY = 4; + +// German day names → dayOfWeek (0=Sun, 1=Mon, ..., 6=Sat) +const GERMAN_DAYS: Record = { + 'sonntags': 0, 'montags': 1, 'dienstags': 2, 'mittwochs': 3, + 'donnerstags': 4, 'freitags': 5, 'samstags': 6, + // Without -s suffix (some entries use these) + 'sonntag': 0, 'montag': 1, 'dienstag': 2, 'mittwoch': 3, + 'donnerstag': 4, 'freitag': 5, 'samstag': 6, +}; + +// Mass-related types (filter out non-mass services) +const MASS_TYPES = new Set([ + 'messfeier', 'vorabendmesse', 'heilige messe', 'hl. 
messe', + 'hochamt', 'festmesse', 'familienmesse', 'kindergottesdienst', + 'jugendmesse', 'abendmesse', 'frühmesse', 'werktagsmesse', + 'sonntagsmesse', 'messe', 'eucharistiefeier', +]); + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface DioceseCat { + id: number; + name: string; + count: number; +} + +interface ParsedChurch { + wpId: number; + slug: string; + name: string; + latitude: number; + longitude: number; + address: string | null; + zip: string | null; + city: string | null; + diocese: string | null; + website: string | null; + email: string | null; + phone: string | null; + schedules: ParsedSchedule[]; +} + +interface ParsedSchedule { + dayOfWeek: number; + time: string; +} + +interface ImportStats { + diocesesProcessed: number; + postsFound: number; + churchesParsed: number; + churchesMatched: number; + churchesCreated: number; + churchesSkipped: number; + schedulesCreated: number; + errors: number; +} + +interface CLIArgs { + all: boolean; + dryRun: boolean; + resumeFrom?: number; + diocese?: number; + jobId?: string; +} + +// ─── HTTP Helpers ──────────────────────────────────────────────────────────── + +let requestCount = 0; + +function delay(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchJson(url: string): Promise { + if (requestCount > 0) { + await delay(REQUEST_DELAY_MS); + } + requestCount++; + + for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { + try { + const response = await fetch(url, { + headers: { 'User-Agent': USER_AGENT }, + }); + + if (response.status === 429 || response.status === 503) { + if (attempt < MAX_RETRIES) { + console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s`); + await delay(RETRY_DELAY_MS); + continue; + } + return null; + } + + if (!response.ok) return null; + return await response.json(); + } catch (error) { + if (attempt < MAX_RETRIES) { + await delay(RETRY_DELAY_MS); + continue; 
+ } + console.error(` Fetch error: ${error instanceof Error ? error.message : error}`); + return null; + } + } + return null; +} + +// ─── Parsing ───────────────────────────────────────────────────────────────── + +function stripHtml(html: string): string { + return html.replace(/<[^>]+>/g, '').trim(); +} + +function parsePost(post: any, dioceseName: string | null): ParsedChurch | null { + const content: string = post.content?.rendered || ''; + const wpId: number = post.id; + const slug: string = post.slug; + + // Extract name from title — format: "(City) Church Name" + let name = stripHtml(post.title?.rendered || ''); + // Remove leading "(City)" prefix for cleaner name + const nameMatch = name.match(/^\([^)]+\)\s*(.+)$/); + if (nameMatch) name = nameMatch[1]; + + // Extract coordinates from Google Maps embed + const coordMatch = content.match(/maps\?q=([-\d.]+),([-\d.]+)/); + if (!coordMatch) return null; + + const latitude = parseFloat(coordMatch[1]); + const longitude = parseFloat(coordMatch[2]); + if (isNaN(latitude) || isNaN(longitude) || (latitude === 0 && longitude === 0)) return null; + + // Extract address from first tag (format: "Street, ZIP City") + const addrMatch = content.match(/([^<]+)<\/strong>/); + let address: string | null = null; + let zip: string | null = null; + let city: string | null = null; + + if (addrMatch) { + const fullAddr = addrMatch[1].trim(); + address = fullAddr; + + // Parse "Street, ZIP City" format + const zipCityMatch = fullAddr.match(/,\s*(\d{5})\s+(.+)$/); + if (zipCityMatch) { + zip = zipCityMatch[1]; + city = zipCityMatch[2]; + address = fullAddr.replace(/,\s*\d{5}\s+.+$/, '').trim(); + } + } + + // Parse info table (second table) for website, email, phone + let website: string | null = null; + let email: string | null = null; + let phone: string | null = null; + + const tables = content.match(/]*>([\s\S]*?)<\/table>/g) || []; + if (tables.length >= 2) { + const infoTable = tables[1]; + // Website + const websiteMatch = 
infoTable.match(/Website[\s\S]*?]*href="([^"]+)"/); + if (websiteMatch) website = websiteMatch[1]; + // Email + const emailMatch = infoTable.match(/E-Mail[\s\S]*?]*>([\s\S]*?)<\/td>/); + if (emailMatch) { + const emailText = stripHtml(emailMatch[1]); + if (emailText.includes('@')) email = emailText; + } + // Phone + const phoneMatch = infoTable.match(/Telefon[\s\S]*?]*>([\s\S]*?)<\/td>/); + if (phoneMatch) { + const phoneText = stripHtml(phoneMatch[1]); + if (phoneText.length > 3) phone = phoneText; + } + } + + // Parse schedule table (first table) + const schedules: ParsedSchedule[] = []; + if (tables.length >= 1) { + const schedTable = tables[0]; + const rows = schedTable.match(/]*>([\s\S]*?)<\/tr>/g) || []; + + let currentDay = -1; + const seen = new Set(); + + for (const row of rows) { + // Check for day header (in with ) + const dayMatch = row.match(/]*>[\s\S]*?([^<]*)<\/em>/); + if (dayMatch && dayMatch[1].trim()) { + const dayName = dayMatch[1].trim().toLowerCase(); + if (GERMAN_DAYS[dayName] !== undefined) { + currentDay = GERMAN_DAYS[dayName]; + } + } + + // Get type and time from ... 
+ const cells = row.match(/]*>[\s\S]*?([^<]*)<\/em>[\s\S]*?<\/td>/g); + if (!cells || cells.length < 2 || currentDay < 0) continue; + + const typeMatch = cells[0].match(/([^<]*)<\/em>/); + const timeMatch = cells[1].match(/([^<]*)<\/em>/); + if (!typeMatch || !timeMatch) continue; + + const massType = typeMatch[1].trim().toLowerCase(); + const timeStr = timeMatch[1].trim(); + + // Only include mass-related types + const isMass = MASS_TYPES.has(massType) || + massType.includes('messe') || massType.includes('messfeier') || + massType.includes('eucharistie'); + if (!isMass) continue; + + // Parse time: "09.00 Uhr" or "18:30 Uhr" → "09:00" or "18:30" + const parsedTime = timeStr + .replace(/\s*Uhr\s*/i, '') + .replace('.', ':') + .trim(); + const timeValidation = parsedTime.match(/^(\d{1,2}):(\d{2})$/); + if (!timeValidation) continue; + const normalizedTime = `${timeValidation[1].padStart(2, '0')}:${timeValidation[2]}`; + + const key = `${currentDay}:${normalizedTime}`; + if (!seen.has(key)) { + seen.add(key); + schedules.push({ dayOfWeek: currentDay, time: normalizedTime }); + } + } + } + + return { + wpId, slug, name, latitude, longitude, + address, zip, city, diocese: dioceseName, + website, email, phone, schedules, + }; +} + +// ─── Database Operations ───────────────────────────────────────────────────── + +async function loadExistingGermanChurches(): Promise { + console.log('Loading existing German churches for deduplication...'); + const churches = await prisma.church.findMany({ + where: { country: 'DE' }, + select: { + id: true, + name: true, + latitude: true, + longitude: true, + osmId: true, + baiduId: true, + masstimesId: true, + orarimesseId: true, + massSchedulesPhId: true, + philmassId: true, + horariosMisasId: true, + mszeInfoId: true, + weekdayMassesId: true, + messesInfoId: true, + bohosluzbyId: true, + miserendId: true, + kerknetId: true, + gottesdienstzeitenId: true, + discovermassId: true, + source: true, + website: true, + phone: true, + address: 
true, + }, + }); + console.log(`Loaded ${churches.length} existing German churches`); + return churches; +} + +// ─── Import Logic ──────────────────────────────────────────────────────────── + +async function fetchDioceseCategories(): Promise { + console.log('Fetching Catholic diocese categories...'); + const data = await fetchJson( + `${API_BASE}/categories?per_page=100&parent=${CATHOLIC_PARENT_CATEGORY}` + ); + if (!data) { + console.error('Failed to fetch categories'); + return []; + } + const cats: DioceseCat[] = data.map((c: any) => ({ + id: c.id, name: c.name, count: c.count, + })); + const total = cats.reduce((s, c) => s + c.count, 0); + console.log(`Found ${cats.length} diocese categories with ${total} total posts\n`); + return cats.sort((a, b) => b.count - a.count); +} + +async function processDiocese( + cat: DioceseCat, + existingChurches: ExistingChurch[], + dryRun: boolean, + stats: ImportStats, +): Promise { + const totalPages = Math.ceil(cat.count / POSTS_PER_PAGE); + + for (let page = 1; page <= totalPages; page++) { + const url = `${API_BASE}/posts?categories=${cat.id}&per_page=${POSTS_PER_PAGE}&page=${page}`; + const posts = await fetchJson(url); + if (!posts || !Array.isArray(posts) || posts.length === 0) break; + + stats.postsFound += posts.length; + + for (const post of posts) { + const church = parsePost(post, cat.name); + if (!church) { + stats.churchesSkipped++; + continue; + } + + stats.churchesParsed++; + const gdzId = String(church.wpId); + + const candidate = { + name: church.name, + lat: church.latitude, + lng: church.longitude, + gottesdienstzeitenId: gdzId, + }; + + const duplicate = findDuplicateChurch(candidate, existingChurches); + + if (dryRun) { + if (duplicate) { + stats.churchesMatched++; + } else { + stats.churchesCreated++; + } + stats.schedulesCreated += church.schedules.length; + continue; + } + + if (duplicate) { + stats.churchesMatched++; + const updateData: Record = { gottesdienstzeitenId: gdzId }; + + if 
(!duplicate.address && church.address) updateData.address = church.address; + if (!duplicate.website && church.website) { + updateData.website = church.website; + updateData.hasWebsite = true; + } + if (!duplicate.phone && church.phone) updateData.phone = church.phone; + + try { + await prisma.church.update({ + where: { id: duplicate.id }, + data: updateData, + }); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + continue; + } + throw error; + } + + if (church.schedules.length > 0) { + try { + await prisma.$transaction(async (tx) => { + await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } }); + await tx.massSchedule.createMany({ + data: church.schedules.map((s) => ({ + churchId: duplicate.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'German', + })), + }); + await tx.church.update({ + where: { id: duplicate.id }, + data: { lastScrapedAt: new Date() }, + }); + }); + stats.schedulesCreated += church.schedules.length; + } catch (error) { + stats.errors++; + console.error(` Error saving schedules for ${church.slug}: ${error instanceof Error ? 
error.message : error}`); + } + } + } else { + try { + const newChurch = await prisma.church.create({ + data: { + name: church.name, + latitude: church.latitude, + longitude: church.longitude, + address: church.address, + zip: church.zip, + city: church.city, + country: 'DE', + diocese: church.diocese || undefined, + website: church.website, + hasWebsite: !!church.website, + email: church.email, + phone: church.phone, + gottesdienstzeitenId: gdzId, + source: 'gottesdienstzeiten', + websiteLanguage: 'de', + }, + }); + stats.churchesCreated++; + + existingChurches.push({ + id: newChurch.id, + name: church.name, + latitude: church.latitude, + longitude: church.longitude, + osmId: null, + baiduId: null, + masstimesId: null, + orarimesseId: null, + massSchedulesPhId: null, + philmassId: null, + horariosMisasId: null, + mszeInfoId: null, + weekdayMassesId: null, + messesInfoId: null, + bohosluzbyId: null, + miserendId: null, + kerknetId: null, + gottesdienstzeitenId: gdzId, + discovermassId: null, + source: 'gottesdienstzeiten', + website: church.website, + phone: church.phone, + address: church.address, + }); + + if (church.schedules.length > 0) { + await prisma.massSchedule.createMany({ + data: church.schedules.map((s) => ({ + churchId: newChurch.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'German', + })), + }); + await prisma.church.update({ + where: { id: newChurch.id }, + data: { lastScrapedAt: new Date() }, + }); + stats.schedulesCreated += church.schedules.length; + } + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + continue; + } + stats.errors++; + console.error(` Error creating ${church.slug}: ${error instanceof Error ? 
/**
 * Renders an elapsed time in milliseconds as a compact string:
 * `"1h 2m 3s"`, `"2m 3s"` or `"3s"`. Sub-second remainders are truncated.
 *
 * Negative or non-finite inputs are clamped to zero so a progress line never
 * shows `"-1s"` or `"NaNs"`.
 *
 * @param ms Elapsed time in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Number.isFinite(ms) && ms > 0 ? Math.floor(ms / 1000) : 0;

  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds / 60) % 60;
  const seconds = totalSeconds % 60;

  if (hours > 0) return `${hours}h ${minutes}m ${seconds}s`;
  if (minutes > 0) return `${minutes}m ${seconds}s`;
  return `${seconds}s`;
}
console.log('GOTTESDIENSTZEITEN.DE (GERMANY) IMPORTER'); + console.log('='.repeat(70)); + console.log(`Mode: ${args.diocese ? `Diocese category ${args.diocese}` : 'All dioceses'}`); + console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`); + if (args.resumeFrom) console.log(`Resume from: diocese index ${args.resumeFrom}`); + console.log(`Time: ${new Date().toISOString()}`); + console.log('='.repeat(70) + '\n'); + + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { status: 'running', startedAt: new Date() }, + }); + } catch { /* Job might not exist */ } + } + + const stats: ImportStats = { + diocesesProcessed: 0, + postsFound: 0, + churchesParsed: 0, + churchesMatched: 0, + churchesCreated: 0, + churchesSkipped: 0, + schedulesCreated: 0, + errors: 0, + }; + + const existingChurches = await loadExistingGermanChurches(); + + let categories: DioceseCat[]; + if (args.diocese) { + categories = [{ id: args.diocese, name: `Category ${args.diocese}`, count: 1000 }]; + } else { + categories = await fetchDioceseCategories(); + } + + if (args.resumeFrom && !args.diocese) { + categories = categories.slice(args.resumeFrom); + console.log(`Resuming from diocese index ${args.resumeFrom} (${categories[0]?.name})\n`); + } + + console.log(`Processing ${categories.length} diocese categories\n`); + + for (let i = 0; i < categories.length; i++) { + const cat = categories[i]; + const elapsed = formatDuration(Date.now() - startTime); + console.log(`[${i + 1}/${categories.length}] ${cat.name} (${cat.count} posts) [${elapsed} elapsed]`); + + try { + await processDiocese(cat, existingChurches, args.dryRun, stats); + } catch (error) { + stats.errors++; + console.error(` ERROR processing ${cat.name}: ${error instanceof Error ? error.message : error}`); + } + } + + const totalTime = Date.now() - startTime; + console.log('\n' + '='.repeat(70)); + console.log(`IMPORT SUMMARY ${args.dryRun ? 
'(DRY RUN)' : ''}`); + console.log('='.repeat(70)); + console.log(`Dioceses processed: ${stats.diocesesProcessed}`); + console.log(`WP posts found: ${stats.postsFound}`); + console.log(`Churches parsed: ${stats.churchesParsed}`); + console.log(` Matched (existing): ${stats.churchesMatched}`); + console.log(` Created (new): ${stats.churchesCreated}`); + console.log(` Skipped (no coords): ${stats.churchesSkipped}`); + console.log(`Schedules created: ${stats.schedulesCreated}`); + console.log(`Errors: ${stats.errors}`); + console.log(`Total time: ${formatDuration(totalTime)}`); + console.log(`HTTP requests: ${requestCount}`); + console.log('='.repeat(70) + '\n'); + + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { + status: stats.errors > 0 ? 'completed_with_errors' : 'completed', + completedAt: new Date(), + processed: stats.churchesParsed, + succeeded: stats.churchesCreated + stats.churchesMatched, + failed: stats.errors, + itemsFound: stats.schedulesCreated, + }, + }); + } catch { /* Ignore */ } + } +} + +main() + .catch((error) => { + console.error('Fatal error:', error); + process.exit(1); + }) + .finally(async () => { + await prisma.$disconnect(); + await pool.end(); + }); diff --git a/scripts/import-horariosmisas.ts b/scripts/import-horariosmisas.ts new file mode 100644 index 0000000..fb4a53f --- /dev/null +++ b/scripts/import-horariosmisas.ts @@ -0,0 +1,1028 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from horariosmisas.com (Spain) + * + * horariosmisas.com is a Spanish directory of Catholic parishes with mass + * schedules organized by province and city. The site uses a WordPress sitemap + * structure with ~20 post-sitemap files. + * + * Import strategy: + * 1. Fetch sitemap index → extract post-sitemap*.xml URLs + * 2. Fetch each post sitemap → extract church URLs (3 path segments) + * 3. Filter out non-church URLs (blog, legal pages, daily readings) + * 4. 
For each church: fetch HTML, parse name/address/phone/website/schedule + * 5. Match against existing ES churches, upsert + * 6. Optional geocoding pass via Nominatim + * + * Usage: + * npx tsx scripts/import-horariosmisas.ts --all + * npx tsx scripts/import-horariosmisas.ts --all --dry-run + * npx tsx scripts/import-horariosmisas.ts --province madrid + * npx tsx scripts/import-horariosmisas.ts --all --geocode + * npx tsx scripts/import-horariosmisas.ts --geocode-only + * npx tsx scripts/import-horariosmisas.ts --all --resume-from 500 + * npx tsx scripts/import-horariosmisas.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const SITE_BASE = 'https://horariosmisas.com'; +const SITEMAP_INDEX_URL = `${SITE_BASE}/sitemap_index.xml`; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const REQUEST_DELAY_MS = 1500; +const NOMINATIM_DELAY_MS = 1100; +const NOMINATIM_URL = 'https://nominatim.openstreetmap.org/search'; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface SitemapChurch { + province: string; + city: string; + slug: string; + url: string; +} + +interface ParsedChurch { + name: string; + address: string | null; + zip: string | null; + city: string | null; + phone: string | null; + website: string | null; +} + +interface ParsedSchedule { + dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat + time: string; // "05:00", "18:30" +} + +interface ImportStats { + churchesFound: number; + churchesMatched: number; + churchesCreated: number; + churchesSkipped: number; + schedulesProcessed: number; + massSchedulesCreated: number; + geocoded: number; + geocodeFailed: number; + errors: number; +} + +interface CLIArgs { + all: boolean; + province?: string; + dryRun: boolean; + geocode: boolean; + geocodeOnly: boolean; + resumeFrom?: number; + jobId?: string; +} + +// ─── Spanish Day Mapping ───────────────────────────────────────────────────── + +const DAY_MAP: Record = { + 'domingos y festivos': [0], + 'domingos': [0], + 'domingo': [0], + 'lunes': [1], + 'martes': [2], + 'miércoles': [3], + 'miercoles': [3], + 'jueves': [4], + 'viernes': [5], + 'sábado': [6], + 'sabado': [6], + 'sábados': [6], + 'sabados': [6], +}; + +const DAY_ORDER = ['domingo', 
'lunes', 'martes', 'miércoles', 'jueves', 'viernes', 'sábado']; + +// URL patterns to exclude (not church pages) +const EXCLUDE_PATTERNS = [ + /\/misas-diarias\//, + /\/santos-del-dia\//, + /\/oraciones\//, + /\/noticias\//, + /\/blog\//, + /\/contacto\//, + /\/aviso-legal\//, + /\/politica-de-privacidad\//, + /\/politica-de-cookies\//, +]; + +// ─── HTTP Client ───────────────────────────────────────────────────────────── + +let requestCount = 0; + +function delay(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchPage(url: string, delayMs: number = REQUEST_DELAY_MS): Promise { + if (requestCount > 0) { + await delay(delayMs); + } + requestCount++; + + try { + const response = await fetch(url, { + headers: { + 'User-Agent': USER_AGENT, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + }, + }); + + if (!response.ok) { + console.error(` HTTP ${response.status} for ${url}`); + return null; + } + + return await response.text(); + } catch (error) { + console.error(` Fetch error for ${url}: ${error instanceof Error ? 
error.message : error}`); + return null; + } +} + +// ─── Sitemap Parser ────────────────────────────────────────────────────────── + +async function fetchChurchUrlsFromSitemaps(): Promise { + console.log(`Fetching sitemap index: ${SITEMAP_INDEX_URL}`); + const indexXml = await fetchPage(SITEMAP_INDEX_URL); + if (!indexXml) { + throw new Error('Failed to fetch sitemap index'); + } + + // Extract post-sitemap URLs + const sitemapUrlRegex = /(https:\/\/horariosmisas\.com\/post-sitemap\d*\.xml)<\/loc>/g; + const sitemapUrls: string[] = []; + let match; + while ((match = sitemapUrlRegex.exec(indexXml)) !== null) { + sitemapUrls.push(match[1]); + } + + console.log(`Found ${sitemapUrls.length} post-sitemap files`); + + // Fetch each sitemap and extract church URLs + const allUrls: string[] = []; + for (const sitemapUrl of sitemapUrls) { + console.log(` Fetching ${sitemapUrl}...`); + const sitemapXml = await fetchPage(sitemapUrl); + if (!sitemapXml) { + console.error(` Failed to fetch ${sitemapUrl}`); + continue; + } + + const locRegex = /(https:\/\/horariosmisas\.com\/[^<]+)<\/loc>/g; + let locMatch; + while ((locMatch = locRegex.exec(sitemapXml)) !== null) { + allUrls.push(locMatch[1]); + } + } + + console.log(`Extracted ${allUrls.length} total URLs from sitemaps`); + + // Filter to church URLs: exactly 3 path segments (/{province}/{city}/{slug}/) + const seen = new Set(); + const churches: SitemapChurch[] = []; + + for (const url of allUrls) { + // Remove trailing slash and base URL to get path + const urlObj = new URL(url); + const pathSegments = urlObj.pathname.replace(/^\/|\/$/g, '').split('/'); + + // Must have exactly 3 segments + if (pathSegments.length !== 3) continue; + + // Exclude non-church patterns + const isExcluded = EXCLUDE_PATTERNS.some((pattern) => pattern.test(url)); + if (isExcluded) continue; + + const [province, city, slug] = pathSegments; + + // Deduplicate by slug + if (seen.has(slug)) continue; + seen.add(slug); + + churches.push({ + province, + 
city, + slug, + url: url.endsWith('/') ? url : `${url}/`, + }); + } + + // Sort alphabetically by province, then city, then slug + churches.sort((a, b) => + a.province.localeCompare(b.province) || + a.city.localeCompare(b.city) || + a.slug.localeCompare(b.slug), + ); + + console.log(`Found ${churches.length} unique church URLs after filtering`); + return churches; +} + +// ─── HTML Parsers ──────────────────────────────────────────────────────────── + +function parseChurchPage(html: string): ParsedChurch { + // Name from

Church Name (City)

+ const h1Match = html.match(/]*>([\s\S]*?)<\/h1>/i); + let name = ''; + if (h1Match) { + // Strip HTML tags, then strip (City) suffix + name = h1Match[1] + .replace(/<[^>]+>/g, '') + .replace(/\s*\([^)]*\)\s*$/, '') + .trim(); + } + + // Address: look for pin emoji followed by ... + // Handles both the emoji character and the HTML entity 📌 + let address: string | null = null; + let zip: string | null = null; + let city: string | null = null; + + const addressMatch = html.match(/(?:\u{1F4CC}|📌)\s*([\s\S]*?)<\/strong>/iu); + if (addressMatch) { + address = addressMatch[1] + .replace(/<[^>]+>/g, '') + .replace(/\s*\([^)]*\)\s*$/, '') // Strip (Province) suffix + .replace(/\s+/g, ' ') + .trim() || null; + + if (address) { + // Extract 5-digit Spanish postal code + const pcMatch = address.match(/\b(\d{5})\b/); + if (pcMatch) { + zip = pcMatch[1]; + // City is the text after the postal code + const afterPc = address.substring(address.indexOf(zip) + 5).trim(); + // Remove leading comma, dash, space + city = afterPc.replace(/^[,\-\s]+/, '').trim() || null; + } + } + } + + // Phone: Teléfono: number
(handle both accented and unaccented) + let phone: string | null = null; + const phoneMatch = html.match(/Tel[eé]fono:<\/strong>\s*]*>([\s\S]*?)<\/a>/i); + if (phoneMatch) { + phone = phoneMatch[1].replace(/<[^>]+>/g, '').trim() || null; + } + + // Website: Página Web: (handle both accented and unaccented) + let website: string | null = null; + const websiteMatch = html.match(/P[aá]gina\s+Web:<\/strong>\s*(); + + // Determine current season: Oct-May = winter, Jun-Sep = summer + const month = new Date().getMonth(); // 0-indexed + const isSummer = month >= 5 && month <= 8; // Jun(5) through Sep(8) + + // Try to split by seasonal headings + let relevantHtml = html; + + // Check for seasonal sections + const hasVerano = /verano/i.test(html); + const hasInvierno = /invierno/i.test(html); + + if (hasVerano && hasInvierno) { + // Split into seasonal sections + const veranoRegex = /(?:]*>|)[^<]*verano[^<]*(?:<\/h[2-4]>|<\/strong>)/gi; + const inviernoRegex = /(?:]*>|)[^<]*invierno[^<]*(?:<\/h[2-4]>|<\/strong>)/gi; + + const veranoMatch = veranoRegex.exec(html); + const inviernoMatch = inviernoRegex.exec(html); + + if (veranoMatch && inviernoMatch) { + if (isSummer) { + // Use the section starting from "verano" heading + const startIdx = veranoMatch.index; + const endIdx = inviernoMatch.index > startIdx + ? inviernoMatch.index + : html.length; + relevantHtml = html.substring(startIdx, endIdx); + } else { + // Use the section starting from "invierno" heading + const startIdx = inviernoMatch.index; + const endIdx = veranoMatch.index > startIdx + ? 
veranoMatch.index + : html.length; + relevantHtml = html.substring(startIdx, endIdx); + } + } + } + + // Find all elements with DÍA/HORARIO headers + const tableRegex = /]*>([\s\S]*?)<\/table>/gi; + let tableMatch; + + while ((tableMatch = tableRegex.exec(relevantHtml)) !== null) { + const tableHtml = tableMatch[1]; + + // Check if this looks like a schedule table (has DÍA or HORARIO headers) + if (!/d[ií]a/i.test(tableHtml) && !/horario/i.test(tableHtml)) { + continue; + } + + // Extract rows + const rowRegex = /]*>([\s\S]*?)<\/tr>/gi; + let rowMatch; + + while ((rowMatch = rowRegex.exec(tableHtml)) !== null) { + const row = rowMatch[1]; + + // Skip header rows + if (/
/**
 * Maps a Spanish day label from a schedule table to day-of-week numbers
 * (0 = Sunday … 6 = Saturday, the order of DAY_ORDER).
 *
 * Resolution order:
 *   1. exact DAY_MAP key ("domingos y festivos", "lunes", …)
 *   2. range "X a Y" / "de X a Y" ("Lunes a Sábado")
 *   3. list "X, Y y Z" ("Lunes, Miércoles y Viernes")
 *   4. first DAY_MAP key contained anywhere in the label
 *
 * @param dayText Raw text of the day cell.
 * @returns The matching day numbers, or an empty array when nothing matched.
 */
function resolveDays(dayText: string): number[] {
  // NFC so decomposed accents still hit the precomposed DAY_MAP keys.
  const normalized = dayText.normalize('NFC').toLowerCase().trim();

  // 1. Exact match in DAY_MAP
  const exact = DAY_MAP[normalized];
  if (exact) return exact;

  // 2. Range. \p{L} rather than \w: \w is ASCII-only, so "sábado" and
  //    "miércoles" never matched and the label fell through to step 4,
  //    which returned just the first day of the range.
  const rangeMatch = normalized.match(/^(?:de\s+)?(\p{L}+)\s+a\s+(\p{L}+)$/u);
  if (rangeMatch) {
    const startDay = findDayIndex(rangeMatch[1]);
    const endDay = findDayIndex(rangeMatch[2]);
    if (startDay !== -1 && endDay !== -1) {
      // Modular span so a range that wraps past Saturday ("Sábado a Domingo",
      // "Lunes a Domingo") is expanded instead of yielding nothing.
      const span = (endDay - startDay + 7) % 7;
      return Array.from({ length: span + 1 }, (_, offset) => (startDay + offset) % 7);
    }
  }

  // 3. Compound list: split on commas, then on the conjunction "y".
  const parts = normalized
    .split(/[,]\s*/)
    .flatMap((part) => part.split(/\s+y\s+/))
    .map((part) => part.trim())
    .filter((part) => part.length > 0);

  if (parts.length > 1) {
    const days = parts.flatMap((part) => {
      const mapped = DAY_MAP[part];
      if (mapped) return mapped;
      const idx = findDayIndex(part);
      return idx === -1 ? [] : [idx];
    });
    if (days.length > 0) return days;
  }

  // 4. Partial match: relies on DAY_MAP listing longer keys before the
  //    shorter keys they contain.
  for (const [key, value] of Object.entries(DAY_MAP)) {
    if (normalized.includes(key)) {
      return value;
    }
  }

  return [];
}

/**
 * Finds the DAY_ORDER index of a Spanish day name, ignoring case, accents and
 * one trailing plural "s".
 *
 * @param dayName Day name such as "Sábados" or "miercoles".
 * @returns The index in DAY_ORDER, or -1 when no day matches.
 */
function findDayIndex(dayName: string): number {
  // Decompose, then drop combining marks: handles precomposed and decomposed accents.
  const fold = (s: string): string =>
    s.toLowerCase().normalize('NFD').replace(/[\u0300-\u036f]/g, '');

  const wanted = fold(dayName.trim()).replace(/s$/, ''); // "sabados" -> "sabado", "lunes" -> "lune"

  // An empty string is a prefix of every day; without this guard "" and "s"
  // resolved to index 0 (domingo).
  if (wanted.length === 0) return -1;

  // Prefix match because plural stripping also shortens names that end in "s"
  // in the singular ("lunes" -> "lune").
  return DAY_ORDER.findIndex((day) => fold(day).startsWith(wanted));
}

/**
 * Extracts every clock time written as H:MM or HH:MM (an "h" suffix is
 * tolerated) and returns them zero-padded as "HH:MM", in order of appearance.
 *
 * Values that are not valid times of day (hour > 23 or minute > 59) and
 * digit groups embedded in longer numbers are ignored.
 *
 * @param text Raw text of the time cell.
 * @returns The normalized times; empty when none are found.
 */
function extractTimes(text: string): string[] {
  const times: string[] = [];

  // Lookarounds stop "110:305" from being read as "10:30".
  for (const match of text.matchAll(/(?<!\d)(\d{1,2}):(\d{2})(?!\d)/g)) {
    const hours = Number(match[1]);
    const minutes = Number(match[2]);
    if (hours <= 23 && minutes <= 59) {
      times.push(`${String(hours).padStart(2, '0')}:${match[2]}`);
    }
  }

  return times;
}
+ if (results.length > 0) { + const lat = parseFloat(results[0].lat); + const lng = parseFloat(results[0].lon); + if (!isNaN(lat) && !isNaN(lng)) { + return { lat, lng }; + } + } + } catch { + // Try next query + } + } + + return null; +} + +// ─── Database Operations ───────────────────────────────────────────────────── + +async function loadExistingSpanishChurches(): Promise { + console.log('Loading existing Spanish churches for deduplication...'); + const churches = await prisma.church.findMany({ + where: { country: 'ES' }, + select: { + id: true, + name: true, + latitude: true, + longitude: true, + osmId: true, + baiduId: true, + masstimesId: true, + orarimesseId: true, + massSchedulesPhId: true, + philmassId: true, + horariosMisasId: true, + mszeInfoId: true, + weekdayMassesId: true, + messesInfoId: true, + bohosluzbyId: true, + miserendId: true, + kerknetId: true, + gottesdienstzeitenId: true, + discovermassId: true, + source: true, + website: true, + phone: true, + address: true, + }, + }); + console.log(`Loaded ${churches.length} existing Spanish churches`); + return churches; +} + +async function geocodeUnmatchedChurches(dryRun: boolean, stats: ImportStats): Promise { + console.log('\n--- Geocoding Phase ---'); + const churches = await prisma.church.findMany({ + where: { + country: 'ES', + latitude: 0, + longitude: 0, + address: { not: null }, + }, + select: { + id: true, + name: true, + address: true, + zip: true, + city: true, + }, + }); + + console.log(`Found ${churches.length} Spanish churches needing geocoding`); + + for (let i = 0; i < churches.length; i++) { + const church = churches[i]; + console.log(` [${i + 1}/${churches.length}] Geocoding "${church.name}"...`); + + const coords = await forwardGeocode(church.address, church.zip, church.city); + + if (coords) { + console.log(` Found: ${coords.lat}, ${coords.lng}`); + stats.geocoded++; + + if (!dryRun) { + await prisma.church.update({ + where: { id: church.id }, + data: { + latitude: coords.lat, + 
longitude: coords.lng, + reverseGeocodedAt: new Date(), + }, + }); + } + } else { + console.log(` No results`); + stats.geocodeFailed++; + } + } +} + +// ─── Import Logic ──────────────────────────────────────────────────────────── + +async function processChurch( + sitemapEntry: SitemapChurch, + existingChurches: ExistingChurch[], + dryRun: boolean, + stats: ImportStats, +): Promise { + stats.churchesFound++; + + // Fetch church page + const churchHtml = await fetchPage(sitemapEntry.url); + if (!churchHtml) { + stats.errors++; + return; + } + + const parsed = parseChurchPage(churchHtml); + if (!parsed.name) { + console.log(` Skipping ${sitemapEntry.slug}: no name found`); + stats.churchesSkipped++; + return; + } + + // Parse schedule + const schedules = parseScheduleTable(churchHtml); + + // Build candidate for dedup — use lat: 0, lng: 0 since we rely on horariosMisasId match + const candidate = { + name: parsed.name, + lat: 0, + lng: 0, + horariosMisasId: sitemapEntry.slug, + }; + + const duplicate = findDuplicateChurch(candidate, existingChurches); + + if (dryRun) { + if (duplicate) { + stats.churchesMatched++; + console.log(` [MATCH] "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`); + } else { + stats.churchesCreated++; + console.log(` [NEW] "${parsed.name}" (${sitemapEntry.province}/${sitemapEntry.city})`); + } + if (schedules.length > 0) { + stats.schedulesProcessed++; + stats.massSchedulesCreated += schedules.length; + } + return; + } + + if (duplicate) { + // Update existing church + stats.churchesMatched++; + const updateData: Record = { + horariosMisasId: sitemapEntry.slug, + }; + + if (!duplicate.address && parsed.address) updateData.address = parsed.address; + if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone; + if (!duplicate.website && parsed.website) { + updateData.website = parsed.website; + updateData.hasWebsite = true; + } + + // Fill city/state/zip if not set + const dbRecord = await 
prisma.church.findUnique({ + where: { id: duplicate.id }, + select: { city: true, state: true, zip: true }, + }); + if (dbRecord && !dbRecord.city && parsed.city) updateData.city = parsed.city; + if (dbRecord && !dbRecord.state) updateData.state = sitemapEntry.province; + if (dbRecord && !dbRecord.zip && parsed.zip) updateData.zip = parsed.zip; + + try { + await prisma.church.update({ + where: { id: duplicate.id }, + data: updateData, + }); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + throw error; + } + + // Replace mass schedules + if (schedules.length > 0) { + try { + await prisma.$transaction(async (tx) => { + await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } }); + await tx.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: duplicate.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'Spanish', + })), + }); + await tx.church.update({ + where: { id: duplicate.id }, + data: { lastScrapedAt: new Date() }, + }); + }); + stats.schedulesProcessed++; + stats.massSchedulesCreated += schedules.length; + } catch (error) { + stats.errors++; + console.error(` Error saving schedules for ${sitemapEntry.slug}: ${error instanceof Error ? 
error.message : error}`); + } + } + } else { + // Create new church + try { + const newChurch = await prisma.church.create({ + data: { + name: parsed.name, + latitude: 0, + longitude: 0, + address: parsed.address, + zip: parsed.zip, + city: parsed.city || null, + state: sitemapEntry.province || null, + country: 'ES', + phone: parsed.phone, + website: parsed.website, + hasWebsite: !!parsed.website, + horariosMisasId: sitemapEntry.slug, + source: 'horariosmisas', + }, + }); + stats.churchesCreated++; + + // Add to in-memory array for within-run dedup + existingChurches.push({ + id: newChurch.id, + name: parsed.name, + latitude: 0, + longitude: 0, + osmId: null, + baiduId: null, + masstimesId: null, + orarimesseId: null, + massSchedulesPhId: null, + philmassId: null, + horariosMisasId: sitemapEntry.slug, + mszeInfoId: null, + weekdayMassesId: null, + messesInfoId: null, + bohosluzbyId: null, + miserendId: null, + kerknetId: null, + gottesdienstzeitenId: null, + discovermassId: null, + source: 'horariosmisas', + website: parsed.website, + phone: parsed.phone, + address: parsed.address, + }); + + // Create mass schedules + if (schedules.length > 0) { + await prisma.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: newChurch.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'Spanish', + })), + }); + await prisma.church.update({ + where: { id: newChurch.id }, + data: { lastScrapedAt: new Date() }, + }); + stats.schedulesProcessed++; + stats.massSchedulesCreated += schedules.length; + } + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + throw error; + } + } +} + +// ─── CLI ───────────────────────────────────────────────────────────────────── + +function parseArgs(): CLIArgs { + const args = process.argv.slice(2); + const result: CLIArgs = { + all: false, + dryRun: false, + geocode: false, + geocodeOnly: false, + }; + + for (let i = 0; i < args.length; 
/**
 * Renders an elapsed time in milliseconds as a compact string:
 * `"1h 2m 3s"`, `"2m 3s"` or `"3s"`. Sub-second remainders are truncated.
 *
 * Negative or non-finite inputs are clamped to zero so a progress line never
 * shows `"-1s"` or `"NaNs"`.
 *
 * @param ms Elapsed time in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Number.isFinite(ms) && ms > 0 ? Math.floor(ms / 1000) : 0;

  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds / 60) % 60;
  const seconds = totalSeconds % 60;

  if (hours > 0) return `${hours}h ${minutes}m ${seconds}s`;
  if (minutes > 0) return `${minutes}m ${seconds}s`;
  return `${seconds}s`;
}
──────────────────────────────────────────────────────────────────── + +async function main() { + const args = parseArgs(); + const startTime = Date.now(); + + console.log('\n' + '='.repeat(70)); + console.log('HORARIOSMISAS.COM (SPAIN) IMPORTER'); + console.log('='.repeat(70)); + console.log(`Mode: ${args.geocodeOnly ? 'Geocode only' : args.all ? 'All churches from sitemaps' : `Province: ${args.province}`}`); + console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`); + console.log(`Geocode: ${args.geocode || args.geocodeOnly ? 'YES' : 'NO'}`); + if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`); + console.log(`Time: ${new Date().toISOString()}`); + console.log('='.repeat(70) + '\n'); + + // Update background job status if provided + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { status: 'running', startedAt: new Date() }, + }); + } catch { + // Job might not exist yet + } + } + + const stats: ImportStats = { + churchesFound: 0, + churchesMatched: 0, + churchesCreated: 0, + churchesSkipped: 0, + schedulesProcessed: 0, + massSchedulesCreated: 0, + geocoded: 0, + geocodeFailed: 0, + errors: 0, + }; + + if (!args.geocodeOnly) { + // Load existing Spanish churches for dedup + const existingChurches = await loadExistingSpanishChurches(); + + // Fetch church URLs from sitemaps + const allChurches = await fetchChurchUrlsFromSitemaps(); + + // Filter by province if specified + let churchesToProcess = allChurches; + if (args.province) { + churchesToProcess = allChurches.filter((c) => c.province === args.province); + console.log(`Filtered to ${churchesToProcess.length} churches in province "${args.province}"\n`); + } else { + console.log(`Processing ${churchesToProcess.length} churches\n`); + } + + // Handle --resume-from + if (args.resumeFrom) { + const before = churchesToProcess.length; + churchesToProcess = churchesToProcess.slice(args.resumeFrom); + console.log(`Resuming from index 
${args.resumeFrom} (skipping ${before - churchesToProcess.length} churches)\n`); + } + + // Process each church + for (let i = 0; i < churchesToProcess.length; i++) { + const church = churchesToProcess[i]; + const elapsed = formatDuration(Date.now() - startTime); + console.log(`[${i + 1}/${churchesToProcess.length}] ${church.province}/${church.city}/${church.slug} [${elapsed} elapsed]`); + + try { + await processChurch(church, existingChurches, args.dryRun, stats); + } catch (error) { + stats.errors++; + console.error(` ERROR processing ${church.slug}: ${error instanceof Error ? error.message : error}`); + } + } + } + + // Geocode phase + if (args.geocode || args.geocodeOnly) { + await geocodeUnmatchedChurches(args.dryRun, stats); + } + + // Print summary + const totalTime = Date.now() - startTime; + console.log('\n' + '='.repeat(70)); + console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`); + console.log('='.repeat(70)); + console.log(`Churches found: ${stats.churchesFound}`); + console.log(` Matched (existing): ${stats.churchesMatched}`); + console.log(` Created (new): ${stats.churchesCreated}`); + console.log(` Skipped: ${stats.churchesSkipped}`); + console.log(`Schedules processed: ${stats.schedulesProcessed}`); + console.log(`Mass schedules created: ${stats.massSchedulesCreated}`); + if (args.geocode || args.geocodeOnly) { + console.log(`Geocoded: ${stats.geocoded}`); + console.log(`Geocode failed: ${stats.geocodeFailed}`); + } + console.log(`Errors: ${stats.errors}`); + console.log(`Total time: ${formatDuration(totalTime)}`); + console.log(`HTTP requests: ${requestCount}`); + console.log('='.repeat(70) + '\n'); + + // Update background job + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { + status: stats.errors > 0 ? 
'completed_with_errors' : 'completed', + completedAt: new Date(), + result: JSON.stringify(stats), + }, + }); + } catch { + // Ignore + } + } +} + +main() + .catch((error) => { + console.error('Fatal error:', error); + process.exit(1); + }) + .finally(async () => { + await prisma.$disconnect(); + await pool.end(); + }); diff --git a/scripts/import-kerknet.ts b/scripts/import-kerknet.ts new file mode 100644 index 0000000..a03167a --- /dev/null +++ b/scripts/import-kerknet.ts @@ -0,0 +1,697 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from kerknet.be (Flanders, Belgium) + * + * Kerknet is the portal of the Catholic Church in Flanders (Dutch-speaking Belgium). + * It has ~1,200 churches with structured data: name, address, coordinates (GeoJSON), + * and date-specific celebration entries. + * + * Import strategy: + * 1. Enumerate unique church slugs by paginating the celebration listing + * 2. Scrape each /kerk/{slug} page for structured data (name, address, coords, nodeId) + * 3. Fetch celebrations via AJAX endpoint per church + * 4. Deduce recurring weekly schedules from date-specific celebrations + * 5. Match against existing Belgian churches via church-matcher + * 6. 
Upsert churches and mass schedules + * + * Usage: + * npx tsx scripts/import-kerknet.ts --all --dry-run + * npx tsx scripts/import-kerknet.ts --all + * npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run + * npx tsx scripts/import-kerknet.ts --all --resume-from 100 + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const BASE_URL = 'https://www.kerknet.be'; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const ENUM_DELAY_MS = 2000; // Delay between listing pages (respecting crawl-delay spirit) +const DETAIL_DELAY_MS = 3000; // Delay between church detail page fetches +const CELEBRATION_DELAY_MS = 2000; // Delay between celebration AJAX calls +const MAX_RETRIES = 3; +const RETRY_DELAY_MS = 10000; +const MAX_ENUM_PAGES = 2804; // Total celebration listing pages +const ENUM_SAMPLE_INTERVAL = 5; // Check every Nth page (5 → ~560 pages to check) +const STALE_THRESHOLD = 10; // Stop if N consecutive sampled pages yield no new slugs + +// Dutch day abbreviations → dayOfWeek (0=Sun, 1=Mon, ..., 6=Sat) +const 
DUTCH_DAYS: Record = { + 'zo': 0, 'ma': 1, 'di': 2, 'wo': 3, 'do': 4, 'vr': 5, 'za': 6, +}; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface ChurchData { + slug: string; + nodeId: string; + name: string; + address: string | null; + zip: string | null; + city: string | null; + latitude: number; + longitude: number; + website: string | null; +} + +interface CelebrationEntry { + dayAbbrev: string; + date: string; // DD/MM + time: string; // HH.MM or HH:MM + type: string; // Eucharistie, Gebedsdienst, etc. +} + +interface ParsedSchedule { + dayOfWeek: number; + time: string; +} + +interface ImportStats { + slugsEnumerated: number; + churchesFetched: number; + churchesMatched: number; + churchesCreated: number; + churchesSkipped: number; + schedulesCreated: number; + errors: number; +} + +interface CLIArgs { + all: boolean; + dryRun: boolean; + resumeFrom?: number; + slug?: string; + jobId?: string; +} + +// ─── HTTP Helpers ──────────────────────────────────────────────────────────── + +let requestCount = 0; + +function delay(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchPage(url: string, delayMs: number): Promise { + if (requestCount > 0) { + await delay(delayMs); + } + requestCount++; + + for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { + try { + const response = await fetch(url, { + headers: { 'User-Agent': USER_AGENT }, + }); + + if (response.status === 429 || response.status === 503) { + if (attempt < MAX_RETRIES) { + console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`); + await delay(RETRY_DELAY_MS); + continue; + } + return null; + } + + if (!response.ok) { + if (attempt < MAX_RETRIES) { + await delay(RETRY_DELAY_MS); + continue; + } + return null; + } + + return await response.text(); + } catch (error) { + if (attempt < MAX_RETRIES) { + console.log(` Network error — retrying 
/**
 * Decode the body of a JSON string literal (the text between the quotes),
 * turning escapes such as `\u00e9`, `\/` and `\"` into the characters they
 * stand for. Falls back to the raw text when it is not valid JSON.
 */
function decodeKerknetJsonString(raw: string): string {
  try {
    const decoded: unknown = JSON.parse(`"${raw}"`);
    return typeof decoded === 'string' ? decoded : raw;
  } catch {
    return raw;
  }
}

/**
 * Extract structured church data from the HTML of a kerknet.be `/kerk/{slug}` page.
 *
 * @param html - Raw HTML of the church detail page.
 * @param slug - URL slug of the church; used as fallback for both the name and the node ID.
 * @returns The parsed church, or `null` when the page carries no usable coordinates
 *          (missing, non-numeric, or exactly 0/0).
 */
function parseChurchPage(html: string, slug: string): ChurchData | null {
  // Coordinates are read from the embedded GeoJSON, which lists longitude first.
  const coordMatch = html.match(/"coordinates":\[(-?[\d.]+),(-?[\d.]+)\]/);
  if (!coordMatch) return null; // No coordinates = unusable

  const longitude = parseFloat(coordMatch[1]);
  const latitude = parseFloat(coordMatch[2]);
  // `[\d.]+` also matches junk such as "."; never let NaN reach the database.
  if (!Number.isFinite(latitude) || !Number.isFinite(longitude)) return null;
  if (latitude === 0 && longitude === 0) return null;

  // Node ID, falling back to the slug when the page does not expose one.
  const nidMatch = html.match(/"currentNid":"(\d+)"/);
  const nodeId = nidMatch ? nidMatch[1] : slug;

  // Name: prefer the GeoJSON description, then the <title> (text before the
  // first "|"), and finally the slug.
  let name = slug;
  // Allow backslash escapes inside the value so an escaped quote does not end the match early.
  const descMatch = html.match(/"description":"((?:[^"\\]|\\.)+)"/);
  if (descMatch) {
    name = decodeKerknetJsonString(descMatch[1]);
  } else {
    // Anchor on <title>; an unanchored pattern would grab the first text run of the document.
    const titleMatch = html.match(/<title[^>]*>([^|<]+)/i);
    const title = titleMatch ? titleMatch[1].trim() : '';
    if (title) name = title;
  }

  // Address parts are the text content of the matching address-field elements.
  const firstText = (pattern: RegExp): string | null => {
    const found = html.match(pattern);
    return found ? found[1].trim() : null;
  };
  const address = firstText(/class="thoroughfare">([^<]+)</);
  const zip = firstText(/class="postal-code">([^<]+)</);
  const city = firstText(/class="locality">([^<]+)</);

  // Website: first link inside the "website" element, else inside the kn-website field.
  let website: string | null = null;
  const websiteMatch = html.match(/class="website"[^>]*>.*?href="([^"]+)"/s);
  if (websiteMatch) {
    website = websiteMatch[1];
  } else {
    const knWebsiteMatch = html.match(/field-name-kn-website.*?href="([^"]+)"/s);
    if (knWebsiteMatch) website = knWebsiteMatch[1];
  }

  return { slug, nodeId, name, address, zip, city, latitude, longitude, website };
}
/**
 * Collapse date-specific celebrations into a de-duplicated list of weekly
 * (dayOfWeek, time) mass slots.
 *
 * Only entries whose type is "eucharistie" or "eucharistieviering" are kept.
 * Times are normalized to zero-padded "HH:MM"; entries with an unknown day
 * abbreviation or a malformed / out-of-range time are dropped instead of
 * being passed on as schedule rows.
 *
 * @param celebrations - Entries as produced by the celebration parser.
 * @returns Unique schedules in first-seen order.
 */
function deduceSchedules(celebrations: CelebrationEntry[]): ParsedSchedule[] {
  const seen = new Set<string>();
  const schedules: ParsedSchedule[] = [];

  for (const celebration of celebrations) {
    // Only keep Eucharistie (mass) entries
    if (celebration.type !== 'eucharistie' && celebration.type !== 'eucharistieviering') continue;

    // typeof guard also rejects prototype keys such as "constructor".
    const dayOfWeek = DUTCH_DAYS[celebration.dayAbbrev];
    if (typeof dayOfWeek !== 'number') continue;

    // Accept "15.00", "15:00" or "9.30"; anything else is not a clock time.
    const timeMatch = celebration.time.trim().match(/^(\d{1,2})[.:](\d{2})$/);
    if (!timeMatch) continue;
    if (Number(timeMatch[1]) > 23 || Number(timeMatch[2]) > 59) continue;

    const time = `${timeMatch[1].padStart(2, '0')}:${timeMatch[2]}`;

    const key = `${dayOfWeek}:${time}`;
    if (seen.has(key)) continue;
    seen.add(key);
    schedules.push({ dayOfWeek, time });
  }

  return schedules;
}
DETAIL_DELAY_MS); + if (!churchHtml) { + stats.errors++; + return; + } + + const church = parseChurchPage(churchHtml, slug); + if (!church) { + stats.churchesSkipped++; + return; + } + + stats.churchesFetched++; + + // Fetch celebrations via AJAX + let celebrations: CelebrationEntry[] = []; + const celebHtml = await fetchPage( + `${BASE_URL}/kerknet-celebration/churches/ajax/load-more/0/${church.nodeId}`, + CELEBRATION_DELAY_MS, + ); + if (celebHtml) { + celebrations = parseCelebrations(celebHtml); + } + + const schedules = deduceSchedules(celebrations); + + const kerknetId = `kerknet-${church.nodeId}`; + const candidate = { + name: church.name, + lat: church.latitude, + lng: church.longitude, + kerknetId, + }; + + const duplicate = findDuplicateChurch(candidate, existingChurches); + + if (dryRun) { + if (duplicate) { + stats.churchesMatched++; + } else { + stats.churchesCreated++; + } + stats.schedulesCreated += schedules.length; + return; + } + + if (duplicate) { + stats.churchesMatched++; + const updateData: Record<string, unknown> = { kerknetId }; + + if (!duplicate.address && church.address) updateData.address = church.address; + if (!duplicate.website && church.website) updateData.website = church.website; + + try { + await prisma.church.update({ + where: { id: duplicate.id }, + data: updateData, + }); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + throw error; + } + + if (schedules.length > 0) { + try { + await prisma.$transaction(async (tx) => { + await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } }); + await tx.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: duplicate.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'Dutch', + })), + }); + await tx.church.update({ + where: { id: duplicate.id }, + data: { lastScrapedAt: new Date() }, + }); + }); + stats.schedulesCreated += schedules.length; + } catch (error) { + 
stats.errors++; + console.error(` Error saving schedules for ${slug}: ${error instanceof Error ? error.message : error}`); + } + } + } else { + try { + const newChurch = await prisma.church.create({ + data: { + name: church.name, + latitude: church.latitude, + longitude: church.longitude, + address: church.address, + zip: church.zip, + city: church.city, + country: 'BE', + website: church.website, + hasWebsite: !!church.website, + kerknetId, + source: 'kerknet', + websiteLanguage: 'nl', + }, + }); + stats.churchesCreated++; + + existingChurches.push({ + id: newChurch.id, + name: church.name, + latitude: church.latitude, + longitude: church.longitude, + osmId: null, + baiduId: null, + masstimesId: null, + orarimesseId: null, + massSchedulesPhId: null, + philmassId: null, + horariosMisasId: null, + mszeInfoId: null, + weekdayMassesId: null, + messesInfoId: null, + bohosluzbyId: null, + miserendId: null, + kerknetId, + gottesdienstzeitenId: null, + discovermassId: null, + source: 'kerknet', + website: church.website, + phone: null, + address: church.address, + }); + + if (schedules.length > 0) { + await prisma.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: newChurch.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'Dutch', + })), + }); + await prisma.church.update({ + where: { id: newChurch.id }, + data: { lastScrapedAt: new Date() }, + }); + stats.schedulesCreated += schedules.length; + } + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + stats.errors++; + console.error(` Error creating ${slug}: ${error instanceof Error ? 
/**
 * Parse the command-line flags of the kerknet importer from `process.argv`.
 *
 * Exits the process with code 0 after printing help, and with code 1 when
 * neither `--all` nor `--slug` is given, when a value-taking flag has no
 * value, or when `--resume-from` is not a non-negative integer.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Read the value following a flag; a missing value is a usage error rather
  // than a silently undefined option.
  const takeValue = (index: number, flag: string): string => {
    const value = args[index];
    if (value === undefined) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = takeValue(++i, '--resume-from');
        // A NaN here would be treated as "no resume" later on, and a negative
        // number would make slice() count from the end of the list.
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = Number.parseInt(raw, 10);
        break;
      }
      case '--slug':
        result.slug = takeValue(++i, '--slug');
        break;
      case '--job-id':
        result.jobId = takeValue(++i, '--job-id');
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-kerknet.ts [options]

Options:
  --all              Import all churches from kerknet.be
  --slug <slug>      Import a single church (e.g., o-l-vrouw-kerk-scherpenheuvel)
  --dry-run          No database writes, just report what would happen
  --resume-from <n>  Skip first N churches (after enumeration)
  --job-id <uuid>    Background job tracking ID
  --help, -h         Show this help message

Examples:
  npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run
  npx tsx scripts/import-kerknet.ts --all --dry-run
  npx tsx scripts/import-kerknet.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.slug) {
    console.error('Error: specify --all or --slug <slug>');
    process.exit(1);
  }

  return result;
}
+ console.log('='.repeat(70)); + console.log(`Mode: ${args.slug ? `Single: ${args.slug}` : 'All churches'}`); + console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`); + if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`); + console.log(`Time: ${new Date().toISOString()}`); + console.log('='.repeat(70) + '\n'); + + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { status: 'running', startedAt: new Date() }, + }); + } catch { /* Job might not exist */ } + } + + const stats: ImportStats = { + slugsEnumerated: 0, + churchesFetched: 0, + churchesMatched: 0, + churchesCreated: 0, + churchesSkipped: 0, + schedulesCreated: 0, + errors: 0, + }; + + const existingChurches = await loadExistingBelgianChurches(); + + // Get list of church slugs + let slugs: string[]; + if (args.slug) { + slugs = [args.slug]; + } else { + slugs = await enumerateChurchSlugs(); + stats.slugsEnumerated = slugs.length; + } + + if (args.resumeFrom && !args.slug) { + slugs = slugs.slice(args.resumeFrom); + console.log(`Resuming from church index ${args.resumeFrom} (${slugs[0]})\n`); + } + + console.log(`Processing ${slugs.length} churches\n`); + + for (let i = 0; i < slugs.length; i++) { + const slug = slugs[i]; + const elapsed = formatDuration(Date.now() - startTime); + + if (i % 50 === 0 || slugs.length <= 10) { + console.log(`[${i + 1}/${slugs.length}] ${slug} [${elapsed} elapsed, ${stats.churchesCreated} new, ${stats.churchesMatched} matched]`); + } + + try { + await processChurch(slug, existingChurches, args.dryRun, stats); + } catch (error) { + stats.errors++; + console.error(` ERROR processing ${slug}: ${error instanceof Error ? error.message : error}`); + } + } + + const totalTime = Date.now() - startTime; + console.log('\n' + '='.repeat(70)); + console.log(`IMPORT SUMMARY ${args.dryRun ? 
'(DRY RUN)' : ''}`); + console.log('='.repeat(70)); + console.log(`Slugs enumerated: ${stats.slugsEnumerated}`); + console.log(`Churches fetched: ${stats.churchesFetched}`); + console.log(` Matched (existing): ${stats.churchesMatched}`); + console.log(` Created (new): ${stats.churchesCreated}`); + console.log(` Skipped: ${stats.churchesSkipped}`); + console.log(`Schedules created: ${stats.schedulesCreated}`); + console.log(`Errors: ${stats.errors}`); + console.log(`Total time: ${formatDuration(totalTime)}`); + console.log(`HTTP requests: ${requestCount}`); + console.log('='.repeat(70) + '\n'); + + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { + status: stats.errors > 0 ? 'completed_with_errors' : 'completed', + completedAt: new Date(), + processed: stats.churchesFetched, + succeeded: stats.churchesCreated + stats.churchesMatched, + failed: stats.errors, + itemsFound: stats.schedulesCreated, + }, + }); + } catch { /* Ignore */ } + } +} + +main() + .catch((error) => { + console.error('Fatal error:', error); + process.exit(1); + }) + .finally(async () => { + await prisma.$disconnect(); + await pool.end(); + }); diff --git a/scripts/import-mass-schedules-ph.ts b/scripts/import-mass-schedules-ph.ts new file mode 100644 index 0000000..827e608 --- /dev/null +++ b/scripts/import-mass-schedules-ph.ts @@ -0,0 +1,695 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from mass-schedules.com (Philippines) + * + * mass-schedules.com has been operating since 2008 and covers ~1,500 Philippine + * churches with weekly mass schedule tables and coordinates on separate map pages. + * + * Import strategy: + * 1. Fetch sitemap XML → extract all /catholic-church/{id}/ URLs + * 2. 
For each church: fetch page HTML, parse name/address/schedule, fetch map + * page for coordinates, match against existing PH churches, upsert + * + * Usage: + * npx tsx scripts/import-mass-schedules-ph.ts --all + * npx tsx scripts/import-mass-schedules-ph.ts --all --dry-run + * npx tsx scripts/import-mass-schedules-ph.ts --church-id 34 + * npx tsx scripts/import-mass-schedules-ph.ts --all --resume-from 500 + * npx tsx scripts/import-mass-schedules-ph.ts --all --skip-schedules + * npx tsx scripts/import-mass-schedules-ph.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const SITE_BASE = 'https://www.mass-schedules.com'; +const SITEMAP_URL = `${SITE_BASE}/sitemaps/sitemap02272021.xml`; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const REQUEST_DELAY_MS = 1500; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface SitemapChurch { + id: string; + slug: string; + url: string; +} + +interface ParsedChurch { + name: string; + address: string | null; + region: string | null; + city: string | null; + phone: string | null; + mapUrl: string | null; +} + +interface ParsedSchedule { + dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat + time: string; // "05:00", "18:30" +} + +interface ImportStats { + churchesFound: number; + churchesMatched: number; + churchesCreated: number; + churchesSkipped: number; + schedulesProcessed: number; + massSchedulesCreated: number; + errors: number; +} + +interface CLIArgs { + all: boolean; + churchId?: string; + dryRun: boolean; + skipSchedules: boolean; + resumeFrom?: number; + jobId?: string; +} + +// ─── HTTP Client ───────────────────────────────────────────────────────────── + +let requestCount = 0; + +function delay(ms: number): Promise<void> { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchPage(url: string): Promise<string | null> { + if (requestCount > 0) { + await delay(REQUEST_DELAY_MS); + } + requestCount++; + + try { + const response = await fetch(url, { + headers: { + 'User-Agent': USER_AGENT, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + }, + }); + + if (!response.ok) { + 
/**
 * Download the sitemap and list every church page it references.
 *
 * Each `/catholic-church/{id}/{slug}.html` URL becomes one entry; when the
 * sitemap repeats an ID only its first occurrence is kept. The result is
 * ordered by numeric church ID so runs are reproducible.
 *
 * @throws Error when the sitemap cannot be fetched.
 */
async function fetchChurchUrlsFromSitemap(): Promise<SitemapChurch[]> {
  console.log(`Fetching sitemap: ${SITEMAP_URL}`);
  const sitemapXml = await fetchPage(SITEMAP_URL);
  if (!sitemapXml) {
    throw new Error('Failed to fetch sitemap');
  }

  // Keyed by church ID; a Map keeps first-seen insertion order.
  const churchesById = new Map<string, SitemapChurch>();
  for (const [, id, slug] of sitemapXml.matchAll(/\/catholic-church\/(\d+)\/([\w-]+)\.html/g)) {
    if (churchesById.has(id)) continue; // Sitemap has duplicates
    churchesById.set(id, {
      id,
      slug,
      url: `${SITE_BASE}/catholic-church/${id}/${slug}.html`,
    });
  }

  // Sort by ID for predictable ordering
  const ordered = [...churchesById.values()];
  ordered.sort((left, right) => parseInt(left.id) - parseInt(right.id));
  return ordered;
}
<p class="data">...</p> + const addressMatch = html.match(/<label>address:<\/label>\s*<p class="data">([\s\S]*?)<\/p>/i); + let address: string | null = null; + let mapUrl: string | null = null; + if (addressMatch) { + // Extract map link before cleaning + const mapLinkMatch = addressMatch[1].match(/href="(\/location-map\/[^"]+)"/); + if (mapLinkMatch) { + mapUrl = `${SITE_BASE}${mapLinkMatch[1]}`; + } + // Clean address: remove HTML tags, normalize whitespace + address = addressMatch[1] + .replace(/<[^>]+>/g, '') + .replace(/\(show location map\)/i, '') + .replace(/\s+/g, ' ') + .trim() || null; + } + + // Phone from <label>telephone number:</label> ... <p class="data_inline" id="TELEPHONE">...</p> + const phoneMatch = html.match(/id="TELEPHONE"[^>]*>([\s\S]*?)<\/p>/i); + const phone = phoneMatch ? phoneMatch[1].trim() || null : null; + + // Region and city from breadcrumbs + // Pattern: > {Region} > {City} + const breadcrumbMatches = [...html.matchAll(/class="normal"\s+href="[^"]*\/locations\/\d+\/[^"]*"[^>]*>([^<]+)<\/a>/gi)]; + const region = breadcrumbMatches.length > 0 ? breadcrumbMatches[0][1].trim() : null; + + const cityMatches = [...html.matchAll(/class="normal"\s+href="[^"]*\/catholic-churches\/\d+\/[^"]*"[^>]*>([^<]+)<\/a>/gi)]; + const city = cityMatches.length > 0 ? 
/**
 * Parse the weekly schedule table of a church page into (dayOfWeek, time) slots.
 *
 * The first `<tbody>` is read row by row; the first seven `<td>` cells of a
 * row map to Sunday (0) through Saturday (6). Every `<p class="schedule">`
 * inside a cell contributes its start time, so a day cell listing several
 * masses yields several slots. Duplicate (day, time) pairs are emitted once.
 *
 * @param html - Raw HTML of the church page.
 * @returns Schedules in row-then-column order; empty when no table body is found.
 */
function parseScheduleTable(html: string): ParsedSchedule[] {
  const schedules: ParsedSchedule[] = [];
  const seen = new Set<string>();

  const tbodyMatch = html.match(/<tbody>([\s\S]*?)<\/tbody>/i);
  if (!tbodyMatch) return schedules;

  const rows = tbodyMatch[1].match(/<tr>([\s\S]*?)<\/tr>/gi) ?? [];

  for (const row of rows) {
    const cells = row.match(/<td>([\s\S]*?)<\/td>/gi) ?? [];
    const columnCount = Math.min(cells.length, 7);

    for (let dayOfWeek = 0; dayOfWeek < columnCount; dayOfWeek++) {
      // Anchored right after the opening tag, so only the start of a
      // "5:00 AM - 6:00 AM" range is captured, never the end time.
      const startTimes = cells[dayOfWeek].matchAll(
        /<p class="schedule">\s*(\d{1,2}:\d{2}\s*[AP]M)/gi,
      );

      for (const [, startTime] of startTimes) {
        const time = convertTo24Hour(startTime.trim());
        if (!time) continue;

        const key = `${dayOfWeek}:${time}`;
        if (seen.has(key)) continue;
        seen.add(key);

        schedules.push({ dayOfWeek, time });
      }
    }
  }

  return schedules;
}

/**
 * Convert a 12-hour clock string to zero-padded 24-hour "HH:MM".
 *
 * Examples: "5:00 AM" → "05:00", "6:30 PM" → "18:30", "12:00 AM" → "00:00".
 *
 * @param timeStr - Time such as "5:00 AM"; the AM/PM marker is case-insensitive.
 * @returns The converted time, or `null` when the input does not match the
 *          expected format or has an hour above 12 or a minute above 59.
 */
function convertTo24Hour(timeStr: string): string | null {
  const match = timeStr.match(/^(\d{1,2}):(\d{2})\s*(AM|PM)$/i);
  if (!match) return null;

  let hours = Number.parseInt(match[1], 10);
  const minutes = match[2];
  const period = match[3].toUpperCase();

  // Without this check "13:00 PM" would come out as "25:00".
  if (hours > 12 || Number.parseInt(minutes, 10) > 59) return null;

  if (period === 'AM' && hours === 12) hours = 0;
  if (period === 'PM' && hours !== 12) hours += 12;

  return `${String(hours).padStart(2, '0')}:${minutes}`;
}
html.match(/ms\.ui\.church\.params\.lat\s*=\s*'([^']+)'/); + const lngMatch = html.match(/ms\.ui\.church\.params\.lng\s*=\s*'([^']+)'/); + + if (!latMatch || !lngMatch) return null; + + const lat = parseFloat(latMatch[1]); + const lng = parseFloat(lngMatch[1]); + + if (isNaN(lat) || isNaN(lng) || lat === 0 || lng === 0) return null; + + return { lat, lng }; +} + +// ─── Database Operations ───────────────────────────────────────────────────── + +async function loadExistingPhilippineChurches(): Promise<ExistingChurch[]> { + console.log('Loading existing Philippine churches for deduplication...'); + const churches = await prisma.church.findMany({ + where: { country: 'PH' }, + select: { + id: true, + name: true, + latitude: true, + longitude: true, + osmId: true, + baiduId: true, + masstimesId: true, + orarimesseId: true, + massSchedulesPhId: true, + philmassId: true, + horariosMisasId: true, + mszeInfoId: true, + weekdayMassesId: true, + messesInfoId: true, + bohosluzbyId: true, + miserendId: true, + kerknetId: true, + gottesdienstzeitenId: true, + discovermassId: true, + source: true, + website: true, + phone: true, + address: true, + }, + }); + console.log(`Loaded ${churches.length} existing Philippine churches`); + return churches; +} + +// ─── Import Logic ──────────────────────────────────────────────────────────── + +async function processChurch( + sitemapEntry: SitemapChurch, + existingChurches: ExistingChurch[], + dryRun: boolean, + skipSchedules: boolean, + stats: ImportStats, +): Promise<void> { + stats.churchesFound++; + + // Fetch church page + const churchHtml = await fetchPage(sitemapEntry.url); + if (!churchHtml) { + stats.errors++; + return; + } + + const parsed = parseChurchPage(churchHtml); + if (!parsed.name) { + console.log(` Skipping ${sitemapEntry.id}: no name found`); + stats.churchesSkipped++; + return; + } + + // Fetch coordinates from map page + let coords: { lat: number; lng: number } | null = null; + if (parsed.mapUrl) { + const mapHtml = 
await fetchPage(parsed.mapUrl); + if (mapHtml) { + coords = parseCoordinates(mapHtml); + } + } + + if (!coords) { + console.log(` Skipping ${sitemapEntry.id} (${parsed.name}): no coordinates`); + stats.churchesSkipped++; + return; + } + + // Parse schedule + const schedules = skipSchedules ? [] : parseScheduleTable(churchHtml); + + // Build candidate for dedup + const candidate = { + name: parsed.name, + lat: coords.lat, + lng: coords.lng, + massSchedulesPhId: sitemapEntry.id, + }; + + const duplicate = findDuplicateChurch(candidate, existingChurches); + + if (dryRun) { + if (duplicate) { + stats.churchesMatched++; + console.log(` [MATCH] ${sitemapEntry.id}: "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`); + } else { + stats.churchesCreated++; + console.log(` [NEW] ${sitemapEntry.id}: "${parsed.name}" at ${coords.lat},${coords.lng}`); + } + if (schedules.length > 0) { + stats.schedulesProcessed++; + stats.massSchedulesCreated += schedules.length; + } + return; + } + + if (duplicate) { + // Update existing church + stats.churchesMatched++; + const updateData: Record<string, unknown> = { + massSchedulesPhId: sitemapEntry.id, + }; + + if (!duplicate.address && parsed.address) updateData.address = parsed.address; + if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone; + + // Fill city/state from breadcrumbs + const dbRecord = await prisma.church.findUnique({ + where: { id: duplicate.id }, + select: { city: true, state: true }, + }); + if (dbRecord && !dbRecord.city && parsed.city) updateData.city = parsed.city; + if (dbRecord && !dbRecord.state && parsed.region) updateData.state = parsed.region; + + try { + await prisma.church.update({ + where: { id: duplicate.id }, + data: updateData, + }); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + throw error; + } + + // Replace mass schedules + if (schedules.length > 0 && !skipSchedules) { + try { 
+ await prisma.$transaction(async (tx) => { + await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } }); + await tx.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: duplicate.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'English', + })), + }); + await tx.church.update({ + where: { id: duplicate.id }, + data: { lastScrapedAt: new Date() }, + }); + }); + stats.schedulesProcessed++; + stats.massSchedulesCreated += schedules.length; + } catch (error) { + stats.errors++; + console.error(` Error saving schedules for ${sitemapEntry.id}: ${error instanceof Error ? error.message : error}`); + } + } + } else { + // Create new church + try { + const newChurch = await prisma.church.create({ + data: { + name: parsed.name, + latitude: coords.lat, + longitude: coords.lng, + address: parsed.address, + city: parsed.city || null, + state: parsed.region || null, + country: 'PH', + phone: parsed.phone, + hasWebsite: false, + massSchedulesPhId: sitemapEntry.id, + source: 'mass-schedules-ph', + }, + }); + stats.churchesCreated++; + + // Add to in-memory array for within-run dedup + existingChurches.push({ + id: newChurch.id, + name: parsed.name, + latitude: coords.lat, + longitude: coords.lng, + osmId: null, + baiduId: null, + masstimesId: null, + orarimesseId: null, + massSchedulesPhId: sitemapEntry.id, + philmassId: null, + horariosMisasId: null, + mszeInfoId: null, + weekdayMassesId: null, + messesInfoId: null, + bohosluzbyId: null, + miserendId: null, + kerknetId: null, + gottesdienstzeitenId: null, + discovermassId: null, + source: 'mass-schedules-ph', + website: null, + phone: parsed.phone, + address: parsed.address, + }); + + // Create mass schedules + if (schedules.length > 0 && !skipSchedules) { + await prisma.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: newChurch.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'English', + })), + }); + await prisma.church.update({ + where: { id: newChurch.id }, 
/**
 * Parses the command-line flags of the mass-schedules.com importer from
 * `process.argv`.
 *
 * Recognised flags: `--all`, `--church-id <id>`, `--dry-run`,
 * `--skip-schedules`, `--resume-from <id>`, `--job-id <uuid>`, `--help`/`-h`.
 * Unrecognised flags are ignored.
 *
 * Terminates the process:
 *  - with status 0 after printing usage for `--help` / `-h`;
 *  - with status 1 when `--resume-from` is not followed by a non-negative
 *    integer (previously this produced `NaN`, which the caller treated as
 *    "no resume point" and silently restarted from the beginning);
 *  - with status 1 when neither `--all` nor `--church-id` was given.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    skipSchedules: false,
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--church-id':
        result.churchId = args[++i];
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--skip-schedules':
        result.skipSchedules = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        // Explicit radix; reject a missing or non-numeric value instead of storing NaN.
        const parsed = parseInt(raw ?? '', 10);
        if (Number.isNaN(parsed) || parsed < 0) {
          console.error('Error: --resume-from requires a non-negative integer');
          process.exit(1);
        }
        result.resumeFrom = parsed;
        break;
      }
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-mass-schedules-ph.ts [options]

Options:
  --all                Import all churches from sitemap
  --church-id <id>     Import a single church by ID (e.g. "34")
  --dry-run            No database writes, just report what would happen
  --skip-schedules     Skip mass schedule import (churches only)
  --resume-from <id>   Skip churches with ID less than this value
  --job-id <uuid>      Background job tracking ID
  --help, -h           Show this help message

Examples:
  npx tsx scripts/import-mass-schedules-ph.ts --church-id 34 --dry-run
  npx tsx scripts/import-mass-schedules-ph.ts --all
  npx tsx scripts/import-mass-schedules-ph.ts --all --skip-schedules
  npx tsx scripts/import-mass-schedules-ph.ts --all --resume-from 500
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.churchId) {
    console.error('Error: specify --all or --church-id <id>');
    process.exit(1);
  }

  return result;
}
'YES' : 'NO'}`); + if (args.resumeFrom) console.log(`Resume from ID: ${args.resumeFrom}`); + console.log(`Time: ${new Date().toISOString()}`); + console.log('='.repeat(70) + '\n'); + + // Update background job status if provided + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { status: 'running', startedAt: new Date() }, + }); + } catch { + // Job might not exist yet + } + } + + // Load existing Philippine churches for dedup + const existingChurches = await loadExistingPhilippineChurches(); + + // Build church list: skip sitemap for single-church mode + let churchesToProcess: SitemapChurch[]; + if (args.churchId) { + // Single church: construct URL directly, no sitemap needed + churchesToProcess = [{ + id: args.churchId, + slug: 'church', + url: `${SITE_BASE}/catholic-church/${args.churchId}/church.html`, + }]; + console.log(`Single church mode: ID ${args.churchId}\n`); + } else { + // Full mode: fetch sitemap + const allChurches = await fetchChurchUrlsFromSitemap(); + console.log(`Found ${allChurches.length} unique church URLs in sitemap\n`); + churchesToProcess = allChurches; + } + + // Handle --resume-from + if (args.resumeFrom) { + const before = churchesToProcess.length; + churchesToProcess = churchesToProcess.filter((c) => parseInt(c.id) >= args.resumeFrom!); + console.log(`Resuming from ID ${args.resumeFrom} (skipping ${before - churchesToProcess.length} churches)\n`); + } + + const stats: ImportStats = { + churchesFound: 0, + churchesMatched: 0, + churchesCreated: 0, + churchesSkipped: 0, + schedulesProcessed: 0, + massSchedulesCreated: 0, + errors: 0, + }; + + // Process each church + for (let i = 0; i < churchesToProcess.length; i++) { + const church = churchesToProcess[i]; + const elapsed = formatDuration(Date.now() - startTime); + console.log(`[${i + 1}/${churchesToProcess.length}] Church ID ${church.id} [${elapsed} elapsed]`); + + try { + await processChurch(church, existingChurches, args.dryRun, 
args.skipSchedules, stats); + } catch (error) { + stats.errors++; + console.error(` ERROR processing church ${church.id}: ${error instanceof Error ? error.message : error}`); + } + } + + // Print summary + const totalTime = Date.now() - startTime; + console.log('\n' + '='.repeat(70)); + console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`); + console.log('='.repeat(70)); + console.log(`Churches found: ${stats.churchesFound}`); + console.log(` Matched (existing): ${stats.churchesMatched}`); + console.log(` Created (new): ${stats.churchesCreated}`); + console.log(` Skipped: ${stats.churchesSkipped}`); + console.log(`Schedules processed: ${stats.schedulesProcessed}`); + console.log(`Mass schedules created: ${stats.massSchedulesCreated}`); + console.log(`Errors: ${stats.errors}`); + console.log(`Total time: ${formatDuration(totalTime)}`); + console.log(`HTTP requests: ${requestCount}`); + console.log('='.repeat(70) + '\n'); + + // Update background job + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { + status: stats.errors > 0 ? 'completed_with_errors' : 'completed', + completedAt: new Date(), + result: JSON.stringify(stats), + }, + }); + } catch { + // Ignore + } + } +} + +main() + .catch((error) => { + console.error('Fatal error:', error); + process.exit(1); + }) + .finally(async () => { + await prisma.$disconnect(); + await pool.end(); + }); diff --git a/scripts/import-masstimes-api.ts b/scripts/import-masstimes-api.ts new file mode 100644 index 0000000..bcc8c33 --- /dev/null +++ b/scripts/import-masstimes-api.ts @@ -0,0 +1,672 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules globally from masstimes.org API + * + * masstimes.org has ~121,000 churches worldwide. This script queries their + * geo-search API with a grid of coordinates covering world landmass, then + * deduplicates and imports the results. 
+ * + * API: GET https://masstimes.org/Churchs/?lat={lat}&long={lng}&pg={page} + * - Requires Referer header + * - Returns 30 results per page within 100-mile (~160km) radius + * - Paginate until empty array + * + * Grid strategy: + * - 2.5° latitude spacing (~278km), longitude adjusted for latitude + * - Continental bounding boxes to skip oceans + * - 100-mile radius means ~322km diameter — 2.5° spacing ensures overlap + * + * Usage: + * npx tsx scripts/import-masstimes-api.ts --all + * npx tsx scripts/import-masstimes-api.ts --all --dry-run + * npx tsx scripts/import-masstimes-api.ts --region europe + * npx tsx scripts/import-masstimes-api.ts --all --skip-us + * npx tsx scripts/import-masstimes-api.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const API_BASE = 'https://masstimes.org/Churchs/'; +const REFERER = 'https://masstimes.org/map'; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const RATE_LIMIT_MS = 2000; // 2 seconds between requests — respectful rate +const PAGE_SIZE = 30; +const LAT_SPACING = 2.5; // degrees (~278km) +const TARGET_LNG_SPACING_KM = 250; // target spacing in km + +// Country name → ISO code mapping for masstimes country names +const COUNTRY_CODE_MAP: Record<string, string> = { + 'united states': 'US', 'canada': 'CA', 'mexico': 'MX', + 'united kingdom': 'GB', 'ireland': 'IE', 'france': 'FR', 'germany': 'DE', + 'spain': 'ES', 'italy': 'IT', 'portugal': 'PT', 'netherlands': 'NL', + 'belgium': 'BE', 'luxembourg': 'LU', 'switzerland': 'CH', 'austria': 'AT', + 'poland': 'PL', 'czech republic': 'CZ', 'czechia': 'CZ', 'slovakia': 'SK', + 'hungary': 'HU', 'croatia': 'HR', 'slovenia': 'SI', 'romania': 'RO', + 'bulgaria': 'BG', 'serbia': 'RS', 'bosnia and herzegovina': 'BA', + 'montenegro': 'ME', 'north macedonia': 'MK', 'albania': 'AL', 'kosovo': 'XK', + 'greece': 'GR', 'cyprus': 'CY', 'malta': 'MT', 'denmark': 'DK', + 'sweden': 'SE', 'norway': 'NO', 'finland': 'FI', 'iceland': 'IS', + 'estonia': 'EE', 'latvia': 'LV', 'lithuania': 'LT', + 'ukraine': 'UA', 'russia': 'RU', 'belarus': 'BY', 'moldova': 'MD', + 'georgia': 'GE', 'armenia': 'AM', 'azerbaijan': 'AZ', + 'turkey': 'TR', 'israel': 'IL', 'jordan': 'JO', 'lebanon': 'LB', + 'egypt': 'EG', 'morocco': 'MA', 'tunisia': 'TN', 'algeria': 'DZ', + 'india': 'IN', 'sri lanka': 'LK', 'pakistan': 'PK', 'bangladesh': 'BD', + 'nepal': 'NP', 
'myanmar': 'MM', 'thailand': 'TH', 'vietnam': 'VN', + 'cambodia': 'KH', 'laos': 'LA', 'malaysia': 'MY', 'singapore': 'SG', + 'indonesia': 'ID', 'philippines': 'PH', 'china': 'CN', 'japan': 'JP', + 'south korea': 'KR', 'korea, south': 'KR', 'taiwan': 'TW', + 'hong kong': 'HK', 'macau': 'MO', 'mongolia': 'MN', + 'australia': 'AU', 'new zealand': 'NZ', 'fiji': 'FJ', + 'papua new guinea': 'PG', 'samoa': 'WS', 'tonga': 'TO', 'guam': 'GU', + 'nigeria': 'NG', 'ghana': 'GH', 'kenya': 'KE', 'tanzania': 'TZ', + 'uganda': 'UG', 'south africa': 'ZA', 'cameroon': 'CM', 'senegal': 'SN', + 'ethiopia': 'ET', 'madagascar': 'MG', 'mozambique': 'MZ', + 'zambia': 'ZM', 'zimbabwe': 'ZW', 'malawi': 'MW', 'rwanda': 'RW', + 'burundi': 'BI', 'congo, democratic republic of the': 'CD', + 'congo, republic of the': 'CG', "côte d'ivoire": 'CI', 'ivory coast': 'CI', + 'burkina faso': 'BF', 'mali': 'ML', 'niger': 'NE', 'chad': 'TD', + 'central african republic': 'CF', 'gabon': 'GA', 'equatorial guinea': 'GQ', + 'angola': 'AO', 'namibia': 'NA', 'botswana': 'BW', 'lesotho': 'LS', + 'eswatini': 'SZ', 'swaziland': 'SZ', 'mauritius': 'MU', + 'brazil': 'BR', 'argentina': 'AR', 'colombia': 'CO', 'peru': 'PE', + 'chile': 'CL', 'venezuela': 'VE', 'ecuador': 'EC', 'bolivia': 'BO', + 'paraguay': 'PY', 'uruguay': 'UY', 'guyana': 'GY', 'suriname': 'SR', + 'trinidad and tobago': 'TT', 'jamaica': 'JM', 'barbados': 'BB', + 'bahamas': 'BS', 'bahamas, the': 'BS', 'haiti': 'HT', + 'dominican republic': 'DO', 'cuba': 'CU', 'puerto rico': 'PR', + 'guatemala': 'GT', 'honduras': 'HN', 'el salvador': 'SV', + 'nicaragua': 'NI', 'costa rica': 'CR', 'panama': 'PA', 'belize': 'BZ', + 'grenada': 'GD', 'saint lucia': 'LC', 'dominica': 'DM', + 'saint vincent and the grenadines': 'VC', 'antigua and barbuda': 'AG', + 'saint kitts and nevis': 'KN', 'bermuda': 'BM', 'cayman islands': 'KY', + 'aruba': 'AW', 'curaçao': 'CW', 'curacao': 'CW', + 'united arab emirates': 'AE', 'saudi arabia': 'SA', 'qatar': 'QA', + 'bahrain': 'BH', 
'kuwait': 'KW', 'oman': 'OM', 'iraq': 'IQ', + 'iran': 'IR', 'afghanistan': 'AF', + 'kazakhstan': 'KZ', 'uzbekistan': 'UZ', 'kyrgyzstan': 'KG', + 'tajikistan': 'TJ', 'turkmenistan': 'TM', + 'liechtenstein': 'LI', 'monaco': 'MC', 'andorra': 'AD', 'san marino': 'SM', + 'vatican city': 'VA', 'holy see (vatican city)': 'VA', + 'east timor': 'TL', 'timor-leste': 'TL', +}; + +// Continental bounding boxes (lat_min, lat_max, lng_min, lng_max) +const REGIONS: Record<string, Array<[number, number, number, number]>> = { + 'north-america': [[7, 72, -170, -50]], + 'central-america': [[7, 24, -120, -60]], + 'south-america': [[-56, 13, -82, -34]], + 'europe': [[35, 72, -12, 45]], + 'eastern-europe': [[40, 70, 20, 60]], + 'africa': [[-36, 38, -20, 55]], + 'middle-east': [[12, 42, 25, 65]], + 'south-asia': [[5, 38, 60, 98]], + 'east-asia': [[18, 55, 95, 150]], + 'southeast-asia': [[-12, 22, 92, 142]], + 'oceania': [[-48, -8, 110, 180], [-22, 0, 160, 180]], + 'central-asia': [[35, 55, 45, 90]], +}; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface MasstimesChurch { + id: string; + name: string; + latitude: string; + longitude: string; + church_address_street_address: string; + church_address_city_name: string; + church_address_providence_name: string; + church_address_postal_code: string; + church_address_country_territory_name: string; + church_address_county: string | null; + diocese_name: string; + phone_number: string; + email: string; + url: string; + pastors_name: string; + church_worship_times: MasstimesWorshipTime[]; + distance: string; + wheel_chair_access: boolean; +} + +interface MasstimesWorshipTime { + day_of_week: string; + time_start: string; + time_end: string; + language: string | null; + service_typename: string; + comment: string; + is_perpetual: boolean; +} + +interface ImportStats { + gridPoints: number; + apiRequests: number; + churchesDiscovered: number; + churchesMatched: number; + churchesCreated: number; + 
/**
 * Parses the command-line flags of the masstimes.org importer from
 * `process.argv`.
 *
 * Recognised flags: `--all`, `--region <name>`, `--dry-run`, `--skip-us`,
 * `--resume-from <n>`, `--job-id <uuid>`, `--help`. Unrecognised flags are
 * ignored. `resumeFrom` defaults to 0.
 *
 * Terminates the process:
 *  - with status 0 after printing usage for `--help`;
 *  - with status 1 when `--resume-from` is not followed by a non-negative
 *    integer (previously the value became `NaN`, every `i < resumeFrom`
 *    comparison was false, and the run silently restarted at grid point 0);
 *  - with status 1 when neither `--all` nor `--region` was given.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false, skipUs: false, resumeFrom: 0 };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--region':
        result.region = args[++i];
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--skip-us':
        result.skipUs = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        const parsed = parseInt(raw ?? '', 10);
        // Fail loudly: a NaN here would disable resuming without any warning.
        if (Number.isNaN(parsed) || parsed < 0) {
          console.error('Error: --resume-from requires a non-negative integer');
          process.exit(1);
        }
        result.resumeFrom = parsed;
        break;
      }
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
        console.log(`Usage: npx tsx scripts/import-masstimes-api.ts [options]
  --all              Query all regions globally
  --region <name>    Query specific region: ${Object.keys(REGIONS).join(', ')}
  --skip-us          Skip US grid points (already well-covered)
  --dry-run          No database writes
  --resume-from <n>  Skip first N grid points
  --job-id <uuid>    Background job tracking`);
        process.exit(0);
    }
  }

  if (!result.all && !result.region) {
    console.error('Error: specify --all or --region <name>');
    process.exit(1);
  }

  return result;
}
/**
 * Fetches one page of churches around a coordinate from the masstimes API.
 *
 * On HTTP 429 the call waits 30 seconds and retries, but only while
 * `retriesLeft` is positive. The original recursion had no bound, so a
 * persistently rate-limited endpoint would loop forever despite the
 * "retry once" intent.
 *
 * @param lat - Latitude of the query point.
 * @param lng - Longitude of the query point.
 * @param page - Page number sent as the `pg` query parameter.
 * @param retriesLeft - How many more 429 retries are allowed (default 1).
 * @returns The parsed JSON array for this page.
 * @throws Error when the response status is not OK (including a 429 after the
 *   retry budget is spent) or when the body is not a JSON array.
 */
async function fetchPage(
  lat: number,
  lng: number,
  page: number,
  retriesLeft = 1,
): Promise<MasstimesChurch[]> {
  const url = `${API_BASE}?lat=${lat}&long=${lng}&pg=${page}`;
  const response = await fetch(url, {
    headers: {
      'Referer': REFERER,
      'User-Agent': USER_AGENT,
      'Accept': 'application/json',
    },
  });

  if (!response.ok) {
    if (response.status === 429 && retriesLeft > 0) {
      console.error(`    Rate limited (429) — backing off 30s`);
      await sleep(30000);
      return fetchPage(lat, lng, page, retriesLeft - 1);
    }
    throw new Error(`HTTP ${response.status} for ${url}`);
  }

  // The body is external input: confirm it is an array before callers read
  // `.length` or spread it.
  const payload: unknown = await response.json();
  if (!Array.isArray(payload)) {
    throw new Error(`Unexpected non-array response for ${url}`);
  }
  return payload as MasstimesChurch[];
}
/**
 * Converts masstimes worship-time entries into per-day schedule rows.
 *
 * Only entries whose `service_typename` is 'Weekend' or 'Week Days' are kept.
 * Entries with an empty start time or the literal '00:00:00' are skipped, as
 * are entries whose `day_of_week` has no entry in `DAY_MAP`. A `DAY_MAP`
 * entry may expand to several days (e.g. 'weekdays'), producing one row each.
 *
 * The start time is now validated: it must look like `H[H][:M[M][:S[S]]]`
 * with hour 0-23 and minute 0-59, and is normalised to zero-padded `HH:MM`.
 * Previously any string was split on ':' and stored as-is, so 'TBD' became
 * 'TBD:00' and '7:5:00' became '07:5'.
 *
 * @param times - Worship-time entries as returned by the API.
 * @returns Schedule rows; `massType` is always null and `language` falls back
 *   to 'Unknown' when the entry has none.
 */
function parseWorshipTimes(times: MasstimesWorshipTime[]): Array<{
  dayOfWeek: number;
  time: string;
  language: string;
  notes: string | null;
  massType: string | null;
}> {
  const schedules: Array<{
    dayOfWeek: number;
    time: string;
    language: string;
    notes: string | null;
    massType: string | null;
  }> = [];

  // Hour, optional minutes, optional seconds (seconds are discarded).
  const timePattern = /^(\d{1,2})(?::(\d{1,2}))?(?::\d{1,2})?$/;

  for (const wt of times) {
    // Only import mass services (Weekend = Sun/Sat, Week Days = weekday masses)
    if (wt.service_typename !== 'Weekend' && wt.service_typename !== 'Week Days') {
      continue;
    }

    const timeStr = wt.time_start?.trim();
    if (!timeStr || timeStr === '00:00:00') continue;

    const match = timePattern.exec(timeStr);
    if (!match) continue; // not a clock time at all

    const hour = Number(match[1]);
    const minute = Number(match[2] ?? '0');
    if (hour > 23 || minute > 59) continue; // out-of-range clock value

    const time24 = `${String(hour).padStart(2, '0')}:${String(minute).padStart(2, '0')}`;

    const language = wt.language?.trim() || 'Unknown';
    const notes = wt.comment?.trim() || null;

    const dayKey = wt.day_of_week?.trim().toLowerCase();
    const days = dayKey ? DAY_MAP[dayKey] : undefined;
    if (!days) continue;

    for (const day of days) {
      schedules.push({ dayOfWeek: day, time: time24, language, notes, massType: null });
    }
  }

  return schedules;
}
/**
 * Writes the current import counters onto the background-job row `jobId`.
 *
 * Best-effort: a failed update is logged with `console.error` and otherwise
 * ignored, so the caller never sees an exception from this function.
 *
 * @param jobId - ID of the `backgroundJob` row to update.
 * @param stats - Running counters of the import.
 */
async function updateJobProgress(jobId: string, stats: ImportStats): Promise<void> {
  // Map the importer's counters onto the job row's progress columns.
  const progress = {
    processed: stats.gridPoints,
    succeeded: stats.churchesMatched + stats.churchesCreated,
    failed: stats.errors,
    itemsFound: stats.churchesDiscovered,
  };

  try {
    await prisma.backgroundJob.update({ where: { id: jobId }, data: progress });
  } catch (err) {
    console.error(`Failed to update job progress:`, err);
  }
}
2 pages/point avg)`); + console.log(`Time: ${new Date().toISOString()}`); + console.log('='.repeat(70)); + + const existingChurches = await loadExistingChurches(); + + // Build masstimesId lookup for fast dedup + const masstimesIdSet = new Set<string>(); + for (const c of existingChurches) { + if (c.masstimesId) masstimesIdSet.add(c.masstimesId); + } + + // Track discovered IDs to deduplicate across grid points + const discoveredIds = new Set<string>(); + + const stats: ImportStats = { + gridPoints: 0, + apiRequests: 0, + churchesDiscovered: 0, + churchesMatched: 0, + churchesCreated: 0, + churchesSkipped: 0, + massSchedulesCreated: 0, + errors: 0, + }; + + let jobId = args.jobId; + if (jobId) { + await prisma.backgroundJob.update({ + where: { id: jobId }, + data: { status: 'running', startedAt: new Date(), totalItems: gridPoints.length }, + }); + } + + const startTime = Date.now(); + + for (let i = 0; i < gridPoints.length; i++) { + const { lat, lng } = gridPoints[i]; + stats.gridPoints++; + + if (i < args.resumeFrom) continue; + + try { + const churches = await fetchAllForPoint(lat, lng, stats); + + if (churches.length > 0) { + let newInPoint = 0; + for (const mc of churches) { + if (discoveredIds.has(mc.id)) continue; + discoveredIds.add(mc.id); + stats.churchesDiscovered++; + + // Already in DB by masstimesId + if (masstimesIdSet.has(mc.id)) { + stats.churchesMatched++; + continue; + } + + const churchLat = parseFloat(mc.latitude); + const churchLng = parseFloat(mc.longitude); + if (isNaN(churchLat) || isNaN(churchLng) || (churchLat === 0 && churchLng === 0)) continue; + + const country = resolveCountryCode(mc.church_address_country_territory_name); + const address = [ + mc.church_address_street_address, + mc.church_address_city_name, + mc.church_address_providence_name, + mc.church_address_postal_code, + ].filter(s => s?.trim()).join(', ').trim() || null; + + // Proximity + name match + const candidate = { name: mc.name, lat: churchLat, lng: churchLng }; + 
const duplicate = findDuplicateChurch(candidate, existingChurches); + + if (duplicate) { + stats.churchesMatched++; + if (!args.dryRun) { + const updateData: Record<string, unknown> = { masstimesId: mc.id }; + if (!duplicate.phone && mc.phone_number?.trim()) updateData.phone = mc.phone_number.trim(); + if (!duplicate.website && mc.url?.trim()) { + updateData.website = mc.url.trim(); + updateData.hasWebsite = true; + } + if (!duplicate.address && address) updateData.address = address; + if (duplicate.country === 'XX' && country !== 'XX') updateData.country = country; + + try { + await prisma.church.update({ where: { id: duplicate.id }, data: updateData }); + masstimesIdSet.add(mc.id); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + } else throw error; + } + } + continue; + } + + // Create new church + if (!args.dryRun) { + const schedules = parseWorshipTimes(mc.church_worship_times || []); + try { + const newChurch = await prisma.church.create({ + data: { + name: mc.name, + latitude: churchLat, + longitude: churchLng, + address, + city: mc.church_address_city_name?.trim() || null, + state: mc.church_address_providence_name?.trim() || null, + zip: mc.church_address_postal_code?.trim() || null, + country, + phone: mc.phone_number?.trim() || null, + website: mc.url?.trim() || null, + email: mc.email?.trim() || null, + hasWebsite: !!mc.url?.trim(), + masstimesId: mc.id, + source: 'masstimes', + diocese: mc.diocese_name?.trim() || null, + pastorName: mc.pastors_name?.trim() || null, + wheelchairAccess: mc.wheel_chair_access || false, + massSchedules: schedules.length > 0 ? 
{ + create: schedules.map(s => ({ + dayOfWeek: s.dayOfWeek, + time: s.time, + language: s.language, + notes: s.notes, + massType: s.massType, + isActive: true, + })), + } : undefined, + }, + }); + + stats.churchesCreated++; + stats.massSchedulesCreated += schedules.length; + newInPoint++; + masstimesIdSet.add(mc.id); + + existingChurches.push({ + id: newChurch.id, name: mc.name, + latitude: churchLat, longitude: churchLng, + osmId: null, baiduId: null, masstimesId: mc.id, + orarimesseId: null, massSchedulesPhId: null, + philmassId: null, horariosMisasId: null, + mszeInfoId: null, weekdayMassesId: null, + messesInfoId: null, bohosluzbyId: null, miserendId: null, kerknetId: null, gottesdienstzeitenId: null, discovermassId: null, + source: 'masstimes', website: mc.url?.trim() || null, + phone: mc.phone_number?.trim() || null, address, country, + }); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + } else { + stats.errors++; + console.error(` Error creating ${mc.name}: ${error instanceof Error ? error.message : error}`); + } + } + } else { + stats.churchesCreated++; + stats.massSchedulesCreated += parseWorshipTimes(mc.church_worship_times || []).length; + newInPoint++; + } + } + + if (newInPoint > 0) { + console.log(` Grid ${i + 1}/${gridPoints.length} (${lat},${lng}): ${churches.length} found, ${newInPoint} new`); + } + } + + await sleep(RATE_LIMIT_MS); + } catch (error) { + stats.errors++; + console.error(` Error at grid ${i + 1} (${lat},${lng}): ${error instanceof Error ? error.message : error}`); + await sleep(RATE_LIMIT_MS * 2); + } + + // Progress every 50 points + if ((i + 1) % 50 === 0 || i === gridPoints.length - 1) { + const elapsed = (Date.now() - startTime) / 1000; + const rate = elapsed > 0 ? 
Math.round(stats.apiRequests / elapsed * 3600) : 0; + console.log(` Progress: ${i + 1}/${gridPoints.length} grid points, ${stats.churchesDiscovered} discovered, ${stats.churchesCreated} new, ${stats.apiRequests} API calls [${Math.round(elapsed)}s, ~${rate}/hr]`); + } + + if (jobId && (i + 1) % 20 === 0) { + await updateJobProgress(jobId, stats); + } + } + + if (jobId) { + await updateJobProgress(jobId, stats); + await prisma.backgroundJob.update({ + where: { id: jobId }, + data: { status: 'completed', completedAt: new Date() }, + }); + } + + const elapsed = (Date.now() - startTime) / 1000; + console.log(`\n${'='.repeat(70)}`); + console.log('MASSTIMES API IMPORT SUMMARY'); + console.log('='.repeat(70)); + console.log(`Grid points queried: ${stats.gridPoints}`); + console.log(`API requests: ${stats.apiRequests}`); + console.log(`Churches discovered: ${stats.churchesDiscovered}`); + console.log(`Churches matched: ${stats.churchesMatched} (already in DB)`); + console.log(`Churches created: ${stats.churchesCreated}`); + console.log(`Churches skipped: ${stats.churchesSkipped} (duplicates)`); + console.log(`Mass schedules created: ${stats.massSchedulesCreated}`); + console.log(`Errors: ${stats.errors}`); + console.log(`Elapsed: ${Math.round(elapsed)}s (${(elapsed / 3600).toFixed(1)}h)`); + console.log('='.repeat(70)); + + await prisma.$disconnect(); + await pool.end(); +} + +main().catch((error) => { + console.error(`Fatal error: ${error.message}`); + process.exit(1); +}); diff --git a/scripts/import-messesinfo.ts b/scripts/import-messesinfo.ts new file mode 100644 index 0000000..974ddac --- /dev/null +++ b/scripts/import-messesinfo.ts @@ -0,0 +1,681 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from messes.info (France) + * + * messes.info is the official French bishops' conference (CEF) mass schedule + * database. 
It exposes a GWT-RPC API returning structured JSON with parish + * data including name, address, coordinates, diocese, and celebration times. + * + * The API requires no authentication. We enumerate all French dioceses using + * the "community:{diocese_code}" query prefix, which returns all parishes + * within each diocese. + * + * Import strategy: + * 1. Query each of ~93 diocese codes via GWT-RPC API + * 2. Parse response: extract localities (churches) + celebrations (mass times) + * 3. Deduce recurring weekly schedule from date-specific celebration entries + * 4. Match against existing French churches via church-matcher + * 5. Upsert churches and mass schedules + * + * Usage: + * npx tsx scripts/import-messesinfo.ts --all --dry-run + * npx tsx scripts/import-messesinfo.ts --all + * npx tsx scripts/import-messesinfo.ts --diocese pa --dry-run # Paris only + * npx tsx scripts/import-messesinfo.ts --all --resume-from 20 + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const API_URL = 'https://messes.info/gwtRequest'; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const REQUEST_DELAY_MS = 3000; +const RETRY_DELAY_MS = 10000; +const MAX_RETRIES = 3; +const RESULTS_PER_QUERY = 2000; + +// Diocese codes discovered from the API. Each code maps to a diocese in France. +// The query "community:{code}" returns all parishes within that diocese. +// Codes are 2-letter abbreviations (e.g., pa=Paris, ly=Lyon, st=Strasbourg). +const DIOCESE_CODES = [ + 'a', 'aa', 'ac', 'ad', 'ag', 'al', 'am', 'an', 'ar', 'au', 'av', 'ay', + 'ba', 'bb', 'be', 'bl', 'bm', 'bo', 'br', 'bs', 'bv', 'by', + 'ca', 'cb', 'cc', 'cd', 'ch', 'cl', 'cm', 'cn', 'cr', 'cs', + 'da', 'di', 'dj', 'dn', + 'et', 'ex', 'ey', + 'ft', + 'ga', 'gr', + 'lg', 'lh', 'li', 'lm', 'lp', 'lr', 'ls', 'lu', 'lv', 'ly', + 'ma', 'md', 'me', 'ml', 'mp', 'mt', 'mx', + 'na', 'nc', 'ni', 'nt', 'nv', 'ny', + 'or', + 'pa', 'pm', 'po', 'ps', 'pt', + 'qu', + 're', 'rn', 'ro', 'rv', + 'sl', 'ss', 'st', 'sz', + 'tl', 'to', 'ts', 'tu', + 'va', 'vd', 've', 'vl', 'vv', +]; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface LocalityData { + idfixe: string; + name: string; + address: string | null; + city: string | null; + zipcode: string | null; + latitude: number; + longitude: number; + sector: string | null; + communityId: string | null; + localityId: string; // e.g. 
/** One dated celebration entry taken from the API response. */
interface CelebrationData {
  date: string;
  time: string; // normalized to "HH:MM"
  recurrenceCategory: number;
}

/** A weekly slot deduced from dated celebrations. */
interface ParsedSchedule {
  dayOfWeek: number;
  time: string;
}

/** Counters accumulated over one import run. */
interface ImportStats {
  diocesesProcessed: number;
  localitiesFound: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  schedulesCreated: number;
  errors: number;
}

/** Parsed command-line options. */
interface CLIArgs {
  all: boolean;
  dryRun: boolean;
  resumeFrom?: number;
  diocese?: string;
  jobId?: string;
}

// ─── HTTP Client ─────────────────────────────────────────────────────────────

// Number of API requests issued so far; also reported in the final summary.
let requestCount = 0;

/** Resolves after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Convert a MessesInfo time such as "18h00" or "9h30" to zero-padded "HH:MM".
 *
 * Also accepts surrounding whitespace, an upper-case "H", and omitted minutes
 * ("18h" becomes "18:00"). Input that matches neither the "h" form nor
 * "H:MM"/"HH:MM" is returned unchanged.
 */
function normalizeTime(messesTime: string): string {
  const trimmed = messesTime.trim();

  const hourForm = /^(\d{1,2})h(\d{2})?$/i.exec(trimmed);
  if (hourForm) {
    return `${hourForm[1].padStart(2, '0')}:${hourForm[2] ?? '00'}`;
  }

  // Already in H:MM / HH:MM format; only pad the hour.
  const colonForm = /^(\d{1,2}):(\d{2})$/.exec(trimmed);
  if (colonForm) {
    return `${colonForm[1].padStart(2, '0')}:${colonForm[2]}`;
  }

  return messesTime;
}

/**
 * POSTs the "community:{dioceseCode}" query to the API and returns the decoded
 * JSON body, or `null` when no usable response was obtained.
 *
 * - Waits REQUEST_DELAY_MS before every request except the first.
 * - HTTP 503/429 and thrown errors (network or JSON decoding) are retried up
 *   to MAX_RETRIES attempts, RETRY_DELAY_MS apart.
 * - Any other non-OK status returns `null` without retrying.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped API payload; callers index into it directly
async function fetchDioceseData(dioceseCode: string): Promise<any | null> {
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  const body = JSON.stringify({
    F: 'cef.kephas.shared.request.AppRequestFactory',
    I: [{
      O: 'Bzv0wi60qgwcW5aKiRKrtgNaLKo=',
      P: [`community:${dioceseCode}`, 0, RESULTS_PER_QUERY, 1, null, '48.86:2.35', ''],
      R: ['listCelebrationTime.locality'],
    }],
  });

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    try {
      const response = await fetch(API_URL, {
        method: 'POST',
        headers: {
          'User-Agent': USER_AGENT,
          'Content-Type': 'application/json',
          'Accept': 'application/json',
        },
        body,
      });

      if (response.status === 503 || response.status === 429) {
        if (attempt < MAX_RETRIES) {
          console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
          await delay(RETRY_DELAY_MS);
          continue;
        }
        console.error(` HTTP ${response.status} after ${MAX_RETRIES} attempts`);
        return null;
      }

      if (!response.ok) {
        console.error(` HTTP ${response.status} from API`);
        return null;
      }

      return await response.json();
    } catch (error) {
      if (attempt < MAX_RETRIES) {
        console.log(` Network error — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
        await delay(RETRY_DELAY_MS);
        continue;
      }
      console.error(` API error after ${MAX_RETRIES} attempts: ${error instanceof Error ? error.message : error}`);
      return null;
    }
  }
  return null;
}

// ─── Response Parser ─────────────────────────────────────────────────────────
/**
 * Parse the GWT-RPC response into a map of locality idfixe → data.
 *
 * The response `O` array contains interleaved objects:
 * - Locality objects: have P.idfixe, P.name, P.address, P.latitude, etc.
 * - Celebration objects: have P.date, P.time, P.localityId, P.recurrenceCategory
 * - Other objects (e.g. metadata), which are ignored
 *
 * A celebration is attached to the locality whose P.id equals its
 * P.localityId. Localities are collected in a first pass so the order of
 * entries in `O` does not matter. Returns an empty map when `data.O` is not
 * an array.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped API payload
function parseApiResponse(data: any): Map<string, { locality: LocalityData; celebrations: CelebrationData[] }> {
  const result = new Map<string, { locality: LocalityData; celebrations: CelebrationData[] }>();

  if (!data?.O || !Array.isArray(data.O)) return result;

  // First pass: collect all localities by their id
  const localitiesById = new Map<string, LocalityData>();
  for (const obj of data.O) {
    const p = obj?.P; // tolerate null/undefined entries in O
    if (!p || typeof p !== 'object') continue;

    if (p.idfixe && p.name) {
      const locality: LocalityData = {
        idfixe: p.idfixe,
        name: p.name,
        address: p.address || null,
        city: p.city || null,
        zipcode: p.zipcode || null,
        latitude: p.latitude || 0,
        longitude: p.longitude || 0,
        sector: p.sector || null,
        communityId: p.communityId || null,
        localityId: p.id || '',
      };
      localitiesById.set(p.id, locality);

      // Initialize in result map (dedup by idfixe)
      if (!result.has(p.idfixe)) {
        result.set(p.idfixe, { locality, celebrations: [] });
      }
    }
  }

  // Second pass: collect celebrations and link to localities
  for (const obj of data.O) {
    const p = obj?.P;
    if (!p || typeof p !== 'object') continue;

    if (p.date && p.time && p.localityId) {
      const locality = localitiesById.get(p.localityId);
      const entry = locality ? result.get(locality.idfixe) : undefined;
      if (entry) {
        entry.celebrations.push({
          date: p.date,
          time: normalizeTime(p.time),
          recurrenceCategory: p.recurrenceCategory ?? 0,
        });
      }
    }
  }

  return result;
}

// ─── Schedule Deduction ──────────────────────────────────────────────────────

/**
 * Collapses dated celebrations into unique (dayOfWeek, time) slots, keeping
 * first-seen order. The weekday is taken in UTC from noon of `date`.
 * Entries whose date cannot be parsed are skipped, since they would otherwise
 * yield `dayOfWeek: NaN`.
 */
function collectWeeklySlots(celebrations: CelebrationData[]): ParsedSchedule[] {
  const seen = new Set<string>();
  const slots: ParsedSchedule[] = [];

  for (const celeb of celebrations) {
    const dayOfWeek = new Date(celeb.date + 'T12:00:00Z').getUTCDay();
    if (Number.isNaN(dayOfWeek)) continue;

    const key = `${dayOfWeek}:${celeb.time}`;
    if (seen.has(key)) continue;
    seen.add(key);
    slots.push({ dayOfWeek, time: celeb.time });
  }

  return slots;
}

/**
 * Deduces a weekly schedule from dated celebrations.
 *
 * Entries with recurrenceCategory === 1 are used on their own when they yield
 * at least one slot; otherwise every entry is used as a fallback.
 */
function deduceSchedules(celebrations: CelebrationData[]): ParsedSchedule[] {
  const weekly = collectWeeklySlots(celebrations.filter((c) => c.recurrenceCategory === 1));
  return weekly.length > 0 ? weekly : collectWeeklySlots(celebrations);
}

// ─── Database Operations ─────────────────────────────────────────────────────

/**
 * Loads every church with country 'FR', selecting the fields used for
 * duplicate matching and for deciding which fields to fill on a match.
 */
async function loadExistingFrenchChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing French churches for deduplication...');
  const churches = await prisma.church.findMany({
    where: { country: 'FR' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });
  console.log(`Loaded ${churches.length} existing French churches`);
  return churches;
}
/**
 * Imports one diocese: fetches its localities, then for each locality either
 * links it to an existing church (replacing that church's mass schedules when
 * any were deduced) or creates a new church and its schedules.
 *
 * In dry-run mode nothing is written; only `stats` is updated. Newly created
 * churches are appended to `existingChurches` so later localities in the same
 * run can match against them.
 */
async function processDiocese(
  dioceseCode: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  const data = await fetchDioceseData(dioceseCode);
  if (!data) {
    stats.errors++;
    return;
  }

  // Check for API error
  if (data.S && data.S[0] === false) {
    console.log(` API error for diocese ${dioceseCode}`);
    stats.errors++;
    return;
  }

  const localities = parseApiResponse(data);
  console.log(` Found ${localities.size} unique localities`);
  stats.localitiesFound += localities.size;
  stats.diocesesProcessed++;

  for (const [idfixe, { locality, celebrations }] of localities) {
    // (0,0) is the parser's placeholder for missing coordinates.
    if (locality.latitude === 0 && locality.longitude === 0) {
      stats.churchesSkipped++;
      continue;
    }

    const schedules = deduceSchedules(celebrations);

    const candidate = {
      name: locality.name,
      lat: locality.latitude,
      lng: locality.longitude,
      messesInfoId: idfixe,
    };

    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (dryRun) {
      if (duplicate) {
        stats.churchesMatched++;
      } else {
        stats.churchesCreated++;
      }
      stats.schedulesCreated += schedules.length;
      continue;
    }

    if (duplicate) {
      stats.churchesMatched++;
      const updateData: Record<string, unknown> = { messesInfoId: idfixe };

      if (!duplicate.address && locality.address) updateData.address = locality.address;
      if (duplicate.latitude === 0 && duplicate.longitude === 0 && locality.latitude !== 0) {
        updateData.latitude = locality.latitude;
        updateData.longitude = locality.longitude;
      }

      try {
        await prisma.church.update({
          where: { id: duplicate.id },
          data: updateData,
        });
      } catch (error) {
        if (error instanceof Error && error.message.includes('Unique constraint')) {
          stats.churchesSkipped++;
          continue;
        }
        throw error;
      }

      if (schedules.length > 0) {
        try {
          // Replace the church's schedules atomically with the deduced set.
          await prisma.$transaction(async (tx) => {
            await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.massSchedule.createMany({
              data: schedules.map((s) => ({
                churchId: duplicate.id,
                dayOfWeek: s.dayOfWeek,
                time: s.time,
                language: 'French',
              })),
            });
            await tx.church.update({
              where: { id: duplicate.id },
              data: { lastScrapedAt: new Date() },
            });
          });
          stats.schedulesCreated += schedules.length;
        } catch (error) {
          stats.errors++;
          console.error(` Error saving schedules for ${idfixe}: ${error instanceof Error ? error.message : error}`);
        }
      }
    } else {
      // Every locality is stored as 'FR'. The original code special-cased
      // zipcodes 971–976 (DOM-TOM) but assigned 'FR' in both branches.
      const country = 'FR';

      try {
        const newChurch = await prisma.church.create({
          data: {
            name: locality.name,
            latitude: locality.latitude,
            longitude: locality.longitude,
            address: locality.address,
            zip: locality.zipcode,
            city: locality.city,
            country,
            diocese: locality.sector || undefined,
            messesInfoId: idfixe,
            source: 'messes-info',
            websiteLanguage: 'fr',
          },
        });
        stats.churchesCreated++;

        existingChurches.push({
          id: newChurch.id,
          name: locality.name,
          latitude: locality.latitude,
          longitude: locality.longitude,
          osmId: null,
          baiduId: null,
          masstimesId: null,
          orarimesseId: null,
          massSchedulesPhId: null,
          philmassId: null,
          horariosMisasId: null,
          mszeInfoId: null,
          weekdayMassesId: null,
          messesInfoId: idfixe,
          bohosluzbyId: null,
          miserendId: null,
          kerknetId: null,
          gottesdienstzeitenId: null,
          discovermassId: null,
          source: 'messes-info',
          website: null,
          phone: null,
          address: locality.address,
        });

        if (schedules.length > 0) {
          await prisma.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: newChurch.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'French',
            })),
          });
          await prisma.church.update({
            where: { id: newChurch.id },
            data: { lastScrapedAt: new Date() },
          });
          stats.schedulesCreated += schedules.length;
        }
      } catch (error) {
        if (error instanceof Error && error.message.includes('Unique constraint')) {
          stats.churchesSkipped++;
          continue;
        }
        stats.errors++;
        console.error(` Error creating ${idfixe}: ${error instanceof Error ? error.message : error}`);
      }
    }
  }
}

// ─── CLI ─────────────────────────────────────────────────────────────────────

/**
 * Parses `process.argv` into CLIArgs.
 *
 * Exits with status 0 after printing help, and with status 1 when neither
 * `--all` nor `--diocese` is given or when `--resume-from` is not a
 * non-negative integer (previously such a value became NaN and was silently
 * ignored).
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const value = Number(args[++i]);
        if (!Number.isInteger(value) || value < 0) {
          console.error('Error: --resume-from requires a non-negative integer');
          process.exit(1);
        }
        result.resumeFrom = value;
        break;
      }
      case '--diocese':
        result.diocese = args[++i];
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-messesinfo.ts [options]

Options:
  --all               Import all dioceses
  --diocese <code>    Import a single diocese (e.g., pa for Paris)
  --dry-run           No database writes, just report what would happen
  --resume-from <n>   Skip first N dioceses
  --job-id <uuid>     Background job tracking ID
  --help, -h          Show this help message

Examples:
  npx tsx scripts/import-messesinfo.ts --diocese pa --dry-run
  npx tsx scripts/import-messesinfo.ts --all --dry-run
  npx tsx scripts/import-messesinfo.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.diocese) {
    console.error('Error: specify --all or --diocese <code>');
    process.exit(1);
  }

  return result;
}

/** Formats a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs". */
function formatDuration(ms: number): string {
  const seconds = Math.floor(ms / 1000);
  const minutes = Math.floor(seconds / 60);
  const hours = Math.floor(minutes / 60);
  if (hours > 0) return `${hours}h ${minutes % 60}m ${seconds % 60}s`;
  if (minutes > 0) return `${minutes}m ${seconds % 60}s`;
  return `${seconds}s`;
}
/**
 * Script entry point for the messes.info import: prints the run header, marks
 * the optional background job as running, processes each selected diocese,
 * prints the summary and records the final job status.
 *
 * A failure inside one diocese is counted and logged; the run continues with
 * the next diocese.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('MESSES.INFO (FRANCE) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.diocese ? `Diocese ${args.diocese}` : 'All dioceses'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: diocese index ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch { /* Job might not exist */ }
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    localitiesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  const existingChurches = await loadExistingFrenchChurches();

  let dioceses = args.diocese ? [args.diocese] : [...DIOCESE_CODES];

  if (args.diocese && !DIOCESE_CODES.includes(args.diocese)) {
    console.log(`Warning: diocese "${args.diocese}" not in known list, trying anyway...`);
  }

  // --resume-from only applies to the full list, not to a single --diocese.
  if (args.resumeFrom && !args.diocese) {
    dioceses = dioceses.slice(args.resumeFrom);
    // Resuming past the end leaves nothing to process; avoid logging "(undefined)".
    console.log(`Resuming from diocese index ${args.resumeFrom} (${dioceses[0] ?? 'none left'})\n`);
  }

  console.log(`Processing ${dioceses.length} dioceses\n`);

  for (let i = 0; i < dioceses.length; i++) {
    const code = dioceses[i];
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${i + 1}/${dioceses.length}] Diocese "${code}" [${elapsed} elapsed]`);

    try {
      await processDiocese(code, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing diocese ${code}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`Localities found: ${stats.localitiesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          processed: stats.localitiesFound,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch { /* Ignore */ }
  }
}

main()
  .catch((error) => {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit(1): exiting here would
    // terminate the process before the cleanup in .finally() below can run.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });
+ * It publishes a daily-updated SQLite database at: + * https://miserend.hu/fajlok/sqlite/miserend_v4.sqlite3 + * + * The SQLite contains: + * - templomok: churches (tid, nev, lat, lng, varos, cim, orszag, megye) + * - misek: date-specific mass entries (tid, ido, datumtol, datumig, nyelv) + * - kepek: church photos + * + * Import strategy: + * 1. Download the SQLite database + * 2. Extract all churches with coordinates + * 3. Deduce weekly recurring schedules from date-specific entries + * 4. Match against existing churches via church-matcher + * 5. Upsert churches and mass schedules + * + * Usage: + * npx tsx scripts/import-miserend.ts --all --dry-run + * npx tsx scripts/import-miserend.ts --all + * npx tsx scripts/import-miserend.ts --id 37 --dry-run # Single church + * npx tsx scripts/import-miserend.ts --all --resume-from 500 + */ + +import dotenv from 'dotenv'; +import path from 'path'; +import fs from 'fs'; +import { execFileSync } from 'child_process'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
// ─── Constants ───────────────────────────────────────────────────────────────

const SQLITE_URL = 'https://miserend.hu/fajlok/sqlite/miserend_v4.sqlite3';
const SQLITE_PATH = '/tmp/miserend_v4.sqlite3';

// Maps the country names found in the `orszag` column to ISO codes
const COUNTRY_MAP: Record<string, string> = {
  'Magyarország': 'HU',
  'România': 'RO',
  'Slovensko': 'SK',
  'Szlovákia': 'SK',
  'Szerbia-Montenegro': 'RS',
  'Србија': 'RS',
  'Ukrajna': 'UA',
  'Україна': 'UA',
  'Österreich': 'AT',
  'Schweiz/Suisse/Svizzera/Svizra': 'CH',
  'België / Belgique / Belgien': 'BE',
  'Éire / Ireland': 'IE',
  'Россия': 'RU',
};

// ─── Types ───────────────────────────────────────────────────────────────────

/** One row of the `templomok` table, as selected by loadChurches. */
interface MiserendChurch {
  tid: number;
  nev: string;
  ismertnev: string | null;
  orszag: string | null;
  megye: string | null;
  varos: string | null;
  cim: string | null;
  lat: number;
  lng: number;
}

/** One row of the `misek` table, as selected by loadMassesForChurch. */
interface MiserendMass {
  mid: number;
  tid: number;
  datumtol: number; // MMDD format
  datumig: number;
  ido: string; // HH:MM:SS
  nyelv: string | null;
}

/** A weekly slot deduced from dated mass entries. */
interface ParsedSchedule {
  dayOfWeek: number;
  time: string;
}

/** Counters accumulated over one import run. */
interface ImportStats {
  churchesFetched: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  schedulesCreated: number;
  errors: number;
}

/** Parsed command-line options. */
interface CLIArgs {
  all: boolean;
  dryRun: boolean;
  resumeFrom?: number;
  churchId?: string;
  jobId?: string;
}

// ─── SQLite Helpers ──────────────────────────────────────────────────────────

/**
 * Runs `query` against the downloaded database with the `sqlite3` CLI and
 * returns its trimmed stdout (columns are split on '|' by the callers).
 *
 * Best-effort: any failure (missing binary, bad SQL, unreadable file) is
 * logged and reported as an empty string, which callers treat as "no rows".
 */
function sqliteQuery(query: string): string {
  try {
    return execFileSync('sqlite3', [SQLITE_PATH, query], {
      encoding: 'utf-8',
      maxBuffer: 100 * 1024 * 1024, // 100MB
    }).trim();
  } catch (error) {
    // Previously swallowed silently, which made a broken setup look like an empty database.
    console.error(`sqlite3 query failed: ${error instanceof Error ? error.message : error}`);
    return '';
  }
}

/**
 * Downloads the SQLite file to SQLITE_PATH using curl (120s timeout).
 * Throws when curl fails; `-f` makes HTTP error statuses count as failures so
 * an error page is never saved in place of the database.
 */
function downloadSqlite(): void {
  console.log('Downloading miserend SQLite database...');
  execFileSync('curl', ['-fsSL', '-o', SQLITE_PATH, SQLITE_URL], { timeout: 120000 });
  const size = fs.statSync(SQLITE_PATH).size;
  console.log(`Downloaded ${(size / 1024 / 1024).toFixed(1)}MB`);
}

/**
 * Loads every church that has non-null, non-zero coordinates.
 * Rows whose id or coordinates do not parse as numbers are dropped.
 */
function loadChurches(): MiserendChurch[] {
  const raw = sqliteQuery(
    "SELECT tid, nev, ismertnev, orszag, megye, varos, cim, lat, lng FROM templomok WHERE lat IS NOT NULL AND lng IS NOT NULL AND lat != 0 AND lng != 0;"
  );
  if (!raw) return [];

  // NOTE(review): rows are split on '\n' and '|', so a text field containing
  // either character would shift the columns — confirm against real data.
  return raw.split('\n').map(line => {
    const [tid, nev, ismertnev, orszag, megye, varos, cim, lat, lng] = line.split('|');
    return {
      tid: parseInt(tid),
      nev: nev || '',
      ismertnev: ismertnev || null,
      orszag: orszag || null,
      megye: megye || null,
      varos: varos || null,
      cim: cim || null,
      lat: parseFloat(lat),
      lng: parseFloat(lng),
    };
  }).filter(c => !isNaN(c.tid) && !isNaN(c.lat) && !isNaN(c.lng));
}

/**
 * Loads the mass entries of church `tid`.
 * Returns [] without querying when `tid` is not an integer, because the value
 * is interpolated directly into the SQL text. Rows without a numeric `mid` or
 * without a time are dropped.
 */
function loadMassesForChurch(tid: number): MiserendMass[] {
  if (!Number.isInteger(tid)) return [];

  const raw = sqliteQuery(
    `SELECT mid, tid, datumtol, datumig, ido, nyelv FROM misek WHERE tid=${tid};`
  );
  if (!raw) return [];

  return raw.split('\n').map(line => {
    const [mid, tidStr, datumtol, datumig, ido, nyelv] = line.split('|');
    return {
      mid: parseInt(mid),
      tid: parseInt(tidStr),
      datumtol: parseInt(datumtol),
      datumig: parseInt(datumig),
      ido: ido || '',
      nyelv: nyelv || null,
    };
  }).filter(m => !isNaN(m.mid) && m.ido);
}

// ─── Schedule Deduction ──────────────────────────────────────────────────────
/**
 * Deduce a weekly schedule from date-specific mass entries.
 *
 * Each entry's `datumtol` is read as MMDD (e.g. 104 = Jan 4), placed in the
 * current year, and converted to a day of week (0=Sun … 6=Sat). Unique
 * day+time combinations are returned in first-seen order.
 *
 * Skipped entries: time missing or "00:00"; `datumtol` that is NaN or out of
 * range; dates that do not exist in the current year (e.g. 0231, or 0229 in a
 * non-leap year), which `Date` would otherwise roll into the next month.
 *
 * NOTE(review): the weekday depends on the year the script runs in — confirm
 * that `datumtol` is meant to identify a weekday this way.
 */
function deduceSchedules(masses: MiserendMass[]): ParsedSchedule[] {
  const seen = new Set<string>();
  const schedules: ParsedSchedule[] = [];

  // Use current year for date conversion
  const year = new Date().getFullYear();

  for (const mass of masses) {
    const time = mass.ido.substring(0, 5); // HH:MM from HH:MM:SS
    if (!time || time === '00:00') continue;

    // Convert MMDD to a Date to get day of week
    const mmdd = mass.datumtol;
    if (!Number.isInteger(mmdd)) continue; // NaN would pass the range checks below
    const month = Math.floor(mmdd / 100);
    const day = mmdd % 100;
    if (month < 1 || month > 12 || day < 1 || day > 31) continue;

    const date = new Date(year, month - 1, day);
    // A rolled-over date lands in a different month than requested.
    if (date.getMonth() !== month - 1) continue;
    const dayOfWeek = date.getDay(); // 0=Sun, 1=Mon, ..., 6=Sat

    const key = `${dayOfWeek}:${time}`;
    if (!seen.has(key)) {
      seen.add(key);
      schedules.push({ dayOfWeek, time });
    }
  }

  return schedules;
}

// ─── Database Operations ─────────────────────────────────────────────────────

/**
 * Loads every church whose country is in `countryCodes`, selecting the fields
 * used for duplicate matching and for deciding which fields to fill on a match.
 */
async function loadExistingChurches(countryCodes: string[]): Promise<ExistingChurch[]> {
  console.log(`Loading existing churches for countries: ${countryCodes.join(', ')}...`);
  const churches = await prisma.church.findMany({
    where: { country: { in: countryCodes } },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });
  console.log(`Loaded ${churches.length} existing churches`);
  return churches;
}

// ─── Import Logic ────────────────────────────────────────────────────────────
Promise<void> { + const miserendId = String(church.tid); + const country = church.orszag ? (COUNTRY_MAP[church.orszag] || 'HU') : 'HU'; + + const candidate = { + name: church.nev, + lat: church.lat, + lng: church.lng, + miserendId, + }; + + const duplicate = findDuplicateChurch(candidate, existingChurches); + + // Deduce schedules + let schedules: ParsedSchedule[] = []; + if (!dryRun) { + const masses = loadMassesForChurch(church.tid); + schedules = deduceSchedules(masses); + } + + if (dryRun) { + if (duplicate) { + stats.churchesMatched++; + } else { + stats.churchesCreated++; + } + return; + } + + if (duplicate) { + stats.churchesMatched++; + const updateData: Record<string, unknown> = { miserendId }; + + if (!duplicate.address && church.cim) updateData.address = church.cim; + + try { + await prisma.church.update({ + where: { id: duplicate.id }, + data: updateData, + }); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + throw error; + } + + if (schedules.length > 0) { + try { + await prisma.$transaction(async (tx) => { + await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } }); + await tx.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: duplicate.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'Hungarian', + })), + }); + await tx.church.update({ + where: { id: duplicate.id }, + data: { lastScrapedAt: new Date() }, + }); + }); + stats.schedulesCreated += schedules.length; + } catch (error) { + stats.errors++; + console.error(` Error saving schedules for ${miserendId}: ${error instanceof Error ? 
error.message : error}`); + } + } + } else { + try { + const newChurch = await prisma.church.create({ + data: { + name: church.nev, + latitude: church.lat, + longitude: church.lng, + address: church.cim, + city: church.varos, + state: church.megye, + country, + miserendId, + source: 'miserend', + websiteLanguage: 'hu', + }, + }); + stats.churchesCreated++; + + existingChurches.push({ + id: newChurch.id, + name: church.nev, + latitude: church.lat, + longitude: church.lng, + osmId: null, + baiduId: null, + masstimesId: null, + orarimesseId: null, + massSchedulesPhId: null, + philmassId: null, + horariosMisasId: null, + mszeInfoId: null, + weekdayMassesId: null, + messesInfoId: null, + bohosluzbyId: null, + miserendId, + kerknetId: null, + gottesdienstzeitenId: null, + discovermassId: null, + source: 'miserend', + website: null, + phone: null, + address: church.cim, + }); + + if (schedules.length > 0) { + await prisma.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: newChurch.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'Hungarian', + })), + }); + await prisma.church.update({ + where: { id: newChurch.id }, + data: { lastScrapedAt: new Date() }, + }); + stats.schedulesCreated += schedules.length; + } + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + stats.errors++; + console.error(` Error creating ${miserendId}: ${error instanceof Error ? 
/**
 * Parse the miserend importer's command-line flags from `process.argv`.
 *
 * Recognised flags: `--all`, `--id <tid>`, `--dry-run`, `--resume-from <n>`,
 * `--job-id <uuid>` and `--help` / `-h`.
 *
 * The process exits with status 1 when:
 *  - a value-taking flag has no value (or is followed by another `--flag`);
 *  - `--resume-from` is not a non-negative integer (previously this became
 *    `NaN`, which is falsy, so the run silently restarted from index 0);
 *  - neither `--all` nor `--id` is given.
 *
 * Unrecognised arguments are reported on stderr but do not abort the run, so
 * a mistyped `--dry-run` is at least visible in the log.
 *
 * @returns The parsed options; optional fields are only present when supplied.
 */
function parseArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Print a usage error and terminate; `process.exit` never returns.
  const fail = (message: string): never => {
    console.error(`Error: ${message}`);
    return process.exit(1);
  };

  let index = 0;

  // Consume and return the token following the current value-taking flag.
  const takeValue = (flag: string): string => {
    const value = argv[index + 1];
    if (value === undefined || value.startsWith('--')) {
      return fail(`${flag} requires a value`);
    }
    index += 1;
    return value;
  };

  while (index < argv.length) {
    const flag = argv[index];

    switch (flag) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = takeValue(flag);
        // Digits only: rejects "abc", "-5", "12abc" and "1.5".
        if (!/^\d+$/.test(raw)) {
          fail(`--resume-from expects a non-negative integer, got "${raw}"`);
        }
        result.resumeFrom = Number.parseInt(raw, 10);
        break;
      }
      case '--id':
        result.churchId = takeValue(flag);
        break;
      case '--job-id':
        result.jobId = takeValue(flag);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-miserend.ts [options]

Options:
  --all                Import all churches
  --id <tid>           Import a single church by miserend ID
  --dry-run            No database writes, just report what would happen
  --resume-from <n>    Skip first N churches
  --job-id <uuid>      Background job tracking ID
  --help, -h           Show this help message

Examples:
  npx tsx scripts/import-miserend.ts --id 37 --dry-run
  npx tsx scripts/import-miserend.ts --all --dry-run
  npx tsx scripts/import-miserend.ts --all
`);
        process.exit(0);
        break;
      default:
        console.warn(`Warning: ignoring unrecognized argument "${flag}"`);
    }

    index += 1;
  }

  if (!result.all && !result.churchId) {
    fail('specify --all or --id <miserend_tid>');
  }

  return result;
}
`Church ID ${args.churchId}` : 'All churches'}`); + console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`); + if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`); + console.log(`Time: ${new Date().toISOString()}`); + console.log('='.repeat(70) + '\n'); + + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { status: 'running', startedAt: new Date() }, + }); + } catch { /* Job might not exist */ } + } + + const stats: ImportStats = { + churchesFetched: 0, + churchesMatched: 0, + churchesCreated: 0, + churchesSkipped: 0, + schedulesCreated: 0, + errors: 0, + }; + + // Download SQLite database + downloadSqlite(); + + // Load churches from SQLite + let churches = loadChurches(); + stats.churchesFetched = churches.length; + console.log(`Found ${churches.length} churches with coordinates in SQLite\n`); + + if (args.churchId) { + churches = churches.filter(c => String(c.tid) === args.churchId); + if (churches.length === 0) { + console.error(`Church ID ${args.churchId} not found in SQLite database`); + return; + } + } + + // Get unique country codes from the data + const countryCodes = [...new Set(churches.map(c => { + return c.orszag ? 
(COUNTRY_MAP[c.orszag] || 'HU') : 'HU'; + }))]; + const existingChurches = await loadExistingChurches(countryCodes); + + if (args.resumeFrom) { + churches = churches.slice(args.resumeFrom); + console.log(`Resuming from index ${args.resumeFrom} (${churches.length} remaining)\n`); + } + + console.log(`Processing ${churches.length} churches\n`); + + for (let i = 0; i < churches.length; i++) { + const church = churches[i]; + if (i % 200 === 0) { + const elapsed = formatDuration(Date.now() - startTime); + console.log(`[${i + 1}/${churches.length}] Processing ${church.nev} (${church.tid}) [${elapsed} elapsed]`); + } + + try { + await processChurch(church, existingChurches, args.dryRun, stats); + } catch (error) { + stats.errors++; + console.error(` ERROR processing church ${church.tid}: ${error instanceof Error ? error.message : error}`); + } + } + + const totalTime = Date.now() - startTime; + console.log('\n' + '='.repeat(70)); + console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`); + console.log('='.repeat(70)); + console.log(`Churches in SQLite: ${stats.churchesFetched}`); + console.log(` Matched (existing): ${stats.churchesMatched}`); + console.log(` Created (new): ${stats.churchesCreated}`); + console.log(` Skipped: ${stats.churchesSkipped}`); + console.log(`Schedules created: ${stats.schedulesCreated}`); + console.log(`Errors: ${stats.errors}`); + console.log(`Total time: ${formatDuration(totalTime)}`); + console.log('='.repeat(70) + '\n'); + + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { + status: stats.errors > 0 ? 
'completed_with_errors' : 'completed', + completedAt: new Date(), + processed: stats.churchesFetched, + succeeded: stats.churchesCreated + stats.churchesMatched, + failed: stats.errors, + itemsFound: stats.schedulesCreated, + }, + }); + } catch { /* Ignore */ } + } +} + +main() + .catch((error) => { + console.error('Fatal error:', error); + process.exit(1); + }) + .finally(async () => { + await prisma.$disconnect(); + await pool.end(); + }); diff --git a/scripts/import-msze-info.ts b/scripts/import-msze-info.ts new file mode 100644 index 0000000..f224853 --- /dev/null +++ b/scripts/import-msze-info.ts @@ -0,0 +1,746 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from msze.info (Poland) + * + * msze.info is a Polish directory of Catholic parishes with mass schedules. + * The site uses numbered sitemaps (Churches1.xml through Churches11.xml) + * with ~500 URLs each, containing both /kosciol/{id} (church pages) and + * /msze-online/{slug} (livestream pages). + * + * Import strategy: + * 1. Fetch all 11 sitemaps → extract /kosciol/{id} URLs (skip /msze-online/) + * 2. For each church: fetch HTML, parse name/address/phone/website/schedule + * 3. Extract coordinates from embedded tomtom_codeAddress() JS call + * 4. 
Match against existing PL churches, upsert + * + * Usage: + * npx tsx scripts/import-msze-info.ts --all + * npx tsx scripts/import-msze-info.ts --all --dry-run + * npx tsx scripts/import-msze-info.ts --all --resume-from 500 + * npx tsx scripts/import-msze-info.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const SITE_BASE = 'https://www.msze.info'; +const SITEMAP_COUNT = 11; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const REQUEST_DELAY_MS = 1500; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface ParsedChurch { + name: string; + address: string | null; + city: string | null; + zip: string | null; + phone: string | null; + website: string | null; + email: string | null; + latitude: number; + longitude: number; +} + +interface ParsedSchedule { + dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat + time: string; // "05:00", "18:30" +} + +interface ImportStats { + churchesFound: number; + churchesMatched: number; + churchesCreated: number; + churchesSkipped: 
/**
 * Download one page as text, pausing before every request except the first.
 *
 * The pause length is `delayMs`; the module-level `requestCount` decides
 * whether a pause is needed and is incremented on every call. Failures are
 * logged and reported as `null` rather than thrown.
 *
 * @param url     Absolute URL to fetch.
 * @param delayMs Milliseconds to wait before the request when it is not the
 *                first one (defaults to `REQUEST_DELAY_MS`).
 * @returns The response body, or `null` on a non-2xx status or a fetch error.
 */
async function fetchPage(url: string, delayMs: number = REQUEST_DELAY_MS): Promise<string | null> {
  const isFirstRequest = requestCount === 0;
  if (!isFirstRequest) {
    await delay(delayMs);
  }
  requestCount += 1;

  const requestHeaders = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const res = await fetch(url, { headers: requestHeaders });
    if (res.ok) {
      return await res.text();
    }
    console.error(` HTTP ${res.status} for ${url}`);
  } catch (err) {
    const reason = err instanceof Error ? err.message : err;
    console.error(` Fetch error for ${url}: ${reason}`);
  }

  return null;
}
/**
 * Extract church details from the HTML of a msze.info `/kosciol/{id}` page.
 *
 * Sources, in order:
 *  - name and city: the `<h1>Church Name, City</h1>` heading (city is the
 *    part after the last comma);
 *  - address: the text after the "Adres:" label, falling back to the first
 *    argument of the embedded `tomtom_codeAddress(...)` call;
 *  - zip: a Polish `NN-NNN` postal code found inside the address;
 *  - coordinates: the trailing lat/lng arguments of
 *    `tomtom_codeAddress('addr', zoom, 'name', null, lat, lng)`;
 *  - phone: the first `tel:` link; website: a link labelled "Witryna" or a
 *    link whose text starts with `www.`.
 *
 * @param html Raw page HTML.
 * @returns The parsed fields. `name` is `''` when no `<h1>` is present and
 *          `latitude`/`longitude` are both 0 when no usable coordinates are
 *          found. `email` is always `null`.
 */
function parseChurchPage(html: string): ParsedChurch {
  // Name: from <h1>Church Name, City</h1>
  const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  let name = '';
  let cityFromH1: string | null = null;

  if (h1Match) {
    const raw = h1Match[1].replace(/<[^>]+>/g, '').trim();
    // Split "Church Name, City" — city is the last comma-separated part
    const lastComma = raw.lastIndexOf(',');
    if (lastComma > 0) {
      name = raw.substring(0, lastComma).trim();
      // A trailing comma leaves an empty city; report that as "unknown".
      cityFromH1 = raw.substring(lastComma + 1).trim() || null;
    } else {
      name = raw;
    }
  }

  // Address: <span class="highlight">Adres:</span> <strong>Street, City</strong>
  let address: string | null = null;
  let city: string | null = cityFromH1;
  let zip: string | null = null;

  const addressMatch = html.match(/Adres:<\/span>\s*(?:<strong>)?([\s\S]*?)(?:<\/strong>|<br|<\/p)/i);
  if (addressMatch) {
    address = addressMatch[1]
      .replace(/<[^>]+>/g, '')
      .replace(/\s+/g, ' ')
      .trim() || null;
  }

  // Fallback address: first argument of tomtom_codeAddress(...)
  if (!address) {
    const tomtomAddrMatch = html.match(/tomtom_codeAddress\s*\(\s*'([^']+)'/);
    if (tomtomAddrMatch) {
      address = tomtomAddrMatch[1].trim() || null;
    }
  }

  if (address) {
    // Extract Polish postal code (XX-XXX format)
    const zipMatch = address.match(/\b(\d{2}-\d{3})\b/);
    if (zipMatch) {
      zip = zipMatch[1];
    }

    // Extract city from address if the heading did not provide one:
    // take the last comma-separated part with the postal code removed.
    if (!city) {
      const parts = address.split(',');
      if (parts.length > 1) {
        city = parts[parts.length - 1].replace(/\d{2}-\d{3}/, '').trim() || null;
      }
    }
  }

  // Coordinates: tomtom_codeAddress('addr', zoom, 'name', null, lat, lng)
  //
  // The address and name arguments are matched as whole quoted strings so
  // that commas inside them ("ul. Długa 5, Warszawa") do not break the match.
  // An unquoted, comma-free token is still accepted as before.
  let latitude = 0;
  let longitude = 0;
  const coordMatch = html.match(
    /tomtom_codeAddress\s*\(\s*(?:'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|[^,]+)\s*,\s*\d+\s*,\s*(?:'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|[^,]+)\s*,\s*(?:null|'[^']*')\s*,\s*(-?[\d.]+)\s*,\s*(-?[\d.]+)\s*\)/
  );
  if (coordMatch) {
    const lat = parseFloat(coordMatch[1]);
    const lng = parseFloat(coordMatch[2]);
    // 0 is treated as "no coordinate", so either axis being 0 rejects the pair.
    if (!isNaN(lat) && !isNaN(lng) && lat !== 0 && lng !== 0) {
      latitude = lat;
      longitude = lng;
    }
  }

  // Phone: <a href="tel:...">
  let phone: string | null = null;
  const phoneMatch = html.match(/<a\s+href="tel:([^"]+)"/i);
  if (phoneMatch) {
    phone = phoneMatch[1].trim() || null;
  }

  // Website: external link whose text ends with "Witryna"
  let website: string | null = null;
  const websiteMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>[^<]*Witryna/i);
  if (websiteMatch) {
    website = websiteMatch[1].trim() || null;
  }
  // Otherwise: a link whose visible text looks like a URL (www.xxx)
  if (!website) {
    const wwwMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>www\.[^<]+<\/a>/i);
    if (wwwMatch) {
      website = wwwMatch[1].trim() || null;
    }
  }

  // Email: not reliably available (Cloudflare-protected)
  const email: string | null = null;

  return { name, address, city, zip, phone, website, email, latitude, longitude };
}
/**
 * Map a mass-schedule section heading to the weekdays it covers.
 *
 * The heading is expected to be upper-cased by the caller. Rules are tried
 * top to bottom and the first rule with a matching fragment wins, so a
 * heading that mentions Sunday is always treated as Sunday only.
 *
 * @param heading Upper-cased heading text.
 * @returns Day numbers (0 = Sunday … 6 = Saturday), or `[]` when no rule matches.
 */
function resolvePolishDays(heading: string): number[] {
  // [fragments to look for, days they stand for] — order is significant.
  const rules: ReadonlyArray<readonly [readonly string[], readonly number[]]> = [
    [['NIEDZIEL'], [0]],                                 // "NIEDZIELE I ŚWIĘTA"
    [['DNI POWSZEDNIE', 'POWSZEDNI'], [1, 2, 3, 4, 5, 6]], // weekdays, Mon-Sat
    [['PONIEDZIA'], [1]],                                // poniedziałek
    [['WTOREK', 'WTORK'], [2]],
    [['ŚRODA', 'SRODA', 'ŚROD'], [3]],
    [['CZWARTEK', 'CZWART'], [4]],
    [['PIĄTEK', 'PIATEK', 'PIĄT'], [5]],
    [['SOBOT'], [6]],
  ];

  for (const [fragments, days] of rules) {
    const mentioned = fragments.some((fragment) => heading.includes(fragment));
    if (mentioned) {
      // Hand back a fresh array so callers never share state.
      return [...days];
    }
  }

  return [];
}
/**
 * Read every church stored with country `PL`, limited to the columns the
 * duplicate matcher and the update path consult (identity, coordinates, the
 * per-source external IDs, and the contact fields that may be back-filled).
 *
 * @returns The matching rows, used as the in-memory deduplication set.
 */
async function loadExistingPolishChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Polish churches for deduplication...');

  // Kept as a literal-typed constant so the query result type stays precise.
  const matcherColumns = {
    // identity and position
    id: true,
    name: true,
    latitude: true,
    longitude: true,
    // external IDs, one per import source
    osmId: true,
    baiduId: true,
    masstimesId: true,
    orarimesseId: true,
    massSchedulesPhId: true,
    philmassId: true,
    horariosMisasId: true,
    mszeInfoId: true,
    weekdayMassesId: true,
    messesInfoId: true,
    bohosluzbyId: true,
    miserendId: true,
    kerknetId: true,
    gottesdienstzeitenId: true,
    discovermassId: true,
    // provenance and contact details
    source: true,
    website: true,
    phone: true,
    address: true,
  } as const;

  const polishChurches = await prisma.church.findMany({
    where: { country: 'PL' },
    select: matcherColumns,
  });

  console.log(`Loaded ${polishChurches.length} existing Polish churches`);
  return polishChurches;
}
stats.churchesSkipped++; + return; + } + + const schedules = parseMassSchedule(churchHtml); + + // Build candidate for dedup + const candidate = { + name: parsed.name, + lat: parsed.latitude, + lng: parsed.longitude, + mszeInfoId: churchId, + }; + + const duplicate = findDuplicateChurch(candidate, existingChurches); + + if (dryRun) { + if (duplicate) { + stats.churchesMatched++; + console.log(` [MATCH] "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`); + } else { + stats.churchesCreated++; + console.log(` [NEW] "${parsed.name}" (${parsed.city || 'unknown city'})`); + } + if (schedules.length > 0) { + stats.schedulesProcessed++; + stats.massSchedulesCreated += schedules.length; + } + return; + } + + if (duplicate) { + // Update existing church + stats.churchesMatched++; + const updateData: Record<string, unknown> = { + mszeInfoId: churchId, + }; + + if (!duplicate.address && parsed.address) updateData.address = parsed.address; + if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone; + if (!duplicate.website && parsed.website) { + updateData.website = parsed.website; + updateData.hasWebsite = true; + } + + // Update coordinates if existing has none and we have them + if (duplicate.latitude === 0 && duplicate.longitude === 0 && parsed.latitude !== 0) { + updateData.latitude = parsed.latitude; + updateData.longitude = parsed.longitude; + } + + // Fill city/zip if not set + const dbRecord = await prisma.church.findUnique({ + where: { id: duplicate.id }, + select: { city: true, zip: true, email: true }, + }); + if (dbRecord && !dbRecord.city && parsed.city) updateData.city = parsed.city; + if (dbRecord && !dbRecord.zip && parsed.zip) updateData.zip = parsed.zip; + if (dbRecord && !dbRecord.email && parsed.email) updateData.email = parsed.email; + + try { + await prisma.church.update({ + where: { id: duplicate.id }, + data: updateData, + }); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { 
+ stats.churchesSkipped++; + return; + } + throw error; + } + + // Replace mass schedules + if (schedules.length > 0) { + try { + await prisma.$transaction(async (tx) => { + await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } }); + await tx.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: duplicate.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'Polish', + })), + }); + await tx.church.update({ + where: { id: duplicate.id }, + data: { lastScrapedAt: new Date() }, + }); + }); + stats.schedulesProcessed++; + stats.massSchedulesCreated += schedules.length; + } catch (error) { + stats.errors++; + console.error(` Error saving schedules for ${churchId}: ${error instanceof Error ? error.message : error}`); + } + } + } else { + // Create new church + try { + const newChurch = await prisma.church.create({ + data: { + name: parsed.name, + latitude: parsed.latitude, + longitude: parsed.longitude, + address: parsed.address, + zip: parsed.zip, + city: parsed.city, + country: 'PL', + phone: parsed.phone, + website: parsed.website, + email: parsed.email, + hasWebsite: !!parsed.website, + mszeInfoId: churchId, + source: 'msze-info', + }, + }); + stats.churchesCreated++; + + // Add to in-memory array for within-run dedup + existingChurches.push({ + id: newChurch.id, + name: parsed.name, + latitude: parsed.latitude, + longitude: parsed.longitude, + osmId: null, + baiduId: null, + masstimesId: null, + orarimesseId: null, + massSchedulesPhId: null, + philmassId: null, + horariosMisasId: null, + mszeInfoId: churchId, + weekdayMassesId: null, + messesInfoId: null, + bohosluzbyId: null, + miserendId: null, + kerknetId: null, + gottesdienstzeitenId: null, + discovermassId: null, + source: 'msze-info', + website: parsed.website, + phone: parsed.phone, + address: parsed.address, + }); + + // Create mass schedules + if (schedules.length > 0) { + await prisma.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: newChurch.id, + 
/**
 * Render an elapsed time in milliseconds as a short human-readable string.
 *
 * The largest non-zero unit decides the shape: "1h 2m 5s", "3m 7s" or "42s".
 * Fractions of a second are dropped.
 *
 * @param ms Elapsed time in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const wholeHours = Math.floor(totalMinutes / 60);

  const secondsPart = `${totalSeconds % 60}s`;

  if (wholeHours > 0) {
    return [`${wholeHours}h`, `${totalMinutes % 60}m`, secondsPart].join(' ');
  }
  if (totalMinutes > 0) {
    return `${totalMinutes}m ${secondsPart}`;
  }
  return `${totalSeconds}s`;
}
${id}: ${error instanceof Error ? error.message : error}`); + } + } + + // Print summary + const totalTime = Date.now() - startTime; + console.log('\n' + '='.repeat(70)); + console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`); + console.log('='.repeat(70)); + console.log(`Churches found: ${stats.churchesFound}`); + console.log(` Matched (existing): ${stats.churchesMatched}`); + console.log(` Created (new): ${stats.churchesCreated}`); + console.log(` Skipped: ${stats.churchesSkipped}`); + console.log(`Schedules processed: ${stats.schedulesProcessed}`); + console.log(`Mass schedules created: ${stats.massSchedulesCreated}`); + console.log(`Errors: ${stats.errors}`); + console.log(`Total time: ${formatDuration(totalTime)}`); + console.log(`HTTP requests: ${requestCount}`); + console.log('='.repeat(70) + '\n'); + + // Update background job + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { + status: stats.errors > 0 ? 'completed_with_errors' : 'completed', + completedAt: new Date(), + result: JSON.stringify(stats), + }, + }); + } catch { + // Ignore + } + } +} + +main() + .catch((error) => { + console.error('Fatal error:', error); + process.exit(1); + }) + .finally(async () => { + await prisma.$disconnect(); + await pool.end(); + }); diff --git a/scripts/import-orarimesse.ts b/scripts/import-orarimesse.ts new file mode 100644 index 0000000..54065ac --- /dev/null +++ b/scripts/import-orarimesse.ts @@ -0,0 +1,771 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from OrariMesse.it + * + * OrariMesse.it is the official CEI (Italian Bishops' Conference) platform for + * mass times in Italy. It provides a public REST API organized by diocese. 
+ * + * Import strategy: + * Pass 1: For each diocese, fetch all churches → match against existing DB + * records (by ICSC code or proximity+name) → upsert + * Pass 2: For churches with active schedules, fetch detail endpoint → + * convert 8-day rolling schedule to recurring → replace mass schedules + * + * Usage: + * npx tsx scripts/import-orarimesse.ts --all + * npx tsx scripts/import-orarimesse.ts --diocese roma + * npx tsx scripts/import-orarimesse.ts --all --dry-run + * npx tsx scripts/import-orarimesse.ts --all --schedules-only + * npx tsx scripts/import-orarimesse.ts --all --resume-from napoli + * npx tsx scripts/import-orarimesse.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const API_BASE = 'https://orarimesse.it/api'; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const DIOCESE_DELAY_MS = 2000; +const DETAIL_DELAY_MS = 1000; + +// ─── Italian Day Map ───────────────────────────────────────────────────────── + +const ITALIAN_DAY_MAP: Record<string, number> = { + 'domenica': 0, 'lunedì': 1, 'lunedi': 1, + 'martedì': 2, 'martedi': 2, 'mercoledì': 3, 'mercoledi': 3, + 'giovedì': 4, 'giovedi': 4, 'venerdì': 5, 'venerdi': 5, + 'sabato': 6, +}; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface OrariMesseDiocese { + codice_cei: string; + title: string; + slug: string; + url: string; + countChiese: number; +} + +interface OrariMesseChurch { + idchurch: number; + address: string; + name: string; + conosciutaCome: string; + isopen: boolean; + nextmass: string; + lat: string; + lon: string; + sito: string; + emailLdc: string; + icsc: string; + comune: string; + tipologia: string; + accessibile: boolean; +} + +interface OrariMesseDioceseResponse { + codice_cei: string; + title: string; + slug: string; + countChiese: number; + listaChiese: OrariMesseChurch[]; +} + +interface OrariMesseMass { + idmass: number; + time: string; + noteOrarioMessa: string; +} + +interface OrariMesseDay { + day: string; + mass: OrariMesseMass[]; +} + +interface OrariMesseDetail { + idchurch: number; + name: string; + address: string; + lat: string; + lon: string; + icsc: string; + comune: string; + diocesi: string; + parroco: string; + telefono: string; + email: string; + sito: string; + days: OrariMesseDay[]; +} + 
/**
 * Pause for a fixed amount of time.
 *
 * @param ms - Number of milliseconds to wait.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Convert an OrariMesse time string (e.g. "07.00") to canonical `HH:MM`.
 *
 * Surrounding whitespace is ignored and a single-digit hour is zero-padded, so
 * "7.00", " 07.00 " and "07:00" all produce "07:00". This matters because the
 * result is used as part of the `dayOfWeek:time` key that deduplicates masses
 * in this file; differently formatted copies of the same time would otherwise
 * survive as separate entries.
 *
 * @param time - Time as delivered by the API, normally `HH.MM`.
 * @returns The time as `HH:MM`. Input that is not a plain hours/minutes pair is
 *          returned with only its first "." replaced by ":", as before.
 */
function convertTime(time: string): string {
  const match = /^(\d{1,2})[.:](\d{2})$/.exec(time.trim());
  if (!match) {
    // Unrecognised shape: keep the previous best-effort behaviour untouched.
    return time.replace('.', ':');
  }

  const hours = match[1] ?? '';
  const minutes = match[2] ?? '';
  return `${hours.padStart(2, '0')}:${minutes}`;
}
/**
 * Read every church whose country is 'IT', selecting only the columns the
 * duplicate matcher works with (coordinates, name, per-source external IDs and
 * a few contact fields).
 *
 * @returns The selected church records, logged with their count.
 */
async function loadExistingItalianChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Italian churches for deduplication...');

  const italyOnly = { country: 'IT' };
  const italianChurches = await prisma.church.findMany({
    where: italyOnly,
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External IDs, one per upstream source
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${italianChurches.length} existing Italian churches`);
  return italianChurches;
}
church.name, + lat, + lng: lon, + orarimesseId: church.icsc || undefined, + }; + + const duplicate = findDuplicateChurch(candidate, existingChurches); + + if (dryRun) { + if (duplicate) { + stats.churchesMatched++; + } else { + stats.churchesCreated++; + } + // Track idchurch for Pass 2 even in dry run + if (duplicate) { + idchurchToDbId.set(church.idchurch, duplicate.id); + } + continue; + } + + if (duplicate) { + // Update existing church: set orarimesseId, fill missing fields + stats.churchesMatched++; + const updateData: Record<string, unknown> = { + orarimesseId: church.icsc || undefined, + orarimesseLastSyncedAt: new Date(), + }; + + if (!duplicate.address && church.address) updateData.address = church.address; + if (!duplicate.website && church.sito) { + updateData.website = church.sito; + updateData.hasWebsite = true; + } + + // Check diocese on the actual DB record (not in ExistingChurch) + const dbRecord = await prisma.church.findUnique({ + where: { id: duplicate.id }, + select: { diocese: true, city: true, email: true }, + }); + if (dbRecord && !dbRecord.diocese && dioceseSlug) { + updateData.diocese = dioceseSlug; + } + if (dbRecord && !dbRecord.city && church.comune) { + updateData.city = church.comune; + } + if (dbRecord && !dbRecord.email && church.emailLdc) { + updateData.email = church.emailLdc; + } + + try { + await prisma.church.update({ + where: { id: duplicate.id }, + data: updateData, + }); + } catch (error) { + // Unique constraint violation on orarimesseId — another church already has this ICSC + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + continue; + } + throw error; + } + + idchurchToDbId.set(church.idchurch, duplicate.id); + } else { + // Create new church + try { + const newChurch = await prisma.church.create({ + data: { + name: church.name, + latitude: lat, + longitude: lon, + address: church.address || null, + city: church.comune || null, + country: 'IT', + diocese: 
/**
 * Pass 2 for one diocese: fetch the detail record of each church that reports
 * a next mass and replace its stored mass schedules with the weekly schedule
 * derived from that record.
 *
 * Churches without an entry in `idchurchToDbId` are skipped, as are churches
 * whose detail record has no days or yields no recurring masses. For each
 * remaining church the delete / create / `lastScrapedAt` update run inside a
 * single transaction; a failure is logged and counted, and processing moves on
 * to the next church.
 *
 * @param churches - Churches from the diocese listing.
 * @param idchurchToDbId - Maps the API's `idchurch` to our DB church id.
 * @param dryRun - When true, nothing is written but counters are still updated.
 * @param stats - Mutated in place (`schedulesProcessed`, `massSchedulesCreated`, `errors`).
 */
async function processSchedulesForDiocese(
  churches: OrariMesseChurch[],
  idchurchToDbId: Map<number, string>,
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  const active = churches.filter((entry) => entry.nextmass);
  if (active.length === 0) {
    return;
  }

  console.log(` Pass 2: Fetching schedules for ${active.length} churches with active masses...`);

  for (const { idchurch } of active) {
    const churchId = idchurchToDbId.get(idchurch);
    if (!churchId) {
      // Church not in our DB (skipped in Pass 1)
      continue;
    }

    const detail = await fetchChurchDetail(idchurch);
    if (!detail?.days?.length) {
      continue;
    }
    stats.schedulesProcessed++;

    const weekly = convertScheduleToRecurring(detail.days);
    if (weekly.length === 0) {
      continue;
    }

    if (!dryRun) {
      const rows = weekly.map(({ dayOfWeek, time, notes }) => ({
        churchId,
        dayOfWeek,
        time,
        language: 'Italian',
        notes,
      }));

      try {
        await prisma.$transaction(async (tx) => {
          // Replace, don't merge: drop whatever was stored for this church first
          await tx.massSchedule.deleteMany({ where: { churchId } });
          await tx.massSchedule.createMany({ data: rows });
          await tx.church.update({
            where: { id: churchId },
            data: { lastScrapedAt: new Date() },
          });
        });
      } catch (error) {
        stats.errors++;
        console.error(` Error saving schedules for idchurch=${idchurch}: ${error instanceof Error ? error.message : error}`);
        continue;
      }
    }

    // Reached in dry-run mode and after a successful write
    stats.massSchedulesCreated += weekly.length;
  }
}
/**
 * Entry point of the OrariMesse.it importer.
 *
 * Steps:
 *  1. Parse CLI arguments and print the run banner.
 *  2. If a job id was given, mark that background job as running (best effort).
 *  3. Load existing Italian churches and the diocese list, then narrow the list
 *     according to `--diocese` / `--resume-from` (exits with code 1 when the
 *     requested slug is unknown).
 *  4. For every diocese run Pass 1 (church upsert) unless `--schedules-only` is
 *     set, in which case the `idchurch` → DB id map is rebuilt from stored
 *     `orarimesseId` values instead; then run Pass 2 (mass schedules).
 *     A failure inside one diocese is logged and counted; the run continues.
 *  5. Print the summary and, if a job id was given, store the final status.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('ORARIMESSE.IT IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.all ? 'All dioceses' : `Single diocese: ${args.diocese}`}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  console.log(`Schedules only: ${args.schedulesOnly ? 'YES' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      // Job might not exist yet, that's fine
    }
  }

  // Load existing Italian churches for dedup
  const existingChurches = await loadExistingItalianChurches();

  // Fetch diocese list
  console.log('Fetching diocese list from OrariMesse.it...');
  const allDioceses = await fetchDioceses();
  console.log(`Found ${allDioceses.length} dioceses\n`);

  // Filter to requested dioceses
  let diocesesToProcess: OrariMesseDiocese[];
  if (args.diocese) {
    const found = allDioceses.find((d) => d.slug === args.diocese);
    if (!found) {
      console.error(`Diocese "${args.diocese}" not found. Available: ${allDioceses.map((d) => d.slug).join(', ')}`);
      process.exit(1);
    }
    diocesesToProcess = [found];
  } else {
    diocesesToProcess = allDioceses;
  }

  // Handle --resume-from
  if (args.resumeFrom) {
    const idx = diocesesToProcess.findIndex((d) => d.slug === args.resumeFrom);
    if (idx === -1) {
      console.error(`Resume diocese "${args.resumeFrom}" not found.`);
      process.exit(1);
    }
    console.log(`Resuming from diocese "${args.resumeFrom}" (skipping ${idx} dioceses)\n`);
    diocesesToProcess = diocesesToProcess.slice(idx);
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // fetchDioceses() yields an empty list when the request fails. Count that as
  // an error so the run is not reported as a clean "completed" with nothing done.
  if (diocesesToProcess.length === 0) {
    stats.errors++;
    console.error('No dioceses returned by the OrariMesse.it API; nothing to import.');
  }

  // Map OrariMesse idchurch → our DB id (for Pass 2 schedule lookups)
  const idchurchToDbId = new Map<number, string>();

  // Schedules-only mode: index the loaded churches by orarimesseId (ICSC code)
  // once, so each diocese can translate the API's idchurch values with O(1)
  // lookups instead of scanning the whole church list per church.
  const dbIdByIcsc = new Map<string, string>();
  if (args.schedulesOnly) {
    console.log('Schedules-only mode: loading existing orarimesseId mappings...');
    // Only the number of rows is reported, so ask the DB for a count
    const mappedCount = await prisma.church.count({
      where: { orarimesseId: { not: null } },
    });
    console.log(`Found ${mappedCount} churches with orarimesseId in DB\n`);

    for (const existing of existingChurches) {
      // First occurrence wins, matching the previous linear `find`
      if (existing.orarimesseId && !dbIdByIcsc.has(existing.orarimesseId)) {
        dbIdByIcsc.set(existing.orarimesseId, existing.id);
      }
    }
  }

  // Process each diocese
  for (let i = 0; i < diocesesToProcess.length; i++) {
    const diocese = diocesesToProcess[i];
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${i + 1}/${diocesesToProcess.length}] Diocese: ${diocese.title} (${diocese.slug}) [${elapsed} elapsed]`);

    try {
      // Fetch churches in this diocese
      const dioceseData = await fetchChurchesInDiocese(diocese.slug);
      if (!dioceseData || !dioceseData.listaChiese || dioceseData.listaChiese.length === 0) {
        console.log(` No churches found, skipping`);
        stats.diocesesProcessed++;
        continue;
      }

      const churches = dioceseData.listaChiese;
      console.log(` Found ${churches.length} churches (${churches.filter((c) => c.nextmass).length} with active masses)`);

      if (!args.schedulesOnly) {
        // Pass 1: Upsert churches (counters are cumulative, so report the delta)
        const prevMatched = stats.churchesMatched;
        const prevCreated = stats.churchesCreated;
        const prevSkipped = stats.churchesSkipped;

        await processChurchesForDiocese(
          diocese.slug, churches, existingChurches, idchurchToDbId,
          args.dryRun, stats
        );

        const matched = stats.churchesMatched - prevMatched;
        const created = stats.churchesCreated - prevCreated;
        const skipped = stats.churchesSkipped - prevSkipped;
        console.log(` Pass 1: ${matched} matched, ${created} created, ${skipped} skipped`);
      } else {
        // In schedules-only mode, still need to build idchurch → dbId map
        for (const church of churches) {
          if (!church.icsc) continue;
          const dbId = dbIdByIcsc.get(church.icsc);
          if (dbId) {
            idchurchToDbId.set(church.idchurch, dbId);
          }
        }
      }

      // Pass 2: Import schedules. Both counters are cumulative across dioceses,
      // so both figures in the log line below are per-diocese deltas.
      const prevSchedules = stats.massSchedulesCreated;
      const prevProcessed = stats.schedulesProcessed;
      await processSchedulesForDiocese(churches, idchurchToDbId, args.dryRun, stats);
      const newSchedules = stats.massSchedulesCreated - prevSchedules;
      const newlyProcessed = stats.schedulesProcessed - prevProcessed;
      if (newSchedules > 0) {
        console.log(` Pass 2: ${newlyProcessed} churches processed, ${newSchedules} mass times created`);
      }

      stats.diocesesProcessed++;
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing diocese ${diocese.slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`API requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch {
      // Ignore
    }
  }
}
{ rejectUnauthorized: false } : undefined +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); +import { queryOverpassByCountryWithFallback, type OSMChurch } from '../src/lib/overpass-client'; +import { findDuplicateChurch, mergeChurchData } from '../src/lib/church-matcher'; +import { parseServiceTimes } from '../src/lib/service-times-parser'; + +// Countries with significant Catholic populations, organized by priority +const CATHOLIC_COUNTRIES = { + // Priority 1: Large Catholic populations (North & South America + major European/Asian countries) + priority1: [ + // North America + 'US', 'MX', 'CA', + // South America + 'BR', 'AR', 'CO', 'PE', 'VE', 'CL', 'EC', 'GT', 'CU', 'BO', 'DO', 'HT', 'HN', 'PY', 'SV', 'NI', 'CR', 'PA', 'UY', 'GY', 'SR', 'GF', + // Europe + 'IT', 'FR', 'ES', 'PL', 'DE', 'PT', 'BE', 'CZ', 'AT', 'HU', 'IE', 'HR', 'GB', + // Asia & Oceania + 'PH', 'AU', 'NG', 'CD', + ], + // Priority 2: Medium Catholic populations + priority2: [ + // Rest of Europe + 'NL', 'SK', 'SI', 'LT', 'CH', 'LU', 'MT', + 'UA', 'RO', 'LV', 'BY', + // Africa + 'AO', 'UG', 'TZ', 'KE', 'CM', 'RW', 'BI', 'MG', 'MW', 'ZM', 'ZW', 'MZ', 'BJ', 'TG', 'CI', 'BF', 'ML', 'NE', 'SN', 'GN', 'LR', 'SL', 'GH', 'GA', 'CG', 'CF', 'TD', 'SD', 'ET', 'ER', 'SO', + // Asia + 'IN', 'TL', 'VN', 'KR', 'JP', 'ID', 'MY', 'SG', 'TH', 'LA', 'KH', 'MM', 'CN', 'LK', 'BD', 'PK', + // Middle East + 'LB', 'IL', 'PS', 'JO', 'SY', 'IQ', + // Oceania + 'NZ', 'PG', 'FJ', 'NC', 'PF', + ], + // Priority 3: Smaller Catholic presence + priority3: [ + // Caribbean & Central America (smaller islands) + 'BS', 'BB', 'JM', 'TT', 'GD', 'LC', 'VC', 'AG', 'DM', 'KN', + // Europe (smaller countries + Balkans/Eastern) + 'MC', 'SM', 'VA', 'LI', 'AD', + 'RS', 'BA', 'MK', 'AL', 'EE', + // Caucasus + Russia + 'GE', 'AM', 'RU', + // Africa (rest) + 'NA', 'BW', 'LS', 'SZ', 'MU', 'SC', 'KM', 'CV', 'ST', 'GQ', 'DJ', 'GM', + // Asia (rest) + 'BT', 'NP', 'AF', 'KZ', 'UZ', 'TM', 'TJ', 'KG', 'MN', 
/**
 * Parse the command line arguments of the OSM importer.
 *
 * Recognised flags:
 *   --country <code>      single country (upper-cased)
 *   --all                 every configured country
 *   --dry-run             no database writes
 *   --resume-from <code>  country to resume from (upper-cased)
 *   --priority <1|2|3>    only one priority group
 *   --sort-by-count       order countries by existing OSM church count
 *
 * A flag that takes a value must be followed by one: a missing value, or a
 * value that is itself a `--flag`, prints an error and exits with code 1.
 * Previously `--country --dry-run` used "--DRY-RUN" as the country and dropped
 * the dry-run request, and a value-taking flag given last was silently ignored.
 * `--priority` must be exactly 1, 2 or 3.
 *
 * Arguments that are not recognised here are ignored, e.g. `--job-id`, which
 * `createOrResumeJob` reads on its own.
 *
 * @returns The parsed options; value options that were not given are `undefined`.
 */
function parseArgs(): { country?: string; all: boolean; dryRun: boolean; resumeFrom?: string; priority?: number; sortByCount: boolean } {
  const args = process.argv.slice(2);
  const result = {
    country: undefined as string | undefined,
    all: false,
    dryRun: false,
    resumeFrom: undefined as string | undefined,
    priority: undefined as number | undefined,
    sortByCount: false,
  };

  // Returns the argument following `flag`, or terminates with a usage error
  // when there is none or when the next argument is another flag.
  const requireValue = (flag: string, index: number): string => {
    const value = args[index + 1];
    if (!value || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--country':
        result.country = requireValue('--country', i).toUpperCase();
        i++; // skip the consumed value
        break;
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = requireValue('--resume-from', i).toUpperCase();
        i++;
        break;
      case '--priority': {
        const raw = requireValue('--priority', i);
        // Exact match: parseInt would also accept input such as "2abc"
        if (!/^[123]$/.test(raw)) {
          console.error('Error: --priority must be 1, 2, or 3');
          process.exit(1);
        }
        result.priority = Number(raw);
        i++;
        break;
      }
      case '--sort-by-count':
        result.sortByCount = true;
        break;
      default:
        // Not ours to interpret; leave it alone
        break;
    }
  }

  return result;
}
}; + + console.log(`\n${'='.repeat(60)}`); + console.log(`Importing Catholic churches from ${countryCode}`); + console.log(`${'='.repeat(60)}\n`); + + try { + // Query Overpass API (with automatic fallback to regional bounding boxes) + const osmChurches = await queryOverpassByCountryWithFallback(countryCode); + stats.osmChurchesFound = osmChurches.length; + + if (osmChurches.length === 0) { + console.log(`No churches found in ${countryCode}`); + return stats; + } + + console.log(`Found ${osmChurches.length} Catholic churches in ${countryCode}`); + + if (dryRun) { + console.log('\n[DRY RUN] Would import the following churches:'); + osmChurches.slice(0, 10).forEach((church) => { + console.log(` - ${church.name} (${church.city || 'unknown city'})`); + console.log(` OSM ID: ${church.osmId}, Website: ${church.website || 'none'}`); + }); + if (osmChurches.length > 10) { + console.log(` ... and ${osmChurches.length - 10} more`); + } + + // Count websites + stats.churchesWithWebsites = osmChurches.filter((c) => c.website).length; + stats.churchesWithoutWebsites = osmChurches.length - stats.churchesWithWebsites; + + return stats; + } + + // Fetch all existing churches for deduplication + // For large datasets, we could optimize by fetching only churches in the same country/region + console.log('Fetching existing churches for deduplication...'); + const existingChurches = await prisma.church.findMany({ + select: { + id: true, + name: true, + latitude: true, + longitude: true, + osmId: true, + baiduId: true, + masstimesId: true, + orarimesseId: true, + massSchedulesPhId: true, + philmassId: true, + horariosMisasId: true, + mszeInfoId: true, + weekdayMassesId: true, + messesInfoId: true, + bohosluzbyId: true, + miserendId: true, + kerknetId: true, + gottesdienstzeitenId: true, + discovermassId: true, + source: true, + website: true, + phone: true, + address: true, + }, + }); + console.log(`Found ${existingChurches.length} existing churches in database`); + + // Process 
churches one by one (no batch transactions to avoid rollbacks) + let processed = 0; + + for (const osmChurch of osmChurches) { + try { + // Check for duplicate + const duplicate = findDuplicateChurch(osmChurch, existingChurches); + + if (duplicate && duplicate.osmId === osmChurch.osmId) { + // Existing church with matching osmId - update it + const mergedData = mergeChurchData(duplicate, osmChurch); + + // Verify the church exists in the database (not just in our temp list from this run) + const existsInDb = await prisma.church.findUnique({ where: { id: duplicate.id } }); + if (existsInDb) { + await prisma.church.update({ + where: { id: duplicate.id }, + data: mergedData, + }); + stats.existingUpdated++; + + // Import service_times for existing churches that don't have schedules yet + if (osmChurch.serviceTimes) { + const existingSchedules = await prisma.massSchedule.count({ where: { churchId: duplicate.id } }); + if (existingSchedules === 0) { + const scheduleEntries = parseServiceTimes(osmChurch.serviceTimes); + if (scheduleEntries.length > 0) { + await prisma.massSchedule.createMany({ + data: scheduleEntries.map(entry => ({ + churchId: duplicate.id, + dayOfWeek: entry.dayOfWeek, + time: entry.time, + massType: entry.dayOfWeek === 0 ? 'Sunday' : + entry.dayOfWeek === 6 ? 
'Saturday' : 'Daily', + language: 'Unknown', + notes: 'From OSM service_times tag', + })), + }); + stats.churchesWithServiceTimes++; + stats.scheduleEntriesCreated += scheduleEntries.length; + } + } + } + } else { + // Duplicate from earlier in this run - skip (already processed) + stats.existingUpdated++; + } + + if (osmChurch.website) stats.churchesWithWebsites++; + else stats.churchesWithoutWebsites++; + + } else if (duplicate) { + // Existing church matched by proximity/name - link it with osmId + const mergedData = mergeChurchData(duplicate, osmChurch); + + // Verify the church exists in the database (not just in our temp list from this run) + const existsInDb = await prisma.church.findUnique({ where: { id: duplicate.id } }); + if (existsInDb) { + await prisma.church.update({ + where: { id: duplicate.id }, + data: mergedData, + }); + stats.existingLinked++; + + // Import service_times for linked churches that don't have schedules yet + if (osmChurch.serviceTimes) { + const existingSchedules = await prisma.massSchedule.count({ where: { churchId: duplicate.id } }); + if (existingSchedules === 0) { + const scheduleEntries = parseServiceTimes(osmChurch.serviceTimes); + if (scheduleEntries.length > 0) { + await prisma.massSchedule.createMany({ + data: scheduleEntries.map(entry => ({ + churchId: duplicate.id, + dayOfWeek: entry.dayOfWeek, + time: entry.time, + massType: entry.dayOfWeek === 0 ? 'Sunday' : + entry.dayOfWeek === 6 ? 
'Saturday' : 'Daily', + language: 'Unknown', + notes: 'From OSM service_times tag', + })), + }); + stats.churchesWithServiceTimes++; + stats.scheduleEntriesCreated += scheduleEntries.length; + } + } + } + } else { + // Duplicate from earlier in this run - skip (already processed) + stats.existingLinked++; + } + + if (osmChurch.website) stats.churchesWithWebsites++; + else stats.churchesWithoutWebsites++; + + } else { + // New church - insert it and capture the real ID + const newChurch = await prisma.church.create({ + data: { + name: osmChurch.name, + latitude: osmChurch.lat, + longitude: osmChurch.lng, + address: osmChurch.address, + city: osmChurch.city, + state: osmChurch.state, + zip: osmChurch.zip, + country: osmChurch.country || countryCode, + phone: osmChurch.phone, + website: osmChurch.website, + diocese: osmChurch.diocese, + wheelchairAccess: osmChurch.wheelchairAccess ?? false, + source: 'osm', + osmId: osmChurch.osmId, + hasWebsite: !!osmChurch.website, + osmLastSyncedAt: new Date(), + }, + }); + stats.newChurchesInserted++; + + if (osmChurch.website) stats.churchesWithWebsites++; + else stats.churchesWithoutWebsites++; + + // Parse service_times tag and create mass schedules + if (osmChurch.serviceTimes) { + const scheduleEntries = parseServiceTimes(osmChurch.serviceTimes); + if (scheduleEntries.length > 0) { + await prisma.massSchedule.createMany({ + data: scheduleEntries.map(entry => ({ + churchId: newChurch.id, + dayOfWeek: entry.dayOfWeek, + time: entry.time, + massType: entry.dayOfWeek === 0 ? 'Sunday' : + entry.dayOfWeek === 6 ? 
'Saturday' : 'Daily', + language: 'Unknown', + notes: 'From OSM service_times tag', + })), + }); + stats.churchesWithServiceTimes++; + stats.scheduleEntriesCreated += scheduleEntries.length; + + // Mark as scraped since we have schedule data + await prisma.church.update({ + where: { id: newChurch.id }, + data: { lastScrapedAt: new Date() }, + }); + } + } + + // Add to existing churches list for future deduplication in this run (use real DB ID) + existingChurches.push({ + id: newChurch.id, + name: osmChurch.name, + latitude: osmChurch.lat, + longitude: osmChurch.lng, + osmId: osmChurch.osmId, + baiduId: null, + masstimesId: null, + orarimesseId: null, + massSchedulesPhId: null, + philmassId: null, + horariosMisasId: null, + mszeInfoId: null, + weekdayMassesId: null, + messesInfoId: null, + bohosluzbyId: null, + miserendId: null, + kerknetId: null, + gottesdienstzeitenId: null, + discovermassId: null, + source: 'osm', + website: osmChurch.website || null, + phone: osmChurch.phone || null, + address: osmChurch.address || null, + }); + } + + processed++; + + // Log progress every 500 churches + if (processed % 500 === 0) { + console.log(`Progress: ${processed}/${osmChurches.length} churches processed`); + } + + } catch (error) { + console.error(`Error processing church ${osmChurch.name}:`, error); + stats.errors++; + } + } + + console.log(`\nProcessed all ${osmChurches.length} churches from ${countryCode}`); + + } catch (error) { + console.error(`Failed to import from ${countryCode}:`, error); + stats.errors++; + } + + return stats; +} + +/** + * Print import summary + */ +function printSummary(countryCode: string, stats: ImportStats, dryRun: boolean) { + console.log(`\n${'='.repeat(60)}`); + console.log(`Import Summary for ${countryCode} ${dryRun ? 
/**
 * Mark the background job named by `--job-id <id>` as running.
 *
 * Despite its name this function never creates a job: it only updates the
 * `status` and `startedAt` of the job whose id follows `--job-id`.
 *
 * @param args - Raw CLI arguments (without the node/script prefix).
 * @returns The job id when `--job-id` was given, otherwise `null`.
 * @throws Error when `--job-id` is present without a value (or is directly
 *         followed by another flag). Errors from the database update
 *         propagate unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) {
    return null;
  }

  const jobId = args[flagIndex + 1];
  // Without this guard `undefined` (or the next flag) would be used as the id
  // and the update would fail with an error that does not mention the flag.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job ID value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
'failed' : 'completed', + error: error || null, + completedAt: new Date(), + }, + }); + } catch (err) { + console.error(`Failed to update job ${jobId}:`, err); + } +} + +async function main() { + const { country, all, dryRun, resumeFrom, priority, sortByCount } = parseArgs(); + const jobId = await createOrResumeJob(process.argv.slice(2)); + + if (!country && !all && !priority) { + console.error('Error: Must specify --country <CODE>, --all, or --priority <1|2|3>'); + console.error('Usage:'); + console.error(' npx tsx scripts/import-osm-churches.ts --country US'); + console.error(' npx tsx scripts/import-osm-churches.ts --all'); + console.error(' npx tsx scripts/import-osm-churches.ts --priority 1'); + console.error(' npx tsx scripts/import-osm-churches.ts --all --resume-from IT'); + console.error(' npx tsx scripts/import-osm-churches.ts --country MX --dry-run'); + console.error(' npx tsx scripts/import-osm-churches.ts --all --sort-by-count'); + process.exit(1); + } + + if (dryRun) { + console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n'); + } + + try { + if (country) { + // Import single country + const stats = await importFromOSM(country, dryRun); + printSummary(country, stats, dryRun); + + } else if (all || priority !== undefined) { + // Import all countries or specific priority + let allCountries: string[]; + + if (priority !== undefined) { + // Import only specified priority level + const priorityKey = `priority${priority}` as keyof typeof CATHOLIC_COUNTRIES; + allCountries = CATHOLIC_COUNTRIES[priorityKey]; + console.log(`Importing Priority ${priority} countries (${allCountries.length} countries)...\n`); + } else { + // Import all priorities + console.log('Importing all Catholic countries by priority...\n'); + allCountries = [ + ...CATHOLIC_COUNTRIES.priority1, + ...CATHOLIC_COUNTRIES.priority2, + ...CATHOLIC_COUNTRIES.priority3, + ]; + } + + // Sort by existing OSM church count (least first) if requested + if (sortByCount) { + 
console.log('Querying DB for current OSM church counts per country...'); + const countRows = await prisma.$queryRawUnsafe<Array<{ country: string; count: bigint }>>( + `SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country IS NOT NULL GROUP BY country` + ); + const countMap = new Map<string, number>(); + for (const row of countRows) { + countMap.set(row.country, Number(row.count)); + } + + allCountries.sort((a, b) => (countMap.get(a) || 0) - (countMap.get(b) || 0)); + + console.log('Country processing order (least OSM churches first):'); + for (const c of allCountries) { + console.log(` ${c}: ${countMap.get(c) || 0} existing OSM churches`); + } + console.log(''); + } + + // Handle --resume-from flag + if (resumeFrom) { + const resumeIndex = allCountries.indexOf(resumeFrom); + if (resumeIndex === -1) { + console.error(`Error: Country ${resumeFrom} not found in the list`); + process.exit(1); + } + console.log(`Resuming from ${resumeFrom} (skipping first ${resumeIndex} countries)...\n`); + allCountries = allCountries.slice(resumeIndex); + } + + const totalStats: ImportStats = { + osmChurchesFound: 0, + newChurchesInserted: 0, + existingUpdated: 0, + existingLinked: 0, + churchesWithWebsites: 0, + churchesWithoutWebsites: 0, + churchesWithServiceTimes: 0, + scheduleEntriesCreated: 0, + errors: 0, + }; + + for (const countryCode of allCountries) { + const stats = await importFromOSM(countryCode, dryRun); + printSummary(countryCode, stats, dryRun); + + // Aggregate stats + totalStats.osmChurchesFound += stats.osmChurchesFound; + totalStats.newChurchesInserted += stats.newChurchesInserted; + totalStats.existingUpdated += stats.existingUpdated; + totalStats.existingLinked += stats.existingLinked; + totalStats.churchesWithWebsites += stats.churchesWithWebsites; + totalStats.churchesWithoutWebsites += stats.churchesWithoutWebsites; + totalStats.churchesWithServiceTimes += stats.churchesWithServiceTimes; + totalStats.scheduleEntriesCreated += 
stats.scheduleEntriesCreated; + totalStats.errors += stats.errors; + + // Small delay between countries to be respectful (rate limiting is also in the client) + await new Promise((resolve) => setTimeout(resolve, 2000)); + } + + // Print overall summary + console.log(`\n${'='.repeat(60)}`); + console.log(`OVERALL SUMMARY ${dryRun ? '(DRY RUN)' : ''}`); + console.log(`${'='.repeat(60)}`); + console.log(`Total countries processed: ${allCountries.length}`); + console.log(`Total OSM churches found: ${totalStats.osmChurchesFound}`); + + if (!dryRun) { + console.log(`Total new churches inserted: ${totalStats.newChurchesInserted}`); + console.log(`Total churches updated: ${totalStats.existingUpdated}`); + console.log(`Total churches linked: ${totalStats.existingLinked}`); + } + + console.log(`Total with websites: ${totalStats.churchesWithWebsites}`); + console.log(`Total without websites: ${totalStats.churchesWithoutWebsites}`); + + if (!dryRun && totalStats.errors > 0) { + console.log(`Total errors: ${totalStats.errors}`); + } + + console.log(`${'='.repeat(60)}\n`); + } + + await completeJob(jobId); + } catch (error) { + console.error('Fatal error:', error); + await completeJob(jobId, String(error)); + process.exit(1); + } finally { + await prisma.$disconnect(); + } +} + +main(); diff --git a/scripts/import-osm-region.ts b/scripts/import-osm-region.ts new file mode 100644 index 0000000..a0a7bc0 --- /dev/null +++ b/scripts/import-osm-region.ts @@ -0,0 +1,346 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches from a specific region of a country + * Usage: + * npx tsx scripts/import-osm-region.ts --country GB --region "England South" + * npx tsx scripts/import-osm-region.ts --country IT --region "North" --dry-run + */ + +// Load .env for database connection (before importing anything that uses process.env) +import dotenv from 'dotenv'; +import path from 'path'; + +// Load .env.local first (production Neon URL), then .env (local fallback) +dotenv.config({ path: 
/** Counters collected while importing one region. */
interface ImportStats {
  /** Churches returned by the Overpass query. */
  osmChurchesFound: number;
  /** Churches created as new database rows. */
  newChurchesInserted: number;
  /** Existing rows matched by identical osmId. */
  existingUpdated: number;
  /** Existing rows matched by the duplicate finder without an identical osmId. */
  existingLinked: number;
  churchesWithWebsites: number;
  churchesWithoutWebsites: number;
  /** Churches whose processing threw, plus region-level failures. */
  errors: number;
}

/**
 * Parse command line arguments.
 *
 * Recognised flags: `--country <CODE>` (upper-cased), `--region <NAME>` and
 * `--dry-run`. Unknown tokens are ignored.
 *
 * A value-taking flag that is directly followed by another `--flag` is
 * treated as having no value, so `--country --dry-run` no longer records
 * "--DRY-RUN" as the country while silently dropping the dry-run request.
 *
 * @returns The parsed options; `country` / `region` stay `undefined` when not supplied.
 */
function parseArgs(): { country?: string; region?: string; dryRun: boolean } {
  const args = process.argv.slice(2);
  const result = {
    country: undefined as string | undefined,
    region: undefined as string | undefined,
    dryRun: false,
  };

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    const next = args[i + 1];
    const hasValue = !!next && !next.startsWith('--');

    if (flag === '--country' && hasValue) {
      result.country = next.toUpperCase();
      i++;
    } else if (flag === '--region' && hasValue) {
      result.region = next;
      i++;
    } else if (flag === '--dry-run') {
      result.dryRun = true;
    }
  }

  return result;
}
/**
 * Import churches from a single named region of a country.
 *
 * The region's bounding box is looked up in `COUNTRY_BOUNDING_BOXES`, queried
 * through Overpass, and each returned church is either merged into an
 * existing row (when `findDuplicateChurch` reports a match) or inserted.
 * Churches are processed one by one so a failure on one church only
 * increments `stats.errors` and does not affect the others.
 *
 * @param countryCode Key into `COUNTRY_BOUNDING_BOXES`; also the fallback `country` for new rows.
 * @param regionName Exact region name within that country.
 * @param dryRun When true, only lists what would be imported; no database access.
 * @returns Counters describing what was found and written.
 * @throws Error when the country or region is unknown. This is thrown rather
 *   than calling `process.exit` so the caller can still release its database
 *   connection.
 */
async function importFromRegion(countryCode: string, regionName: string, dryRun: boolean = false): Promise<ImportStats> {
  const stats: ImportStats = {
    osmChurchesFound: 0,
    newChurchesInserted: 0,
    existingUpdated: 0,
    existingLinked: 0,
    churchesWithWebsites: 0,
    churchesWithoutWebsites: 0,
    errors: 0,
  };

  const rule = '='.repeat(60);
  console.log(`\n${rule}`);
  console.log(`Importing from ${countryCode} - ${regionName}`);
  console.log(`${rule}\n`);

  // Look up the bounding box
  const regions = COUNTRY_BOUNDING_BOXES[countryCode];
  if (!regions) {
    console.error(`Error: No bounding boxes defined for country ${countryCode}`);
    console.error('Available countries:', Object.keys(COUNTRY_BOUNDING_BOXES).join(', '));
    throw new Error(`No bounding boxes defined for country ${countryCode}`);
  }

  const region = regions.find((r) => r.name === regionName);
  if (!region) {
    console.error(`Error: Region "${regionName}" not found for ${countryCode}`);
    console.error('Available regions:', regions.map((r) => r.name).join(', '));
    throw new Error(`Region "${regionName}" not found for ${countryCode}`);
  }

  try {
    // Query Overpass API for this specific region
    console.log(`Querying bounding box: (${region.south}, ${region.west}, ${region.north}, ${region.east})`);
    const osmChurches = await queryOverpassByBoundingBox(region.south, region.west, region.north, region.east);
    stats.osmChurchesFound = osmChurches.length;

    if (osmChurches.length === 0) {
      console.log(`No churches found in ${regionName}`);
      return stats;
    }

    console.log(`Found ${osmChurches.length} Catholic churches in ${regionName}`);

    if (dryRun) {
      console.log('\n[DRY RUN] Would import the following churches:');
      for (const church of osmChurches.slice(0, 10)) {
        console.log(` - ${church.name} (${church.city || 'unknown city'})`);
        console.log(` OSM ID: ${church.osmId}, Website: ${church.website || 'none'}`);
      }
      if (osmChurches.length > 10) {
        console.log(` ... and ${osmChurches.length - 10} more`);
      }

      // Count websites
      stats.churchesWithWebsites = osmChurches.filter((c) => c.website).length;
      stats.churchesWithoutWebsites = osmChurches.length - stats.churchesWithWebsites;

      return stats;
    }

    // Fetch all existing churches for deduplication
    console.log('Fetching existing churches for deduplication...');
    const existingChurches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        latitude: true,
        longitude: true,
        osmId: true,
        baiduId: true,
        masstimesId: true,
        orarimesseId: true,
        massSchedulesPhId: true,
        philmassId: true,
        horariosMisasId: true,
        mszeInfoId: true,
        weekdayMassesId: true,
        messesInfoId: true,
        bohosluzbyId: true,
        miserendId: true,
        kerknetId: true,
        gottesdienstzeitenId: true,
        discovermassId: true,
        source: true,
        website: true,
        phone: true,
        address: true,
      },
    });
    console.log(`Found ${existingChurches.length} existing churches in database`);

    // Process churches one by one (no batch transactions to avoid rollbacks)
    let processed = 0;

    for (const osmChurch of osmChurches) {
      try {
        const duplicate = findDuplicateChurch(osmChurch, existingChurches);

        if (duplicate) {
          // Same osmId => refresh of a known OSM church ("updated");
          // any other match => an existing row gains its OSM link ("linked").
          const matchedByOsmId = duplicate.osmId === osmChurch.osmId;
          const mergedData = mergeChurchData(duplicate, osmChurch);

          // Only write when the row still exists; only the id is needed for that check.
          const existsInDb = await prisma.church.findUnique({
            where: { id: duplicate.id },
            select: { id: true },
          });
          if (existsInDb) {
            await prisma.church.update({
              where: { id: duplicate.id },
              data: mergedData,
            });
          }

          // Counted whether or not the row was written, as before.
          if (matchedByOsmId) stats.existingUpdated++;
          else stats.existingLinked++;
        } else {
          // New church - insert it and capture the real ID
          const newChurch = await prisma.church.create({
            data: {
              name: osmChurch.name,
              latitude: osmChurch.lat,
              longitude: osmChurch.lng,
              address: osmChurch.address,
              city: osmChurch.city,
              state: osmChurch.state,
              zip: osmChurch.zip,
              country: osmChurch.country || countryCode,
              phone: osmChurch.phone,
              website: osmChurch.website,
              diocese: osmChurch.diocese,
              wheelchairAccess: osmChurch.wheelchairAccess ?? false,
              source: 'osm',
              osmId: osmChurch.osmId,
              hasWebsite: !!osmChurch.website,
              osmLastSyncedAt: new Date(),
            },
          });
          stats.newChurchesInserted++;

          // Add to existing churches list for future deduplication in this run (use real DB ID)
          existingChurches.push({
            id: newChurch.id,
            name: osmChurch.name,
            latitude: osmChurch.lat,
            longitude: osmChurch.lng,
            osmId: osmChurch.osmId,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: null,
            discovermassId: null,
            source: 'osm',
            website: osmChurch.website || null,
            phone: osmChurch.phone || null,
            address: osmChurch.address || null,
          });
        }

        // Reached only when the database work above succeeded.
        if (osmChurch.website) stats.churchesWithWebsites++;
        else stats.churchesWithoutWebsites++;

        processed++;

        // Log progress every 100 churches
        if (processed % 100 === 0) {
          console.log(`Progress: ${processed}/${osmChurches.length} churches processed`);
        }
      } catch (error) {
        console.error(`Error processing church ${osmChurch.name}:`, error);
        stats.errors++;
      }
    }

    console.log(`\nProcessed all ${osmChurches.length} churches from ${regionName}`);
  } catch (error) {
    console.error(`Failed to import from ${regionName}:`, error);
    stats.errors++;
  }

  return stats;
}

/**
 * Print the import summary for one region.
 *
 * Insert/update/link counts and the error count are only printed for real runs.
 */
function printSummary(countryCode: string, regionName: string, stats: ImportStats, dryRun: boolean) {
  const rule = '='.repeat(60);

  console.log(`\n${rule}`);
  console.log(`Import Summary for ${countryCode} - ${regionName} ${dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`OSM churches found: ${stats.osmChurchesFound}`);

  if (!dryRun) {
    console.log(`New churches inserted: ${stats.newChurchesInserted}`);
    console.log(`Existing churches updated: ${stats.existingUpdated} (matched by osmId)`);
    console.log(`Existing churches linked: ${stats.existingLinked} (matched by proximity)`);
  }

  console.log(`Churches with websites: ${stats.churchesWithWebsites}`);
  console.log(`Churches without websites: ${stats.churchesWithoutWebsites}`);

  if (!dryRun && stats.errors > 0) {
    console.log(`Errors encountered: ${stats.errors}`);
  }

  console.log(`${rule}\n`);
}

/**
 * Script entry point: validate the CLI arguments, import the requested
 * region and print its summary.
 *
 * On failure the exit code is set through `process.exitCode` rather than
 * `process.exit()`, so the `finally` block still disconnects Prisma.
 */
async function main() {
  const { country, region, dryRun } = parseArgs();

  if (!country || !region) {
    console.error('Error: Must specify both --country <CODE> and --region <NAME>');
    console.error('Usage:');
    console.error(' npx tsx scripts/import-osm-region.ts --country GB --region "England South"');
    console.error(' npx tsx scripts/import-osm-region.ts --country IT --region "North" --dry-run');
    console.error('\nAvailable countries:', Object.keys(COUNTRY_BOUNDING_BOXES).join(', '));
    process.exit(1);
  }

  if (dryRun) {
    console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
  }

  try {
    const stats = await importFromRegion(country, region, dryRun);
    printSummary(country, region, stats, dryRun);
  } catch (error) {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}

main().catch((error: unknown) => {
  console.error('Unhandled error:', error);
  process.exitCode = 1;
});
It has no coordinates, so we match against existing + * churches (OSM + mass-schedules.com) and only update matched records. + * Unmatched churches are logged for manual review. + * + * Discovery strategy: + * 1. Fetch Philippines page → extract province URLs + * 2. For each province → extract city listing URLs + * 3. For each city listing → extract church mass-schedule URLs + * 4. Deduplicate all church URLs globally + * 5. For each church: parse JSON-LD + Schema.org Events, match, upsert + * + * Usage: + * npx tsx scripts/import-philmass.ts --all + * npx tsx scripts/import-philmass.ts --all --dry-run + * npx tsx scripts/import-philmass.ts --province Metro-Manila + * npx tsx scripts/import-philmass.ts --all --resume-from Cebu + * npx tsx scripts/import-philmass.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
// Number of page fetches started so far; used to throttle every request after the first.
let requestCount = 0;

/** Resolve after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Download a page as text, waiting REQUEST_DELAY_MS before every request
 * except the very first one.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body, or `null` on a non-2xx status or a fetch
 *   error (both are logged, never thrown).
 */
async function fetchPage(url: string): Promise<string | null> {
  const mustThrottle = requestCount > 0;
  if (mustThrottle) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount += 1;

  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const response = await fetch(url, { headers });
    if (response.ok) {
      return await response.text();
    }
    console.error(` HTTP ${response.status} for ${url}`);
    return null;
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}

// ─── Discovery: Province → City → Church URLs ───────────────────────────────

/**
 * Collect the province pages linked from the Philippines index page.
 *
 * Links whose page name contains "weekly" or "Roman-Catholic" are not
 * provinces and are skipped; each province name is returned once, in page order.
 *
 * @throws Error when the index page cannot be fetched.
 */
async function fetchProvinceUrls(): Promise<ProvinceInfo[]> {
  console.log(`Fetching Philippines page: ${PHILIPPINES_URL}`);
  const html = await fetchPage(PHILIPPINES_URL);
  if (!html) throw new Error('Failed to fetch Philippines page');

  // Pattern: href="https://www.philmass.com/Asia/Philippines/{Province}.html"
  const provinceLink = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\.html)"/g;
  const knownNames = new Set<string>();
  const provinces: ProvinceInfo[] = [];

  for (const [, url, name] of html.matchAll(provinceLink)) {
    const isListingPage = name.includes('weekly') || name.includes('Roman-Catholic');
    if (isListingPage || knownNames.has(name)) continue;
    knownNames.add(name);
    provinces.push({ name, url });
  }

  return provinces;
}
// Named entities understood by decodeHtmlEntities, keyed by the text between "&" and ";".
const NAMED_HTML_ENTITIES: Readonly<Record<string, string>> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
};

/**
 * Decode the HTML entities that occur in scraped attribute values:
 * `&amp;`, `&lt;`, `&gt;`, `&quot;`, decimal (`&#241;`) and hexadecimal
 * (`&#xF1;`) character references.
 *
 * All entities are decoded in a single pass, so text produced by one
 * replacement is never decoded a second time (`&amp;lt;` becomes `&lt;`,
 * not `<`). Numeric references use code points, so characters outside the
 * Basic Multilingual Plane decode correctly; a reference beyond U+10FFFF is
 * left untouched instead of throwing.
 *
 * @param str Text that may contain HTML entities.
 * @returns The decoded text; anything that is not a recognised entity is unchanged.
 */
function decodeHtmlEntities(str: string): string {
  return str.replace(/&(#\d+|#[xX][0-9a-fA-F]+|amp|lt|gt|quot);/g, (entity: string, body: string) => {
    if (!body.startsWith('#')) {
      return NAMED_HTML_ENTITIES[body] ?? entity;
    }
    const isHex = body[1] === 'x' || body[1] === 'X';
    const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}

/**
 * Percent-decode one URL path segment, returning the segment unchanged when
 * it contains a malformed escape sequence (`decodeURIComponent` throws
 * `URIError` in that case).
 */
function safeDecodeUriComponent(segment: string): string {
  try {
    return decodeURIComponent(segment);
  } catch {
    return segment;
  }
}

/**
 * Collect the "Roman-Catholic-Churches-in-{City}" listing URLs linked from a province page.
 *
 * @param provinceUrl URL of the province page.
 * @param provinceName Province name; accepted for call-site symmetry, not used here.
 * @returns Unique, entity-decoded listing URLs in page order; empty when the page cannot be fetched.
 */
async function fetchCityListingUrls(provinceUrl: string, provinceName: string): Promise<string[]> {
  const html = await fetchPage(provinceUrl);
  if (!html) return [];

  const urls: string[] = [];
  const seen = new Set<string>();

  // Pattern: href=".../{Province}/{City}/Roman-Catholic-Churches-in-{City}...html"
  const regex = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/[^"]*\/Roman-Catholic-Churches-in-[^"]*\.html)"/g;
  let match;
  while ((match = regex.exec(html)) !== null) {
    const url = decodeHtmlEntities(match[1]);
    if (seen.has(url)) continue;
    seen.add(url);
    urls.push(url);
  }

  return urls;
}

/**
 * Collect the church mass-schedule URLs linked from a city listing page.
 *
 * Province, city and slug are taken from the URL path and both
 * entity-decoded and percent-decoded.
 *
 * @param cityUrl URL of the city listing page.
 * @param provinceName Province name; accepted for call-site symmetry, not used
 *   here (the province is read from each church URL instead).
 * @returns One entry per unique church URL, in page order; empty when the page cannot be fetched.
 */
async function fetchChurchUrlsFromCityPage(cityUrl: string, provinceName: string): Promise<ChurchUrl[]> {
  const html = await fetchPage(cityUrl);
  if (!html) return [];

  const churches: ChurchUrl[] = [];
  const seen = new Set<string>();

  // Pattern: href=".../Roman-Catholic-Churches/{Church-Name}/mass-schedule.html"
  // Segments exclude `"` as well as `/` so a match can never run past the end
  // of one href attribute into the following markup.
  const regex = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\/([^/"]+)\/Roman-Catholic-Churches\/([^/"]+)\/mass-schedule\.html)"/g;
  let match;
  while ((match = regex.exec(html)) !== null) {
    const url = decodeHtmlEntities(match[1]);
    if (seen.has(url)) continue;
    seen.add(url);

    churches.push({
      url,
      slug: safeDecodeUriComponent(decodeHtmlEntities(match[4])),
      province: safeDecodeUriComponent(decodeHtmlEntities(match[2])),
      city: safeDecodeUriComponent(decodeHtmlEntities(match[3])),
    });
  }

  return churches;
}
/** True for non-null, non-array objects, i.e. values that can be indexed by key. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Return `value` when it is a non-empty string, otherwise `null`. */
function nonEmptyStringOrNull(value: unknown): string | null {
  return typeof value === 'string' && value !== '' ? value : null;
}

/**
 * Extract church details from a page's JSON-LD.
 *
 * Every `<script type="application/ld+json">` block is inspected, and the
 * first one whose `mainEntityOfPage` is a `PlaceOfWorship` is used. Blocks
 * with invalid JSON are skipped rather than aborting the search.
 *
 * @param html Raw HTML of a church page.
 * @returns The parsed details, or `null` when no usable JSON-LD is present.
 *   `name` is the empty string when the JSON-LD carries no name, so callers
 *   can fall back with `||`.
 */
function parseChurchJsonLd(html: string): ParsedPhilmassChurch | null {
  const scriptPattern = /<script\b[^>]*\btype=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

  for (const scriptMatch of html.matchAll(scriptPattern)) {
    let data: unknown;
    try {
      data = JSON.parse(scriptMatch[1] ?? '');
    } catch {
      continue;
    }

    const church = isJsonObject(data) ? data.mainEntityOfPage : undefined;
    if (!isJsonObject(church) || church['@type'] !== 'PlaceOfWorship') continue;

    const address = isJsonObject(church.address) ? church.address : {};
    const street = nonEmptyStringOrNull(address.streetAddress);

    return {
      name: nonEmptyStringOrNull(church.name) ?? '',
      // Drop a dangling trailing comma from the street line.
      streetAddress: street ? street.replace(/,\s*$/, '').trim() || null : null,
      city: nonEmptyStringOrNull(address.addressLocality),
      region: nonEmptyStringOrNull(address.addressRegion),
    };
  }

  return null;
}

/**
 * Fallback name extraction from the page heading, e.g.
 * `<h1>Quiapo Church mass schedule 2026 - Minor Basilica of the Black Nazarene</h1>`.
 *
 * The "mass schedule YYYY" phrase is removed, whitespace is collapsed, and a
 * leading or trailing " - " separator is stripped. The heading may carry
 * attributes (`<h1 class="...">`).
 *
 * @returns The cleaned name, or `null` when there is no `<h1>` or nothing is left.
 */
function parseChurchNameFromH1(html: string): string | null {
  const h1Match = html.match(/<h1\b[^>]*>([^<]+)<\/h1>/i);
  if (!h1Match) return null;

  const name = (h1Match[1] ?? '')
    // Replace with a space so the words on either side are not glued together.
    .replace(/\bmass\s+schedule\s+\d{4}\b/i, ' ')
    .replace(/\s+/g, ' ')
    .replace(/^\s*-\s*/, '')
    .replace(/\s*-\s*$/, '')
    .trim();

  return name || null;
}

/**
 * Derive a weekly schedule from Schema.org Event microdata.
 *
 * Pattern: `itemprop="startDate" content="2026-02-22T05:00:00+08:00"`.
 * The weekday is computed from the calendar date as written and the time is
 * the wall-clock `HH:MM` as written, so neither depends on the timezone of
 * the machine running the import. Repeated (weekday, time) pairs are
 * reported once; timestamps with an impossible date or time are skipped.
 *
 * @returns Unique schedule entries in page order (`dayOfWeek`: 0 = Sunday … 6 = Saturday).
 */
function parseScheduleFromStartDates(html: string): ParsedSchedule[] {
  const schedules: ParsedSchedule[] = [];
  const seen = new Set<string>();

  const regex = /itemprop="startDate"\s+content="(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):\d{2}[^"]*"/g;
  for (const match of html.matchAll(regex)) {
    const year = Number(match[1]);
    const month = Number(match[2]);
    const day = Number(match[3]);
    const hours = match[4] ?? '';
    const minutes = match[5] ?? '';

    // A date that does not round-trip (month 13, Feb 30, ...) is not a real calendar date.
    const date = new Date(Date.UTC(year, month - 1, day));
    const isRealDate =
      date.getUTCFullYear() === year && date.getUTCMonth() === month - 1 && date.getUTCDate() === day;
    if (!isRealDate || Number(hours) > 23 || Number(minutes) > 59) continue;

    const dayOfWeek = date.getUTCDay(); // 0=Sun, 1=Mon, ..., 6=Sat
    const time = `${hours}:${minutes}`;
    const key = `${dayOfWeek}:${time}`;

    if (seen.has(key)) continue;
    seen.add(key);

    schedules.push({ dayOfWeek, time });
  }

  return schedules;
}

// ─── Database Operations ─────────────────────────────────────────────────────

/**
 * Load every church row with country "PH", selecting the fields the
 * matcher's `ExistingChurch` shape needs (all source IDs, including
 * `discovermassId`).
 */
async function loadExistingPhilippineChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Philippine churches for deduplication...');
  const churches = await prisma.church.findMany({
    where: { country: 'PH' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });
  console.log(`Loaded ${churches.length} existing Philippine churches`);
  return churches;
}
matched = existingByPhilmass || null; + + // If no philmassId match, try name-based matching against churches with coordinates + if (!matched) { + // Try matching by name similarity against all PH churches + // We can't use findDuplicateChurch() without coordinates, so do name-only matching + const normalizedName = churchName.toLowerCase() + .replace(/\bst\.\s/g, 'saint ') + .replace(/\bst\s/g, 'saint ') + .replace(/\bcatholic church\b/g, '') + .replace(/\bparish\b/g, '') + .replace(/\bchurch\b/g, '') + .replace(/[^\w\s]/g, '') + .replace(/\s+/g, ' ') + .trim(); + + // Filter to churches in the same city if possible + const cityName = jsonLd?.city || churchUrl.city.replace(/-/g, ' '); + const candidatesInCity = existingChurches.filter((c) => { + if (!c.address) return false; + return c.address.toLowerCase().includes(cityName.toLowerCase()); + }); + + // Search in-city candidates first, then all PH churches + const searchPools = candidatesInCity.length > 0 + ? [candidatesInCity, existingChurches] + : [existingChurches]; + + for (const searchPool of searchPools) { + if (matched) break; + for (const existing of searchPool) { + const existingNorm = existing.name.toLowerCase() + .replace(/\bst\.\s/g, 'saint ') + .replace(/\bst\s/g, 'saint ') + .replace(/\bcatholic church\b/g, '') + .replace(/\bparish\b/g, '') + .replace(/\bchurch\b/g, '') + .replace(/[^\w\s]/g, '') + .replace(/\s+/g, ' ') + .trim(); + + // Require strong name match: one name contains the other, or very similar + // Guard against overly generic names ("chapel", "holy", etc.) by requiring + // that the shorter name is at least 8 chars after normalization + const shorter = normalizedName.length <= existingNorm.length ? 
normalizedName : existingNorm; + if (shorter.length >= 8) { + if (normalizedName.includes(existingNorm) || existingNorm.includes(normalizedName)) { + matched = existing; + break; + } + } + } + } + } + + if (dryRun) { + if (matched) { + stats.churchesMatched++; + console.log(` [MATCH] "${churchName}" → existing "${matched.name}" (${matched.id})`); + } else { + stats.churchesUnmatched++; + unmatchedLog.push(`${churchName} | ${jsonLd?.city || churchUrl.city} | ${churchUrl.url}`); + console.log(` [UNMATCHED] "${churchName}" in ${jsonLd?.city || churchUrl.city}`); + } + if (schedules.length > 0) { + stats.massSchedulesCreated += schedules.length; + } + return; + } + + if (!matched) { + stats.churchesUnmatched++; + unmatchedLog.push(`${churchName} | ${jsonLd?.city || churchUrl.city} | ${churchUrl.url}`); + return; + } + + stats.churchesMatched++; + + // Update existing church: set philmassId, fill missing fields + const updateData: Record<string, unknown> = { + philmassId: churchUrl.slug, + }; + + if (!matched.address && jsonLd?.streetAddress) { + const fullAddress = [jsonLd.streetAddress, jsonLd.city, jsonLd.region] + .filter(Boolean).join(', '); + updateData.address = fullAddress; + } + + // Fill city/state from JSON-LD or URL + const dbRecord = await prisma.church.findUnique({ + where: { id: matched.id }, + select: { city: true, state: true }, + }); + if (dbRecord && !dbRecord.city && (jsonLd?.city || churchUrl.city)) { + updateData.city = jsonLd?.city || churchUrl.city.replace(/-/g, ' '); + } + if (dbRecord && !dbRecord.state && (jsonLd?.region || churchUrl.province)) { + updateData.state = jsonLd?.region || churchUrl.province.replace(/-/g, ' '); + } + + try { + await prisma.church.update({ + where: { id: matched.id }, + data: updateData, + }); + } catch (error) { + if (error instanceof Error && error.message.includes('Unique constraint')) { + stats.churchesSkipped++; + return; + } + throw error; + } + + // Replace mass schedules if we have any + if (schedules.length 
> 0) { + try { + await prisma.$transaction(async (tx) => { + await tx.massSchedule.deleteMany({ where: { churchId: matched!.id } }); + await tx.massSchedule.createMany({ + data: schedules.map((s) => ({ + churchId: matched!.id, + dayOfWeek: s.dayOfWeek, + time: s.time, + language: 'English', + })), + }); + await tx.church.update({ + where: { id: matched!.id }, + data: { lastScrapedAt: new Date() }, + }); + }); + stats.schedulesUpdated++; + stats.massSchedulesCreated += schedules.length; + } catch (error) { + stats.errors++; + console.error(` Error saving schedules for ${churchUrl.slug}: ${error instanceof Error ? error.message : error}`); + } + } +} + +// ─── CLI ───────────────────────────────────────────────────────────────────── + +function parseArgs(): CLIArgs { + const args = process.argv.slice(2); + const result: CLIArgs = { + all: false, + dryRun: false, + }; + + for (let i = 0; i < args.length; i++) { + switch (args[i]) { + case '--all': + result.all = true; + break; + case '--province': + result.province = args[++i]; + break; + case '--dry-run': + result.dryRun = true; + break; + case '--resume-from': + result.resumeFrom = args[++i]; + break; + case '--job-id': + result.jobId = args[++i]; + break; + case '--help': + case '-h': + console.log(` +Usage: npx tsx scripts/import-philmass.ts [options] + +Options: + --all Import from all provinces + --province <name> Import from a single province (e.g. 
/**
 * Render an elapsed time given in milliseconds as a compact "Xh Ym Zs" string.
 *
 * Leading units that are zero are omitted ("5m 3s", "42s"); fractions of a
 * second are discarded.
 *
 * @param ms Elapsed time in milliseconds.
 * @returns Human-readable duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const wholeHours = Math.floor(totalMinutes / 60);

  if (wholeHours > 0) {
    return [`${wholeHours}h`, `${totalMinutes % 60}m`, `${totalSeconds % 60}s`].join(' ');
  }
  if (totalMinutes > 0) {
    return [`${totalMinutes}m`, `${totalSeconds % 60}s`].join(' ');
  }
  return `${totalSeconds}s`;
}
'YES (no DB writes)' : 'NO'}`); + if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`); + console.log(`Time: ${new Date().toISOString()}`); + console.log('='.repeat(70) + '\n'); + + // Update background job status if provided + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { status: 'running', startedAt: new Date() }, + }); + } catch { + // Job might not exist yet + } + } + + // Load existing Philippine churches for dedup + const existingChurches = await loadExistingPhilippineChurches(); + + // ─── Phase 1: Discover all church URLs ─────────────────────────────────── + + console.log('=== Phase 1: Discovering church URLs ===\n'); + + const allProvinces = await fetchProvinceUrls(); + console.log(`Found ${allProvinces.length} provinces\n`); + + // Filter to requested provinces + let provincesToProcess: ProvinceInfo[]; + if (args.province) { + const found = allProvinces.find((p) => p.name === args.province); + if (!found) { + console.error(`Province "${args.province}" not found. 
Available: ${allProvinces.map((p) => p.name).join(', ')}`); + process.exit(1); + } + provincesToProcess = [found]; + } else { + provincesToProcess = allProvinces; + } + + // Handle --resume-from + if (args.resumeFrom) { + const idx = provincesToProcess.findIndex((p) => p.name === args.resumeFrom); + if (idx === -1) { + console.error(`Resume province "${args.resumeFrom}" not found.`); + process.exit(1); + } + console.log(`Resuming from province "${args.resumeFrom}" (skipping ${idx} provinces)\n`); + provincesToProcess = provincesToProcess.slice(idx); + } + + // Collect all unique church URLs across all provinces/cities + const allChurchUrls = new Map<string, ChurchUrl>(); // keyed by URL to deduplicate + + const stats: ImportStats = { + provincesProcessed: 0, + citiesProcessed: 0, + churchUrlsDiscovered: 0, + churchesProcessed: 0, + churchesMatched: 0, + churchesUnmatched: 0, + churchesSkipped: 0, + schedulesUpdated: 0, + massSchedulesCreated: 0, + errors: 0, + }; + + for (let pi = 0; pi < provincesToProcess.length; pi++) { + const province = provincesToProcess[pi]; + const elapsed = formatDuration(Date.now() - startTime); + console.log(`[${pi + 1}/${provincesToProcess.length}] Province: ${province.name} [${elapsed} elapsed]`); + + try { + // Get city listing URLs from province page + const cityUrls = await fetchCityListingUrls(province.url, province.name); + console.log(` Found ${cityUrls.length} city listing pages`); + + for (const cityUrl of cityUrls) { + const churchUrls = await fetchChurchUrlsFromCityPage(cityUrl, province.name); + stats.citiesProcessed++; + + for (const church of churchUrls) { + if (!allChurchUrls.has(church.url)) { + allChurchUrls.set(church.url, church); + } + } + } + + stats.provincesProcessed++; + console.log(` Total unique churches so far: ${allChurchUrls.size}`); + } catch (error) { + stats.errors++; + console.error(` ERROR discovering ${province.name}: ${error instanceof Error ? 
error.message : error}`); + } + } + + stats.churchUrlsDiscovered = allChurchUrls.size; + console.log(`\nDiscovery complete: ${allChurchUrls.size} unique church URLs across ${stats.citiesProcessed} city pages\n`); + + // ─── Phase 2: Process each church ───────────────────────────────────────── + + console.log('=== Phase 2: Processing churches ===\n'); + + const churchList = [...allChurchUrls.values()]; + const unmatchedLog: string[] = []; + + for (let i = 0; i < churchList.length; i++) { + const church = churchList[i]; + const elapsed = formatDuration(Date.now() - startTime); + if ((i + 1) % 50 === 0 || i === 0) { + console.log(`[${i + 1}/${churchList.length}] Processing churches... [${elapsed} elapsed]`); + } + + try { + await processChurch(church, existingChurches, unmatchedLog, args.dryRun, stats); + } catch (error) { + stats.errors++; + console.error(` ERROR processing ${church.slug}: ${error instanceof Error ? error.message : error}`); + } + } + + // Print summary + const totalTime = Date.now() - startTime; + console.log('\n' + '='.repeat(70)); + console.log(`IMPORT SUMMARY ${args.dryRun ? 
'(DRY RUN)' : ''}`); + console.log('='.repeat(70)); + console.log(`Provinces processed: ${stats.provincesProcessed}`); + console.log(`Cities processed: ${stats.citiesProcessed}`); + console.log(`Church URLs discovered: ${stats.churchUrlsDiscovered}`); + console.log(`Churches processed: ${stats.churchesProcessed}`); + console.log(` Matched (updated): ${stats.churchesMatched}`); + console.log(` Unmatched (skipped): ${stats.churchesUnmatched}`); + console.log(` Skipped (other): ${stats.churchesSkipped}`); + console.log(`Schedules updated: ${stats.schedulesUpdated}`); + console.log(`Mass schedules created: ${stats.massSchedulesCreated}`); + console.log(`Errors: ${stats.errors}`); + console.log(`Total time: ${formatDuration(totalTime)}`); + console.log(`HTTP requests: ${requestCount}`); + console.log('='.repeat(70)); + + // Log unmatched churches for manual review + if (unmatchedLog.length > 0) { + console.log(`\nUnmatched churches (${unmatchedLog.length}):`); + console.log('-'.repeat(70)); + for (const line of unmatchedLog) { + console.log(` ${line}`); + } + console.log('-'.repeat(70)); + } + + console.log(''); + + // Update background job + if (args.jobId) { + try { + await prisma.backgroundJob.update({ + where: { id: args.jobId }, + data: { + status: stats.errors > 0 ? 
'completed_with_errors' : 'completed', + completedAt: new Date(), + result: JSON.stringify(stats), + }, + }); + } catch { + // Ignore + } + } +} + +main() + .catch((error) => { + console.error('Fatal error:', error); + process.exit(1); + }) + .finally(async () => { + await prisma.$disconnect(); + await pool.end(); + }); diff --git a/scripts/import-weekdaymasses.ts b/scripts/import-weekdaymasses.ts new file mode 100644 index 0000000..e5af79a --- /dev/null +++ b/scripts/import-weekdaymasses.ts @@ -0,0 +1,1121 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from weekdaymasses.org.uk + * + * weekdaymasses.org.uk covers ~4,000+ churches globally (GB, Ireland, and 49+ + * international countries). All data is served on single HTML pages per area. + * + * Import strategy: + * 1. Fetch area pages (gb, ireland, outside-gb) + * 2. Parse `.church` divs for name, coordinates, address, phone, website, mass times + * 3. Convert mass times from H.MMam/pm to HH:MM 24h format + * 4. Detect country from address patterns (for outside-gb) + * 5. 
Match against existing churches, upsert with mass schedules + * + * Usage: + * npx tsx scripts/import-weekdaymasses.ts --all + * npx tsx scripts/import-weekdaymasses.ts --area gb + * npx tsx scripts/import-weekdaymasses.ts --area outside-gb --dry-run + * npx tsx scripts/import-weekdaymasses.ts --all --resume-from 500 + * npx tsx scripts/import-weekdaymasses.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const SITE_BASE = 'https://weekdaymasses.org.uk'; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; + +const AREA_PAGES: Record<string, { url: string; defaultCountry: string }> = { + 'gb': { url: '/en/area/gb/churches', defaultCountry: 'GB' }, + 'ireland': { url: '/en/area/ireland/churches', defaultCountry: 'IE' }, + 'outside-gb': { url: '/en/area/outside-gb/churches', defaultCountry: '' }, // needs detection +}; + +// Known languages that may appear in parentheses after mass times +const KNOWN_LANGUAGES = new Set([ + 'english', 'tamil', 'sinhala', 'sinhalese', 'french', 'spanish', 'portuguese', + 
'polish', 'italian', 'german', 'latin', 'korean', 'japanese', 'chinese', + 'mandarin', 'cantonese', 'tagalog', 'filipino', 'hindi', 'malayalam', + 'konkani', 'telugu', 'kannada', 'marathi', 'bengali', 'urdu', 'arabic', + 'vietnamese', 'indonesian', 'malay', 'dutch', 'hungarian', 'czech', 'slovak', + 'slovenian', 'croatian', 'swahili', 'igbo', 'yoruba', 'ga', 'twi', 'ewe', + 'shona', 'zulu', 'sesotho', 'afrikaans', +]); + +// Country name patterns (matched anywhere in address, no $ anchor — addresses have trailing \r\n) +const COUNTRY_NAME_MAP: Record<string, string> = { + 'india': 'IN', 'sri lanka': 'LK', 'france': 'FR', 'italy': 'IT', 'spain': 'ES', + 'portugal': 'PT', 'germany': 'DE', 'south korea': 'KR', 'korea': 'KR', 'japan': 'JP', + 'philippines': 'PH', 'singapore': 'SG', 'malaysia': 'MY', 'hong kong': 'HK', + 'thailand': 'TH', 'indonesia': 'ID', 'vietnam': 'VN', 'pakistan': 'PK', + 'bangladesh': 'BD', 'nepal': 'NP', 'myanmar': 'MM', 'nigeria': 'NG', 'ghana': 'GH', + 'kenya': 'KE', 'tanzania': 'TZ', 'uganda': 'UG', 'south africa': 'ZA', + 'australia': 'AU', 'new zealand': 'NZ', 'canada': 'CA', 'belgium': 'BE', + 'netherlands': 'NL', 'luxembourg': 'LU', 'switzerland': 'CH', 'austria': 'AT', + 'poland': 'PL', 'hungary': 'HU', 'czech republic': 'CZ', 'czechia': 'CZ', + 'mexico': 'MX', 'brazil': 'BR', 'argentina': 'AR', 'colombia': 'CO', 'peru': 'PE', + 'chile': 'CL', 'china': 'CN', 'taiwan': 'TW', 'ireland': 'IE', 'malta': 'MT', + 'cyprus': 'CY', 'croatia': 'HR', 'slovenia': 'SI', 'romania': 'RO', 'slovakia': 'SK', + 'senegal': 'SN', 'grenada': 'GD', 'greece': 'GR', 'denmark': 'DK', 'sweden': 'SE', + 'norway': 'NO', 'finland': 'FI', 'iceland': 'IS', 'latvia': 'LV', 'lithuania': 'LT', + 'estonia': 'EE', 'ukraine': 'UA', 'russia': 'RU', 'georgia': 'GE', 'armenia': 'AM', + 'jordan': 'JO', 'lebanon': 'LB', 'israel': 'IL', 'turkey': 'TR', 'egypt': 'EG', + 'morocco': 'MA', 'tunisia': 'TN', 'cameroon': 'CM', 'ethiopia': 'ET', + 'madagascar': 'MG', 'mozambique': 'MZ', 
'zambia': 'ZM', 'zimbabwe': 'ZW', + 'trinidad': 'TT', 'trinidad and tobago': 'TT', 'jamaica': 'JM', 'barbados': 'BB', + 'bahamas': 'BS', 'bermuda': 'BM', 'costa rica': 'CR', 'panama': 'PA', + 'guatemala': 'GT', 'honduras': 'HN', 'el salvador': 'SV', 'nicaragua': 'NI', + 'ecuador': 'EC', 'venezuela': 'VE', 'bolivia': 'BO', 'paraguay': 'PY', 'uruguay': 'UY', + 'puerto rico': 'PR', 'fiji': 'FJ', 'samoa': 'WS', 'tonga': 'TO', 'guam': 'GU', + 'liechtenstein': 'LI', 'monaco': 'MC', 'andorra': 'AD', 'san marino': 'SM', + 'serbia': 'RS', 'bosnia': 'BA', 'montenegro': 'ME', 'north macedonia': 'MK', + 'albania': 'AL', 'kosovo': 'XK', 'bulgaria': 'BG', 'moldova': 'MD', 'belarus': 'BY', + 'kazakhstan': 'KZ', 'uzbekistan': 'UZ', 'kyrgyzstan': 'KG', 'tajikistan': 'TJ', + 'cambodia': 'KH', 'laos': 'LA', 'brunei': 'BN', 'east timor': 'TL', 'timor-leste': 'TL', + 'papua new guinea': 'PG', 'mongolia': 'MN', + 'curaçao': 'CW', 'curacao': 'CW', 'cape verde': 'CV', 'cabo verde': 'CV', + 'the gambia': 'GM', 'gambia': 'GM', + 'congo': 'CD', 'ivory coast': 'CI', "côte d'ivoire": 'CI', 'burkina faso': 'BF', + 'suriname': 'SR', 'guyana': 'GY', 'belize': 'BZ', 'haiti': 'HT', + 'dominican republic': 'DO', 'cuba': 'CU', 'qatar': 'QA', + 'united arab emirates': 'AE', 'u.a.e.': 'AE', 'uae': 'AE', 'dubai': 'AE', 'abu dhabi': 'AE', + 'saudi arabia': 'SA', 'bahrain': 'BH', 'kuwait': 'KW', 'oman': 'OM', + 'antigua and barbuda': 'AG', 'antigua': 'AG', + 'mauritius': 'MU', 'réunion': 'RE', 'reunion': 'RE', 'seychelles': 'SC', + 'saint lucia': 'LC', 'st. lucia': 'LC', 'dominica': 'DM', + 'saint vincent': 'VC', 'st. vincent': 'VC', + 'saint kitts': 'KN', 'st. kitts': 'KN', + 'u.s. 
virgin islands': 'VI', 'us virgin islands': 'VI', 'saint croix': 'VI', + 'saint thomas': 'VI', 'virgin islands': 'VI', + 'aruba': 'AW', 'bonaire': 'BQ', 'sint maarten': 'SX', + 'iraq': 'IQ', 'iran': 'IR', 'afghanistan': 'AF', + 'macao': 'MO', 'macau': 'MO', +}; + +// City/region-based detection fallback (for addresses without country names) +const CITY_COUNTRY_MAP: Record<string, string> = { + // Major cities that unambiguously identify a country + 'jakarta': 'ID', 'surabaya': 'ID', 'bandung': 'ID', 'yogyakarta': 'ID', + 'budapest': 'HU', 'berlin': 'DE', 'münchen': 'DE', 'munich': 'DE', 'hamburg': 'DE', + 'köln': 'DE', 'frankfurt': 'DE', 'düsseldorf': 'DE', 'stuttgart': 'DE', + 'paris': 'FR', 'lyon': 'FR', 'marseille': 'FR', 'toulouse': 'FR', 'lille': 'FR', + 'nantes': 'FR', 'bordeaux': 'FR', 'strasbourg': 'FR', 'rennes': 'FR', + 'roma': 'IT', 'rome': 'IT', 'milano': 'IT', 'milan': 'IT', 'napoli': 'IT', + 'torino': 'IT', 'firenze': 'IT', 'florence': 'IT', 'bologna': 'IT', 'genova': 'IT', + 'madrid': 'ES', 'barcelona': 'ES', 'valencia': 'ES', 'sevilla': 'ES', 'seville': 'ES', + 'málaga': 'ES', 'bilbao': 'ES', 'mallorca': 'ES', 'tenerife': 'ES', + 'lisboa': 'PT', 'lisbon': 'PT', 'porto': 'PT', 'faro': 'PT', + 'warszawa': 'PL', 'warsaw': 'PL', 'kraków': 'PL', 'krakow': 'PL', + 'praha': 'CZ', 'prague': 'CZ', 'brno': 'CZ', + 'wien': 'AT', 'vienna': 'AT', 'innsbruck': 'AT', 'salzburg': 'AT', 'graz': 'AT', + 'zürich': 'CH', 'zurich': 'CH', 'genève': 'CH', 'geneva': 'CH', 'bern': 'CH', 'basel': 'CH', + 'amsterdam': 'NL', 'rotterdam': 'NL', 'den haag': 'NL', + 'bruxelles': 'BE', 'brussels': 'BE', 'brugge': 'BE', 'antwerpen': 'BE', + 'københavn': 'DK', 'copenhagen': 'DK', 'aarhus': 'DK', 'aalborg': 'DK', + 'stockholm': 'SE', 'göteborg': 'SE', 'malmö': 'SE', + 'oslo': 'NO', 'bergen': 'NO', + 'helsinki': 'FI', + 'reykjavik': 'IS', + 'riga': 'LV', 'vilnius': 'LT', 'tallinn': 'EE', + 'kyiv': 'UA', 'київ': 'UA', 'lviv': 'UA', + 'москва': 'RU', 'moscow': 'RU', 'санкт-петербург': 
'RU', 'магадан': 'RU', + 'калуга': 'RU', 'новосибирск': 'RU', 'владивосток': 'RU', + 'tbilisi': 'GE', 'yerevan': 'AM', + 'amman': 'JO', 'beirut': 'LB', 'istanbul': 'TR', 'ankara': 'TR', + 'cairo': 'EG', 'casablanca': 'MA', 'tunis': 'TN', + 'nairobi': 'KE', 'dar es salaam': 'TZ', 'kampala': 'UG', 'lagos': 'NG', 'accra': 'GH', + 'johannesburg': 'ZA', 'cape town': 'ZA', 'durban': 'ZA', 'pretoria': 'ZA', + 'seoul': 'KR', 'busan': 'KR', 'tokyo': 'JP', 'osaka': 'JP', 'yokohama': 'JP', + 'nagasaki': 'JP', 'kyoto': 'JP', 'beijing': 'CN', 'shanghai': 'CN', + 'taipei': 'TW', 'mumbai': 'IN', 'chennai': 'IN', 'kolkata': 'IN', 'delhi': 'IN', + 'new delhi': 'IN', 'bangalore': 'IN', 'bengaluru': 'IN', 'hyderabad': 'IN', 'goa': 'IN', + 'colombo': 'LK', 'matara': 'LK', 'kandy': 'LK', 'galle': 'LK', + 'kuala lumpur': 'MY', 'penang': 'MY', + 'manila': 'PH', 'cebu': 'PH', + 'bangkok': 'TH', 'chiang mai': 'TH', + 'hà nội': 'VN', 'hanoi': 'VN', 'ho chi minh': 'VN', 'saigon': 'VN', + 'phnom penh': 'KH', 'vientiane': 'LA', + 'sydney': 'AU', 'melbourne': 'AU', 'brisbane': 'AU', 'perth': 'AU', 'adelaide': 'AU', + 'auckland': 'NZ', 'wellington': 'NZ', 'christchurch': 'NZ', + 'toronto': 'CA', 'vancouver': 'CA', 'montreal': 'CA', 'ottawa': 'CA', + 'mexico city': 'MX', 'guadalajara': 'MX', 'monterrey': 'MX', + 'são paulo': 'BR', 'rio de janeiro': 'BR', 'brasília': 'BR', + 'buenos aires': 'AR', 'bogotá': 'CO', 'lima': 'PE', 'santiago': 'CL', + 'vaduz': 'LI', 'monaco': 'MC', + 'valletta': 'MT', 'nicosia': 'CY', 'zagreb': 'HR', 'ljubljana': 'SI', + 'bratislava': 'SK', 'bucharest': 'RO', 'sofia': 'BG', 'belgrade': 'RS', + 'nadi': 'FJ', 'suva': 'FJ', + 'san juan': 'PR', 'viejo san juan': 'PR', + // Cities missed in the first pass + 'calais': 'FR', 'lourdes': 'FR', 'nice': 'FR', 'montpellier': 'FR', 'toulon': 'FR', + 'abidjan': 'CI', 'douala': 'CM', 'yaoundé': 'CM', 'kinshasa': 'CD', 'lusaka': 'ZM', + 'harare': 'ZW', 'maputo': 'MZ', 'antananarivo': 'MG', 'dakar': 'SN', + 'pademangan': 'ID', 'jakarta 
utara': 'ID', 'denpasar': 'ID', 'semarang': 'ID', + 'makassar': 'ID', 'medan': 'ID', 'bogor': 'ID', 'malang': 'ID', 'palembang': 'ID', + '서울': 'KR', '부산': 'KR', // Seoul, Busan in Korean + // Japanese city names in kanji + '東京': 'JP', '大阪': 'JP', '横浜': 'JP', '名古屋': 'JP', '長崎': 'JP', + '京都': 'JP', '神戸': 'JP', '福岡': 'JP', '札幌': 'JP', '仙台': 'JP', '広島': 'JP', + // Chinese city names in hanzi + '北京': 'CN', '上海': 'CN', '深圳': 'CN', '广州': 'CN', '香港': 'HK', + // More missing cities + 'kuching': 'MY', 'kota kinabalu': 'MY', 'ipoh': 'MY', 'johor bahru': 'MY', 'sarawak': 'MY', + 'trondheim': 'NO', 'stavanger': 'NO', 'tromsø': 'NO', + 'taastrup': 'DK', 'odense': 'DK', + 'cancún': 'MX', 'playa del carmen': 'MX', 'mérida': 'MX', 'puebla': 'MX', 'cancun': 'MX', + 'addis ababa': 'ET', + 'la paz': 'BO', 'cochabamba': 'BO', 'santa cruz': 'BO', + 'willemstad': 'CW', 'curaçao': 'CW', 'curacao': 'CW', + 'port of spain': 'TT', 'bridgetown': 'BB', 'nassau': 'BS', + 'phnom penh': 'KH', 'siem reap': 'KH', + 'port moresby': 'PG', + 'ulaanbaatar': 'MN', + 'praia': 'CV', 'cape verde': 'CV', + 'celebration': 'US', // Celebration, Florida — city not great, but helps + 'the gambia': 'GM', 'gambia': 'GM', 'banjul': 'GM', + 'playa blanca': 'ES', 'gran canaria': 'ES', 'fuerteventura': 'ES', 'lanzarote': 'ES', + 'tirana': 'AL', 'durrës': 'AL', + 'podgorica': 'ME', 'budva': 'ME', + 'skopje': 'MK', 'pristina': 'XK', 'sarajevo': 'BA', + 'minsk': 'BY', 'chișinău': 'MD', 'chisinau': 'MD', + 'bishkek': 'KG', 'dushanbe': 'TJ', 'tashkent': 'UZ', 'almaty': 'KZ', 'astana': 'KZ', + 'lekki': 'NG', 'abuja': 'NG', 'enugu': 'NG', 'yaba': 'NG', 'ikeja': 'NG', + // Serbian + 'beograd': 'RS', 'novi sad': 'RS', + // Thai + 'phuket': 'TH', 'pattaya': 'TH', 'hua hin': 'TH', + // Spanish cities + 'alicante': 'ES', 'zaragoza': 'ES', 'murcia': 'ES', 'palma': 'ES', + 'granada': 'ES', 'córdoba': 'ES', 'santander': 'ES', 'cádiz': 'ES', + 'san sebastián': 'ES', 'las palmas': 'ES', 'santa cruz de tenerife': 'ES', + // Belgian + 
'woluwe': 'BE', 'ixelles': 'BE', 'schaerbeek': 'BE', 'liège': 'BE', 'namur': 'BE', + // Portuguese + 'loulé': 'PT', 'albufeira': 'PT', 'coimbra': 'PT', 'braga': 'PT', 'funchal': 'PT', + // Turkish + 'mersin': 'TR', 'izmir': 'TR', 'antalya': 'TR', 'trabzon': 'TR', + // Lebanese (French spelling) + 'beyrouth': 'LB', + // Burkina Faso + 'ouagadougou': 'BF', 'bobo-dioulasso': 'BF', + // Greek + 'heraklion': 'GR', 'ηράκλειο': 'GR', 'μυτιλήνη': 'GR', 'αθήνα': 'GR', + 'athens': 'GR', 'thessaloniki': 'GR', 'patras': 'GR', + // Bulgarian (transliterated) + 'plovdiv': 'BG', 'пловдив': 'BG', 'варна': 'BG', + // Vietnamese with diacritics + 'sài gòn': 'VN', 'hồ chí minh': 'VN', 'đà nẵng': 'VN', + // Moldovan + 'chişinău': 'MD', + // Hungarian + 'ferenciek': 'HU', 'debrecen': 'HU', 'szeged': 'HU', 'pécs': 'HU', + // Polish cities + 'kalisz': 'PL', 'gdańsk': 'PL', 'wrocław': 'PL', 'poznań': 'PL', 'łódź': 'PL', + 'katowice': 'PL', 'lublin': 'PL', 'szczecin': 'PL', + // Bermuda + 'warwick': 'BM', + // Maltese + 'sliema': 'MT', 'valletta': 'MT', +}; + +// Postal code / state code patterns +const POSTAL_PATTERNS: Array<{ pattern: RegExp; country: string }> = [ + { pattern: /\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b/, country: 'GB' }, // UK postcode + { pattern: /\b[A-Z]\d{2}\s*[A-Z0-9]{4}\b/, country: 'IE' }, // Irish Eircode + { pattern: /\bCittà del Vaticano\b/i, country: 'VA' }, + { pattern: /\b\d{3}\s*\d{3}\b/, country: 'IN' }, // Indian 6-digit with optional space + { pattern: /\bNSW\s+\d{4}\b/, country: 'AU' }, // Australian state codes + { pattern: /\bVIC\s+\d{4}\b/, country: 'AU' }, + { pattern: /\bQLD\s+\d{4}\b/, country: 'AU' }, + { pattern: /\bSA\s+\d{4}\b/, country: 'AU' }, + { pattern: /\bWA\s+\d{4}\b/, country: 'AU' }, + { pattern: /\bTAS\s+\d{4}\b/, country: 'AU' }, + { pattern: /\bACT\s+\d{4}\b/, country: 'AU' }, + { pattern: /\bNT\s+\d{4}\b/, country: 'AU' }, + { pattern: /\bA-\d{4}\b/, country: 'AT' }, // Austrian postal prefix + { pattern: /\b, PR,?\s*\d{5}\b/, 
country: 'PR' }, // Puerto Rico + { pattern: /\b\d{2}-\d{3}\b/, country: 'PL' }, // Polish postal code (XX-XXX) + // US state abbreviation + ZIP (e.g., "NY, 11201" or "NY 11201") + { pattern: /\b(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY)[,\s]+\d{5}\b/, country: 'US' }, +]; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface ParsedChurch { + churchId: string; // weekdaymasses numeric ID + name: string; + latitude: number; + longitude: number; + address: string | null; + phone: string | null; + website: string | null; + country: string; + schedules: ParsedSchedule[]; +} + +interface ParsedSchedule { + dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat + time: string; // "07:00", "18:30" + language: string; + notes: string | null; +} + +interface ImportStats { + churchesParsed: number; + churchesMatched: number; + churchesCreated: number; + churchesSkipped: number; + massSchedulesCreated: number; + errors: number; +} + +interface CLIArgs { + all: boolean; + area?: string; + dryRun: boolean; + resumeFrom: number; + jobId?: string; +} + +// ─── CLI ───────────────────────────────────────────────────────────────────── + +function parseArgs(): CLIArgs { + const args = process.argv.slice(2); + const result: CLIArgs = { all: false, dryRun: false, resumeFrom: 0 }; + + for (let i = 0; i < args.length; i++) { + switch (args[i]) { + case '--all': result.all = true; break; + case '--area': result.area = args[++i]; break; + case '--dry-run': result.dryRun = true; break; + case '--resume-from': result.resumeFrom = parseInt(args[++i], 10); break; + case '--job-id': result.jobId = args[++i]; break; + case '--help': + console.log(`Usage: npx tsx scripts/import-weekdaymasses.ts [options] + --all Import all 3 area pages (gb, ireland, outside-gb) + --area <name> Import specific area (gb, ireland, outside-gb) + --dry-run No database writes 
/**
 * Slice an area page into one HTML fragment per church.
 *
 * A church starts at `<div class="church" id="pNNNNN">`. From there the scan
 * counts nested `<div` openers against `</div>` closers and stops when the
 * header's own div is balanced. If the markup runs out of closers first, the
 * fragment collected so far is returned as-is.
 *
 * @param html Full HTML of an area page.
 * @returns Church fragments in document order.
 */
function extractChurchBlocks(html: string): string[] {
  const OPEN_TAG = '<div';
  const CLOSE_TAG = '</div>';
  const headerPattern = /<div\s+class="church"\s+id="p(\d+)">/g;

  return Array.from(html.matchAll(headerPattern), (header) => {
    const blockStart = header.index ?? 0;
    let cursor = blockStart + header[0].length;
    let openDivs = 1;

    while (openDivs > 0 && cursor < html.length) {
      const closeAt = html.indexOf(CLOSE_TAG, cursor);
      if (closeAt === -1) break;

      const openAt = html.indexOf(OPEN_TAG, cursor);
      const opensFirst = openAt !== -1 && openAt < closeAt;
      if (opensFirst) {
        openDivs += 1;
        cursor = openAt + OPEN_TAG.length;
      } else {
        openDivs -= 1;
        cursor = closeAt + CLOSE_TAG.length;
      }
    }

    return html.slice(blockStart, cursor);
  });
}
/**
 * Parse a single church block HTML into structured data.
 *
 * Fields are pulled out with regular expressions:
 *  - id from `id="pNNNNN"`, name from the `<h3>` (both required);
 *  - coordinates from the `lat=…&lon=…` map link (0/0 when absent or unparsable);
 *  - address from the text after the last `<br>` inside `p.address`;
 *  - phone from `p.telephone`, website from the "Link to church website:" anchor;
 *  - country from `defaultCountry`, else detected from the address, else 'XX'.
 *
 * All extracted text (name, address, phone, website URL) is passed through
 * `decodeHtmlEntities`, so escaped ampersands and quotes in the page markup do
 * not end up in stored values.
 *
 * @param html One fragment as produced by `extractChurchBlocks`.
 * @param defaultCountry ISO country code for the area, or '' to detect it.
 * @returns The parsed church, or null when the id or name is missing.
 */
function parseChurchBlock(html: string, defaultCountry: string): ParsedChurch | null {
  // Church ID from div id="pNNNNN"
  const idMatch = html.match(/id="p(\d+)"/);
  if (!idMatch) return null;
  const churchId = idMatch[1];

  // Name from h3
  const nameMatch = html.match(/<h3>(.*?)<\/h3>/s);
  if (!nameMatch) return null;
  const name = decodeHtmlEntities(nameMatch[1].trim());

  // Coordinates from map link; keep the 0/0 default unless both values parse,
  // so a degenerate link such as "lat=." never stores NaN.
  let latitude = 0;
  let longitude = 0;
  const mapMatch = html.match(/lat=(-?[\d.]+)&(?:amp;)?lon=(-?[\d.]+)/);
  if (mapMatch) {
    const parsedLat = parseFloat(mapMatch[1]);
    const parsedLon = parseFloat(mapMatch[2]);
    if (Number.isFinite(parsedLat) && Number.isFinite(parsedLon)) {
      latitude = parsedLat;
      longitude = parsedLon;
    }
  }

  // Address from p.address — text after the last <br> tag (which follows the map/Streetview links)
  let address: string | null = null;
  const addressMatch = html.match(/<p\s+class="address">([\s\S]*?)<\/p>/);
  if (addressMatch) {
    const addressHtml = addressMatch[1];
    const brIdx = addressHtml.lastIndexOf('<br');
    if (brIdx !== -1) {
      const textAfterTag = addressHtml.substring(brIdx).replace(/<br\s*\/?>/, '').trim();
      // Strip tags first, then decode, so a decoded "<" is never mistaken for markup.
      address = decodeHtmlEntities(stripHtmlTags(textAfterTag)).trim() || null;
    }
  }

  // Phone from p.telephone
  let phone: string | null = null;
  const phoneMatch = html.match(/<p\s+class="telephone">[\s\S]*?Tel:<\/span>\s*(.*?)<\/p>/);
  if (phoneMatch) {
    phone = decodeHtmlEntities(phoneMatch[1]).trim() || null;
  }

  // Website from p.transport with "Link to church website:"
  let website: string | null = null;
  const websiteMatch = html.match(/Link to church website:<\/span>\s*<a[^>]+href="([^"]+)"/);
  if (websiteMatch) {
    // href values are attribute-escaped in the page; decode to get the real URL.
    website = decodeHtmlEntities(websiteMatch[1]);
  }

  // Country detection
  let country = defaultCountry;
  if (!country && address) {
    country = detectCountry(address);
  }
  if (!country) country = 'XX'; // Unknown

  // Mass schedules from p.times
  const schedules = parseScheduleBlocks(html);

  return { churchId, name, latitude, longitude, address, phone, website, country, schedules };
}
/** True when `needle` occurs in `haystack` with no letter or digit directly before or after it. */
function hasWholeWord(haystack: string, needle: string): boolean {
  if (!needle) return false;
  const wordChar = /[\p{L}\p{N}]/u;
  for (let idx = haystack.indexOf(needle); idx !== -1; idx = haystack.indexOf(needle, idx + 1)) {
    // charAt returns '' outside the string, which never counts as a word character.
    const before = haystack.charAt(idx - 1);
    const after = haystack.charAt(idx + needle.length);
    if (!wordChar.test(before) && !wordChar.test(after)) return true;
  }
  return false;
}

/** True when any occurrence of a US state name in `lower` is followed by text matching `suffix`. */
function hasUsStateFollowedBy(lower: string, suffix: RegExp, skipGeorgia: boolean): boolean {
  const US_STATES = [
    'alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado',
    'connecticut', 'delaware', 'florida', 'hawaii', 'idaho',
    'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana',
    'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota',
    'mississippi', 'missouri', 'montana', 'nebraska', 'nevada',
    'new hampshire', 'new jersey', 'new mexico', 'new york', 'north carolina',
    'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania',
    'rhode island', 'south carolina', 'south dakota', 'tennessee', 'texas',
    'utah', 'vermont', 'virginia', 'washington', 'west virginia',
    'wisconsin', 'wyoming', 'georgia', // also a country — see skipGeorgia
  ];
  return US_STATES.some((state) => {
    if (skipGeorgia && state === 'georgia') return false;
    // Examine every occurrence: "Indianapolis, Indiana" only qualifies on the second hit.
    for (let idx = lower.indexOf(state); idx !== -1; idx = lower.indexOf(state, idx + 1)) {
      if (suffix.test(lower.substring(idx + state.length))) return true;
    }
    return false;
  });
}

/**
 * Detect country from address text. Strategies, in order:
 *  1. US state name followed by a 5-digit ZIP ("Georgia 30301", "New Mexico, 87501").
 *     Runs first because a state name can contain or equal a country name.
 *  2. Country name from COUNTRY_NAME_MAP appearing as a whole word (longest names
 *     first). Whole-word matching keeps "india" out of "Indiana", "iran" out of
 *     "Tirana" and "peru" out of "Perugia".
 *  3. City/region name from CITY_COUNTRY_MAP, delimited by whitespace or punctuation.
 *  4. US state name at the very end of the address (Georgia excluded — ambiguous
 *     with the country when no ZIP is present).
 *  5. Postal code / state code patterns from POSTAL_PATTERNS.
 *
 * @param address Free-text address; may contain line breaks.
 * @returns ISO country code, or '' when nothing matched.
 */
function detectCountry(address: string): string {
  // Clean address for matching
  const cleaned = address.replace(/\r?\n/g, ' ').trim();
  const lower = cleaned.toLowerCase();

  // 1. US state + ZIP — strongest signal, and must pre-empt the country pass
  if (hasUsStateFollowedBy(lower, /^[,\s]+\d{5}(?!\d)/, false)) return 'US';

  // 2. Country name as a whole word; longer names first so the most specific entry wins
  const countriesLongestFirst = Object.entries(COUNTRY_NAME_MAP)
    .sort((a, b) => b[0].length - a[0].length);
  for (const [name, code] of countriesLongestFirst) {
    if (hasWholeWord(lower, name)) return code;
  }

  // 3. City/region match (first occurrence, delimited by whitespace/punctuation)
  for (const [city, code] of Object.entries(CITY_COUNTRY_MAP)) {
    const idx = lower.indexOf(city);
    if (idx === -1) continue;
    const end = idx + city.length;
    const startsCleanly = idx === 0 || /[\s,.(]/.test(lower.charAt(idx - 1));
    const endsCleanly = end === lower.length || /[\s,.):\r\n]/.test(lower.charAt(end));
    if (startsCleanly && endsCleanly) return code;
  }

  // 4. Address that simply ends with a US state name
  if (hasUsStateFollowedBy(lower, /^[,\s]*$/, true)) return 'US';

  // 5. Postal code patterns (tested against the original-case text)
  for (const { pattern, country } of POSTAL_PATTERNS) {
    if (pattern.test(cleaned)) return country;
  }

  return '';
}
/**
 * Collect schedule entries from every `<p class="times">` paragraph in a church block.
 *
 * Each paragraph is reduced to plain text (tags removed, whitespace collapsed)
 * and handed to `parseTimesLine`; the results are concatenated in document order.
 *
 * @param html HTML of a single church block.
 * @returns All schedule entries found; empty when there are no times paragraphs.
 */
function parseScheduleBlocks(html: string): ParsedSchedule[] {
  const timesParagraph = /<p\s+class="times">([\s\S]*?)<\/p>/g;

  return Array.from(html.matchAll(timesParagraph)).flatMap((paragraph) => {
    const plainLine = stripHtmlTags(paragraph[1]).replace(/\s+/g, ' ').trim();
    return parseTimesLine(plainLine);
  });
}
/**
 * Parse day label (left of colon) into day numbers (0=Sun … 6=Sat).
 *
 * Recognised forms:
 *  - "Weekday" / "Weekdays"              → Monday–Friday
 *  - anything mentioning a holy day      → [] (not a regular weekly schedule)
 *  - lists: "Mon Tue Wed", "Monday, Tuesday", "Sat/Sun", "Sat & Sun"
 *  - ranges: "Mon-Fri", "Mon – Fri", "Monday to Friday" (wraps past Saturday,
 *    so "Fri-Mon" is Fri, Sat, Sun, Mon)
 *
 * Words that are not day names are ignored. Each day appears at most once,
 * in first-seen order.
 *
 * @param label Text before the colon on a times line.
 * @returns Day numbers, or [] when no day could be recognised.
 */
function parseDayLabel(label: string): number[] {
  const lower = label.toLowerCase().trim();

  // "Weekday" = Mon-Fri
  if (lower === 'weekday' || lower === 'weekdays') {
    return [1, 2, 3, 4, 5];
  }

  // "Holy Day" or "Holyday"
  if (lower.includes('holy day') || lower.includes('holyday')) {
    return []; // Skip holy days — not a regular schedule
  }

  const days: number[] = [];
  const addDay = (day: number): void => {
    if (!days.includes(day)) days.push(day);
  };
  // typeof guard: a bare DAY_MAP[word] would also return inherited members such as "constructor".
  const lookupDay = (word: string): number | undefined => {
    const value: unknown = DAY_MAP[word];
    return typeof value === 'number' ? value : undefined;
  };

  // Collapse range connectors so "mon - fri", "mon – fri" and "mon to fri" all become "mon-fri".
  const normalised = lower.replace(/\s*(?:[-\u2013\u2014]|\bto\b)\s*/g, '-');

  for (const token of normalised.split(/[\s,\/&]+/)) {
    const pieces = token.split('-').filter(Boolean);
    const [fromWord = '', toWord = ''] = pieces;
    const from = pieces.length === 2 ? lookupDay(fromWord) : undefined;
    const to = pieces.length === 2 ? lookupDay(toWord) : undefined;

    if (from !== undefined && to !== undefined) {
      // Walk the week cyclically from `from` to `to`; capped at 7 steps as a safety net.
      for (let step = 0, day = from; step < 7; step++, day = (day + 1) % 7) {
        addDay(day);
        if (day === to) break;
      }
      continue;
    }

    for (const piece of pieces) {
      const day = lookupDay(piece);
      if (day !== undefined) addDay(day);
    }
  }

  return days;
}
/**
 * Check if the time part starts with specific day names.
 * e.g., "Monday, Tuesday, Wednesday 6.15am" -> days=[1,2,3], cleaned="6.15am"
 *
 * Day names (full, three-letter, the common "Tues"/"Thur"/"Thurs"/"Weds"
 * abbreviations, and plurals) are consumed only while they appear
 * contiguously from the start of the string, separated by commas/whitespace.
 * A token counts as a day only when it is not followed by another letter, so
 * words that merely begin with a day abbreviation ("Monthly", "Wedding",
 * "Sunset") are left untouched.
 *
 * @param timePart - One trimmed segment of the text right of the colon.
 * @returns `specificDays` (0 = Sunday, empty when the segment does not start
 *          with a day) and `cleanedTimePart`: the segment with the leading
 *          day names removed, or the unchanged segment when none were found.
 */
function extractSpecificDays(timePart: string): { specificDays: number[]; cleanedTimePart: string } {
  // Sticky: every exec() must match exactly at lastIndex, which enforces
  // "contiguous from the start" without re-slicing the string.
  const dayToken =
    /(mon(?:day)?|tue(?:s|sday)?|wed(?:s|nesday)?|thu(?:r|rs|rsday)?|fri(?:day)?|sat(?:urday)?|sun(?:day)?)s?(?![a-z])[,\s]*/iy;

  const specificDays: number[] = [];
  let consumed = 0;
  let match: RegExpExecArray | null;

  while ((match = dayToken.exec(timePart)) !== null) {
    // Every accepted spelling starts with the three-letter key used by DAY_MAP.
    specificDays.push(DAY_MAP[match[1].slice(0, 3).toLowerCase()]);
    consumed = dayToken.lastIndex;
  }

  return {
    specificDays,
    cleanedTimePart: specificDays.length > 0 ? timePart.substring(consumed).trim() : timePart,
  };
}

/** One time found in a times string, before conversion to 24-hour form. */
interface TimeEntry {
  /** Raw time with whitespace removed: "7.00am", "6.30pm". */
  time: string;
  /** Language of the mass; "English" when the annotation names no known language. */
  language: string;
  /** Parenthesised annotation that is not a known language, otherwise null. */
  notes: string | null;
}
/**
 * Extract time entries from a times string.
 * e.g., "7.00am(Tamil), 8.30am(English), 12.00pm" -> [{time: "7.00am", language: "Tamil"}, ...]
 *
 * A time is `H.MM` or `HH.MM` followed by am/pm (optional whitespace before
 * the suffix), optionally followed by a parenthesised annotation. An
 * annotation found in KNOWN_LANGUAGES becomes the language (capitalised);
 * any other annotation is kept as `notes`. The language defaults to "English".
 *
 * @param text - Times portion of a schedule line.
 * @returns Entries in the order they appear; empty when no time is present.
 */
function extractTimeEntries(text: string): TimeEntry[] {
  const entries: TimeEntry[] = [];
  const pattern = /(\d{1,2}\.\d{2}\s*(?:am|pm))(?:\s*\(([^)]*)\))?/gi;

  for (const match of text.matchAll(pattern)) {
    const rawTime = match[1].replace(/\s/g, '');
    const annotation = match[2]?.trim() || null;

    let language = 'English';
    let notes: string | null = null;

    if (annotation) {
      if (KNOWN_LANGUAGES.has(annotation.toLowerCase())) {
        language = annotation.charAt(0).toUpperCase() + annotation.slice(1).toLowerCase();
      } else {
        notes = annotation;
      }
    }

    entries.push({ time: rawTime, language, notes });
  }

  return entries;
}

/**
 * Convert time from "H.MMam/pm" format to "HH:MM" 24h format.
 *
 * @param time - Raw time without whitespace, e.g. "6.30am" or "12.00PM".
 * @returns Zero-padded "HH:MM", or null when the input does not match the
 *          format, the minutes exceed 59, or the hour exceeds 12 (an hour
 *          above 12 cannot carry an am/pm suffix).
 */
function convertTo24h(time: string): string | null {
  const match = /^(\d{1,2})\.(\d{2})(am|pm)$/i.exec(time);
  if (!match) return null;

  const hour12 = parseInt(match[1], 10);
  const mins = parseInt(match[2], 10);
  if (hour12 > 12 || mins > 59) return null;

  // 12am -> 00, 12pm -> 12; every other hour keeps its value (+12 for pm).
  const isPm = match[3].toLowerCase() === 'pm';
  const hours = (hour12 % 12) + (isPm ? 12 : 0);

  return `${String(hours).padStart(2, '0')}:${String(mins).padStart(2, '0')}`;
}

// ─── Helpers ────────────────────────────────────────────────────────────────

/** Remove every `<…>` tag from `html`, leaving the text between tags untouched. */
function stripHtmlTags(html: string): string {
  return html.replace(/<[^>]+>/g, '');
}

// Named character references understood by decodeHtmlEntities (name -> character).
const NAMED_HTML_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', '\u00a0'],
]);

/**
 * Decode HTML character references in `text`.
 *
 * Handles the named references listed in NAMED_HTML_ENTITIES plus decimal
 * (`#NNN`) and hexadecimal (`#xHH`) numeric references. Decoding happens in a
 * single pass, so text produced by one replacement is never decoded a second
 * time (an escaped ampersand followed by "lt;" stays a literal reference).
 * Unknown names and out-of-range code points are left exactly as written.
 *
 * @param text - Text that may contain character references.
 * @returns The text with recognised references replaced by their characters.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (entity: string, body: string) => {
    if (body.charAt(0) !== '#') {
      return NAMED_HTML_ENTITIES.get(body) ?? entity;
    }
    const isHex = body.charAt(1).toLowerCase() === 'x';
    const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}

// ─── Job Management ─────────────────────────────────────────────────────────

/**
 * Mark an existing background job as running.
 *
 * Despite the name, no job is created here: without a `jobId` the import runs
 * untracked and null is returned.
 *
 * @param jobId - Id of the background job to resume, if any.
 * @returns The same id once the job row has been updated, or null.
 */
async function createOrResumeJob(jobId?: string): Promise<string | null> {
  if (!jobId) return null;

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}

/**
 * Record the final state of a job: `failed` when `error` is given, otherwise
 * `completed`. Best-effort — a failing update is logged, never thrown.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (!jobId) return;
  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        status: error ? 'failed' : 'completed',
        error: error || null,
        completedAt: new Date(),
      },
    });
  } catch (err) {
    console.error(`Failed to update job ${jobId}:`, err);
  }
}

/**
 * Write progress counters to the job row. Best-effort — a failing update is
 * logged, never thrown, so progress reporting cannot abort an import.
 */
async function updateJobProgress(jobId: string | null, stats: ImportStats, total: number, processed: number): Promise<void> {
  if (!jobId) return;
  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        totalItems: total,
        processed,
        succeeded: stats.churchesMatched + stats.churchesCreated,
        failed: stats.errors,
        itemsFound: stats.churchesParsed,
      },
    });
  } catch (err) {
    console.error(`Failed to update job progress:`, err);
  }
}

// ─── Database ───────────────────────────────────────────────────────────────

/**
 * Load the churches used for in-memory deduplication: every church that
 * already carries a weekdayMassesId, plus every church in the listed
 * countries. Only the columns required by ExistingChurch are selected.
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  const churches = await prisma.church.findMany({
    where: {
      OR: [
        { weekdayMassesId: { not: null } },
        { country: { in: ['GB', 'IE', 'IN', 'LK', 'FR', 'IT', 'VA', 'PT', 'ES', 'KR', 'JP', 'PH', 'SG', 'MY', 'HK'] } },
      ],
    },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });
  return churches;
}
// ─── Main Import ────────────────────────────────────────────────────────────

/** True when `error` is the Prisma failure raised for a violated unique constraint. */
function isUniqueConstraintError(error: unknown): boolean {
  return error instanceof Error && error.message.includes('Unique constraint');
}

/** Map parsed schedules to `massSchedule` rows owned by `churchId`. */
function toScheduleRows(churchId: string, schedules: ParsedSchedule[]) {
  return schedules.map((s) => ({
    churchId,
    dayOfWeek: s.dayOfWeek,
    time: s.time,
    language: s.language,
    notes: s.notes,
  }));
}

/**
 * Import every church block of one area page.
 *
 * For each block at or after `resumeFrom` (counted across all areas) the
 * church is parsed and then either merged into the matching existing church
 * (missing contact fields filled in, mass schedules replaced when new ones
 * were parsed) or created. Newly created churches are appended to
 * `existingChurches` so later blocks can match against them. In dry-run mode
 * nothing is written; the first 20 parsed churches are printed instead.
 *
 * @param areaName - Area key, used for logging only.
 * @param config - Area page settings; only `defaultCountry` is read here.
 * @param blocks - Raw HTML church blocks of the area page.
 * @param existingChurches - In-memory dedup list; mutated on create.
 * @param stats - Running counters; mutated.
 * @param dryRun - Parse and log only, no database writes.
 * @param resumeFrom - Absolute index below which blocks are skipped.
 * @param jobId - Background job to report progress to, or null.
 * @param globalProcessed - Number of blocks handled by previous areas.
 * @param globalTotal - Total number of blocks across all areas.
 * @returns `globalProcessed` advanced by the number of blocks in this area.
 */
async function importAreaBlocks(
  areaName: string,
  config: { url: string; defaultCountry: string },
  blocks: string[],
  existingChurches: ExistingChurch[],
  stats: ImportStats,
  dryRun: boolean,
  resumeFrom: number,
  jobId: string | null,
  globalProcessed: number,
  globalTotal: number,
): Promise<number> {
  console.log(`\nProcessing ${areaName}: ${blocks.length} churches`);

  const startTime = Date.now();

  for (let i = 0; i < blocks.length; i++) {
    const absoluteIndex = globalProcessed + i;
    if (absoluteIndex < resumeFrom) continue;

    const church = parseChurchBlock(blocks[i], config.defaultCountry);
    if (!church) {
      stats.errors++;
      continue;
    }
    stats.churchesParsed++;

    if (dryRun) {
      if (stats.churchesParsed <= 20) {
        const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
        console.log(` [${areaName}] ${church.name} (${church.country}) — ${church.schedules.length} schedules, coords: ${church.latitude.toFixed(4)}, ${church.longitude.toFixed(4)} [${elapsed}s]`);
      }
      continue;
    }

    try {
      const candidate = {
        name: church.name,
        lat: church.latitude,
        lng: church.longitude,
        weekdayMassesId: church.churchId,
      };

      const duplicate = findDuplicateChurch(candidate, existingChurches);

      if (duplicate) {
        // Update existing church
        const updateData: Record<string, unknown> = {
          weekdayMassesId: church.churchId,
          lastScrapedAt: new Date(),
        };
        // Only fill in missing fields
        if (!duplicate.phone && church.phone) updateData.phone = church.phone;
        if (!duplicate.website && church.website) {
          updateData.website = church.website;
          updateData.hasWebsite = true;
        }
        if (!duplicate.address && church.address) updateData.address = church.address;
        // Update country if existing is unknown (XX) and we detected a real one
        if (duplicate.country === 'XX' && church.country !== 'XX') {
          updateData.country = church.country;
        }

        try {
          await prisma.church.update({
            where: { id: duplicate.id },
            data: updateData,
          });
        } catch (error) {
          // Another church already owns this weekdayMassesId: count as skipped.
          if (isUniqueConstraintError(error)) {
            stats.churchesSkipped++;
            continue;
          }
          throw error;
        }

        // Replace mass schedules if we have new ones; delete + insert run in
        // one transaction so a failure cannot leave the church without any.
        if (church.schedules.length > 0) {
          await prisma.$transaction(async (tx) => {
            await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.massSchedule.createMany({
              data: toScheduleRows(duplicate.id, church.schedules),
            });
          });
          stats.massSchedulesCreated += church.schedules.length;
        }

        stats.churchesMatched++;
      } else {
        // Create new church
        try {
          const newChurch = await prisma.church.create({
            data: {
              name: church.name,
              latitude: church.latitude,
              longitude: church.longitude,
              address: church.address,
              country: church.country,
              phone: church.phone,
              website: church.website,
              hasWebsite: !!church.website,
              weekdayMassesId: church.churchId,
              source: 'weekdaymasses',
              lastScrapedAt: church.schedules.length > 0 ? new Date() : null,
            },
          });

          // Create mass schedules
          if (church.schedules.length > 0) {
            await prisma.massSchedule.createMany({
              data: toScheduleRows(newChurch.id, church.schedules),
            });
            stats.massSchedulesCreated += church.schedules.length;
          }

          // Add to in-memory dedup list
          existingChurches.push({
            id: newChurch.id,
            name: church.name,
            latitude: church.latitude,
            longitude: church.longitude,
            osmId: null,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: church.churchId,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: null,
            discovermassId: null,
            source: 'weekdaymasses',
            website: church.website,
            phone: church.phone,
            address: church.address,
            country: church.country,
          });

          stats.churchesCreated++;
        } catch (error) {
          if (isUniqueConstraintError(error)) {
            stats.churchesSkipped++;
            continue;
          }
          throw error;
        }
      }
    } catch (error) {
      console.error(` Error processing ${church.name} (${church.churchId}): ${error instanceof Error ? error.message : error}`);
      stats.errors++;
    }

    // Progress logging
    const totalProcessed = absoluteIndex + 1;
    if (totalProcessed % 500 === 0) {
      const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
      console.log(` Progress: ${totalProcessed}/${globalTotal} [${elapsed}s]`);
      await updateJobProgress(jobId, stats, globalTotal, totalProcessed);
    }
  }

  return globalProcessed + blocks.length;
}

// ─── Main ───────────────────────────────────────────────────────────────────

/**
 * CLI entry point: resolve the requested areas, fetch their pages, import
 * every church block and print a summary.
 *
 * All work after argument parsing runs inside one try block so that any
 * failure — including an unknown area, a failing job update, or a failing
 * database load — marks the job as failed and still disconnects Prisma.
 * The process exits with status 1 after cleanup when the import failed.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  let jobId: string | null = null;
  let failed = false;

  try {
    jobId = await createOrResumeJob(args.jobId);

    console.log(`\n${'='.repeat(70)}`);
    console.log('WEEKDAYMASSES.ORG.UK IMPORTER');
    console.log('='.repeat(70));
    console.log(`Mode: ${args.all ? 'All areas' : `Area: ${args.area}`}`);
    console.log(`Dry run: ${args.dryRun ? 'YES' : 'NO'}`);
    if (args.resumeFrom > 0) console.log(`Resume from: ${args.resumeFrom}`);
    console.log(`Time: ${new Date().toISOString()}`);
    console.log('='.repeat(70));

    const stats: ImportStats = {
      churchesParsed: 0,
      churchesMatched: 0,
      churchesCreated: 0,
      churchesSkipped: 0,
      massSchedulesCreated: 0,
      errors: 0,
    };

    // Determine which areas to import
    const areas: Array<[string, { url: string; defaultCountry: string }]> = [];
    if (args.all) {
      areas.push(...Object.entries(AREA_PAGES));
    } else if (args.area) {
      const config = AREA_PAGES[args.area];
      if (!config) {
        throw new Error(`Unknown area: ${args.area}. Valid: ${Object.keys(AREA_PAGES).join(', ')}`);
      }
      areas.push([args.area, config]);
    }

    // Load existing churches for deduplication
    if (!args.dryRun) {
      console.log('\nLoading existing churches for deduplication...');
    }
    const existingChurches = args.dryRun ? [] : await loadExistingChurches();
    if (!args.dryRun) {
      console.log(`Loaded ${existingChurches.length} existing churches`);
    }

    // Pre-fetch all area pages to get accurate total count for progress tracking
    console.log('\nFetching area pages...');
    const fetchedAreas: Array<{ name: string; config: { url: string; defaultCountry: string }; blocks: string[] }> = [];
    let globalTotal = 0;
    for (const [areaName, config] of areas) {
      console.log(` Fetching ${areaName}: ${SITE_BASE}${config.url}`);
      const html = await fetchPage(`${SITE_BASE}${config.url}`);
      if (!html) {
        console.error(` Failed to fetch ${areaName} page`);
        continue;
      }
      console.log(` Page size: ${(html.length / 1024 / 1024).toFixed(1)} MB`);
      const blocks = extractChurchBlocks(html);
      console.log(` Found ${blocks.length} church blocks`);
      globalTotal += blocks.length;
      fetchedAreas.push({ name: areaName, config, blocks });
    }
    console.log(`\nTotal churches across all areas: ${globalTotal}`);

    let globalProcessed = 0;
    for (const { name: areaName, config, blocks } of fetchedAreas) {
      globalProcessed = await importAreaBlocks(
        areaName, config, blocks, existingChurches, stats,
        args.dryRun, args.resumeFrom, jobId,
        globalProcessed, globalTotal,
      );
    }

    // Print summary
    console.log(`\n${'='.repeat(70)}`);
    console.log(`WEEKDAYMASSES IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
    console.log('='.repeat(70));
    console.log(`Churches parsed: ${stats.churchesParsed}`);
    if (!args.dryRun) {
      console.log(`Churches matched: ${stats.churchesMatched}`);
      console.log(`Churches created: ${stats.churchesCreated}`);
      console.log(`Churches skipped: ${stats.churchesSkipped} (duplicates)`);
      console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
    }
    if (stats.errors > 0) {
      console.log(`Errors: ${stats.errors}`);
    }
    console.log('='.repeat(70));

    await completeJob(jobId);
  } catch (error) {
    failed = true;
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
  } finally {
    await prisma.$disconnect();
  }

  // Exit only after the finally block has run: process.exit() terminates
  // immediately, so calling it inside the catch would skip the disconnect.
  if (failed) process.exit(1);
}

main().catch((error) => {
  // Reached only when cleanup itself fails (e.g. the disconnect rejects).
  console.error('Fatal error:', error);
  process.exit(1);
});

// Patch file boundary preserved from the original text:
// diff --git a/src/lib/church-matcher.ts b/src/lib/church-matcher.ts
// new file mode 100644
// index 0000000..373cbdf
// --- /dev/null
// +++ b/src/lib/church-matcher.ts
// @@ -0,0 +1,396 @@
/**
 * Church matching and deduplication logic
 * Used to avoid duplicate churches when importing from multiple sources (OSM, MassTimes, etc.)
 */

import { calculateDistance } from './geo';
import type { OSMChurch } from './overpass-client';
import type { BaiduChurch } from './baidu-client';

// Type for existing church from database.
// This is the projection importers load into memory for deduplication; it is
// not the full church row.
export interface ExistingChurch {
  id: string;
  name: string;
  // Coordinates are compared against a candidate's lat/lng in the proximity pass.
  latitude: number;
  longitude: number;
  // One external identifier per import source. Each is compared for exact
  // equality with the same-named field on a candidate. Importers set the ids
  // of other sources to null on records they create.
  osmId: string | null;
  baiduId: string | null;
  masstimesId: string | null;
  orarimesseId: string | null;
  massSchedulesPhId: string | null;
  philmassId: string | null;
  horariosMisasId: string | null;
  mszeInfoId: string | null;
  weekdayMassesId: string | null;
  messesInfoId: string | null;
  bohosluzbyId: string | null;
  miserendId: string | null;
  kerknetId: string | null;
  gottesdienstzeitenId: string | null;
  discovermassId: string | null;
  // Origin of the record, e.g. 'manual', 'osm', 'weekdaymasses'.
  source: string;
  // Contact fields; merge helpers fill these only when they are empty.
  website: string | null;
  phone: string | null;
  address: string | null;
  // Optional country code. NOTE(review): the weekdaymasses importer treats
  // 'XX' as "unknown" — confirm that convention holds for other importers.
  country?: string;
}

// Maximum distance in km to consider churches as potential duplicates
const DUPLICATE_DISTANCE_KM = 0.2; // 200 meters
/**
 * Normalize church name for comparison
 * - Lowercase
 * - Fold Latin diacritics ("Étienne" -> "etienne")
 * - Expand "St." / "St" to "Saint" (also when no space follows the dot)
 * - Remove common suffixes like "Roman Catholic", "Catholic Church", "Parish", "Church"
 * - Remove punctuation, keeping letters and digits of every script
 * - Collapse whitespace
 *
 * "Roman Catholic" is removed before "Catholic Church" so that
 * "Roman Catholic Church" does not leave a stray "roman" behind.
 *
 * @param name - Church name as found in a source.
 * @returns The comparison key; may be empty when the name consists only of
 *          removed words.
 */
function normalizeName(name: string): string {
  return name
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '') // combining diacritical marks split off by NFD
    .replace(/\bst(?:\.\s*|\s+)(?=\S)/g, 'saint ')
    .replace(/\broman catholic\b/g, '')
    .replace(/\bcatholic church\b/g, '')
    .replace(/\bparish\b/g, '')
    .replace(/\bchurch\b/g, '')
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '') // Remove punctuation (Unicode-aware)
    .replace(/\s+/g, ' ') // Normalize whitespace
    .trim();
}

/**
 * Calculate Levenshtein distance between two strings
 * Used for fuzzy name matching
 *
 * Keeps only two rows of the dynamic-programming table, so memory is
 * proportional to `a.length` rather than `a.length * b.length`.
 *
 * @returns Minimum number of single-character insertions, deletions and
 *          substitutions turning `a` into `b`.
 */
function levenshteinDistance(a: string, b: string): number {
  // previous[j] = distance between the first (i - 1) chars of b and the first j chars of a
  let previous = Array.from({ length: a.length + 1 }, (_, j) => j);

  for (let i = 1; i <= b.length; i++) {
    const current = [i];
    for (let j = 1; j <= a.length; j++) {
      current[j] =
        b.charAt(i - 1) === a.charAt(j - 1)
          ? previous[j - 1]
          : Math.min(
              previous[j - 1], // substitution
              current[j - 1], // insertion
              previous[j], // deletion
            ) + 1;
    }
    previous = current;
  }

  return previous[a.length];
}

/**
 * Check if two names are similar after normalization.
 * Returns true if they have a common substring of 5+ characters OR Levenshtein distance < 5
 *
 * The substring test is skipped when either normalized name is shorter than
 * five characters.
 */
function namesAreSimilar(name1: string, name2: string): boolean {
  const normalized1 = normalizeName(name1);
  const normalized2 = normalizeName(name2);
  const windowSize = 5;

  if (Math.min(normalized1.length, normalized2.length) >= windowSize) {
    for (let start = 0; start + windowSize <= normalized1.length; start++) {
      if (normalized2.includes(normalized1.substring(start, start + windowSize))) {
        return true;
      }
    }
  }

  return levenshteinDistance(normalized1, normalized2) < 5;
}

// Candidate type for deduplication — works with OSM, Baidu, or any source
/**
 * A church about to be imported. Only name and coordinates are required; each
 * importer additionally sets the identifier of its own source.
 */
export type ChurchCandidate = {
  name: string;
  lat: number;
  lng: number;
  osmId?: string;
  baiduId?: string;
  orarimesseId?: string;
  massSchedulesPhId?: string;
  philmassId?: string;
  horariosMisasId?: string;
  mszeInfoId?: string;
  weekdayMassesId?: string;
  messesInfoId?: string;
  bohosluzbyId?: string;
  miserendId?: string;
  kerknetId?: string;
  gottesdienstzeitenId?: string;
  discovermassId?: string;
};

// Source identifiers checked for an exact match, highest priority first.
const EXACT_ID_FIELDS = [
  'osmId',
  'baiduId',
  'orarimesseId',
  'massSchedulesPhId',
  'philmassId',
  'horariosMisasId',
  'mszeInfoId',
  'weekdayMassesId',
  'messesInfoId',
  'bohosluzbyId',
  'miserendId',
  'kerknetId',
  'gottesdienstzeitenId',
  'discovermassId',
] as const;

/**
 * Find duplicate church in existing database
 * Returns the best match or null if no duplicate found
 *
 * Matching strategy (in priority order):
 * 1-14. Exact source-id match, in the order of EXACT_ID_FIELDS (osmId,
 *       baiduId, then each importer id, ending with discovermassId). Only ids
 *       present on the candidate are checked.
 * 15.   Proximity + name similarity: among existing churches within
 *       DUPLICATE_DISTANCE_KM whose name is similar, the nearest one wins
 *       (the earlier one on a tie). Skipped when the candidate's coordinates
 *       are exactly (0, 0), which importers use for "no real coordinates".
 *
 * @param candidate - Church about to be imported.
 * @param existingChurches - Churches already known; not modified.
 * @returns The matching existing church, or null.
 */
export function findDuplicateChurch(
  candidate: ChurchCandidate,
  existingChurches: ExistingChurch[]
): ExistingChurch | null {
  for (const field of EXACT_ID_FIELDS) {
    const id = candidate[field];
    if (!id) continue;

    const exactMatch = existingChurches.find((church) => church[field] === id);
    if (exactMatch) return exactMatch;
  }

  if (candidate.lat === 0 && candidate.lng === 0) {
    return null;
  }

  let best: ExistingChurch | null = null;
  let bestDistance = Infinity;

  for (const church of existingChurches) {
    const distance = calculateDistance(
      { lat: candidate.lat, lng: candidate.lng },
      { lat: church.latitude, lng: church.longitude }
    );
    // Written as a negated "<=" so that a NaN distance is rejected too.
    if (!(distance <= DUPLICATE_DISTANCE_KM) || distance >= bestDistance) continue;

    if (namesAreSimilar(candidate.name, church.name)) {
      best = church;
      bestDistance = distance;
    }
  }

  return best;
}

/**
 * Merge OSM data into existing church record
 * Only overwrites fields that are null/empty in existing with non-null OSM data
 *
 * Rules:
 * - Never overwrite: name (if existing has one), massSchedules, scraperConfig
 * - Always update: osmId, osmLastSyncedAt, hasWebsite
 * - hasWebsite reflects the website the record will have after the merge, so
 *   an existing website is not flagged as missing just because OSM has none
 * - Prefer existing data for: phone, address, website (if already populated)
 * - Use OSM data for: phone, address, website (only if existing field is null)
 * - Coordinates are replaced only when OSM is more than 50m away
 * - source becomes "osm" when it was "manual"
 *
 * @returns Only the fields to write; `existing` itself is not modified.
 */
export function mergeChurchData(
  existing: ExistingChurch,
  osmData: OSMChurch
): Partial<ExistingChurch> & { osmId: string; osmLastSyncedAt: Date; hasWebsite: boolean } {
  const merged: ReturnType<typeof mergeChurchData> = {
    osmId: osmData.osmId,
    osmLastSyncedAt: new Date(),
    hasWebsite: Boolean(existing.website || osmData.website),
  };

  // Only update coordinates if they differ significantly (more than 50m)
  const coordDistance = calculateDistance(
    { lat: existing.latitude, lng: existing.longitude },
    { lat: osmData.lat, lng: osmData.lng }
  );
  if (coordDistance > 0.05) {
    merged.latitude = osmData.lat;
    merged.longitude = osmData.lng;
  }

  if (!existing.address && osmData.address) {
    merged.address = osmData.address;
  }

  if (!existing.phone && osmData.phone) {
    merged.phone = osmData.phone;
  }

  if (!existing.website && osmData.website) {
    merged.website = osmData.website;
  }

  // Update source to "osm" if currently "manual"
  if (existing.source === 'manual') {
    merged.source = 'osm';
  }

  return merged;
}

/**
 * Merge Baidu Maps data into existing church record
 * Similar to mergeChurchData but for Baidu source
 *
 * Rules:
 * - Always set: baiduId, baiduLastSyncedAt
 * - Prefer existing data for: phone, address (if already populated)
 * - Use Baidu data only if existing field is null
 * - Coordinates are replaced only when Baidu is more than 50m away AND the
 *   church has no osmId (OSM coords are more reliable)
 * - city/state are written whenever Baidu provides them: ExistingChurch does
 *   not carry those columns, so there is no existing value to prefer
 *
 * @returns Only the fields to write; `existing` itself is not modified.
 */
export function mergeBaiduData(
  existing: ExistingChurch,
  baiduData: BaiduChurch
): Record<string, unknown> {
  const merged: Record<string, unknown> = {
    baiduId: baiduData.baiduId,
    baiduLastSyncedAt: new Date(),
  };

  const coordDistance = calculateDistance(
    { lat: existing.latitude, lng: existing.longitude },
    { lat: baiduData.lat, lng: baiduData.lng }
  );
  if (coordDistance > 0.05 && !existing.osmId) {
    merged.latitude = baiduData.lat;
    merged.longitude = baiduData.lng;
  }

  if (!existing.address && baiduData.address) {
    merged.address = baiduData.address;
  }

  if (!existing.phone && baiduData.phone) {
    merged.phone = baiduData.phone;
  }

  if (baiduData.city) {
    merged.city = baiduData.city;
  }
  if (baiduData.province) {
    merged.state = baiduData.province;
  }

  return merged;
}