#!/usr/bin/env tsx
/**
 * Import Catholic churches from GCatholic.org
 *
 * GCatholic is a comprehensive Catholic directory organized by diocese.
 * Each church page includes a Google Plus Code (→ lat/lng), address, phone, website, etc.
 * This script discovers churches via country → diocese → church page navigation.
 *
 * Usage:
 *   npx tsx scripts/import-gcatholic.ts --country CN
 *   npx tsx scripts/import-gcatholic.ts --country CN --dry-run
 *   npx tsx scripts/import-gcatholic.ts --diocese peki0
 *   npx tsx scripts/import-gcatholic.ts --all
 *   npx tsx scripts/import-gcatholic.ts --all --limit 100
 *   npx tsx scripts/import-gcatholic.ts --all --resume-from PL
 */

// Load .env for database connection (before importing anything that uses process.env)
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Mask the password portion of the URL before logging it.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
  connectionString: dbUrl,
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';

// Plus Code decoder
// eslint-disable-next-line @typescript-eslint/no-require-imports
const { OpenLocationCode } = require('open-location-code');
const olc = new OpenLocationCode();

// ─── Constants ───────────────────────────────────────────────────────────────

const BASE_URL = 'https://www.gcatholic.org';
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
const DEFAULT_DELAY_MS = 1500;
/** Upper bound for a single HTTP request so one stalled connection cannot hang the whole import. */
const FETCH_TIMEOUT_MS = 30_000;
/** Maximum number of error lines printed in the overall summary. */
const MAX_ERROR_DETAILS = 50;

// ─── Types ───────────────────────────────────────────────────────────────────

/** Structured data extracted from one GCatholic church page. */
interface GCatholicChurch {
  gcatholicId: string;
  name: string;
  localName?: string;
  lat: number;
  lng: number;
  address?: string;
  city?: string;
  state?: string;
  country?: string;
  phone?: string;
  website?: string;
  diocese?: string;
  churchType?: string;
  plusCode: string;
  sourceUrl: string;
}

/** Running counters for one import scope (diocese, country, or the whole run). */
interface ImportStats {
  churchesFound: number;
  newChurchesCreated: number;
  existingChurchesMerged: number;
  skipped: number;
  errors: number;
  errorDetails: string[];
}

/** Parsed command-line options. */
interface CLIArgs {
  country?: string;
  all: boolean;
  diocese?: string;
  dryRun: boolean;
  limit?: number;
  delay: number;
  resumeFrom?: string;
}

/** Returns a zeroed stats object. */
function createEmptyStats(): ImportStats {
  return {
    churchesFound: 0,
    newChurchesCreated: 0,
    existingChurchesMerged: 0,
    skipped: 0,
    errors: 0,
    errorDetails: [],
  };
}

// ─── HTTP Fetching ───────────────────────────────────────────────────────────

let requestCount = 0;

/**
 * Fetch a page as text, sleeping `delayMs` before every request except the first.
 *
 * @param url     Absolute URL to fetch.
 * @param delayMs Pause applied before the request (rate limiting).
 * @returns The response body, or `null` on any non-2xx status, network error, or timeout.
 */
async function fetchPage(url: string, delayMs: number): Promise<string | null> {
  // Rate limit
  if (requestCount > 0) {
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  requestCount++;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'en-US,en;q=0.9',
      },
      // Abort instead of waiting forever on a stalled connection.
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
    });

    if (!response.ok) {
      // 404 is expected for some pages, so it is not logged.
      if (response.status !== 404) {
        console.error(` HTTP ${response.status} for ${url}`);
      }
      return null;
    }

    return await response.text();
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : String(error)}`);
    return null;
  }
}

// ─── HTML Parsing ────────────────────────────────────────────────────────────

/**
 * Extract all country codes from the GCatholic countries page.
 * Links follow pattern: country/{ISO2}
 *
 * @returns Sorted, de-duplicated two-letter codes; empty when the page cannot be fetched.
 */
async function discoverCountries(delayMs: number): Promise<string[]> {
  console.log('Discovering countries from GCatholic...');
  const html = await fetchPage(`${BASE_URL}/dioceses/`, delayMs);
  if (!html) {
    console.error('Failed to fetch countries page');
    return [];
  }

  const countryCodes = new Set<string>();
  // Match links like: href="country/CN" or href="/dioceses/country/CN"
  const regex = /href="(?:\.\.\/|\/dioceses\/)?country\/([A-Z]{2})(?:\.htm)?"/g;
  for (const match of html.matchAll(regex)) {
    countryCodes.add(match[1]);
  }

  const codes = Array.from(countryCodes).sort();
  console.log(`Found ${codes.length} countries`);
  return codes;
}

/**
 * Extract diocese codes from a country page.
 * Links follow pattern: ../diocese/{code} or diocese/{code}
 *
 * @returns One entry per distinct diocese code, in page order; empty when the page cannot be fetched.
 */
async function discoverDioceses(countryCode: string, delayMs: number): Promise<{ code: string; name: string }[]> {
  const html = await fetchPage(`${BASE_URL}/dioceses/country/${countryCode}.htm`, delayMs);
  if (!html) {
    return [];
  }

  const dioceses: { code: string; name: string }[] = [];
  const seen = new Set<string>();

  // Match links like: href="../diocese/peki0" or href="../../dioceses/diocese/peki0"
  // The text after the link is the diocese name
  // NOTE(review): the end of this regex and the loop below were lost from the
  // reviewed copy (everything from a "<" up to the next ">" had been stripped).
  // Reconstructed from the surviving declarations (`dioceses`, `seen`) and the
  // declared return type — confirm against the original file.
  const regex = /href="(?:\.\.\/)?(?:\.\.\/dioceses\/)?diocese\/([a-z0-9]+)(?:\.htm)?"[^>]*>([^<]+)</g;
  for (const match of html.matchAll(regex)) {
    const code = match[1];
    const name = match[2].trim();
    if (seen.has(code) || name.length === 0) continue;
    seen.add(code);
    dioceses.push({ code, name });
  }

  return dioceses;
}

/**
 * Extract church page URLs from a diocese page.
 *
 * NOTE(review): this function's header was lost from the reviewed copy; the name
 * and signature are taken from its call site in `importDiocese`.
 *
 * @returns Absolute, de-duplicated church page URLs; empty when the page cannot be fetched.
 */
async function discoverChurchLinks(dioceseCode: string, delayMs: number): Promise<string[]> {
  const html = await fetchPage(`${BASE_URL}/dioceses/diocese/${dioceseCode}.htm`, delayMs);
  if (!html) {
    return [];
  }

  const churchUrls = new Set<string>();
  // Match church links like: href="../../churches/china/46492" or href="../../churches/asia/1893"
  const regex = /href="(?:\.\.\/)*churches\/([a-z0-9-]+\/\d+)(?:\.htm)?"/g;
  for (const match of html.matchAll(regex)) {
    churchUrls.add(`${BASE_URL}/churches/${match[1]}.htm`);
  }

  return Array.from(churchUrls);
}

/**
 * Parse a single church page and extract structured data.
 *
 * NOTE(review): large parts of this function were damaged in the reviewed copy
 * (HTML-looking text such as "<h1>" had been stripped out of the regexes, and the
 * code between the Plus Code lookup and `getField` was missing entirely). Every
 * spot marked "reconstructed" below is a best-effort restoration that must be
 * checked against real GCatholic markup before this script is trusted.
 *
 * @param html        Raw HTML of the church page.
 * @param url         URL the page was fetched from (stored as `sourceUrl`).
 * @param countryCode Country to record; when omitted it is taken from a country link on the page.
 * @returns The parsed church, or `null` when the page has no name or no decodable Plus Code.
 */
function parseChurchPage(html: string, url: string, countryCode?: string): GCatholicChurch | null {
  // Extract church name from <h1> (tag reconstructed)
  const h1Match = html.match(/<h1>([^<]+)<\/h1>/);
  if (!h1Match) return null;
  const name = h1Match[1].trim();

  // Extract local name from <h2> (tag reconstructed)
  const h2Match = html.match(/<h2>([^<]+)<\/h2>/);
  const localName = h2Match ? h2Match[1].trim() : undefined;

  // Extract Plus Code - it's in a link with onclick containing google maps
  // Pattern: onclick="window.open('https://www.google.com/maps/search/?api=1&query=PLUSCODE','_blank')"
  // The Plus Code text is like: >8PFRW9FF+C2<
  let plusCode: string | null = null;

  // Try the onclick pattern first
  const plusCodeOnclickMatch = html.match(
    /onclick="window\.open\('https:\/\/www\.google\.com\/maps\/search\/\?api=1&(?:amp;)?query=([^']+)'/,
  );
  if (plusCodeOnclickMatch) {
    plusCode = decodeURIComponent(plusCodeOnclickMatch[1]);
  }

  // Fallback: look for Plus Code pattern in text (format: XXXX+XX or longer)
  if (!plusCode) {
    const plusCodeTextMatch = html.match(/title="Plus Code">([A-Z0-9+]+)<\/a>/);
    if (plusCodeTextMatch) {
      plusCode = plusCodeTextMatch[1];
    }
  }

  // Another fallback: look for the code near "Location:" label
  if (!plusCode) {
    // Trailing "<" reconstructed.
    const locationMatch = html.match(/Location:.*?>([2-9A-HJ-NP-Z][2-9A-HJ-NP-Z0-9]{3,7}\+[2-9A-HJ-NP-Z0-9]{2,3})</);
    if (locationMatch) {
      plusCode = locationMatch[1];
    }
  }

  if (!plusCode) return null;

  // Reconstructed: decode the Plus Code to coordinates. `decode` rejects anything
  // that is not a full code, so a malformed or short code skips the church.
  let lat: number;
  let lng: number;
  try {
    const area = olc.decode(plusCode);
    lat = area.latitudeCenter;
    lng = area.longitudeCenter;
  } catch {
    return null;
  }
  if (!Number.isFinite(lat) || !Number.isFinite(lng)) return null;

  // Reconstructed: use the "{region}/{number}" part of the page URL as a stable id.
  // NOTE(review): the original id format is unknown — confirm before relying on it.
  const idMatch = url.match(/churches\/([a-z0-9-]+\/\d+)/);
  const gcatholicId = idMatch ? idMatch[1] : url;

  // Reconstructed: take the diocese name from the first link to a diocese page.
  const dioceseMatch = html.match(/href="[^"]*diocese\/[a-z0-9]+(?:\.htm)?"[^>]*>([^<]+)</);
  const diocese = dioceseMatch ? dioceseMatch[1].trim() || undefined : undefined;

  // Extract labelled fields: the value follows the label's closing </span>.
  const getField = (label: string): string | undefined => {
    const escaped = label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Reconstructed: the "</span>" and the terminating alternatives were lost.
    const regex = new RegExp(`${escaped}:?\\s*</span>\\s*(.+?)(?:<br|<p|</p|</div|</td)`);
    const match = html.match(regex);
    if (!match) return undefined;
    // Drop any inline markup wrapped around the value.
    return match[1].replace(/<[^>]+>/g, '').trim() || undefined;
  };

  // Extract address
  const address = getField('Address');

  // Extract phone
  const phone = getField('Telephone');

  // Extract website URL (it's in an <a> tag; anchor part of the regex reconstructed)
  let website: string | undefined;
  const websiteMatch = html.match(/Website:?\s*<\/span>\s*<a[^>]*href="([^"]+)"/);
  if (websiteMatch) {
    website = websiteMatch[1];
  }

  // Extract church type
  let churchType: string | undefined;
  const typeMatch = html.match(/Type:?\s*<\/span>.*?class="ch[a-z]">([^<]+)/);
  if (typeMatch) {
    churchType = typeMatch[1].trim();
  }

  // Extract country from page
  let country = countryCode;
  if (!country) {
    const countryMatch = html.match(/href="[^"]*country\/([A-Z]{2})(?:\.htm)?"/);
    if (countryMatch) {
      country = countryMatch[1];
    }
  }

  // Extract city from <h3> tag: "City, Region, Country"
  // (the <h3>, <span ...> and <a openings of this regex are reconstructed)
  let city: string | undefined;
  let state: string | undefined;
  const h3Match = html.match(
    /<h3>([^<]+?)(?:,\s*<span[^>]*>([^<]+)<\/span>)?(?:,\s*<a[^>]*class="zcountry"[^>]*>[^<]+<\/a>)?\s*<\/h3>/,
  );
  if (h3Match) {
    // Kept as-is: the value may contain local-language characters (e.g. "Beijing 北京").
    city = h3Match[1].trim();
    state = h3Match[2]?.trim();
  }

  return {
    gcatholicId,
    name,
    localName,
    lat,
    lng,
    address,
    city,
    state,
    country,
    phone,
    website,
    diocese,
    churchType,
    plusCode,
    sourceUrl: url,
  };
}

// ─── CLI Argument Parsing ────────────────────────────────────────────────────

/**
 * Parse the value of a numeric flag.
 *
 * @throws Error when the value is missing, not a plain decimal integer, or below `min`.
 */
function parseIntegerFlag(flag: string, raw: string | undefined, min: number): number {
  // parseInt alone would accept "10abc" and return NaN for "abc"; NaN then
  // silently disables the delay (setTimeout) or the limit (falsy check).
  if (raw === undefined || !/^\d+$/.test(raw)) {
    throw new Error(`${flag} requires an integer value, got: ${raw ?? '(nothing)'}`);
  }
  const value = Number.parseInt(raw, 10);
  if (value < min) {
    throw new Error(`${flag} must be at least ${min}, got: ${value}`);
  }
  return value;
}

/**
 * Parse `process.argv` into {@link CLIArgs}. Unknown flags are ignored
 * (`--job-id` is handled separately by `createOrResumeJob`).
 *
 * @throws Error when `--limit` or `--delay` is given without a valid integer.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    delay: DEFAULT_DELAY_MS,
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--country':
        result.country = args[++i]?.toUpperCase();
        break;
      case '--all':
        result.all = true;
        break;
      case '--diocese':
        result.diocese = args[++i];
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--limit':
        result.limit = parseIntegerFlag('--limit', args[++i], 1);
        break;
      case '--delay':
        result.delay = parseIntegerFlag('--delay', args[++i], 0);
        break;
      case '--resume-from':
        result.resumeFrom = args[++i]?.toUpperCase();
        break;
    }
  }

  return result;
}

// ─── Database Operations ─────────────────────────────────────────────────────

/** Load the fields of every stored church that the duplicate matcher and the merge step read. */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing churches for deduplication...');
  const churches = await prisma.church.findMany({
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });
  console.log(`Loaded ${churches.length} existing churches`);
  return churches;
}

/**
 * Import one parsed church: merge it into a matching existing record, or create a new one.
 *
 * Merging only fills fields that are empty on the existing record. In dry-run mode
 * nothing is written; the would-be outcome is logged and counted instead.
 *
 * @param church           Parsed GCatholic church.
 * @param existingChurches In-memory dedup list; a newly created church is appended to it.
 * @param dryRun           When true, no database writes are made.
 * @param stats            Counters updated in place (new / merged / skipped).
 */
async function importChurch(
  church: GCatholicChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Build a candidate compatible with findDuplicateChurch (expects OSMChurch shape)
  const candidate = {
    osmId: `gcatholic-${church.gcatholicId}`,
    name: church.name,
    lat: church.lat,
    lng: church.lng,
    address: church.address,
    city: church.city,
    state: church.state,
    country: church.country,
    phone: church.phone,
    website: church.website,
    diocese: church.diocese,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      console.log(` [MERGE] ${church.name} → existing: ${duplicate.name} (${duplicate.id})`);
      stats.existingChurchesMerged++;
    } else {
      console.log(` [NEW] ${church.name} (${church.lat.toFixed(4)}, ${church.lng.toFixed(4)})`);
      stats.newChurchesCreated++;
    }
    return;
  }

  if (duplicate) {
    // Merge: fill in missing fields only
    const updateData: Record<string, unknown> = {};
    if (!duplicate.phone && church.phone) updateData.phone = church.phone;
    if (!duplicate.website && church.website) {
      updateData.website = church.website;
      updateData.hasWebsite = true;
    }
    if (!duplicate.address && church.address) updateData.address = church.address;

    // Always set diocese if missing (GCatholic is great for this).
    // The in-memory list does not carry `diocese`, so read it from the DB record.
    const dbRecord = await prisma.church.findUnique({
      where: { id: duplicate.id },
      select: { diocese: true },
    });
    if (dbRecord && !dbRecord.diocese && church.diocese) {
      updateData.diocese = church.diocese;
    }

    if (Object.keys(updateData).length > 0) {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
      stats.existingChurchesMerged++;
    } else {
      // Nothing new to contribute to the existing record.
      stats.skipped++;
    }
    return;
  }

  // Create new church
  const newChurch = await prisma.church.create({
    data: {
      name: church.name,
      latitude: church.lat,
      longitude: church.lng,
      address: church.address,
      city: church.city,
      state: church.state,
      country: church.country,
      phone: church.phone,
      website: church.website,
      hasWebsite: !!church.website,
      source: 'gcatholic',
      diocese: church.diocese,
    },
  });
  stats.newChurchesCreated++;

  // Add to existing list for future dedup within this run
  existingChurches.push({
    id: newChurch.id,
    name: church.name,
    latitude: church.lat,
    longitude: church.lng,
    osmId: null,
    baiduId: null,
    masstimesId: null,
    orarimesseId: null,
    massSchedulesPhId: null,
    philmassId: null,
    horariosMisasId: null,
    mszeInfoId: null,
    weekdayMassesId: null,
    messesInfoId: null,
    bohosluzbyId: null,
    miserendId: null,
    kerknetId: null,
    gottesdienstzeitenId: null,
    discovermassId: null,
    source: 'gcatholic',
    website: church.website || null,
    phone: church.phone || null,
    address: church.address || null,
  });
}

// ─── Import Logic ────────────────────────────────────────────────────────────

/**
 * Import every church linked from one diocese page.
 *
 * @param dioceseCode      GCatholic diocese code (e.g. "peki0").
 * @param dioceseName      Name used in log output only.
 * @param countryCode      Country recorded on each church; when undefined it is read from the page.
 * @param existingChurches In-memory dedup list, extended as churches are created.
 * @param args             CLI options (`delay` and `dryRun` are used here).
 * @param stats            Counters updated in place.
 * @param globalLimit      Optional shared budget of churches still allowed to be processed.
 */
async function importDiocese(
  dioceseCode: string,
  dioceseName: string,
  countryCode: string | undefined,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
  globalLimit?: { remaining: number },
): Promise<void> {
  const churchUrls = await discoverChurchLinks(dioceseCode, args.delay);
  if (churchUrls.length === 0) {
    return;
  }

  console.log(` Diocese ${dioceseName} (${dioceseCode}): ${churchUrls.length} church pages found`);

  // Snapshot the counters so the per-diocese summary is the delta over this call.
  // This also picks up skips that importChurch records itself.
  const before = {
    created: stats.newChurchesCreated,
    merged: stats.existingChurchesMerged,
    skipped: stats.skipped,
    errors: stats.errors,
  };

  for (const url of churchUrls) {
    // Check global limit; break (not return) so the summary below is still printed.
    if (globalLimit && globalLimit.remaining <= 0) {
      console.log(` Limit reached, stopping`);
      break;
    }

    try {
      const html = await fetchPage(url, args.delay);
      if (!html) {
        stats.errors++;
        stats.errorDetails.push(`Failed to fetch: ${url}`);
        continue;
      }

      const church = parseChurchPage(html, url, countryCode);
      if (!church) {
        stats.skipped++;
        continue;
      }

      stats.churchesFound++;
      await importChurch(church, existingChurches, args.dryRun, stats);
      if (globalLimit) globalLimit.remaining--;
    } catch (error) {
      stats.errors++;
      const msg = error instanceof Error ? error.message : String(error);
      stats.errorDetails.push(`${url}: ${msg}`);
      console.error(` Error processing ${url}: ${msg}`);
    }
  }

  const dioceseSkipped = stats.skipped - before.skipped;
  const dioceseErrors = stats.errors - before.errors;
  const parts = [
    `${stats.newChurchesCreated - before.created} new`,
    `${stats.existingChurchesMerged - before.merged} merged`,
  ];
  if (dioceseSkipped > 0) parts.push(`${dioceseSkipped} skipped`);
  if (dioceseErrors > 0) parts.push(`${dioceseErrors} errors`);
  console.log(` → ${parts.join(', ')}`);
}

/**
 * Import every diocese of one country.
 *
 * @returns The stats accumulated for this country (all zero when no dioceses are found).
 */
async function importCountry(
  countryCode: string,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  globalLimit?: { remaining: number },
): Promise<ImportStats> {
  const stats = createEmptyStats();

  console.log(`\n${'='.repeat(60)}`);
  console.log(`Importing from GCatholic: ${countryCode}`);
  console.log(`${'='.repeat(60)}`);

  // Discover dioceses
  const dioceses = await discoverDioceses(countryCode, args.delay);
  if (dioceses.length === 0) {
    console.log(`No dioceses found for ${countryCode}`);
    return stats;
  }
  console.log(`Found ${dioceses.length} dioceses in ${countryCode}`);

  // Process each diocese
  for (const diocese of dioceses) {
    if (globalLimit && globalLimit.remaining <= 0) break;
    await importDiocese(diocese.code, diocese.name, countryCode, existingChurches, args, stats, globalLimit);
  }

  return stats;
}

// ─── Summary Printing ────────────────────────────────────────────────────────

/** Print the counters for one import scope. */
function printSummary(label: string, stats: ImportStats, dryRun: boolean): void {
  const rule = '─'.repeat(60);
  console.log(`\n${rule}`);
  console.log(`Summary: ${label} ${dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`Churches found on GCatholic: ${stats.churchesFound}`);
  console.log(`New churches created: ${stats.newChurchesCreated}`);
  console.log(`Merged with existing: ${stats.existingChurchesMerged}`);
  console.log(`Skipped (no data/dup): ${stats.skipped}`);
  if (stats.errors > 0) {
    console.log(`Errors: ${stats.errors}`);
  }
  console.log(rule);
}

/** Print collected error messages, capped at MAX_ERROR_DETAILS lines. */
function printErrorDetails(errorDetails: string[]): void {
  if (errorDetails.length === 0) return;
  if (errorDetails.length <= MAX_ERROR_DETAILS) {
    console.log('\nError details:');
  } else {
    console.log(`\nFirst ${MAX_ERROR_DETAILS} errors (of ${errorDetails.length}):`);
  }
  for (const detail of errorDetails.slice(0, MAX_ERROR_DETAILS)) {
    console.log(` - ${detail}`);
  }
}

/** Print the command-line usage help to stderr. */
function printUsage(): void {
  // The "<code>" placeholders are reconstructed; they were missing from the reviewed copy.
  console.error('Error: Must specify --country <code>, --diocese <code>, or --all');
  console.error('Usage:');
  console.error(' npx tsx scripts/import-gcatholic.ts --country CN');
  console.error(' npx tsx scripts/import-gcatholic.ts --country CN --dry-run');
  console.error(' npx tsx scripts/import-gcatholic.ts --diocese peki0');
  console.error(' npx tsx scripts/import-gcatholic.ts --all');
  console.error(' npx tsx scripts/import-gcatholic.ts --all --limit 500');
  console.error(' npx tsx scripts/import-gcatholic.ts --all --resume-from PL');
}

// ─── Job Tracking ────────────────────────────────────────────────────────────

/**
 * When `--job-id <id>` is present, mark that background job as running.
 *
 * @returns The job id, or `null` when no `--job-id` flag was given.
 * @throws Error when `--job-id` has no value; database errors from the update propagate.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) {
    return null;
  }

  const jobId = args[jobIdIndex + 1];
  if (!jobId) {
    throw new Error('--job-id requires a value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}

/**
 * Mark the background job as completed, or as failed when `error` is given.
 * A failure to update the job is logged, never thrown. No-op when `jobId` is null.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (!jobId) return;
  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        status: error ? 'failed' : 'completed',
        error: error || null,
        completedAt: new Date(),
      },
    });
  } catch (err) {
    console.error(`Failed to update job ${jobId}:`, err);
  }
}

// ─── Main ────────────────────────────────────────────────────────────────────

/** Run the mode selected on the command line (diocese, country, or all countries). */
async function runImport(args: CLIArgs): Promise<void> {
  const existingChurches = await loadExistingChurches();
  const globalLimit = args.limit !== undefined ? { remaining: args.limit } : undefined;

  if (args.diocese) {
    // Single diocese mode
    const stats = createEmptyStats();
    await importDiocese(args.diocese, args.diocese, args.country, existingChurches, args, stats, globalLimit);
    printSummary(`Diocese ${args.diocese}`, stats, args.dryRun);
    return;
  }

  if (args.country) {
    // Single country mode
    const stats = await importCountry(args.country, existingChurches, args, globalLimit);
    printSummary(args.country, stats, args.dryRun);
    return;
  }

  // All countries mode — discover from GCatholic
  let countries = await discoverCountries(args.delay);
  if (countries.length === 0) {
    throw new Error('Failed to discover countries');
  }

  // Handle --resume-from
  if (args.resumeFrom) {
    const idx = countries.indexOf(args.resumeFrom);
    if (idx === -1) {
      throw new Error(`Country ${args.resumeFrom} not found in GCatholic listing`);
    }
    console.log(`Resuming from ${args.resumeFrom} (skipping ${idx} countries)\n`);
    countries = countries.slice(idx);
  }

  console.log(`Will process ${countries.length} countries\n`);

  const totalStats = createEmptyStats();
  let countriesProcessed = 0;

  for (const countryCode of countries) {
    if (globalLimit && globalLimit.remaining <= 0) {
      console.log(`\nGlobal limit reached, stopping.`);
      break;
    }

    const stats = await importCountry(countryCode, existingChurches, args, globalLimit);
    printSummary(countryCode, stats, args.dryRun);

    // Aggregate
    totalStats.churchesFound += stats.churchesFound;
    totalStats.newChurchesCreated += stats.newChurchesCreated;
    totalStats.existingChurchesMerged += stats.existingChurchesMerged;
    totalStats.skipped += stats.skipped;
    totalStats.errors += stats.errors;
    totalStats.errorDetails.push(...stats.errorDetails);
    countriesProcessed++;

    // Small extra delay between countries
    await new Promise((resolve) => setTimeout(resolve, 2000));
  }

  // Overall summary
  const rule = '='.repeat(60);
  console.log(`\n${rule}`);
  console.log(`OVERALL SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`Countries processed: ${countriesProcessed}`);
  console.log(`Total churches found: ${totalStats.churchesFound}`);
  console.log(`Total new churches created: ${totalStats.newChurchesCreated}`);
  console.log(`Total merged with existing: ${totalStats.existingChurchesMerged}`);
  console.log(`Total skipped: ${totalStats.skipped}`);
  if (totalStats.errors > 0) {
    console.log(`Total errors: ${totalStats.errors}`);
  }
  console.log(`Total HTTP requests made: ${requestCount}`);
  console.log(`${rule}\n`);

  printErrorDetails(totalStats.errorDetails);
}

/**
 * Entry point: parse arguments, run the import, record the job outcome, and
 * always release the database connections.
 */
async function main(): Promise<void> {
  let jobId: string | null = null;

  try {
    const args = parseArgs();
    jobId = await createOrResumeJob(process.argv.slice(2));

    if (!args.country && !args.all && !args.diocese) {
      printUsage();
      // Thrown (not process.exit) so the job is marked failed and the pool is closed.
      throw new Error('No import target specified');
    }

    if (args.dryRun) {
      console.log('\n*** DRY RUN MODE — no changes will be written to database ***\n');
    }
    console.log(`Delay between requests: ${args.delay}ms`);
    if (args.limit !== undefined) console.log(`Limit: ${args.limit} churches`);

    await runImport(args);
    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    // exitCode instead of process.exit(): exit() would skip the finally block below.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((error: unknown) => {
  // Only reachable if cleanup in `finally` itself fails.
  console.error('Unhandled error:', error);
  process.exitCode = 1;
});