#!/usr/bin/env tsx
/**
* Import Catholic churches from GCatholic.org
*
* GCatholic is a comprehensive Catholic directory organized by diocese.
* Each church page includes a Google Plus Code (→ lat/lng), address, phone, website, etc.
* This script discovers churches via country → diocese → church page navigation.
*
* Usage:
* npx tsx scripts/import-gcatholic.ts --country CN
* npx tsx scripts/import-gcatholic.ts --country CN --dry-run
* npx tsx scripts/import-gcatholic.ts --diocese peki0
* npx tsx scripts/import-gcatholic.ts --all
* npx tsx scripts/import-gcatholic.ts --all --limit 100
* npx tsx scripts/import-gcatholic.ts --all --resume-from PL
*/
// Load .env for database connection (before importing anything that uses process.env)
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
/**
 * Returns `url` with the password part of a `scheme://user:password@` prefix
 * replaced by `***`, so the connection string can be logged safely.
 *
 * The match is anchored to the start of the string and the password runs up to
 * the LAST `@` of the authority section (i.e. before the first `/`, `?` or `#`).
 * That way passwords containing `:` or `@` are hidden completely, and URLs
 * without a password (or with an `@` only in the path/query) are returned
 * unchanged instead of being mangled.
 *
 * @param url - Database connection string.
 * @returns The same string with any password replaced by `***`.
 */
function maskDatabaseUrl(url: string): string {
  return url.replace(/^([a-z][a-z0-9+.-]*:\/\/[^:@\/?#]*):[^\/?#]*@/i, '$1:***@');
}

// `||` (not `??`) is deliberate: an empty DATABASE_URL also falls back to the local default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
console.log(`Connecting to database: ${maskDatabaseUrl(dbUrl)}`);
// Connection pool owned by this script (not the app's shared pool); main() ends it
// in its `finally` block after disconnecting Prisma.
const pool = new Pool({
connectionString: dbUrl,
// TLS is enabled only when the URL contains the substring "neon"; otherwise the
// driver default applies (ssl left undefined).
// NOTE(review): rejectUnauthorized: false skips server certificate verification —
// confirm this is intentional for the hosted database.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma is wired to the pool above through the pg driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// Plus Code decoder
// eslint-disable-next-line @typescript-eslint/no-require-imports
const { OpenLocationCode } = require('open-location-code');
const olc = new OpenLocationCode();
// ─── Constants ───────────────────────────────────────────────────────────────
// Root URL of the GCatholic site.
const BASE_URL = 'https://www.gcatholic.org';
// Identifies this importer and gives a contact address.
// NOTE(review): presumably sent as the User-Agent header by the fetch code, which
// is not fully visible here — confirm.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// 1500 ms. NOTE(review): looks like the fallback for the inter-request delay
// (CLIArgs.delay) — confirm in the argument parser.
const DEFAULT_DELAY_MS = 1500;
// ─── Types ───────────────────────────────────────────────────────────────────
/**
 * One church record extracted from a GCatholic church page.
 * Identifier, name, coordinates, Plus Code and source URL are required;
 * every other field is optional.
 */
interface GCatholicChurch {
// Trailing numeric segment of the page URL (optionally followed by ".htm").
// The parser in this file falls back to '' when the URL has no such segment.
gcatholicId: string;
// Church name; the parser in this file reads it from the page's h1 heading.
name: string;
// Local name; the parser in this file reads it from the page's h2 heading when present.
localName?: string;
// Latitude: center of the area decoded from `plusCode`.
lat: number;
// Longitude: center of the area decoded from `plusCode`.
lng: number;
address?: string;
city?: string;
state?: string;
country?: string;
phone?: string;
website?: string;
diocese?: string;
churchType?: string;
// Google Plus Code (Open Location Code) found on the page; source of lat/lng.
plusCode: string;
// NOTE(review): presumably the GCatholic page this record was parsed from —
// confirm where the object is constructed.
sourceUrl: string;
}
/**
 * Counters for one import run (a diocese, a country, or the aggregate over all
 * countries). main() sums per-country stats into a total and prints them in the
 * summary.
 */
interface ImportStats {
// Reported as "churches found" in the summary.
churchesFound: number;
// Reported as "new churches created".
newChurchesCreated: number;
// Reported as "merged with existing".
existingChurchesMerged: number;
// Reported as "skipped".
skipped: number;
// Error count; the overall summary prints it only when greater than zero.
errors: number;
// Error messages; the overall summary prints at most the first 50.
errorDetails: string[];
}
/**
 * Parsed command-line options. main() picks the mode in this order:
 * `diocese`, then `country`, then `all`.
 */
interface CLIArgs {
// Country code for single-country mode (e.g. `--country CN`).
country?: string;
// `--all`: discover the country list from GCatholic and process each country.
all: boolean;
// Diocese identifier for single-diocese mode (e.g. `--diocese peki0`).
diocese?: string;
// `--dry-run`: no changes are written to the database.
dryRun: boolean;
// Cap on the number of churches for the whole run (`--limit`); a falsy value
// (undefined or 0) means no cap.
limit?: number;
// Delay between requests, in milliseconds.
delay: number;
// With `--all`: country code to resume from (`--resume-from PL`); countries
// listed before it are skipped, and the run aborts if the code is not in the listing.
resumeFrom?: string;
}
// ─── HTTP Fetching ───────────────────────────────────────────────────────────
// Module-wide tally of HTTP requests, printed as "Total HTTP requests made" in
// the overall summary. Presumably incremented by fetchPage — that code is not
// fully visible here.
let requestCount = 0;
async function fetchPage(url: string, delayMs: number): Promise
const h1Match = html.match(/
([^<]+)<\/h1>/);
if (!h1Match) return null;
const name = h1Match[1].trim();
// Extract local name from the <h2> heading
const h2Match = html.match(/
([^<]+)<\/h2>/);
const localName = h2Match ? h2Match[1].trim() : undefined;
// Extract Plus Code - it's in a link with onclick containing google maps
// Pattern: onclick="window.open('https://www.google.com/maps/search/?api=1&query=PLUSCODE','_blank')"
// The Plus Code text is like: >8PFRW9FF+C2<
let plusCode: string | null = null;
// Try the onclick pattern first
const plusCodeOnclickMatch = html.match(/onclick="window\.open\('https:\/\/www\.google\.com\/maps\/search\/\?api=1&(?:amp;)?query=([^']+)'/);
if (plusCodeOnclickMatch) {
plusCode = decodeURIComponent(plusCodeOnclickMatch[1]);
}
// Fallback: look for Plus Code pattern in text (format: XXXX+XX or longer)
if (!plusCode) {
const plusCodeTextMatch = html.match(/title="Plus Code">([A-Z0-9+]+)<\/a>/);
if (plusCodeTextMatch) {
plusCode = plusCodeTextMatch[1];
}
}
// Another fallback: look for the code near "Location:" label
if (!plusCode) {
const locationMatch = html.match(/Location:.*?>([2-9A-HJ-NP-Z][2-9A-HJ-NP-Z0-9]{3,7}\+[2-9A-HJ-NP-Z0-9]{2,3}));
if (locationMatch) {
plusCode = locationMatch[1];
}
}
if (!plusCode) {
return null; // Can't geolocate without Plus Code
}
// Decode Plus Code to lat/lng
let lat: number, lng: number;
try {
const decoded = olc.decode(plusCode);
lat = decoded.latitudeCenter;
lng = decoded.longitudeCenter;
} catch {
return null; // Invalid Plus Code
}
// Extract GCatholic ID from URL
const idMatch = url.match(/\/(\d+)(?:\.htm)?$/);
const gcatholicId = idMatch ? idMatch[1] : '';
// Extract labeled fields using the consistent pattern
const getField = (label: string): string | undefined => {
// Pattern: Label: TEXT or TEXT
const escaped = label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const regex = new RegExp(`${escaped}:?\\s*\\s*(.+?)(?:
, or --all');
console.error('Usage:');
console.error(' npx tsx scripts/import-gcatholic.ts --country CN');
console.error(' npx tsx scripts/import-gcatholic.ts --country CN --dry-run');
console.error(' npx tsx scripts/import-gcatholic.ts --diocese peki0');
console.error(' npx tsx scripts/import-gcatholic.ts --all');
console.error(' npx tsx scripts/import-gcatholic.ts --all --limit 500');
console.error(' npx tsx scripts/import-gcatholic.ts --all --resume-from PL');
process.exit(1);
}
if (args.dryRun) {
console.log('\n*** DRY RUN MODE — no changes will be written to database ***\n');
}
console.log(`Delay between requests: ${args.delay}ms`);
if (args.limit) console.log(`Limit: ${args.limit} churches`);
try {
const existingChurches = await loadExistingChurches();
const globalLimit = args.limit ? { remaining: args.limit } : undefined;
if (args.diocese) {
// Single diocese mode
const stats: ImportStats = {
churchesFound: 0,
newChurchesCreated: 0,
existingChurchesMerged: 0,
skipped: 0,
errors: 0,
errorDetails: [],
};
await importDiocese(args.diocese, args.diocese, args.country, existingChurches, args, stats, globalLimit);
printSummary(`Diocese ${args.diocese}`, stats, args.dryRun);
} else if (args.country) {
// Single country mode
const stats = await importCountry(args.country, existingChurches, args, globalLimit);
printSummary(args.country, stats, args.dryRun);
} else if (args.all) {
// All countries mode — discover from GCatholic
let countries = await discoverCountries(args.delay);
if (countries.length === 0) {
console.error('Failed to discover countries');
process.exit(1);
}
// Handle --resume-from
if (args.resumeFrom) {
const idx = countries.indexOf(args.resumeFrom);
if (idx === -1) {
console.error(`Country ${args.resumeFrom} not found in GCatholic listing`);
process.exit(1);
}
console.log(`Resuming from ${args.resumeFrom} (skipping ${idx} countries)\n`);
countries = countries.slice(idx);
}
console.log(`Will process ${countries.length} countries\n`);
const totalStats: ImportStats = {
churchesFound: 0,
newChurchesCreated: 0,
existingChurchesMerged: 0,
skipped: 0,
errors: 0,
errorDetails: [],
};
let countriesProcessed = 0;
for (const countryCode of countries) {
if (globalLimit && globalLimit.remaining <= 0) {
console.log(`\nGlobal limit reached, stopping.`);
break;
}
const stats = await importCountry(countryCode, existingChurches, args, globalLimit);
printSummary(countryCode, stats, args.dryRun);
// Aggregate
totalStats.churchesFound += stats.churchesFound;
totalStats.newChurchesCreated += stats.newChurchesCreated;
totalStats.existingChurchesMerged += stats.existingChurchesMerged;
totalStats.skipped += stats.skipped;
totalStats.errors += stats.errors;
totalStats.errorDetails.push(...stats.errorDetails);
countriesProcessed++;
// Small extra delay between countries
await new Promise((resolve) => setTimeout(resolve, 2000));
}
// Overall summary
console.log(`\n${'='.repeat(60)}`);
console.log(`OVERALL SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
console.log(`${'='.repeat(60)}`);
console.log(`Countries processed: ${countriesProcessed}`);
console.log(`Total churches found: ${totalStats.churchesFound}`);
console.log(`Total new churches created: ${totalStats.newChurchesCreated}`);
console.log(`Total merged with existing: ${totalStats.existingChurchesMerged}`);
console.log(`Total skipped: ${totalStats.skipped}`);
if (totalStats.errors > 0) {
console.log(`Total errors: ${totalStats.errors}`);
}
console.log(`Total HTTP requests made: ${requestCount}`);
console.log(`${'='.repeat(60)}\n`);
if (totalStats.errorDetails.length > 0 && totalStats.errorDetails.length <= 50) {
console.log('\nError details:');
totalStats.errorDetails.forEach((e) => console.log(` - ${e}`));
} else if (totalStats.errorDetails.length > 50) {
console.log(`\nFirst 50 errors (of ${totalStats.errorDetails.length}):`);
totalStats.errorDetails.slice(0, 50).forEach((e) => console.log(` - ${e}`));
}
}
await completeJob(jobId);
} catch (error) {
console.error('Fatal error:', error);
await completeJob(jobId, String(error));
process.exit(1);
} finally {
await prisma.$disconnect();
await pool.end();
}
}
// main() only catches errors raised inside its own try block. A rejection from
// anywhere else (for example the cleanup awaited in its `finally`) would leave
// this promise floating, so handle it explicitly and exit non-zero, matching
// main()'s own fatal-error path.
main().catch((error: unknown) => {
  console.error('Fatal error:', error);
  process.exit(1);
});