835 lines
26 KiB
TypeScript
835 lines
26 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
 * Import Catholic churches from GCatholic.org
 *
 * GCatholic is a comprehensive Catholic directory organized by diocese.
 * Each church page includes a Google Plus Code (→ lat/lng), address, phone, website, etc.
 * This script discovers churches via country → diocese → church page navigation.
 *
 * Usage:
 *   npx tsx scripts/import-gcatholic.ts --country CN
 *   npx tsx scripts/import-gcatholic.ts --country CN --dry-run
 *   npx tsx scripts/import-gcatholic.ts --diocese peki0
 *   npx tsx scripts/import-gcatholic.ts --all
 *   npx tsx scripts/import-gcatholic.ts --all --limit 100
 *   npx tsx scripts/import-gcatholic.ts --all --resume-from PL
 *
 * Additional options:
 *   --delay <ms>    Pause between HTTP requests (default: 1500).
 *   --job-id <id>   Background job record to mark as running/completed/failed.
 */
|
||
|
|
|
||
|
|
// Load .env for database connection (before importing anything that uses process.env)
|
||
|
|
import dotenv from 'dotenv';
|
||
|
|
import path from 'path';
|
||
|
|
|
||
|
|
// Load `.env.local` first, then `.env`, both resolved against the current working directory.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}
|
||
|
|
|
||
|
|
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
|
||
|
|
/**
 * Returns `url` with its password replaced by `***` so the value can be logged.
 *
 * The URL is parsed rather than pattern-matched so that passwords containing
 * `:` are hidden completely.
 */
function maskDbPassword(url: string): string {
  try {
    const parsed = new URL(url);
    if (parsed.password) {
      parsed.password = '***';
    }
    return parsed.toString();
  } catch {
    // Not parseable as a URL: hide everything between `//` and the last `@`.
    return url.replace(/\/\/.*@/, '//***@');
  }
}

// Connection string from the environment, with a local development fallback.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
console.log(`Connecting to database: ${maskDbPassword(dbUrl)}`);

// Dedicated pool for this script; certificate verification is relaxed only
// when the connection string mentions "neon".
const pool = new Pool({
  connectionString: dbUrl,
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||
|
|
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||
|
|
|
||
|
|
// Plus Code decoder (package loaded via `require`, hence the lint suppression).
// eslint-disable-next-line @typescript-eslint/no-require-imports
const OpenLocationCode = require('open-location-code').OpenLocationCode;

// Shared decoder instance used when parsing church pages.
const olc = new OpenLocationCode();
|
||
|
|
|
||
|
|
// ─── Constants ───────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Site root; every page URL requested by this script is built from it. */
const BASE_URL = 'https://www.gcatholic.org';

/** Value sent in the `User-Agent` header of every request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Pause between consecutive requests, in milliseconds, when `--delay` is not given. */
const DEFAULT_DELAY_MS = 1500;
|
||
|
|
|
||
|
|
// ─── Types ───────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Structured data extracted from a single GCatholic church page. */
interface GCatholicChurch {
  /** Trailing numeric segment of the page URL (empty string if the URL has none). */
  gcatholicId: string;
  /** Text of the page's `<h1>`. */
  name: string;
  /** Text of the page's `<h2>`, when present. */
  localName?: string;
  /** Latitude of the centre of the decoded Plus Code area. */
  lat: number;
  /** Longitude of the centre of the decoded Plus Code area. */
  lng: number;
  /** Value of the "Address" labelled field. */
  address?: string;
  /** First component of the page's `<h3>` location line. */
  city?: string;
  /** Region component of the `<h3>` location line, when present. */
  state?: string;
  /** Two-letter country code: supplied by the caller or read from a country link. */
  country?: string;
  /** Value of the "Telephone" labelled field. */
  phone?: string;
  /** Link target of the "Website" labelled field (only kept when it starts with `http`). */
  website?: string;
  /** Value of the "Jurisdiction" labelled field. */
  diocese?: string;
  /** Text found in the "Type" labelled field. */
  churchType?: string;
  /** Plus Code string the coordinates were decoded from. */
  plusCode: string;
  /** URL of the page this record was parsed from. */
  sourceUrl: string;
}
|
||
|
|
|
||
|
|
/** Running counters for one import scope (a diocese, a country, or the whole run). */
interface ImportStats {
  /** Church pages that were fetched and parsed successfully. */
  churchesFound: number;
  /** Churches inserted as new records (or that would be, in dry-run mode). */
  newChurchesCreated: number;
  /** Existing records that matched an imported church and were (or would be) updated. */
  existingChurchesMerged: number;
  /** Pages that could not be parsed, plus duplicates that had nothing new to merge. */
  skipped: number;
  /** Number of failed fetches and processing exceptions. */
  errors: number;
  /** One human-readable message per counted error. */
  errorDetails: string[];
}
|
||
|
|
|
||
|
|
/** Options accepted on the command line. */
interface CLIArgs {
  /** `--country`: country code to import (upper-cased when parsed). */
  country?: string;
  /** `--all`: import every country discovered on the site. */
  all: boolean;
  /** `--diocese`: code of a single diocese to import. */
  diocese?: string;
  /** `--dry-run`: report what would happen without writing to the database. */
  dryRun: boolean;
  /** `--limit`: maximum number of churches to process in this run. */
  limit?: number;
  /** `--delay`: pause between HTTP requests, in milliseconds. */
  delay: number;
  /** `--resume-from`: country code at which an `--all` run starts (upper-cased when parsed). */
  resumeFrom?: string;
}
|
||
|
|
|
||
|
|
// ─── HTTP Fetching ───────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Number of HTTP requests issued so far; also decides whether a delay precedes the next one. */
let requestCount = 0;

/** Upper bound, in milliseconds, for one request including reading its body. */
const FETCH_TIMEOUT_MS = 30000;

/**
 * Fetches a page as text, pausing `delayMs` before every request except the first.
 *
 * The request is aborted after {@link FETCH_TIMEOUT_MS} so that a stalled
 * connection cannot hang the whole import.
 *
 * @param url - Absolute URL to fetch.
 * @param delayMs - Pause before the request, in milliseconds.
 * @returns The response body, or `null` for any non-2xx status, network error or timeout.
 *   A 404 returns `null` silently; other failures are logged to stderr.
 */
async function fetchPage(url: string, delayMs: number): Promise<string | null> {
  // Rate limit: every request after the first waits before going out.
  if (requestCount > 0) {
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  requestCount++;

  const controller = new AbortController();
  const timeoutHandle = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'en-US,en;q=0.9',
      },
      signal: controller.signal,
    });

    if (response.status === 404) {
      return null; // Expected for some pages
    }
    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }

    // The abort signal also covers the body read, so a stalled download times out too.
    return await response.text();
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  } finally {
    clearTimeout(timeoutHandle);
  }
}
|
||
|
|
|
||
|
|
// ─── HTML Parsing ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Lists the country codes linked from the GCatholic dioceses index page.
 *
 * Recognised links look like `country/XX`, `../country/XX` or
 * `/dioceses/country/XX`, each with an optional `.htm` suffix.
 *
 * @param delayMs - Pause passed through to `fetchPage`.
 * @returns Unique two-letter codes in sorted order; empty when the page cannot be fetched.
 */
async function discoverCountries(delayMs: number): Promise<string[]> {
  console.log('Discovering countries from GCatholic...');

  const page = await fetchPage(`${BASE_URL}/dioceses/`, delayMs);
  if (!page) {
    console.error('Failed to fetch countries page');
    return [];
  }

  const countryLink = /href="(?:\.\.\/|\/dioceses\/)?country\/([A-Z]{2})(?:\.htm)?"/g;
  const uniqueCodes = new Set<string>();
  for (const hit of page.matchAll(countryLink)) {
    uniqueCodes.add(hit[1]);
  }

  const sortedCodes = [...uniqueCodes].sort();
  console.log(`Found ${sortedCodes.length} countries`);
  return sortedCodes;
}
|
||
|
|
|
||
|
|
/**
 * Lists the dioceses linked from a country page.
 *
 * A link is recognised when its `href` ends in `diocese/{code}` (optionally
 * prefixed by `../` and/or `../dioceses/`, optionally suffixed by `.htm`);
 * the link text is taken as the diocese name.
 *
 * @param countryCode - Country code used to build the page URL.
 * @param delayMs - Pause passed through to `fetchPage`.
 * @returns One entry per distinct code, in order of first appearance; empty when the fetch fails.
 */
async function discoverDioceses(countryCode: string, delayMs: number): Promise<{ code: string; name: string }[]> {
  const page = await fetchPage(`${BASE_URL}/dioceses/country/${countryCode}.htm`, delayMs);
  if (!page) {
    return [];
  }

  const dioceseLink = /href="(?:\.\.\/)?(?:\.\.\/dioceses\/)?diocese\/([a-z0-9]+)(?:\.htm)?"[^>]*>([^<]+)</g;

  // A Map keeps insertion order, so the first occurrence of each code wins.
  const byCode = new Map<string, { code: string; name: string }>();
  for (const [, code, linkText] of page.matchAll(dioceseLink)) {
    if (byCode.has(code)) continue;
    byCode.set(code, { code, name: linkText.trim() });
  }

  return [...byCode.values()];
}
|
||
|
|
|
||
|
|
/**
 * Collects the church page URLs linked from a diocese page.
 *
 * Links of the form `churches/{region}/{id}` (any number of leading `../`,
 * optional `.htm`) are normalised to `{BASE_URL}/churches/{region}/{id}.htm`.
 *
 * @param dioceseCode - Diocese code used to build the page URL.
 * @param delayMs - Pause passed through to `fetchPage`.
 * @returns Distinct absolute URLs in order of first appearance; empty when the fetch fails.
 */
async function discoverChurchLinks(dioceseCode: string, delayMs: number): Promise<string[]> {
  const page = await fetchPage(`${BASE_URL}/dioceses/diocese/${dioceseCode}.htm`, delayMs);
  if (!page) {
    return [];
  }

  const churchLink = /href="(?:\.\.\/)*churches\/([a-z0-9-]+\/\d+)(?:\.htm)?"/g;
  const absoluteUrls = Array.from(
    page.matchAll(churchLink),
    (hit) => `${BASE_URL}/churches/${hit[1]}.htm`,
  );

  // Deduplicate while preserving first-seen order.
  return [...new Set(absoluteUrls)];
}
|
||
|
|
|
||
|
|
/** Named HTML entities decoded by {@link decodeHtmlEntities}; `nbsp` becomes a plain space. */
const HTML_NAMED_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/**
 * Decodes numeric (`&#39;`, `&#x27;`) and common named (`&amp;`, `&quot;`, …)
 * HTML entities in one pass. Unknown or out-of-range entities are left as-is.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (entity: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
    return HTML_NAMED_ENTITIES.get(body.toLowerCase()) ?? entity;
  });
}

/**
 * Returns every Plus Code candidate found on a church page, most reliable first:
 * the Google Maps `onclick` query, the text of the `title="Plus Code"` link, and
 * a code-shaped string following a "Location:" label.
 */
function findPlusCodeCandidates(html: string): string[] {
  const candidates: string[] = [];

  // Pattern: onclick="window.open('https://www.google.com/maps/search/?api=1&query=PLUSCODE','_blank')"
  const onclickMatch = html.match(
    /onclick="window\.open\('https:\/\/www\.google\.com\/maps\/search\/\?api=1&(?:amp;)?query=([^']+)'/,
  );
  if (onclickMatch) {
    try {
      candidates.push(decodeURIComponent(onclickMatch[1]));
    } catch {
      // Malformed percent-encoding in the query; rely on the text-based patterns.
    }
  }

  // The Plus Code text is like: >8PFRW9FF+C2<
  const titledMatch = html.match(/title="Plus Code">([A-Z0-9+]+)<\/a>/);
  if (titledMatch) {
    candidates.push(titledMatch[1]);
  }

  const labelledMatch = html.match(
    /Location:.*?>([2-9A-HJ-NP-Z][2-9A-HJ-NP-Z0-9]{3,7}\+[2-9A-HJ-NP-Z0-9]{2,3})</,
  );
  if (labelledMatch) {
    candidates.push(labelledMatch[1]);
  }

  return candidates;
}

/**
 * Parses a single church page and extracts structured data.
 *
 * All extracted text has HTML entities decoded and surrounding whitespace
 * trimmed, so `Sts. Peter &amp; Paul` is returned as `Sts. Peter & Paul`.
 *
 * @param html - Raw HTML of the church page.
 * @param url - URL the page was fetched from; its trailing number becomes `gcatholicId`.
 * @param countryCode - Country to record; when omitted it is read from a country link on the page.
 * @returns The parsed church, or `null` when the page has no `<h1>` name or no
 *   Plus Code that can be decoded to coordinates.
 */
function parseChurchPage(html: string, url: string, countryCode?: string): GCatholicChurch | null {
  const cleanText = (raw: string): string => decodeHtmlEntities(raw).trim();

  // Church name from <h1>; a page without one is not a church page.
  const h1Match = html.match(/<h1>([^<]+)<\/h1>/);
  if (!h1Match) return null;
  const name = cleanText(h1Match[1]);

  // Local-language name from <h2>, when present.
  const h2Match = html.match(/<h2>([^<]+)<\/h2>/);
  const localName = h2Match ? cleanText(h2Match[1]) : undefined;

  // Use the first candidate that decodes; without coordinates the church cannot be geolocated.
  let located: { plusCode: string; lat: number; lng: number } | null = null;
  for (const candidate of findPlusCodeCandidates(html)) {
    try {
      const area = olc.decode(candidate);
      located = { plusCode: candidate, lat: area.latitudeCenter, lng: area.longitudeCenter };
      break;
    } catch {
      // Candidate rejected by the decoder; try the next one.
    }
  }
  if (!located) {
    return null;
  }

  // GCatholic ID is the trailing number of the page URL.
  const idMatch = url.match(/\/(\d+)(?:\.htm)?$/);
  const gcatholicId = idMatch ? idMatch[1] : '';

  // Labelled fields follow the pattern: <span class="label">Label: </span>TEXT or <a>TEXT</a>
  const getField = (label: string): string | undefined => {
    const escaped = label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const regex = new RegExp(`<span class="label">${escaped}:?\\s*</span>\\s*(.+?)(?:</p>|<br)`, 's');
    const match = html.match(regex);
    if (!match) return undefined;
    // Strip tags first, then decode entities, so encoded "<" is not mistaken for markup.
    return cleanText(match[1].replace(/<[^>]+>/g, '')) || undefined;
  };

  const address = getField('Address');
  const phone = getField('Telephone');
  const diocese = getField('Jurisdiction');

  // Website URL is the href of the <a> following the label; only absolute http(s) URLs are kept.
  let website: string | undefined;
  const websiteMatch = html.match(/<span class="label">Website:?\s*<\/span>\s*<a\s+href="([^"]+)"/);
  if (websiteMatch) {
    const href = cleanText(websiteMatch[1]);
    website = /^https?:\/\//i.test(href) ? href : undefined;
  }

  // Church type is the text of the element with a `ch?` class after the "Type" label.
  const typeMatch = html.match(/<span class="label">Type:?\s*<\/span>.*?class="ch[a-z]">([^<]+)/);
  const churchType = typeMatch ? cleanText(typeMatch[1]) : undefined;

  // Country: prefer the caller's value, otherwise the first country link on the page.
  let country = countryCode;
  if (!country) {
    const countryMatch = html.match(/href="[^"]*country\/([A-Z]{2})(?:\.htm)?"/);
    if (countryMatch) {
      country = countryMatch[1];
    }
  }

  // City and region from the <h3> line: "City, Region, Country".
  // The city is kept as-is since it may contain local language characters.
  let city: string | undefined;
  let state: string | undefined;
  const h3Match = html.match(
    /<h3>([^<]+?)(?:,\s*<span class="zregion">([^<]+)<\/span>)?(?:,\s*<a[^>]*class="zcountry"[^>]*>[^<]+<\/a>)?\s*<\/h3>/,
  );
  if (h3Match) {
    city = cleanText(h3Match[1]);
    state = h3Match[2] ? cleanText(h3Match[2]) : undefined;
  }

  return {
    gcatholicId,
    name,
    localName,
    lat: located.lat,
    lng: located.lng,
    address,
    city,
    state,
    country,
    phone,
    website,
    diocese,
    churchType,
    plusCode: located.plusCode,
    sourceUrl: url,
  };
}
|
||
|
|
|
||
|
|
// ─── CLI Argument Parsing ────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Parses the command-line arguments of this script (`process.argv.slice(2)`).
 *
 * Invalid input fails fast instead of silently changing the run:
 * - a flag that needs a value but is last, or is followed by another `--flag`, is rejected;
 * - `--limit` must be a positive integer (a non-numeric value used to disable the limit);
 * - `--delay` must be a non-negative integer (a non-numeric value used to remove the delay).
 *
 * `--job-id <id>` is handled elsewhere in this script, so it is skipped here together
 * with its value. Any other unrecognised argument is reported with a warning.
 *
 * @returns The parsed options, with defaults applied for omitted flags.
 * @throws Error when a flag is missing its value or a numeric value is invalid.
 */
function parseArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    delay: DEFAULT_DELAY_MS,
  };

  /** Returns the value that follows the flag at `flagIndex`. */
  const valueAfter = (flagIndex: number): string => {
    const value = argv[flagIndex + 1];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`Missing value for ${argv[flagIndex]}`);
    }
    return value;
  };

  /** Returns the value after the flag at `flagIndex` as an integer no smaller than `min`. */
  const integerAfter = (flagIndex: number, min: number): number => {
    const raw = valueAfter(flagIndex);
    const parsed = /^\d+$/.test(raw) ? Number(raw) : NaN;
    if (!Number.isSafeInteger(parsed) || parsed < min) {
      throw new Error(`Invalid value for ${argv[flagIndex]}: "${raw}" (expected an integer >= ${min})`);
    }
    return parsed;
  };

  for (let i = 0; i < argv.length; i++) {
    switch (argv[i]) {
      case '--country':
        result.country = valueAfter(i).toUpperCase();
        i++;
        break;
      case '--all':
        result.all = true;
        break;
      case '--diocese':
        result.diocese = valueAfter(i);
        i++;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--limit':
        result.limit = integerAfter(i, 1);
        i++;
        break;
      case '--delay':
        result.delay = integerAfter(i, 0);
        i++;
        break;
      case '--resume-from':
        result.resumeFrom = valueAfter(i).toUpperCase();
        i++;
        break;
      case '--job-id':
        // Consumed by the job-tracking code; skip the flag's value here.
        i++;
        break;
      default:
        // A misspelled flag (e.g. "--dryrun") must not go unnoticed.
        console.warn(`Ignoring unrecognized argument: ${argv[i]}`);
    }
  }

  return result;
}
|
||
|
|
|
||
|
|
// ─── Database Operations ─────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Reads every church row, restricted to the columns selected below, so that
 * imported churches can be compared against them for deduplication.
 *
 * @returns All existing churches, as returned by the database query.
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing churches for deduplication...');

  const existing = await prisma.church.findMany({
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,

      // Per-source identifiers
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,

      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${existing.length} existing churches`);
  return existing;
}
|
||
|
|
|
||
|
|
/**
 * Imports one parsed church: merges it into a matching existing record or creates a new one.
 *
 * - Dry run: only logs and counts what would happen; nothing is written.
 * - Duplicate found: fills in phone, website, address and diocese where the
 *   existing record has none. Counts as merged when something was written,
 *   otherwise as skipped.
 * - No duplicate: creates a record with source `gcatholic` and appends it to
 *   `existingChurches` so later churches in the same run are matched against it.
 *
 * After a merge the in-memory duplicate entry is updated as well. Without that,
 * a later church matching the same record would see the field as still empty
 * and overwrite the value that was just stored.
 *
 * @param church - Church parsed from a GCatholic page.
 * @param existingChurches - In-memory deduplication list; mutated by this function.
 * @param dryRun - When true, no database writes are performed.
 * @param stats - Counters updated in place.
 */
async function importChurch(
  church: GCatholicChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Build a candidate compatible with findDuplicateChurch (expects OSMChurch shape)
  const candidate = {
    osmId: `gcatholic-${church.gcatholicId}`,
    name: church.name,
    lat: church.lat,
    lng: church.lng,
    address: church.address,
    city: church.city,
    state: church.state,
    country: church.country,
    phone: church.phone,
    website: church.website,
    diocese: church.diocese,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      console.log(` [MERGE] ${church.name} → existing: ${duplicate.name} (${duplicate.id})`);
      stats.existingChurchesMerged++;
    } else {
      console.log(` [NEW] ${church.name} (${church.lat.toFixed(4)}, ${church.lng.toFixed(4)})`);
      stats.newChurchesCreated++;
    }
    return;
  }

  if (duplicate) {
    // Merge: fill in missing fields only.
    const phoneToFill = !duplicate.phone && church.phone ? church.phone : undefined;
    const websiteToFill = !duplicate.website && church.website ? church.website : undefined;
    const addressToFill = !duplicate.address && church.address ? church.address : undefined;

    const updateData: Record<string, unknown> = {};
    if (phoneToFill) updateData.phone = phoneToFill;
    if (websiteToFill) {
      updateData.website = websiteToFill;
      updateData.hasWebsite = true;
    }
    if (addressToFill) updateData.address = addressToFill;

    // The diocese is not part of the in-memory entry, so read it from the DB record.
    const dbRecord = await prisma.church.findUnique({
      where: { id: duplicate.id },
      select: { diocese: true },
    });
    if (dbRecord && !dbRecord.diocese && church.diocese) {
      updateData.diocese = church.diocese;
    }

    if (Object.keys(updateData).length === 0) {
      stats.skipped++;
      return;
    }

    await prisma.church.update({
      where: { id: duplicate.id },
      data: updateData,
    });

    // Keep the dedup cache in sync with what was just written.
    // NOTE(review): assumes findDuplicateChurch returns the element held in
    // existingChurches rather than a copy — confirm against church-matcher.
    if (phoneToFill) duplicate.phone = phoneToFill;
    if (websiteToFill) duplicate.website = websiteToFill;
    if (addressToFill) duplicate.address = addressToFill;

    stats.existingChurchesMerged++;
    return;
  }

  // Create new church
  const newChurch = await prisma.church.create({
    data: {
      name: church.name,
      latitude: church.lat,
      longitude: church.lng,
      address: church.address,
      city: church.city,
      state: church.state,
      country: church.country,
      phone: church.phone,
      website: church.website,
      hasWebsite: !!church.website,
      source: 'gcatholic',
      diocese: church.diocese,
    },
  });
  stats.newChurchesCreated++;

  // Add to existing list for future dedup within this run
  existingChurches.push({
    id: newChurch.id,
    name: church.name,
    latitude: church.lat,
    longitude: church.lng,
    osmId: null,
    baiduId: null,
    masstimesId: null,
    orarimesseId: null,
    massSchedulesPhId: null,
    philmassId: null,
    horariosMisasId: null,
    mszeInfoId: null,
    weekdayMassesId: null,
    messesInfoId: null,
    bohosluzbyId: null,
    miserendId: null,
    kerknetId: null,
    gottesdienstzeitenId: null,
    discovermassId: null,
    source: 'gcatholic',
    website: church.website || null,
    phone: church.phone || null,
    address: church.address || null,
  });
}
|
||
|
|
|
||
|
|
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Imports every church page linked from one diocese page.
 *
 * Each page is fetched, parsed and handed to `importChurch`. A failed fetch or
 * a thrown error is counted as an error, and an unparseable page as skipped;
 * processing then continues with the next page.
 *
 * The one-line summary printed at the end is the difference between `stats`
 * before and after the loop, so it also includes churches that `importChurch`
 * counted as skipped. It is printed even when the run stops at the global limit.
 *
 * @param dioceseCode - Diocese code used to discover church links.
 * @param dioceseName - Name used in log output only.
 * @param countryCode - Country recorded on each church; may be undefined.
 * @param existingChurches - Deduplication list shared across the run.
 * @param args - Parsed CLI options (`delay` and `dryRun` are used here).
 * @param stats - Counters updated in place.
 * @param globalLimit - Optional shared budget; decremented once per successfully parsed church.
 */
async function importDiocese(
  dioceseCode: string,
  dioceseName: string,
  countryCode: string | undefined,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
  globalLimit?: { remaining: number },
): Promise<void> {
  const churchUrls = await discoverChurchLinks(dioceseCode, args.delay);
  if (churchUrls.length === 0) {
    return;
  }

  console.log(` Diocese ${dioceseName} (${dioceseCode}): ${churchUrls.length} church pages found`);

  // Snapshot the shared counters so this diocese's share can be reported as a delta.
  const before = {
    created: stats.newChurchesCreated,
    merged: stats.existingChurchesMerged,
    skipped: stats.skipped,
    errors: stats.errors,
  };

  for (const url of churchUrls) {
    if (globalLimit && globalLimit.remaining <= 0) {
      console.log(` Limit reached, stopping`);
      break;
    }

    try {
      const html = await fetchPage(url, args.delay);
      if (!html) {
        stats.errors++;
        stats.errorDetails.push(`Failed to fetch: ${url}`);
        continue;
      }

      const church = parseChurchPage(html, url, countryCode);
      if (!church) {
        stats.skipped++;
        continue;
      }

      stats.churchesFound++;
      await importChurch(church, existingChurches, args.dryRun, stats);

      if (globalLimit) globalLimit.remaining--;
    } catch (error) {
      stats.errors++;
      const msg = error instanceof Error ? error.message : String(error);
      stats.errorDetails.push(`${url}: ${msg}`);
      console.error(` Error processing ${url}: ${msg}`);
    }
  }

  const dioceseSkipped = stats.skipped - before.skipped;
  const dioceseErrors = stats.errors - before.errors;
  const parts = [
    `${stats.newChurchesCreated - before.created} new`,
    `${stats.existingChurchesMerged - before.merged} merged`,
  ];
  if (dioceseSkipped > 0) parts.push(`${dioceseSkipped} skipped`);
  if (dioceseErrors > 0) parts.push(`${dioceseErrors} errors`);
  console.log(` → ${parts.join(', ')}`);
}
|
||
|
|
|
||
|
|
/**
 * Imports every diocese of one country and returns the counters for that country.
 *
 * Dioceses are processed in the order they were discovered; the loop ends
 * early once the shared limit, if any, has no budget left.
 *
 * @param countryCode - Country whose dioceses are imported.
 * @param existingChurches - Deduplication list shared across the run.
 * @param args - Parsed CLI options.
 * @param globalLimit - Optional shared budget of churches still allowed.
 * @returns Fresh counters covering only this country.
 */
async function importCountry(
  countryCode: string,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  globalLimit?: { remaining: number },
): Promise<ImportStats> {
  const countryStats: ImportStats = {
    churchesFound: 0,
    newChurchesCreated: 0,
    existingChurchesMerged: 0,
    skipped: 0,
    errors: 0,
    errorDetails: [],
  };

  const banner = '='.repeat(60);
  console.log(`\n${banner}`);
  console.log(`Importing from GCatholic: ${countryCode}`);
  console.log(banner);

  const discovered = await discoverDioceses(countryCode, args.delay);
  if (discovered.length === 0) {
    console.log(`No dioceses found for ${countryCode}`);
    return countryStats;
  }
  console.log(`Found ${discovered.length} dioceses in ${countryCode}`);

  for (const { code, name } of discovered) {
    const budgetExhausted = globalLimit != null && globalLimit.remaining <= 0;
    if (budgetExhausted) break;

    await importDiocese(code, name, countryCode, existingChurches, args, countryStats, globalLimit);
  }

  return countryStats;
}
|
||
|
|
|
||
|
|
// ─── Summary Printing ────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Prints the counters of one import scope to stdout, framed by horizontal rules.
 * The error line is shown only when at least one error was counted.
 *
 * @param label - Name of the scope shown in the heading.
 * @param stats - Counters to print.
 * @param dryRun - Adds a "(DRY RUN)" marker to the heading when true.
 */
function printSummary(label: string, stats: ImportStats, dryRun: boolean): void {
  const rule = '─'.repeat(60);
  const dryRunMarker = dryRun ? '(DRY RUN)' : '';

  const report: string[] = [
    `\n${rule}`,
    `Summary: ${label} ${dryRunMarker}`,
    rule,
    `Churches found on GCatholic: ${stats.churchesFound}`,
    `New churches created: ${stats.newChurchesCreated}`,
    `Merged with existing: ${stats.existingChurchesMerged}`,
    `Skipped (no data/dup): ${stats.skipped}`,
  ];
  if (stats.errors > 0) {
    report.push(`Errors: ${stats.errors}`);
  }
  report.push(rule);

  // One console.log call per line, matching the line-by-line output format.
  for (const line of report) {
    console.log(line);
  }
}
|
||
|
|
|
||
|
|
// ─── Job Tracking ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Marks the background job named by `--job-id <id>` as running.
 *
 * @param args - Raw command-line arguments (without the node/script prefix).
 * @returns The job id, or `null` when `--job-id` was not passed (no database access happens).
 * @throws Error when `--job-id` is present but not followed by a value; errors
 *   from the database update are propagated unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) {
    return null;
  }

  // Reject a missing value up front rather than sending `undefined` (or the
  // next flag) to the database as the job id.
  const jobId = args[flagIndex + 1];
  if (jobId === undefined || jobId.startsWith('--')) {
    throw new Error('--job-id requires a value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||
|
|
|
||
|
|
/**
 * Records the final state of a background job.
 *
 * The job becomes `failed` when an error message is given and `completed`
 * otherwise. A failure to write the update is logged and swallowed, so this
 * function never rejects.
 *
 * @param jobId - Job to update; `null` makes the call a no-op.
 * @param error - Error message to store; omitted or empty means success.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (!jobId) {
    return;
  }

  const finalStatus = error ? 'failed' : 'completed';
  const storedError = error || null;

  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        status: finalStatus,
        error: storedError,
        completedAt: new Date(),
      },
    });
  } catch (updateFailure) {
    console.error(`Failed to update job ${jobId}:`, updateFailure);
  }
}
|
||
|
|
|
||
|
|
// ─── Main ────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Returns a zeroed set of import counters. */
function createEmptyStats(): ImportStats {
  return {
    churchesFound: 0,
    newChurchesCreated: 0,
    existingChurchesMerged: 0,
    skipped: 0,
    errors: 0,
    errorDetails: [],
  };
}

/** Prints the invalid-usage message and the example invocations to stderr. */
function printUsageError(): void {
  console.error('Error: Must specify --country <ISO2>, --diocese <code>, or --all');
  console.error('Usage:');
  console.error(' npx tsx scripts/import-gcatholic.ts --country CN');
  console.error(' npx tsx scripts/import-gcatholic.ts --country CN --dry-run');
  console.error(' npx tsx scripts/import-gcatholic.ts --diocese peki0');
  console.error(' npx tsx scripts/import-gcatholic.ts --all');
  console.error(' npx tsx scripts/import-gcatholic.ts --all --limit 500');
  console.error(' npx tsx scripts/import-gcatholic.ts --all --resume-from PL');
}

/**
 * Imports every country listed on GCatholic (optionally starting at
 * `args.resumeFrom`), printing a summary per country and an overall summary.
 *
 * @throws Error when no countries can be discovered or `args.resumeFrom` is not
 *   among them, so the caller can record the failure and release resources.
 */
async function importAllCountries(
  args: CLIArgs,
  existingChurches: ExistingChurch[],
  globalLimit?: { remaining: number },
): Promise<void> {
  let countries = await discoverCountries(args.delay);
  if (countries.length === 0) {
    throw new Error('Failed to discover countries');
  }

  // Handle --resume-from
  if (args.resumeFrom) {
    const idx = countries.indexOf(args.resumeFrom);
    if (idx === -1) {
      throw new Error(`Country ${args.resumeFrom} not found in GCatholic listing`);
    }
    console.log(`Resuming from ${args.resumeFrom} (skipping ${idx} countries)\n`);
    countries = countries.slice(idx);
  }

  console.log(`Will process ${countries.length} countries\n`);

  const totalStats = createEmptyStats();
  let countriesProcessed = 0;

  for (const countryCode of countries) {
    if (globalLimit && globalLimit.remaining <= 0) {
      console.log(`\nGlobal limit reached, stopping.`);
      break;
    }

    const stats = await importCountry(countryCode, existingChurches, args, globalLimit);
    printSummary(countryCode, stats, args.dryRun);

    // Aggregate
    totalStats.churchesFound += stats.churchesFound;
    totalStats.newChurchesCreated += stats.newChurchesCreated;
    totalStats.existingChurchesMerged += stats.existingChurchesMerged;
    totalStats.skipped += stats.skipped;
    totalStats.errors += stats.errors;
    totalStats.errorDetails.push(...stats.errorDetails);
    countriesProcessed++;

    // Small extra delay between countries
    await new Promise((resolve) => setTimeout(resolve, 2000));
  }

  // Overall summary
  console.log(`\n${'='.repeat(60)}`);
  console.log(`OVERALL SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log(`${'='.repeat(60)}`);
  console.log(`Countries processed: ${countriesProcessed}`);
  console.log(`Total churches found: ${totalStats.churchesFound}`);
  console.log(`Total new churches created: ${totalStats.newChurchesCreated}`);
  console.log(`Total merged with existing: ${totalStats.existingChurchesMerged}`);
  console.log(`Total skipped: ${totalStats.skipped}`);
  if (totalStats.errors > 0) {
    console.log(`Total errors: ${totalStats.errors}`);
  }
  console.log(`Total HTTP requests made: ${requestCount}`);
  console.log(`${'='.repeat(60)}\n`);

  // Print at most the first 50 error messages.
  const details = totalStats.errorDetails;
  if (details.length > 0) {
    console.log(details.length > 50 ? `\nFirst 50 errors (of ${details.length}):` : '\nError details:');
    details.slice(0, 50).forEach((e) => console.log(` - ${e}`));
  }
}

/**
 * Script entry point: parses arguments, runs the requested import mode and
 * records the outcome on the background job, if one was given.
 *
 * Failures set `process.exitCode` instead of calling `process.exit()`, so the
 * `finally` block always closes the database connections and a tracked job is
 * always moved out of the `running` state.
 */
async function main(): Promise<void> {
  let jobId: string | null = null;

  try {
    jobId = await createOrResumeJob(process.argv.slice(2));
    const args = parseArgs();

    if (!args.country && !args.all && !args.diocese) {
      printUsageError();
      await completeJob(jobId, 'Invalid arguments: must specify --country, --diocese, or --all');
      process.exitCode = 1;
      return;
    }

    if (args.dryRun) {
      console.log('\n*** DRY RUN MODE — no changes will be written to database ***\n');
    }

    console.log(`Delay between requests: ${args.delay}ms`);
    if (args.limit) console.log(`Limit: ${args.limit} churches`);

    const existingChurches = await loadExistingChurches();
    const globalLimit = args.limit ? { remaining: args.limit } : undefined;

    if (args.diocese) {
      // Single diocese mode
      const stats = createEmptyStats();
      await importDiocese(args.diocese, args.diocese, args.country, existingChurches, args, stats, globalLimit);
      printSummary(`Diocese ${args.diocese}`, stats, args.dryRun);
    } else if (args.country) {
      // Single country mode
      const stats = await importCountry(args.country, existingChurches, args, globalLimit);
      printSummary(args.country, stats, args.dryRun);
    } else if (args.all) {
      // All countries mode — discover from GCatholic
      await importAllCountries(args, existingChurches, globalLimit);
    }

    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

// main() handles its own failures; this only catches errors raised during cleanup.
main().catch((error: unknown) => {
  console.error('Unhandled error:', error);
  process.exitCode = 1;
});
|