#!/usr/bin/env tsx
/**
 * Import Catholic churches from OpenStreetMap
 *
 * Usage:
 *   npx tsx scripts/import-osm-churches.ts --country US
 *   npx tsx scripts/import-osm-churches.ts --all
 *   npx tsx scripts/import-osm-churches.ts --priority 1
 *   npx tsx scripts/import-osm-churches.ts --all --resume-from IT
 *   npx tsx scripts/import-osm-churches.ts --country MX --dry-run
 *   npx tsx scripts/import-osm-churches.ts --all --sort-by-count
 *
 * An optional `--job-id <id>` marks the matching backgroundJob row as running
 * at start-up and as completed/failed when the script finishes.
 */

// Load .env for database connection (before importing anything that uses process.env)
import dotenv from 'dotenv';
import path from 'path';

// Load .env.local first (production Neon URL), then .env (local fallback)
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Mask the password portion of the URL before logging it.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);

const pool = new Pool({
  connectionString: dbUrl,
  // SSL (without certificate verification) is enabled only when the URL mentions "neon".
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { queryOverpassByCountryWithFallback, type OSMChurch } from '../src/lib/overpass-client';
import { findDuplicateChurch, mergeChurchData } from '../src/lib/church-matcher';
import { parseServiceTimes } from '../src/lib/service-times-parser';

// Countries with significant Catholic populations, organized by priority
const CATHOLIC_COUNTRIES = {
  // Priority 1: Large Catholic populations
  priority1: [
    // North America
    'US', 'MX', 'CA',
    // South & Central America, Caribbean
    'BR', 'AR', 'CO', 'PE', 'VE', 'CL', 'EC', 'GT', 'CU', 'BO', 'DO', 'HT', 'HN', 'PY', 'SV', 'NI', 'CR', 'PA',
    'UY', 'GY', 'SR', 'GF',
    // Europe
    'IT', 'FR', 'ES', 'PL', 'DE', 'PT', 'BE', 'CZ', 'AT', 'HU', 'IE', 'HR', 'GB',
    // Asia, Oceania & Africa
    'PH', 'AU', 'NG', 'CD',
  ],
  // Priority 2: Medium Catholic populations
  priority2: [
    // Rest of Europe
    'NL', 'SK', 'SI', 'LT', 'CH', 'LU', 'MT', 'UA', 'RO', 'LV', 'BY',
    // Africa
    'AO', 'UG', 'TZ', 'KE', 'CM', 'RW', 'BI', 'MG', 'MW', 'ZM', 'ZW', 'MZ', 'BJ', 'TG', 'CI', 'BF', 'ML', 'NE',
    'SN', 'GN', 'LR', 'SL', 'GH', 'GA', 'CG', 'CF', 'TD', 'SD', 'ET', 'ER', 'SO',
    // Asia
    'IN', 'TL', 'VN', 'KR', 'JP', 'ID', 'MY', 'SG', 'TH', 'LA', 'KH', 'MM', 'CN', 'LK', 'BD', 'PK',
    // Middle East
    'LB', 'IL', 'PS', 'JO', 'SY', 'IQ',
    // Oceania
    'NZ', 'PG', 'FJ', 'NC', 'PF',
  ],
  // Priority 3: Smaller Catholic presence
  priority3: [
    // Caribbean & Central America (smaller islands)
    'BS', 'BB', 'JM', 'TT', 'GD', 'LC', 'VC', 'AG', 'DM', 'KN',
    // Europe (smaller countries + Balkans/Eastern)
    'MC', 'SM', 'VA', 'LI', 'AD', 'RS', 'BA', 'MK', 'AL', 'EE',
    // Caucasus + Russia
    'GE', 'AM', 'RU',
    // Africa (rest)
    'NA', 'BW', 'LS', 'SZ', 'MU', 'SC', 'KM', 'CV', 'ST', 'GQ', 'DJ', 'GM',
    // Asia (rest)
    'BT', 'NP', 'AF', 'KZ', 'UZ', 'TM', 'TJ', 'KG', 'MN', 'BN', 'MV',
    // Oceania (rest)
    'WS', 'TO', 'VU', 'SB', 'KI', 'NR', 'TV', 'FM', 'MH', 'PW',
  ],
};

/** Counters collected while importing one country (or summed across countries). */
interface ImportStats {
  osmChurchesFound: number;
  newChurchesInserted: number;
  existingUpdated: number;
  existingLinked: number;
  churchesWithWebsites: number;
  churchesWithoutWebsites: number;
  churchesWithServiceTimes: number;
  scheduleEntriesCreated: number;
  errors: number;
}

/** Returns a stats object with every counter set to zero. */
function createEmptyStats(): ImportStats {
  return {
    osmChurchesFound: 0,
    newChurchesInserted: 0,
    existingUpdated: 0,
    existingLinked: 0,
    churchesWithWebsites: 0,
    churchesWithoutWebsites: 0,
    churchesWithServiceTimes: 0,
    scheduleEntriesCreated: 0,
    errors: 0,
  };
}

/** Adds every counter of `source` onto the matching counter of `target`. */
function addStats(target: ImportStats, source: ImportStats): void {
  for (const key of Object.keys(source) as (keyof ImportStats)[]) {
    target[key] += source[key];
  }
}

/** Maps a day-of-week number to the massType label stored with each schedule entry. */
function massTypeForDay(dayOfWeek: number): 'Sunday' | 'Saturday' | 'Daily' {
  if (dayOfWeek === 0) return 'Sunday';
  if (dayOfWeek === 6) return 'Saturday';
  return 'Daily';
}

/**
 * Parse command line arguments
 *
 * Unknown flags, and value-taking flags given without a value, are ignored.
 * An out-of-range or non-numeric `--priority` prints an error and exits with code 1.
 */
function parseArgs(): {
  country?: string;
  all: boolean;
  dryRun: boolean;
  resumeFrom?: string;
  priority?: number;
  sortByCount: boolean;
} {
  const args = process.argv.slice(2);
  const result = {
    country: undefined as string | undefined,
    all: false,
    dryRun: false,
    resumeFrom: undefined as string | undefined,
    priority: undefined as number | undefined,
    sortByCount: false,
  };

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    const value = args[i + 1];

    if (flag === '--country' && value) {
      result.country = value.toUpperCase();
      i++;
    } else if (flag === '--all') {
      result.all = true;
    } else if (flag === '--dry-run') {
      result.dryRun = true;
    } else if (flag === '--resume-from' && value) {
      result.resumeFrom = value.toUpperCase();
      i++;
    } else if (flag === '--priority' && value) {
      const priority = parseInt(value, 10);
      // NaN fails both comparisons, so non-numeric input is rejected here too.
      if (priority >= 1 && priority <= 3) {
        result.priority = priority;
      } else {
        console.error('Error: --priority must be 1, 2, or 3');
        process.exit(1);
      }
      i++;
    } else if (flag === '--sort-by-count') {
      result.sortByCount = true;
    }
  }

  return result;
}

/**
 * Import churches from a single country
 *
 * Queries Overpass for the country, then for each result either updates the
 * matching existing church (same osmId), links a church matched by the
 * duplicate finder, or inserts a new row. Failures for a single church are
 * logged and counted in `stats.errors` without aborting the run.
 *
 * @param countryCode Country code passed to the Overpass query; also used as
 *   the `country` of new rows when the OSM record carries none.
 * @param dryRun When true, only logs a preview and website counts; no writes.
 * @returns The counters collected for this country.
 */
async function importFromOSM(countryCode: string, dryRun: boolean = false): Promise<ImportStats> {
  const stats = createEmptyStats();

  console.log(`\n${'='.repeat(60)}`);
  console.log(`Importing Catholic churches from ${countryCode}`);
  console.log(`${'='.repeat(60)}\n`);

  try {
    // Query Overpass API (with automatic fallback to regional bounding boxes)
    const osmChurches = await queryOverpassByCountryWithFallback(countryCode);
    stats.osmChurchesFound = osmChurches.length;

    if (osmChurches.length === 0) {
      console.log(`No churches found in ${countryCode}`);
      return stats;
    }

    console.log(`Found ${osmChurches.length} Catholic churches in ${countryCode}`);

    if (dryRun) {
      console.log('\n[DRY RUN] Would import the following churches:');
      for (const church of osmChurches.slice(0, 10)) {
        console.log(` - ${church.name} (${church.city || 'unknown city'})`);
        console.log(` OSM ID: ${church.osmId}, Website: ${church.website || 'none'}`);
      }
      if (osmChurches.length > 10) {
        console.log(` ... and ${osmChurches.length - 10} more`);
      }

      // Count websites
      stats.churchesWithWebsites = osmChurches.filter((c) => c.website).length;
      stats.churchesWithoutWebsites = osmChurches.length - stats.churchesWithWebsites;
      return stats;
    }

    // Fetch all existing churches for deduplication
    // For large datasets, we could optimize by fetching only churches in the same country/region
    console.log('Fetching existing churches for deduplication...');
    const existingChurches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        latitude: true,
        longitude: true,
        osmId: true,
        baiduId: true,
        masstimesId: true,
        orarimesseId: true,
        massSchedulesPhId: true,
        philmassId: true,
        horariosMisasId: true,
        mszeInfoId: true,
        weekdayMassesId: true,
        messesInfoId: true,
        bohosluzbyId: true,
        miserendId: true,
        kerknetId: true,
        gottesdienstzeitenId: true,
        discovermassId: true,
        source: true,
        website: true,
        phone: true,
        address: true,
      },
    });
    console.log(`Found ${existingChurches.length} existing churches in database`);

    type ChurchId = (typeof existingChurches)[number]['id'];

    // Parses an OSM service_times value and stores the resulting schedule rows
    // for the given church. Returns true when at least one row was created.
    const importServiceTimes = async (
      churchId: ChurchId,
      serviceTimes: NonNullable<OSMChurch['serviceTimes']>,
    ): Promise<boolean> => {
      const scheduleEntries = parseServiceTimes(serviceTimes);
      if (scheduleEntries.length === 0) return false;

      await prisma.massSchedule.createMany({
        data: scheduleEntries.map((entry) => ({
          churchId,
          dayOfWeek: entry.dayOfWeek,
          time: entry.time,
          massType: massTypeForDay(entry.dayOfWeek),
          language: 'Unknown',
          notes: 'From OSM service_times tag',
        })),
      });
      stats.churchesWithServiceTimes++;
      stats.scheduleEntriesCreated += scheduleEntries.length;
      return true;
    };

    // Tallies whether the OSM record carries a website.
    const countWebsite = (church: OSMChurch): void => {
      if (church.website) stats.churchesWithWebsites++;
      else stats.churchesWithoutWebsites++;
    };

    // Process churches one by one (no batch transactions to avoid rollbacks)
    let processed = 0;
    for (const osmChurch of osmChurches) {
      try {
        // Check for duplicate
        const duplicate = findDuplicateChurch(osmChurch, existingChurches);

        if (duplicate) {
          // Same osmId => "updated"; any other match (proximity/name) => "linked".
          const matchedByOsmId = duplicate.osmId === osmChurch.osmId;
          const mergedData = mergeChurchData(duplicate, osmChurch);

          // Verify the church exists in the database (not just in our temp list from this run)
          const existsInDb = await prisma.church.findUnique({
            where: { id: duplicate.id },
            select: { id: true },
          });

          if (existsInDb) {
            await prisma.church.update({
              where: { id: duplicate.id },
              data: mergedData,
            });
          }
          // Counted even when the row was not found, mirroring the "already processed" case.
          if (matchedByOsmId) stats.existingUpdated++;
          else stats.existingLinked++;

          // Import service_times for existing churches that don't have schedules yet
          if (existsInDb && osmChurch.serviceTimes) {
            const existingSchedules = await prisma.massSchedule.count({ where: { churchId: duplicate.id } });
            if (existingSchedules === 0) {
              await importServiceTimes(duplicate.id, osmChurch.serviceTimes);
            }
          }

          countWebsite(osmChurch);
        } else {
          // New church - insert it and capture the real ID
          const newChurch = await prisma.church.create({
            data: {
              name: osmChurch.name,
              latitude: osmChurch.lat,
              longitude: osmChurch.lng,
              address: osmChurch.address,
              city: osmChurch.city,
              state: osmChurch.state,
              zip: osmChurch.zip,
              country: osmChurch.country || countryCode,
              phone: osmChurch.phone,
              website: osmChurch.website,
              diocese: osmChurch.diocese,
              wheelchairAccess: osmChurch.wheelchairAccess ?? false,
              source: 'osm',
              osmId: osmChurch.osmId,
              hasWebsite: !!osmChurch.website,
              osmLastSyncedAt: new Date(),
            },
          });
          stats.newChurchesInserted++;
          countWebsite(osmChurch);

          // Parse service_times tag and create mass schedules
          if (osmChurch.serviceTimes) {
            const created = await importServiceTimes(newChurch.id, osmChurch.serviceTimes);
            if (created) {
              // Mark as scraped since we have schedule data
              await prisma.church.update({
                where: { id: newChurch.id },
                data: { lastScrapedAt: new Date() },
              });
            }
          }

          // Add to existing churches list for future deduplication in this run (use real DB ID)
          existingChurches.push({
            id: newChurch.id,
            name: osmChurch.name,
            latitude: osmChurch.lat,
            longitude: osmChurch.lng,
            osmId: osmChurch.osmId,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: null,
            discovermassId: null,
            source: 'osm',
            website: osmChurch.website || null,
            phone: osmChurch.phone || null,
            address: osmChurch.address || null,
          });
        }

        processed++;

        // Log progress every 500 churches
        if (processed % 500 === 0) {
          console.log(`Progress: ${processed}/${osmChurches.length} churches processed`);
        }
      } catch (error) {
        console.error(`Error processing church ${osmChurch.name}:`, error);
        stats.errors++;
      }
    }

    console.log(`\nProcessed all ${osmChurches.length} churches from ${countryCode}`);
  } catch (error) {
    console.error(`Failed to import from ${countryCode}:`, error);
    stats.errors++;
  }

  return stats;
}

/**
 * Print import summary
 *
 * Write-related counters are omitted in dry-run mode; the service-times and
 * error lines are printed only when their counters are non-zero.
 */
function printSummary(countryCode: string, stats: ImportStats, dryRun: boolean): void {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`Import Summary for ${countryCode} ${dryRun ? '(DRY RUN)' : ''}`);
  console.log(`${'='.repeat(60)}`);
  console.log(`OSM churches found: ${stats.osmChurchesFound}`);
  if (!dryRun) {
    console.log(`New churches inserted: ${stats.newChurchesInserted}`);
    console.log(`Existing churches updated: ${stats.existingUpdated} (matched by osmId)`);
    console.log(`Existing churches linked: ${stats.existingLinked} (matched by proximity)`);
  }
  console.log(`Churches with websites: ${stats.churchesWithWebsites}`);
  console.log(`Churches without websites: ${stats.churchesWithoutWebsites}`);
  if (!dryRun && stats.churchesWithServiceTimes > 0) {
    console.log(`Churches with service_times: ${stats.churchesWithServiceTimes}`);
    console.log(`Schedule entries created: ${stats.scheduleEntriesCreated}`);
  }
  if (!dryRun && stats.errors > 0) {
    console.log(`Errors encountered: ${stats.errors}`);
  }
  console.log(`${'='.repeat(60)}\n`);
}

/**
 * Marks the background job named by `--job-id <id>` as running.
 *
 * @param args Raw CLI arguments (without the node/script prefix).
 * @returns The job id, or null when `--job-id` was not passed.
 * @throws Error when `--job-id` is present but has no value.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}

/**
 * Marks the background job as completed, or as failed when `error` is given.
 * Does nothing without a job id; a failed status update is logged, not thrown.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (!jobId) return;
  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        status: error ? 'failed' : 'completed',
        error: error || null,
        completedAt: new Date(),
      },
    });
  } catch (err) {
    console.error(`Failed to update job ${jobId}:`, err);
  }
}

/**
 * Main function
 *
 * Runs a single-country import or iterates over the configured country lists.
 * Failures set a non-zero exit code instead of exiting immediately, so the
 * job status update and the Prisma disconnect always run.
 */
async function main(): Promise<void> {
  const { country, all, dryRun, resumeFrom, priority, sortByCount } = parseArgs();
  let jobId: string | null = null;

  try {
    jobId = await createOrResumeJob(process.argv.slice(2));

    if (!country && !all && !priority) {
      const usageError = 'Must specify --country <CODE>, --all, or --priority <1|2|3>';
      console.error(`Error: ${usageError}`);
      console.error('Usage:');
      console.error(' npx tsx scripts/import-osm-churches.ts --country US');
      console.error(' npx tsx scripts/import-osm-churches.ts --all');
      console.error(' npx tsx scripts/import-osm-churches.ts --priority 1');
      console.error(' npx tsx scripts/import-osm-churches.ts --all --resume-from IT');
      console.error(' npx tsx scripts/import-osm-churches.ts --country MX --dry-run');
      console.error(' npx tsx scripts/import-osm-churches.ts --all --sort-by-count');
      // Do not leave the job stuck in "running" after a usage error.
      await completeJob(jobId, usageError);
      process.exitCode = 1;
      return;
    }

    if (dryRun) {
      console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
    }

    if (country) {
      // Import single country
      const stats = await importFromOSM(country, dryRun);
      printSummary(country, stats, dryRun);
    } else {
      // Import all countries or specific priority
      let allCountries: string[];
      if (priority !== undefined) {
        // Import only specified priority level
        const priorityKey = `priority${priority}` as keyof typeof CATHOLIC_COUNTRIES;
        // Copy so the optional sort below never reorders the shared constant.
        allCountries = [...CATHOLIC_COUNTRIES[priorityKey]];
        console.log(`Importing Priority ${priority} countries (${allCountries.length} countries)...\n`);
      } else {
        // Import all priorities
        console.log('Importing all Catholic countries by priority...\n');
        allCountries = [
          ...CATHOLIC_COUNTRIES.priority1,
          ...CATHOLIC_COUNTRIES.priority2,
          ...CATHOLIC_COUNTRIES.priority3,
        ];
      }

      // Sort by existing OSM church count (least first) if requested
      if (sortByCount) {
        console.log('Querying DB for current OSM church counts per country...');
        const countRows = await prisma.$queryRawUnsafe<Array<{ country: string; count: bigint }>>(
          `SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country IS NOT NULL GROUP BY country`
        );
        const countMap = new Map<string, number>();
        for (const row of countRows) {
          countMap.set(row.country, Number(row.count));
        }
        allCountries.sort((a, b) => (countMap.get(a) ?? 0) - (countMap.get(b) ?? 0));
        console.log('Country processing order (least OSM churches first):');
        for (const c of allCountries) {
          console.log(` ${c}: ${countMap.get(c) || 0} existing OSM churches`);
        }
        console.log('');
      }

      // Handle --resume-from flag
      if (resumeFrom) {
        const resumeIndex = allCountries.indexOf(resumeFrom);
        if (resumeIndex === -1) {
          // Thrown (not process.exit) so the job is marked failed and Prisma disconnects.
          throw new Error(`Country ${resumeFrom} not found in the list`);
        }
        console.log(`Resuming from ${resumeFrom} (skipping first ${resumeIndex} countries)...\n`);
        allCountries = allCountries.slice(resumeIndex);
      }

      const totalStats = createEmptyStats();

      for (const countryCode of allCountries) {
        const stats = await importFromOSM(countryCode, dryRun);
        printSummary(countryCode, stats, dryRun);

        // Aggregate stats
        addStats(totalStats, stats);

        // Small delay between countries to be respectful (rate limiting is also in the client)
        await new Promise((resolve) => setTimeout(resolve, 2000));
      }

      // Print overall summary
      console.log(`\n${'='.repeat(60)}`);
      console.log(`OVERALL SUMMARY ${dryRun ? '(DRY RUN)' : ''}`);
      console.log(`${'='.repeat(60)}`);
      console.log(`Total countries processed: ${allCountries.length}`);
      console.log(`Total OSM churches found: ${totalStats.osmChurchesFound}`);
      if (!dryRun) {
        console.log(`Total new churches inserted: ${totalStats.newChurchesInserted}`);
        console.log(`Total churches updated: ${totalStats.existingUpdated}`);
        console.log(`Total churches linked: ${totalStats.existingLinked}`);
      }
      console.log(`Total with websites: ${totalStats.churchesWithWebsites}`);
      console.log(`Total without websites: ${totalStats.churchesWithoutWebsites}`);
      if (!dryRun && totalStats.churchesWithServiceTimes > 0) {
        console.log(`Total with service_times: ${totalStats.churchesWithServiceTimes}`);
        console.log(`Total schedule entries created: ${totalStats.scheduleEntriesCreated}`);
      }
      if (!dryRun && totalStats.errors > 0) {
        console.log(`Total errors: ${totalStats.errors}`);
      }
      console.log(`${'='.repeat(60)}\n`);
    }

    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    // exitCode (not process.exit) so the finally block below still runs.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}

main().catch((error: unknown) => {
  // Only reachable if cleanup itself fails (e.g. $disconnect rejects).
  console.error('Unhandled error:', error);
  process.exitCode = 1;
});