Files
ScraperControl/scripts/import-osm-churches.ts
albertfj114 033f805965 fix: clean up church-matcher types and add HK OSM bounding box
- Remove discovermassId/buscarmisasNetworkId from findDuplicateChurch match
  passes (importers now do their own pre-check dedup); restore as optional
  fields on ExistingChurch to keep type/runtime in sync
- Add HK bounding box to COUNTRY_BOUNDING_BOXES; fix silent 0-result
  fallback when country query returns empty from mirror server
- discovermass importer: add --limit flag and skip-already-imported
  pre-check using importedSlugs set
- Import scripts: remove discovermassId from ExistingChurch select/stubs
  (field not needed in shared matcher context)
- Schema: reorder discovermassId/kerknetId/gottesdienstzeitenId fields

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 22:20:45 -04:00

615 lines
22 KiB
TypeScript

#!/usr/bin/env tsx
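/**
 * Import Catholic churches from OpenStreetMap
 * Usage:
 * npx tsx scripts/import-osm-churches.ts --country US
 * npx tsx scripts/import-osm-churches.ts --all
 * npx tsx scripts/import-osm-churches.ts --priority 1
 * npx tsx scripts/import-osm-churches.ts --all --resume-from IT
 * npx tsx scripts/import-osm-churches.ts --country MX --dry-run
 * npx tsx scripts/import-osm-churches.ts --all --sort-by-count
 *
 * Optional: pass --job-id <id> to have the matching backgroundJob row marked
 * running at start and completed/failed at the end.
 */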
// Load .env for database connection (before importing anything that uses process.env)
import dotenv from 'dotenv';
import path from 'path';
// Load .env.local first (production Neon URL), then .env (local fallback)
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL if set (and non-empty), otherwise a local Postgres default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the ":password@" segment of the URL replaced by ":***@".
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// SSL is only configured when the URL contains the substring "neon".
// NOTE(review): rejectUnauthorized: false turns off TLS certificate verification — confirm this is intended.
const pool = new Pool({
connectionString: dbUrl,
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined
});
// Script-local Prisma client backed by the pg pool above (not the shared client from lib/db).
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { queryOverpassByCountryWithFallback, type OSMChurch } from '../src/lib/overpass-client';
import { findDuplicateChurch, mergeChurchData } from '../src/lib/church-matcher';
import { parseServiceTimes } from '../src/lib/service-times-parser';
// ISO 3166-1 alpha-2 codes of countries to import, grouped into three priority tiers.
// `--priority N` imports a single tier; `--all` imports priority1, then priority2, then priority3.
// The region comments below are only a reading aid; the code uses the flat arrays.
const CATHOLIC_COUNTRIES = {
// Priority 1: Large Catholic populations (the Americas plus major European, Asian, Oceanian and African countries)
priority1: [
// North America
'US', 'MX', 'CA',
// South America, Central America & Caribbean
'BR', 'AR', 'CO', 'PE', 'VE', 'CL', 'EC', 'GT', 'CU', 'BO', 'DO', 'HT', 'HN', 'PY', 'SV', 'NI', 'CR', 'PA', 'UY', 'GY', 'SR', 'GF',
// Europe
'IT', 'FR', 'ES', 'PL', 'DE', 'PT', 'BE', 'CZ', 'AT', 'HU', 'IE', 'HR', 'GB',
// Asia & Oceania (PH, AU) and Africa (NG, CD)
'PH', 'AU', 'NG', 'CD',
],
// Priority 2: Medium Catholic populations
priority2: [
// Rest of Europe
'NL', 'SK', 'SI', 'LT', 'CH', 'LU', 'MT',
'UA', 'RO', 'LV', 'BY',
// Africa
'AO', 'UG', 'TZ', 'KE', 'CM', 'RW', 'BI', 'MG', 'MW', 'ZM', 'ZW', 'MZ', 'BJ', 'TG', 'CI', 'BF', 'ML', 'NE', 'SN', 'GN', 'LR', 'SL', 'GH', 'GA', 'CG', 'CF', 'TD', 'SD', 'ET', 'ER', 'SO',
// Asia
'IN', 'TL', 'VN', 'KR', 'JP', 'ID', 'MY', 'SG', 'TH', 'LA', 'KH', 'MM', 'CN', 'LK', 'BD', 'PK',
// Middle East
'LB', 'IL', 'PS', 'JO', 'SY', 'IQ',
// Oceania
'NZ', 'PG', 'FJ', 'NC', 'PF',
],
// Priority 3: Smaller Catholic presence
priority3: [
// Caribbean (smaller islands)
'BS', 'BB', 'JM', 'TT', 'GD', 'LC', 'VC', 'AG', 'DM', 'KN',
// Europe (smaller countries + Balkans/Eastern)
'MC', 'SM', 'VA', 'LI', 'AD',
'RS', 'BA', 'MK', 'AL', 'EE',
// Caucasus + Russia
'GE', 'AM', 'RU',
// Africa (rest)
'NA', 'BW', 'LS', 'SZ', 'MU', 'SC', 'KM', 'CV', 'ST', 'GQ', 'DJ', 'GM',
// Asia (rest)
'BT', 'NP', 'AF', 'KZ', 'UZ', 'TM', 'TJ', 'KG', 'MN', 'BN', 'MV',
// Oceania (rest)
'WS', 'TO', 'VU', 'SB', 'KI', 'NR', 'TV', 'FM', 'MH', 'PW',
],
};
/**
 * Counters collected while importing one country; main() also sums them
 * across countries for the overall summary.
 */
interface ImportStats {
// Number of churches returned by the Overpass query
osmChurchesFound: number;
// Churches created as new rows
newChurchesInserted: number;
// Matches whose stored osmId equals the incoming OSM id
existingUpdated: number;
// Matches found by the duplicate matcher where the osmId did not match
existingLinked: number;
// Churches whose OSM record has / lacks a website value
churchesWithWebsites: number;
churchesWithoutWebsites: number;
// Churches for which schedule rows were created from the service_times tag
churchesWithServiceTimes: number;
// Total schedule rows created from service_times
scheduleEntriesCreated: number;
// One per church whose processing threw, plus one if the country-level import threw
errors: number;
}
/**
 * Parse command line arguments from `process.argv`.
 *
 * Recognised flags:
 *   --country <CODE>      single country to import (upper-cased)
 *   --all                 import every priority tier
 *   --dry-run             report only, no database writes
 *   --resume-from <CODE>  skip countries before CODE (upper-cased)
 *   --priority <1|2|3>    import a single priority tier
 *   --sort-by-count       process countries with the fewest OSM churches first
 *
 * Any other argument is ignored here (for example `--job-id <id>`, which is
 * read separately from the raw argument list).
 *
 * Prints an error and exits with status 1 when a value-taking flag has no
 * value (or is followed by another `--flag`), or when --priority is not
 * exactly 1, 2 or 3. Without this check `--country --dry-run` would consume
 * `--dry-run` as the country code and silently run a live import.
 *
 * @returns the parsed options; flags that were not supplied keep their defaults.
 */
function parseArgs(): { country?: string; all: boolean; dryRun: boolean; resumeFrom?: string; priority?: number; sortByCount: boolean } {
  const args = process.argv.slice(2);
  const result = {
    country: undefined as string | undefined,
    all: false,
    dryRun: false,
    resumeFrom: undefined as string | undefined,
    priority: undefined as number | undefined,
    sortByCount: false,
  };

  // Returns the argument that follows the flag at `index`, or exits when it is
  // absent, empty, or is itself another flag.
  const requireValue = (index: number): string => {
    const value = args[index + 1];
    if (!value || value.startsWith('--')) {
      console.error(`Error: ${args[index]} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--country':
        result.country = requireValue(i).toUpperCase();
        i++; // skip the consumed value
        break;
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = requireValue(i).toUpperCase();
        i++;
        break;
      case '--priority': {
        // Number() rejects trailing garbage ("2abc") that parseInt would accept.
        const priority = Number(requireValue(i));
        if (!Number.isInteger(priority) || priority < 1 || priority > 3) {
          console.error('Error: --priority must be 1, 2, or 3');
          process.exit(1);
        }
        result.priority = priority;
        i++;
        break;
      }
      case '--sort-by-count':
        result.sortByCount = true;
        break;
      default:
        // Unknown arguments are left for other readers of process.argv.
        break;
    }
  }
  return result;
}
/**
 * Maps a parsed day-of-week value to the massType label stored on a schedule
 * row: 0 becomes 'Sunday', 6 becomes 'Saturday', anything else 'Daily'.
 */
function massTypeForDay(dayOfWeek: number): 'Sunday' | 'Saturday' | 'Daily' {
  if (dayOfWeek === 0) return 'Sunday';
  if (dayOfWeek === 6) return 'Saturday';
  return 'Daily';
}

/**
 * Parses an OSM service_times value and shapes each parsed entry into a
 * massSchedule row for the given church. Returns an empty array when the
 * parser yields no entries.
 */
function buildScheduleRows<TChurchId>(churchId: TChurchId, serviceTimes: string) {
  return parseServiceTimes(serviceTimes).map((entry) => ({
    churchId,
    dayOfWeek: entry.dayOfWeek,
    time: entry.time,
    massType: massTypeForDay(entry.dayOfWeek),
    language: 'Unknown' as const,
    notes: 'From OSM service_times tag',
  }));
}

/** Counts one processed church as having, or lacking, a website. */
function tallyWebsite(stats: ImportStats, website: string | null | undefined): void {
  if (website) stats.churchesWithWebsites++;
  else stats.churchesWithoutWebsites++;
}

/**
 * Import churches from a single country.
 *
 * Queries Overpass for the country, then for every returned church either
 * updates/links an existing row found by the duplicate matcher or inserts a
 * new row. Schedule rows are created from the service_times tag for new
 * churches, and for matched churches that have no schedule rows yet.
 *
 * Churches are written one at a time; a failure on one church is logged,
 * counted in `errors`, and does not stop the rest of the country.
 *
 * @param countryCode country code passed to the Overpass query; also used as
 *   the stored country when the OSM record has none
 * @param dryRun when true, only prints a preview (first 10 churches) and the
 *   website counts; nothing is written
 * @returns counters describing what was found and written
 */
async function importFromOSM(countryCode: string, dryRun: boolean = false): Promise<ImportStats> {
  const stats: ImportStats = {
    osmChurchesFound: 0,
    newChurchesInserted: 0,
    existingUpdated: 0,
    existingLinked: 0,
    churchesWithWebsites: 0,
    churchesWithoutWebsites: 0,
    churchesWithServiceTimes: 0,
    scheduleEntriesCreated: 0,
    errors: 0,
  };

  console.log(`\n${'='.repeat(60)}`);
  console.log(`Importing Catholic churches from ${countryCode}`);
  console.log(`${'='.repeat(60)}\n`);

  try {
    // Query Overpass API (with automatic fallback to regional bounding boxes)
    const osmChurches = await queryOverpassByCountryWithFallback(countryCode);
    stats.osmChurchesFound = osmChurches.length;

    if (osmChurches.length === 0) {
      console.log(`No churches found in ${countryCode}`);
      return stats;
    }
    console.log(`Found ${osmChurches.length} Catholic churches in ${countryCode}`);

    if (dryRun) {
      console.log('\n[DRY RUN] Would import the following churches:');
      osmChurches.slice(0, 10).forEach((church) => {
        console.log(` - ${church.name} (${church.city || 'unknown city'})`);
        console.log(` OSM ID: ${church.osmId}, Website: ${church.website || 'none'}`);
      });
      if (osmChurches.length > 10) {
        console.log(` ... and ${osmChurches.length - 10} more`);
      }
      // Count websites
      stats.churchesWithWebsites = osmChurches.filter((c) => c.website).length;
      stats.churchesWithoutWebsites = osmChurches.length - stats.churchesWithWebsites;
      return stats;
    }

    // Fetch all existing churches for deduplication
    // For large datasets, we could optimize by fetching only churches in the same country/region
    console.log('Fetching existing churches for deduplication...');
    const existingChurches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        latitude: true,
        longitude: true,
        osmId: true,
        baiduId: true,
        masstimesId: true,
        orarimesseId: true,
        massSchedulesPhId: true,
        philmassId: true,
        horariosMisasId: true,
        mszeInfoId: true,
        weekdayMassesId: true,
        messesInfoId: true,
        bohosluzbyId: true,
        miserendId: true,
        kerknetId: true,
        gottesdienstzeitenId: true,
        source: true,
        website: true,
        phone: true,
        address: true,
      },
    });
    console.log(`Found ${existingChurches.length} existing churches in database`);

    // Process churches one by one (no batch transactions to avoid rollbacks)
    let processed = 0;
    for (const osmChurch of osmChurches) {
      try {
        const duplicate = findDuplicateChurch(osmChurch, existingChurches);

        if (duplicate) {
          // Same osmId => "updated"; any other match => "linked".
          const matchedByOsmId = duplicate.osmId === osmChurch.osmId;
          const mergedData = mergeChurchData(duplicate, osmChurch);

          // Verify the matched row is still present in the database before writing to it.
          const existsInDb = await prisma.church.findUnique({ where: { id: duplicate.id } });
          if (existsInDb) {
            await prisma.church.update({
              where: { id: duplicate.id },
              data: mergedData,
            });
          }

          // Counted whether or not the row was found, but only after a successful update.
          if (matchedByOsmId) stats.existingUpdated++;
          else stats.existingLinked++;

          // Import service_times only for matched churches that have no schedules yet
          if (existsInDb && osmChurch.serviceTimes) {
            const existingSchedules = await prisma.massSchedule.count({ where: { churchId: duplicate.id } });
            if (existingSchedules === 0) {
              const scheduleRows = buildScheduleRows(duplicate.id, osmChurch.serviceTimes);
              if (scheduleRows.length > 0) {
                await prisma.massSchedule.createMany({ data: scheduleRows });
                stats.churchesWithServiceTimes++;
                stats.scheduleEntriesCreated += scheduleRows.length;
              }
            }
          }

          tallyWebsite(stats, osmChurch.website);
        } else {
          // New church - insert it and capture the real ID
          const newChurch = await prisma.church.create({
            data: {
              name: osmChurch.name,
              latitude: osmChurch.lat,
              longitude: osmChurch.lng,
              address: osmChurch.address,
              city: osmChurch.city,
              state: osmChurch.state,
              zip: osmChurch.zip,
              country: osmChurch.country || countryCode,
              phone: osmChurch.phone,
              website: osmChurch.website,
              diocese: osmChurch.diocese,
              wheelchairAccess: osmChurch.wheelchairAccess ?? false,
              source: 'osm',
              osmId: osmChurch.osmId,
              hasWebsite: !!osmChurch.website,
              osmLastSyncedAt: new Date(),
            },
          });
          stats.newChurchesInserted++;
          tallyWebsite(stats, osmChurch.website);

          // Parse service_times tag and create mass schedules
          if (osmChurch.serviceTimes) {
            const scheduleRows = buildScheduleRows(newChurch.id, osmChurch.serviceTimes);
            if (scheduleRows.length > 0) {
              await prisma.massSchedule.createMany({ data: scheduleRows });
              stats.churchesWithServiceTimes++;
              stats.scheduleEntriesCreated += scheduleRows.length;
              // Mark as scraped since we have schedule data
              await prisma.church.update({
                where: { id: newChurch.id },
                data: { lastScrapedAt: new Date() },
              });
            }
          }

          // Add to existing churches list for future deduplication in this run (use real DB ID)
          existingChurches.push({
            id: newChurch.id,
            name: osmChurch.name,
            latitude: osmChurch.lat,
            longitude: osmChurch.lng,
            osmId: osmChurch.osmId,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: null,
            source: 'osm',
            website: osmChurch.website || null,
            phone: osmChurch.phone || null,
            address: osmChurch.address || null,
          });
        }

        processed++;
        // Log progress every 500 churches
        if (processed % 500 === 0) {
          console.log(`Progress: ${processed}/${osmChurches.length} churches processed`);
        }
      } catch (error) {
        console.error(`Error processing church ${osmChurch.name}:`, error);
        stats.errors++;
      }
    }
    console.log(`\nProcessed all ${osmChurches.length} churches from ${countryCode}`);
  } catch (error) {
    // Country-level failure (e.g. the Overpass query or the initial fetch threw).
    console.error(`Failed to import from ${countryCode}:`, error);
    stats.errors++;
  }
  return stats;
}
/**
 * Print the per-country import report to stdout.
 *
 * Insert/update/link counts, service_times counts and the error count are
 * only shown for live runs; the last two are additionally hidden when zero.
 */
function printSummary(countryCode: string, stats: ImportStats, dryRun: boolean) {
  const rule = '='.repeat(60);
  const liveRun = !dryRun;

  const report: string[] = [
    `\n${rule}`,
    `Import Summary for ${countryCode} ${dryRun ? '(DRY RUN)' : ''}`,
    rule,
    `OSM churches found: ${stats.osmChurchesFound}`,
  ];

  if (liveRun) {
    report.push(
      `New churches inserted: ${stats.newChurchesInserted}`,
      `Existing churches updated: ${stats.existingUpdated} (matched by osmId)`,
      `Existing churches linked: ${stats.existingLinked} (matched by proximity)`,
    );
  }

  report.push(
    `Churches with websites: ${stats.churchesWithWebsites}`,
    `Churches without websites: ${stats.churchesWithoutWebsites}`,
  );

  if (liveRun && stats.churchesWithServiceTimes > 0) {
    report.push(
      `Churches with service_times: ${stats.churchesWithServiceTimes}`,
      `Schedule entries created: ${stats.scheduleEntriesCreated}`,
    );
  }

  if (liveRun && stats.errors > 0) {
    report.push(`Errors encountered: ${stats.errors}`);
  }

  report.push(`${rule}\n`);

  // One console.log call per line, exactly as many as there are report lines.
  for (const line of report) {
    console.log(line);
  }
}
/**
 * Marks the background job named by `--job-id <id>` as running.
 *
 * Despite the name, this never creates a job: it only updates the existing
 * backgroundJob row (status 'running', startedAt now) when the flag is present.
 *
 * @param args raw command line arguments (without the node/script prefix)
 * @returns the job id, or null when `--job-id` was not passed
 * @throws Error when `--job-id` is present but has no value (or is followed
 *   by another `--flag`); errors from the update itself also propagate
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) {
    return null;
  }

  const jobId = args[flagIndex + 1];
  // Fail with a clear message instead of handing `undefined` (or another flag)
  // to the database as the row id.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job ID value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Records the final state of a background job.
 *
 * Does nothing when there is no job id. Otherwise sets status to 'failed'
 * (with the error text) when a non-empty error is given, or 'completed'
 * (error cleared to null) when not, and stamps completedAt. A failure to
 * write the update is logged and swallowed so it cannot mask the import result.
 *
 * @param jobId id of the job to finish, or null when the run has no job
 * @param error failure description; omitted or empty means success
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (jobId) {
    try {
      const errorText = error ? error : null;
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: error ? 'failed' : 'completed',
          error: errorText,
          completedAt: new Date(),
        },
      });
    } catch (updateFailure) {
      console.error(`Failed to update job ${jobId}:`, updateFailure);
    }
  }
}
/**
 * Script entry point.
 *
 * Parses the command line, marks the optional background job as running,
 * imports either one country or a list of countries (all tiers or a single
 * priority tier, optionally re-ordered by existing OSM church count and/or
 * resumed from a given country), prints summaries, and records the job result.
 *
 * Every failure path — bad arguments, unknown --resume-from country, or a
 * thrown error — marks the job as failed, disconnects Prisma, and only then
 * exits with status 1. (process.exit() inside the try/catch would terminate
 * immediately and skip the `finally` cleanup.)
 */
async function main() {
  const { country, all, dryRun, resumeFrom, priority, sortByCount } = parseArgs();
  let jobId: string | null = null;
  let exitCode = 0;

  try {
    jobId = await createOrResumeJob(process.argv.slice(2));

    if (!country && !all && !priority) {
      console.error('Error: Must specify --country <CODE>, --all, or --priority <1|2|3>');
      console.error('Usage:');
      console.error(' npx tsx scripts/import-osm-churches.ts --country US');
      console.error(' npx tsx scripts/import-osm-churches.ts --all');
      console.error(' npx tsx scripts/import-osm-churches.ts --priority 1');
      console.error(' npx tsx scripts/import-osm-churches.ts --all --resume-from IT');
      console.error(' npx tsx scripts/import-osm-churches.ts --country MX --dry-run');
      console.error(' npx tsx scripts/import-osm-churches.ts --all --sort-by-count');
      // Do not leave the job stuck in 'running'.
      await completeJob(jobId, 'Invalid arguments: must specify --country, --all, or --priority');
      exitCode = 1;
      return;
    }

    if (dryRun) {
      console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
    }

    if (country) {
      // Import single country
      const stats = await importFromOSM(country, dryRun);
      printSummary(country, stats, dryRun);
    } else if (all || priority !== undefined) {
      // Import all countries or specific priority
      let allCountries: string[];
      if (priority !== undefined) {
        // Import only specified priority level.
        // Copy the tier: the optional sort below works in place and must not
        // reorder the shared CATHOLIC_COUNTRIES arrays.
        const priorityKey = `priority${priority}` as keyof typeof CATHOLIC_COUNTRIES;
        allCountries = [...CATHOLIC_COUNTRIES[priorityKey]];
        console.log(`Importing Priority ${priority} countries (${allCountries.length} countries)...\n`);
      } else {
        // Import all priorities
        console.log('Importing all Catholic countries by priority...\n');
        allCountries = [
          ...CATHOLIC_COUNTRIES.priority1,
          ...CATHOLIC_COUNTRIES.priority2,
          ...CATHOLIC_COUNTRIES.priority3,
        ];
      }

      // Sort by existing OSM church count (least first) if requested
      if (sortByCount) {
        console.log('Querying DB for current OSM church counts per country...');
        const countRows = await prisma.$queryRawUnsafe<Array<{ country: string; count: bigint }>>(
          `SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country IS NOT NULL GROUP BY country`
        );
        const countMap = new Map<string, number>();
        for (const row of countRows) {
          countMap.set(row.country, Number(row.count));
        }
        allCountries.sort((a, b) => (countMap.get(a) || 0) - (countMap.get(b) || 0));
        console.log('Country processing order (least OSM churches first):');
        for (const c of allCountries) {
          console.log(` ${c}: ${countMap.get(c) || 0} existing OSM churches`);
        }
        console.log('');
      }

      // Handle --resume-from flag
      if (resumeFrom) {
        const resumeIndex = allCountries.indexOf(resumeFrom);
        if (resumeIndex === -1) {
          const reason = `Country ${resumeFrom} not found in the list`;
          console.error(`Error: ${reason}`);
          await completeJob(jobId, reason);
          exitCode = 1;
          return;
        }
        console.log(`Resuming from ${resumeFrom} (skipping first ${resumeIndex} countries)...\n`);
        allCountries = allCountries.slice(resumeIndex);
      }

      const totalStats: ImportStats = {
        osmChurchesFound: 0,
        newChurchesInserted: 0,
        existingUpdated: 0,
        existingLinked: 0,
        churchesWithWebsites: 0,
        churchesWithoutWebsites: 0,
        churchesWithServiceTimes: 0,
        scheduleEntriesCreated: 0,
        errors: 0,
      };

      for (const countryCode of allCountries) {
        const stats = await importFromOSM(countryCode, dryRun);
        printSummary(countryCode, stats, dryRun);

        // Aggregate stats
        totalStats.osmChurchesFound += stats.osmChurchesFound;
        totalStats.newChurchesInserted += stats.newChurchesInserted;
        totalStats.existingUpdated += stats.existingUpdated;
        totalStats.existingLinked += stats.existingLinked;
        totalStats.churchesWithWebsites += stats.churchesWithWebsites;
        totalStats.churchesWithoutWebsites += stats.churchesWithoutWebsites;
        totalStats.churchesWithServiceTimes += stats.churchesWithServiceTimes;
        totalStats.scheduleEntriesCreated += stats.scheduleEntriesCreated;
        totalStats.errors += stats.errors;

        // Small delay between countries to be respectful (rate limiting is also in the client)
        await new Promise((resolve) => setTimeout(resolve, 2000));
      }

      // Print overall summary
      console.log(`\n${'='.repeat(60)}`);
      console.log(`OVERALL SUMMARY ${dryRun ? '(DRY RUN)' : ''}`);
      console.log(`${'='.repeat(60)}`);
      console.log(`Total countries processed: ${allCountries.length}`);
      console.log(`Total OSM churches found: ${totalStats.osmChurchesFound}`);
      if (!dryRun) {
        console.log(`Total new churches inserted: ${totalStats.newChurchesInserted}`);
        console.log(`Total churches updated: ${totalStats.existingUpdated}`);
        console.log(`Total churches linked: ${totalStats.existingLinked}`);
      }
      console.log(`Total with websites: ${totalStats.churchesWithWebsites}`);
      console.log(`Total without websites: ${totalStats.churchesWithoutWebsites}`);
      // Mirrors the per-country summary: these totals were aggregated but never reported.
      if (!dryRun && totalStats.churchesWithServiceTimes > 0) {
        console.log(`Total with service_times: ${totalStats.churchesWithServiceTimes}`);
        console.log(`Total schedule entries created: ${totalStats.scheduleEntriesCreated}`);
      }
      if (!dryRun && totalStats.errors > 0) {
        console.log(`Total errors: ${totalStats.errors}`);
      }
      console.log(`${'='.repeat(60)}\n`);
    }

    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    exitCode = 1;
  } finally {
    // Always release the database connection before any exit.
    await prisma.$disconnect();
    if (exitCode !== 0) {
      process.exit(exitCode);
    }
  }
}

main().catch((error) => {
  // Only reachable if cleanup itself fails; make sure that is visible and non-zero.
  console.error('Unhandled error:', error);
  process.exit(1);
});