Files
ScraperControl/scripts/import-osm-region.ts
albertfj114 a046928ed0 feat: add discovermassId to church-matcher ExistingChurch and ChurchCandidate
Add discovermassId field to ExistingChurch interface and ChurchCandidate type,
insert a dedicated matching pass in findDuplicateChurch, and update all 15 importer
push blocks plus 16 loadExistingChurches select queries to include the new field.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-11 06:52:05 -04:00

347 lines
12 KiB
TypeScript

#!/usr/bin/env tsx
/**
* Import Catholic churches from a specific region of a country
* Usage:
* npx tsx scripts/import-osm-region.ts --country GB --region "England South"
* npx tsx scripts/import-osm-region.ts --country IT --region "North" --dry-run
*/
// Environment + database bootstrap for this script.
// Load .env for database connection (before importing anything that uses process.env)
// NOTE(review): ES module `import` declarations are hoisted, so the imports further
// down may be evaluated before the dotenv.config() calls below. That only matters if
// an imported module reads process.env at load time — confirm under tsx.
import dotenv from 'dotenv';
import path from 'path';
// Load .env.local first (production Neon URL), then .env (local fallback).
// Presumably relies on dotenv's default of not overriding variables that are
// already set, so values from .env.local win — verify against the dotenv version in use.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Use DATABASE_URL when it is set and non-empty; otherwise fall back to a local database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password portion (":<secret>@") replaced by ":***@".
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// SSL is only enabled when the URL contains "neon".
// NOTE(review): `rejectUnauthorized: false` turns off TLS certificate verification
// for those connections — confirm this is intentional.
const pool = new Pool({
connectionString: dbUrl,
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined
});
// Prisma is driven through the pg driver adapter backed by the pool above.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { COUNTRY_BOUNDING_BOXES, queryOverpassByBoundingBox, type OSMChurch } from '../src/lib/overpass-client';
import { findDuplicateChurch, mergeChurchData } from '../src/lib/church-matcher';
/**
 * Running counters collected while importing one region.
 * All fields start at zero and are incremented as churches are processed.
 */
interface ImportStats {
  /** Number of churches returned by the Overpass query. */
  osmChurchesFound: number;

  /** Churches that had no match and were created as new rows. */
  newChurchesInserted: number;

  /** Matched churches whose stored osmId equals the OSM church's osmId. */
  existingUpdated: number;

  /** Matched churches whose stored osmId differs from the OSM church's osmId. */
  existingLinked: number;

  /** Processed churches that carry a website value. */
  churchesWithWebsites: number;

  /** Processed churches that carry no website value. */
  churchesWithoutWebsites: number;

  /** Failures caught while processing (per church or for the whole region). */
  errors: number;
}
/**
 * Parse command line arguments
 *
 * Recognised options (read from `process.argv`, skipping the first two entries):
 *   --country <CODE>   country code; stored upper-cased
 *   --region <NAME>    region name; stored verbatim
 *   --dry-run          boolean flag
 *
 * A value-taking option is ignored when the token after it is missing, empty,
 * or is itself an option (starts with `--`). This keeps a forgotten value from
 * swallowing the next flag, e.g. `--region --dry-run` still enables dry-run.
 * Unrecognised tokens are skipped.
 *
 * @returns the parsed options; `country` / `region` are `undefined` when not supplied.
 */
function parseArgs(): { country?: string; region?: string; dryRun: boolean } {
  const args = process.argv.slice(2);
  const result = {
    country: undefined as string | undefined,
    region: undefined as string | undefined,
    dryRun: false,
  };

  // Returns the token at `index` if it can serve as an option value.
  const valueAt = (index: number): string | undefined => {
    const token = args[index];
    if (!token || token.startsWith('--')) {
      return undefined;
    }
    return token;
  };

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];

    if (flag === '--dry-run') {
      result.dryRun = true;
      continue;
    }
    if (flag !== '--country' && flag !== '--region') {
      continue;
    }

    const value = valueAt(i + 1);
    if (value === undefined) {
      // No usable value: leave the option unset and let the next token be
      // parsed on its own merits.
      continue;
    }

    if (flag === '--country') {
      result.country = value.toUpperCase();
    } else {
      result.region = value;
    }
    i++; // the value token has been consumed
  }

  return result;
}
/**
 * Import churches from a single region
 *
 * Steps:
 *  1. Look up the region's bounding box in COUNTRY_BOUNDING_BOXES (exact name match).
 *     An unknown country or region prints the available choices and exits the
 *     process with code 1.
 *  2. Query Overpass for that bounding box.
 *  3. In dry-run mode, print up to 10 of the results, count websites and return
 *     without touching the database.
 *  4. Otherwise load the existing churches, then for every OSM church either
 *     update the matching row with the merged data or create a new row.
 *
 * Churches are written one at a time; a failure on one church is logged and
 * counted in `errors` without aborting the rest. A failure outside the
 * per-church loop (e.g. the Overpass query) is logged and counted once.
 *
 * @param countryCode key into COUNTRY_BOUNDING_BOXES; also the fallback `country` for new rows
 * @param regionName  name of the region entry to import
 * @param dryRun      when true, only report what would be imported
 * @returns the counters collected during the run
 */
async function importFromRegion(countryCode: string, regionName: string, dryRun: boolean = false): Promise<ImportStats> {
  const stats: ImportStats = {
    osmChurchesFound: 0,
    newChurchesInserted: 0,
    existingUpdated: 0,
    existingLinked: 0,
    churchesWithWebsites: 0,
    churchesWithoutWebsites: 0,
    errors: 0,
  };

  // Adds one processed church to the with/without-website counters.
  const tallyWebsite = (hasWebsite: boolean): void => {
    if (hasWebsite) stats.churchesWithWebsites++;
    else stats.churchesWithoutWebsites++;
  };

  const banner = '='.repeat(60);
  console.log(`\n${banner}`);
  console.log(`Importing from ${countryCode} - ${regionName}`);
  console.log(`${banner}\n`);

  // Look up the bounding box
  const regions = COUNTRY_BOUNDING_BOXES[countryCode];
  if (!regions) {
    console.error(`Error: No bounding boxes defined for country ${countryCode}`);
    console.error('Available countries:', Object.keys(COUNTRY_BOUNDING_BOXES).join(', '));
    process.exit(1);
  }

  const region = regions.find(r => r.name === regionName);
  if (!region) {
    console.error(`Error: Region "${regionName}" not found for ${countryCode}`);
    console.error('Available regions:', regions.map(r => r.name).join(', '));
    process.exit(1);
  }

  try {
    // Query Overpass API for this specific region
    console.log(`Querying bounding box: (${region.south}, ${region.west}, ${region.north}, ${region.east})`);
    const osmChurches = await queryOverpassByBoundingBox(region.south, region.west, region.north, region.east);
    stats.osmChurchesFound = osmChurches.length;

    if (osmChurches.length === 0) {
      console.log(`No churches found in ${regionName}`);
      return stats;
    }
    console.log(`Found ${osmChurches.length} Catholic churches in ${regionName}`);

    if (dryRun) {
      console.log('\n[DRY RUN] Would import the following churches:');
      // Only a sample is printed; the website counts below cover every result.
      for (const church of osmChurches.slice(0, 10)) {
        console.log(` - ${church.name} (${church.city || 'unknown city'})`);
        console.log(` OSM ID: ${church.osmId}, Website: ${church.website || 'none'}`);
      }
      if (osmChurches.length > 10) {
        console.log(` ... and ${osmChurches.length - 10} more`);
      }
      // Count websites
      stats.churchesWithWebsites = osmChurches.filter((c) => c.website).length;
      stats.churchesWithoutWebsites = osmChurches.length - stats.churchesWithWebsites;
      return stats;
    }

    // Fetch all existing churches for deduplication
    console.log('Fetching existing churches for deduplication...');
    const existingChurches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        latitude: true,
        longitude: true,
        osmId: true,
        baiduId: true,
        masstimesId: true,
        orarimesseId: true,
        massSchedulesPhId: true,
        philmassId: true,
        horariosMisasId: true,
        mszeInfoId: true,
        weekdayMassesId: true,
        messesInfoId: true,
        bohosluzbyId: true,
        miserendId: true,
        kerknetId: true,
        gottesdienstzeitenId: true,
        discovermassId: true,
        source: true,
        website: true,
        phone: true,
        address: true,
      },
    });
    console.log(`Found ${existingChurches.length} existing churches in database`);

    // Process churches one by one (no batch transactions to avoid rollbacks)
    let processed = 0;
    for (const osmChurch of osmChurches) {
      try {
        // Check for duplicate
        const duplicate = findDuplicateChurch(osmChurch, existingChurches);

        if (duplicate) {
          // Decide which counter this match belongs to before merging:
          // same osmId -> "updated", anything else (proximity/name) -> "linked".
          const matchedByOsmId = duplicate.osmId === osmChurch.osmId;
          const mergedData = mergeChurchData(duplicate, osmChurch);

          // Verify the church exists in the database (not just in our in-memory
          // list from this run). Only the id is selected because the row is
          // needed solely as an existence check.
          const existsInDb = await prisma.church.findUnique({
            where: { id: duplicate.id },
            select: { id: true },
          });
          if (existsInDb) {
            await prisma.church.update({
              where: { id: duplicate.id },
              data: mergedData,
            });
          }
          // A match that is not (or no longer) in the database is still counted:
          // it is treated as already handled earlier in this run.
          // NOTE(review): the in-memory entry is not refreshed with the merged
          // data, so later matches in the same run see the pre-merge values.
          if (matchedByOsmId) stats.existingUpdated++;
          else stats.existingLinked++;
          tallyWebsite(Boolean(osmChurch.website));
        } else {
          // New church - insert it and capture the real ID
          const newChurch = await prisma.church.create({
            data: {
              name: osmChurch.name,
              latitude: osmChurch.lat,
              longitude: osmChurch.lng,
              address: osmChurch.address,
              city: osmChurch.city,
              state: osmChurch.state,
              zip: osmChurch.zip,
              country: osmChurch.country || countryCode,
              phone: osmChurch.phone,
              website: osmChurch.website,
              diocese: osmChurch.diocese,
              wheelchairAccess: osmChurch.wheelchairAccess ?? false,
              source: 'osm',
              osmId: osmChurch.osmId,
              hasWebsite: !!osmChurch.website,
              osmLastSyncedAt: new Date(),
            },
          });
          stats.newChurchesInserted++;
          tallyWebsite(Boolean(osmChurch.website));

          // Add to existing churches list for future deduplication in this run (use real DB ID)
          existingChurches.push({
            id: newChurch.id,
            name: osmChurch.name,
            latitude: osmChurch.lat,
            longitude: osmChurch.lng,
            osmId: osmChurch.osmId,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: null,
            discovermassId: null,
            source: 'osm',
            website: osmChurch.website || null,
            phone: osmChurch.phone || null,
            address: osmChurch.address || null,
          });
        }

        processed++;
        // Log progress every 100 churches
        if (processed % 100 === 0) {
          console.log(`Progress: ${processed}/${osmChurches.length} churches processed`);
        }
      } catch (error) {
        // One bad church must not abort the whole region.
        console.error(`Error processing church ${osmChurch.name}:`, error);
        stats.errors++;
      }
    }
    console.log(`\nProcessed all ${osmChurches.length} churches from ${regionName}`);
  } catch (error) {
    console.error(`Failed to import from ${regionName}:`, error);
    stats.errors++;
  }

  return stats;
}
/**
 * Print import summary
 *
 * Writes a framed report to stdout, one console.log call per line. The
 * inserted/updated/linked counters and the error count are only shown for
 * real (non dry-run) imports, and the error line only when errors occurred.
 *
 * @param countryCode country code shown in the heading
 * @param regionName  region name shown in the heading
 * @param stats       counters to report
 * @param dryRun      whether the run was a dry run
 */
function printSummary(countryCode: string, regionName: string, stats: ImportStats, dryRun: boolean) {
  const rule = '='.repeat(60);
  const isRealRun = !dryRun;

  const report: string[] = [
    `\n${rule}`,
    `Import Summary for ${countryCode} - ${regionName} ${dryRun ? '(DRY RUN)' : ''}`,
    rule,
    `OSM churches found: ${stats.osmChurchesFound}`,
  ];

  if (isRealRun) {
    report.push(
      `New churches inserted: ${stats.newChurchesInserted}`,
      `Existing churches updated: ${stats.existingUpdated} (matched by osmId)`,
      `Existing churches linked: ${stats.existingLinked} (matched by proximity)`,
    );
  }

  report.push(
    `Churches with websites: ${stats.churchesWithWebsites}`,
    `Churches without websites: ${stats.churchesWithoutWebsites}`,
  );

  if (isRealRun && stats.errors > 0) {
    report.push(`Errors encountered: ${stats.errors}`);
  }

  report.push(`${rule}\n`);

  for (const line of report) {
    console.log(line);
  }
}
/**
 * Main function
 *
 * Parses the command line, validates that both --country and --region were
 * given (printing usage and exiting with code 1 otherwise), runs the import
 * for that region, prints the summary, and finally disconnects Prisma.
 *
 * An unexpected error during the import is logged and reported through the
 * process exit code rather than by terminating immediately, so the
 * disconnect in `finally` still runs.
 */
async function main() {
  const { country, region, dryRun } = parseArgs();

  if (!country || !region) {
    console.error('Error: Must specify both --country <CODE> and --region <NAME>');
    console.error('Usage:');
    console.error(' npx tsx scripts/import-osm-region.ts --country GB --region "England South"');
    console.error(' npx tsx scripts/import-osm-region.ts --country IT --region "North" --dry-run');
    console.error('\nAvailable countries:', Object.keys(COUNTRY_BOUNDING_BOXES).join(', '));
    // Nothing has been queried yet at this point, so exiting immediately is fine.
    process.exit(1);
  }

  if (dryRun) {
    console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
  }

  try {
    const stats = await importFromRegion(country, region, dryRun);
    printSummary(country, region, stats, dryRun);
  } catch (error) {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit(): exiting here would
    // end the process before the `finally` block gets a chance to run.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}

// Surface any rejection that escapes main() (e.g. from the disconnect itself)
// instead of leaving the promise floating.
main().catch((error: unknown) => {
  console.error('Fatal error:', error);
  process.exitCode = 1;
});