feat: add discovermassId to church-matcher ExistingChurch and ChurchCandidate

Add discovermassId field to ExistingChurch interface and ChurchCandidate type,
insert a dedicated matching pass in findDuplicateChurch, and update all 15 importer
push blocks plus 16 loadExistingChurches select queries to include the new field.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
albertfj114
2026-03-11 06:52:05 -04:00
parent 2706708c51
commit a046928ed0
17 changed files with 11576 additions and 0 deletions

View File

@@ -0,0 +1,325 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches from Baidu Maps (China)
* Usage:
* npx tsx scripts/import-baidu-churches.ts
* npx tsx scripts/import-baidu-churches.ts --dry-run
* npx tsx scripts/import-baidu-churches.ts --resume-from-cell 100
* npx tsx scripts/import-baidu-churches.ts --job-id <uuid>
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string for this script; falls back to a local development database when DATABASE_URL is unset.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the ":password@" part of the URL replaced by ":***@" so the secret stays out of the logs.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// Dedicated pg pool for this script.
const pool = new Pool({
connectionString: dbUrl,
// URLs containing "neon" get TLS with certificate verification switched off; everything else uses the pg default.
// NOTE(review): rejectUnauthorized: false accepts any server certificate — confirm this is intentional.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client that talks to the database through the pool above via the pg driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { queryBaiduByGrid, type BaiduChurch } from '../src/lib/baidu-client';
import { findDuplicateChurch, mergeBaiduData, type ExistingChurch } from '../src/lib/church-matcher';
/** Running counters for one Baidu import run. */
interface ImportStats {
// Number of churches returned by the Baidu grid query.
baiduChurchesFound: number;
// Churches created because the matcher found no existing record.
newChurchesInserted: number;
// Existing churches whose stored baiduId equals the Baidu record's id.
existingUpdated: number;
// Existing churches returned by the matcher whose baiduId did not equal the Baidu record's id.
existingLinked: number;
// Churches whose processing threw.
errors: number;
}
/**
 * Parse the command-line flags accepted by the Baidu importer.
 *
 * Recognised flags:
 *   --dry-run                 sets `dryRun`
 *   --resume-from-cell <n>    non-negative integer stored in `resumeFromCell`
 *   --job-id <uuid>           stored in `jobId`
 *
 * Unknown arguments, and a value flag given as the very last argument, are ignored.
 *
 * @returns the parsed options; `resumeFromCell` defaults to 0 and `jobId` to undefined.
 * @throws Error when the value of --resume-from-cell is not a non-negative integer.
 */
function parseArgs(): { dryRun: boolean; resumeFromCell: number; jobId?: string } {
  const args = process.argv.slice(2);
  const result = {
    dryRun: false,
    resumeFromCell: 0,
    jobId: undefined as string | undefined,
  };
  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    const value = args[i + 1];
    if (flag === '--dry-run') {
      result.dryRun = true;
    } else if (flag === '--resume-from-cell' && value) {
      // parseInt would silently turn "abc" into NaN and "12abc" into 12, so accept plain digits only.
      if (!/^\d+$/.test(value)) {
        throw new Error(`Invalid value for --resume-from-cell: "${value}" (expected a non-negative integer)`);
      }
      result.resumeFromCell = Number(value);
      i++; // the value has been consumed
    } else if (flag === '--job-id' && value) {
      result.jobId = value;
      i++; // the value has been consumed
    }
  }
  return result;
}
/**
 * Mark an already-existing background job as running.
 *
 * @param jobId - id of the background job row; when omitted or empty nothing is written
 * @returns the same id once the row has been updated, or null when no id was given
 */
async function createOrResumeJob(jobId?: string): Promise<string | null> {
  // Without an id the run is untracked.
  if (!jobId) {
    return null;
  }

  const startedAt = new Date();
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt },
  });
  return jobId;
}
/**
 * Record the final state of a background job.
 *
 * The job becomes 'failed' (with the message stored) when `error` is a non-empty
 * string, and 'completed' (with the error column cleared) otherwise. A failure to
 * write the row is logged and not rethrown.
 *
 * @param jobId - id of the job row; null or empty makes this a no-op
 * @param error - failure message, if the run failed
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: error ? 'failed' : 'completed',
          error: error ? error : null,
          completedAt: new Date(),
        },
      });
    } catch (updateError) {
      console.error(`Failed to update job ${jobId}:`, updateError);
    }
  }
}
/**
 * Write the current progress counters to a background job row.
 *
 * Failures are logged and swallowed so that progress reporting can never abort
 * the import itself.
 *
 * @param jobId - id of the job row; null or empty makes this a no-op
 * @param stats - counters gathered so far
 * @param totalCells - value stored as the job's totalItems
 * @param currentCell - value stored as the job's processed count
 */
async function updateJobProgress(jobId: string | null, stats: ImportStats, totalCells: number, currentCell: number): Promise<void> {
  if (!jobId) {
    return;
  }

  const { newChurchesInserted, existingUpdated, existingLinked, errors, baiduChurchesFound } = stats;
  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        totalItems: totalCells,
        processed: currentCell,
        succeeded: newChurchesInserted + existingUpdated + existingLinked,
        failed: errors,
        itemsFound: baiduChurchesFound,
      },
    });
  } catch (progressError) {
    // Non-fatal: report it and carry on.
    console.error(`Failed to update job progress:`, progressError);
  }
}
/**
 * Run one Baidu Maps import: query the grid, then (unless dry-running) merge every
 * returned church into the church table.
 *
 * @param dryRun - when true, print up to 20 of the fetched churches and return before any church query or write
 * @param resumeFromCell - passed through to queryBaiduByGrid as its third argument
 * @param jobId - background job to report progress to; null disables progress reporting
 * @returns counters describing what was found, inserted, updated, linked and failed
 * @throws Error when the BAIDU_MAPS_API_KEY environment variable is not set
 */
async function importFromBaidu(dryRun: boolean, resumeFromCell: number, jobId: string | null): Promise<ImportStats> {
const stats: ImportStats = {
baiduChurchesFound: 0,
newChurchesInserted: 0,
existingUpdated: 0,
existingLinked: 0,
errors: 0,
};
const apiKey = process.env.BAIDU_MAPS_API_KEY;
if (!apiKey) {
throw new Error('Missing BAIDU_MAPS_API_KEY environment variable');
}
console.log(`\n${'='.repeat(60)}`);
console.log(`Importing Catholic churches from Baidu Maps (China)`);
console.log(`${'='.repeat(60)}\n`);
// Step 1: Query Baidu API
console.log('Step 1: Querying Baidu Maps API...');
const baiduChurches = await queryBaiduByGrid(
apiKey,
(progress) => {
// Not awaited: the returned promise is dropped. updateJobProgress logs its own failures instead of rejecting.
updateJobProgress(jobId, stats, progress.totalCells, progress.cellIndex);
},
resumeFromCell,
);
stats.baiduChurchesFound = baiduChurches.length;
console.log(`\nFound ${baiduChurches.length} churches from Baidu Maps`);
if (baiduChurches.length === 0) {
console.log('No churches found');
return stats;
}
if (dryRun) {
// Preview only the first 20 results, then stop before touching the church table.
console.log('\n[DRY RUN] Would import the following churches:');
baiduChurches.slice(0, 20).forEach((church) => {
console.log(` - ${church.name} (${church.city || church.province || 'unknown'})`);
console.log(` Baidu ID: ${church.baiduId}, Coords: ${church.lat.toFixed(4)}, ${church.lng.toFixed(4)}`);
});
if (baiduChurches.length > 20) {
console.log(` ... and ${baiduChurches.length - 20} more`);
}
return stats;
}
// Step 2: Load existing churches in China for deduplication
console.log('\nStep 2: Loading existing churches in China for deduplication...');
// The selected columns are the fields of ExistingChurch that the matcher and the merge step read.
const existingChurches: ExistingChurch[] = await prisma.church.findMany({
where: { country: 'CN' },
select: {
id: true,
name: true,
latitude: true,
longitude: true,
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Found ${existingChurches.length} existing churches in China`);
// Step 3: Process each Baidu church
console.log('\nStep 3: Processing churches...');
let processed = 0;
for (const baiduChurch of baiduChurches) {
// Each church is handled in its own try block so a single failure only bumps stats.errors.
try {
const candidate = {
name: baiduChurch.name,
lat: baiduChurch.lat,
lng: baiduChurch.lng,
baiduId: baiduChurch.baiduId,
};
const duplicate = findDuplicateChurch(candidate, existingChurches);
if (duplicate && duplicate.baiduId === baiduChurch.baiduId) {
// Existing church with matching baiduId — update it
const mergedData = mergeBaiduData(duplicate, baiduChurch);
await prisma.church.update({
where: { id: duplicate.id },
data: mergedData,
});
stats.existingUpdated++;
} else if (duplicate) {
// Existing church matched by proximity/name — link it with baiduId
// NOTE(review): mergeBaiduData lives in church-matcher and is not visible here — presumably its result carries the baiduId; confirm.
const mergedData = mergeBaiduData(duplicate, baiduChurch);
await prisma.church.update({
where: { id: duplicate.id },
data: mergedData,
});
stats.existingLinked++;
} else {
// New church — insert it
const newChurch = await prisma.church.create({
data: {
name: baiduChurch.name,
latitude: baiduChurch.lat,
longitude: baiduChurch.lng,
address: baiduChurch.address,
city: baiduChurch.city,
state: baiduChurch.province,
country: 'CN',
phone: baiduChurch.phone,
website: baiduChurch.website,
source: 'baidu',
baiduId: baiduChurch.baiduId,
baiduLastSyncedAt: new Date(),
hasWebsite: !!baiduChurch.website,
},
});
stats.newChurchesInserted++;
// Add to existing churches list for dedup within this run
existingChurches.push({
id: newChurch.id,
name: baiduChurch.name,
latitude: baiduChurch.lat,
longitude: baiduChurch.lng,
osmId: null,
baiduId: baiduChurch.baiduId,
masstimesId: null,
orarimesseId: null,
massSchedulesPhId: null,
philmassId: null,
horariosMisasId: null,
mszeInfoId: null,
weekdayMassesId: null,
messesInfoId: null,
bohosluzbyId: null,
miserendId: null,
kerknetId: null,
gottesdienstzeitenId: null,
discovermassId: null,
source: 'baidu',
website: baiduChurch.website || null,
phone: baiduChurch.phone || null,
address: baiduChurch.address || null,
});
}
// Reached only when the church was handled without throwing; failures skip this increment.
processed++;
if (processed % 500 === 0) {
console.log(`Progress: ${processed}/${baiduChurches.length} churches processed`);
// In this step the job's totalItems/processed carry church counts, whereas Step 1 reported grid cells.
await updateJobProgress(jobId, stats, baiduChurches.length, processed);
}
} catch (error) {
console.error(`Error processing church ${baiduChurch.name} (${baiduChurch.baiduId}):`, error);
stats.errors++;
}
}
console.log(`\nProcessed all ${baiduChurches.length} churches`);
return stats;
}
/**
 * Print the end-of-run summary to stdout.
 *
 * A dry run only reports how many churches Baidu returned; a real run also
 * reports inserts, updates and links, plus the error count when it is non-zero.
 *
 * @param stats - counters gathered during the run
 * @param dryRun - whether the run was a dry run
 */
function printSummary(stats: ImportStats, dryRun: boolean) {
  const rule = '='.repeat(60);
  const lines: string[] = [
    `\n${rule}`,
    `Baidu Import Summary ${dryRun ? '(DRY RUN)' : ''}`,
    rule,
    `Baidu churches found: ${stats.baiduChurchesFound}`,
  ];

  if (!dryRun) {
    lines.push(
      `New churches inserted: ${stats.newChurchesInserted}`,
      `Existing churches updated: ${stats.existingUpdated} (matched by baiduId)`,
      `Existing churches linked: ${stats.existingLinked} (matched by proximity)`,
    );
    if (stats.errors > 0) {
      lines.push(`Errors encountered: ${stats.errors}`);
    }
  }

  lines.push(`${rule}\n`);

  // One console.log call per line, exactly as the lines were assembled.
  for (const line of lines) {
    console.log(line);
  }
}
/**
 * Script entry point: parse the flags, mark the background job as running, run the
 * import, print the summary and record the job outcome.
 *
 * Any failure is logged, recorded on the background job when an id is known, and
 * reported through a non-zero exit code. The Prisma client is disconnected on
 * both the success and the failure path.
 */
async function main(): Promise<void> {
  let jobId: string | null = null;
  try {
    const { dryRun, resumeFromCell, jobId: argJobId } = parseArgs();
    // Remember the requested id first, so a failure while marking the job as running can still be recorded on it.
    jobId = argJobId ?? null;
    jobId = await createOrResumeJob(argJobId);
    if (dryRun) {
      console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
    }
    const stats = await importFromBaidu(dryRun, resumeFromCell, jobId);
    printSummary(stats, dryRun);
    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    // process.exit() here would end the process before the finally block runs;
    // setting exitCode keeps the failure status and still lets the cleanup happen.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}
main().catch((error) => {
  // Only reachable when the cleanup in main() itself fails.
  console.error('Fatal error:', error);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,641 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from bohosluzby.cz (Czech Republic)
*
* bohosluzby.cz is the official Czech bishops' conference mass schedule finder.
* It exposes a JSON API with two main endpoints:
* - POST /index.php/apiWeb/allData — returns all churches (clustered by zoom level)
* - GET /index.php/apiWeb/detailById?id={id} — returns mass schedule details
*
* The API requires no authentication. We fetch all churches at zoom=7 (covers
* all of Czech Republic in one request with clustered results), then fetch
* individual detail pages for mass schedules.
*
* Import strategy:
* 1. Fetch all churches via allData endpoint (zoom=7, centered on Czech Republic)
* 2. Flatten clustered results to get individual church records
* 3. For each church, fetch detail to get mass schedules
* 4. Match against existing Czech churches via church-matcher
* 5. Upsert churches and mass schedules
*
* Usage:
* npx tsx scripts/import-bohosluzby.ts --all --dry-run
* npx tsx scripts/import-bohosluzby.ts --all
* npx tsx scripts/import-bohosluzby.ts --id 10009 --dry-run # Single church
* npx tsx scripts/import-bohosluzby.ts --all --resume-from 500
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string for this script; falls back to a local development database when DATABASE_URL is unset.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the ":password@" part of the URL replaced by ":***@" so the secret stays out of the logs.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// Dedicated pg pool for this script; it is closed explicitly once main() has settled.
const pool = new Pool({
connectionString: dbUrl,
// URLs containing "neon" get TLS with certificate verification switched off; everything else uses the pg default.
// NOTE(review): rejectUnauthorized: false accepts any server certificate — confirm this is intentional.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client that talks to the database through the pool above via the pg driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Origin that both API endpoints (allData, detailById) are requested from.
const BASE_URL = 'https://bohosluzby.cirkev.cz';
// Sent as the User-Agent header on every request unless the caller supplies its own.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Be polite — fetchWithRetry waits this long before every request after the first one.
const REQUEST_DELAY_MS = 500;
// Pause between attempts when a request is retried.
const RETRY_DELAY_MS = 5000;
// Total number of attempts per request, the first one included.
const MAX_RETRIES = 3;
// Czech Republic center coordinates for the allData request
const CZ_CENTER_LAT = 49.8;
const CZ_CENTER_LNG = 15.5;
const CZ_ZOOM = 7; // Returns all churches clustered into ~7 groups
// ─── Types ───────────────────────────────────────────────────────────────────
/** One church record as flattened out of the allData response (or built by hand in single-church mode). */
interface BohosluzbyChurch {
// bohosluzby identifier; stored as bohosluzbyId and used for detailById requests
id: string;
name: string;
street: string | null;
city: string | null;
psc: string | null; // zip code
// Coordinates parsed with parseFloat from the API values; a 0/0 pair is treated as "no location" by processChurch
latitude: number;
longitude: number;
type: string; // KOSTEL, KAPLE, etc.
}
/**
 * One weekly mass slot parsed from a church's detail response.
 * Only dayOfWeek and time are written to the database; language, type and note are collected but not persisted by processChurch.
 */
interface BohosluzbySchedule {
dayOfWeek: number; // 0=Sunday, 1=Monday, ...
time: string; // HH:MM
language: string;
type: string; // "mše sv.", "růženec", etc.
note: string | null;
}
/** Running counters for one bohosluzby import run; printed in the final summary. */
interface ImportStats {
// Size of the work list before any --resume-from slicing.
churchesFetched: number;
// Number of detail lookups made by processChurch (none in dry-run mode).
detailsFetched: number;
// Churches for which the matcher returned an existing record.
churchesMatched: number;
// Churches inserted as new records (in dry-run mode: churches that would be inserted).
churchesCreated: number;
// Churches skipped: 0/0 coordinates, or a unique-constraint conflict while writing.
churchesSkipped: number;
// Mass schedule rows written.
schedulesCreated: number;
errors: number;
}
/** Options parsed from the command line by parseArgs. */
interface CLIArgs {
// --all: import every church from the allData listing
all: boolean;
// --dry-run: report only, no database writes for churches or schedules
dryRun: boolean;
// --resume-from <n>: number of leading churches to skip
resumeFrom?: number;
// --id <id>: import a single church by its bohosluzby id
churchId?: string;
// --job-id <uuid>: background job row to update
jobId?: string;
}
// ─── HTTP Client ─────────────────────────────────────────────────────────────
// Number of fetchWithRetry calls made so far; used to skip the politeness delay on the first call and printed in the summary.
let requestCount = 0;
/**
 * Resolve after the given number of milliseconds.
 *
 * @param ms - how long to wait
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Fetch a URL and parse the response body as JSON, retrying transient failures.
 *
 * Every call after the first waits REQUEST_DELAY_MS before touching the network.
 * HTTP 429/503 responses and thrown errors (network failures, unparsable JSON)
 * are retried after RETRY_DELAY_MS, up to MAX_RETRIES attempts in total. Any
 * other non-2xx status gives up immediately.
 *
 * @param url - absolute URL to request
 * @param options - fetch options; its headers are merged over the default User-Agent
 * @returns the parsed JSON body, or null when the request ultimately failed
 */
async function fetchWithRetry(url: string, options: RequestInit = {}): Promise<any | null> {
  const isFirstRequest = requestCount === 0;
  if (!isFirstRequest) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount += 1;

  let attempt = 0;
  while (attempt < MAX_RETRIES) {
    attempt += 1;
    const canRetry = attempt < MAX_RETRIES;
    try {
      const response = await fetch(url, {
        ...options,
        headers: {
          'User-Agent': USER_AGENT,
          ...options.headers,
        },
      });

      const throttled = response.status === 503 || response.status === 429;
      if (throttled && canRetry) {
        console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
        await delay(RETRY_DELAY_MS);
        continue;
      }
      if (throttled) {
        console.error(` HTTP ${response.status} after ${MAX_RETRIES} attempts`);
        return null;
      }

      // Any other error status is treated as permanent.
      if (!response.ok) {
        console.error(` HTTP ${response.status} from ${url}`);
        return null;
      }

      return await response.json();
    } catch (error) {
      if (canRetry) {
        console.log(` Network error — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
        await delay(RETRY_DELAY_MS);
        continue;
      }
      console.error(` API error after ${MAX_RETRIES} attempts: ${error instanceof Error ? error.message : error}`);
      return null;
    }
  }
  return null;
}
// ─── API Methods ─────────────────────────────────────────────────────────────
/**
 * Download the full church listing from the allData endpoint.
 *
 * The response groups churches into clusters: each cluster entry is itself a
 * church and may carry further churches in its `indices` array. The result is
 * the flattened list, cluster entry first, followed by its members.
 *
 * @returns every church found, or an empty array when the request failed
 */
async function fetchAllChurches(): Promise<BohosluzbyChurch[]> {
  console.log('Fetching all churches from allData endpoint...');

  const form = new URLSearchParams([
    ['institutionTypes', "'KOSTEL'"],
    ['latitude', String(CZ_CENTER_LAT)],
    ['longitude', String(CZ_CENTER_LNG)],
    ['zoom', String(CZ_ZOOM)],
  ]);
  const payload = await fetchWithRetry(`${BASE_URL}/index.php/apiWeb/allData`, {
    method: 'POST',
    body: form,
    headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
  });
  if (!payload) {
    console.error('Failed to fetch allData');
    return [];
  }

  // Flatten first: the cluster representative, then the sub-items listed in its indices array.
  const rawRecords = [];
  for (const cluster of payload["'KOSTEL'"] || []) {
    rawRecords.push(cluster);
    if (Array.isArray(cluster.indices)) {
      for (const member of cluster.indices) {
        rawRecords.push(member);
      }
    }
  }

  // Then convert every raw record the same way.
  const churches: BohosluzbyChurch[] = rawRecords.map((raw) => ({
    id: raw.id,
    name: raw.name,
    street: raw.street || null,
    city: raw.city || null,
    psc: raw.psc || null,
    latitude: parseFloat(raw.latitude),
    longitude: parseFloat(raw.longitude),
    type: raw.type || 'KOSTEL',
  }));

  console.log(`Fetched ${churches.length} churches from allData`);
  return churches;
}
/**
 * Fetch the regular mass schedule of one church from the detailById endpoint.
 *
 * Entries whose service type is present but does not contain "mše" are dropped,
 * as are entries without a time. Each digit of `periodic_days` becomes one
 * schedule row; duplicate day/time pairs are emitted once.
 *
 * @param churchId - bohosluzby identifier of the church
 * @returns the parsed schedule rows; empty when the request failed or the
 *          response has no `church` object
 */
async function fetchChurchDetail(churchId: string): Promise<BohosluzbySchedule[]> {
  // The id may come straight from the command line, so escape it before placing it in the query string.
  const data = await fetchWithRetry(`${BASE_URL}/index.php/apiWeb/detailById?id=${encodeURIComponent(churchId)}`);
  if (!data || !data.church) return [];

  const regular = Array.isArray(data.church.regular) ? data.church.regular : [];
  const schedules: BohosluzbySchedule[] = [];
  // Day/time pairs already emitted for this church.
  const seen = new Set<string>();

  for (const entry of regular) {
    // Only import "mše sv." (Holy Mass) entries
    if (entry.chst_name && !entry.chst_name.includes('mše')) continue;
    const time = entry.cas; // used as-is; presumably already HH:MM — confirm against the API
    if (!time) continue;

    // periodic_days holds one digit per day, e.g. "12345" = Mon-Fri, "6" = Sat, "7" = Sun.
    // String() keeps the loop working should the API send the value as a number.
    const periodicDays = String(entry.periodic_days || '');
    for (const dayChar of periodicDays) {
      const bohosluzbyDay = Number.parseInt(dayChar, 10);
      // Skip non-digits and digits that are not a weekday (8, 9).
      if (Number.isNaN(bohosluzbyDay) || bohosluzbyDay > 7) continue;
      // bohosluzby: 1=Mon ... 6=Sat, 7=Sun; ours: 0=Sun, 1=Mon ... 6=Sat (a literal 0 also maps to Sunday, as before).
      const dayOfWeek = bohosluzbyDay % 7;
      const key = `${dayOfWeek}:${time}`;
      if (seen.has(key)) continue;
      seen.add(key);
      schedules.push({
        dayOfWeek,
        time,
        language: entry.chsl_name || 'česky',
        type: entry.chst_name || 'mše sv.',
        note: entry.note || null,
      });
    }
  }
  return schedules;
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Load every church stored with country 'CZ', restricted to the columns listed in the select below.
 *
 * @returns the rows typed as ExistingChurch, for use by the duplicate matcher
 */
async function loadExistingCzechChurches(): Promise<ExistingChurch[]> {
console.log('Loading existing Czech churches for deduplication...');
// The select literal is kept inline: the row type returned by findMany is derived from it.
const churches = await prisma.church.findMany({
where: { country: 'CZ' },
select: {
id: true,
name: true,
latitude: true,
longitude: true,
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing Czech churches`);
return churches;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Report whether `error` is a unique-constraint violation raised by Prisma.
 */
function isUniqueConstraintError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;
  // Prisma tags its known request errors with a stable code; P2002 is the unique-constraint violation.
  if ((error as Error & { code?: unknown }).code === 'P2002') return true;
  // Fall back to the message text for errors that carry no code.
  return error.message.includes('Unique constraint');
}

/**
 * Import a single bohosluzby church: match it against the known churches, then
 * either link the existing record (replacing its mass schedules when new ones
 * were fetched) or create a new record with its schedules.
 *
 * @param church - the church to import
 * @param existingChurches - known churches used for matching; a newly created church is appended to it
 * @param dryRun - when true, only count what would be matched or created; nothing is fetched or written
 * @param stats - counters updated in place
 * @throws whatever the church update throws, other than a unique-constraint
 *         conflict (which is counted as skipped)
 */
async function processChurch(
  church: BohosluzbyChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // A church without a usable position cannot be matched or stored. 0/0 is the
  // placeholder used when coordinates are missing; NaN appears when parseFloat
  // could not read the API value.
  const { latitude, longitude } = church;
  const coordinatesUsable =
    Number.isFinite(latitude) && Number.isFinite(longitude) && !(latitude === 0 && longitude === 0);
  if (!coordinatesUsable) {
    stats.churchesSkipped++;
    return;
  }

  // Fetch mass schedules (skipped in dry-run mode to avoid one request per church)
  let schedules: BohosluzbySchedule[] = [];
  if (!dryRun) {
    schedules = await fetchChurchDetail(church.id);
    stats.detailsFetched++;
  }

  const candidate = {
    name: church.name,
    lat: church.latitude,
    lng: church.longitude,
    bohosluzbyId: church.id,
  };
  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
    } else {
      stats.churchesCreated++;
    }
    return;
  }

  if (duplicate) {
    stats.churchesMatched++;
    // Link the existing record to this source and fill in the address only when it has none.
    const updateData: Record<string, unknown> = { bohosluzbyId: church.id };
    if (!duplicate.address && church.street) updateData.address = church.street;
    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      // Another record already holds this bohosluzbyId: count it as skipped instead of failing.
      if (isUniqueConstraintError(error)) {
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }

    if (schedules.length > 0) {
      try {
        // Replace the church's schedules atomically so a failure cannot leave it without any.
        await prisma.$transaction(async (tx) => {
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: duplicate.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'Czech',
            })),
          });
          await tx.church.update({
            where: { id: duplicate.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(` Error saving schedules for ${church.id}: ${error instanceof Error ? error.message : error}`);
      }
    }
  } else {
    try {
      const newChurch = await prisma.church.create({
        data: {
          name: church.name,
          latitude: church.latitude,
          longitude: church.longitude,
          address: church.street,
          zip: church.psc,
          city: church.city,
          country: 'CZ',
          bohosluzbyId: church.id,
          source: 'bohosluzby',
          websiteLanguage: 'cs',
        },
      });
      stats.churchesCreated++;

      // Make the new church visible to the matcher for the rest of this run.
      existingChurches.push({
        id: newChurch.id,
        name: church.name,
        latitude: church.latitude,
        longitude: church.longitude,
        osmId: null,
        baiduId: null,
        masstimesId: null,
        orarimesseId: null,
        massSchedulesPhId: null,
        philmassId: null,
        horariosMisasId: null,
        mszeInfoId: null,
        weekdayMassesId: null,
        messesInfoId: null,
        bohosluzbyId: church.id,
        miserendId: null,
        kerknetId: null,
        gottesdienstzeitenId: null,
        discovermassId: null,
        source: 'bohosluzby',
        website: null,
        phone: null,
        address: church.street,
      });

      if (schedules.length > 0) {
        await prisma.massSchedule.createMany({
          data: schedules.map((s) => ({
            churchId: newChurch.id,
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            language: 'Czech',
          })),
        });
        await prisma.church.update({
          where: { id: newChurch.id },
          data: { lastScrapedAt: new Date() },
        });
        stats.schedulesCreated += schedules.length;
      }
    } catch (error) {
      if (isUniqueConstraintError(error)) {
        stats.churchesSkipped++;
        return;
      }
      stats.errors++;
      console.error(` Error creating ${church.id}: ${error instanceof Error ? error.message : error}`);
    }
  }
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse the command-line flags of the bohosluzby importer.
 *
 * Prints the usage text and exits with status 0 for --help/-h. Prints an error
 * and exits with status 1 when neither --all nor --id is given, or when
 * --resume-from is not followed by a non-negative integer.
 *
 * @returns the parsed options
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };
  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        // parseInt would yield NaN for a missing or non-numeric value, and main() would then
        // silently ignore the flag and start from the beginning; reject it up front instead.
        if (raw === undefined || !/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw ?? ''}"`);
          process.exit(1);
        }
        result.resumeFrom = Number(raw);
        break;
      }
      case '--id':
        result.churchId = args[++i];
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-bohosluzby.ts [options]
Options:
--all Import all churches
--id <id> Import a single church by bohosluzby ID
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N churches
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-bohosluzby.ts --id 10009 --dry-run
npx tsx scripts/import-bohosluzby.ts --all --dry-run
npx tsx scripts/import-bohosluzby.ts --all
`);
        process.exit(0);
    }
  }
  // One of the two modes is mandatory.
  if (!result.all && !result.churchId) {
    console.error('Error: specify --all or --id <bohosluzby_id>');
    process.exit(1);
  }
  return result;
}
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", using the
 * largest unit that is non-zero. Fractions of a second are dropped.
 *
 * @param ms - duration in milliseconds
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const hours = Math.floor(totalMinutes / 60);

  const secondsPart = `${totalSeconds % 60}s`;
  if (hours > 0) {
    return `${hours}h ${totalMinutes % 60}m ${secondsPart}`;
  }
  if (totalMinutes > 0) {
    return `${totalMinutes}m ${secondsPart}`;
  }
  return `${totalSeconds}s`;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Entry point of the bohosluzby importer.
 *
 * Parses the CLI flags, marks the background job (if any) as running, loads the
 * existing Czech churches, builds the work list (a single church for --id,
 * otherwise the full allData listing), processes each church, prints a summary
 * and finally records the outcome on the background job.
 */
async function main() {
const args = parseArgs();
const startTime = Date.now();
console.log('\n' + '='.repeat(70));
console.log('BOHOSLUZBY.CZ (CZECH REPUBLIC) IMPORTER');
console.log('='.repeat(70));
console.log(`Mode: ${args.churchId ? `Church ID ${args.churchId}` : 'All churches'}`);
console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`);
console.log(`Time: ${new Date().toISOString()}`);
console.log('='.repeat(70) + '\n');
// Job tracking is best-effort: a failure to mark the job as running is ignored.
if (args.jobId) {
try {
await prisma.backgroundJob.update({
where: { id: args.jobId },
data: { status: 'running', startedAt: new Date() },
});
} catch { /* Job might not exist */ }
}
const stats: ImportStats = {
churchesFetched: 0,
detailsFetched: 0,
churchesMatched: 0,
churchesCreated: 0,
churchesSkipped: 0,
schedulesCreated: 0,
errors: 0,
};
const existingChurches = await loadExistingCzechChurches();
let churches: BohosluzbyChurch[];
if (args.churchId) {
// Single church mode — create a minimal record and fetch detail
// The placeholder keeps 0/0 coordinates when the lookup below fails, and processChurch skips 0/0 records.
churches = [{
id: args.churchId,
name: `Church ${args.churchId}`,
street: null,
city: null,
psc: null,
latitude: 0,
longitude: 0,
type: 'KOSTEL',
}];
// Fetch detail to get actual data
// NOTE(review): outside dry-run mode processChurch requests the same detail URL again through fetchChurchDetail.
const detail = await fetchWithRetry(`${BASE_URL}/index.php/apiWeb/detailById?id=${args.churchId}`);
if (detail?.church?.institution?.[0]) {
const inst = detail.church.institution[0];
churches[0].name = inst.name || churches[0].name;
churches[0].street = inst.street || null;
churches[0].city = inst.city || null;
// Unparsable coordinates fall back to 0, i.e. "no location".
churches[0].latitude = parseFloat(inst.latitude) || 0;
churches[0].longitude = parseFloat(inst.longitude) || 0;
}
} else {
churches = await fetchAllChurches();
}
// Counted before the resume slice, so this is the size of the full listing.
stats.churchesFetched = churches.length;
// A resumeFrom of 0 (or NaN) is falsy and therefore means "start from the beginning".
if (args.resumeFrom) {
churches = churches.slice(args.resumeFrom);
console.log(`Resuming from index ${args.resumeFrom} (${churches.length} remaining)\n`);
}
console.log(`Processing ${churches.length} churches\n`);
for (let i = 0; i < churches.length; i++) {
const church = churches[i];
// Progress line for every 100th church.
if (i % 100 === 0) {
const elapsed = formatDuration(Date.now() - startTime);
console.log(`[${i + 1}/${churches.length}] Processing ${church.name} (${church.id}) [${elapsed} elapsed]`);
}
// A failure on one church is counted and logged; the loop carries on with the next one.
try {
await processChurch(church, existingChurches, args.dryRun, stats);
} catch (error) {
stats.errors++;
console.error(` ERROR processing church ${church.id}: ${error instanceof Error ? error.message : error}`);
}
}
const totalTime = Date.now() - startTime;
console.log('\n' + '='.repeat(70));
console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
console.log('='.repeat(70));
console.log(`Churches fetched: ${stats.churchesFetched}`);
console.log(`Details fetched: ${stats.detailsFetched}`);
console.log(` Matched (existing): ${stats.churchesMatched}`);
console.log(` Created (new): ${stats.churchesCreated}`);
console.log(` Skipped: ${stats.churchesSkipped}`);
console.log(`Schedules created: ${stats.schedulesCreated}`);
console.log(`Errors: ${stats.errors}`);
console.log(`Total time: ${formatDuration(totalTime)}`);
console.log(`HTTP requests: ${requestCount}`);
console.log('='.repeat(70) + '\n');
// NOTE(review): this is the only place the job leaves the 'running' state; if anything above throws, no failure status is written.
if (args.jobId) {
try {
await prisma.backgroundJob.update({
where: { id: args.jobId },
data: {
status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
completedAt: new Date(),
processed: stats.churchesFetched,
succeeded: stats.churchesCreated + stats.churchesMatched,
failed: stats.errors,
// itemsFound holds the number of schedule rows written, not the number of churches.
itemsFound: stats.schedulesCreated,
},
});
} catch { /* Ignore */ }
}
}
// Run the importer, then always release the database resources.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    // process.exit() here would end the process before the cleanup below runs;
    // setting exitCode keeps the failure status and lets the connections close first.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });

834
scripts/import-gcatholic.ts Normal file
View File

@@ -0,0 +1,834 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches from GCatholic.org
*
* GCatholic is a comprehensive Catholic directory organized by diocese.
* Each church page includes a Google Plus Code (→ lat/lng), address, phone, website, etc.
* This script discovers churches via country → diocese → church page navigation.
*
* Usage:
* npx tsx scripts/import-gcatholic.ts --country CN
* npx tsx scripts/import-gcatholic.ts --country CN --dry-run
* npx tsx scripts/import-gcatholic.ts --diocese peki0
* npx tsx scripts/import-gcatholic.ts --all
* npx tsx scripts/import-gcatholic.ts --all --limit 100
* npx tsx scripts/import-gcatholic.ts --all --resume-from PL
*/
// Load .env for database connection (before importing anything that uses process.env)
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string for this script; falls back to a local development database when DATABASE_URL is unset.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the ":password@" part of the URL replaced by ":***@" so the secret stays out of the logs.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// Dedicated pg pool for this script.
const pool = new Pool({
connectionString: dbUrl,
// URLs containing "neon" get TLS with certificate verification switched off; everything else uses the pg default.
// NOTE(review): rejectUnauthorized: false accepts any server certificate — confirm this is intentional.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client that talks to the database through the pool above via the pg driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// Plus Code decoder
// Loaded with require() rather than import (hence the lint exemption below).
// eslint-disable-next-line @typescript-eslint/no-require-imports
const { OpenLocationCode } = require('open-location-code');
// Shared decoder instance; parseChurchPage calls olc.decode() to turn a Plus Code into a latitude/longitude centre.
const olc = new OpenLocationCode();
// ─── Constants ───────────────────────────────────────────────────────────────
// Site origin that the country, diocese and church page URLs are built from.
const BASE_URL = 'https://www.gcatholic.org';
// Sent as the User-Agent header on every page request.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// NOTE(review): presumably the default pause between page requests, in milliseconds — its use is outside this view; confirm.
const DEFAULT_DELAY_MS = 1500;
// ─── Types ───────────────────────────────────────────────────────────────────
/** Structured data extracted from a single GCatholic church page. */
interface GCatholicChurch {
// NOTE(review): presumably derived from the church page URL — confirm in parseChurchPage.
gcatholicId: string;
// Church name taken from the page's <h1>.
name: string;
// Optional name taken from the page's <h2>.
localName?: string;
// Centre of the area encoded by plusCode.
lat: number;
lng: number;
address?: string;
city?: string;
state?: string;
country?: string;
phone?: string;
website?: string;
diocese?: string;
churchType?: string;
// Plus Code found on the page; pages without a decodable code are not imported.
plusCode: string;
sourceUrl: string;
}
/**
 * Counters for one GCatholic import run.
 * NOTE(review): the code that updates these fields is outside this view; meanings are inferred from the names — confirm.
 */
interface ImportStats {
churchesFound: number;
newChurchesCreated: number;
existingChurchesMerged: number;
skipped: number;
errors: number;
// Presumably human-readable messages for the errors counted above.
errorDetails: string[];
}
/**
 * Command-line options of the GCatholic importer (see the usage examples in the file header).
 * NOTE(review): the parser is outside this view; flag names below are taken from the header examples.
 */
interface CLIArgs {
// --country <ISO2>, e.g. CN
country?: string;
// --all
all: boolean;
// --diocese <code>, e.g. peki0
diocese?: string;
// --dry-run
dryRun: boolean;
// --limit <n>
limit?: number;
// Presumably the pause between page requests in milliseconds — confirm.
delay: number;
// --resume-from <ISO2>, e.g. PL
resumeFrom?: string;
}
// ─── HTTP Fetching ───────────────────────────────────────────────────────────
// Number of fetchPage calls made so far; the first call skips the rate-limit pause.
let requestCount = 0;

/**
 * Download one HTML page, pausing first so requests are spaced out.
 *
 * @param url - absolute URL of the page
 * @param delayMs - pause applied before every request except the very first
 * @returns the page body, or null for any non-2xx response or network failure
 */
async function fetchPage(url: string, delayMs: number): Promise<string | null> {
  // Rate limit: every request after the first waits delayMs.
  if (requestCount !== 0) {
    await new Promise((wake) => setTimeout(wake, delayMs));
  }
  requestCount += 1;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'en-US,en;q=0.9',
      },
    });

    if (response.ok) {
      return await response.text();
    }

    // A 404 is expected for some pages, so only other statuses are reported.
    if (response.status !== 404) {
      console.error(` HTTP ${response.status} for ${url}`);
    }
    return null;
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}
// ─── HTML Parsing ────────────────────────────────────────────────────────────
/**
 * List the country codes linked from the GCatholic dioceses index page.
 *
 * Country links look like `country/XX`, optionally prefixed with `../` or
 * `/dioceses/` and optionally ending in `.htm`, where XX is two capital letters.
 *
 * @param delayMs - pause handed to fetchPage
 * @returns the distinct codes in sorted order; empty when the page could not be fetched
 */
async function discoverCountries(delayMs: number): Promise<string[]> {
  console.log('Discovering countries from GCatholic...');

  const html = await fetchPage(`${BASE_URL}/dioceses/`, delayMs);
  if (!html) {
    console.error('Failed to fetch countries page');
    return [];
  }

  const linkPattern = /href="(?:\.\.\/|\/dioceses\/)?country\/([A-Z]{2})(?:\.htm)?"/g;
  const linkedCodes = Array.from(html.matchAll(linkPattern), (link) => link[1]);

  // Drop repeats, then order alphabetically.
  const codes = Array.from(new Set(linkedCodes)).sort();
  console.log(`Found ${codes.length} countries`);
  return codes;
}
/**
 * List the dioceses linked from a country page.
 *
 * Diocese links look like `diocese/{code}` (optionally behind `../` and/or
 * `../dioceses/`, optionally ending in `.htm`); the link text is used as the
 * diocese name. When a code is linked more than once, the first link wins.
 *
 * @param countryCode - two-letter code used to build the country page URL
 * @param delayMs - pause handed to fetchPage
 * @returns code/name pairs in page order; empty when the page could not be fetched
 */
async function discoverDioceses(countryCode: string, delayMs: number): Promise<{ code: string; name: string }[]> {
  const html = await fetchPage(`${BASE_URL}/dioceses/country/${countryCode}.htm`, delayMs);
  if (!html) {
    return [];
  }

  const linkPattern = /href="(?:\.\.\/)?(?:\.\.\/dioceses\/)?diocese\/([a-z0-9]+)(?:\.htm)?"[^>]*>([^<]+)</g;

  // A Map keeps insertion order, so the result follows the order of first appearance on the page.
  const nameByCode = new Map<string, string>();
  for (const link of Array.from(html.matchAll(linkPattern))) {
    const code = link[1];
    if (!nameByCode.has(code)) {
      nameByCode.set(code, link[2].trim());
    }
  }

  return Array.from(nameByCode, ([code, name]) => ({ code, name }));
}
/**
 * Collect the church page URLs linked from a diocese page.
 *
 * Church links look like `churches/{region}/{id}` behind any number of `../`
 * segments, optionally ending in `.htm`. Each one is turned into an absolute
 * `.htm` URL on BASE_URL.
 *
 * @param dioceseCode - code used to build the diocese page URL
 * @param delayMs - pause handed to fetchPage
 * @returns the distinct church URLs in page order; empty when the page could not be fetched
 */
async function discoverChurchLinks(dioceseCode: string, delayMs: number): Promise<string[]> {
  const html = await fetchPage(`${BASE_URL}/dioceses/diocese/${dioceseCode}.htm`, delayMs);
  if (!html) {
    return [];
  }

  const linkPattern = /href="(?:\.\.\/)*churches\/([a-z0-9-]+\/\d+)(?:\.htm)?"/g;
  const absoluteUrls = Array.from(
    html.matchAll(linkPattern),
    (link) => `${BASE_URL}/churches/${link[1]}.htm`,
  );

  // The Set removes repeated links while keeping first-seen order.
  return Array.from(new Set(absoluteUrls));
}
/**
* Parse a single GCatholic church page into a structured record.
*
* Returns null when the page cannot be used: there is no <h1> name, none of
* the three Plus Code lookups below finds a code, or olc.decode rejects the
* code that was found.
*
* @param html - Raw HTML of the church page.
* @param url - URL the page came from; stored as sourceUrl and searched for a trailing numeric id.
* @param countryCode - Optional country code; when given it is used as-is and the page is not searched for a country link.
* @returns The parsed church, or null when the name or coordinates are unavailable.
*/
function parseChurchPage(html: string, url: string, countryCode?: string): GCatholicChurch | null {
// Church name: text of the first <h1> (must contain no nested tags)
const h1Match = html.match(/<h1>([^<]+)<\/h1>/);
if (!h1Match) return null;
const name = h1Match[1].trim();
// Local-language name: text of the first <h2>, if any
const h2Match = html.match(/<h2>([^<]+)<\/h2>/);
const localName = h2Match ? h2Match[1].trim() : undefined;
// Plus Code lookup 1: the Google Maps search link in an onclick handler, e.g.
// onclick="window.open('https://www.google.com/maps/search/?api=1&query=PLUSCODE','_blank')"
// The "&" before "query" may be HTML-escaped as "&amp;".
let plusCode: string | null = null;
const plusCodeOnclickMatch = html.match(/onclick="window\.open\('https:\/\/www\.google\.com\/maps\/search\/\?api=1&(?:amp;)?query=([^']+)'/);
if (plusCodeOnclickMatch) {
// NOTE(review): decodeURIComponent throws on malformed percent-encoding and is not inside the try below.
plusCode = decodeURIComponent(plusCodeOnclickMatch[1]);
}
// Plus Code lookup 2: the text of a link carrying title="Plus Code"
if (!plusCode) {
const plusCodeTextMatch = html.match(/title="Plus Code">([A-Z0-9+]+)<\/a>/);
if (plusCodeTextMatch) {
plusCode = plusCodeTextMatch[1];
}
}
// Plus Code lookup 3: a code-shaped token after a "Location:" label.
// The pattern has no dotAll flag, so label and code must be on the same line.
if (!plusCode) {
const locationMatch = html.match(/Location:.*?>([2-9A-HJ-NP-Z][2-9A-HJ-NP-Z0-9]{3,7}\+[2-9A-HJ-NP-Z0-9]{2,3})</);
if (locationMatch) {
plusCode = locationMatch[1];
}
}
if (!plusCode) {
return null; // Can't geolocate without Plus Code
}
// Decode the Plus Code to the centre point of its cell.
// `olc` is imported outside this view — presumably an Open Location Code library; confirm.
let lat: number, lng: number;
try {
const decoded = olc.decode(plusCode);
lat = decoded.latitudeCenter;
lng = decoded.longitudeCenter;
} catch {
return null; // Invalid Plus Code
}
// GCatholic id: trailing digits of the URL path; empty string when the URL has none
const idMatch = url.match(/\/(\d+)(?:\.htm)?$/);
const gcatholicId = idMatch ? idMatch[1] : '';
// Reads the value that follows <span class="label">Label:</span>, up to the
// next </p> or <br, with any HTML tags removed. Returns undefined when the
// label is absent or the remaining text is empty.
const getField = (label: string): string | undefined => {
// Escape regex metacharacters so the label is matched literally
const escaped = label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const regex = new RegExp(`<span class="label">${escaped}:?\\s*</span>\\s*(.+?)(?:</p>|<br)`, 's');
const match = html.match(regex);
if (!match) return undefined;
// Strip HTML tags to get plain text
return match[1].replace(/<[^>]+>/g, '').trim() || undefined;
};
// Address, as plain text
const address = getField('Address');
// Phone, as plain text
const phone = getField('Telephone');
// Website: the href of the <a> that directly follows the "Website" label
let website: string | undefined;
const websiteMatch = html.match(/<span class="label">Website:?\s*<\/span>\s*<a\s+href="([^"]+)"/);
if (websiteMatch) {
website = websiteMatch[1];
// Discard anything that does not start with "http" (e.g. site-relative links)
if (website && !website.startsWith('http')) {
website = undefined;
}
}
// Diocese name comes from the "Jurisdiction" label
const diocese = getField('Jurisdiction');
// Church type: text of the first element with a class of the form "ch" + one
// lowercase letter after the "Type" label (same line only — no dotAll flag)
let churchType: string | undefined;
const typeMatch = html.match(/<span class="label">Type:?\s*<\/span>.*?class="ch[a-z]">([^<]+)/);
if (typeMatch) {
churchType = typeMatch[1].trim();
}
// Country: the caller-supplied code wins; otherwise take the first country link on the page
let country = countryCode;
if (!country) {
const countryMatch = html.match(/href="[^"]*country\/([A-Z]{2})(?:\.htm)?"/);
if (countryMatch) {
country = countryMatch[1];
}
}
// City and region from the <h3> heading: "City, <span class="zregion">Region</span>, <a class="zcountry">Country</a>"
let city: string | undefined;
let state: string | undefined;
const h3Match = html.match(/<h3>([^<]+?)(?:,\s*<span class="zregion">([^<]+)<\/span>)?(?:,\s*<a[^>]*class="zcountry"[^>]*>[^<]+<\/a>)?\s*<\/h3>/);
if (h3Match) {
city = h3Match[1].trim();
state = h3Match[2]?.trim();
// The city text is kept unmodified: it may include a local-language form
// (e.g., "Beijing 北京") that should not be stripped.
}
return {
gcatholicId,
name,
localName,
lat,
lng,
address,
city,
state,
country,
phone,
website,
diocese,
churchType,
plusCode,
sourceUrl: url,
};
}
// ─── CLI Argument Parsing ────────────────────────────────────────────────────
/**
 * Parse the importer's command-line flags from process.argv.
 *
 * Recognised flags: --country <ISO2>, --all, --diocese <code>, --dry-run,
 * --limit <n>, --delay <ms>, --resume-from <ISO2>. Unknown flags are ignored.
 * Country codes are upper-cased.
 *
 * --limit and --delay must be non-negative integers. A missing or non-numeric
 * value is reported and the process exits with code 1, rather than silently
 * turning into NaN (which would disable the limit or remove the request delay).
 *
 * @returns The parsed options, with `delay` defaulting to DEFAULT_DELAY_MS.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    delay: DEFAULT_DELAY_MS,
  };

  // Reads the value of a numeric flag, terminating the process on bad input.
  const readNonNegativeInt = (flag: string, raw: string | undefined): number => {
    const value = Number.parseInt(raw ?? '', 10);
    if (!Number.isInteger(value) || value < 0) {
      const shown = raw === undefined ? 'nothing' : `"${raw}"`;
      console.error(`Error: ${flag} expects a non-negative integer, got ${shown}`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--country':
        result.country = args[++i]?.toUpperCase();
        break;
      case '--all':
        result.all = true;
        break;
      case '--diocese':
        result.diocese = args[++i];
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--limit':
        result.limit = readNonNegativeInt('--limit', args[++i]);
        break;
      case '--delay':
        result.delay = readNonNegativeInt('--delay', args[++i]);
        break;
      case '--resume-from':
        result.resumeFrom = args[++i]?.toUpperCase();
        break;
    }
  }
  return result;
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
* Load every church row, restricted to the fields used for duplicate
* detection and fill-in-the-blanks merging, so the import can match
* candidates in memory.
*
* No `where` filter is applied: churches from all countries and sources are
* returned.
*/
async function loadExistingChurches(): Promise<ExistingChurch[]> {
console.log('Loading existing churches for deduplication...');
const churches = await prisma.church.findMany({
select: {
// Identity and position
id: true,
name: true,
latitude: true,
longitude: true,
// One external-id column per upstream data source
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
// Contact fields; importChurch only fills these when they are empty
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing churches`);
return churches;
}
/**
 * Import one parsed GCatholic church: either merge it into a matching
 * existing record or create a new one.
 *
 * In dry-run mode nothing is written; the outcome is logged and counted as
 * merged (a match exists) or new (no match). Otherwise a match only has its
 * empty phone / website / address / diocese fields filled in, and is counted
 * as skipped when there was nothing to add. A newly created church is also
 * appended to `existingChurches` so later candidates in the same run can
 * match against it.
 */
async function importChurch(
  church: GCatholicChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // findDuplicateChurch expects an OSM-style candidate, hence the synthetic osmId.
  const candidate = {
    osmId: `gcatholic-${church.gcatholicId}`,
    name: church.name,
    lat: church.lat,
    lng: church.lng,
    address: church.address,
    city: church.city,
    state: church.state,
    country: church.country,
    phone: church.phone,
    website: church.website,
    diocese: church.diocese,
  };
  const match = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (!match) {
      console.log(` [NEW] ${church.name} (${church.lat.toFixed(4)}, ${church.lng.toFixed(4)})`);
      stats.newChurchesCreated++;
    } else {
      console.log(` [MERGE] ${church.name} → existing: ${match.name} (${match.id})`);
      stats.existingChurchesMerged++;
    }
    return;
  }

  if (!match) {
    // No counterpart in the database: insert a fresh row.
    const created = await prisma.church.create({
      data: {
        name: church.name,
        latitude: church.lat,
        longitude: church.lng,
        address: church.address,
        city: church.city,
        state: church.state,
        country: church.country,
        phone: church.phone,
        website: church.website,
        hasWebsite: !!church.website,
        source: 'gcatholic',
        diocese: church.diocese,
      },
    });
    stats.newChurchesCreated++;

    // Make the new row visible to dedup for the remainder of this run.
    existingChurches.push({
      id: created.id,
      name: church.name,
      latitude: church.lat,
      longitude: church.lng,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: null,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId: null,
      kerknetId: null,
      gottesdienstzeitenId: null,
      discovermassId: null,
      source: 'gcatholic',
      website: church.website || null,
      phone: church.phone || null,
      address: church.address || null,
    });
    return;
  }

  // Matched an existing church: only fill fields that are currently empty.
  const patch: Record<string, unknown> = {};
  if (!match.phone && church.phone) {
    patch.phone = church.phone;
  }
  if (!match.website && church.website) {
    patch.website = church.website;
    patch.hasWebsite = true;
  }
  if (!match.address && church.address) {
    patch.address = church.address;
  }

  // The in-memory record does not carry the diocese, so read it from the DB.
  const stored = await prisma.church.findUnique({
    where: { id: match.id },
    select: { diocese: true },
  });
  if (stored && !stored.diocese && church.diocese) {
    patch.diocese = church.diocese;
  }

  if (Object.keys(patch).length === 0) {
    stats.skipped++;
    return;
  }
  await prisma.church.update({
    where: { id: match.id },
    data: patch,
  });
  stats.existingChurchesMerged++;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Import every church page linked from one diocese page.
 *
 * Each page is fetched, parsed and handed to importChurch. Failures on a
 * single page are recorded in `stats` and do not abort the diocese.
 *
 * @param dioceseCode - GCatholic diocese code used to build the page URL.
 * @param dioceseName - Label used in log output only.
 * @param countryCode - Passed through to parseChurchPage; may be undefined.
 * @param existingChurches - In-memory dedup list, extended by importChurch.
 * @param args - CLI options; `delay` and `dryRun` are read here.
 * @param stats - Running totals, mutated in place.
 * @param globalLimit - Optional shared budget of churches still allowed;
 *   decremented once per church handed to importChurch.
 */
async function importDiocese(
  dioceseCode: string,
  dioceseName: string,
  countryCode: string | undefined,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
  globalLimit?: { remaining: number },
): Promise<void> {
  const churchUrls = await discoverChurchLinks(dioceseCode, args.delay);
  if (churchUrls.length === 0) {
    return;
  }
  console.log(` Diocese ${dioceseName} (${dioceseCode}): ${churchUrls.length} church pages found`);

  let dioceseNew = 0;
  let dioceseMerged = 0;
  let dioceseSkipped = 0;
  let dioceseErrors = 0;

  for (const url of churchUrls) {
    // Stop fetching once the shared budget is used up, but fall through to the
    // summary below so the work already done for this diocese is reported.
    if (globalLimit && globalLimit.remaining <= 0) {
      console.log(` Limit reached, stopping`);
      break;
    }
    try {
      const html = await fetchPage(url, args.delay);
      if (!html) {
        stats.errors++;
        dioceseErrors++;
        stats.errorDetails.push(`Failed to fetch: ${url}`);
        continue;
      }
      const church = parseChurchPage(html, url, countryCode);
      if (!church) {
        stats.skipped++;
        dioceseSkipped++;
        continue;
      }
      stats.churchesFound++;

      // importChurch reports its outcome only through the shared counters, so
      // snapshot them and attribute whichever one moved to this diocese.
      const prevNew = stats.newChurchesCreated;
      const prevMerged = stats.existingChurchesMerged;
      const prevSkipped = stats.skipped;
      await importChurch(church, existingChurches, args.dryRun, stats);
      if (stats.newChurchesCreated > prevNew) dioceseNew++;
      if (stats.existingChurchesMerged > prevMerged) dioceseMerged++;
      if (stats.skipped > prevSkipped) dioceseSkipped++;

      if (globalLimit) globalLimit.remaining--;
    } catch (error) {
      stats.errors++;
      dioceseErrors++;
      const msg = error instanceof Error ? error.message : String(error);
      stats.errorDetails.push(`${url}: ${msg}`);
      console.error(` Error processing ${url}: ${msg}`);
    }
  }

  const parts = [`${dioceseNew} new`, `${dioceseMerged} merged`];
  if (dioceseSkipped > 0) parts.push(`${dioceseSkipped} skipped`);
  if (dioceseErrors > 0) parts.push(`${dioceseErrors} errors`);
  console.log(`${parts.join(', ')}`);
}
/**
 * Import all dioceses of one country and return the totals for that country.
 *
 * Dioceses are processed in discovery order; processing stops early once the
 * optional shared `globalLimit` budget reaches zero.
 */
async function importCountry(
  countryCode: string,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  globalLimit?: { remaining: number },
): Promise<ImportStats> {
  const countryStats: ImportStats = {
    churchesFound: 0,
    newChurchesCreated: 0,
    existingChurchesMerged: 0,
    skipped: 0,
    errors: 0,
    errorDetails: [],
  };

  const banner = '='.repeat(60);
  console.log(`\n${banner}`);
  console.log(`Importing from GCatholic: ${countryCode}`);
  console.log(`${banner}`);

  const dioceseList = await discoverDioceses(countryCode, args.delay);
  if (dioceseList.length === 0) {
    console.log(`No dioceses found for ${countryCode}`);
    return countryStats;
  }
  console.log(`Found ${dioceseList.length} dioceses in ${countryCode}`);

  for (const { code, name } of dioceseList) {
    const budgetExhausted = globalLimit !== undefined && globalLimit.remaining <= 0;
    if (budgetExhausted) {
      break;
    }
    await importDiocese(code, name, countryCode, existingChurches, args, countryStats, globalLimit);
  }
  return countryStats;
}
// ─── Summary Printing ────────────────────────────────────────────────────────
/**
 * Print the totals for one import scope (a diocese or a country).
 *
 * The error line is only shown when at least one error was counted; the
 * header is tagged "(DRY RUN)" when nothing was written.
 */
function printSummary(label: string, stats: ImportStats, dryRun: boolean): void {
  const rule = '─'.repeat(60);
  const dryRunTag = dryRun ? '(DRY RUN)' : '';

  const lines: string[] = [
    `\n${rule}`,
    `Summary: ${label} ${dryRunTag}`,
    `${rule}`,
    `Churches found on GCatholic: ${stats.churchesFound}`,
    `New churches created: ${stats.newChurchesCreated}`,
    `Merged with existing: ${stats.existingChurchesMerged}`,
    `Skipped (no data/dup): ${stats.skipped}`,
  ];
  if (stats.errors > 0) {
    lines.push(`Errors: ${stats.errors}`);
  }
  lines.push(`${rule}`);

  // One console.log call per line, as before.
  for (const line of lines) {
    console.log(line);
  }
}
// ─── Job Tracking ────────────────────────────────────────────────────────────
/**
 * Mark the background job named by `--job-id <id>` as running.
 *
 * Job tracking is best-effort: a missing flag, a flag without a value, or a
 * failed database update all result in `null` (tracking disabled) instead of
 * an exception, so a bookkeeping problem never prevents the import itself.
 *
 * @param args - Raw CLI arguments (process.argv without the first two entries).
 * @returns The job id when the job was marked running, otherwise null.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) {
    return null;
  }

  const jobId = args[flagIndex + 1];
  // Guard against "--job-id" being the last argument or followed by another flag;
  // passing undefined to the update below would otherwise throw.
  if (!jobId || jobId.startsWith('--')) {
    console.error('Warning: --job-id was given without a value; job tracking disabled');
    return null;
  }

  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: { status: 'running', startedAt: new Date() },
    });
  } catch (err) {
    // e.g. the job row does not exist
    console.error(`Failed to mark job ${jobId} as running:`, err);
    return null;
  }
  return jobId;
}
/**
 * Record the final state of a tracked background job.
 *
 * Does nothing when no job id is available. The job becomes 'failed' when a
 * non-empty error message is supplied and 'completed' otherwise. A failing
 * update is logged and swallowed so it cannot mask the import's own outcome.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (jobId === null || jobId === '') {
    return;
  }
  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        status: error ? 'failed' : 'completed',
        error: error || null,
        completedAt: new Date(),
      },
    });
  } catch (updateError) {
    console.error(`Failed to update job ${jobId}:`, updateError);
  }
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Entry point: run the import in single-diocese, single-country or
 * all-countries mode, depending on the CLI flags.
 *
 * Every failure path goes through the same exit route: the tracked job (if
 * any) is marked failed, the database connections are closed, and only then
 * does the process exit with code 1. Calling process.exit() inside the `try`
 * would skip both steps and leave the job stuck in "running".
 */
async function main(): Promise<void> {
  const args = parseArgs();
  let jobId: string | null = null;
  let failed = false;

  try {
    jobId = await createOrResumeJob(process.argv.slice(2));

    if (!args.country && !args.all && !args.diocese) {
      console.error('Error: Must specify --country <ISO2>, --diocese <code>, or --all');
      console.error('Usage:');
      console.error(' npx tsx scripts/import-gcatholic.ts --country CN');
      console.error(' npx tsx scripts/import-gcatholic.ts --country CN --dry-run');
      console.error(' npx tsx scripts/import-gcatholic.ts --diocese peki0');
      console.error(' npx tsx scripts/import-gcatholic.ts --all');
      console.error(' npx tsx scripts/import-gcatholic.ts --all --limit 500');
      console.error(' npx tsx scripts/import-gcatholic.ts --all --resume-from PL');
      // The job was already marked running above; close it out before leaving.
      await completeJob(jobId, 'Invalid arguments: must specify --country, --diocese or --all');
      failed = true;
      return;
    }

    if (args.dryRun) {
      console.log('\n*** DRY RUN MODE — no changes will be written to database ***\n');
    }
    console.log(`Delay between requests: ${args.delay}ms`);
    if (args.limit) console.log(`Limit: ${args.limit} churches`);

    const existingChurches = await loadExistingChurches();
    // Shared, mutable budget so the limit applies across dioceses and countries.
    const globalLimit = args.limit ? { remaining: args.limit } : undefined;

    if (args.diocese) {
      // Single diocese mode
      const stats: ImportStats = {
        churchesFound: 0,
        newChurchesCreated: 0,
        existingChurchesMerged: 0,
        skipped: 0,
        errors: 0,
        errorDetails: [],
      };
      await importDiocese(args.diocese, args.diocese, args.country, existingChurches, args, stats, globalLimit);
      printSummary(`Diocese ${args.diocese}`, stats, args.dryRun);
    } else if (args.country) {
      // Single country mode
      const stats = await importCountry(args.country, existingChurches, args, globalLimit);
      printSummary(args.country, stats, args.dryRun);
    } else if (args.all) {
      // All countries mode — discover the country list from GCatholic
      let countries = await discoverCountries(args.delay);
      if (countries.length === 0) {
        throw new Error('Failed to discover countries');
      }

      // Handle --resume-from: drop every country sorted before the given code
      if (args.resumeFrom) {
        const idx = countries.indexOf(args.resumeFrom);
        if (idx === -1) {
          throw new Error(`Country ${args.resumeFrom} not found in GCatholic listing`);
        }
        console.log(`Resuming from ${args.resumeFrom} (skipping ${idx} countries)\n`);
        countries = countries.slice(idx);
      }
      console.log(`Will process ${countries.length} countries\n`);

      const totalStats: ImportStats = {
        churchesFound: 0,
        newChurchesCreated: 0,
        existingChurchesMerged: 0,
        skipped: 0,
        errors: 0,
        errorDetails: [],
      };
      let countriesProcessed = 0;

      for (const countryCode of countries) {
        if (globalLimit && globalLimit.remaining <= 0) {
          console.log(`\nGlobal limit reached, stopping.`);
          break;
        }
        const stats = await importCountry(countryCode, existingChurches, args, globalLimit);
        printSummary(countryCode, stats, args.dryRun);

        // Aggregate
        totalStats.churchesFound += stats.churchesFound;
        totalStats.newChurchesCreated += stats.newChurchesCreated;
        totalStats.existingChurchesMerged += stats.existingChurchesMerged;
        totalStats.skipped += stats.skipped;
        totalStats.errors += stats.errors;
        totalStats.errorDetails.push(...stats.errorDetails);
        countriesProcessed++;

        // Small extra delay between countries
        await new Promise((resolve) => setTimeout(resolve, 2000));
      }

      // Overall summary
      console.log(`\n${'='.repeat(60)}`);
      console.log(`OVERALL SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
      console.log(`${'='.repeat(60)}`);
      console.log(`Countries processed: ${countriesProcessed}`);
      console.log(`Total churches found: ${totalStats.churchesFound}`);
      console.log(`Total new churches created: ${totalStats.newChurchesCreated}`);
      console.log(`Total merged with existing: ${totalStats.existingChurchesMerged}`);
      console.log(`Total skipped: ${totalStats.skipped}`);
      if (totalStats.errors > 0) {
        console.log(`Total errors: ${totalStats.errors}`);
      }
      console.log(`Total HTTP requests made: ${requestCount}`);
      console.log(`${'='.repeat(60)}\n`);

      // Cap the error listing at 50 entries to keep the log readable
      if (totalStats.errorDetails.length > 0 && totalStats.errorDetails.length <= 50) {
        console.log('\nError details:');
        totalStats.errorDetails.forEach((e) => console.log(` - ${e}`));
      } else if (totalStats.errorDetails.length > 50) {
        console.log(`\nFirst 50 errors (of ${totalStats.errorDetails.length}):`);
        totalStats.errorDetails.slice(0, 50).forEach((e) => console.log(` - ${e}`));
      }
    }

    await completeJob(jobId);
  } catch (error) {
    failed = true;
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
  } finally {
    await prisma.$disconnect();
    await pool.end();
    // Exit explicitly only after cleanup so the failure status is preserved.
    if (failed) process.exit(1);
  }
}

// Anything that escapes main() (e.g. a failure while disconnecting) must still
// produce a non-zero exit code instead of an unhandled rejection.
main().catch((error) => {
  console.error('Unhandled error:', error);
  process.exit(1);
});

View File

@@ -0,0 +1,686 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from gottesdienstzeiten.de (Germany)
*
* gottesdienstzeiten.de is a German worship service directory with ~6,878 Catholic
* churches. It runs on WordPress with a fully open REST API at /wp-json/wp/v2/posts.
*
* Data includes: church name, address, coordinates (Google Maps embed), diocese,
* mass schedules (day/type/time table), website, email, phone.
*
* Import strategy:
* 1. Fetch all Catholic diocese category IDs from WP API
* 2. Paginate through posts per category (100 per page)
* 3. Parse HTML content for coordinates, address, schedule table, info table
* 4. Match against existing German churches via church-matcher
* 5. Upsert churches and mass schedules
*
* Usage:
* npx tsx scripts/import-gottesdienstzeiten.ts --all --dry-run
* npx tsx scripts/import-gottesdienstzeiten.ts --all
* npx tsx scripts/import-gottesdienstzeiten.ts --diocese 129 --dry-run # Köln only
* npx tsx scripts/import-gottesdienstzeiten.ts --all --resume-from 5
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string from DATABASE_URL, falling back to a local Postgres instance.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the ":password@" portion replaced by ":***@" so the credential is not printed.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// URLs containing "neon" get SSL with certificate verification disabled; all others leave `ssl` unset.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma is wired to the pg pool through the driver adapter; `pool` and
// `prisma` are the module-wide handles used by the functions below.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Root of the site's WordPress REST API; endpoint paths are appended to it.
const API_BASE = 'https://gottesdienstzeiten.de/wp-json/wp/v2';
// Sent as the User-Agent header on every request made by fetchJson.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Pause before every request except the very first one.
const REQUEST_DELAY_MS = 1000;
// Pause before retrying after HTTP 429/503 or a thrown fetch error.
const RETRY_DELAY_MS = 5000;
// Total number of attempts per request (first try included).
const MAX_RETRIES = 3;
// Page size requested from the posts endpoint.
const POSTS_PER_PAGE = 100;
// Parent category whose child categories are fetched as the Catholic dioceses.
const CATHOLIC_PARENT_CATEGORY = 4;
// German day names → dayOfWeek (0=Sun, 1=Mon, ..., 6=Sat).
// Keys are lower-case; parsePost lower-cases the header text before the lookup.
const GERMAN_DAYS: Record<string, number> = {
'sonntags': 0, 'montags': 1, 'dienstags': 2, 'mittwochs': 3,
'donnerstags': 4, 'freitags': 5, 'samstags': 6,
// Without -s suffix (some entries use these)
'sonntag': 0, 'montag': 1, 'dienstag': 2, 'mittwoch': 3,
'donnerstag': 4, 'freitag': 5, 'samstag': 6,
};
// Service types (lower-case) that count as a mass; rows of any other type are
// dropped. parsePost additionally accepts any type containing "messe",
// "messfeier" or "eucharistie".
const MASS_TYPES = new Set([
'messfeier', 'vorabendmesse', 'heilige messe', 'hl. messe',
'hochamt', 'festmesse', 'familienmesse', 'kindergottesdienst',
'jugendmesse', 'abendmesse', 'frühmesse', 'werktagsmesse',
'sonntagsmesse', 'messe', 'eucharistiefeier',
]);
// ─── Types ───────────────────────────────────────────────────────────────────
// One diocese category as returned by the WordPress categories endpoint.
interface DioceseCat {
// WordPress category id; used as the `categories` filter when listing posts.
id: number;
// Category name; passed to parsePost as the diocese name.
name: string;
// Post count for the category; processDiocese derives the number of pages from it.
count: number;
}
// A church extracted from one WordPress post by parsePost.
interface ParsedChurch {
// WordPress post id; its string form is stored as gottesdienstzeitenId.
wpId: number;
// WordPress post slug; used to identify the church in error logs.
slug: string;
// Post title with HTML tags stripped and a leading "(City)" prefix removed.
name: string;
// Coordinates taken from the "maps?q=lat,lng" embed in the post content.
latitude: number;
longitude: number;
// Text of the first <strong> element; reduced to the street part when a ", ZIP City" suffix was recognised.
address: string | null;
// Five-digit postal code and city split off the address, when present.
zip: string | null;
city: string | null;
// Diocese name handed to parsePost by the caller.
diocese: string | null;
// Contact details read from the post's second table.
website: string | null;
email: string | null;
phone: string | null;
// De-duplicated mass times read from the post's first table.
schedules: ParsedSchedule[];
}
// One recurring weekly mass time.
interface ParsedSchedule {
// Day index as defined by GERMAN_DAYS: 0=Sun, 1=Mon, ..., 6=Sat.
dayOfWeek: number;
// Time as "HH:MM" with a zero-padded hour, e.g. "09:00".
time: string;
}
// Running totals for one import run; mutated in place by processDiocese.
interface ImportStats {
// Incremented once per completed processDiocese call.
diocesesProcessed: number;
// Posts returned by the API across all fetched pages.
postsFound: number;
// Posts that parsePost turned into a church.
churchesParsed: number;
// Parsed churches that matched an existing record.
churchesMatched: number;
// Churches created (in dry-run mode: churches that would be created).
churchesCreated: number;
// Posts parsePost rejected, plus writes abandoned on a unique-constraint error.
churchesSkipped: number;
// Schedule rows written (in dry-run mode: rows that would be written).
schedulesCreated: number;
// Failures logged while saving schedules or creating churches.
errors: number;
}
// Options accepted on the command line (see parseArgs).
interface CLIArgs {
// --all: import every Catholic diocese category.
all: boolean;
// --dry-run: report what would happen without writing to the database.
dryRun: boolean;
// --resume-from <n>: number of leading diocese categories to skip.
resumeFrom?: number;
// --diocese <catId>: import only this WordPress category id.
diocese?: number;
// --job-id <uuid>: background job row to update with run status.
jobId?: string;
}
// ─── HTTP Helpers ────────────────────────────────────────────────────────────
// Number of fetchJson calls started so far; used to skip the pause before the first one.
let requestCount = 0;

/** Resolve (with no value) after roughly `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * GET a URL and return its parsed JSON body, or null on failure.
 *
 * Every call except the very first waits REQUEST_DELAY_MS beforehand. HTTP
 * 429/503 responses and thrown errors (network or JSON parsing) are retried
 * after RETRY_DELAY_MS, up to MAX_RETRIES attempts in total. Any other
 * non-OK status returns null immediately without a retry.
 */
async function fetchJson(url: string): Promise<any | null> {
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  let attempt = 0;
  while (attempt < MAX_RETRIES) {
    attempt += 1;
    const isLastAttempt = attempt >= MAX_RETRIES;
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': USER_AGENT },
      });

      const throttled = response.status === 429 || response.status === 503;
      if (throttled) {
        if (isLastAttempt) {
          return null;
        }
        console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s`);
        await delay(RETRY_DELAY_MS);
        continue;
      }

      if (!response.ok) {
        return null;
      }
      // Awaited inside the try so a malformed body is retried like a network error.
      return await response.json();
    } catch (error) {
      if (isLastAttempt) {
        console.error(` Fetch error: ${error instanceof Error ? error.message : error}`);
        return null;
      }
      await delay(RETRY_DELAY_MS);
    }
  }
  return null;
}
// ─── Parsing ─────────────────────────────────────────────────────────────────
/** Remove every `<...>` tag from the input and trim surrounding whitespace. */
function stripHtml(html: string): string {
  const tagPattern = /<[^>]+>/g;
  const withoutTags = html.replace(tagPattern, '');
  return withoutTags.trim();
}
/**
 * Turn one WordPress post into a ParsedChurch.
 *
 * Returns null when the post has no usable coordinates (no "maps?q=lat,lng"
 * embed, unparsable numbers, or the 0,0 point). Everything else is optional:
 * address from the first <strong>, contact details from the second table,
 * mass times from the first table.
 *
 * @param post - Raw post object from the WP REST API.
 * @param dioceseName - Diocese label to attach to the result.
 */
function parsePost(post: any, dioceseName: string | null): ParsedChurch | null {
  const html: string = post.content?.rendered || '';
  const wpId: number = post.id;
  const slug: string = post.slug;

  // Title format is "(City) Church Name"; keep only the part after the prefix.
  const rawTitle = stripHtml(post.title?.rendered || '');
  const prefixedTitle = /^\([^)]+\)\s*(.+)$/.exec(rawTitle);
  const name = prefixedTitle ? prefixedTitle[1] : rawTitle;

  // Coordinates come from the Google Maps embed URL.
  const coords = /maps\?q=([-\d.]+),([-\d.]+)/.exec(html);
  if (coords === null) return null;
  const latitude = parseFloat(coords[1]);
  const longitude = parseFloat(coords[2]);
  const atOrigin = latitude === 0 && longitude === 0;
  if (Number.isNaN(latitude) || Number.isNaN(longitude) || atOrigin) return null;

  // Address: first <strong> element, formatted "Street, ZIP City".
  let address: string | null = null;
  let zip: string | null = null;
  let city: string | null = null;
  const strongText = /<strong>([^<]+)<\/strong>/.exec(html);
  if (strongText) {
    const fullAddr = strongText[1].trim();
    const zipCity = /,\s*(\d{5})\s+(.+)$/.exec(fullAddr);
    if (zipCity) {
      zip = zipCity[1];
      city = zipCity[2];
      address = fullAddr.replace(/,\s*\d{5}\s+.+$/, '').trim();
    } else {
      address = fullAddr;
    }
  }

  // The post body holds the schedule table first and the info table second.
  const tables = html.match(/<table[^>]*>([\s\S]*?)<\/table>/g) ?? [];
  const [scheduleTable, infoTable] = tables;

  // Schedule table: <th> rows set the current day, <td> rows carry type + time.
  const schedules: ParsedSchedule[] = [];
  if (scheduleTable !== undefined) {
    const massKeywords = ['messe', 'messfeier', 'eucharistie'];
    const seenSlots = new Set<string>();
    let currentDay = -1;

    for (const row of scheduleTable.match(/<tr[^>]*>([\s\S]*?)<\/tr>/g) ?? []) {
      // A non-empty <th><em>…</em> naming a known weekday switches the day.
      const header = /<th[^>]*>[\s\S]*?<em>([^<]*)<\/em>/.exec(row);
      if (header && header[1].trim()) {
        const mappedDay = GERMAN_DAYS[header[1].trim().toLowerCase()];
        if (mappedDay !== undefined) {
          currentDay = mappedDay;
        }
      }

      const cells = row.match(/<td[^>]*>[\s\S]*?<em>([^<]*)<\/em>[\s\S]*?<\/td>/g);
      if (!cells || cells.length < 2 || currentDay < 0) continue;

      const typeCell = /<em>([^<]*)<\/em>/.exec(cells[0]);
      const timeCell = /<em>([^<]*)<\/em>/.exec(cells[1]);
      if (!typeCell || !timeCell) continue;

      // Only mass-like services are kept.
      const serviceType = typeCell[1].trim().toLowerCase();
      const isMass =
        MASS_TYPES.has(serviceType) || massKeywords.some((word) => serviceType.includes(word));
      if (!isMass) continue;

      // "09.00 Uhr" or "18:30 Uhr" → "09:00" or "18:30"
      const withoutUnit = timeCell[1].trim().replace(/\s*Uhr\s*/i, '');
      const clock = withoutUnit.replace('.', ':').trim();
      const hourMinute = /^(\d{1,2}):(\d{2})$/.exec(clock);
      if (!hourMinute) continue;
      const time = `${hourMinute[1].padStart(2, '0')}:${hourMinute[2]}`;

      // Each (day, time) pair is emitted once.
      const slotKey = `${currentDay}:${time}`;
      if (seenSlots.has(slotKey)) continue;
      seenSlots.add(slotKey);
      schedules.push({ dayOfWeek: currentDay, time });
    }
  }

  // Info table: website link, e-mail and phone cells.
  let website: string | null = null;
  let email: string | null = null;
  let phone: string | null = null;
  if (infoTable !== undefined) {
    const websiteLink = /Website[\s\S]*?<a[^>]*href="([^"]+)"/.exec(infoTable);
    if (websiteLink) {
      website = websiteLink[1];
    }

    const emailCell = /E-Mail[\s\S]*?<td[^>]*>([\s\S]*?)<\/td>/.exec(infoTable);
    if (emailCell) {
      const emailText = stripHtml(emailCell[1]);
      if (emailText.includes('@')) email = emailText;
    }

    const phoneCell = /Telefon[\s\S]*?<td[^>]*>([\s\S]*?)<\/td>/.exec(infoTable);
    if (phoneCell) {
      const phoneText = stripHtml(phoneCell[1]);
      if (phoneText.length > 3) phone = phoneText;
    }
  }

  return {
    wpId,
    slug,
    name,
    latitude,
    longitude,
    address,
    zip,
    city,
    diocese: dioceseName,
    website,
    email,
    phone,
    schedules,
  };
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
* Load the churches already stored with country 'DE', restricted to the
* fields used for duplicate detection and for filling in missing contact
* data, so candidates can be matched in memory.
*/
async function loadExistingGermanChurches(): Promise<ExistingChurch[]> {
console.log('Loading existing German churches for deduplication...');
const churches = await prisma.church.findMany({
// Only German churches can match entries from this source
where: { country: 'DE' },
select: {
// Identity and position
id: true,
name: true,
latitude: true,
longitude: true,
// One external-id column per upstream data source
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
// Contact fields; processDiocese only fills these when they are empty
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing German churches`);
return churches;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Fetch the diocese categories that sit under the Catholic parent category.
 *
 * @returns The categories sorted by post count, largest first; an empty list
 *   when the request fails or the response is not a JSON array.
 */
async function fetchDioceseCategories(): Promise<DioceseCat[]> {
  console.log('Fetching Catholic diocese categories...');
  const data: unknown = await fetchJson(
    `${API_BASE}/categories?per_page=100&parent=${CATHOLIC_PARENT_CATEGORY}`
  );
  // fetchJson yields null on failure; also reject any non-array payload (such
  // as an error object) so the .map below cannot throw.
  if (!Array.isArray(data)) {
    console.error('Failed to fetch categories');
    return [];
  }
  const cats: DioceseCat[] = data.map((c: any) => ({
    id: c.id, name: c.name, count: c.count,
  }));
  const total = cats.reduce((sum, cat) => sum + cat.count, 0);
  console.log(`Found ${cats.length} diocese categories with ${total} total posts\n`);
  // `cats` is a fresh local array, so sorting it in place is safe.
  return cats.sort((a, b) => b.count - a.count);
}
/**
* Import all posts of one diocese category.
*
* Pages through the posts endpoint, parses each post, looks for a matching
* existing church, and then either links/updates the match or creates a new
* church together with its mass schedules. In dry-run mode only the counters
* in `stats` are updated.
*
* @param cat - Diocese category; `count` determines how many pages are requested.
* @param existingChurches - In-memory dedup list; newly created churches are appended.
* @param dryRun - When true, no database writes are made.
* @param stats - Running totals, mutated in place.
*/
async function processDiocese(
cat: DioceseCat,
existingChurches: ExistingChurch[],
dryRun: boolean,
stats: ImportStats,
): Promise<void> {
const totalPages = Math.ceil(cat.count / POSTS_PER_PAGE);
for (let page = 1; page <= totalPages; page++) {
const url = `${API_BASE}/posts?categories=${cat.id}&per_page=${POSTS_PER_PAGE}&page=${page}`;
const posts = await fetchJson(url);
// A failed request, a non-array body or an empty page ends pagination for this diocese
if (!posts || !Array.isArray(posts) || posts.length === 0) break;
stats.postsFound += posts.length;
for (const post of posts) {
const church = parsePost(post, cat.name);
if (!church) {
stats.churchesSkipped++;
continue;
}
stats.churchesParsed++;
// The WordPress post id, as a string, is this source's external id
const gdzId = String(church.wpId);
const candidate = {
name: church.name,
lat: church.latitude,
lng: church.longitude,
gottesdienstzeitenId: gdzId,
};
const duplicate = findDuplicateChurch(candidate, existingChurches);
// Dry run: count what would happen and move on without touching the DB
if (dryRun) {
if (duplicate) {
stats.churchesMatched++;
} else {
stats.churchesCreated++;
}
stats.schedulesCreated += church.schedules.length;
continue;
}
if (duplicate) {
stats.churchesMatched++;
// Always link the external id; fill contact fields only where the existing record has none
const updateData: Record<string, unknown> = { gottesdienstzeitenId: gdzId };
if (!duplicate.address && church.address) updateData.address = church.address;
if (!duplicate.website && church.website) {
updateData.website = church.website;
updateData.hasWebsite = true;
}
if (!duplicate.phone && church.phone) updateData.phone = church.phone;
try {
await prisma.church.update({
where: { id: duplicate.id },
data: updateData,
});
} catch (error) {
// A unique-constraint violation is counted as a skip; any other error propagates
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
continue;
}
throw error;
}
if (church.schedules.length > 0) {
try {
// Replace the schedules atomically: delete, insert, stamp lastScrapedAt.
// Note: deleteMany removes every schedule row of the matched church.
await prisma.$transaction(async (tx) => {
await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
await tx.massSchedule.createMany({
data: church.schedules.map((s) => ({
churchId: duplicate.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
language: 'German',
})),
});
await tx.church.update({
where: { id: duplicate.id },
data: { lastScrapedAt: new Date() },
});
});
stats.schedulesCreated += church.schedules.length;
} catch (error) {
stats.errors++;
console.error(` Error saving schedules for ${church.slug}: ${error instanceof Error ? error.message : error}`);
}
}
} else {
try {
const newChurch = await prisma.church.create({
data: {
name: church.name,
latitude: church.latitude,
longitude: church.longitude,
address: church.address,
zip: church.zip,
city: church.city,
country: 'DE',
diocese: church.diocese || undefined,
website: church.website,
hasWebsite: !!church.website,
email: church.email,
phone: church.phone,
gottesdienstzeitenId: gdzId,
source: 'gottesdienstzeiten',
websiteLanguage: 'de',
},
});
stats.churchesCreated++;
// Make the new church visible to dedup for the remainder of this run
existingChurches.push({
id: newChurch.id,
name: church.name,
latitude: church.latitude,
longitude: church.longitude,
osmId: null,
baiduId: null,
masstimesId: null,
orarimesseId: null,
massSchedulesPhId: null,
philmassId: null,
horariosMisasId: null,
mszeInfoId: null,
weekdayMassesId: null,
messesInfoId: null,
bohosluzbyId: null,
miserendId: null,
kerknetId: null,
gottesdienstzeitenId: gdzId,
discovermassId: null,
source: 'gottesdienstzeiten',
website: church.website,
phone: church.phone,
address: church.address,
});
// NOTE(review): these writes are not wrapped in a transaction; if they fail,
// the church row already exists and the catch below reports it as a creation error.
if (church.schedules.length > 0) {
await prisma.massSchedule.createMany({
data: church.schedules.map((s) => ({
churchId: newChurch.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
language: 'German',
})),
});
await prisma.church.update({
where: { id: newChurch.id },
data: { lastScrapedAt: new Date() },
});
stats.schedulesCreated += church.schedules.length;
}
} catch (error) {
// A unique-constraint violation is counted as a skip; other errors are logged and the loop continues
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
continue;
}
stats.errors++;
console.error(` Error creating ${church.slug}: ${error instanceof Error ? error.message : error}`);
}
}
}
}
stats.diocesesProcessed++;
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse the importer's command-line flags from process.argv.
 *
 * --help / -h prints usage and exits with code 0. Invalid input exits with
 * code 1: a --resume-from value that is not a non-negative integer, a
 * --diocese value that is not a positive integer, or neither --all nor
 * --diocese being given. Numeric values are parsed in base 10 and validated
 * here so that a typo cannot silently turn into NaN and be ignored.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Reads an integer flag value >= min, terminating the process on bad input.
  const readInt = (flag: string, raw: string | undefined, min: number): number => {
    const value = Number.parseInt(raw ?? '', 10);
    if (!Number.isInteger(value) || value < min) {
      const shown = raw === undefined ? 'nothing' : `"${raw}"`;
      console.error(`Error: ${flag} expects an integer >= ${min}, got ${shown}`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = readInt('--resume-from', args[++i], 0);
        break;
      case '--diocese':
        result.diocese = readInt('--diocese', args[++i], 1);
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-gottesdienstzeiten.ts [options]
Options:
--all Import all Catholic diocese categories
--diocese <catId> Import a single diocese category (e.g., 129 for Köln)
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N diocese categories
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-gottesdienstzeiten.ts --diocese 129 --dry-run
npx tsx scripts/import-gottesdienstzeiten.ts --all --dry-run
npx tsx scripts/import-gottesdienstzeiten.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.diocese) {
    console.error('Error: specify --all or --diocese <categoryId>');
    process.exit(1);
  }
  return result;
}
/**
 * Render a millisecond duration as `Xh Ym Zs`, `Ym Zs` or `Zs`,
 * omitting leading units that are zero.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const hours = Math.floor(totalMinutes / 60);

  // Under a minute: seconds only.
  if (!(hours > 0) && !(totalMinutes > 0)) {
    return `${totalSeconds}s`;
  }

  const minutesAndSeconds = `${totalMinutes % 60}m ${totalSeconds % 60}s`;
  return hours > 0 ? `${hours}h ${minutesAndSeconds}` : minutesAndSeconds;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Entry point of the gottesdienstzeiten.de importer.
 *
 * Prints a run banner, marks the optional background job as running, loads the
 * existing German churches, resolves the diocese categories to process (a single
 * one or the fetched list, optionally sliced by --resume-from), processes each
 * category, prints a summary and records the outcome on the background job.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const rule = '='.repeat(70);

  console.log('\n' + rule);
  console.log('GOTTESDIENSTZEITEN.DE (GERMANY) IMPORTER');
  console.log(rule);
  console.log(`Mode: ${args.diocese ? `Diocese category ${args.diocese}` : 'All dioceses'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) {
    console.log(`Resume from: diocese index ${args.resumeFrom}`);
  }
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(rule + '\n');

  // Best effort: the tracking row is optional, so a failed update is ignored.
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch { /* Job might not exist */ }
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    postsFound: 0,
    churchesParsed: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  const existingChurches = await loadExistingGermanChurches();

  // A single --diocese run uses a synthetic category entry; otherwise fetch the list.
  let categories: DioceseCat[] = args.diocese
    ? [{ id: args.diocese, name: `Category ${args.diocese}`, count: 1000 }]
    : await fetchDioceseCategories();

  if (args.resumeFrom && !args.diocese) {
    categories = categories.slice(args.resumeFrom);
    console.log(`Resuming from diocese index ${args.resumeFrom} (${categories[0]?.name})\n`);
  }

  console.log(`Processing ${categories.length} diocese categories\n`);
  for (const [index, cat] of categories.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${index + 1}/${categories.length}] ${cat.name} (${cat.count} posts) [${elapsed} elapsed]`);
    try {
      await processDiocese(cat, existingChurches, args.dryRun, stats);
    } catch (error) {
      // One failing diocese must not abort the whole run.
      stats.errors++;
      console.error(` ERROR processing ${cat.name}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + rule);
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`WP posts found: ${stats.postsFound}`);
  console.log(`Churches parsed: ${stats.churchesParsed}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped (no coords): ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log(rule + '\n');

  // Record the final outcome on the tracking job (again best effort).
  if (args.jobId) {
    const finalStatus = stats.errors > 0 ? 'completed_with_errors' : 'completed';
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: finalStatus,
          completedAt: new Date(),
          processed: stats.churchesParsed,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch { /* Ignore */ }
  }
}
// Run the importer. A fatal error sets process.exitCode instead of calling
// process.exit(): exit() would terminate immediately, so the .finally() cleanup
// below (Prisma disconnect, pool shutdown) would never run on the failure path.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });

File diff suppressed because it is too large Load Diff

697
scripts/import-kerknet.ts Normal file
View File

@@ -0,0 +1,697 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from kerknet.be (Flanders, Belgium)
*
* Kerknet is the portal of the Catholic Church in Flanders (Dutch-speaking Belgium).
* It has ~1,200 churches with structured data: name, address, coordinates (GeoJSON),
* and date-specific celebration entries.
*
* Import strategy:
* 1. Enumerate unique church slugs by paginating the celebration listing
* 2. Scrape each /kerk/{slug} page for structured data (name, address, coords, nodeId)
* 3. Fetch celebrations via AJAX endpoint per church
* 4. Deduce recurring weekly schedules from date-specific celebrations
* 5. Match against existing Belgian churches via church-matcher
* 6. Upsert churches and mass schedules
*
* Usage:
* npx tsx scripts/import-kerknet.ts --all --dry-run
* npx tsx scripts/import-kerknet.ts --all
* npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run
* npx tsx scripts/import-kerknet.ts --all --resume-from 100
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string comes from DATABASE_URL; falls back to a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password portion of the URL masked.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// TLS (without certificate verification) is enabled only when the URL contains "neon".
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma talks to the database through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Origin used to build every listing, detail and AJAX URL.
const BASE_URL = 'https://www.kerknet.be';
// Sent as the User-Agent header on every request.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
const ENUM_DELAY_MS = 2000; // Delay between listing pages (respecting crawl-delay spirit)
const DETAIL_DELAY_MS = 3000; // Delay between church detail page fetches
const CELEBRATION_DELAY_MS = 2000; // Delay between celebration AJAX calls
// Maximum number of attempts fetchPage makes for one URL.
const MAX_RETRIES = 3;
// Wait between attempts, in milliseconds.
const RETRY_DELAY_MS = 10000;
// NOTE(review): hard-coded snapshot of the site's page count — confirm it is still current.
const MAX_ENUM_PAGES = 2804; // Total celebration listing pages
const ENUM_SAMPLE_INTERVAL = 5; // Check every Nth page (5 → ~560 pages to check)
const STALE_THRESHOLD = 10; // Stop if N consecutive sampled pages yield no new slugs
// Dutch day abbreviations → dayOfWeek (0=Sun, 1=Mon, ..., 6=Sat)
const DUTCH_DAYS: Record<string, number> = {
'zo': 0, 'ma': 1, 'di': 2, 'wo': 3, 'do': 4, 'vr': 5, 'za': 6,
};
// ─── Types ───────────────────────────────────────────────────────────────────
/** Data extracted from one /kerk/{slug} detail page by parseChurchPage. */
interface ChurchData {
slug: string;
// Value of "currentNid" on the page, or the slug when that is not found.
nodeId: string;
// GeoJSON description, else the page <title> up to the first "|", else the slug.
name: string;
address: string | null;
zip: string | null;
city: string | null;
latitude: number;
longitude: number;
website: string | null;
}
/** One celebration block as scraped from the celebrations AJAX response. */
interface CelebrationEntry {
// Lower-cased day abbreviation as shown on the page (looked up in DUTCH_DAYS).
dayAbbrev: string;
date: string; // DD/MM
time: string; // HH.MM or HH:MM
type: string; // Eucharistie, Gebedsdienst, etc.
}
/** A weekly mass slot derived from the scraped celebrations. */
interface ParsedSchedule {
// 0=Sun, 1=Mon, ..., 6=Sat
dayOfWeek: number;
// "HH:MM" after normalization in deduceSchedules.
time: string;
}
/** Counters accumulated over one importer run and printed in the summary. */
interface ImportStats {
slugsEnumerated: number;
churchesFetched: number;
churchesMatched: number;
churchesCreated: number;
churchesSkipped: number;
schedulesCreated: number;
errors: number;
}
/** Options produced by parseArgs. */
interface CLIArgs {
all: boolean;
dryRun: boolean;
// Number of leading slugs to skip after enumeration.
resumeFrom?: number;
// Import only this church slug.
slug?: string;
// Background job row to update with progress, if any.
jobId?: string;
}
// ─── HTTP Helpers ────────────────────────────────────────────────────────────
// Number of fetchPage calls made so far; also reported in the run summary.
let requestCount = 0;

/** Resolve after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Fetch a URL as text with politeness delay and bounded retries.
 *
 * Every call after the first waits `delayMs` before issuing the request.
 * Throttling responses (429/503), other 5xx responses and network errors are
 * retried up to MAX_RETRIES attempts with RETRY_DELAY_MS between attempts.
 * Other non-OK responses (e.g. 404) are not retried: repeating the request
 * cannot change the outcome and previously cost two extra RETRY_DELAY_MS waits
 * per missing page.
 *
 * @param url      Absolute URL to fetch.
 * @param delayMs  Pause before the request (skipped for the very first request).
 * @returns The response body, or null when the page could not be fetched.
 */
async function fetchPage(url: string, delayMs: number): Promise<string | null> {
  if (requestCount > 0) {
    await delay(delayMs);
  }
  requestCount++;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    const canRetry = attempt < MAX_RETRIES;
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': USER_AGENT },
      });
      if (response.ok) {
        return await response.text();
      }

      const throttled = response.status === 429 || response.status === 503;
      const transient = throttled || response.status >= 500;
      if (!transient || !canRetry) {
        // Permanent failure or retries exhausted: say why the caller gets null.
        console.error(` HTTP ${response.status} for ${url} — giving up`);
        return null;
      }
      if (throttled) {
        console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
      }
      await delay(RETRY_DELAY_MS);
    } catch (error) {
      if (canRetry) {
        console.log(` Network error — retrying (attempt ${attempt}/${MAX_RETRIES})`);
        await delay(RETRY_DELAY_MS);
        continue;
      }
      console.error(` Fetch failed after ${MAX_RETRIES} attempts: ${error instanceof Error ? error.message : error}`);
      return null;
    }
  }
  return null;
}
// ─── Phase 1: Enumerate Church Slugs ─────────────────────────────────────────
/**
 * Phase 1: collect the unique church slugs linked from the celebration listing.
 *
 * Samples every ENUM_SAMPLE_INTERVAL-th listing page up to MAX_ENUM_PAGES and
 * stops early once STALE_THRESHOLD consecutive sampled pages either failed to
 * load or contributed no new slug.
 *
 * @returns The slugs in sorted order.
 */
async function enumerateChurchSlugs(): Promise<string[]> {
  console.log('\nPhase 1: Enumerating church slugs from celebration listings...');
  const found = new Set<string>();
  let staleStreak = 0;

  for (let page = 0; page < MAX_ENUM_PAGES; page += ENUM_SAMPLE_INTERVAL) {
    const listingHtml = await fetchPage(`${BASE_URL}/zoeken/vieringen/lijst?page=${page}`, ENUM_DELAY_MS);

    // A page that could not be fetched counts towards the stale streak.
    if (!listingHtml) {
      staleStreak += 1;
      if (staleStreak >= STALE_THRESHOLD) {
        console.log(` Stopping enumeration: ${STALE_THRESHOLD} consecutive empty pages`);
        break;
      }
      continue;
    }

    // Extract /kerk/{slug} links (church building pages, NOT org pages like /kerk-jette/artikel/)
    const sizeBefore = found.size;
    for (const [, slug] of listingHtml.matchAll(/href="\/kerk\/([^"/]+)"/g)) {
      found.add(slug);
    }
    const added = found.size - sizeBefore;
    staleStreak = added === 0 ? staleStreak + 1 : 0;

    if (page % 50 === 0 || added > 0) {
      console.log(` Page ${page}: ${found.size} unique churches found (+${added})`);
    }
    if (staleStreak >= STALE_THRESHOLD) {
      console.log(` Stopping enumeration: ${STALE_THRESHOLD} consecutive sampled pages with no new churches`);
      break;
    }
  }

  console.log(` Enumeration complete: ${found.size} unique church slugs found\n`);
  return [...found].sort();
}
// ─── Phase 2: Scrape Church Detail Page ──────────────────────────────────────
/**
 * Phase 2: extract structured church data from a /kerk/{slug} detail page.
 *
 * Coordinates, node id and name are read from JSON embedded in the page; the
 * address parts and website are read from the HTML markup.
 *
 * Changes from the previous version:
 *  - the name is taken from a JSON string literal, so JSON escape sequences
 *    (`\u00e9`, `\u0027`, `\/`, `\"` …) are now decoded instead of being stored
 *    verbatim, and an escaped quote no longer truncates the name;
 *  - coordinates that do not parse to finite numbers are rejected.
 *
 * @param html  Full HTML of the detail page.
 * @param slug  Slug the page was fetched for; fallback for node id and name.
 * @returns The parsed data, or null when the page has no usable coordinates.
 */
function parseChurchPage(html: string, slug: string): ChurchData | null {
  // Decode the contents of a JSON string literal; fall back to the raw text
  // when it is not valid JSON (e.g. contains a raw control character).
  function decodeJsonString(raw: string): string {
    if (!raw.includes('\\')) return raw;
    try {
      const decoded: unknown = JSON.parse(`"${raw}"`);
      return typeof decoded === 'string' ? decoded : raw;
    } catch {
      return raw;
    }
  }

  // Extract coordinates from GeoJSON in Drupal settings (order is [lng, lat]).
  const coordMatch = html.match(/"coordinates":\[(-?[\d.]+),(-?[\d.]+)\]/);
  if (!coordMatch) return null; // No coordinates = unusable
  const longitude = parseFloat(coordMatch[1]);
  const latitude = parseFloat(coordMatch[2]);
  // "[\d.]+" also matches text such as "." that parseFloat turns into NaN.
  if (!Number.isFinite(latitude) || !Number.isFinite(longitude)) return null;
  if (latitude === 0 && longitude === 0) return null;

  // Extract node ID; the slug stands in when the page does not expose one.
  const nidMatch = html.match(/"currentNid":"(\d+)"/);
  const nodeId = nidMatch ? nidMatch[1] : slug;

  // Extract name from GeoJSON description or page title.
  let name = slug;
  const descMatch = html.match(/"description":"((?:[^"\\]|\\.)+)"/);
  if (descMatch) {
    name = decodeJsonString(descMatch[1]);
  } else {
    const titleMatch = html.match(/<title>([^|<]+)/);
    if (titleMatch) name = titleMatch[1].trim();
  }

  // Extract address fields.
  const streetMatch = html.match(/class="thoroughfare">([^<]+)</);
  const zipMatch = html.match(/class="postal-code">([^<]+)</);
  const cityMatch = html.match(/class="locality">([^<]+)</);
  const address = streetMatch ? streetMatch[1].trim() : null;
  const zip = zipMatch ? zipMatch[1].trim() : null;
  const city = cityMatch ? cityMatch[1].trim() : null;

  // Extract website: first link after the "website" class, else the
  // field-name-kn-website pattern.
  let website: string | null = null;
  const websiteMatch = html.match(/class="website"[^>]*>.*?href="([^"]+)"/s);
  if (websiteMatch) {
    website = websiteMatch[1];
  } else {
    const knWebsiteMatch = html.match(/field-name-kn-website.*?href="([^"]+)"/s);
    if (knWebsiteMatch) website = knWebsiteMatch[1];
  }

  return { slug, nodeId, name, address, zip, city, latitude, longitude, website };
}
// ─── Phase 3: Parse Celebrations ─────────────────────────────────────────────
/**
 * Phase 3: parse the celebration blocks of the AJAX response.
 *
 * Each `<div class="celebration">` block yields one entry. Blocks without a day
 * abbreviation or a time are skipped; a missing date becomes '' and a missing
 * type defaults to 'eucharistie'.
 *
 * The time pattern accepts both "HH.MM" and "HH:MM", as documented on
 * CelebrationEntry; previously colon-separated times failed to match and the
 * whole celebration was dropped.
 *
 * @param html  HTML fragment containing zero or more celebration blocks.
 * @returns Entries in document order, with day abbreviation and type lower-cased.
 */
function parseCelebrations(html: string): CelebrationEntry[] {
  const entries: CelebrationEntry[] = [];

  // Everything before the first block marker is page chrome, hence slice(1).
  const celebBlocks = html.split('<div class="celebration">').slice(1);
  for (const block of celebBlocks) {
    // Day abbreviation
    const dayMatch = block.match(/celebration__date__day">\s*(\w+)\s*</);
    if (!dayMatch) continue;

    // Date (DD/MM) — optional
    const dateMatch = block.match(/celebration__date__date">\s*([\d/]+)\s*</);

    // Time (HH.MM or HH:MM)
    const timeMatch = block.match(/celebration__time">\s*([\d.:]+)\s*</);
    if (!timeMatch) continue;

    // Celebration type — optional
    const typeMatch = block.match(/celebration__info__type">\s*([^<]+)\s*</);

    entries.push({
      dayAbbrev: dayMatch[1].toLowerCase().trim(),
      date: dateMatch ? dateMatch[1].trim() : '',
      time: timeMatch[1].trim(),
      type: typeMatch ? typeMatch[1].trim().toLowerCase() : 'eucharistie',
    });
  }
  return entries;
}
/**
 * Collapse scraped celebrations into distinct (dayOfWeek, time) mass slots.
 *
 * Only entries typed 'eucharistie' or 'eucharistieviering' are considered, and
 * entries whose day abbreviation is not in DUTCH_DAYS are ignored. Times are
 * normalized from "H.MM"/"HH.MM" to zero-padded "HH:MM". The first occurrence of
 * each slot determines its position in the result.
 */
function deduceSchedules(celebrations: CelebrationEntry[]): ParsedSchedule[] {
  const massTypes = ['eucharistie', 'eucharistieviering'];
  const slots = new Map<string, ParsedSchedule>();

  for (const { type, dayAbbrev, time: rawTime } of celebrations) {
    if (!massTypes.includes(type)) continue;

    const dayOfWeek = DUTCH_DAYS[dayAbbrev];
    if (dayOfWeek === undefined) continue;

    // Normalize time: "15.00" → "15:00", "9:30" → "09:30"
    const time = rawTime.replace('.', ':').replace(/^(\d):/, '0$1:');
    const slotKey = `${dayOfWeek}:${time}`;
    if (!slots.has(slotKey)) {
      slots.set(slotKey, { dayOfWeek, time });
    }
  }
  return [...slots.values()];
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Load every church with country 'BE', selecting the fields the duplicate
 * matcher works with (identity, position, per-source ids and contact details).
 */
async function loadExistingBelgianChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Belgian churches for deduplication...');

  const belgianChurches = await prisma.church.findMany({
    where: { country: 'BE' },
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // Per-source external ids
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${belgianChurches.length} existing Belgian churches`);
  return belgianChurches;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Import one kerknet church: fetch and parse its detail page, fetch its
 * celebrations, derive schedules, then either link it to a matching existing
 * church or create a new one.
 *
 * @param slug              Church slug under /kerk/.
 * @param existingChurches  In-memory list used for duplicate detection; newly
 *                          created churches are appended to it.
 * @param dryRun            When true, only counters are updated; nothing is written.
 * @param stats             Run counters, mutated in place.
 */
async function processChurch(
slug: string,
existingChurches: ExistingChurch[],
dryRun: boolean,
stats: ImportStats,
): Promise<void> {
// Fetch church detail page
const churchHtml = await fetchPage(`${BASE_URL}/kerk/${slug}`, DETAIL_DELAY_MS);
if (!churchHtml) {
stats.errors++;
return;
}
// parseChurchPage returns null when the page has no usable coordinates.
const church = parseChurchPage(churchHtml, slug);
if (!church) {
stats.churchesSkipped++;
return;
}
stats.churchesFetched++;
// Fetch celebrations via AJAX
// A failed celebration fetch is not an error: the church is imported without schedules.
let celebrations: CelebrationEntry[] = [];
const celebHtml = await fetchPage(
`${BASE_URL}/kerknet-celebration/churches/ajax/load-more/0/${church.nodeId}`,
CELEBRATION_DELAY_MS,
);
if (celebHtml) {
celebrations = parseCelebrations(celebHtml);
}
const schedules = deduceSchedules(celebrations);
// External id stored on the church row: "kerknet-" + node id.
const kerknetId = `kerknet-${church.nodeId}`;
const candidate = {
name: church.name,
lat: church.latitude,
lng: church.longitude,
kerknetId,
};
const duplicate = findDuplicateChurch(candidate, existingChurches);
// Dry run: count what would happen and stop before any database write.
if (dryRun) {
if (duplicate) {
stats.churchesMatched++;
} else {
stats.churchesCreated++;
}
stats.schedulesCreated += schedules.length;
return;
}
if (duplicate) {
stats.churchesMatched++;
// Always link the kerknet id; address and website are filled only when empty.
const updateData: Record<string, unknown> = { kerknetId };
if (!duplicate.address && church.address) updateData.address = church.address;
if (!duplicate.website && church.website) updateData.website = church.website;
try {
await prisma.church.update({
where: { id: duplicate.id },
data: updateData,
});
} catch (error) {
// A unique-constraint failure is counted as skipped (in addition to the
// churchesMatched increment above); any other error propagates to the caller.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
return;
}
throw error;
}
if (schedules.length > 0) {
try {
// Replace all of the church's schedules in one transaction: delete, insert,
// then stamp lastScrapedAt.
await prisma.$transaction(async (tx) => {
await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
await tx.massSchedule.createMany({
data: schedules.map((s) => ({
churchId: duplicate.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
language: 'Dutch',
})),
});
await tx.church.update({
where: { id: duplicate.id },
data: { lastScrapedAt: new Date() },
});
});
stats.schedulesCreated += schedules.length;
} catch (error) {
stats.errors++;
console.error(` Error saving schedules for ${slug}: ${error instanceof Error ? error.message : error}`);
}
}
} else {
try {
const newChurch = await prisma.church.create({
data: {
name: church.name,
latitude: church.latitude,
longitude: church.longitude,
address: church.address,
zip: church.zip,
city: church.city,
country: 'BE',
website: church.website,
hasWebsite: !!church.website,
kerknetId,
source: 'kerknet',
websiteLanguage: 'nl',
},
});
stats.churchesCreated++;
// Add the new church to the in-memory list so later candidates in this run
// are matched against it.
existingChurches.push({
id: newChurch.id,
name: church.name,
latitude: church.latitude,
longitude: church.longitude,
osmId: null,
baiduId: null,
masstimesId: null,
orarimesseId: null,
massSchedulesPhId: null,
philmassId: null,
horariosMisasId: null,
mszeInfoId: null,
weekdayMassesId: null,
messesInfoId: null,
bohosluzbyId: null,
miserendId: null,
kerknetId,
gottesdienstzeitenId: null,
discovermassId: null,
source: 'kerknet',
website: church.website,
phone: null,
address: church.address,
});
// Schedule insert and lastScrapedAt stamp run outside a transaction here.
if (schedules.length > 0) {
await prisma.massSchedule.createMany({
data: schedules.map((s) => ({
churchId: newChurch.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
language: 'Dutch',
})),
});
await prisma.church.update({
where: { id: newChurch.id },
data: { lastScrapedAt: new Date() },
});
stats.schedulesCreated += schedules.length;
}
} catch (error) {
// Unique-constraint failures count as skipped; everything else is logged as an error.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
return;
}
stats.errors++;
console.error(` Error creating ${slug}: ${error instanceof Error ? error.message : error}`);
}
}
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse the importer's command-line flags from `process.argv`.
 *
 * Flags that take a value (`--resume-from`, `--slug`, `--job-id`) are validated:
 * a missing value, a value that is itself another flag, or a non-numeric
 * `--resume-from` aborts with exit code 1. Previously such input produced NaN or
 * swallowed the following flag (e.g. `--slug --dry-run` silently dropped
 * `--dry-run` and the run wrote to the database).
 *
 * Exits with code 0 after printing usage for `--help`/`-h`, and with code 1 when
 * neither `--all` nor `--slug` was given.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Report a usage error and terminate.
  function fail(message: string): never {
    console.error(`Error: ${message}`);
    process.exit(1);
  }

  // Return the value following a flag, refusing absent values and other flags.
  function requireValue(flag: string, index: number): string {
    const value = args[index];
    if (value === undefined || value.startsWith('--')) {
      fail(`${flag} requires a value`);
    }
    return value;
  }

  // Return the value following a flag as a non-negative base-10 integer.
  function requireInt(flag: string, index: number): number {
    const raw = requireValue(flag, index);
    if (!/^\d+$/.test(raw)) {
      fail(`${flag} expects a non-negative integer, got "${raw}"`);
    }
    return Number.parseInt(raw, 10);
  }

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = requireInt('--resume-from', ++i);
        break;
      case '--slug':
        result.slug = requireValue('--slug', ++i);
        break;
      case '--job-id':
        result.jobId = requireValue('--job-id', ++i);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-kerknet.ts [options]
Options:
--all Import all churches from kerknet.be
--slug <slug> Import a single church (e.g., o-l-vrouw-kerk-scherpenheuvel)
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N churches (after enumeration)
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run
npx tsx scripts/import-kerknet.ts --all --dry-run
npx tsx scripts/import-kerknet.ts --all
`);
        process.exit(0);
    }
  }

  // A run needs a scope: either every church or one slug.
  if (!result.all && !result.slug) {
    console.error('Error: specify --all or --slug <slug>');
    process.exit(1);
  }
  return result;
}
/**
 * Render a millisecond duration as `Xh Ym Zs`, `Ym Zs` or `Zs`,
 * omitting leading units that are zero.
 */
function formatDuration(ms: number): string {
  const wholeSeconds = Math.floor(ms / 1000);
  const wholeMinutes = Math.floor(wholeSeconds / 60);
  const hours = Math.floor(wholeMinutes / 60);

  // Under a minute: seconds only.
  if (!(hours > 0) && !(wholeMinutes > 0)) {
    return `${wholeSeconds}s`;
  }

  const minutesAndSeconds = `${wholeMinutes % 60}m ${wholeSeconds % 60}s`;
  return hours > 0 ? `${hours}h ${minutesAndSeconds}` : minutesAndSeconds;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Entry point of the kerknet.be importer.
 *
 * Prints a run banner, marks the optional background job as running, loads the
 * existing Belgian churches, resolves the slugs to process (a single slug or the
 * enumerated list, optionally sliced by --resume-from), processes each church,
 * prints a summary and records the outcome on the background job.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const rule = '='.repeat(70);

  console.log('\n' + rule);
  console.log('KERKNET.BE (BELGIUM/FLANDERS) IMPORTER');
  console.log(rule);
  console.log(`Mode: ${args.slug ? `Single: ${args.slug}` : 'All churches'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) {
    console.log(`Resume from: church index ${args.resumeFrom}`);
  }
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(rule + '\n');

  // Best effort: the tracking row is optional, so a failed update is ignored.
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch { /* Job might not exist */ }
  }

  const stats: ImportStats = {
    slugsEnumerated: 0,
    churchesFetched: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  const existingChurches = await loadExistingBelgianChurches();

  // Get list of church slugs: the one requested, or everything enumerable.
  let slugs: string[];
  if (args.slug) {
    slugs = [args.slug];
  } else {
    slugs = await enumerateChurchSlugs();
    stats.slugsEnumerated = slugs.length;
  }

  if (args.resumeFrom && !args.slug) {
    slugs = slugs.slice(args.resumeFrom);
    console.log(`Resuming from church index ${args.resumeFrom} (${slugs[0]})\n`);
  }

  console.log(`Processing ${slugs.length} churches\n`);
  for (const [index, slug] of slugs.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    // Progress line every 50 churches, or for every church on small runs.
    const shouldReport = index % 50 === 0 || slugs.length <= 10;
    if (shouldReport) {
      console.log(`[${index + 1}/${slugs.length}] ${slug} [${elapsed} elapsed, ${stats.churchesCreated} new, ${stats.churchesMatched} matched]`);
    }
    try {
      await processChurch(slug, existingChurches, args.dryRun, stats);
    } catch (error) {
      // One failing church must not abort the whole run.
      stats.errors++;
      console.error(` ERROR processing ${slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + rule);
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`Slugs enumerated: ${stats.slugsEnumerated}`);
  console.log(`Churches fetched: ${stats.churchesFetched}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log(rule + '\n');

  // Record the final outcome on the tracking job (again best effort).
  if (args.jobId) {
    const finalStatus = stats.errors > 0 ? 'completed_with_errors' : 'completed';
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: finalStatus,
          completedAt: new Date(),
          processed: stats.churchesFetched,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch { /* Ignore */ }
  }
}
// Run the importer. A fatal error sets process.exitCode instead of calling
// process.exit(): exit() would terminate immediately, so the .finally() cleanup
// below (Prisma disconnect, pool shutdown) would never run on the failure path.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });

View File

@@ -0,0 +1,695 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from mass-schedules.com (Philippines)
*
* mass-schedules.com has been operating since 2008 and covers ~1,500 Philippine
* churches with weekly mass schedule tables and coordinates on separate map pages.
*
* Import strategy:
* 1. Fetch sitemap XML → extract all /catholic-church/{id}/ URLs
* 2. For each church: fetch page HTML, parse name/address/schedule, fetch map
* page for coordinates, match against existing PH churches, upsert
*
* Usage:
* npx tsx scripts/import-mass-schedules-ph.ts --all
* npx tsx scripts/import-mass-schedules-ph.ts --all --dry-run
* npx tsx scripts/import-mass-schedules-ph.ts --church-id 34
* npx tsx scripts/import-mass-schedules-ph.ts --all --resume-from 500
* npx tsx scripts/import-mass-schedules-ph.ts --all --skip-schedules
* npx tsx scripts/import-mass-schedules-ph.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string comes from DATABASE_URL; falls back to a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password portion of the URL masked.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// TLS (without certificate verification) is enabled only when the URL contains "neon".
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma talks to the database through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Origin used to build absolute church-page and map-page URLs.
const SITE_BASE = 'https://www.mass-schedules.com';
// Sitemap listing the church pages.
// NOTE(review): the file name embeds a date (02272021) — confirm it is still the current sitemap.
const SITEMAP_URL = `${SITE_BASE}/sitemaps/sitemap02272021.xml`;
// Sent as the User-Agent header on every request.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Pause before every request after the first, in milliseconds.
const REQUEST_DELAY_MS = 1500;
// ─── Types ───────────────────────────────────────────────────────────────────
/** One church page discovered in the sitemap. */
interface SitemapChurch {
// Numeric id segment of the /catholic-church/{id}/{slug}.html URL.
id: string;
slug: string;
// Absolute URL of the church page.
url: string;
}
/** Fields scraped from a church page by parseChurchPage. */
interface ParsedChurch {
// Page heading with the trailing "Mass Schedule" removed; '' when no heading was found.
name: string;
address: string | null;
// First /locations/ breadcrumb link text.
region: string | null;
// First /catholic-churches/ breadcrumb link text.
city: string | null;
phone: string | null;
// Absolute URL of the location-map page, when the address links to one.
mapUrl: string | null;
}
/** A weekly mass slot parsed from the schedule table. */
interface ParsedSchedule {
dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
time: string; // "05:00", "18:30"
}
/** Counters accumulated over one importer run. */
interface ImportStats {
churchesFound: number;
churchesMatched: number;
churchesCreated: number;
churchesSkipped: number;
schedulesProcessed: number;
massSchedulesCreated: number;
errors: number;
}
/** Command-line options of the importer. */
interface CLIArgs {
all: boolean;
churchId?: string;
dryRun: boolean;
skipSchedules: boolean;
resumeFrom?: number;
jobId?: string;
}
// ─── HTTP Client ─────────────────────────────────────────────────────────────
// Number of fetchPage calls made so far; the first call skips the politeness delay.
let requestCount = 0;

/** Resolve after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Fetch a URL as text, waiting REQUEST_DELAY_MS before every request except
 * the first. Non-OK responses and network failures are logged and yield null;
 * there is no retry.
 */
async function fetchPage(url: string): Promise<string | null> {
  const mustWait = requestCount > 0;
  if (mustWait) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount += 1;

  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const response = await fetch(url, { headers });
    if (response.ok) {
      return await response.text();
    }
    console.error(` HTTP ${response.status} for ${url}`);
    return null;
  } catch (error) {
    const reason = error instanceof Error ? error.message : error;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
// ─── Sitemap Parser ──────────────────────────────────────────────────────────
/**
 * Download the sitemap and list the church pages it references.
 *
 * Every /catholic-church/{id}/{slug}.html occurrence is collected once per id
 * (first occurrence wins) and the result is ordered by numeric id.
 *
 * @throws Error when the sitemap cannot be fetched.
 */
async function fetchChurchUrlsFromSitemap(): Promise<SitemapChurch[]> {
  console.log(`Fetching sitemap: ${SITEMAP_URL}`);
  const xml = await fetchPage(SITEMAP_URL);
  if (!xml) {
    throw new Error('Failed to fetch sitemap');
  }

  // Keyed by id because the sitemap lists some churches more than once.
  const byId = new Map<string, SitemapChurch>();
  for (const [, id, slug] of xml.matchAll(/\/catholic-church\/(\d+)\/([\w-]+)\.html/g)) {
    if (byId.has(id)) continue;
    byId.set(id, {
      id,
      slug,
      url: `${SITE_BASE}/catholic-church/${id}/${slug}.html`,
    });
  }

  // Sort by ID for predictable ordering
  const ordered = [...byId.values()];
  ordered.sort((left, right) => parseInt(left.id) - parseInt(right.id));
  return ordered;
}
// ─── HTML Parsers ────────────────────────────────────────────────────────────
/**
 * Extract name, address, map link, phone, region and city from a church page.
 *
 * - name: text of `<h1 class="page_title">` minus a trailing "Mass Schedule"
 * - address: the `<p class="data">` after the "address:" label, with tags and
 *   the "(show location map)" text removed and whitespace collapsed
 * - mapUrl: absolute form of the /location-map/ link inside the address block
 * - phone: content of the element with id="TELEPHONE"
 * - region / city: first /locations/ and /catholic-churches/ breadcrumb links
 */
function parseChurchPage(html: string): ParsedChurch {
  // First capture group of `pattern` in `text`, or null when there is no match.
  const firstGroup = (pattern: RegExp, text: string): string | null => {
    const hit = text.match(pattern);
    return hit ? hit[1] : null;
  };

  // Name from <h1 class="page_title">...</h1>, without the " Mass Schedule" suffix
  const heading = firstGroup(/<h1[^>]*class="page_title"[^>]*>([\s\S]*?)<\/h1>/i, html);
  const name = (heading === null ? '' : heading.trim())
    .replace(/\s*Mass\s*Schedule\s*$/i, '')
    .trim();

  // Address from <label>address:</label> ... <p class="data">...</p>
  const addressHtml = firstGroup(/<label>address:<\/label>\s*<p class="data">([\s\S]*?)<\/p>/i, html);
  let address: string | null = null;
  let mapUrl: string | null = null;
  if (addressHtml !== null) {
    // The map link lives inside the address markup, so read it before stripping tags.
    const mapPath = firstGroup(/href="(\/location-map\/[^"]+)"/, addressHtml);
    if (mapPath !== null) {
      mapUrl = `${SITE_BASE}${mapPath}`;
    }
    const cleaned = addressHtml
      .replace(/<[^>]+>/g, '')
      .replace(/\(show location map\)/i, '')
      .replace(/\s+/g, ' ')
      .trim();
    address = cleaned || null;
  }

  // Phone from <p class="data_inline" id="TELEPHONE">...</p>
  const phoneRaw = firstGroup(/id="TELEPHONE"[^>]*>([\s\S]*?)<\/p>/i, html);
  const phone = phoneRaw === null ? null : phoneRaw.trim() || null;

  // Region and city from breadcrumbs: > {Region} > {City}
  const regionRaw = firstGroup(/class="normal"\s+href="[^"]*\/locations\/\d+\/[^"]*"[^>]*>([^<]+)<\/a>/i, html);
  const cityRaw = firstGroup(/class="normal"\s+href="[^"]*\/catholic-churches\/\d+\/[^"]*"[^>]*>([^<]+)<\/a>/i, html);
  const region = regionRaw === null ? null : regionRaw.trim();
  const city = cityRaw === null ? null : cityRaw.trim();

  return { name, address, region, city, phone, mapUrl };
}
/**
 * Parse the weekly schedule table of a church page.
 *
 * The table has 7 columns: Sun(0), Mon(1), Tue(2), Wed(3), Thu(4), Fri(5), Sat(6).
 * Cells contain `<p class="schedule">5:00 AM - 6:00 AM</p>` entries; the start
 * time of each entry becomes one slot for that column's day.
 *
 * Changes from the previous version:
 *  - `<tbody>`, `<tr>`, `<td>` and the schedule `<p>` are matched with or without
 *    attributes (a `<td class="…">` used to be skipped, which also shifted the
 *    day index of the following cells);
 *  - every schedule entry in a cell is read, not only the first.
 *
 * @returns Distinct (dayOfWeek, "HH:MM") slots in document order.
 */
function parseScheduleTable(html: string): ParsedSchedule[] {
  const schedules: ParsedSchedule[] = [];
  const seen = new Set<string>();

  const tbodyMatch = html.match(/<tbody\b[^>]*>([\s\S]*?)<\/tbody>/i);
  if (!tbodyMatch) return schedules;

  for (const [, rowHtml] of tbodyMatch[1].matchAll(/<tr\b[^>]*>([\s\S]*?)<\/tr>/gi)) {
    // Only the first seven cells map to weekdays.
    const cells = [...rowHtml.matchAll(/<td\b[^>]*>([\s\S]*?)<\/td>/gi)].slice(0, 7);

    cells.forEach(([, cellHtml], dayOfWeek) => {
      const entries = cellHtml.matchAll(/<p\b[^>]*\bclass="schedule"[^>]*>\s*(\d{1,2}:\d{2}\s*[AP]M)/gi);
      for (const [, startTime] of entries) {
        const time = convertTo24Hour(startTime.trim());
        if (!time) continue;
        const key = `${dayOfWeek}:${time}`;
        if (seen.has(key)) continue;
        seen.add(key);
        schedules.push({ dayOfWeek, time });
      }
    });
  }
  return schedules;
}
/**
 * Convert a 12-hour clock time to zero-padded 24-hour "HH:MM".
 *
 * "5:00 AM" → "05:00", "6:30 PM" → "18:30", "12:00 AM" → "00:00".
 *
 * Returns null when the text is not `H:MM AM|PM` or when the fields are out of
 * range (hour above 12, minute above 59). Previously such input produced
 * impossible times such as "25:00" or "09:75".
 */
function convertTo24Hour(timeStr: string): string | null {
  const match = timeStr.match(/^(\d{1,2}):(\d{2})\s*(AM|PM)$/i);
  if (!match) return null;

  let hours = Number.parseInt(match[1], 10);
  const minutes = match[2];
  const period = match[3].toUpperCase();

  // Hour 0 stays accepted (as before); anything above 12 cannot carry AM/PM.
  if (hours > 12 || Number.parseInt(minutes, 10) > 59) return null;

  if (period === 'AM' && hours === 12) hours = 0; // 12 AM is midnight
  if (period === 'PM' && hours !== 12) hours += 12; // 12 PM stays noon
  return `${String(hours).padStart(2, '0')}:${minutes}`;
}
/**
 * Read the church position from the map page's inline script, where it appears
 * as `ms.ui.church.params.lat = '14.598815'` (and likewise for `lng`).
 *
 * @returns The coordinates, or null when either value is missing, not a number, or zero.
 */
function parseCoordinates(html: string): { lat: number; lng: number } | null {
  // Parse the quoted value captured by `pattern`; null when the assignment is absent.
  const readNumber = (pattern: RegExp): number | null => {
    const hit = html.match(pattern);
    return hit ? parseFloat(hit[1]) : null;
  };

  const lat = readNumber(/ms\.ui\.church\.params\.lat\s*=\s*'([^']+)'/);
  const lng = readNumber(/ms\.ui\.church\.params\.lng\s*=\s*'([^']+)'/);
  if (lat === null || lng === null) return null;

  const usable = !isNaN(lat) && !isNaN(lng) && lat !== 0 && lng !== 0;
  return usable ? { lat, lng } : null;
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Load every church with country 'PH', selecting the fields the duplicate
 * matcher works with (identity, position, per-source ids and contact details).
 */
async function loadExistingPhilippineChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Philippine churches for deduplication...');

  const philippineChurches = await prisma.church.findMany({
    where: { country: 'PH' },
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // Per-source external ids
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${philippineChurches.length} existing Philippine churches`);
  return philippineChurches;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Import one church identified by a sitemap entry.
 *
 * Steps: fetch and parse the church page, fetch the linked map page for
 * coordinates, optionally parse the schedule table, look for an existing
 * record with findDuplicateChurch, then either update the matched record or
 * create a new one. All outcomes are recorded on `stats`.
 *
 * @param sitemapEntry Source ID and URL of the church page.
 * @param existingChurches In-memory dedup list; newly created churches are
 *   appended to it so later entries in the same run can match them.
 * @param dryRun When true, only counters and log lines are produced; no
 *   database write is issued.
 * @param skipSchedules When true, the schedule table is not parsed and no
 *   mass schedules are written.
 * @param stats Counters mutated in place.
 * @throws Rethrows database errors other than those whose message contains
 *   'Unique constraint' (schedule-replacement errors on a matched church are
 *   caught and counted instead).
 */
async function processChurch(
sitemapEntry: SitemapChurch,
existingChurches: ExistingChurch[],
dryRun: boolean,
skipSchedules: boolean,
stats: ImportStats,
): Promise<void> {
stats.churchesFound++;
// Fetch church page
const churchHtml = await fetchPage(sitemapEntry.url);
if (!churchHtml) {
// A falsy page is counted as an error, not as a skip.
stats.errors++;
return;
}
const parsed = parseChurchPage(churchHtml);
if (!parsed.name) {
console.log(` Skipping ${sitemapEntry.id}: no name found`);
stats.churchesSkipped++;
return;
}
// Fetch coordinates from map page
let coords: { lat: number; lng: number } | null = null;
if (parsed.mapUrl) {
const mapHtml = await fetchPage(parsed.mapUrl);
if (mapHtml) {
coords = parseCoordinates(mapHtml);
}
}
// Without coordinates the church cannot be deduplicated or stored, so it is skipped.
if (!coords) {
console.log(` Skipping ${sitemapEntry.id} (${parsed.name}): no coordinates`);
stats.churchesSkipped++;
return;
}
// Parse schedule
const schedules = skipSchedules ? [] : parseScheduleTable(churchHtml);
// Build candidate for dedup
const candidate = {
name: parsed.name,
lat: coords.lat,
lng: coords.lng,
massSchedulesPhId: sitemapEntry.id,
};
const duplicate = findDuplicateChurch(candidate, existingChurches);
// Dry run: report what would happen and update counters only.
if (dryRun) {
if (duplicate) {
stats.churchesMatched++;
console.log(` [MATCH] ${sitemapEntry.id}: "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`);
} else {
stats.churchesCreated++;
console.log(` [NEW] ${sitemapEntry.id}: "${parsed.name}" at ${coords.lat},${coords.lng}`);
}
if (schedules.length > 0) {
stats.schedulesProcessed++;
stats.massSchedulesCreated += schedules.length;
}
return;
}
if (duplicate) {
// Update existing church
stats.churchesMatched++;
// Always link the source ID; other fields are only filled when currently empty.
const updateData: Record<string, unknown> = {
massSchedulesPhId: sitemapEntry.id,
};
if (!duplicate.address && parsed.address) updateData.address = parsed.address;
if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
// Fill city/state from breadcrumbs
// city/state are not part of the in-memory record, so they are read from the database here.
const dbRecord = await prisma.church.findUnique({
where: { id: duplicate.id },
select: { city: true, state: true },
});
if (dbRecord && !dbRecord.city && parsed.city) updateData.city = parsed.city;
if (dbRecord && !dbRecord.state && parsed.region) updateData.state = parsed.region;
try {
await prisma.church.update({
where: { id: duplicate.id },
data: updateData,
});
} catch (error) {
// NOTE(review): unique-constraint violations are detected by message substring;
// the church was already counted as matched above and is additionally counted as skipped.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
return;
}
throw error;
}
// Replace mass schedules
// Delete, insert and timestamp run in one transaction so a failure leaves the old schedules intact.
if (schedules.length > 0 && !skipSchedules) {
try {
await prisma.$transaction(async (tx) => {
await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
await tx.massSchedule.createMany({
data: schedules.map((s) => ({
churchId: duplicate.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
// Language is hard-coded for every imported schedule row.
language: 'English',
})),
});
await tx.church.update({
where: { id: duplicate.id },
data: { lastScrapedAt: new Date() },
});
});
stats.schedulesProcessed++;
stats.massSchedulesCreated += schedules.length;
} catch (error) {
// Schedule failures on a matched church are counted and logged, not rethrown.
stats.errors++;
console.error(` Error saving schedules for ${sitemapEntry.id}: ${error instanceof Error ? error.message : error}`);
}
}
} else {
// Create new church
try {
const newChurch = await prisma.church.create({
data: {
name: parsed.name,
latitude: coords.lat,
longitude: coords.lng,
address: parsed.address,
city: parsed.city || null,
state: parsed.region || null,
country: 'PH',
phone: parsed.phone,
hasWebsite: false,
massSchedulesPhId: sitemapEntry.id,
source: 'mass-schedules-ph',
},
});
stats.churchesCreated++;
// Add to in-memory array for within-run dedup
existingChurches.push({
id: newChurch.id,
name: parsed.name,
latitude: coords.lat,
longitude: coords.lng,
osmId: null,
baiduId: null,
masstimesId: null,
orarimesseId: null,
massSchedulesPhId: sitemapEntry.id,
philmassId: null,
horariosMisasId: null,
mszeInfoId: null,
weekdayMassesId: null,
messesInfoId: null,
bohosluzbyId: null,
miserendId: null,
kerknetId: null,
gottesdienstzeitenId: null,
discovermassId: null,
source: 'mass-schedules-ph',
website: null,
phone: parsed.phone,
address: parsed.address,
});
// Create mass schedules
// NOTE(review): unlike the matched-church branch, these writes are not wrapped in a
// transaction, and a failure here is rethrown after the church row already exists.
if (schedules.length > 0 && !skipSchedules) {
await prisma.massSchedule.createMany({
data: schedules.map((s) => ({
churchId: newChurch.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
language: 'English',
})),
});
await prisma.church.update({
where: { id: newChurch.id },
data: { lastScrapedAt: new Date() },
});
stats.schedulesProcessed++;
stats.massSchedulesCreated += schedules.length;
}
} catch (error) {
// A unique-constraint violation means the record already exists; count it as skipped.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
return;
}
throw error;
}
}
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse the command-line flags of the mass-schedules.com importer.
 *
 * Recognised flags: `--all`, `--church-id <id>`, `--dry-run`,
 * `--skip-schedules`, `--resume-from <id>`, `--job-id <uuid>`, `--help`/`-h`.
 * Unknown arguments are ignored.
 *
 * The process exits with status 0 after printing help, and with status 1 when
 * a value-taking flag has no value, when `--resume-from` is not a non-negative
 * decimal integer, or when neither `--all` nor `--church-id` was given.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    skipSchedules: false,
  };

  // Return the value that follows a flag, or abort when it is absent
  // (end of argv, or the next token is itself a flag).
  const requireValue = (flag: string, value: string | undefined): string => {
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--church-id':
        result.churchId = requireValue('--church-id', args[++i]);
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--skip-schedules':
        result.skipSchedules = true;
        break;
      case '--resume-from': {
        const raw = requireValue('--resume-from', args[++i]);
        // Reject anything that is not plain decimal digits: a NaN here would
        // make the resume filter a silent no-op further down.
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = requireValue('--job-id', args[++i]);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-mass-schedules-ph.ts [options]
Options:
--all Import all churches from sitemap
--church-id <id> Import a single church by ID (e.g. "34")
--dry-run No database writes, just report what would happen
--skip-schedules Skip mass schedule import (churches only)
--resume-from <id> Skip churches with ID less than this value
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-mass-schedules-ph.ts --church-id 34 --dry-run
npx tsx scripts/import-mass-schedules-ph.ts --all
npx tsx scripts/import-mass-schedules-ph.ts --all --skip-schedules
npx tsx scripts/import-mass-schedules-ph.ts --all --resume-from 500
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.churchId) {
    console.error('Error: specify --all or --church-id <id>');
    process.exit(1);
  }
  return result;
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Render a millisecond duration as `Xh Ym Zs`, `Ym Zs` or `Zs`, using the
 * largest unit that is non-zero. Fractions of a second are dropped.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const wholeHours = Math.floor(totalMinutes / 60);
  const secondsPart = `${totalSeconds % 60}s`;

  if (wholeHours > 0) {
    return `${wholeHours}h ${totalMinutes % 60}m ${secondsPart}`;
  }
  return totalMinutes > 0 ? `${totalMinutes}m ${secondsPart}` : `${totalSeconds}s`;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Entry point of the mass-schedules.com (Philippines) importer.
 *
 * Parses the CLI flags, prints a banner, marks the background job as running
 * (when a job ID was given), loads the existing Philippine churches, builds
 * the list of churches to process (a single constructed URL or the sitemap),
 * applies the --resume-from filter, processes each church in sequence,
 * prints a summary and finally records the job result.
 */
async function main() {
const args = parseArgs();
const startTime = Date.now();
console.log('\n' + '='.repeat(70));
console.log('MASS-SCHEDULES.COM (PHILIPPINES) IMPORTER');
console.log('='.repeat(70));
console.log(`Mode: ${args.all ? 'All churches from sitemap' : `Single church: ${args.churchId}`}`);
console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
console.log(`Skip schedules: ${args.skipSchedules ? 'YES' : 'NO'}`);
if (args.resumeFrom) console.log(`Resume from ID: ${args.resumeFrom}`);
console.log(`Time: ${new Date().toISOString()}`);
console.log('='.repeat(70) + '\n');
// Update background job status if provided
if (args.jobId) {
try {
await prisma.backgroundJob.update({
where: { id: args.jobId },
data: { status: 'running', startedAt: new Date() },
});
} catch {
// Job might not exist yet
}
}
// Load existing Philippine churches for dedup
const existingChurches = await loadExistingPhilippineChurches();
// Build church list: skip sitemap for single-church mode
// When both --church-id and --all are given, the single-church branch wins.
let churchesToProcess: SitemapChurch[];
if (args.churchId) {
// Single church: construct URL directly, no sitemap needed
churchesToProcess = [{
id: args.churchId,
slug: 'church',
url: `${SITE_BASE}/catholic-church/${args.churchId}/church.html`,
}];
console.log(`Single church mode: ID ${args.churchId}\n`);
} else {
// Full mode: fetch sitemap
const allChurches = await fetchChurchUrlsFromSitemap();
console.log(`Found ${allChurches.length} unique church URLs in sitemap\n`);
churchesToProcess = allChurches;
}
// Handle --resume-from
// The truthiness check means a value of 0 or NaN leaves the list unfiltered.
if (args.resumeFrom) {
const before = churchesToProcess.length;
// IDs are strings and are compared numerically; an ID that does not parse yields NaN and is dropped.
churchesToProcess = churchesToProcess.filter((c) => parseInt(c.id) >= args.resumeFrom!);
console.log(`Resuming from ID ${args.resumeFrom} (skipping ${before - churchesToProcess.length} churches)\n`);
}
const stats: ImportStats = {
churchesFound: 0,
churchesMatched: 0,
churchesCreated: 0,
churchesSkipped: 0,
schedulesProcessed: 0,
massSchedulesCreated: 0,
errors: 0,
};
// Process each church
// Churches are handled one at a time; an exception from one church is counted and logged and the run continues.
for (let i = 0; i < churchesToProcess.length; i++) {
const church = churchesToProcess[i];
const elapsed = formatDuration(Date.now() - startTime);
console.log(`[${i + 1}/${churchesToProcess.length}] Church ID ${church.id} [${elapsed} elapsed]`);
try {
await processChurch(church, existingChurches, args.dryRun, args.skipSchedules, stats);
} catch (error) {
stats.errors++;
console.error(` ERROR processing church ${church.id}: ${error instanceof Error ? error.message : error}`);
}
}
// Print summary
const totalTime = Date.now() - startTime;
console.log('\n' + '='.repeat(70));
console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
console.log('='.repeat(70));
console.log(`Churches found: ${stats.churchesFound}`);
console.log(` Matched (existing): ${stats.churchesMatched}`);
console.log(` Created (new): ${stats.churchesCreated}`);
console.log(` Skipped: ${stats.churchesSkipped}`);
console.log(`Schedules processed: ${stats.schedulesProcessed}`);
console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
console.log(`Errors: ${stats.errors}`);
console.log(`Total time: ${formatDuration(totalTime)}`);
// requestCount is a counter defined elsewhere in this file.
console.log(`HTTP requests: ${requestCount}`);
console.log('='.repeat(70) + '\n');
// Update background job
// The final status reflects whether any error was counted; the stats object is stored as JSON.
// NOTE(review): a failure of this update is swallowed silently, and nothing in this function
// marks the job as failed if an exception escapes earlier — confirm that is acceptable.
if (args.jobId) {
try {
await prisma.backgroundJob.update({
where: { id: args.jobId },
data: {
status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
completedAt: new Date(),
result: JSON.stringify(stats),
},
});
} catch {
// Ignore
}
}
}
// Script entry point.
// On failure only the exit code is recorded: calling process.exit() inside
// the catch handler would terminate the process before the cleanup handler
// below could disconnect Prisma and close the connection pool.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });

View File

@@ -0,0 +1,672 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules globally from masstimes.org API
*
* masstimes.org has ~121,000 churches worldwide. This script queries their
* geo-search API with a grid of coordinates covering world landmass, then
* deduplicates and imports the results.
*
* API: GET https://masstimes.org/Churchs/?lat={lat}&long={lng}&pg={page}
* - Requires Referer header
* - Returns 30 results per page within 100-mile (~160km) radius
* - Paginate until empty array
*
* Grid strategy:
* - 2.5° latitude spacing (~278km), longitude adjusted for latitude
* - Continental bounding boxes to skip oceans
* - 100-mile radius means ~322km diameter — 2.5° spacing ensures overlap
*
* Usage:
* npx tsx scripts/import-masstimes-api.ts --all
* npx tsx scripts/import-masstimes-api.ts --all --dry-run
* npx tsx scripts/import-masstimes-api.ts --region europe
* npx tsx scripts/import-masstimes-api.ts --all --skip-us
* npx tsx scripts/import-masstimes-api.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
// Load environment variables from the working directory before anything reads process.env.
// `.env.local` is read first, then `.env`.
// NOTE(review): dotenv by default does not overwrite variables that are already set, so
// values from `.env.local` should take precedence — confirm against the installed dotenv version.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL from the environment, otherwise a local development default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target database with the password part of the URL replaced by "***".
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// TLS options are set only when the URL contains "neon".
// NOTE(review): rejectUnauthorized: false accepts any server certificate — confirm this is intended.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma talks to the database through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Geo-search endpoint; queried with lat, long and pg query parameters.
const API_BASE = 'https://masstimes.org/Churchs/';
// Sent as the Referer header on every API request.
const REFERER = 'https://masstimes.org/map';
// Sent as the User-Agent header on every API request.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
const RATE_LIMIT_MS = 2000; // 2 seconds between requests — respectful rate
// Results per page; a page shorter than this is treated as the last page.
const PAGE_SIZE = 30;
const LAT_SPACING = 2.5; // degrees (~278km)
// The grid's longitude step is derived from this distance and never drops below LAT_SPACING degrees.
const TARGET_LNG_SPACING_KM = 250; // target spacing in km
// Country name → ISO code mapping for masstimes country names
// Keys are lower-case because lookups trim and lower-case the incoming name first.
// Several countries appear under more than one spelling (e.g. 'czech republic' / 'czechia').
// A name that is not listed here is resolved to the placeholder code 'XX' by resolveCountryCode.
const COUNTRY_CODE_MAP: Record<string, string> = {
'united states': 'US', 'canada': 'CA', 'mexico': 'MX',
'united kingdom': 'GB', 'ireland': 'IE', 'france': 'FR', 'germany': 'DE',
'spain': 'ES', 'italy': 'IT', 'portugal': 'PT', 'netherlands': 'NL',
'belgium': 'BE', 'luxembourg': 'LU', 'switzerland': 'CH', 'austria': 'AT',
'poland': 'PL', 'czech republic': 'CZ', 'czechia': 'CZ', 'slovakia': 'SK',
'hungary': 'HU', 'croatia': 'HR', 'slovenia': 'SI', 'romania': 'RO',
'bulgaria': 'BG', 'serbia': 'RS', 'bosnia and herzegovina': 'BA',
'montenegro': 'ME', 'north macedonia': 'MK', 'albania': 'AL', 'kosovo': 'XK',
'greece': 'GR', 'cyprus': 'CY', 'malta': 'MT', 'denmark': 'DK',
'sweden': 'SE', 'norway': 'NO', 'finland': 'FI', 'iceland': 'IS',
'estonia': 'EE', 'latvia': 'LV', 'lithuania': 'LT',
'ukraine': 'UA', 'russia': 'RU', 'belarus': 'BY', 'moldova': 'MD',
'georgia': 'GE', 'armenia': 'AM', 'azerbaijan': 'AZ',
'turkey': 'TR', 'israel': 'IL', 'jordan': 'JO', 'lebanon': 'LB',
'egypt': 'EG', 'morocco': 'MA', 'tunisia': 'TN', 'algeria': 'DZ',
'india': 'IN', 'sri lanka': 'LK', 'pakistan': 'PK', 'bangladesh': 'BD',
'nepal': 'NP', 'myanmar': 'MM', 'thailand': 'TH', 'vietnam': 'VN',
'cambodia': 'KH', 'laos': 'LA', 'malaysia': 'MY', 'singapore': 'SG',
'indonesia': 'ID', 'philippines': 'PH', 'china': 'CN', 'japan': 'JP',
'south korea': 'KR', 'korea, south': 'KR', 'taiwan': 'TW',
'hong kong': 'HK', 'macau': 'MO', 'mongolia': 'MN',
'australia': 'AU', 'new zealand': 'NZ', 'fiji': 'FJ',
'papua new guinea': 'PG', 'samoa': 'WS', 'tonga': 'TO', 'guam': 'GU',
'nigeria': 'NG', 'ghana': 'GH', 'kenya': 'KE', 'tanzania': 'TZ',
'uganda': 'UG', 'south africa': 'ZA', 'cameroon': 'CM', 'senegal': 'SN',
'ethiopia': 'ET', 'madagascar': 'MG', 'mozambique': 'MZ',
'zambia': 'ZM', 'zimbabwe': 'ZW', 'malawi': 'MW', 'rwanda': 'RW',
'burundi': 'BI', 'congo, democratic republic of the': 'CD',
'congo, republic of the': 'CG', "côte d'ivoire": 'CI', 'ivory coast': 'CI',
'burkina faso': 'BF', 'mali': 'ML', 'niger': 'NE', 'chad': 'TD',
'central african republic': 'CF', 'gabon': 'GA', 'equatorial guinea': 'GQ',
'angola': 'AO', 'namibia': 'NA', 'botswana': 'BW', 'lesotho': 'LS',
'eswatini': 'SZ', 'swaziland': 'SZ', 'mauritius': 'MU',
'brazil': 'BR', 'argentina': 'AR', 'colombia': 'CO', 'peru': 'PE',
'chile': 'CL', 'venezuela': 'VE', 'ecuador': 'EC', 'bolivia': 'BO',
'paraguay': 'PY', 'uruguay': 'UY', 'guyana': 'GY', 'suriname': 'SR',
'trinidad and tobago': 'TT', 'jamaica': 'JM', 'barbados': 'BB',
'bahamas': 'BS', 'bahamas, the': 'BS', 'haiti': 'HT',
'dominican republic': 'DO', 'cuba': 'CU', 'puerto rico': 'PR',
'guatemala': 'GT', 'honduras': 'HN', 'el salvador': 'SV',
'nicaragua': 'NI', 'costa rica': 'CR', 'panama': 'PA', 'belize': 'BZ',
'grenada': 'GD', 'saint lucia': 'LC', 'dominica': 'DM',
'saint vincent and the grenadines': 'VC', 'antigua and barbuda': 'AG',
'saint kitts and nevis': 'KN', 'bermuda': 'BM', 'cayman islands': 'KY',
'aruba': 'AW', 'curaçao': 'CW', 'curacao': 'CW',
'united arab emirates': 'AE', 'saudi arabia': 'SA', 'qatar': 'QA',
'bahrain': 'BH', 'kuwait': 'KW', 'oman': 'OM', 'iraq': 'IQ',
'iran': 'IR', 'afghanistan': 'AF',
'kazakhstan': 'KZ', 'uzbekistan': 'UZ', 'kyrgyzstan': 'KG',
'tajikistan': 'TJ', 'turkmenistan': 'TM',
'liechtenstein': 'LI', 'monaco': 'MC', 'andorra': 'AD', 'san marino': 'SM',
'vatican city': 'VA', 'holy see (vatican city)': 'VA',
'east timor': 'TL', 'timor-leste': 'TL',
};
// Continental bounding boxes (lat_min, lat_max, lng_min, lng_max)
// Each region maps to one or more boxes; a region name is what the --region flag selects,
// and --all iterates over every key.
// Boxes of different regions overlap; the grid generator drops repeated points through
// its rounded "lat,lng" key set.
const REGIONS: Record<string, Array<[number, number, number, number]>> = {
'north-america': [[7, 72, -170, -50]],
'central-america': [[7, 24, -120, -60]],
'south-america': [[-56, 13, -82, -34]],
'europe': [[35, 72, -12, 45]],
'eastern-europe': [[40, 70, 20, 60]],
'africa': [[-36, 38, -20, 55]],
'middle-east': [[12, 42, 25, 65]],
'south-asia': [[5, 38, 60, 98]],
'east-asia': [[18, 55, 95, 150]],
'southeast-asia': [[-12, 22, 92, 142]],
'oceania': [[-48, -8, 110, 180], [-22, 0, 160, 180]],
'central-asia': [[35, 55, 45, 90]],
};
// ─── Types ───────────────────────────────────────────────────────────────────
/**
 * One church record as returned by the masstimes.org search endpoint, limited
 * to how this script consumes it. String fields are trimmed before storage.
 */
interface MasstimesChurch {
// Source identifier; stored as masstimesId and used for cross-grid-point deduplication.
id: string;
name: string;
// Coordinates arrive as strings and are converted with parseFloat.
latitude: string;
longitude: string;
church_address_street_address: string;
church_address_city_name: string;
// Stored in the church's `state` column.
church_address_providence_name: string;
church_address_postal_code: string;
// Country name; translated to a two-letter code through COUNTRY_CODE_MAP.
church_address_country_territory_name: string;
church_address_county: string | null;
diocese_name: string;
phone_number: string;
email: string;
// Church website; its presence also drives the hasWebsite flag.
url: string;
pastors_name: string;
church_worship_times: MasstimesWorshipTime[];
// Not read by this script; presumably the distance from the queried point — TODO confirm.
distance: string;
wheel_chair_access: boolean;
}
/** One worship-time entry attached to a masstimes.org church record. */
interface MasstimesWorshipTime {
// Day name, or "Weekdays"; matched case-insensitively against DAY_MAP.
day_of_week: string;
// Parsed as "HH:MM:SS"; "00:00:00" is treated as no time and the entry is dropped.
time_start: string;
// Not read by this script.
time_end: string;
// Missing or blank language is stored as 'Unknown'.
language: string | null;
// Only 'Weekend' and 'Week Days' entries are imported.
service_typename: string;
// Stored as the schedule's notes when non-blank.
comment: string;
// Not read by this script.
is_perpetual: boolean;
}
/** Run counters, mutated in place during the import and printed in the summary. */
interface ImportStats {
// Incremented for every grid point iterated, including points skipped by --resume-from.
gridPoints: number;
// Incremented once per page request issued by fetchAllForPoint.
apiRequests: number;
// Unique source IDs seen during this run.
churchesDiscovered: number;
// Churches already known by masstimesId or matched by findDuplicateChurch.
churchesMatched: number;
// Churches inserted (or, in a dry run, churches that would be inserted).
churchesCreated: number;
// Writes rejected with a 'Unique constraint' error.
churchesSkipped: number;
massSchedulesCreated: number;
// Failed grid points plus failed church inserts.
errors: number;
}
/** Options parsed from the command line by parseArgs. */
interface CLIArgs {
// --all: query every region in REGIONS.
all: boolean;
// --region <name>: query a single region.
region?: string;
// --dry-run: no database writes.
dryRun: boolean;
// --skip-us: omit grid points inside the continental US bounding box.
skipUs: boolean;
// --resume-from <n>: number of leading grid points to skip (0 = start from the beginning).
resumeFrom: number;
// --job-id <uuid>: background job row to update with progress.
jobId?: string;
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse the command-line flags of the masstimes.org importer.
 *
 * Recognised flags: `--all`, `--region <name>`, `--dry-run`, `--skip-us`,
 * `--resume-from <n>`, `--job-id <uuid>`, `--help`/`-h`. Unknown arguments
 * are ignored.
 *
 * The process exits with status 0 after printing help, and with status 1 when
 * a value-taking flag has no value, when `--resume-from` is not a non-negative
 * decimal integer, when neither `--all` nor `--region` was given, or when the
 * region that would be used is not a key of REGIONS.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false, skipUs: false, resumeFrom: 0 };

  // Return the value that follows a flag, or abort when it is absent
  // (end of argv, or the next token is itself a flag).
  const requireValue = (flag: string, value: string | undefined): string => {
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--region':
        result.region = requireValue('--region', args[++i]);
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--skip-us':
        result.skipUs = true;
        break;
      case '--resume-from': {
        const raw = requireValue('--resume-from', args[++i]);
        // A NaN here would make every "index < resumeFrom" comparison false
        // and silently restart the import from the first grid point.
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = requireValue('--job-id', args[++i]);
        break;
      case '--help':
      case '-h':
        console.log(`Usage: npx tsx scripts/import-masstimes-api.ts [options]
--all Query all regions globally
--region <name> Query specific region: ${Object.keys(REGIONS).join(', ')}
--skip-us Skip US grid points (already well-covered)
--dry-run No database writes
--resume-from <n> Skip first N grid points
--job-id <uuid> Background job tracking`);
        process.exit(0);
    }
  }

  if (!result.all) {
    if (!result.region) {
      console.error('Error: specify --all or --region <name>');
      process.exit(1);
    }
    // An unknown region would otherwise produce an empty grid and a run that
    // "completes" without querying anything.
    if (!Object.prototype.hasOwnProperty.call(REGIONS, result.region)) {
      console.error(`Error: unknown region "${result.region}". Valid regions: ${Object.keys(REGIONS).join(', ')}`);
      process.exit(1);
    }
  }
  return result;
}
// ─── Grid Generation ─────────────────────────────────────────────────────────
/**
 * Build the list of query coordinates for the requested regions.
 *
 * Each bounding box is walked in LAT_SPACING-degree latitude rows. The
 * longitude step of a row is widened with latitude so that points stay about
 * TARGET_LNG_SPACING_KM apart, but it is never smaller than LAT_SPACING
 * degrees. Points are rounded to one decimal and emitted once, even where
 * boxes overlap.
 *
 * @param regions Region names (keys of REGIONS); unknown names are reported
 *   on stderr and ignored.
 * @param skipUs When true, points inside the box lat 24..50, lng -125..-66
 *   are left out.
 * @returns The unique grid points in generation order.
 */
function generateGridPoints(regions: string[], skipUs: boolean): Array<{ lat: number; lng: number }> {
  const toTenth = (value: number): number => Math.round(value * 10) / 10;
  const insideContinentalUs = (lat: number, lng: number): boolean =>
    lat >= 24 && lat <= 50 && lng >= -125 && lng <= -66;

  const emitted = new Set<string>();
  const grid: Array<{ lat: number; lng: number }> = [];

  for (const regionName of regions) {
    const boundingBoxes = REGIONS[regionName];
    if (!boundingBoxes) {
      console.error(`Unknown region: ${regionName}`);
      continue;
    }

    for (const [south, north, west, east] of boundingBoxes) {
      for (let lat = south; lat <= north; lat += LAT_SPACING) {
        // A degree of longitude shrinks with the cosine of the latitude.
        const kmPerLngDegree = 111.32 * Math.cos((lat * Math.PI) / 180);
        let lngStep = LAT_SPACING;
        if (kmPerLngDegree > 0) {
          lngStep = Math.max(LAT_SPACING, TARGET_LNG_SPACING_KM / kmPerLngDegree);
        }

        const gridLat = toTenth(lat);
        for (let lng = west; lng <= east; lng += lngStep) {
          const gridLng = toTenth(lng);
          const key = `${gridLat},${gridLng}`;
          if (emitted.has(key)) continue;
          // Skipped US points are deliberately not recorded as emitted.
          if (skipUs && insideContinentalUs(gridLat, gridLng)) continue;
          emitted.add(key);
          grid.push({ lat: gridLat, lng: gridLng });
        }
      }
    }
  }

  return grid;
}
// ─── API ─────────────────────────────────────────────────────────────────────
/**
 * Pause for the given number of milliseconds.
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Fetch one page of geo-search results around a coordinate.
 *
 * On HTTP 429 the call waits 30 seconds and retries, up to three retries per
 * page; previously the retry recursed without any limit, so a persistent 429
 * would stall the import forever.
 *
 * @param lat Query latitude.
 * @param lng Query longitude.
 * @param page 1-based page number.
 * @param attempt Number of 429 retries already performed for this page;
 *   callers normally omit it.
 * @returns The churches on the requested page (empty once past the last page).
 * @throws Error on any non-OK status (including 429 after the retries are
 *   used up) and when the response body is not a JSON array.
 */
async function fetchPage(
  lat: number,
  lng: number,
  page: number,
  attempt = 0,
): Promise<MasstimesChurch[]> {
  const maxRateLimitRetries = 3;
  const url = `${API_BASE}?lat=${lat}&long=${lng}&pg=${page}`;
  const response = await fetch(url, {
    headers: {
      'Referer': REFERER,
      'User-Agent': USER_AGENT,
      'Accept': 'application/json',
    },
  });

  if (!response.ok) {
    if (response.status === 429 && attempt < maxRateLimitRetries) {
      console.error(` Rate limited (429) — backing off 30s`);
      await sleep(30000);
      return fetchPage(lat, lng, page, attempt + 1);
    }
    throw new Error(`HTTP ${response.status} for ${url}`);
  }

  // Validate the shape before handing it to callers that read `.length`
  // and spread the result.
  const payload: unknown = await response.json();
  if (!Array.isArray(payload)) {
    throw new Error(`Unexpected response shape (expected an array) for ${url}`);
  }
  return payload as MasstimesChurch[];
}
/**
 * Collect every result page for one grid point.
 *
 * Pages are requested starting at 1 until a page comes back empty or shorter
 * than PAGE_SIZE. The function waits RATE_LIMIT_MS between consecutive pages
 * and counts each request in `stats.apiRequests`.
 *
 * @param lat Query latitude.
 * @param lng Query longitude.
 * @param stats Run counters; `apiRequests` is incremented per page request.
 * @returns All churches returned for the point, in page order.
 */
async function fetchAllForPoint(
  lat: number,
  lng: number,
  stats: ImportStats,
): Promise<MasstimesChurch[]> {
  const collected: MasstimesChurch[] = [];

  for (let pageNumber = 1; ; pageNumber++) {
    stats.apiRequests += 1;
    const batch = await fetchPage(lat, lng, pageNumber);
    if (batch.length > 0) {
      collected.push(...batch);
    }
    const isLastPage = batch.length === 0 || batch.length < PAGE_SIZE;
    if (isLastPage) {
      return collected;
    }
    await sleep(RATE_LIMIT_MS);
  }
}
// ─── Data Conversion ─────────────────────────────────────────────────────────
/**
 * Translate a country name into its two-letter code.
 *
 * The name is trimmed and lower-cased before the COUNTRY_CODE_MAP lookup.
 *
 * @param countryName Country name as delivered by the API.
 * @returns The mapped code, or 'XX' when the name is empty or not in the map.
 */
function resolveCountryCode(countryName: string): string {
  const unknownCode = 'XX';
  if (!countryName) {
    return unknownCode;
  }
  const normalizedName = countryName.trim().toLowerCase();
  const mappedCode = COUNTRY_CODE_MAP[normalizedName];
  return mappedCode ? mappedCode : unknownCode;
}
// Lower-case day label → day-of-week numbers (0 = Sunday … 6 = Saturday).
// The collective label "weekdays" expands to Monday through Friday.
const DAY_MAP: Record<string, number[]> = {
  sunday: [0],
  monday: [1],
  tuesday: [2],
  wednesday: [3],
  thursday: [4],
  friday: [5],
  saturday: [6],
  weekdays: [1, 2, 3, 4, 5],
};
/**
 * Convert API worship-time entries into weekly mass schedule rows.
 *
 * Only entries whose service type is 'Weekend' or 'Week Days' are kept.
 * Entries with no start time, a start time of '00:00:00', or a day label that
 * is not in DAY_MAP are dropped. A label that maps to several days yields one
 * row per day. `massType` is always null.
 *
 * @param times Worship-time entries of one church.
 * @returns One row per (entry, day) pair, in input order.
 */
function parseWorshipTimes(times: MasstimesWorshipTime[]): Array<{
  dayOfWeek: number;
  time: string;
  language: string;
  notes: string | null;
  massType: string | null;
}> {
  return times.flatMap((entry) => {
    const isMassService =
      entry.service_typename === 'Weekend' || entry.service_typename === 'Week Days';
    if (!isMassService) return [];

    const rawTime = entry.time_start?.trim();
    if (!rawTime || rawTime === '00:00:00') return [];

    // "HH:MM:SS" → "HH:MM"; a missing minute part becomes "00".
    const [hourPart, minutePart] = rawTime.split(':');
    const time = `${hourPart.padStart(2, '0')}:${minutePart || '00'}`;

    const language = entry.language?.trim() || 'Unknown';
    const notes = entry.comment?.trim() || null;

    const weekdayNumbers = DAY_MAP[entry.day_of_week?.trim().toLowerCase()];
    if (!weekdayNumbers) return [];

    return weekdayNumbers.map((dayOfWeek) => ({
      dayOfWeek,
      time,
      language,
      notes,
      massType: null,
    }));
  });
}
// ─── Database ────────────────────────────────────────────────────────────────
/**
 * Read every church row, selecting the columns needed for duplicate detection
 * (name, position, per-source IDs, contact fields and country).
 *
 * @returns The selected rows, used as the in-memory deduplication list.
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing churches for deduplication...');

  const knownChurches = await prisma.church.findMany({
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });

  console.log(`Loaded ${knownChurches.length} existing churches`);
  return knownChurches;
}
/**
 * Write the current counters to the background job row.
 *
 * A failed update is logged and otherwise ignored, so progress reporting
 * never interrupts the import.
 *
 * @param jobId ID of the background job row.
 * @param stats Current run counters.
 */
async function updateJobProgress(jobId: string, stats: ImportStats): Promise<void> {
  const progress = {
    processed: stats.gridPoints,
    succeeded: stats.churchesMatched + stats.churchesCreated,
    failed: stats.errors,
    itemsFound: stats.churchesDiscovered,
  };

  try {
    await prisma.backgroundJob.update({ where: { id: jobId }, data: progress });
  } catch (err) {
    console.error(`Failed to update job progress:`, err);
  }
}
// ─── Main Import ─────────────────────────────────────────────────────────────
/**
 * Entry point of the masstimes.org importer.
 *
 * Builds the query grid for the selected regions, loads all existing churches,
 * then visits each grid point in sequence: every church returned for the point
 * is deduplicated (first by source ID, then by findDuplicateChurch) and either
 * linked to the matching record or inserted together with its mass schedules.
 * Progress is logged periodically and mirrored to the background job row when
 * a job ID was given. The database connections are closed at the end.
 */
async function main() {
const args = parseArgs();
let regionNames: string[];
if (args.all) {
regionNames = Object.keys(REGIONS);
} else {
regionNames = [args.region!];
}
const gridPoints = generateGridPoints(regionNames, args.skipUs);
console.log(`\n${'='.repeat(70)}`);
console.log('MASSTIMES.ORG API GLOBAL IMPORTER');
console.log('='.repeat(70));
console.log(`Regions: ${regionNames.join(', ')}`);
console.log(`Grid points: ${gridPoints.length}`);
console.log(`Skip US: ${args.skipUs ? 'YES' : 'NO'}`);
console.log(`Dry run: ${args.dryRun ? 'YES' : 'NO'}`);
console.log(`Rate limit: ${RATE_LIMIT_MS}ms between requests`);
console.log(`Resume from: ${args.resumeFrom || 'start'}`);
// Rough estimate: two rate-limited requests per grid point, rounded to one decimal hour.
const estHours = Math.round(gridPoints.length * 2 * RATE_LIMIT_MS / 1000 / 3600 * 10) / 10;
console.log(`Est. time: ~${estHours} hours (est. 2 pages/point avg)`);
console.log(`Time: ${new Date().toISOString()}`);
console.log('='.repeat(70));
const existingChurches = await loadExistingChurches();
// Build masstimesId lookup for fast dedup
const masstimesIdSet = new Set<string>();
for (const c of existingChurches) {
if (c.masstimesId) masstimesIdSet.add(c.masstimesId);
}
// Track discovered IDs to deduplicate across grid points
const discoveredIds = new Set<string>();
const stats: ImportStats = {
gridPoints: 0,
apiRequests: 0,
churchesDiscovered: 0,
churchesMatched: 0,
churchesCreated: 0,
churchesSkipped: 0,
massSchedulesCreated: 0,
errors: 0,
};
let jobId = args.jobId;
// NOTE(review): this update is not wrapped in try/catch, so a missing job row rejects main()
// before any grid point is queried — confirm that is intended.
if (jobId) {
await prisma.backgroundJob.update({
where: { id: jobId },
data: { status: 'running', startedAt: new Date(), totalItems: gridPoints.length },
});
}
const startTime = Date.now();
for (let i = 0; i < gridPoints.length; i++) {
const { lat, lng } = gridPoints[i];
// Counted before the resume check, so points skipped by --resume-from are included in the total.
stats.gridPoints++;
if (i < args.resumeFrom) continue;
try {
const churches = await fetchAllForPoint(lat, lng, stats);
if (churches.length > 0) {
let newInPoint = 0;
for (const mc of churches) {
// Neighbouring grid points return overlapping results; handle each source ID once per run.
if (discoveredIds.has(mc.id)) continue;
discoveredIds.add(mc.id);
stats.churchesDiscovered++;
// Already in DB by masstimesId
if (masstimesIdSet.has(mc.id)) {
stats.churchesMatched++;
continue;
}
const churchLat = parseFloat(mc.latitude);
const churchLng = parseFloat(mc.longitude);
// Records without usable coordinates are dropped without touching any counter besides churchesDiscovered.
if (isNaN(churchLat) || isNaN(churchLng) || (churchLat === 0 && churchLng === 0)) continue;
const country = resolveCountryCode(mc.church_address_country_territory_name);
// Join the non-blank address parts; null when all of them are blank.
const address = [
mc.church_address_street_address,
mc.church_address_city_name,
mc.church_address_providence_name,
mc.church_address_postal_code,
].filter(s => s?.trim()).join(', ').trim() || null;
// Proximity + name match
const candidate = { name: mc.name, lat: churchLat, lng: churchLng };
const duplicate = findDuplicateChurch(candidate, existingChurches);
if (duplicate) {
stats.churchesMatched++;
if (!args.dryRun) {
// Link the source ID and fill only the fields that are currently empty on the existing record.
const updateData: Record<string, unknown> = { masstimesId: mc.id };
if (!duplicate.phone && mc.phone_number?.trim()) updateData.phone = mc.phone_number.trim();
if (!duplicate.website && mc.url?.trim()) {
updateData.website = mc.url.trim();
updateData.hasWebsite = true;
}
if (!duplicate.address && address) updateData.address = address;
if (duplicate.country === 'XX' && country !== 'XX') updateData.country = country;
try {
await prisma.church.update({ where: { id: duplicate.id }, data: updateData });
masstimesIdSet.add(mc.id);
} catch (error) {
// Any other error is rethrown to the grid-point handler below, which abandons the
// remaining churches of this grid point.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
} else throw error;
}
}
continue;
}
// Create new church
if (!args.dryRun) {
const schedules = parseWorshipTimes(mc.church_worship_times || []);
try {
// The church and its schedules are written in a single nested create.
const newChurch = await prisma.church.create({
data: {
name: mc.name,
latitude: churchLat,
longitude: churchLng,
address,
city: mc.church_address_city_name?.trim() || null,
state: mc.church_address_providence_name?.trim() || null,
zip: mc.church_address_postal_code?.trim() || null,
country,
phone: mc.phone_number?.trim() || null,
website: mc.url?.trim() || null,
email: mc.email?.trim() || null,
hasWebsite: !!mc.url?.trim(),
masstimesId: mc.id,
source: 'masstimes',
diocese: mc.diocese_name?.trim() || null,
pastorName: mc.pastors_name?.trim() || null,
wheelchairAccess: mc.wheel_chair_access || false,
massSchedules: schedules.length > 0 ? {
create: schedules.map(s => ({
dayOfWeek: s.dayOfWeek,
time: s.time,
language: s.language,
notes: s.notes,
massType: s.massType,
isActive: true,
})),
} : undefined,
},
});
stats.churchesCreated++;
stats.massSchedulesCreated += schedules.length;
newInPoint++;
masstimesIdSet.add(mc.id);
// Keep the in-memory list current so later candidates in this run can match the new record.
existingChurches.push({
id: newChurch.id, name: mc.name,
latitude: churchLat, longitude: churchLng,
osmId: null, baiduId: null, masstimesId: mc.id,
orarimesseId: null, massSchedulesPhId: null,
philmassId: null, horariosMisasId: null,
mszeInfoId: null, weekdayMassesId: null,
messesInfoId: null, bohosluzbyId: null, miserendId: null, kerknetId: null, gottesdienstzeitenId: null, discovermassId: null,
source: 'masstimes', website: mc.url?.trim() || null,
phone: mc.phone_number?.trim() || null, address, country,
});
} catch (error) {
// Insert failures are handled per church and do not abort the grid point.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
} else {
stats.errors++;
console.error(` Error creating ${mc.name}: ${error instanceof Error ? error.message : error}`);
}
}
} else {
// Dry run: count what would have been created. Nothing is added to existingChurches here,
// so repeated candidates within the same dry run are not matched against each other.
stats.churchesCreated++;
stats.massSchedulesCreated += parseWorshipTimes(mc.church_worship_times || []).length;
newInPoint++;
}
}
if (newInPoint > 0) {
console.log(` Grid ${i + 1}/${gridPoints.length} (${lat},${lng}): ${churches.length} found, ${newInPoint} new`);
}
}
await sleep(RATE_LIMIT_MS);
} catch (error) {
// A failed grid point is counted and logged, followed by a doubled pause before the next point.
stats.errors++;
console.error(` Error at grid ${i + 1} (${lat},${lng}): ${error instanceof Error ? error.message : error}`);
await sleep(RATE_LIMIT_MS * 2);
}
// Progress every 50 points
if ((i + 1) % 50 === 0 || i === gridPoints.length - 1) {
const elapsed = (Date.now() - startTime) / 1000;
const rate = elapsed > 0 ? Math.round(stats.apiRequests / elapsed * 3600) : 0;
console.log(` Progress: ${i + 1}/${gridPoints.length} grid points, ${stats.churchesDiscovered} discovered, ${stats.churchesCreated} new, ${stats.apiRequests} API calls [${Math.round(elapsed)}s, ~${rate}/hr]`);
}
// Mirror the counters to the job row every 20 grid points.
if (jobId && (i + 1) % 20 === 0) {
await updateJobProgress(jobId, stats);
}
}
// NOTE(review): the job is marked 'completed' regardless of stats.errors — confirm that is intended.
if (jobId) {
await updateJobProgress(jobId, stats);
await prisma.backgroundJob.update({
where: { id: jobId },
data: { status: 'completed', completedAt: new Date() },
});
}
const elapsed = (Date.now() - startTime) / 1000;
console.log(`\n${'='.repeat(70)}`);
console.log('MASSTIMES API IMPORT SUMMARY');
console.log('='.repeat(70));
console.log(`Grid points queried: ${stats.gridPoints}`);
console.log(`API requests: ${stats.apiRequests}`);
console.log(`Churches discovered: ${stats.churchesDiscovered}`);
console.log(`Churches matched: ${stats.churchesMatched} (already in DB)`);
console.log(`Churches created: ${stats.churchesCreated}`);
console.log(`Churches skipped: ${stats.churchesSkipped} (duplicates)`);
console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
console.log(`Errors: ${stats.errors}`);
console.log(`Elapsed: ${Math.round(elapsed)}s (${(elapsed / 3600).toFixed(1)}h)`);
console.log('='.repeat(70));
// Connections are closed only when the function reaches this point without throwing.
await prisma.$disconnect();
await pool.end();
}
// Script entry point: report any unhandled failure and exit with status 1.
// The rejection value is not guaranteed to be an Error (it may be a string,
// null, or any thrown value), so it is narrowed before `.message` is read.
main().catch((error: unknown) => {
  const detail = error instanceof Error ? error.message : String(error);
  console.error(`Fatal error: ${detail}`);
  process.exit(1);
});

View File

@@ -0,0 +1,681 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from messes.info (France)
*
* messes.info is the official French bishops' conference (CEF) mass schedule
* database. It exposes a GWT-RPC API returning structured JSON with parish
* data including name, address, coordinates, diocese, and celebration times.
*
* The API requires no authentication. We enumerate all French dioceses using
* the "community:{diocese_code}" query prefix, which returns all parishes
* within each diocese.
*
* Import strategy:
* 1. Query each of ~93 diocese codes via GWT-RPC API
* 2. Parse response: extract localities (churches) + celebrations (mass times)
* 3. Deduce recurring weekly schedule from date-specific celebration entries
* 4. Match against existing French churches via church-matcher
* 5. Upsert churches and mass schedules
*
* Usage:
* npx tsx scripts/import-messesinfo.ts --all --dry-run
* npx tsx scripts/import-messesinfo.ts --all
* npx tsx scripts/import-messesinfo.ts --diocese pa --dry-run # Paris only
* npx tsx scripts/import-messesinfo.ts --all --resume-from 20
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL from the environment, otherwise a local development default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target database with the `:secret@` segment of the URL replaced by `:***@`.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// TLS is enabled only when the URL contains "neon"; `rejectUnauthorized: false`
// turns off server certificate verification for that connection.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client running on top of the pg pool through the PrismaPg driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Endpoint that receives the JSON POST requests built in fetchDioceseData.
const API_URL = 'https://messes.info/gwtRequest';
// Sent as the User-Agent header on every API request.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Pause (ms) inserted before every API request except the first one.
const REQUEST_DELAY_MS = 3000;
// Pause (ms) before retrying after HTTP 429/503 or a network error.
const RETRY_DELAY_MS = 10000;
// Maximum number of attempts per diocese request.
const MAX_RETRIES = 3;
// Third positional parameter of the API call — presumably the maximum number of
// results returned per query (TODO confirm against the API).
const RESULTS_PER_QUERY = 2000;
// Diocese codes discovered from the API. Each code maps to a diocese in France.
// The query "community:{code}" returns all parishes within that diocese.
// Codes are one- or two-letter abbreviations (e.g., pa=Paris, ly=Lyon, st=Strasbourg).
// The order of this list matters: --resume-from <n> skips the first n entries.
const DIOCESE_CODES = [
'a', 'aa', 'ac', 'ad', 'ag', 'al', 'am', 'an', 'ar', 'au', 'av', 'ay',
'ba', 'bb', 'be', 'bl', 'bm', 'bo', 'br', 'bs', 'bv', 'by',
'ca', 'cb', 'cc', 'cd', 'ch', 'cl', 'cm', 'cn', 'cr', 'cs',
'da', 'di', 'dj', 'dn',
'et', 'ex', 'ey',
'ft',
'ga', 'gr',
'lg', 'lh', 'li', 'lm', 'lp', 'lr', 'ls', 'lu', 'lv', 'ly',
'ma', 'md', 'me', 'ml', 'mp', 'mt', 'mx',
'na', 'nc', 'ni', 'nt', 'nv', 'ny',
'or',
'pa', 'pm', 'po', 'ps', 'pt',
'qu',
're', 'rn', 'ro', 'rv',
'sl', 'ss', 'st', 'sz',
'tl', 'to', 'ts', 'tu',
'va', 'vd', 've', 'vl', 'vv',
];
// ─── Types ───────────────────────────────────────────────────────────────────
/** One church/locality record extracted from the messes.info API response. */
interface LocalityData {
// Identifier used to de-duplicate localities and stored on the church as messesInfoId.
idfixe: string;
name: string;
address: string | null;
city: string | null;
zipcode: string | null;
// Coordinates; the parser substitutes 0 when the API value is missing or falsy.
latitude: number;
longitude: number;
// Saved into the church's `diocese` column when a new church is created.
sector: string | null;
communityId: string | null;
// The API object's `id`; celebrations point at it through their own localityId.
localityId: string; // e.g. "75/paris-04/saint-louis-en-l-ile"
}
/** One dated celebration (mass) entry attached to a locality. */
interface CelebrationData {
// Calendar date string; 'T12:00:00Z' is appended to it to derive the weekday,
// so it is presumably "YYYY-MM-DD" (TODO confirm against the API).
date: string;
time: string; // normalized to "HH:MM"
// Value 1 is treated as a weekly recurring entry; the parser defaults a missing value to 0.
recurrenceCategory: number;
}
/** A recurring weekly mass slot deduced from dated celebrations. */
interface ParsedSchedule {
// Weekday as returned by Date#getUTCDay(): 0 = Sunday … 6 = Saturday.
dayOfWeek: number;
// Time of day, taken from CelebrationData.time.
time: string;
}
/** Running counters for one importer run; printed in the final summary. */
interface ImportStats {
// Dioceses whose API response was fetched and parsed.
diocesesProcessed: number;
// Unique localities found across all parsed responses.
localitiesFound: number;
// Localities matched to a church already in the database.
churchesMatched: number;
// Localities inserted as new churches (or that would be, in dry-run mode).
churchesCreated: number;
// Localities skipped: (0, 0) coordinates or a unique-constraint conflict.
churchesSkipped: number;
// Mass schedule rows written (or that would be written, in dry-run mode).
schedulesCreated: number;
// Failed diocese fetches plus failed database writes.
errors: number;
}
/** Parsed command-line options for the messes.info importer. */
interface CLIArgs {
// --all: import every diocese in DIOCESE_CODES.
all: boolean;
// --dry-run: report what would happen without writing to the database.
dryRun: boolean;
// --resume-from <n>: number of leading dioceses to skip (ignored with --diocese).
resumeFrom?: number;
// --diocese <code>: import only this diocese code.
diocese?: string;
// --job-id <uuid>: backgroundJob row to update with status and results.
jobId?: string;
}
// ─── HTTP Client ─────────────────────────────────────────────────────────────
// Number of API requests started so far; drives the inter-request delay and is printed in the summary.
let requestCount = 0;
/** Return a promise that resolves (with no value) once `ms` milliseconds have elapsed. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Convert a MessesInfo time string to zero-padded "HH:MM".
 *
 * Recognized inputs (surrounding whitespace and the case of "h" are ignored):
 *   - "18h00", "9h30"  → "18:00", "09:30"
 *   - "18h"            → "18:00" (hour only)
 *   - "9:30", "18:00"  → "09:30", "18:00"
 *
 * @param messesTime Raw time string from the API.
 * @returns The normalized time, or the input unchanged when the format is not recognized.
 */
function normalizeTime(messesTime: string): string {
  const trimmed = messesTime.trim();
  // French notation: hours, "h", then optional two-digit minutes.
  const french = trimmed.match(/^(\d{1,2})h(\d{2})?$/i);
  if (french) {
    return `${french[1].padStart(2, '0')}:${french[2] ?? '00'}`;
  }
  // Already colon-separated; only the hour may need padding.
  const colon = trimmed.match(/^(\d{1,2}):(\d{2})$/);
  if (colon) {
    return `${colon[1].padStart(2, '0')}:${colon[2]}`;
  }
  return messesTime;
}
/**
 * POST one "community:{dioceseCode}" query to the messes.info endpoint and
 * return the decoded JSON body.
 *
 * Every call except the first waits REQUEST_DELAY_MS before sending. HTTP 429/503
 * responses and thrown errors (network failures, undecodable bodies) are retried
 * after RETRY_DELAY_MS, up to MAX_RETRIES attempts in total. Any other non-OK
 * status is not retried.
 *
 * @param dioceseCode Diocese code placed after the "community:" prefix.
 * @returns The parsed JSON response, or null when the request ultimately failed.
 */
async function fetchDioceseData(dioceseCode: string): Promise<any | null> {
  // Throttle: only the very first request of the run goes out immediately.
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  // NOTE(review): the F/O values and the trailing positional parameters
  // ('48.86:2.35', '') are opaque API constants — meaning not visible here.
  const payload = JSON.stringify({
    F: 'cef.kephas.shared.request.AppRequestFactory',
    I: [{
      O: 'Bzv0wi60qgwcW5aKiRKrtgNaLKo=',
      P: [`community:${dioceseCode}`, 0, RESULTS_PER_QUERY, 1, null, '48.86:2.35', ''],
      R: ['listCelebrationTime.locality'],
    }],
  });

  let attempt = 0;
  while (attempt < MAX_RETRIES) {
    attempt += 1;
    const mayRetry = attempt < MAX_RETRIES;
    try {
      const res = await fetch(API_URL, {
        method: 'POST',
        headers: {
          'User-Agent': USER_AGENT,
          'Content-Type': 'application/json',
          'Accept': 'application/json',
        },
        body: payload,
      });

      const overloaded = res.status === 503 || res.status === 429;
      if (overloaded && mayRetry) {
        console.log(` HTTP ${res.status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
        await delay(RETRY_DELAY_MS);
        continue;
      }
      if (overloaded) {
        console.error(` HTTP ${res.status} after ${MAX_RETRIES} attempts`);
        return null;
      }
      if (!res.ok) {
        console.error(` HTTP ${res.status} from API`);
        return null;
      }
      return await res.json();
    } catch (err) {
      if (mayRetry) {
        console.log(` Network error — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
        await delay(RETRY_DELAY_MS);
        continue;
      }
      console.error(` API error after ${MAX_RETRIES} attempts: ${err instanceof Error ? err.message : err}`);
      return null;
    }
  }
  return null;
}
// ─── Response Parser ─────────────────────────────────────────────────────────
/**
 * Parse the GWT-RPC response into a map of locality idfixe → data.
 *
 * The response O array contains interleaved objects:
 * - Locality objects: have P.idfixe, P.name, P.address, P.latitude, etc.
 * - Celebration objects: have P.date, P.time, P.localityId, P.recurrenceCategory
 * - Metadata object: has P.size, P.sizeLocalities
 *
 * Localities and celebrations are linked by P.localityId matching P.id on localities.
 * Two passes are used so that a celebration can appear before its locality in the array.
 * Entries that are null, non-objects, or lack a P payload are ignored.
 *
 * @param data Decoded JSON response (untyped).
 * @returns Map keyed by idfixe; empty when `data.O` is missing or not an array.
 */
function parseApiResponse(data: any): Map<string, { locality: LocalityData; celebrations: CelebrationData[] }> {
  const result = new Map<string, { locality: LocalityData; celebrations: CelebrationData[] }>();
  if (!data?.O || !Array.isArray(data.O)) return result;

  // First pass: collect all localities by their id
  const localitiesById = new Map<string, LocalityData>();
  for (const obj of data.O) {
    // Optional chaining: a null/undefined array entry must not abort the whole diocese.
    const p = obj?.P;
    if (!p || typeof p !== 'object') continue;
    if (p.idfixe && p.name) {
      const locality: LocalityData = {
        idfixe: p.idfixe,
        name: p.name,
        address: p.address || null,
        city: p.city || null,
        zipcode: p.zipcode || null,
        // Missing coordinates become 0; callers skip (0, 0) localities.
        latitude: p.latitude || 0,
        longitude: p.longitude || 0,
        sector: p.sector || null,
        communityId: p.communityId || null,
        localityId: p.id || '',
      };
      localitiesById.set(p.id, locality);
      // Initialize in result map (dedup by idfixe: the first occurrence wins)
      if (!result.has(p.idfixe)) {
        result.set(p.idfixe, { locality, celebrations: [] });
      }
    }
  }

  // Second pass: collect celebrations and link to localities
  for (const obj of data.O) {
    const p = obj?.P;
    if (!p || typeof p !== 'object') continue;
    if (p.date && p.time && p.localityId) {
      const locality = localitiesById.get(p.localityId);
      const entry = locality ? result.get(locality.idfixe) : undefined;
      if (entry) {
        entry.celebrations.push({
          date: p.date,
          time: normalizeTime(p.time),
          recurrenceCategory: p.recurrenceCategory ?? 0,
        });
      }
    }
  }
  return result;
}
// ─── Schedule Deduction ──────────────────────────────────────────────────────
/**
 * Reduce dated celebrations to a de-duplicated list of weekly (weekday, time) slots.
 *
 * Entries with recurrenceCategory === 1 (treated as weekly recurring) are
 * preferred; only when they yield no slot are all celebrations considered.
 * The weekday is derived from the celebration date at 12:00 UTC so the result
 * does not depend on the machine's time zone. Celebrations whose date cannot
 * be parsed are skipped.
 *
 * @param celebrations Celebrations of a single locality.
 * @returns Unique slots in first-seen order.
 */
function deduceSchedules(celebrations: CelebrationData[]): ParsedSchedule[] {
  // Collapse a list of celebrations into unique weekday/time pairs.
  const collect = (source: CelebrationData[]): ParsedSchedule[] => {
    const seen = new Set<string>();
    const slots: ParsedSchedule[] = [];
    for (const celeb of source) {
      const dayOfWeek = new Date(celeb.date + 'T12:00:00Z').getUTCDay();
      // Invalid Date → NaN weekday; never emit such a slot.
      if (Number.isNaN(dayOfWeek)) continue;
      const key = `${dayOfWeek}:${celeb.time}`;
      if (seen.has(key)) continue;
      seen.add(key);
      slots.push({ dayOfWeek, time: celeb.time });
    }
    return slots;
  };

  const weekly = collect(celebrations.filter((celeb) => celeb.recurrenceCategory === 1));
  // Fallback: if no weekly entries, deduce from all
  return weekly.length > 0 ? weekly : collect(celebrations);
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Load every church with country 'FR' from the database, selecting the fields
 * passed on to findDuplicateChurch as ExistingChurch records: coordinates, name,
 * all per-source external ids, and the contact fields.
 *
 * @returns The selected church rows.
 */
async function loadExistingFrenchChurches(): Promise<ExistingChurch[]> {
console.log('Loading existing French churches for deduplication...');
const churches = await prisma.church.findMany({
where: { country: 'FR' },
// The selection is kept as an inline literal; the rows are returned as ExistingChurch[].
select: {
id: true,
name: true,
latitude: true,
longitude: true,
// External identifiers, one per upstream data source.
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing French churches`);
return churches;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Import every locality of one diocese.
 *
 * Fetches and parses the diocese response, then for each locality:
 *   - skips it when its coordinates are (0, 0);
 *   - looks for an existing church via findDuplicateChurch;
 *   - in dry-run mode only updates the counters;
 *   - on a match, links the church (messesInfoId, missing address/coordinates)
 *     and replaces its mass schedules with the deduced ones;
 *   - otherwise creates the church and its schedules, and appends it to
 *     `existingChurches` so later localities can match against it.
 *
 * A database failure on one locality is logged and counted in `stats.errors`;
 * processing then continues with the next locality.
 *
 * @param dioceseCode      Diocese code to query.
 * @param existingChurches In-memory dedup list; mutated when churches are created.
 * @param dryRun           When true, no database writes are performed.
 * @param stats            Counters, mutated in place.
 */
async function processDiocese(
  dioceseCode: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  const data = await fetchDioceseData(dioceseCode);
  if (!data) {
    stats.errors++;
    return;
  }
  // Check for API error
  if (data.S && data.S[0] === false) {
    console.log(` API error for diocese ${dioceseCode}`);
    stats.errors++;
    return;
  }
  const localities = parseApiResponse(data);
  console.log(` Found ${localities.size} unique localities`);
  stats.localitiesFound += localities.size;
  stats.diocesesProcessed++;

  for (const [idfixe, { locality, celebrations }] of localities) {
    // (0, 0) is the parser's placeholder for missing coordinates.
    if (locality.latitude === 0 && locality.longitude === 0) {
      stats.churchesSkipped++;
      continue;
    }
    const schedules = deduceSchedules(celebrations);
    const candidate = {
      name: locality.name,
      lat: locality.latitude,
      lng: locality.longitude,
      messesInfoId: idfixe,
    };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (dryRun) {
      if (duplicate) {
        stats.churchesMatched++;
      } else {
        stats.churchesCreated++;
      }
      stats.schedulesCreated += schedules.length;
      continue;
    }

    if (duplicate) {
      stats.churchesMatched++;
      // Always link the external id; fill address/coordinates only where missing.
      const updateData: Record<string, unknown> = { messesInfoId: idfixe };
      if (!duplicate.address && locality.address) updateData.address = locality.address;
      if (duplicate.latitude === 0 && duplicate.longitude === 0 && locality.latitude !== 0) {
        updateData.latitude = locality.latitude;
        updateData.longitude = locality.longitude;
      }
      try {
        await prisma.church.update({
          where: { id: duplicate.id },
          data: updateData,
        });
      } catch (error) {
        // The messesInfoId is already attached to another church row.
        if (error instanceof Error && error.message.includes('Unique constraint')) {
          stats.churchesSkipped++;
          continue;
        }
        // Log and move on rather than rethrowing: a throw here would abandon
        // every remaining locality of this diocese.
        stats.errors++;
        console.error(` Error updating ${idfixe}: ${error instanceof Error ? error.message : error}`);
        continue;
      }
      if (schedules.length > 0) {
        try {
          // Replace the church's schedules atomically.
          await prisma.$transaction(async (tx) => {
            await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.massSchedule.createMany({
              data: schedules.map((s) => ({
                churchId: duplicate.id,
                dayOfWeek: s.dayOfWeek,
                time: s.time,
                language: 'French',
              })),
            });
            await tx.church.update({
              where: { id: duplicate.id },
              data: { lastScrapedAt: new Date() },
            });
          });
          stats.schedulesCreated += schedules.length;
        } catch (error) {
          stats.errors++;
          console.error(` Error saving schedules for ${idfixe}: ${error instanceof Error ? error.message : error}`);
        }
      }
    } else {
      try {
        const newChurch = await prisma.church.create({
          data: {
            name: locality.name,
            latitude: locality.latitude,
            longitude: locality.longitude,
            address: locality.address,
            zip: locality.zipcode,
            city: locality.city,
            // Every locality from this source is stored under 'FR', overseas zip codes included.
            country: 'FR',
            diocese: locality.sector || undefined,
            messesInfoId: idfixe,
            source: 'messes-info',
            websiteLanguage: 'fr',
          },
        });
        stats.churchesCreated++;
        // Make the new church visible to the matcher for the rest of the run.
        existingChurches.push({
          id: newChurch.id,
          name: locality.name,
          latitude: locality.latitude,
          longitude: locality.longitude,
          osmId: null,
          baiduId: null,
          masstimesId: null,
          orarimesseId: null,
          massSchedulesPhId: null,
          philmassId: null,
          horariosMisasId: null,
          mszeInfoId: null,
          weekdayMassesId: null,
          messesInfoId: idfixe,
          bohosluzbyId: null,
          miserendId: null,
          kerknetId: null,
          gottesdienstzeitenId: null,
          discovermassId: null,
          source: 'messes-info',
          website: null,
          phone: null,
          address: locality.address,
        });
        if (schedules.length > 0) {
          await prisma.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: newChurch.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'French',
            })),
          });
          await prisma.church.update({
            where: { id: newChurch.id },
            data: { lastScrapedAt: new Date() },
          });
          stats.schedulesCreated += schedules.length;
        }
      } catch (error) {
        if (error instanceof Error && error.message.includes('Unique constraint')) {
          stats.churchesSkipped++;
          continue;
        }
        stats.errors++;
        console.error(` Error creating ${idfixe}: ${error instanceof Error ? error.message : error}`);
      }
    }
  }
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse process.argv into CLIArgs.
 *
 * Prints usage and exits 0 for --help/-h. Exits 1 when --resume-from is not
 * followed by a non-negative decimal integer, or when neither --all nor
 * --diocese was given. Unrecognized arguments are ignored.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };
  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i] ?? '';
        // Reject a missing or malformed value instead of silently ignoring it as NaN.
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--diocese':
        result.diocese = args[++i];
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-messesinfo.ts [options]
Options:
--all Import all dioceses
--diocese <code> Import a single diocese (e.g., pa for Paris)
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N dioceses
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-messesinfo.ts --diocese pa --dry-run
npx tsx scripts/import-messesinfo.ts --all --dry-run
npx tsx scripts/import-messesinfo.ts --all
`);
        process.exit(0);
    }
  }
  if (!result.all && !result.diocese) {
    console.error('Error: specify --all or --diocese <code>');
    process.exit(1);
  }
  return result;
}
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", dropping
 * leading units that are zero. Sub-second remainders are truncated.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);

  const parts: string[] = [];
  if (totalHours > 0) parts.push(`${totalHours}h`);
  if (totalHours > 0 || totalMinutes > 0) parts.push(`${totalMinutes % 60}m`);
  // Seconds wrap at 60 only when a larger unit is printed in front of them.
  parts.push(`${parts.length > 0 ? totalSeconds % 60 : totalSeconds}s`);
  return parts.join(' ');
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Run the messes.info import: print the banner, mark the background job as
 * running (when --job-id was given), load existing French churches, process
 * each selected diocese in order, print the summary and record the result on
 * the background job.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const rule = '='.repeat(70);
  const { dryRun, jobId, resumeFrom } = args;
  const onlyDiocese = args.diocese;

  console.log('\n' + rule);
  console.log('MESSES.INFO (FRANCE) IMPORTER');
  console.log(rule);
  console.log(`Mode: ${onlyDiocese ? `Diocese ${onlyDiocese}` : 'All dioceses'}`);
  console.log(`Dry run: ${dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (resumeFrom) console.log(`Resume from: diocese index ${resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(rule + '\n');

  if (jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch { /* Job might not exist */ }
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    localitiesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };
  const existingChurches = await loadExistingFrenchChurches();

  // A single --diocese takes precedence over the full list and over --resume-from.
  let dioceses = onlyDiocese ? [onlyDiocese] : [...DIOCESE_CODES];
  if (onlyDiocese && !DIOCESE_CODES.includes(onlyDiocese)) {
    console.log(`Warning: diocese "${onlyDiocese}" not in known list, trying anyway...`);
  }
  if (resumeFrom && !onlyDiocese) {
    dioceses = dioceses.slice(resumeFrom);
    console.log(`Resuming from diocese index ${resumeFrom} (${dioceses[0]})\n`);
  }
  console.log(`Processing ${dioceses.length} dioceses\n`);

  for (const [index, code] of dioceses.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${index + 1}/${dioceses.length}] Diocese "${code}" [${elapsed} elapsed]`);
    try {
      await processDiocese(code, existingChurches, dryRun, stats);
    } catch (error) {
      // One failing diocese must not stop the remaining ones.
      stats.errors++;
      console.error(` ERROR processing diocese ${code}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + rule);
  console.log(`IMPORT SUMMARY ${dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`Localities found: ${stats.localitiesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log(rule + '\n');

  if (jobId) {
    const finalStatus = stats.errors > 0 ? 'completed_with_errors' : 'completed';
    try {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: finalStatus,
          completedAt: new Date(),
          processed: stats.localitiesFound,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch { /* Ignore */ }
  }
}
// Script entry point. On a fatal error the exit status is set through
// process.exitCode rather than process.exit(), so the finally handler still
// closes the Prisma client and the pg pool before the process ends.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });

579
scripts/import-miserend.ts Normal file
View File

@@ -0,0 +1,579 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from miserend.hu (Hungary)
*
* miserend.hu is the Hungarian Catholic mass schedule database, maintained by
* the community with ~5,055 churches (mostly Hungary, some Romania/Slovakia).
* It publishes a daily-updated SQLite database at:
* https://miserend.hu/fajlok/sqlite/miserend_v4.sqlite3
*
* The SQLite contains:
* - templomok: churches (tid, nev, lat, lng, varos, cim, orszag, megye)
* - misek: date-specific mass entries (tid, ido, datumtol, datumig, nyelv)
* - kepek: church photos
*
* Import strategy:
* 1. Download the SQLite database
* 2. Extract all churches with coordinates
* 3. Deduce weekly recurring schedules from date-specific entries
* 4. Match against existing churches via church-matcher
* 5. Upsert churches and mass schedules
*
* Usage:
* npx tsx scripts/import-miserend.ts --all --dry-run
* npx tsx scripts/import-miserend.ts --all
* npx tsx scripts/import-miserend.ts --id 37 --dry-run # Single church
* npx tsx scripts/import-miserend.ts --all --resume-from 500
*/
import dotenv from 'dotenv';
import path from 'path';
import fs from 'fs';
import { execFileSync } from 'child_process';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL from the environment, otherwise a local development default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target database with the `:secret@` segment of the URL replaced by `:***@`.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// TLS is enabled only when the URL contains "neon"; `rejectUnauthorized: false`
// turns off server certificate verification for that connection.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client running on top of the pg pool through the PrismaPg driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Remote location of the miserend SQLite dump downloaded at the start of each run.
const SQLITE_URL = 'https://miserend.hu/fajlok/sqlite/miserend_v4.sqlite3';
// Local file the dump is written to and that every sqlite3 query reads from.
const SQLITE_PATH = '/tmp/miserend_v4.sqlite3';
// Mapping from the country name stored in the `orszag` column to an ISO 3166-1
// alpha-2 code. The keys are the literal strings found in the data (Hungarian and
// native-language spellings). Callers fall back to 'HU' for any name not listed here.
const COUNTRY_MAP: Record<string, string> = {
'Magyarország': 'HU',
'România': 'RO',
'Slovensko': 'SK',
'Szlovákia': 'SK',
'Szerbia-Montenegro': 'RS',
'Србија': 'RS',
'Ukrajna': 'UA',
'Україна': 'UA',
'Österreich': 'AT',
'Schweiz/Suisse/Svizzera/Svizra': 'CH',
'België / Belgique / Belgien': 'BE',
'Éire / Ireland': 'IE',
'Россия': 'RU',
};
// ─── Types ───────────────────────────────────────────────────────────────────
/** One row of the `templomok` (churches) table, as read by loadChurches. */
interface MiserendChurch {
// Church id in miserend; stored on the church record as miserendId (stringified).
tid: number;
// Church name; used as the church's `name`.
nev: string;
// Read from the table but not used by the import logic — presumably an
// alternative/common name (TODO confirm).
ismertnev: string | null;
// Country name; translated to an ISO code through COUNTRY_MAP.
orszag: string | null;
// Stored as the church's `state`.
megye: string | null;
// Stored as the church's `city`.
varos: string | null;
// Stored as the church's `address`.
cim: string | null;
// Coordinates; rows with NULL or zero coordinates are excluded by the query.
lat: number;
lng: number;
}
/** One row of the `misek` (masses) table, as read by loadMassesForChurch. */
interface MiserendMass {
// Mass row id; rows whose id does not parse as a number are dropped.
mid: number;
// Id of the church (templomok.tid) this mass belongs to.
tid: number;
// Start date; this is the value the weekday is derived from.
datumtol: number; // MMDD format
// End date — presumably also MMDD (TODO confirm); not used by the schedule deduction.
datumig: number;
ido: string; // HH:MM:SS
// Language marker from the source; not used by the visible import logic.
nyelv: string | null;
}
/** A recurring weekly mass slot deduced from dated mass entries. */
interface ParsedSchedule {
// Weekday as returned by Date#getDay(): 0 = Sunday … 6 = Saturday.
dayOfWeek: number;
// Time of day as "HH:MM".
time: string;
}
/** Running counters for one importer run; printed in the final summary. */
interface ImportStats {
// Churches with coordinates read from the SQLite dump.
churchesFetched: number;
// Churches matched to a record already in the database.
churchesMatched: number;
// Churches inserted as new records (or that would be, in dry-run mode).
churchesCreated: number;
// Churches skipped because of a unique-constraint conflict.
churchesSkipped: number;
// Mass schedule rows written.
schedulesCreated: number;
// Failed database writes and other per-church failures.
errors: number;
}
/** Parsed command-line options for the miserend importer. */
interface CLIArgs {
// --all: import every church in the dump.
all: boolean;
// --dry-run: report what would happen without writing to the database.
dryRun: boolean;
// --resume-from <n>: number of leading churches to skip.
resumeFrom?: number;
// --id <tid>: import only the church with this miserend id (kept as a string).
churchId?: string;
// --job-id <uuid>: backgroundJob row to update with status and results.
jobId?: string;
}
// ─── SQLite Helpers ──────────────────────────────────────────────────────────
/**
 * Run one SQL statement against the downloaded dump with the `sqlite3` CLI and
 * return its trimmed standard output (one row per line, columns separated by
 * sqlite3's default "|").
 *
 * Best effort: when the command fails (missing binary, bad SQL, unreadable
 * file, output above 100MB) the failure is logged and an empty string is
 * returned, which callers treat as "no rows".
 *
 * @param query SQL text passed to sqlite3 as a single argument (no shell involved).
 * @returns Raw query output, or '' on failure.
 */
function sqliteQuery(query: string): string {
  try {
    return execFileSync('sqlite3', [SQLITE_PATH, query], {
      encoding: 'utf-8',
      maxBuffer: 100 * 1024 * 1024, // 100MB
    }).trim();
  } catch (error) {
    // Surface the cause; otherwise a broken sqlite3 setup looks like an empty database.
    console.error(` sqlite3 query failed: ${error instanceof Error ? error.message : error}`);
    return '';
  }
}
/**
 * Download the miserend SQLite dump to SQLITE_PATH with curl and log its size.
 *
 * curl is run with -f so an HTTP error status fails the command instead of
 * saving the error page as the database, and with -S so the reason is still
 * printed despite -s. Throws when curl exits non-zero or exceeds the
 * 120-second timeout.
 */
function downloadSqlite(): void {
  console.log('Downloading miserend SQLite database...');
  execFileSync('curl', ['-fsSL', '-o', SQLITE_PATH, SQLITE_URL], { timeout: 120000 });
  const size = fs.statSync(SQLITE_PATH).size;
  console.log(`Downloaded ${(size / 1024 / 1024).toFixed(1)}MB`);
}
/**
 * Read every church that has non-null, non-zero coordinates from the
 * `templomok` table and convert the pipe-separated sqlite3 output into objects.
 * Rows whose id or coordinates do not parse as numbers are dropped.
 *
 * NOTE(review): rows are split on "\n" and "|", so a text column containing
 * either character would shift or break that row — verify against the data.
 *
 * @returns Parsed churches; empty when the query produced no output.
 */
function loadChurches(): MiserendChurch[] {
  const output = sqliteQuery(
    "SELECT tid, nev, ismertnev, orszag, megye, varos, cim, lat, lng FROM templomok WHERE lat IS NOT NULL AND lng IS NOT NULL AND lat != 0 AND lng != 0;"
  );
  if (!output) return [];

  const churches: MiserendChurch[] = [];
  for (const row of output.split('\n')) {
    // Column order follows the SELECT list above.
    const cols = row.split('|');
    const tid = parseInt(cols[0]);
    const lat = parseFloat(cols[7]);
    const lng = parseFloat(cols[8]);
    if (isNaN(tid) || isNaN(lat) || isNaN(lng)) continue;
    churches.push({
      tid,
      nev: cols[1] || '',
      ismertnev: cols[2] || null,
      orszag: cols[3] || null,
      megye: cols[4] || null,
      varos: cols[5] || null,
      cim: cols[6] || null,
      lat,
      lng,
    });
  }
  return churches;
}
/**
 * Read all `misek` rows of one church and convert the pipe-separated sqlite3
 * output into objects. Rows with an unparseable id or an empty time are dropped.
 *
 * @param tid Church id (templomok.tid); a number, so it is safe to interpolate into the SQL.
 * @returns Parsed mass entries; empty when the query produced no output.
 */
function loadMassesForChurch(tid: number): MiserendMass[] {
  const output = sqliteQuery(
    `SELECT mid, tid, datumtol, datumig, ido, nyelv FROM misek WHERE tid=${tid};`
  );
  if (!output) return [];

  const masses: MiserendMass[] = [];
  for (const row of output.split('\n')) {
    // Column order follows the SELECT list above.
    const cols = row.split('|');
    const mid = parseInt(cols[0]);
    const ido = cols[4] || '';
    if (isNaN(mid) || !ido) continue;
    masses.push({
      mid,
      tid: parseInt(cols[1]),
      datumtol: parseInt(cols[2]),
      datumig: parseInt(cols[3]),
      ido,
      nyelv: cols[5] || null,
    });
  }
  return masses;
}
// ─── Schedule Deduction ──────────────────────────────────────────────────────
/**
 * Deduce weekly recurring schedule from date-specific mass entries.
 * Each entry has datumtol/datumig in MMDD format (e.g., 104 = Jan 4).
 * We convert each start date (datumtol) to a day of week in the current year
 * and collect unique day+time combos.
 *
 * Entries are skipped when the time is empty or "00:00", or when datumtol is
 * missing (NaN) or does not name a real calendar day of the current year
 * (e.g. 0230, or 0229 outside a leap year).
 *
 * @param masses Mass entries of a single church.
 * @returns Unique (dayOfWeek, time) slots in first-seen order.
 */
function deduceSchedules(masses: MiserendMass[]): ParsedSchedule[] {
  const seen = new Set<string>();
  const schedules: ParsedSchedule[] = [];
  // Use current year for date conversion
  const year = new Date().getFullYear();
  for (const mass of masses) {
    const time = mass.ido.substring(0, 5); // HH:MM from HH:MM:SS
    if (!time || time === '00:00') continue;
    // Convert MMDD to a Date to get day of week
    const mmdd = mass.datumtol;
    const month = Math.floor(mmdd / 100);
    const day = mmdd % 100;
    if (month < 1 || month > 12 || day < 1 || day > 31) continue;
    const date = new Date(year, month - 1, day);
    // Date silently rolls impossible days into the next month and turns NaN
    // into an Invalid Date; neither may produce a schedule entry.
    if (date.getMonth() !== month - 1 || date.getDate() !== day) continue;
    const dayOfWeek = date.getDay(); // 0=Sun, 1=Mon, ..., 6=Sat
    const key = `${dayOfWeek}:${time}`;
    if (!seen.has(key)) {
      seen.add(key);
      schedules.push({ dayOfWeek, time });
    }
  }
  return schedules;
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Load every church whose country is in `countryCodes`, selecting the fields
 * passed on to findDuplicateChurch as ExistingChurch records: coordinates, name,
 * all per-source external ids, and the contact fields.
 *
 * @param countryCodes Country codes to load (matched against the `country` column).
 * @returns The selected church rows.
 */
async function loadExistingChurches(countryCodes: string[]): Promise<ExistingChurch[]> {
console.log(`Loading existing churches for countries: ${countryCodes.join(', ')}...`);
const churches = await prisma.church.findMany({
where: { country: { in: countryCodes } },
// The selection is kept as an inline literal; the rows are returned as ExistingChurch[].
select: {
id: true,
name: true,
latitude: true,
longitude: true,
// External identifiers, one per upstream data source.
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing churches`);
return churches;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Import a single miserend church.
 *
 * Looks for an existing church via findDuplicateChurch. In dry-run mode only
 * the matched/created counters are updated. Otherwise the church's masses are
 * read from SQLite and reduced to weekly slots, and then either
 *   - the matching church is linked (miserendId, missing address) and its
 *     schedules are replaced, or
 *   - a new church is created with its schedules and appended to
 *     `existingChurches` so later candidates can match against it.
 *
 * A unique-constraint failure counts as skipped. Any other failure while
 * linking an existing church is rethrown to the caller; failures while
 * creating a church or saving schedules are logged and counted in `stats.errors`.
 *
 * @param church           Church row from the SQLite dump.
 * @param existingChurches In-memory dedup list; mutated when a church is created.
 * @param dryRun           When true, nothing is written and no masses are loaded.
 * @param stats            Counters, mutated in place.
 */
async function processChurch(
  church: MiserendChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  const miserendId = String(church.tid);
  // Unknown or missing country names default to Hungary.
  const country = (church.orszag && COUNTRY_MAP[church.orszag]) || 'HU';
  const candidate = {
    name: church.nev,
    lat: church.lat,
    lng: church.lng,
    miserendId,
  };
  const match = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (match) {
      stats.churchesMatched++;
    } else {
      stats.churchesCreated++;
    }
    return;
  }

  // Deduce schedules
  const schedules: ParsedSchedule[] = deduceSchedules(loadMassesForChurch(church.tid));

  if (!match) {
    // No existing record: create the church, then its schedules.
    try {
      const created = await prisma.church.create({
        data: {
          name: church.nev,
          latitude: church.lat,
          longitude: church.lng,
          address: church.cim,
          city: church.varos,
          state: church.megye,
          country,
          miserendId,
          source: 'miserend',
          websiteLanguage: 'hu',
        },
      });
      stats.churchesCreated++;
      existingChurches.push({
        id: created.id,
        name: church.nev,
        latitude: church.lat,
        longitude: church.lng,
        osmId: null,
        baiduId: null,
        masstimesId: null,
        orarimesseId: null,
        massSchedulesPhId: null,
        philmassId: null,
        horariosMisasId: null,
        mszeInfoId: null,
        weekdayMassesId: null,
        messesInfoId: null,
        bohosluzbyId: null,
        miserendId,
        kerknetId: null,
        gottesdienstzeitenId: null,
        discovermassId: null,
        source: 'miserend',
        website: null,
        phone: null,
        address: church.cim,
      });
      if (schedules.length > 0) {
        await prisma.massSchedule.createMany({
          data: schedules.map((slot) => ({
            churchId: created.id,
            dayOfWeek: slot.dayOfWeek,
            time: slot.time,
            language: 'Hungarian',
          })),
        });
        await prisma.church.update({
          where: { id: created.id },
          data: { lastScrapedAt: new Date() },
        });
        stats.schedulesCreated += schedules.length;
      }
    } catch (err) {
      if (err instanceof Error && err.message.includes('Unique constraint')) {
        stats.churchesSkipped++;
        return;
      }
      stats.errors++;
      console.error(` Error creating ${miserendId}: ${err instanceof Error ? err.message : err}`);
    }
    return;
  }

  // Existing record: link it to miserend and fill in a missing address.
  stats.churchesMatched++;
  const updateData: Record<string, unknown> = { miserendId };
  if (!match.address && church.cim) updateData.address = church.cim;
  try {
    await prisma.church.update({
      where: { id: match.id },
      data: updateData,
    });
  } catch (err) {
    if (err instanceof Error && err.message.includes('Unique constraint')) {
      stats.churchesSkipped++;
      return;
    }
    throw err;
  }

  if (schedules.length === 0) return;
  try {
    // Replace the church's schedules atomically.
    await prisma.$transaction(async (tx) => {
      await tx.massSchedule.deleteMany({ where: { churchId: match.id } });
      await tx.massSchedule.createMany({
        data: schedules.map((slot) => ({
          churchId: match.id,
          dayOfWeek: slot.dayOfWeek,
          time: slot.time,
          language: 'Hungarian',
        })),
      });
      await tx.church.update({
        where: { id: match.id },
        data: { lastScrapedAt: new Date() },
      });
    });
    stats.schedulesCreated += schedules.length;
  } catch (err) {
    stats.errors++;
    console.error(` Error saving schedules for ${miserendId}: ${err instanceof Error ? err.message : err}`);
  }
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse process.argv into CLIArgs.
 *
 * Prints usage and exits 0 for --help/-h. Exits 1 when --resume-from is not
 * followed by a non-negative decimal integer, or when neither --all nor --id
 * was given. Unrecognized arguments are ignored.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };
  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i] ?? '';
        // Reject a missing or malformed value instead of silently ignoring it as NaN.
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--id':
        result.churchId = args[++i];
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-miserend.ts [options]
Options:
--all Import all churches
--id <tid> Import a single church by miserend ID
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N churches
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-miserend.ts --id 37 --dry-run
npx tsx scripts/import-miserend.ts --all --dry-run
npx tsx scripts/import-miserend.ts --all
`);
        process.exit(0);
    }
  }
  if (!result.all && !result.churchId) {
    console.error('Error: specify --all or --id <miserend_tid>');
    process.exit(1);
  }
  return result;
}
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", dropping
 * leading units that are zero. Sub-second remainders are truncated.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);

  if (totalHours <= 0 && totalMinutes <= 0) {
    return `${totalSeconds}s`;
  }
  const tail = `${totalSeconds % 60}s`;
  return totalHours > 0
    ? `${totalHours}h ${totalMinutes % 60}m ${tail}`
    : `${totalMinutes}m ${tail}`;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Run the miserend.hu import: print the banner, mark the background job as
 * running (when --job-id was given), download the SQLite dump, load its
 * churches, load existing database churches for the countries present,
 * process each church in order, print the summary and record the result on
 * the background job.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const rule = '='.repeat(70);
  const { dryRun, jobId, resumeFrom } = args;
  const onlyChurchId = args.churchId;

  console.log('\n' + rule);
  console.log('MISEREND.HU (HUNGARY) IMPORTER');
  console.log(rule);
  console.log(`Mode: ${onlyChurchId ? `Church ID ${onlyChurchId}` : 'All churches'}`);
  console.log(`Dry run: ${dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (resumeFrom) console.log(`Resume from: church index ${resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(rule + '\n');

  if (jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch { /* Job might not exist */ }
  }

  const stats: ImportStats = {
    churchesFetched: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  // Download SQLite database
  downloadSqlite();

  // Load churches from SQLite
  let churches = loadChurches();
  stats.churchesFetched = churches.length;
  console.log(`Found ${churches.length} churches with coordinates in SQLite\n`);

  if (onlyChurchId) {
    churches = churches.filter((c) => String(c.tid) === onlyChurchId);
    if (churches.length === 0) {
      console.error(`Church ID ${onlyChurchId} not found in SQLite database`);
      return;
    }
  }

  // Get unique country codes from the data (first-seen order)
  const codeSet = new Set<string>();
  for (const c of churches) {
    codeSet.add(c.orszag ? (COUNTRY_MAP[c.orszag] || 'HU') : 'HU');
  }
  const countryCodes = [...codeSet];
  const existingChurches = await loadExistingChurches(countryCodes);

  if (resumeFrom) {
    churches = churches.slice(resumeFrom);
    console.log(`Resuming from index ${resumeFrom} (${churches.length} remaining)\n`);
  }
  console.log(`Processing ${churches.length} churches\n`);

  for (const [index, church] of churches.entries()) {
    // Progress line every 200 churches.
    if (index % 200 === 0) {
      const elapsed = formatDuration(Date.now() - startTime);
      console.log(`[${index + 1}/${churches.length}] Processing ${church.nev} (${church.tid}) [${elapsed} elapsed]`);
    }
    try {
      await processChurch(church, existingChurches, dryRun, stats);
    } catch (error) {
      // One failing church must not stop the remaining ones.
      stats.errors++;
      console.error(` ERROR processing church ${church.tid}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + rule);
  console.log(`IMPORT SUMMARY ${dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`Churches in SQLite: ${stats.churchesFetched}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(rule + '\n');

  if (jobId) {
    const finalStatus = stats.errors > 0 ? 'completed_with_errors' : 'completed';
    try {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: finalStatus,
          completedAt: new Date(),
          processed: stats.churchesFetched,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch { /* Ignore */ }
  }
}
// Script entry point. On a fatal error the exit status is set through
// process.exitCode rather than process.exit(), so the finally handler still
// closes the Prisma client and the pg pool before the process ends.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });

746
scripts/import-msze-info.ts Normal file
View File

@@ -0,0 +1,746 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from msze.info (Poland)
*
* msze.info is a Polish directory of Catholic parishes with mass schedules.
* The site uses numbered sitemaps (Churches1.xml through Churches11.xml)
* with ~500 URLs each, containing both /kosciol/{id} (church pages) and
* /msze-online/{slug} (livestream pages).
*
* Import strategy:
* 1. Fetch all 11 sitemaps → extract /kosciol/{id} URLs (skip /msze-online/)
* 2. For each church: fetch HTML, parse name/address/phone/website/schedule
* 3. Extract coordinates from embedded tomtom_codeAddress() JS call
* 4. Match against existing PL churches, upsert
*
* Usage:
* npx tsx scripts/import-msze-info.ts --all
* npx tsx scripts/import-msze-info.ts --all --dry-run
* npx tsx scripts/import-msze-info.ts --all --resume-from 500
* npx tsx scripts/import-msze-info.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL when set, otherwise a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target database with the ":password@" part of the URL masked.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// SSL is only enabled when the URL contains "neon".
// NOTE(review): rejectUnauthorized: false disables certificate verification — confirm this is intended.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma is driven through the pg adapter built on the pool above.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Origin used to build sitemap and church page URLs.
const SITE_BASE = 'https://www.msze.info';
// Number of numbered sitemap files (Churches1.xml … Churches<N>.xml) that are fetched.
const SITEMAP_COUNT = 11;
// User-Agent header sent with every HTTP request.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Default pause in milliseconds applied before every request except the first.
const REQUEST_DELAY_MS = 1500;
// ─── Types ───────────────────────────────────────────────────────────────────
/** Church details extracted from a single church page by parseChurchPage(). */
interface ParsedChurch {
// Church name; empty string when the page has no <h1>.
name: string;
// Free-form address text, or null when none was found.
address: string | null;
// City taken from the <h1> or, failing that, from the last comma-separated part of the address.
city: string | null;
// Postal code in XX-XXX form extracted from the address.
zip: string | null;
// Value of the first tel: link on the page.
phone: string | null;
// First external link recognised as the church website.
website: string | null;
// Currently always null: the parser does not extract e-mail addresses.
email: string | null;
// Coordinates; both stay 0 when no usable coordinates were found on the page.
latitude: number;
longitude: number;
}
/** One weekly mass slot (day + time) produced by parseMassSchedule(). */
interface ParsedSchedule {
dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
time: string; // "05:00", "18:30"
}
/** Counters accumulated over one importer run and printed in the final summary. */
interface ImportStats {
// Church IDs handed to processChurch().
churchesFound: number;
// Churches for which findDuplicateChurch() returned an existing record.
churchesMatched: number;
// Churches inserted as new records (or that would be, in dry-run mode).
churchesCreated: number;
// Churches skipped: no name on the page, or a unique-constraint conflict on write.
churchesSkipped: number;
// Churches for which at least one mass time was parsed and handled.
schedulesProcessed: number;
// Total number of individual mass schedule rows written (or counted in dry-run mode).
massSchedulesCreated: number;
// Failed page fetches, failed schedule writes and uncaught per-church exceptions.
errors: number;
}
/** Command-line options understood by this script (see parseArgs). */
interface CLIArgs {
// --all: import every church listed in the sitemaps (required).
all: boolean;
// --dry-run: report what would happen without writing to the database.
dryRun: boolean;
// --resume-from <n>: number of leading church IDs to skip.
resumeFrom?: number;
// --job-id <uuid>: backgroundJob row to update with run status.
jobId?: string;
}
// ─── HTTP Client ─────────────────────────────────────────────────────────────
// Number of HTTP requests issued so far during this run.
let requestCount = 0;

/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Downloads `url` and returns the response body as text.
 *
 * Every call except the first one of the run first waits `delayMs`
 * milliseconds, so consecutive requests are spaced out.
 *
 * @param url Absolute URL to fetch.
 * @param delayMs Pause before the request; defaults to REQUEST_DELAY_MS.
 * @returns The body text, or `null` on a non-2xx status or a network error
 *          (both are logged, never thrown).
 */
async function fetchPage(url: string, delayMs: number = REQUEST_DELAY_MS): Promise<string | null> {
  const mustThrottle = requestCount > 0;
  if (mustThrottle) {
    await delay(delayMs);
  }
  requestCount += 1;

  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const response = await fetch(url, { headers });
    if (response.ok) {
      return await response.text();
    }
    console.error(` HTTP ${response.status} for ${url}`);
    return null;
  } catch (error) {
    const reason = error instanceof Error ? error.message : error;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
// ─── Sitemap Parser ──────────────────────────────────────────────────────────
/**
 * Collects the numeric church IDs referenced by the numbered sitemaps.
 *
 * Only `<loc>` entries of the form `/kosciol/{id}` are considered; other
 * sitemap entries (e.g. /msze-online/) do not match and are ignored.
 * A sitemap that cannot be fetched is logged and skipped.
 *
 * @returns Unique church IDs as strings, sorted by numeric value.
 */
async function fetchChurchUrlsFromSitemaps(): Promise<string[]> {
  // A Set keeps first-seen order while dropping repeated IDs.
  const uniqueIds = new Set<string>();

  for (let n = 1; n <= SITEMAP_COUNT; n++) {
    const sitemapUrl = `${SITE_BASE}/sitemap/Churches${n}.xml`;
    console.log(` Fetching ${sitemapUrl}...`);

    const xml = await fetchPage(sitemapUrl);
    if (!xml) {
      console.error(` Failed to fetch ${sitemapUrl}`);
      continue;
    }

    const churchLoc = /<loc>https?:\/\/(?:www\.)?msze\.info\/kosciol\/(\d+)<\/loc>/g;
    for (const hit of xml.matchAll(churchLoc)) {
      uniqueIds.add(hit[1]);
    }
  }

  // Numeric sort gives a deterministic processing order between runs.
  const sortedIds = [...uniqueIds];
  sortedIds.sort((left, right) => parseInt(left) - parseInt(right));

  console.log(`Found ${sortedIds.length} unique church IDs from ${SITEMAP_COUNT} sitemaps`);
  return sortedIds;
}
// ─── HTML Parsers ────────────────────────────────────────────────────────────
/**
 * Extracts church details from the HTML of a msze.info `/kosciol/{id}` page.
 *
 * Parsing is regex-based and best-effort: a field that cannot be found is
 * returned as `null` (`''` for the name, `0` for both coordinates).
 *
 * @param html Raw HTML of the church page.
 * @returns Parsed name, address parts, contact details and coordinates.
 */
function parseChurchPage(html: string): ParsedChurch {
  // Name: from <h1>Church Name, City</h1>
  const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  let name = '';
  let cityFromH1: string | null = null;
  if (h1Match) {
    const raw = h1Match[1].replace(/<[^>]+>/g, '').trim();
    // Split "Church Name, City" — city is the last comma-separated part
    const lastComma = raw.lastIndexOf(',');
    if (lastComma > 0) {
      name = raw.substring(0, lastComma).trim();
      cityFromH1 = raw.substring(lastComma + 1).trim();
    } else {
      name = raw;
    }
  }

  // Address: look for "Adres:" or address-like patterns
  // Pattern: <span class="highlight">Adres:</span> <strong>Street, City</strong>
  let address: string | null = null;
  let city: string | null = cityFromH1;
  let zip: string | null = null;
  const addressMatch = html.match(/Adres:<\/span>\s*(?:<strong>)?([\s\S]*?)(?:<\/strong>|<br|<\/p)/i);
  if (addressMatch) {
    address = addressMatch[1]
      .replace(/<[^>]+>/g, '')
      .replace(/\s+/g, ' ')
      .trim() || null;
  }
  // Also try the tomtom_codeAddress first argument as fallback address
  if (!address) {
    const tomtomAddrMatch = html.match(/tomtom_codeAddress\s*\(\s*'([^']+)'/);
    if (tomtomAddrMatch) {
      address = tomtomAddrMatch[1].trim() || null;
    }
  }
  if (address) {
    // Extract Polish postal code (XX-XXX format)
    const zipMatch = address.match(/\b(\d{2}-\d{3})\b/);
    if (zipMatch) {
      zip = zipMatch[1];
    }
    // Extract city from address if not already from h1
    if (!city) {
      // City is typically the last part after comma
      const parts = address.split(',');
      if (parts.length > 1) {
        city = parts[parts.length - 1].replace(/\d{2}-\d{3}/, '').trim() || null;
      }
    }
  }

  // Coordinates: from tomtom_codeAddress('addr', zoom, 'name', null, lat, lng)
  // The address and name arguments are quoted strings that routinely contain
  // commas ("Street, City"), so each one is matched as a whole quoted string
  // first; a plain run of non-comma text is kept as a fallback for unquoted
  // arguments. Matching them with [^,]+ alone would fail on any embedded comma
  // and silently leave the church at 0/0.
  let latitude = 0;
  let longitude = 0;
  const coordMatch = html.match(
    /tomtom_codeAddress\s*\(\s*(?:'(?:[^'\\]|\\.)*'|[^,]+)\s*,\s*\d+\s*,\s*(?:'(?:[^'\\]|\\.)*'|[^,]+)\s*,\s*(?:null|'[^']*')\s*,\s*(-?[\d.]+)\s*,\s*(-?[\d.]+)\s*\)/
  );
  if (coordMatch) {
    const lat = parseFloat(coordMatch[1]);
    const lng = parseFloat(coordMatch[2]);
    // Reject unparsable values and the 0 placeholder.
    if (!isNaN(lat) && !isNaN(lng) && lat !== 0 && lng !== 0) {
      latitude = lat;
      longitude = lng;
    }
  }

  // Phone: <a href="tel:...">
  let phone: string | null = null;
  const phoneMatch = html.match(/<a\s+href="tel:([^"]+)"/i);
  if (phoneMatch) {
    phone = phoneMatch[1].trim() || null;
  }

  // Website: look for external link near "Witryna" text
  let website: string | null = null;
  const websiteMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>[^<]*Witryna/i);
  if (websiteMatch) {
    website = websiteMatch[1].trim() || null;
  }
  // Also try: link text that looks like a URL (www.xxx)
  if (!website) {
    const wwwMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>www\.[^<]+<\/a>/i);
    if (wwwMatch) {
      website = wwwMatch[1].trim() || null;
    }
  }

  // Email: not reliably available (Cloudflare-protected)
  const email: string | null = null;

  return { name, address, city, zip, phone, website, email, latitude, longitude };
}
/**
 * Extracts the weekly mass schedule from a church page.
 *
 * The page is cut into sections, each starting at an <h2>–<h4> heading and
 * running up to the next heading, <footer>, <script> or the end of the
 * document. Only sections whose heading text starts with "MSZE" are used:
 * the heading decides the weekdays and the section body supplies the times.
 *
 * @param html Raw HTML of the church page.
 * @returns One entry per distinct (day, time) pair, in discovery order.
 */
function parseMassSchedule(html: string): ParsedSchedule[] {
  const headingWithBody = /<h[2-4][^>]*>([\s\S]*?)<\/h[2-4]>([\s\S]*?)(?=<h[2-4]|<footer|<script|$)/gi;
  const emitted = new Set<string>();
  const result: ParsedSchedule[] = [];

  for (const [, rawHeading, body] of html.matchAll(headingWithBody)) {
    // Strip inline markup and normalise case before inspecting the heading.
    const heading = rawHeading.replace(/<[^>]+>/g, '').trim().toUpperCase();
    if (!heading.startsWith('MSZE')) {
      continue;
    }

    const days = resolvePolishDays(heading);
    if (days.length === 0) {
      continue;
    }

    const times = extractTimes(body);
    days.forEach((dayOfWeek) => {
      times.forEach((time) => {
        const key = `${dayOfWeek}:${time}`;
        if (!emitted.has(key)) {
          emitted.add(key);
          result.push({ dayOfWeek, time });
        }
      });
    });
  }

  return result;
}
/**
 * Maps an upper-cased Polish schedule heading to the weekdays it covers.
 *
 * Rules are checked in order and the first matching rule wins, so a heading
 * mentioning Sundays is treated as Sunday-only even if it names other days.
 *
 * @param heading Heading text, already upper-cased by the caller.
 * @returns Day numbers (0 = Sunday … 6 = Saturday); empty when nothing matches.
 */
function resolvePolishDays(heading: string): number[] {
  const rules: ReadonlyArray<readonly [readonly string[], readonly number[]]> = [
    // "NIEDZIELE I ŚWIĘTA" or just "NIEDZIEL…" → Sunday
    [['NIEDZIEL'], [0]],
    // "DNI POWSZEDNIE" → weekdays, Monday through Saturday
    [['DNI POWSZEDNIE', 'POWSZEDNI'], [1, 2, 3, 4, 5, 6]],
    // Individual day names (rare but possible)
    [['PONIEDZIA'], [1]],
    [['WTOREK', 'WTORK'], [2]],
    [['ŚRODA', 'SRODA', 'ŚROD'], [3]],
    [['CZWARTEK', 'CZWART'], [4]],
    [['PIĄTEK', 'PIATEK', 'PIĄT'], [5]],
    [['SOBOT'], [6]],
  ];

  for (const [needles, days] of rules) {
    if (needles.some((needle) => heading.includes(needle))) {
      return [...days];
    }
  }
  return [];
}
/**
 * Pulls mass times out of a schedule section.
 *
 * Only text that follows a "godz." marker is inspected, and only up to the
 * next opening <p>, <div>, <br> or <h2>–<h4> tag. Every H:MM / HH:MM token
 * with a valid hour (0–23) and minute (0–59) is returned zero-padded as
 * "HH:MM". Repeated times are not removed here.
 *
 * @param text HTML fragment of one schedule section.
 * @returns Times in the order they appear.
 */
function extractTimes(text: string): string[] {
  const sectionBreak = /<(?:p|div|br\s*\/?>|h[2-4])/i;
  const pad = (value: number): string => String(value).padStart(2, '0');

  // Everything before the first "godz." is discarded.
  const [, ...afterGodz] = text.split(/godz\.\s*/i);

  const found: string[] = [];
  for (const chunk of afterGodz) {
    const section = chunk.split(sectionBreak)[0];
    for (const [, rawHours, rawMins] of section.matchAll(/(\d{1,2}):(\d{2})/g)) {
      const hours = parseInt(rawHours);
      const mins = parseInt(rawMins);
      const validHour = hours >= 0 && hours <= 23;
      const validMinute = mins >= 0 && mins <= 59;
      if (validHour && validMinute) {
        found.push(`${pad(hours)}:${pad(mins)}`);
      }
    }
  }
  return found;
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Loads every church stored with country 'PL', selecting only the columns
 * needed for duplicate detection (identity, coordinates, per-source IDs and
 * a few contact fields).
 *
 * @returns The selected rows, returned as the ExistingChurch list used by findDuplicateChurch().
 */
async function loadExistingPolishChurches(): Promise<ExistingChurch[]> {
console.log('Loading existing Polish churches for deduplication...');
const churches = await prisma.church.findMany({
where: { country: 'PL' },
select: {
id: true,
name: true,
latitude: true,
longitude: true,
// One external-ID column per upstream data source.
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing Polish churches`);
return churches;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Imports a single msze.info church: fetches its page, parses it, then either
 * updates the matching existing record or creates a new one, and stores the
 * parsed mass schedule.
 *
 * @param churchId msze.info numeric ID (as a string) from the sitemap.
 * @param existingChurches In-memory dedup list; newly created churches are appended to it.
 * @param dryRun When true, only counters are updated and nothing is written.
 * @param stats Run counters, mutated in place.
 * @throws Re-throws database errors other than unique-constraint violations
 *         on the church update/create paths.
 */
async function processChurch(
churchId: string,
existingChurches: ExistingChurch[],
dryRun: boolean,
stats: ImportStats,
): Promise<void> {
stats.churchesFound++;
const url = `${SITE_BASE}/kosciol/${churchId}`;
const churchHtml = await fetchPage(url);
// fetchPage() already logged the failure; just count it.
if (!churchHtml) {
stats.errors++;
return;
}
const parsed = parseChurchPage(churchHtml);
if (!parsed.name) {
console.log(` Skipping ${churchId}: no name found`);
stats.churchesSkipped++;
return;
}
const schedules = parseMassSchedule(churchHtml);
// Build candidate for dedup
const candidate = {
name: parsed.name,
lat: parsed.latitude,
lng: parsed.longitude,
mszeInfoId: churchId,
};
// Matching rules live in church-matcher; a falsy result means "no existing record".
const duplicate = findDuplicateChurch(candidate, existingChurches);
// Dry run: update the counters as a real run would, but perform no writes.
if (dryRun) {
if (duplicate) {
stats.churchesMatched++;
console.log(` [MATCH] "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`);
} else {
stats.churchesCreated++;
console.log(` [NEW] "${parsed.name}" (${parsed.city || 'unknown city'})`);
}
if (schedules.length > 0) {
stats.schedulesProcessed++;
stats.massSchedulesCreated += schedules.length;
}
return;
}
if (duplicate) {
// Update existing church
stats.churchesMatched++;
// Always link the record to msze.info; other fields are only filled when currently empty.
const updateData: Record<string, unknown> = {
mszeInfoId: churchId,
};
if (!duplicate.address && parsed.address) updateData.address = parsed.address;
if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
if (!duplicate.website && parsed.website) {
updateData.website = parsed.website;
updateData.hasWebsite = true;
}
// Update coordinates if existing has none and we have them
if (duplicate.latitude === 0 && duplicate.longitude === 0 && parsed.latitude !== 0) {
updateData.latitude = parsed.latitude;
updateData.longitude = parsed.longitude;
}
// Fill city/zip if not set
// These columns are not part of ExistingChurch, so they are read from the database.
const dbRecord = await prisma.church.findUnique({
where: { id: duplicate.id },
select: { city: true, zip: true, email: true },
});
if (dbRecord && !dbRecord.city && parsed.city) updateData.city = parsed.city;
if (dbRecord && !dbRecord.zip && parsed.zip) updateData.zip = parsed.zip;
if (dbRecord && !dbRecord.email && parsed.email) updateData.email = parsed.email;
try {
await prisma.church.update({
where: { id: duplicate.id },
data: updateData,
});
} catch (error) {
// A unique-constraint failure is counted as a skip instead of an error.
// NOTE(review): churchesMatched was already incremented above for this church.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
return;
}
throw error;
}
// Replace mass schedules
// Delete + insert + timestamp run in one transaction; all existing schedule rows of the church are removed.
if (schedules.length > 0) {
try {
await prisma.$transaction(async (tx) => {
await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
await tx.massSchedule.createMany({
data: schedules.map((s) => ({
churchId: duplicate.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
language: 'Polish',
})),
});
await tx.church.update({
where: { id: duplicate.id },
data: { lastScrapedAt: new Date() },
});
});
stats.schedulesProcessed++;
stats.massSchedulesCreated += schedules.length;
} catch (error) {
// Schedule failures are logged and counted but do not abort the church.
stats.errors++;
console.error(` Error saving schedules for ${churchId}: ${error instanceof Error ? error.message : error}`);
}
}
} else {
// Create new church
try {
const newChurch = await prisma.church.create({
data: {
name: parsed.name,
latitude: parsed.latitude,
longitude: parsed.longitude,
address: parsed.address,
zip: parsed.zip,
city: parsed.city,
country: 'PL',
phone: parsed.phone,
website: parsed.website,
email: parsed.email,
hasWebsite: !!parsed.website,
mszeInfoId: churchId,
source: 'msze-info',
},
});
stats.churchesCreated++;
// Add to in-memory array for within-run dedup
existingChurches.push({
id: newChurch.id,
name: parsed.name,
latitude: parsed.latitude,
longitude: parsed.longitude,
osmId: null,
baiduId: null,
masstimesId: null,
orarimesseId: null,
massSchedulesPhId: null,
philmassId: null,
horariosMisasId: null,
mszeInfoId: churchId,
weekdayMassesId: null,
messesInfoId: null,
bohosluzbyId: null,
miserendId: null,
kerknetId: null,
gottesdienstzeitenId: null,
discovermassId: null,
source: 'msze-info',
website: parsed.website,
phone: parsed.phone,
address: parsed.address,
});
// Create mass schedules
// Unlike the update path, these two writes are not wrapped in a transaction.
if (schedules.length > 0) {
await prisma.massSchedule.createMany({
data: schedules.map((s) => ({
churchId: newChurch.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
language: 'Polish',
})),
});
await prisma.church.update({
where: { id: newChurch.id },
data: { lastScrapedAt: new Date() },
});
stats.schedulesProcessed++;
stats.massSchedulesCreated += schedules.length;
}
} catch (error) {
// Unique-constraint failures anywhere in this branch count as a skip; anything else propagates.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
return;
}
throw error;
}
}
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parses the command line (`process.argv`) into a CLIArgs object.
 *
 * Options that take a value (`--resume-from`, `--job-id`) are validated:
 * a missing value, a value that is itself another option, or a
 * `--resume-from` value that is not a non-negative integer prints an error
 * and exits with code 1. Without this check such input was silently turned
 * into NaN/undefined and could swallow the next flag — e.g.
 * `--resume-from --dry-run` would start a full, non-dry import.
 *
 * `--help`/`-h` prints usage and exits with code 0; omitting `--all` exits
 * with code 1.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
  };

  // Returns the value at `index`, or exits when it is absent or looks like another option.
  const requireValue = (flag: string, index: number): string => {
    const value = args[index];
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = requireValue('--resume-from', ++i);
        // Digits only: rejects negatives, fractions and trailing garbage.
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = requireValue('--job-id', ++i);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-msze-info.ts [options]
Options:
--all Import all churches from sitemaps
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N churches
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-msze-info.ts --all --dry-run
npx tsx scripts/import-msze-info.ts --all
npx tsx scripts/import-msze-info.ts --all --resume-from 500
`);
        process.exit(0);
    }
  }

  if (!result.all) {
    console.error('Error: specify --all');
    process.exit(1);
  }
  return result;
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Renders a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs",
 * omitting leading units that are zero. Fractions of a second are dropped.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const hours = Math.floor(totalMinutes / 60);

  const secondsPart = `${totalSeconds % 60}s`;
  if (hours > 0) {
    return `${hours}h ${totalMinutes % 60}m ${secondsPart}`;
  }
  return totalMinutes > 0 ? `${totalMinutes}m ${secondsPart}` : `${totalSeconds}s`;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Runs the msze.info import end to end: prints a banner, marks the optional
 * background job as running, loads existing Polish churches, collects church
 * IDs from the sitemaps, processes them one by one and finally prints a
 * summary and records the outcome on the background job.
 *
 * A failure while processing one church is logged and counted; it does not
 * stop the run.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const rule = '='.repeat(70);

  console.log('\n' + rule);
  console.log('MSZE.INFO (POLAND) IMPORTER');
  console.log(rule);
  console.log(`Mode: All churches from sitemaps`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(rule + '\n');

  // Flag the background job as running (best effort).
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      // Job might not exist yet
    }
  }

  const stats: ImportStats = {
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // Existing records are loaded once and reused for every dedup lookup.
  const existingChurches = await loadExistingPolishChurches();

  console.log('Fetching church URLs from sitemaps...');
  const allChurchIds = await fetchChurchUrlsFromSitemaps();

  // --resume-from drops the first N IDs of the sorted list.
  const churchIds = args.resumeFrom ? allChurchIds.slice(args.resumeFrom) : allChurchIds;
  if (args.resumeFrom) {
    console.log(`Resuming from index ${args.resumeFrom} (skipping ${allChurchIds.length - churchIds.length} churches)\n`);
  } else {
    console.log(`Processing ${churchIds.length} churches\n`);
  }

  for (const [index, id] of churchIds.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${index + 1}/${churchIds.length}] kosciol/${id} [${elapsed} elapsed]`);
    try {
      await processChurch(id, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing ${id}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + rule);
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log(rule + '\n');

  // Record the outcome on the background job (best effort).
  if (args.jobId) {
    const finalStatus = stats.errors > 0 ? 'completed_with_errors' : 'completed';
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: finalStatus,
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch {
      // Ignore
    }
  }
}
// Script entry point.
// On a fatal error we only record a failing exit code instead of calling
// process.exit(): process.exit() terminates the process immediately, which
// would prevent the .finally() handler below from ever running and leave the
// Prisma client and the pg pool without an orderly shutdown.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    // Runs on both success and failure.
    await prisma.$disconnect();
    await pool.end();
  });

View File

@@ -0,0 +1,771 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from OrariMesse.it
*
* OrariMesse.it is the official CEI (Italian Bishops' Conference) platform for
* mass times in Italy. It provides a public REST API organized by diocese.
*
* Import strategy:
* Pass 1: For each diocese, fetch all churches → match against existing DB
* records (by ICSC code or proximity+name) → upsert
* Pass 2: For churches with active schedules, fetch detail endpoint →
* convert 8-day rolling schedule to recurring → replace mass schedules
*
* Usage:
* npx tsx scripts/import-orarimesse.ts --all
* npx tsx scripts/import-orarimesse.ts --diocese roma
* npx tsx scripts/import-orarimesse.ts --all --dry-run
* npx tsx scripts/import-orarimesse.ts --all --schedules-only
* npx tsx scripts/import-orarimesse.ts --all --resume-from napoli
* npx tsx scripts/import-orarimesse.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL when set, otherwise a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target database with the ":password@" part of the URL masked.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// SSL is only enabled when the URL contains "neon".
// NOTE(review): rejectUnauthorized: false disables certificate verification — confirm this is intended.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma is driven through the pg adapter built on the pool above.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Base URL every API endpoint name is appended to.
const API_BASE = 'https://orarimesse.it/api';
// User-Agent header sent with every API request.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Pause (ms) before diocese-level requests (diocese list, churches of a diocese).
const DIOCESE_DELAY_MS = 2000;
// Pause (ms) before per-church detail requests.
const DETAIL_DELAY_MS = 1000;
// ─── Italian Day Map ─────────────────────────────────────────────────────────
/**
 * Lower-case Italian weekday name → day number (0 = Sunday … 6 = Saturday).
 * Accented names are listed both with and without the grave accent.
 */
const ITALIAN_DAY_MAP: Record<string, number> = {
  'domenica': 0,
  'lunedì': 1,
  'lunedi': 1,
  'martedì': 2,
  'martedi': 2,
  'mercoledì': 3,
  'mercoledi': 3,
  'giovedì': 4,
  'giovedi': 4,
  'venerdì': 5,
  'venerdi': 5,
  'sabato': 6,
};
// ─── Types ───────────────────────────────────────────────────────────────────
/** One entry of the diocese list returned by the `getDiocesi` endpoint. */
interface OrariMesseDiocese {
// NOTE(review): presumably the CEI code of the diocese — not read in the visible code.
codice_cei: string;
title: string;
// NOTE(review): presumably the value passed as `diocesi` to getListaChiese — confirm against the caller.
slug: string;
url: string;
// NOTE(review): looks like the number of churches in the diocese — not read in the visible code.
countChiese: number;
}
/** Compact church record as listed in a diocese's `listaChiese`. */
interface OrariMesseChurch {
// API-side church ID; used to request the detail endpoint and to key the idchurch → DB id map.
idchurch: number;
address: string;
name: string;
conosciutaCome: string;
isopen: boolean;
// Truthy when the church has an upcoming mass; Pass 2 only fetches schedules for such churches.
nextmass: string;
// Coordinates arrive as strings and are parsed with parseFloat().
lat: string;
lon: string;
// Website URL (stored as `website`).
sito: string;
// E-mail address (stored as `email`).
emailLdc: string;
// ICSC code; stored as `orarimesseId` and passed to the matcher.
icsc: string;
// Municipality (stored as `city`).
comune: string;
tipologia: string;
// Stored as `wheelchairAccess` for newly created churches.
accessibile: boolean;
}
/**
 * Diocese object returned by `getListaChiese`; the endpoint wraps it in a
 * single-element array (see fetchChurchesInDiocese).
 */
interface OrariMesseDioceseResponse {
codice_cei: string;
title: string;
slug: string;
countChiese: number;
// Churches belonging to the diocese.
listaChiese: OrariMesseChurch[];
}
/** A single mass within one day of the detail response. */
interface OrariMesseMass {
idmass: number;
// Start time with a dot separator, e.g. "07.00" (converted by convertTime()).
time: string;
// Free-text note; stored as the schedule's `notes` when non-empty.
noteOrarioMessa: string;
}
/** One calendar day of the rolling schedule window in the detail response. */
interface OrariMesseDay {
// Italian date label such as "Giovedì 26 Febbraio"; only the first word (weekday) is used.
day: string;
// Masses celebrated on that day.
mass: OrariMesseMass[];
}
/**
 * Church detail returned by the `getDettaglioMessa` endpoint.
 * Of these fields, the visible importer code only reads `days`.
 */
interface OrariMesseDetail {
idchurch: number;
name: string;
address: string;
lat: string;
lon: string;
icsc: string;
comune: string;
diocesi: string;
parroco: string;
telefono: string;
email: string;
sito: string;
// Rolling window of upcoming days with their masses.
days: OrariMesseDay[];
}
/** Counters accumulated over one importer run. */
interface ImportStats {
// NOTE(review): incremented outside the visible code — presumably once per diocese handled.
diocesesProcessed: number;
// Churches seen in diocese listings (Pass 1).
churchesFound: number;
// Churches for which findDuplicateChurch() returned an existing record.
churchesMatched: number;
// Churches inserted as new records (or that would be, in dry-run mode).
churchesCreated: number;
// Churches skipped: unusable coordinates or a unique-constraint conflict on write.
churchesSkipped: number;
// Churches whose detail response contained at least one day (Pass 2).
schedulesProcessed: number;
// Total number of recurring mass rows written (or counted in dry-run mode).
massSchedulesCreated: number;
// Failed schedule writes; other increments may happen outside the visible code.
errors: number;
}
/** Command-line options understood by this script (see parseArgs). */
interface CLIArgs {
// --all: import from all dioceses.
all: boolean;
// --diocese <slug>: import from a single diocese.
diocese?: string;
// --dry-run: report what would happen without writing to the database.
dryRun: boolean;
// --schedules-only: skip Pass 1 (church upsert) and only fetch schedules.
schedulesOnly: boolean;
// --resume-from <slug>: skip dioceses until this slug is reached.
resumeFrom?: string;
// --job-id <uuid>: background job tracking ID.
jobId?: string;
}
// ─── API Client ──────────────────────────────────────────────────────────────
// Number of API requests issued so far during this run.
let requestCount = 0;

/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Calls an OrariMesse API endpoint and unwraps its `{ status, code, data }`
 * envelope.
 *
 * Every call except the first one of the run first waits `delayMs`
 * milliseconds.
 *
 * @param endpoint Endpoint name appended to API_BASE.
 * @param params Query-string parameters.
 * @param delayMs Pause before the request, in milliseconds.
 * @returns `data` when the response is 2xx with `status === true` and
 *          `code === 'OK'`; otherwise `null` (the failure is logged, never thrown).
 */
async function fetchApi<T>(endpoint: string, params: Record<string, string> = {}, delayMs: number): Promise<T | null> {
  const mustThrottle = requestCount > 0;
  if (mustThrottle) {
    await delay(delayMs);
  }
  requestCount += 1;

  const url = new URL(`${API_BASE}/${endpoint}`);
  Object.entries(params).forEach(([name, value]) => {
    url.searchParams.set(name, value);
  });

  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'application/json',
  };

  try {
    const response = await fetch(url.toString(), { headers });
    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }

    const payload = await response.json() as { status: boolean; code: string; data: T };
    const succeeded = payload.status === true && payload.code === 'OK';
    if (!succeeded) {
      // Only the first 200 characters of the payload are logged.
      console.error(` API error for ${url}: ${JSON.stringify(payload).substring(0, 200)}`);
      return null;
    }
    return payload.data;
  } catch (error) {
    const reason = error instanceof Error ? error.message : error;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
/** Fetches the diocese list (`getDiocesi`); yields an empty list when the call fails. */
async function fetchDioceses(): Promise<OrariMesseDiocese[]> {
  const dioceses = await fetchApi<OrariMesseDiocese[]>('getDiocesi', {}, DIOCESE_DELAY_MS);
  return dioceses ? dioceses : [];
}
/**
 * Fetches the compact church listing of one diocese (`getListaChiese`).
 *
 * @param slug Diocese slug sent as the `diocesi` query parameter.
 * @returns The diocese object, or `null` when the call fails or returns an empty array.
 */
async function fetchChurchesInDiocese(slug: string): Promise<OrariMesseDioceseResponse | null> {
  const query = { diocesi: slug, type: 'compact' };
  const wrapped = await fetchApi<OrariMesseDioceseResponse[]>('getListaChiese', query, DIOCESE_DELAY_MS);

  // The API wraps the single diocese object in an array.
  return wrapped && wrapped.length > 0 ? wrapped[0] : null;
}
/**
 * Fetches the detail record of one church (`getDettaglioMessa`).
 *
 * @param idchurch API-side church ID.
 * @returns The detail object, or `null` when the call fails.
 */
async function fetchChurchDetail(idchurch: number): Promise<OrariMesseDetail | null> {
  const query = { idchurch: String(idchurch) };
  return fetchApi<OrariMesseDetail>('getDettaglioMessa', query, DETAIL_DELAY_MS);
}
// ─── Day/Time Conversion ─────────────────────────────────────────────────────
/**
 * Converts an Italian date label such as "Giovedì 26 Febbraio" to a day
 * number (0 = Sunday … 6 = Saturday) by looking up its first
 * space-separated word, lower-cased, in ITALIAN_DAY_MAP.
 *
 * @returns The day number, or `null` when the first word is not a known weekday.
 */
function parseItalianDay(dayString: string): number | null {
  const [weekdayWord] = dayString.split(' ');
  const dayNumber = ITALIAN_DAY_MAP[weekdayWord.toLowerCase()];
  return dayNumber ?? null;
}
/**
 * Converts an API time such as "07.00" to "07:00" by replacing the first
 * dot with a colon; any further dots are left untouched.
 */
function convertTime(time: string): string {
  const firstDot = /\./;
  return time.replace(firstDot, ':');
}
/** One weekly recurring mass derived from the API's rolling schedule. */
interface RecurringMass {
// 0 = Sunday … 6 = Saturday (see ITALIAN_DAY_MAP).
dayOfWeek: number;
// Start time as produced by convertTime(), e.g. "07:00".
time: string;
// The mass's note text, or null when the API note is empty.
notes: string | null;
}
/**
 * Collapses the API's 8-day rolling window into a weekly recurring schedule.
 *
 * The same weekday can appear twice in the window (e.g. this Thursday and
 * next Thursday), so entries are de-duplicated on weekday + time; the first
 * occurrence wins, including its note. Days whose label does not start with
 * a known weekday are ignored.
 *
 * @param days Days of the detail response, in API order.
 * @returns Unique recurring masses in first-seen order.
 */
function convertScheduleToRecurring(days: OrariMesseDay[]): RecurringMass[] {
  // Map iteration follows insertion order, which preserves first-seen ordering.
  const byDayAndTime = new Map<string, RecurringMass>();

  for (const { day: label, mass: masses } of days) {
    const dayOfWeek = parseItalianDay(label);
    if (dayOfWeek === null) {
      continue;
    }

    for (const entry of masses) {
      const time = convertTime(entry.time);
      const key = `${dayOfWeek}:${time}`;
      if (!byDayAndTime.has(key)) {
        byDayAndTime.set(key, {
          dayOfWeek,
          time,
          notes: entry.noteOrarioMessa || null,
        });
      }
    }
  }

  return [...byDayAndTime.values()];
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Loads every church stored with country 'IT', selecting only the columns
 * needed for duplicate detection (identity, coordinates, per-source IDs and
 * a few contact fields).
 *
 * @returns The selected rows, returned as the ExistingChurch list used by findDuplicateChurch().
 */
async function loadExistingItalianChurches(): Promise<ExistingChurch[]> {
console.log('Loading existing Italian churches for deduplication...');
const churches = await prisma.church.findMany({
where: { country: 'IT' },
select: {
id: true,
name: true,
latitude: true,
longitude: true,
// One external-ID column per upstream data source.
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing Italian churches`);
return churches;
}
// ─── Pass 1: Church Upsert ──────────────────────────────────────────────────
/**
 * Pass 1 for one diocese: matches each listed church against the existing
 * records and either updates the match or creates a new church.
 *
 * @param dioceseSlug Slug stored as `diocese` on created churches and on matches lacking one.
 * @param churches Compact church records of the diocese.
 * @param existingChurches In-memory dedup list; newly created churches are appended to it.
 * @param idchurchToDbId Filled with API church ID → database ID for use by Pass 2.
 * @param dryRun When true, only counters and the ID map are updated; nothing is written.
 * @param stats Run counters, mutated in place.
 * @throws Re-throws database errors other than unique-constraint violations.
 */
async function processChurchesForDiocese(
dioceseSlug: string,
churches: OrariMesseChurch[],
existingChurches: ExistingChurch[],
idchurchToDbId: Map<number, string>,
dryRun: boolean,
stats: ImportStats,
): Promise<void> {
for (const church of churches) {
stats.churchesFound++;
// Parse coordinates
const lat = parseFloat(church.lat);
const lon = parseFloat(church.lon);
// Churches without usable coordinates (unparsable or 0) are skipped entirely.
if (isNaN(lat) || isNaN(lon) || lat === 0 || lon === 0) {
stats.churchesSkipped++;
continue;
}
// Build candidate for dedup
const candidate = {
name: church.name,
lat,
lng: lon,
orarimesseId: church.icsc || undefined,
};
// Matching rules live in church-matcher; a falsy result means "no existing record".
const duplicate = findDuplicateChurch(candidate, existingChurches);
if (dryRun) {
if (duplicate) {
stats.churchesMatched++;
} else {
stats.churchesCreated++;
}
// Track idchurch for Pass 2 even in dry run
// Only matches have a database ID to record in a dry run.
if (duplicate) {
idchurchToDbId.set(church.idchurch, duplicate.id);
}
continue;
}
if (duplicate) {
// Update existing church: set orarimesseId, fill missing fields
stats.churchesMatched++;
const updateData: Record<string, unknown> = {
// An empty ICSC becomes undefined, leaving the stored orarimesseId untouched.
orarimesseId: church.icsc || undefined,
orarimesseLastSyncedAt: new Date(),
};
if (!duplicate.address && church.address) updateData.address = church.address;
if (!duplicate.website && church.sito) {
updateData.website = church.sito;
updateData.hasWebsite = true;
}
// Check diocese on the actual DB record (not in ExistingChurch)
const dbRecord = await prisma.church.findUnique({
where: { id: duplicate.id },
select: { diocese: true, city: true, email: true },
});
if (dbRecord && !dbRecord.diocese && dioceseSlug) {
updateData.diocese = dioceseSlug;
}
if (dbRecord && !dbRecord.city && church.comune) {
updateData.city = church.comune;
}
if (dbRecord && !dbRecord.email && church.emailLdc) {
updateData.email = church.emailLdc;
}
try {
await prisma.church.update({
where: { id: duplicate.id },
data: updateData,
});
} catch (error) {
// Unique constraint violation on orarimesseId — another church already has this ICSC
// NOTE(review): churchesMatched was already incremented above for this church.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
continue;
}
throw error;
}
idchurchToDbId.set(church.idchurch, duplicate.id);
} else {
// Create new church
try {
const newChurch = await prisma.church.create({
data: {
name: church.name,
latitude: lat,
longitude: lon,
address: church.address || null,
city: church.comune || null,
country: 'IT',
diocese: dioceseSlug,
website: church.sito || null,
email: church.emailLdc || null,
hasWebsite: !!church.sito,
orarimesseId: church.icsc || null,
orarimesseLastSyncedAt: new Date(),
source: 'orarimesse',
wheelchairAccess: church.accessibile || false,
},
});
stats.churchesCreated++;
// Add to in-memory array for within-run dedup
existingChurches.push({
id: newChurch.id,
name: church.name,
latitude: lat,
longitude: lon,
osmId: null,
baiduId: null,
masstimesId: null,
orarimesseId: church.icsc || null,
massSchedulesPhId: null,
philmassId: null,
horariosMisasId: null,
mszeInfoId: null,
weekdayMassesId: null,
messesInfoId: null,
bohosluzbyId: null,
miserendId: null,
kerknetId: null,
gottesdienstzeitenId: null,
discovermassId: null,
source: 'orarimesse',
website: church.sito || null,
phone: null,
address: church.address || null,
});
idchurchToDbId.set(church.idchurch, newChurch.id);
} catch (error) {
// A unique-constraint failure on create counts as a skip; anything else propagates.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
continue;
}
throw error;
}
}
}
}
// ─── Pass 2: Mass Schedules ─────────────────────────────────────────────────
/**
 * Pass 2 for one diocese: fetches the detail record of every church that
 * reports an upcoming mass and replaces its stored mass schedule with the
 * recurring schedule derived from that record.
 *
 * Churches missing from `idchurchToDbId`, and churches whose detail has no
 * days or yields no recurring masses, are left untouched. A failed write is
 * logged and counted in `stats.errors`; it does not stop the loop.
 *
 * @param churches Compact church records of the diocese.
 * @param idchurchToDbId API church ID → database ID, filled by Pass 1.
 * @param dryRun When true, schedules are fetched and counted but not written.
 * @param stats Run counters, mutated in place.
 */
async function processSchedulesForDiocese(
  churches: OrariMesseChurch[],
  idchurchToDbId: Map<number, string>,
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // `nextmass` is truthy only for churches with an active schedule.
  const active = churches.filter((entry) => entry.nextmass);
  if (active.length === 0) {
    return;
  }
  console.log(` Pass 2: Fetching schedules for ${active.length} churches with active masses...`);

  for (const { idchurch } of active) {
    const dbId = idchurchToDbId.get(idchurch);
    if (!dbId) {
      // Not in our database (skipped in Pass 1).
      continue;
    }

    const detail = await fetchChurchDetail(idchurch);
    const hasDays = Boolean(detail && detail.days && detail.days.length > 0);
    if (!detail || !hasDays) {
      continue;
    }
    stats.schedulesProcessed++;

    const recurring = convertScheduleToRecurring(detail.days);
    if (recurring.length === 0) {
      continue;
    }

    if (dryRun) {
      stats.massSchedulesCreated += recurring.length;
      continue;
    }

    const rows = recurring.map((slot) => ({
      churchId: dbId,
      dayOfWeek: slot.dayOfWeek,
      time: slot.time,
      language: 'Italian',
      notes: slot.notes,
    }));

    try {
      // Delete, insert and timestamp atomically.
      await prisma.$transaction(async (tx) => {
        await tx.massSchedule.deleteMany({ where: { churchId: dbId } });
        await tx.massSchedule.createMany({ data: rows });
        await tx.church.update({
          where: { id: dbId },
          data: { lastScrapedAt: new Date() },
        });
      });
      stats.massSchedulesCreated += recurring.length;
    } catch (error) {
      stats.errors++;
      console.error(` Error saving schedules for idchurch=${idchurch}: ${error instanceof Error ? error.message : error}`);
    }
  }
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Read the OrariMesse importer's options from `process.argv`.
 *
 * Flags that take a value (`--diocese`, `--resume-from`, `--job-id`) consume
 * the following argument. Unknown arguments are ignored. `--help`/`-h` prints
 * usage and exits with code 0; omitting both `--all` and `--diocese` prints an
 * error and exits with code 1.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const parsed: CLIArgs = {
    all: false,
    dryRun: false,
    schedulesOnly: false,
  };

  let cursor = 0;
  while (cursor < argv.length) {
    const flag = argv[cursor];

    if (flag === '--all') {
      parsed.all = true;
    } else if (flag === '--diocese') {
      cursor += 1;
      parsed.diocese = argv[cursor];
    } else if (flag === '--dry-run') {
      parsed.dryRun = true;
    } else if (flag === '--schedules-only') {
      parsed.schedulesOnly = true;
    } else if (flag === '--resume-from') {
      cursor += 1;
      parsed.resumeFrom = argv[cursor];
    } else if (flag === '--job-id') {
      cursor += 1;
      parsed.jobId = argv[cursor];
    } else if (flag === '--help' || flag === '-h') {
      console.log(`
Usage: npx tsx scripts/import-orarimesse.ts [options]
Options:
--all Import from all 77 dioceses
--diocese <slug> Import from a single diocese (e.g. "roma")
--dry-run No database writes, just report what would happen
--schedules-only Skip Pass 1 (church upsert), only fetch schedules
--resume-from <slug> Skip dioceses until reaching this slug
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-orarimesse.ts --diocese roma --dry-run
npx tsx scripts/import-orarimesse.ts --all
npx tsx scripts/import-orarimesse.ts --all --schedules-only
npx tsx scripts/import-orarimesse.ts --all --resume-from napoli
`);
      process.exit(0);
    }

    cursor += 1;
  }

  // At least one selection mode is mandatory
  const hasSelection = parsed.all || Boolean(parsed.diocese);
  if (!hasSelection) {
    console.error('Error: specify --all or --diocese <slug>');
    process.exit(1);
  }

  return parsed;
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Render a millisecond duration as a short human-readable string.
 *
 * The largest non-zero unit decides the shape: `"Xh Ym Zs"`, `"Ym Zs"` or
 * `"Zs"`. Fractions of a second are dropped.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);

  const secondsPart = `${totalSeconds % 60}s`;
  const minutesPart = `${totalMinutes % 60}m`;

  if (totalHours > 0) {
    return [`${totalHours}h`, minutesPart, secondsPart].join(' ');
  }
  if (totalMinutes > 0) {
    return `${minutesPart} ${secondsPart}`;
  }
  return `${totalSeconds}s`;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Entry point of the OrariMesse.it importer.
 *
 * Parses the CLI options, optionally marks the background job as running,
 * loads the existing Italian churches for de-duplication, then walks the
 * selected dioceses running Pass 1 (church upsert, unless `--schedules-only`)
 * and Pass 2 (mass schedules). A failure in one diocese is logged and counted;
 * processing continues with the next. Finally prints a summary and, when a
 * job id was given, stores the result on the background job.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('ORARIMESSE.IT IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.all ? 'All dioceses' : `Single diocese: ${args.diocese}`}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  console.log(`Schedules only: ${args.schedulesOnly ? 'YES' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      // Job might not exist yet, that's fine
    }
  }

  // Load existing Italian churches for dedup
  const existingChurches = await loadExistingItalianChurches();

  // Fetch diocese list
  console.log('Fetching diocese list from OrariMesse.it...');
  const allDioceses = await fetchDioceses();
  console.log(`Found ${allDioceses.length} dioceses\n`);

  // Filter to requested dioceses
  let diocesesToProcess: OrariMesseDiocese[];
  if (args.diocese) {
    const found = allDioceses.find((d) => d.slug === args.diocese);
    if (!found) {
      console.error(`Diocese "${args.diocese}" not found. Available: ${allDioceses.map((d) => d.slug).join(', ')}`);
      process.exit(1);
    }
    diocesesToProcess = [found];
  } else {
    diocesesToProcess = allDioceses;
  }

  // Handle --resume-from
  if (args.resumeFrom) {
    const idx = diocesesToProcess.findIndex((d) => d.slug === args.resumeFrom);
    if (idx === -1) {
      console.error(`Resume diocese "${args.resumeFrom}" not found.`);
      process.exit(1);
    }
    console.log(`Resuming from diocese "${args.resumeFrom}" (skipping ${idx} dioceses)\n`);
    diocesesToProcess = diocesesToProcess.slice(idx);
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // Map OrariMesse idchurch → our DB id (for Pass 2 schedule lookups)
  const idchurchToDbId = new Map<number, string>();

  // In schedules-only mode, report how many churches already carry an orarimesseId.
  // Only the number is needed here (the idchurch map is built per diocese below,
  // because it needs the API's idchurch values), so count instead of loading rows.
  if (args.schedulesOnly) {
    console.log('Schedules-only mode: loading existing orarimesseId mappings...');
    const mappedCount = await prisma.church.count({
      where: { orarimesseId: { not: null } },
    });
    console.log(`Found ${mappedCount} churches with orarimesseId in DB\n`);
  }

  // Process each diocese
  for (let i = 0; i < diocesesToProcess.length; i++) {
    const diocese = diocesesToProcess[i];
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${i + 1}/${diocesesToProcess.length}] Diocese: ${diocese.title} (${diocese.slug}) [${elapsed} elapsed]`);

    try {
      // Fetch churches in this diocese
      const dioceseData = await fetchChurchesInDiocese(diocese.slug);
      if (!dioceseData || !dioceseData.listaChiese || dioceseData.listaChiese.length === 0) {
        console.log(` No churches found, skipping`);
        stats.diocesesProcessed++;
        continue;
      }

      const churches = dioceseData.listaChiese;
      console.log(` Found ${churches.length} churches (${churches.filter((c) => c.nextmass).length} with active masses)`);

      // Pass 1: Upsert churches
      if (!args.schedulesOnly) {
        const prevMatched = stats.churchesMatched;
        const prevCreated = stats.churchesCreated;
        const prevSkipped = stats.churchesSkipped;

        await processChurchesForDiocese(
          diocese.slug, churches, existingChurches, idchurchToDbId,
          args.dryRun, stats
        );

        const matched = stats.churchesMatched - prevMatched;
        const created = stats.churchesCreated - prevCreated;
        const skipped = stats.churchesSkipped - prevSkipped;
        console.log(` Pass 1: ${matched} matched, ${created} created, ${skipped} skipped`);
      } else {
        // In schedules-only mode, still need to build idchurch → dbId map
        for (const church of churches) {
          if (church.icsc) {
            const existing = existingChurches.find((e) => e.orarimesseId === church.icsc);
            if (existing) {
              idchurchToDbId.set(church.idchurch, existing.id);
            }
          }
        }
      }

      // Pass 2: Import schedules. Snapshot both counters so the log line below
      // reports this diocese only, matching the per-diocese numbers of Pass 1.
      const prevSchedules = stats.massSchedulesCreated;
      const prevProcessed = stats.schedulesProcessed;
      await processSchedulesForDiocese(churches, idchurchToDbId, args.dryRun, stats);
      const newSchedules = stats.massSchedulesCreated - prevSchedules;
      const newProcessed = stats.schedulesProcessed - prevProcessed;
      if (newSchedules > 0) {
        console.log(` Pass 2: ${newProcessed} churches processed, ${newSchedules} mass times created`);
      }

      stats.diocesesProcessed++;
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing diocese ${diocese.slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`API requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job (best effort: a failure here must not fail the import)
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch {
      // Ignore
    }
  }
}
// Run the importer. On a fatal error only the exit code is set: calling
// process.exit() here would terminate the process immediately, before the
// cleanup in finally() gets a chance to close the Prisma client and the pool.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });

View File

@@ -0,0 +1,616 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches from OpenStreetMap
* Usage:
* npx tsx scripts/import-osm-churches.ts --country US
* npx tsx scripts/import-osm-churches.ts --all
* npx tsx scripts/import-osm-churches.ts --country MX --dry-run
* npx tsx scripts/import-osm-churches.ts --all --sort-by-count
*/
// Load .env for database connection (before importing anything that uses process.env)
import dotenv from 'dotenv';
import path from 'path';
// Load .env.local first (production Neon URL), then .env (local fallback)
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL from the environment, else a local default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password portion (between ':' and '@') replaced by '***'.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// Dedicated pg pool for this script.
const pool = new Pool({
connectionString: dbUrl,
// NOTE(review): for URLs containing "neon", SSL is enabled with certificate
// verification turned off (rejectUnauthorized: false) — confirm this is intended.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined
});
// Prisma client backed by the pool above through the pg driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { queryOverpassByCountryWithFallback, type OSMChurch } from '../src/lib/overpass-client';
import { findDuplicateChurch, mergeChurchData } from '../src/lib/church-matcher';
import { parseServiceTimes } from '../src/lib/service-times-parser';
// Countries with significant Catholic populations, organized by priority.
// Values are two-letter country codes; main() selects one list via
// `--priority <1|2|3>` or concatenates all three (in order) for `--all`.
const CATHOLIC_COUNTRIES = {
// Priority 1: Large Catholic populations (Americas + major European/Asian/African countries)
priority1: [
// North America
'US', 'MX', 'CA',
// South America, Central America & Caribbean
'BR', 'AR', 'CO', 'PE', 'VE', 'CL', 'EC', 'GT', 'CU', 'BO', 'DO', 'HT', 'HN', 'PY', 'SV', 'NI', 'CR', 'PA', 'UY', 'GY', 'SR', 'GF',
// Europe
'IT', 'FR', 'ES', 'PL', 'DE', 'PT', 'BE', 'CZ', 'AT', 'HU', 'IE', 'HR', 'GB',
// Asia, Oceania & Africa (NG and CD are African country codes)
'PH', 'AU', 'NG', 'CD',
],
// Priority 2: Medium Catholic populations
priority2: [
// Rest of Europe
'NL', 'SK', 'SI', 'LT', 'CH', 'LU', 'MT',
'UA', 'RO', 'LV', 'BY',
// Africa
'AO', 'UG', 'TZ', 'KE', 'CM', 'RW', 'BI', 'MG', 'MW', 'ZM', 'ZW', 'MZ', 'BJ', 'TG', 'CI', 'BF', 'ML', 'NE', 'SN', 'GN', 'LR', 'SL', 'GH', 'GA', 'CG', 'CF', 'TD', 'SD', 'ET', 'ER', 'SO',
// Asia
'IN', 'TL', 'VN', 'KR', 'JP', 'ID', 'MY', 'SG', 'TH', 'LA', 'KH', 'MM', 'CN', 'LK', 'BD', 'PK',
// Middle East
'LB', 'IL', 'PS', 'JO', 'SY', 'IQ',
// Oceania
'NZ', 'PG', 'FJ', 'NC', 'PF',
],
// Priority 3: Smaller Catholic presence
priority3: [
// Caribbean & Central America (smaller islands)
'BS', 'BB', 'JM', 'TT', 'GD', 'LC', 'VC', 'AG', 'DM', 'KN',
// Europe (smaller countries + Balkans/Eastern)
'MC', 'SM', 'VA', 'LI', 'AD',
'RS', 'BA', 'MK', 'AL', 'EE',
// Caucasus + Russia
'GE', 'AM', 'RU',
// Africa (rest)
'NA', 'BW', 'LS', 'SZ', 'MU', 'SC', 'KM', 'CV', 'ST', 'GQ', 'DJ', 'GM',
// Asia (rest)
'BT', 'NP', 'AF', 'KZ', 'UZ', 'TM', 'TJ', 'KG', 'MN', 'BN', 'MV',
// Oceania (rest)
'WS', 'TO', 'VU', 'SB', 'KI', 'NR', 'TV', 'FM', 'MH', 'PW',
],
};
/**
 * Counters collected by importFromOSM for one country; main() sums them
 * across countries for the overall summary.
 */
interface ImportStats {
// Number of churches returned by the Overpass query
osmChurchesFound: number;
// Churches created because no duplicate was found
newChurchesInserted: number;
// Duplicates whose stored osmId equals the OSM record's osmId
existingUpdated: number;
// Duplicates found by the matcher whose osmId differs (linked to the OSM record)
existingLinked: number;
// Processed OSM records with / without a website value
churchesWithWebsites: number;
churchesWithoutWebsites: number;
// Churches for which schedule rows were created from the service_times tag
churchesWithServiceTimes: number;
// Total schedule rows created from service_times tags
scheduleEntriesCreated: number;
// Per-church processing failures plus whole-country failures
errors: number;
}
/**
 * Read the OSM importer's options from `process.argv`.
 *
 * `--country` and `--resume-from` take a value that is upper-cased;
 * `--priority` takes 1, 2 or 3 (anything else prints an error and exits with
 * code 1). A value-taking flag with no following argument is ignored, as are
 * unknown arguments.
 *
 * @returns The parsed options; unset values are `undefined` / `false`.
 */
function parseArgs(): { country?: string; all: boolean; dryRun: boolean; resumeFrom?: string; priority?: number; sortByCount: boolean } {
  const argv = process.argv.slice(2);
  const options = {
    country: undefined as string | undefined,
    all: false,
    dryRun: false,
    resumeFrom: undefined as string | undefined,
    priority: undefined as number | undefined,
    sortByCount: false,
  };

  let index = 0;
  while (index < argv.length) {
    const flag = argv[index];
    const value = argv[index + 1];

    switch (flag) {
      case '--all':
        options.all = true;
        break;
      case '--dry-run':
        options.dryRun = true;
        break;
      case '--sort-by-count':
        options.sortByCount = true;
        break;
      case '--country':
        if (value) {
          options.country = value.toUpperCase();
          index++; // value consumed
        }
        break;
      case '--resume-from':
        if (value) {
          options.resumeFrom = value.toUpperCase();
          index++; // value consumed
        }
        break;
      case '--priority':
        if (value) {
          const level = parseInt(value, 10);
          const inRange = level >= 1 && level <= 3;
          if (inRange) {
            options.priority = level;
          } else {
            console.error('Error: --priority must be 1, 2, or 3');
            process.exit(1);
          }
          index++; // value consumed
        }
        break;
      default:
        break;
    }

    index++;
  }

  return options;
}
/**
 * Parse an OSM `service_times` value and store the resulting mass schedules
 * for one church. Updates the service-time counters in `stats`.
 *
 * @returns The number of schedule rows created (0 when nothing was parsed).
 */
async function createSchedulesFromServiceTimes(
  churchId: string,
  serviceTimes: Parameters<typeof parseServiceTimes>[0],
  stats: ImportStats,
): Promise<number> {
  const scheduleEntries = parseServiceTimes(serviceTimes);
  if (scheduleEntries.length === 0) return 0;

  await prisma.massSchedule.createMany({
    data: scheduleEntries.map((entry) => ({
      churchId,
      dayOfWeek: entry.dayOfWeek,
      time: entry.time,
      massType: entry.dayOfWeek === 0 ? 'Sunday' :
        entry.dayOfWeek === 6 ? 'Saturday' : 'Daily',
      language: 'Unknown',
      notes: 'From OSM service_times tag',
    })),
  });
  stats.churchesWithServiceTimes++;
  stats.scheduleEntriesCreated += scheduleEntries.length;
  return scheduleEntries.length;
}

/**
 * Import churches from a single country.
 *
 * Queries Overpass for the country's Catholic churches and, for each one,
 * either merges it into a matching existing church or creates a new record.
 * `service_times` tags are turned into mass schedules for new churches and
 * for matched churches that have no schedules yet. Errors on one church are
 * logged and counted without aborting the run; a failure of the whole country
 * (e.g. the Overpass query) is counted once and the stats so far are returned.
 *
 * @param countryCode Country code passed to the Overpass query; also the
 *   fallback `country` value for new churches.
 * @param dryRun When true, only list a sample and count websites; no DB access.
 * @returns Counters describing what was found and written.
 */
async function importFromOSM(countryCode: string, dryRun: boolean = false): Promise<ImportStats> {
  const stats: ImportStats = {
    osmChurchesFound: 0,
    newChurchesInserted: 0,
    existingUpdated: 0,
    existingLinked: 0,
    churchesWithWebsites: 0,
    churchesWithoutWebsites: 0,
    churchesWithServiceTimes: 0,
    scheduleEntriesCreated: 0,
    errors: 0,
  };

  console.log(`\n${'='.repeat(60)}`);
  console.log(`Importing Catholic churches from ${countryCode}`);
  console.log(`${'='.repeat(60)}\n`);

  try {
    // Query Overpass API (with automatic fallback to regional bounding boxes)
    const osmChurches = await queryOverpassByCountryWithFallback(countryCode);
    stats.osmChurchesFound = osmChurches.length;

    if (osmChurches.length === 0) {
      console.log(`No churches found in ${countryCode}`);
      return stats;
    }

    console.log(`Found ${osmChurches.length} Catholic churches in ${countryCode}`);

    if (dryRun) {
      console.log('\n[DRY RUN] Would import the following churches:');
      osmChurches.slice(0, 10).forEach((church) => {
        console.log(` - ${church.name} (${church.city || 'unknown city'})`);
        console.log(` OSM ID: ${church.osmId}, Website: ${church.website || 'none'}`);
      });
      if (osmChurches.length > 10) {
        console.log(` ... and ${osmChurches.length - 10} more`);
      }
      // Count websites
      stats.churchesWithWebsites = osmChurches.filter((c) => c.website).length;
      stats.churchesWithoutWebsites = osmChurches.length - stats.churchesWithWebsites;
      return stats;
    }

    // Fetch all existing churches for deduplication
    // For large datasets, we could optimize by fetching only churches in the same country/region
    console.log('Fetching existing churches for deduplication...');
    const existingChurches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        latitude: true,
        longitude: true,
        osmId: true,
        baiduId: true,
        masstimesId: true,
        orarimesseId: true,
        massSchedulesPhId: true,
        philmassId: true,
        horariosMisasId: true,
        mszeInfoId: true,
        weekdayMassesId: true,
        messesInfoId: true,
        bohosluzbyId: true,
        miserendId: true,
        kerknetId: true,
        gottesdienstzeitenId: true,
        discovermassId: true,
        source: true,
        website: true,
        phone: true,
        address: true,
      },
    });
    console.log(`Found ${existingChurches.length} existing churches in database`);

    // Process churches one by one (no batch transactions to avoid rollbacks)
    let processed = 0;
    for (const osmChurch of osmChurches) {
      try {
        const duplicate = findDuplicateChurch(osmChurch, existingChurches);

        if (duplicate) {
          // Same osmId → plain update; otherwise the matcher found it another
          // way and this merge links the church to its OSM record.
          const matchedByOsmId = duplicate.osmId === osmChurch.osmId;
          const mergedData = mergeChurchData(duplicate, osmChurch);

          // Verify the church still exists in the database before updating;
          // only the id is selected since the row itself is not used.
          const existsInDb = await prisma.church.findUnique({
            where: { id: duplicate.id },
            select: { id: true },
          });
          if (existsInDb) {
            await prisma.church.update({
              where: { id: duplicate.id },
              data: mergedData,
            });
          }

          // Counted whether or not the row was found, as before
          if (matchedByOsmId) stats.existingUpdated++;
          else stats.existingLinked++;

          // Import service_times only for churches that have no schedules yet
          if (existsInDb && osmChurch.serviceTimes) {
            const existingSchedules = await prisma.massSchedule.count({ where: { churchId: duplicate.id } });
            if (existingSchedules === 0) {
              await createSchedulesFromServiceTimes(duplicate.id, osmChurch.serviceTimes, stats);
            }
          }

          if (osmChurch.website) stats.churchesWithWebsites++;
          else stats.churchesWithoutWebsites++;
        } else {
          // New church - insert it and capture the real ID
          const newChurch = await prisma.church.create({
            data: {
              name: osmChurch.name,
              latitude: osmChurch.lat,
              longitude: osmChurch.lng,
              address: osmChurch.address,
              city: osmChurch.city,
              state: osmChurch.state,
              zip: osmChurch.zip,
              country: osmChurch.country || countryCode,
              phone: osmChurch.phone,
              website: osmChurch.website,
              diocese: osmChurch.diocese,
              wheelchairAccess: osmChurch.wheelchairAccess ?? false,
              source: 'osm',
              osmId: osmChurch.osmId,
              hasWebsite: !!osmChurch.website,
              osmLastSyncedAt: new Date(),
            },
          });
          stats.newChurchesInserted++;

          if (osmChurch.website) stats.churchesWithWebsites++;
          else stats.churchesWithoutWebsites++;

          // Parse service_times tag and create mass schedules
          if (osmChurch.serviceTimes) {
            const created = await createSchedulesFromServiceTimes(newChurch.id, osmChurch.serviceTimes, stats);
            if (created > 0) {
              // Mark as scraped since we have schedule data
              await prisma.church.update({
                where: { id: newChurch.id },
                data: { lastScrapedAt: new Date() },
              });
            }
          }

          // Add to existing churches list for future deduplication in this run (use real DB ID)
          existingChurches.push({
            id: newChurch.id,
            name: osmChurch.name,
            latitude: osmChurch.lat,
            longitude: osmChurch.lng,
            osmId: osmChurch.osmId,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: null,
            discovermassId: null,
            source: 'osm',
            website: osmChurch.website || null,
            phone: osmChurch.phone || null,
            address: osmChurch.address || null,
          });
        }

        processed++;
        // Log progress every 500 churches
        if (processed % 500 === 0) {
          console.log(`Progress: ${processed}/${osmChurches.length} churches processed`);
        }
      } catch (error) {
        console.error(`Error processing church ${osmChurch.name}:`, error);
        stats.errors++;
      }
    }

    console.log(`\nProcessed all ${osmChurches.length} churches from ${countryCode}`);
  } catch (error) {
    console.error(`Failed to import from ${countryCode}:`, error);
    stats.errors++;
  }

  return stats;
}
/**
 * Print the per-country import report to stdout.
 *
 * Insert/update/link counts, service_times counts and the error count are
 * shown only for real runs; the latter two additionally only when non-zero.
 *
 * @param countryCode Country the stats belong to.
 * @param stats Counters produced by the import.
 * @param dryRun Whether the import ran in dry-run mode.
 */
function printSummary(countryCode: string, stats: ImportStats, dryRun: boolean) {
  const rule = '='.repeat(60);
  const realRun = !dryRun;
  const lines: string[] = [];

  lines.push(`\n${rule}`);
  lines.push(`Import Summary for ${countryCode} ${dryRun ? '(DRY RUN)' : ''}`);
  lines.push(`${rule}`);
  lines.push(`OSM churches found: ${stats.osmChurchesFound}`);

  if (realRun) {
    lines.push(`New churches inserted: ${stats.newChurchesInserted}`);
    lines.push(`Existing churches updated: ${stats.existingUpdated} (matched by osmId)`);
    lines.push(`Existing churches linked: ${stats.existingLinked} (matched by proximity)`);
  }

  lines.push(`Churches with websites: ${stats.churchesWithWebsites}`);
  lines.push(`Churches without websites: ${stats.churchesWithoutWebsites}`);

  if (realRun && stats.churchesWithServiceTimes > 0) {
    lines.push(`Churches with service_times: ${stats.churchesWithServiceTimes}`);
    lines.push(`Schedule entries created: ${stats.scheduleEntriesCreated}`);
  }
  if (realRun && stats.errors > 0) {
    lines.push(`Errors encountered: ${stats.errors}`);
  }

  lines.push(`${rule}\n`);

  // One console.log call per report line
  for (const line of lines) {
    console.log(line);
  }
}
/**
 * Mark the background job named by `--job-id <id>` as running.
 *
 * Job tracking is best effort, like completeJob: a missing id value or a
 * failed status update is reported on stderr but never aborts the import.
 *
 * @param args Raw CLI arguments (without the node/script prefix).
 * @returns The job id when `--job-id` carries a value, otherwise `null`.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  // `--job-id` given last, or directly followed by another flag: no usable id
  if (!jobId || jobId.startsWith('--')) {
    console.error('Warning: --job-id requires a value; continuing without job tracking');
    return null;
  }

  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: { status: 'running', startedAt: new Date() },
    });
  } catch (err) {
    // e.g. the job row does not exist; still return the id so completion is attempted
    console.error(`Failed to update job ${jobId}:`, err);
  }
  return jobId;
}
/**
 * Record the final state of a background job.
 *
 * Does nothing when no job id is tracked. The job becomes `failed` when an
 * error message is supplied and `completed` otherwise. A failure to write the
 * status is logged and swallowed.
 *
 * @param jobId Id of the tracked job, or `null` when tracking is off.
 * @param error Optional error message to store on the job.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: error ? 'failed' : 'completed',
          error: error || null,
          completedAt: new Date(),
        },
      });
    } catch (updateError) {
      console.error(`Failed to update job ${jobId}:`, updateError);
    }
  }
}
/**
 * Entry point of the OSM importer.
 *
 * Validates the CLI options, then imports either one country or a list of
 * countries (all priorities or one priority level, optionally sorted by
 * existing OSM church count and/or resumed from a given country), printing a
 * summary per country and an overall summary. When `--job-id` is given the
 * background job is marked running, then completed or failed.
 *
 * Options are validated before the job is touched, so an invalid invocation
 * does not leave a job stuck in `running`. Fatal errors set the exit code
 * instead of calling process.exit(), so the database connections are always
 * closed in `finally`.
 */
async function main() {
  const { country, all, dryRun, resumeFrom, priority, sortByCount } = parseArgs();

  if (!country && !all && !priority) {
    console.error('Error: Must specify --country <CODE>, --all, or --priority <1|2|3>');
    console.error('Usage:');
    console.error(' npx tsx scripts/import-osm-churches.ts --country US');
    console.error(' npx tsx scripts/import-osm-churches.ts --all');
    console.error(' npx tsx scripts/import-osm-churches.ts --priority 1');
    console.error(' npx tsx scripts/import-osm-churches.ts --all --resume-from IT');
    console.error(' npx tsx scripts/import-osm-churches.ts --country MX --dry-run');
    console.error(' npx tsx scripts/import-osm-churches.ts --all --sort-by-count');
    process.exit(1);
  }

  if (dryRun) {
    console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
  }

  let jobId: string | null = null;
  try {
    jobId = await createOrResumeJob(process.argv.slice(2));

    if (country) {
      // Import single country
      const stats = await importFromOSM(country, dryRun);
      printSummary(country, stats, dryRun);
    } else if (all || priority !== undefined) {
      // Import all countries or specific priority
      let allCountries: string[];
      if (priority !== undefined) {
        // Import only specified priority level. Copy the list: it may be
        // sorted in place below and the shared constant must stay untouched.
        const priorityKey = `priority${priority}` as keyof typeof CATHOLIC_COUNTRIES;
        allCountries = [...CATHOLIC_COUNTRIES[priorityKey]];
        console.log(`Importing Priority ${priority} countries (${allCountries.length} countries)...\n`);
      } else {
        // Import all priorities
        console.log('Importing all Catholic countries by priority...\n');
        allCountries = [
          ...CATHOLIC_COUNTRIES.priority1,
          ...CATHOLIC_COUNTRIES.priority2,
          ...CATHOLIC_COUNTRIES.priority3,
        ];
      }

      // Sort by existing OSM church count (least first) if requested
      if (sortByCount) {
        console.log('Querying DB for current OSM church counts per country...');
        const countRows = await prisma.$queryRawUnsafe<Array<{ country: string; count: bigint }>>(
          `SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country IS NOT NULL GROUP BY country`
        );
        const countMap = new Map<string, number>();
        for (const row of countRows) {
          countMap.set(row.country, Number(row.count));
        }
        allCountries.sort((a, b) => (countMap.get(a) || 0) - (countMap.get(b) || 0));
        console.log('Country processing order (least OSM churches first):');
        for (const c of allCountries) {
          console.log(` ${c}: ${countMap.get(c) || 0} existing OSM churches`);
        }
        console.log('');
      }

      // Handle --resume-from flag
      if (resumeFrom) {
        const resumeIndex = allCountries.indexOf(resumeFrom);
        if (resumeIndex === -1) {
          // Thrown (not process.exit) so the job is marked failed and cleanup runs
          throw new Error(`Country ${resumeFrom} not found in the list`);
        }
        console.log(`Resuming from ${resumeFrom} (skipping first ${resumeIndex} countries)...\n`);
        allCountries = allCountries.slice(resumeIndex);
      }

      const totalStats: ImportStats = {
        osmChurchesFound: 0,
        newChurchesInserted: 0,
        existingUpdated: 0,
        existingLinked: 0,
        churchesWithWebsites: 0,
        churchesWithoutWebsites: 0,
        churchesWithServiceTimes: 0,
        scheduleEntriesCreated: 0,
        errors: 0,
      };

      for (const countryCode of allCountries) {
        const stats = await importFromOSM(countryCode, dryRun);
        printSummary(countryCode, stats, dryRun);

        // Aggregate stats
        totalStats.osmChurchesFound += stats.osmChurchesFound;
        totalStats.newChurchesInserted += stats.newChurchesInserted;
        totalStats.existingUpdated += stats.existingUpdated;
        totalStats.existingLinked += stats.existingLinked;
        totalStats.churchesWithWebsites += stats.churchesWithWebsites;
        totalStats.churchesWithoutWebsites += stats.churchesWithoutWebsites;
        totalStats.churchesWithServiceTimes += stats.churchesWithServiceTimes;
        totalStats.scheduleEntriesCreated += stats.scheduleEntriesCreated;
        totalStats.errors += stats.errors;

        // Small delay between countries to be respectful (rate limiting is also in the client)
        await new Promise((resolve) => setTimeout(resolve, 2000));
      }

      // Print overall summary
      console.log(`\n${'='.repeat(60)}`);
      console.log(`OVERALL SUMMARY ${dryRun ? '(DRY RUN)' : ''}`);
      console.log(`${'='.repeat(60)}`);
      console.log(`Total countries processed: ${allCountries.length}`);
      console.log(`Total OSM churches found: ${totalStats.osmChurchesFound}`);
      if (!dryRun) {
        console.log(`Total new churches inserted: ${totalStats.newChurchesInserted}`);
        console.log(`Total churches updated: ${totalStats.existingUpdated}`);
        console.log(`Total churches linked: ${totalStats.existingLinked}`);
      }
      console.log(`Total with websites: ${totalStats.churchesWithWebsites}`);
      console.log(`Total without websites: ${totalStats.churchesWithoutWebsites}`);
      if (!dryRun && totalStats.errors > 0) {
        console.log(`Total errors: ${totalStats.errors}`);
      }
      console.log(`${'='.repeat(60)}\n`);
    }

    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    // Set the code rather than exiting so the finally block below still runs
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

main();

View File

@@ -0,0 +1,346 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches from a specific region of a country
* Usage:
* npx tsx scripts/import-osm-region.ts --country GB --region "England South"
* npx tsx scripts/import-osm-region.ts --country IT --region "North" --dry-run
*/
// Load .env for database connection (before importing anything that uses process.env)
import dotenv from 'dotenv';
import path from 'path';
// Load .env.local first (production Neon URL), then .env (local fallback)
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL from the environment, else a local default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password portion (between ':' and '@') replaced by '***'.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// Dedicated pg pool for this script.
const pool = new Pool({
connectionString: dbUrl,
// NOTE(review): for URLs containing "neon", SSL is enabled with certificate
// verification turned off (rejectUnauthorized: false) — confirm this is intended.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined
});
// Prisma client backed by the pool above through the pg driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { COUNTRY_BOUNDING_BOXES, queryOverpassByBoundingBox, type OSMChurch } from '../src/lib/overpass-client';
import { findDuplicateChurch, mergeChurchData } from '../src/lib/church-matcher';
/** Counters collected by importFromRegion for one region; all start at 0. */
interface ImportStats {
// Number of churches returned by the bounding-box Overpass query
osmChurchesFound: number;
// Churches newly inserted — presumably when no duplicate is found; TODO confirm
newChurchesInserted: number;
// Duplicates whose stored osmId equals the OSM record's osmId
existingUpdated: number;
// Duplicates found by the matcher whose osmId differs (linked to the OSM record)
existingLinked: number;
// OSM records with / without a website value
churchesWithWebsites: number;
churchesWithoutWebsites: number;
// Number of failures encountered during the import
errors: number;
}
/**
 * Read the region importer's options from `process.argv`.
 *
 * `--country` takes a value that is upper-cased, `--region` takes a value
 * used verbatim, `--dry-run` is a plain switch. A value-taking flag with no
 * following argument is ignored, as are unknown arguments.
 *
 * @returns The parsed options; unset values are `undefined` / `false`.
 */
function parseArgs(): { country?: string; region?: string; dryRun: boolean } {
  const argv = process.argv.slice(2);
  const options = {
    country: undefined as string | undefined,
    region: undefined as string | undefined,
    dryRun: false,
  };

  let index = 0;
  while (index < argv.length) {
    const flag = argv[index];
    const value = argv[index + 1];

    switch (flag) {
      case '--country':
        if (value) {
          options.country = value.toUpperCase();
          index++; // value consumed
        }
        break;
      case '--region':
        if (value) {
          options.region = value;
          index++; // value consumed
        }
        break;
      case '--dry-run':
        options.dryRun = true;
        break;
      default:
        break;
    }

    index++;
  }

  return options;
}
/**
* Import churches from a single region
*/
/**
 * Import Catholic churches from OpenStreetMap for one named region of a country.
 *
 * The region's bounding box is looked up in COUNTRY_BOUNDING_BOXES and queried through
 * queryOverpassByBoundingBox. Each returned church is then either
 *   - updated (findDuplicateChurch returned a row with the same osmId),
 *   - linked (findDuplicateChurch returned a row with a different/missing osmId), or
 *   - inserted as a new row with source 'osm'.
 *
 * @param countryCode Key into COUNTRY_BOUNDING_BOXES; also the fallback `country` value for new rows.
 * @param regionName  Must equal the `name` of one of that country's regions.
 * @param dryRun      When true, logs up to 10 sample churches, counts websites and returns
 *                    before any database access.
 * @returns Counters for this run. Failures of the Overpass query or of a single church are
 *          logged and counted in `errors` instead of being thrown.
 *
 * Terminates the process with exit code 1 when the country or region is unknown.
 */
async function importFromRegion(countryCode: string, regionName: string, dryRun: boolean = false): Promise<ImportStats> {
const stats: ImportStats = {
osmChurchesFound: 0,
newChurchesInserted: 0,
existingUpdated: 0,
existingLinked: 0,
churchesWithWebsites: 0,
churchesWithoutWebsites: 0,
errors: 0,
};
console.log(`\n${'='.repeat(60)}`);
console.log(`Importing from ${countryCode} - ${regionName}`);
console.log(`${'='.repeat(60)}\n`);
// Look up the bounding box
const regions = COUNTRY_BOUNDING_BOXES[countryCode];
if (!regions) {
console.error(`Error: No bounding boxes defined for country ${countryCode}`);
console.error('Available countries:', Object.keys(COUNTRY_BOUNDING_BOXES).join(', '));
process.exit(1);
}
// Region names are compared exactly (case-sensitive).
const region = regions.find(r => r.name === regionName);
if (!region) {
console.error(`Error: Region "${regionName}" not found for ${countryCode}`);
console.error('Available regions:', regions.map(r => r.name).join(', '));
process.exit(1);
}
try {
// Query Overpass API for this specific region
console.log(`Querying bounding box: (${region.south}, ${region.west}, ${region.north}, ${region.east})`);
const osmChurches = await queryOverpassByBoundingBox(region.south, region.west, region.north, region.east);
stats.osmChurchesFound = osmChurches.length;
if (osmChurches.length === 0) {
console.log(`No churches found in ${regionName}`);
return stats;
}
console.log(`Found ${osmChurches.length} Catholic churches in ${regionName}`);
// Dry run: preview the first 10 results, count websites over all results, then stop.
if (dryRun) {
console.log('\n[DRY RUN] Would import the following churches:');
osmChurches.slice(0, 10).forEach((church) => {
console.log(` - ${church.name} (${church.city || 'unknown city'})`);
console.log(` OSM ID: ${church.osmId}, Website: ${church.website || 'none'}`);
});
if (osmChurches.length > 10) {
console.log(` ... and ${osmChurches.length - 10} more`);
}
// Count websites
stats.churchesWithWebsites = osmChurches.filter((c) => c.website).length;
stats.churchesWithoutWebsites = osmChurches.length - stats.churchesWithWebsites;
return stats;
}
// Fetch all existing churches for deduplication
// (no `where` filter: every church row is loaded into memory, not only this region's)
console.log('Fetching existing churches for deduplication...');
const existingChurches = await prisma.church.findMany({
select: {
id: true,
name: true,
latitude: true,
longitude: true,
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Found ${existingChurches.length} existing churches in database`);
// Process churches one by one (no batch transactions to avoid rollbacks)
let processed = 0;
for (const osmChurch of osmChurches) {
try {
// Check for duplicate
const duplicate = findDuplicateChurch(osmChurch, existingChurches);
if (duplicate && duplicate.osmId === osmChurch.osmId) {
// Existing church with matching osmId - update it
// mergeChurchData is defined outside this block; its result is passed to Prisma as the update payload.
const mergedData = mergeChurchData(duplicate, osmChurch);
// Verify the church exists in the database (not just in our temp list from this run)
const existsInDb = await prisma.church.findUnique({ where: { id: duplicate.id } });
if (existsInDb) {
await prisma.church.update({
where: { id: duplicate.id },
data: mergedData,
});
stats.existingUpdated++;
} else {
// Duplicate from earlier in this run - skip (already processed)
// NOTE(review): no write happens on this path, yet the counter is still incremented.
stats.existingUpdated++;
}
if (osmChurch.website) stats.churchesWithWebsites++;
else stats.churchesWithoutWebsites++;
} else if (duplicate) {
// Existing church matched by proximity/name - link it with osmId
// NOTE(review): this branch is reached for any match whose osmId differs, including matches
// on other source IDs; the osmId link is presumably set by mergeChurchData — confirm.
const mergedData = mergeChurchData(duplicate, osmChurch);
// Verify the church exists in the database (not just in our temp list from this run)
const existsInDb = await prisma.church.findUnique({ where: { id: duplicate.id } });
if (existsInDb) {
await prisma.church.update({
where: { id: duplicate.id },
data: mergedData,
});
stats.existingLinked++;
} else {
// Duplicate from earlier in this run - skip (already processed)
// NOTE(review): as above, counted without performing an update.
stats.existingLinked++;
}
if (osmChurch.website) stats.churchesWithWebsites++;
else stats.churchesWithoutWebsites++;
} else {
// New church - insert it and capture the real ID
const newChurch = await prisma.church.create({
data: {
name: osmChurch.name,
latitude: osmChurch.lat,
longitude: osmChurch.lng,
address: osmChurch.address,
city: osmChurch.city,
state: osmChurch.state,
zip: osmChurch.zip,
// Fall back to the requested country when OSM carries no country value.
country: osmChurch.country || countryCode,
phone: osmChurch.phone,
website: osmChurch.website,
diocese: osmChurch.diocese,
wheelchairAccess: osmChurch.wheelchairAccess ?? false,
source: 'osm',
osmId: osmChurch.osmId,
hasWebsite: !!osmChurch.website,
osmLastSyncedAt: new Date(),
},
});
stats.newChurchesInserted++;
if (osmChurch.website) stats.churchesWithWebsites++;
else stats.churchesWithoutWebsites++;
// Add to existing churches list for future deduplication in this run (use real DB ID)
// All other source IDs are null because the row was just created from OSM data only.
existingChurches.push({
id: newChurch.id,
name: osmChurch.name,
latitude: osmChurch.lat,
longitude: osmChurch.lng,
osmId: osmChurch.osmId,
baiduId: null,
masstimesId: null,
orarimesseId: null,
massSchedulesPhId: null,
philmassId: null,
horariosMisasId: null,
mszeInfoId: null,
weekdayMassesId: null,
messesInfoId: null,
bohosluzbyId: null,
miserendId: null,
kerknetId: null,
gottesdienstzeitenId: null,
discovermassId: null,
source: 'osm',
website: osmChurch.website || null,
phone: osmChurch.phone || null,
address: osmChurch.address || null,
});
}
// Only successfully handled churches advance the progress counter.
processed++;
// Log progress every 100 churches
if (processed % 100 === 0) {
console.log(`Progress: ${processed}/${osmChurches.length} churches processed`);
}
} catch (error) {
// A failure on one church is counted and the loop continues with the next one.
console.error(`Error processing church ${osmChurch.name}:`, error);
stats.errors++;
}
}
console.log(`\nProcessed all ${osmChurches.length} churches from ${regionName}`);
} catch (error) {
// Failures outside the per-church loop (Overpass query, initial load) end the import for this region.
console.error(`Failed to import from ${regionName}:`, error);
stats.errors++;
}
return stats;
}
/**
 * Write the end-of-run report for one region to stdout.
 *
 * Insert/update/link counters and the error count are only shown for real
 * (non dry-run) imports; the error line is additionally omitted when there
 * were no errors.
 */
function printSummary(countryCode: string, regionName: string, stats: ImportStats, dryRun: boolean) {
  const rule = '='.repeat(60);
  const dryRunTag = dryRun ? '(DRY RUN)' : '';

  const report: string[] = [
    `\n${rule}`,
    `Import Summary for ${countryCode} - ${regionName} ${dryRunTag}`,
    `${rule}`,
    `OSM churches found: ${stats.osmChurchesFound}`,
  ];

  if (!dryRun) {
    report.push(
      `New churches inserted: ${stats.newChurchesInserted}`,
      `Existing churches updated: ${stats.existingUpdated} (matched by osmId)`,
      `Existing churches linked: ${stats.existingLinked} (matched by proximity)`,
    );
  }

  report.push(
    `Churches with websites: ${stats.churchesWithWebsites}`,
    `Churches without websites: ${stats.churchesWithoutWebsites}`,
  );

  if (!dryRun && stats.errors > 0) {
    report.push(`Errors encountered: ${stats.errors}`);
  }

  report.push(`${rule}\n`);

  // One console.log call per report line, in order.
  for (const line of report) {
    console.log(line);
  }
}
/**
 * CLI entry point for the single-region OSM import.
 *
 * Reads `country`, `region` and `dryRun` from parseArgs(), runs the import for
 * that region and prints the summary. Both `--country` and `--region` are
 * required; when either is missing, usage help is printed and the process
 * exits with code 1.
 *
 * On an unexpected error the exit code is set to 1 via `process.exitCode`
 * rather than calling `process.exit(1)`: an immediate exit would terminate the
 * process before the `finally` block could disconnect Prisma.
 */
async function main() {
  const { country, region, dryRun } = parseArgs();

  if (!country || !region) {
    console.error('Error: Must specify both --country <CODE> and --region <NAME>');
    console.error('Usage:');
    console.error(' npx tsx scripts/import-osm-region.ts --country GB --region "England South"');
    console.error(' npx tsx scripts/import-osm-region.ts --country IT --region "North" --dry-run');
    console.error('\nAvailable countries:', Object.keys(COUNTRY_BOUNDING_BOXES).join(', '));
    // Nothing has been opened through Prisma yet in this function, so exiting immediately is fine here.
    process.exit(1);
  }

  if (dryRun) {
    console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
  }

  try {
    const stats = await importFromRegion(country, region, dryRun);
    printSummary(country, region, stats, dryRun);
  } catch (error) {
    console.error('Fatal error:', error);
    // Record the failure but let control reach `finally` so the connection is released.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}

main();

742
scripts/import-philmass.ts Normal file
View File

@@ -0,0 +1,742 @@
#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from Philmass.com
*
* Philmass.com provides rich Schema.org-annotated mass schedule data for
* Philippine churches. It has no coordinates, so we match against existing
* churches (OSM + mass-schedules.com) and only update matched records.
* Unmatched churches are logged for manual review.
*
* Discovery strategy:
* 1. Fetch Philippines page → extract province URLs
* 2. For each province → extract city listing URLs
* 3. For each city listing → extract church mass-schedule URLs
* 4. Deduplicate all church URLs globally
* 5. For each church: parse JSON-LD + Schema.org Events, match, upsert
*
* Usage:
* npx tsx scripts/import-philmass.ts --all
* npx tsx scripts/import-philmass.ts --all --dry-run
* npx tsx scripts/import-philmass.ts --province Metro-Manila
* npx tsx scripts/import-philmass.ts --all --resume-from Cebu
* npx tsx scripts/import-philmass.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL from the environment, falling back to a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the ":<password>@" part of the URL replaced by ":***@".
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// NOTE(review): TLS certificate verification is turned off whenever the URL contains "neon";
// for any other URL no ssl option is passed. Confirm this is intended.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma is driven through the pg pool via the driver adapter; the pool is ended separately at script exit.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Origin of the scraped site; the link-extraction regexes in this file hard-code the same host.
const SITE_BASE = 'https://www.philmass.com';
// Index page from which province pages are discovered.
const PHILIPPINES_URL = `${SITE_BASE}/Asia/Philippines.html`;
// Sent as the User-Agent header on every request made by fetchPage.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Pause inserted by fetchPage before every request except the first one (milliseconds).
const REQUEST_DELAY_MS = 2000;
// ─── Types ───────────────────────────────────────────────────────────────────
/** A province page discovered on the Philippines index page. */
interface ProvinceInfo {
// Path segment taken from the province URL (the part before ".html"), e.g. "Metro-Manila".
name: string;
// Absolute URL of the province page.
url: string;
}
/** A church mass-schedule page discovered on a city listing page. */
interface ChurchUrl {
// Absolute URL of the church's mass-schedule page; also the global deduplication key.
url: string;
slug: string; // URL slug used as philmassId
// Province path segment extracted from the church URL (not the province being crawled).
province: string;
// City path segment extracted from the church URL; may contain hyphens in place of spaces.
city: string;
}
/** Church details extracted from a page's JSON-LD `mainEntityOfPage` object. */
interface ParsedPhilmassChurch {
// NOTE(review): declared as string, but parseChurchJsonLd assigns `church.name || null`,
// so this can be null at runtime; callers fall back to the <h1> text.
name: string;
// `address.streetAddress` with a trailing comma removed, or null when absent.
streetAddress: string | null;
// `address.addressLocality`, or null when absent.
city: string | null;
// `address.addressRegion`, or null when absent.
region: string | null;
}
/** One recurring mass slot derived from a Schema.org `startDate` value. */
interface ParsedSchedule {
// Day index as returned by Date.getDay(): 0 = Sunday ... 6 = Saturday.
dayOfWeek: number;
// Time of day as "HH:MM", copied verbatim from the timestamp (no timezone conversion).
time: string;
}
/** Counters accumulated over one importer run and printed in the final summary. */
interface ImportStats {
// Provinces whose discovery completed without throwing.
provincesProcessed: number;
// City listing pages requested during discovery.
citiesProcessed: number;
// Number of unique church URLs found during discovery.
churchUrlsDiscovered: number;
// Calls to processChurch, regardless of outcome.
churchesProcessed: number;
// Churches matched to an existing database record.
churchesMatched: number;
// Churches with no match; these are logged for review and never created.
churchesUnmatched: number;
// Churches skipped because no name could be parsed or the link update hit a unique constraint.
churchesSkipped: number;
// Churches whose mass schedules were replaced.
schedulesUpdated: number;
// Schedule rows written; in dry-run mode, the number of schedules parsed instead.
massSchedulesCreated: number;
// Failed page fetches, failed schedule saves and other caught errors.
errors: number;
}
/** Parsed command-line options; either `all` or `province` must be supplied. */
interface CLIArgs {
// --all: process every discovered province.
all: boolean;
// --province <name>: restrict the run to the province with exactly this name.
province?: string;
// --dry-run: report what would happen without writing to the database.
dryRun: boolean;
// --resume-from <province>: skip provinces that come before this one.
resumeFrom?: string;
// --job-id <uuid>: backgroundJob row to update with status and result.
jobId?: string;
}
// ─── HTTP Client ─────────────────────────────────────────────────────────────
// Number of requests started through fetchPage. Used to skip the delay before the
// very first request and reported as "HTTP requests" in the final summary.
let requestCount = 0;
/** Return a promise that resolves (with no value) once `ms` milliseconds have elapsed. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Download a page as text, throttled to one request per REQUEST_DELAY_MS.
 *
 * Every call after the first waits REQUEST_DELAY_MS before issuing its request.
 *
 * @returns The response body, or null when the server answers with a non-2xx
 *          status or the request fails; both cases are logged, never thrown.
 */
async function fetchPage(url: string): Promise<string | null> {
  // No pause before the very first request of the run.
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  const requestHeaders = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const res = await fetch(url, { headers: requestHeaders });
    if (res.ok) {
      return await res.text();
    }
    console.error(` HTTP ${res.status} for ${url}`);
    return null;
  } catch (err) {
    const reason = err instanceof Error ? err.message : err;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
// ─── Discovery: Province → City → Church URLs ───────────────────────────────
/**
 * Collect the province pages linked from the Philippines index page.
 *
 * A link counts as a province when it points to
 * `/Asia/Philippines/{name}.html` and `name` contains neither "weekly" nor
 * "Roman-Catholic". Each name is returned once, in page order.
 *
 * @throws Error when the index page cannot be fetched.
 */
async function fetchProvinceUrls(): Promise<ProvinceInfo[]> {
  console.log(`Fetching Philippines page: ${PHILIPPINES_URL}`);
  const html = await fetchPage(PHILIPPINES_URL);
  if (!html) throw new Error('Failed to fetch Philippines page');

  // Pattern: href="https://www.philmass.com/Asia/Philippines/{Province}.html"
  const provinceLink = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\.html)"/g;

  const byName = new Map<string, ProvinceInfo>();
  for (const [, url, name] of html.matchAll(provinceLink)) {
    // Skip non-province pages (weekly-sunday, etc.) and names already collected.
    const isListingPage = name.includes('weekly') || name.includes('Roman-Catholic');
    if (isListingPage || byName.has(name)) continue;
    byName.set(name, { name, url });
  }
  return [...byName.values()];
}
/**
 * Decode the HTML entities that occur in scraped attribute values.
 *
 * Supported: decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric references
 * and the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`. Anything else is
 * left untouched.
 *
 * All entities are replaced in a single pass so already-decoded text is never
 * decoded a second time (`&amp;lt;` yields `&lt;`, not `<`). Numeric references
 * use `String.fromCodePoint`, so characters outside the Basic Multilingual
 * Plane are preserved.
 *
 * @param str Text that may contain HTML entities.
 * @returns The text with supported entities replaced by their characters.
 */
function decodeHtmlEntities(str: string): string {
  const named: Record<string, string> = { amp: '&', lt: '<', gt: '>', quot: '"' };

  return str.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot));/g,
    (entity: string, dec: string | undefined, hex: string | undefined, name: string | undefined) => {
      if (name !== undefined) {
        return named[name] ?? entity;
      }
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
      // String.fromCodePoint throws for values above U+10FFFF; keep such references as-is.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    },
  );
}
/**
 * Collect the city listing pages ("Roman-Catholic-Churches-in-...") linked from a province page.
 *
 * @param provinceUrl  URL of the province page to scan.
 * @param provinceName Not used by the implementation.
 * @returns Entity-decoded listing URLs, each once, in page order; empty when the page cannot be fetched.
 */
async function fetchCityListingUrls(provinceUrl: string, provinceName: string): Promise<string[]> {
  const html = await fetchPage(provinceUrl);
  if (!html) return [];

  // Pattern: href=".../{Province}/{City}/Roman-Catholic-Churches-in-{City}...html"
  const listingLink = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/[^"]*\/Roman-Catholic-Churches-in-[^"]*\.html)"/g;

  // A Set keeps first-seen order and drops repeated links.
  const listingUrls = new Set<string>();
  for (const hit of html.matchAll(listingLink)) {
    listingUrls.add(decodeHtmlEntities(hit[1]));
  }
  return [...listingUrls];
}
/**
 * Collect the church mass-schedule pages linked from a city listing page.
 *
 * Province, city and slug are taken from the path segments of each link, so
 * they describe where the church page lives rather than which listing linked
 * to it.
 *
 * @param cityUrl      URL of the city listing page to scan.
 * @param provinceName Not used by the implementation.
 * @returns One entry per distinct church URL, in page order; empty when the page cannot be fetched.
 */
async function fetchChurchUrlsFromCityPage(cityUrl: string, provinceName: string): Promise<ChurchUrl[]> {
  const html = await fetchPage(cityUrl);
  if (!html) return [];

  // Entity- and percent-decode one path segment. A malformed percent escape makes
  // decodeURIComponent throw URIError; fall back to the entity-decoded text so one
  // bad link cannot abort the whole page.
  const decodeSegment = (raw: string): string => {
    const text = decodeHtmlEntities(raw);
    try {
      return decodeURIComponent(text);
    } catch {
      return text;
    }
  };

  const churches: ChurchUrl[] = [];
  const seen = new Set<string>();

  // Pattern: href=".../Roman-Catholic-Churches/{Church-Name}/mass-schedule.html"
  // Segments exclude '"' as well as '/' so a match can never run past the end of its href attribute.
  const regex = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\/([^/"]+)\/Roman-Catholic-Churches\/([^/"]+)\/mass-schedule\.html)"/g;

  for (const match of html.matchAll(regex)) {
    const url = decodeHtmlEntities(match[1]);
    if (seen.has(url)) continue;
    seen.add(url);

    churches.push({
      url,
      slug: decodeSegment(match[4]),
      province: decodeSegment(match[2]),
      city: decodeSegment(match[3]),
    });
  }
  return churches;
}
// ─── HTML Parsers ────────────────────────────────────────────────────────────
/**
 * Read church details from the first JSON-LD script on the page.
 *
 * Only a `mainEntityOfPage` object whose `@type` is "PlaceOfWorship" is
 * accepted. Missing fields come back as null, and a trailing comma on the
 * street address is removed.
 *
 * @returns The parsed details, or null when there is no JSON-LD script, the
 *          JSON is invalid, or the entity is not a place of worship.
 */
function parseChurchJsonLd(html: string): ParsedPhilmassChurch | null {
  // Extract JSON-LD: <script type="application/ld+json">{...}</script>
  const scriptBody = /<script\s+type="application\/ld\+json"\s*>([\s\S]*?)<\/script>/i.exec(html)?.[1];
  if (scriptBody === undefined) return null;

  try {
    const entity = JSON.parse(scriptBody).mainEntityOfPage;
    const isPlaceOfWorship = Boolean(entity) && entity['@type'] === 'PlaceOfWorship';
    if (!isPlaceOfWorship) return null;

    const postal = entity.address || {};
    const name = entity.name || null;
    const streetAddress = postal.streetAddress?.replace(/,\s*$/, '').trim() || null;
    const city = postal.addressLocality || null;
    const region = postal.addressRegion || null;

    return { name, streetAddress, city, region };
  } catch {
    // Invalid JSON or unexpected value shapes are treated as "no data".
    return null;
  }
}
/**
 * Fallback name extraction from the page heading, e.g.
 * <h1>Quiapo Church mass schedule 2026 - Minor Basilica of the Black Nazarene</h1>
 *
 * The first "mass schedule YYYY" phrase (with its surrounding whitespace) is
 * removed, then a leading or trailing " - " separator is stripped.
 *
 * @returns The cleaned heading text, or null when there is no attribute-less
 *          <h1> element or nothing remains after cleaning.
 */
function parseChurchNameFromH1(html: string): string | null {
  const heading = /<h1>([^<]+)<\/h1>/i.exec(html);
  if (heading === null) return null;

  const cleaned = heading[1]
    .trim()
    .replace(/\s*mass\s+schedule\s+\d{4}\s*/i, '')
    .replace(/^\s*-\s*/, '')
    .replace(/\s*-\s*$/, '')
    .trim();

  return cleaned === '' ? null : cleaned;
}
/**
 * Derive weekly mass slots from Schema.org Event microdata.
 *
 * Every `itemprop="startDate" content="YYYY-MM-DDTHH:MM:SS..."` attribute
 * contributes one (weekday, "HH:MM") pair. The weekday comes from the calendar
 * date and the time is copied as written, so any UTC offset in the timestamp
 * is ignored. Repeated pairs are reported once, in order of first appearance.
 */
function parseScheduleFromStartDates(html: string): ParsedSchedule[] {
  // Pattern: itemprop="startDate" content="2026-02-22T05:00:00+08:00"
  const startDatePattern = /itemprop="startDate"\s+content="(\d{4}-\d{2}-\d{2})T(\d{2}):(\d{2}):\d{2}[^"]*"/g;

  const slots = new Map<string, ParsedSchedule>();
  for (const [, isoDate, hh, mm] of html.matchAll(startDatePattern)) {
    // Anchor at local noon so the weekday cannot shift across midnight.
    const dayOfWeek = new Date(`${isoDate}T12:00:00`).getDay(); // 0=Sun, 1=Mon, ..., 6=Sat
    const time = `${hh}:${mm}`;
    const slotKey = `${dayOfWeek}:${time}`;
    if (!slots.has(slotKey)) {
      slots.set(slotKey, { dayOfWeek, time });
    }
  }
  return [...slots.values()];
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Load every church stored with country 'PH' for in-memory matching.
 *
 * Only the columns listed in `select` are fetched; `country` itself is not
 * selected, so the optional `country` field of ExistingChurch is left unset
 * on the returned objects.
 */
async function loadExistingPhilippineChurches(): Promise<ExistingChurch[]> {
console.log('Loading existing Philippine churches for deduplication...');
const churches = await prisma.church.findMany({
where: { country: 'PH' },
// The selected columns mirror the ExistingChurch interface: identity, coordinates,
// one ID column per import source, and the contact fields used when filling gaps.
select: {
id: true,
name: true,
latitude: true,
longitude: true,
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing Philippine churches`);
return churches;
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Fetch one Philmass church page, match it to an existing church and, unless
 * running dry, link the record and replace its mass schedules.
 *
 * Matching order:
 *   1. an existing church whose philmassId equals the URL slug;
 *   2. name containment after normalization, searching churches whose address
 *      mentions the city first and all loaded churches second.
 * Churches without a match are appended to `unmatchedLog`; this function never
 * creates church rows.
 *
 * @param churchUrl        Page to process; `slug` becomes the stored philmassId.
 * @param existingChurches Candidate records to match against (read only here).
 * @param unmatchedLog     Receives one "name | city | url" line per unmatched church.
 * @param dryRun           When true the page is still fetched and matched, but the
 *                         function returns before any database access.
 * @param stats            Run counters, mutated in place.
 * @throws Re-throws errors from the link update other than unique-constraint
 *         violations, and errors from the city/state lookup.
 */
async function processChurch(
churchUrl: ChurchUrl,
existingChurches: ExistingChurch[],
unmatchedLog: string[],
dryRun: boolean,
stats: ImportStats,
): Promise<void> {
stats.churchesProcessed++;
const html = await fetchPage(churchUrl.url);
if (!html) {
// fetchPage has already logged the reason.
stats.errors++;
return;
}
// Parse church info from JSON-LD
const jsonLd = parseChurchJsonLd(html);
// Prefer the JSON-LD name; fall back to the page heading.
const churchName = jsonLd?.name || parseChurchNameFromH1(html);
if (!churchName) {
console.log(` Skipping ${churchUrl.slug}: no name found`);
stats.churchesSkipped++;
return;
}
// Parse schedules from Schema.org startDate attributes
const schedules = parseScheduleFromStartDates(html);
// Try to find a match by philmassId first
const existingByPhilmass = existingChurches.find((c) => c.philmassId === churchUrl.slug);
let matched = existingByPhilmass || null;
// If no philmassId match, try name-based matching against churches with coordinates
if (!matched) {
// Try matching by name similarity against all PH churches
// We can't use findDuplicateChurch() without coordinates, so do name-only matching
const normalizedName = churchName.toLowerCase()
.replace(/\bst\.\s/g, 'saint ')
.replace(/\bst\s/g, 'saint ')
.replace(/\bcatholic church\b/g, '')
.replace(/\bparish\b/g, '')
.replace(/\bchurch\b/g, '')
.replace(/[^\w\s]/g, '')
.replace(/\s+/g, ' ')
.trim();
// Filter to churches in the same city if possible
// URL city segments use hyphens where the name has spaces.
const cityName = jsonLd?.city || churchUrl.city.replace(/-/g, ' ');
const candidatesInCity = existingChurches.filter((c) => {
if (!c.address) return false;
return c.address.toLowerCase().includes(cityName.toLowerCase());
});
// Search in-city candidates first, then all PH churches
// (the second pool contains the in-city churches again; they simply fail to match a second time)
const searchPools = candidatesInCity.length > 0
? [candidatesInCity, existingChurches]
: [existingChurches];
for (const searchPool of searchPools) {
if (matched) break;
for (const existing of searchPool) {
// Same normalization as applied to the scraped name above.
const existingNorm = existing.name.toLowerCase()
.replace(/\bst\.\s/g, 'saint ')
.replace(/\bst\s/g, 'saint ')
.replace(/\bcatholic church\b/g, '')
.replace(/\bparish\b/g, '')
.replace(/\bchurch\b/g, '')
.replace(/[^\w\s]/g, '')
.replace(/\s+/g, ' ')
.trim();
// Require strong name match: one name contains the other, or very similar
// Guard against overly generic names ("chapel", "holy", etc.) by requiring
// that the shorter name is at least 8 chars after normalization
const shorter = normalizedName.length <= existingNorm.length ? normalizedName : existingNorm;
if (shorter.length >= 8) {
// The first containing candidate in iteration order wins; there is no scoring.
if (normalizedName.includes(existingNorm) || existingNorm.includes(normalizedName)) {
matched = existing;
break;
}
}
}
}
}
// Dry run: report the outcome and stop before touching the database.
if (dryRun) {
if (matched) {
stats.churchesMatched++;
console.log(` [MATCH] "${churchName}" → existing "${matched.name}" (${matched.id})`);
} else {
stats.churchesUnmatched++;
unmatchedLog.push(`${churchName} | ${jsonLd?.city || churchUrl.city} | ${churchUrl.url}`);
console.log(` [UNMATCHED] "${churchName}" in ${jsonLd?.city || churchUrl.city}`);
}
// NOTE(review): schedules are counted here even for unmatched churches, whereas a real
// run only writes schedules for matched ones.
if (schedules.length > 0) {
stats.massSchedulesCreated += schedules.length;
}
return;
}
if (!matched) {
stats.churchesUnmatched++;
unmatchedLog.push(`${churchName} | ${jsonLd?.city || churchUrl.city} | ${churchUrl.url}`);
return;
}
stats.churchesMatched++;
// Update existing church: set philmassId, fill missing fields
const updateData: Record<string, unknown> = {
philmassId: churchUrl.slug,
};
// Address is only filled when the existing record has none.
if (!matched.address && jsonLd?.streetAddress) {
const fullAddress = [jsonLd.streetAddress, jsonLd.city, jsonLd.region]
.filter(Boolean).join(', ');
updateData.address = fullAddress;
}
// Fill city/state from JSON-LD or URL
// city/state are not part of the in-memory record, so they are read from the database.
const dbRecord = await prisma.church.findUnique({
where: { id: matched.id },
select: { city: true, state: true },
});
if (dbRecord && !dbRecord.city && (jsonLd?.city || churchUrl.city)) {
updateData.city = jsonLd?.city || churchUrl.city.replace(/-/g, ' ');
}
if (dbRecord && !dbRecord.state && (jsonLd?.region || churchUrl.province)) {
updateData.state = jsonLd?.region || churchUrl.province.replace(/-/g, ' ');
}
try {
await prisma.church.update({
where: { id: matched.id },
data: updateData,
});
} catch (error) {
// Detected via the error message text — presumably another church already holds this
// philmassId (confirm against the schema). Such a church is counted as skipped, in
// addition to the churchesMatched increment above.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
return;
}
throw error;
}
// Replace mass schedules if we have any
if (schedules.length > 0) {
try {
// Delete, re-create and timestamp in one transaction so a failure leaves the old schedules in place.
await prisma.$transaction(async (tx) => {
await tx.massSchedule.deleteMany({ where: { churchId: matched!.id } });
await tx.massSchedule.createMany({
data: schedules.map((s) => ({
churchId: matched!.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
// The page language is not parsed; every schedule is stored as English.
language: 'English',
})),
});
await tx.church.update({
where: { id: matched!.id },
data: { lastScrapedAt: new Date() },
});
});
stats.schedulesUpdated++;
stats.massSchedulesCreated += schedules.length;
} catch (error) {
// The philmassId link written above is kept even when saving schedules fails.
stats.errors++;
console.error(` Error saving schedules for ${churchUrl.slug}: ${error instanceof Error ? error.message : error}`);
}
}
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse the command line (everything after the script path) into CLIArgs.
 *
 * Flags that take a value consume the following argument. Unknown arguments
 * are ignored. `--help` / `-h` prints usage and exits with code 0; when neither
 * `--all` nor `--province` is given, an error is printed and the process exits
 * with code 1.
 */
function parseArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const parsed: CLIArgs = {
    all: false,
    dryRun: false,
  };

  let pos = 0;
  while (pos < argv.length) {
    const flag = argv[pos];
    if (flag === '--all') {
      parsed.all = true;
    } else if (flag === '--province') {
      parsed.province = argv[++pos];
    } else if (flag === '--dry-run') {
      parsed.dryRun = true;
    } else if (flag === '--resume-from') {
      parsed.resumeFrom = argv[++pos];
    } else if (flag === '--job-id') {
      parsed.jobId = argv[++pos];
    } else if (flag === '--help' || flag === '-h') {
      console.log(`
Usage: npx tsx scripts/import-philmass.ts [options]
Options:
--all Import from all provinces
--province <name> Import from a single province (e.g. "Metro-Manila")
--dry-run No database writes, just report what would happen
--resume-from <province> Skip provinces until reaching this one
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-philmass.ts --province Metro-Manila --dry-run
npx tsx scripts/import-philmass.ts --all
npx tsx scripts/import-philmass.ts --all --resume-from Cebu
`);
      process.exit(0);
    }
    pos++;
  }

  // A run needs a scope: either everything or one named province.
  const hasScope = parsed.all || Boolean(parsed.province);
  if (!hasScope) {
    console.error('Error: specify --all or --province <name>');
    process.exit(1);
  }
  return parsed;
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", using the
 * largest non-zero unit as the leading component. Fractions of a second are
 * dropped.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const wholeHours = Math.floor(totalMinutes / 60);

  const minutePart = totalMinutes % 60;
  const secondPart = totalSeconds % 60;

  if (wholeHours > 0) {
    return `${wholeHours}h ${minutePart}m ${secondPart}s`;
  }
  if (totalMinutes > 0) {
    return `${minutePart}m ${secondPart}s`;
  }
  return `${totalSeconds}s`;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Entry point of the Philmass importer.
 *
 * Steps:
 *   1. Parse CLI options and, when --job-id is given, mark the background job as running.
 *   2. Load all existing Philippine churches for matching.
 *   3. Phase 1: crawl province pages -> city listing pages -> church URLs, deduplicated by URL.
 *   4. Phase 2: run processChurch for every discovered URL.
 *   5. Print the summary and the unmatched list, then store the result on the background job.
 *
 * Errors inside a single province or church are counted and the run continues.
 * NOTE(review): an error thrown outside those loops (e.g. the index page cannot be fetched)
 * leaves main before the final job update, so the job keeps the status 'running'.
 */
async function main() {
const args = parseArgs();
const startTime = Date.now();
console.log('\n' + '='.repeat(70));
console.log('PHILMASS.COM IMPORTER');
console.log('='.repeat(70));
console.log(`Mode: ${args.all ? 'All provinces' : `Single province: ${args.province}`}`);
console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
console.log(`Time: ${new Date().toISOString()}`);
console.log('='.repeat(70) + '\n');
// Update background job status if provided
// NOTE(review): this write (and the final job update) also happens in dry-run mode.
if (args.jobId) {
try {
await prisma.backgroundJob.update({
where: { id: args.jobId },
data: { status: 'running', startedAt: new Date() },
});
} catch {
// Job might not exist yet
}
}
// Load existing Philippine churches for dedup
const existingChurches = await loadExistingPhilippineChurches();
// ─── Phase 1: Discover all church URLs ───────────────────────────────────
console.log('=== Phase 1: Discovering church URLs ===\n');
const allProvinces = await fetchProvinceUrls();
console.log(`Found ${allProvinces.length} provinces\n`);
// Filter to requested provinces
// --province takes precedence over --all when both are given.
let provincesToProcess: ProvinceInfo[];
if (args.province) {
const found = allProvinces.find((p) => p.name === args.province);
if (!found) {
console.error(`Province "${args.province}" not found. Available: ${allProvinces.map((p) => p.name).join(', ')}`);
// NOTE(review): process.exit ends the process immediately, so the .finally() cleanup
// attached to main() below does not run on this path.
process.exit(1);
}
provincesToProcess = [found];
} else {
provincesToProcess = allProvinces;
}
// Handle --resume-from
if (args.resumeFrom) {
const idx = provincesToProcess.findIndex((p) => p.name === args.resumeFrom);
if (idx === -1) {
console.error(`Resume province "${args.resumeFrom}" not found.`);
process.exit(1);
}
console.log(`Resuming from province "${args.resumeFrom}" (skipping ${idx} provinces)\n`);
// Keep the named province and everything after it, in page order.
provincesToProcess = provincesToProcess.slice(idx);
}
// Collect all unique church URLs across all provinces/cities
const allChurchUrls = new Map<string, ChurchUrl>(); // keyed by URL to deduplicate
const stats: ImportStats = {
provincesProcessed: 0,
citiesProcessed: 0,
churchUrlsDiscovered: 0,
churchesProcessed: 0,
churchesMatched: 0,
churchesUnmatched: 0,
churchesSkipped: 0,
schedulesUpdated: 0,
massSchedulesCreated: 0,
errors: 0,
};
for (let pi = 0; pi < provincesToProcess.length; pi++) {
const province = provincesToProcess[pi];
const elapsed = formatDuration(Date.now() - startTime);
console.log(`[${pi + 1}/${provincesToProcess.length}] Province: ${province.name} [${elapsed} elapsed]`);
try {
// Get city listing URLs from province page
const cityUrls = await fetchCityListingUrls(province.url, province.name);
console.log(` Found ${cityUrls.length} city listing pages`);
for (const cityUrl of cityUrls) {
const churchUrls = await fetchChurchUrlsFromCityPage(cityUrl, province.name);
stats.citiesProcessed++;
// First occurrence of a URL wins; later duplicates are ignored.
for (const church of churchUrls) {
if (!allChurchUrls.has(church.url)) {
allChurchUrls.set(church.url, church);
}
}
}
stats.provincesProcessed++;
console.log(` Total unique churches so far: ${allChurchUrls.size}`);
} catch (error) {
// A failing province is counted and skipped; discovery continues with the next one.
stats.errors++;
console.error(` ERROR discovering ${province.name}: ${error instanceof Error ? error.message : error}`);
}
}
stats.churchUrlsDiscovered = allChurchUrls.size;
console.log(`\nDiscovery complete: ${allChurchUrls.size} unique church URLs across ${stats.citiesProcessed} city pages\n`);
// ─── Phase 2: Process each church ─────────────────────────────────────────
console.log('=== Phase 2: Processing churches ===\n');
const churchList = [...allChurchUrls.values()];
const unmatchedLog: string[] = [];
for (let i = 0; i < churchList.length; i++) {
const church = churchList[i];
const elapsed = formatDuration(Date.now() - startTime);
// Progress line for the first church and then every 50th.
if ((i + 1) % 50 === 0 || i === 0) {
console.log(`[${i + 1}/${churchList.length}] Processing churches... [${elapsed} elapsed]`);
}
try {
await processChurch(church, existingChurches, unmatchedLog, args.dryRun, stats);
} catch (error) {
// Errors re-thrown by processChurch are counted here so one church cannot stop the run.
stats.errors++;
console.error(` ERROR processing ${church.slug}: ${error instanceof Error ? error.message : error}`);
}
}
// Print summary
const totalTime = Date.now() - startTime;
console.log('\n' + '='.repeat(70));
console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
console.log('='.repeat(70));
console.log(`Provinces processed: ${stats.provincesProcessed}`);
console.log(`Cities processed: ${stats.citiesProcessed}`);
console.log(`Church URLs discovered: ${stats.churchUrlsDiscovered}`);
console.log(`Churches processed: ${stats.churchesProcessed}`);
console.log(` Matched (updated): ${stats.churchesMatched}`);
console.log(` Unmatched (skipped): ${stats.churchesUnmatched}`);
console.log(` Skipped (other): ${stats.churchesSkipped}`);
console.log(`Schedules updated: ${stats.schedulesUpdated}`);
console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
console.log(`Errors: ${stats.errors}`);
console.log(`Total time: ${formatDuration(totalTime)}`);
console.log(`HTTP requests: ${requestCount}`);
console.log('='.repeat(70));
// Log unmatched churches for manual review
if (unmatchedLog.length > 0) {
console.log(`\nUnmatched churches (${unmatchedLog.length}):`);
console.log('-'.repeat(70));
for (const line of unmatchedLog) {
console.log(` ${line}`);
}
console.log('-'.repeat(70));
}
console.log('');
// Update background job
if (args.jobId) {
try {
await prisma.backgroundJob.update({
where: { id: args.jobId },
data: {
// Any counted error downgrades the final status.
status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
completedAt: new Date(),
result: JSON.stringify(stats),
},
});
} catch {
// Ignore
}
}
}
// Run the importer. An error escaping main() is logged and the process exits with code 1;
// on normal completion the Prisma client is disconnected and the pg pool is ended.
main()
.catch((error) => {
console.error('Fatal error:', error);
process.exit(1);
})
.finally(async () => {
await prisma.$disconnect();
await pool.end();
});

File diff suppressed because it is too large Load Diff

396
src/lib/church-matcher.ts Normal file
View File

@@ -0,0 +1,396 @@
/**
* Church matching and deduplication logic
* Used to avoid duplicate churches when importing from multiple sources (OSM, MassTimes, etc.)
*/
import { calculateDistance } from './geo';
import type { OSMChurch } from './overpass-client';
import type { BaiduChurch } from './baidu-client';
/**
 * A church row as loaded from the database for deduplication.
 *
 * Importers select exactly these columns and also push objects of this shape
 * into their in-memory list after inserting a new church.
 */
export interface ExistingChurch {
// Database primary key.
id: string;
name: string;
latitude: number;
longitude: number;
// One nullable ID column per import source. findDuplicateChurch compares a
// candidate's source ID against the matching column here.
osmId: string | null;
baiduId: string | null;
masstimesId: string | null;
orarimesseId: string | null;
massSchedulesPhId: string | null;
philmassId: string | null;
horariosMisasId: string | null;
mszeInfoId: string | null;
weekdayMassesId: string | null;
messesInfoId: string | null;
bohosluzbyId: string | null;
miserendId: string | null;
kerknetId: string | null;
gottesdienstzeitenId: string | null;
discovermassId: string | null;
// Name of the source that created the row (e.g. 'osm').
source: string;
website: string | null;
phone: string | null;
address: string | null;
// Optional: callers may omit this column from their select.
country?: string;
}
// Maximum distance in km to consider churches as potential duplicates.
// NOTE(review): presumably consumed by the proximity pass of findDuplicateChurch — confirm at the usage site.
const DUPLICATE_DISTANCE_KM = 0.2; // 200 meters
/**
 * Normalize a church name for comparison.
 *
 * Steps, in order:
 * - lowercase
 * - expand "st." / "st" (followed by whitespace) to "saint"
 * - remove the phrases "roman catholic", "catholic church" and the words "parish", "church"
 * - remove punctuation
 * - collapse runs of whitespace and trim
 *
 * "roman catholic" is removed before "catholic church": in the opposite order
 * "Roman Catholic Church" would lose "catholic church" first and leave a stray
 * "roman" in the result.
 *
 * Note: the punctuation step keeps only ASCII letters, digits, underscore and
 * whitespace, so accented and non-Latin characters are dropped as well.
 *
 * @param name Raw church name from any source.
 * @returns The normalized name; may be empty.
 */
function normalizeName(name: string): string {
  return name
    .toLowerCase()
    .replace(/\bst\.\s/g, 'saint ')
    .replace(/\bst\s/g, 'saint ')
    .replace(/\broman catholic\b/g, '') // must run before "catholic church"
    .replace(/\bcatholic church\b/g, '')
    .replace(/\bparish\b/g, '')
    .replace(/\bchurch\b/g, '')
    .replace(/[^\w\s]/g, '') // Remove punctuation
    .replace(/\s+/g, ' ') // Normalize whitespace
    .trim();
}
/**
 * Levenshtein (edit) distance between two strings: the minimum number of
 * single-character insertions, deletions and substitutions that turn one into
 * the other. Used for fuzzy name matching.
 *
 * Only the previous and the current row of the distance table are kept.
 */
function levenshteinDistance(a: string, b: string): number {
  // Row 0: distance from the empty prefix of `b` to each prefix of `a`.
  let previousRow: number[] = Array.from({ length: a.length + 1 }, (_, col) => col);

  for (let row = 1; row <= b.length; row++) {
    // Column 0: distance from this prefix of `b` to the empty prefix of `a`.
    const currentRow: number[] = [row];
    const bChar = b.charAt(row - 1);

    for (let col = 1; col <= a.length; col++) {
      if (bChar === a.charAt(col - 1)) {
        // Matching characters cost nothing extra.
        currentRow[col] = previousRow[col - 1];
      } else {
        const substitution = previousRow[col - 1] + 1;
        const insertion = currentRow[col - 1] + 1;
        const deletion = previousRow[col] + 1;
        currentRow[col] = Math.min(substitution, insertion, deletion);
      }
    }
    previousRow = currentRow;
  }

  return previousRow[a.length];
}
/**
 * Decide whether two church names are close enough to be the same church.
 *
 * Both names are normalized first. They count as similar when either
 * - both normalized names have at least 5 characters and some 5-character
 *   window of the first occurs in the second, or
 * - their Levenshtein distance is below 5.
 */
function namesAreSimilar(name1: string, name2: string): boolean {
  const first = normalizeName(name1);
  const second = normalizeName(name2);
  const windowSize = 5;

  // Shared-substring test, skipped when either name is shorter than the window.
  if (Math.min(first.length, second.length) >= windowSize) {
    for (let start = 0; start + windowSize <= first.length; start++) {
      if (second.includes(first.slice(start, start + windowSize))) {
        return true;
      }
    }
  }

  // Fall back to edit distance.
  return levenshteinDistance(first, second) < 5;
}
/**
 * Candidate type for deduplication — works with OSM, Baidu, or any source.
 *
 * A candidate carries its name and coordinates plus, optionally, the ID it has
 * in the source it came from. findDuplicateChurch only runs an ID pass for
 * fields that are set on the candidate.
 */
export type ChurchCandidate = {
name: string;
// Coordinates use `lat`/`lng` here, while ExistingChurch uses `latitude`/`longitude`.
lat: number;
lng: number;
// Source-specific IDs; each corresponds to the same-named column on ExistingChurch.
osmId?: string;
baiduId?: string;
orarimesseId?: string;
massSchedulesPhId?: string;
philmassId?: string;
horariosMisasId?: string;
mszeInfoId?: string;
weekdayMassesId?: string;
messesInfoId?: string;
bohosluzbyId?: string;
miserendId?: string;
kerknetId?: string;
gottesdienstzeitenId?: string;
discovermassId?: string;
};
/**
* Find duplicate church in existing database
* Returns the best match or null if no duplicate found
*
* Matching strategy (in priority order):
* 1. Exact osmId match
* 2. Exact baiduId match
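* 3+. Exact importer ID matches, one pass per source in this order: orarimesse, massSchedulesPh, philmass, horariosMisas, mszeInfo, weekdayMasses, messesInfo, bohosluzby, miserend, kerknet, gottesdienstzeiten, then any remaining importer IDs (e.g. discovermass)
* Last. Proximity + name similarity (within 200m + similar name)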
*/
export function findDuplicateChurch(
candidate: ChurchCandidate,
existingChurches: ExistingChurch[]
): ExistingChurch | null {
// First pass: exact osmId match
if (candidate.osmId) {
const osmMatch = existingChurches.find((church) => church.osmId === candidate.osmId);
if (osmMatch) {
return osmMatch;
}
}
// Second pass: exact baiduId match
if (candidate.baiduId) {
const baiduMatch = existingChurches.find((church) => church.baiduId === candidate.baiduId);
if (baiduMatch) {
return baiduMatch;
}
}
// Third pass: exact orarimesseId match
if (candidate.orarimesseId) {
const orarimesseMatch = existingChurches.find(
(church) => church.orarimesseId === candidate.orarimesseId
);
if (orarimesseMatch) return orarimesseMatch;
}
// Fourth pass: exact massSchedulesPhId match
if (candidate.massSchedulesPhId) {
const msphMatch = existingChurches.find(
(church) => church.massSchedulesPhId === candidate.massSchedulesPhId
);
if (msphMatch) return msphMatch;
}
// Fifth pass: exact philmassId match
if (candidate.philmassId) {
const philmassMatch = existingChurches.find(
(church) => church.philmassId === candidate.philmassId
);
if (philmassMatch) return philmassMatch;
}
// Sixth pass: exact horariosMisasId match
if (candidate.horariosMisasId) {
const horariosMisasMatch = existingChurches.find(
(church) => church.horariosMisasId === candidate.horariosMisasId
);
if (horariosMisasMatch) return horariosMisasMatch;
}
// Seventh pass: exact mszeInfoId match
if (candidate.mszeInfoId) {
const mszeInfoMatch = existingChurches.find(
(church) => church.mszeInfoId === candidate.mszeInfoId
);
if (mszeInfoMatch) return mszeInfoMatch;
}
// Eighth pass: exact weekdayMassesId match
if (candidate.weekdayMassesId) {
const weekdayMassesMatch = existingChurches.find(
(church) => church.weekdayMassesId === candidate.weekdayMassesId
);
if (weekdayMassesMatch) return weekdayMassesMatch;
}
// Ninth pass: exact messesInfoId match
if (candidate.messesInfoId) {
const messesInfoMatch = existingChurches.find(
(church) => church.messesInfoId === candidate.messesInfoId
);
if (messesInfoMatch) return messesInfoMatch;
}
// Tenth pass: exact bohosluzbyId match
if (candidate.bohosluzbyId) {
const bohosluzbyMatch = existingChurches.find(
(church) => church.bohosluzbyId === candidate.bohosluzbyId
);
if (bohosluzbyMatch) return bohosluzbyMatch;
}
// Eleventh pass: exact miserendId match
if (candidate.miserendId) {
const miserendMatch = existingChurches.find(
(church) => church.miserendId === candidate.miserendId
);
if (miserendMatch) return miserendMatch;
}
// Twelfth pass: exact kerknetId match
if (candidate.kerknetId) {
const kerknetMatch = existingChurches.find(
(church) => church.kerknetId === candidate.kerknetId
);
if (kerknetMatch) return kerknetMatch;
}
// Thirteenth pass: exact gottesdienstzeitenId match
if (candidate.gottesdienstzeitenId) {
const gdzMatch = existingChurches.find(
(church) => church.gottesdienstzeitenId === candidate.gottesdienstzeitenId
);
if (gdzMatch) return gdzMatch;
}
// Fourteenth pass: exact discovermassId match
if (candidate.discovermassId) {
const match = existingChurches.find(c => c.discovermassId === candidate.discovermassId);
if (match) return match;
}
// Fifteenth pass: proximity + name match (skip if candidate has no real coordinates)
if (candidate.lat === 0 && candidate.lng === 0) {
return null;
}
const nearbyChurches = existingChurches.filter((church) => {
const distance = calculateDistance(
{ lat: candidate.lat, lng: candidate.lng },
{ lat: church.latitude, lng: church.longitude }
);
return distance <= DUPLICATE_DISTANCE_KM;
});
if (nearbyChurches.length === 0) {
return null;
}
// Among nearby churches, find one with similar name
for (const church of nearbyChurches) {
if (namesAreSimilar(candidate.name, church.name)) {
return church;
}
}
return null;
}
/**
 * Build the update payload for merging OSM data into an existing church record.
 * The existing record is not mutated; only the fields to write are returned.
 *
 * Rules:
 * - Never touched: name, massSchedules, scraperConfig (they are simply not part
 *   of the returned payload).
 * - Always set: osmId, osmLastSyncedAt, hasWebsite.
 * - hasWebsite reflects the website the record will have after the merge, i.e.
 *   it is true when either the existing record or OSM provides one. (Deriving it
 *   from OSM alone would flag a church as website-less even though its existing
 *   website is kept.)
 * - address, phone, website: taken from OSM only when the existing field is
 *   empty and OSM has a value.
 * - latitude/longitude: replaced by the OSM coordinates only when
 *   `calculateDistance` reports a difference greater than 0.05 (50 m, assuming
 *   the helper returns kilometres).
 * - source: switched to "osm" when it is currently "manual".
 *
 * @param existing - Church as currently stored.
 * @param osmData - Matching church from OSM.
 * @returns Partial record containing only the fields that should be written.
 */
export function mergeChurchData(
  existing: ExistingChurch,
  osmData: OSMChurch
): Partial<ExistingChurch> & { osmId: string; osmLastSyncedAt: Date; hasWebsite: boolean } {
  const merged: Partial<ExistingChurch> & {
    osmId: string;
    osmLastSyncedAt: Date;
    hasWebsite: boolean;
  } = {
    osmId: osmData.osmId,
    osmLastSyncedAt: new Date(),
    // An existing website is preserved below, so it must count here too.
    hasWebsite: Boolean(existing.website || osmData.website),
  };

  // Only move the pin when the two sources disagree noticeably.
  const coordDistance = calculateDistance(
    { lat: existing.latitude, lng: existing.longitude },
    { lat: osmData.lat, lng: osmData.lng }
  );
  if (coordDistance > 0.05) {
    merged.latitude = osmData.lat;
    merged.longitude = osmData.lng;
  }

  // Fill gaps only; existing contact data always wins.
  if (!existing.address && osmData.address) {
    merged.address = osmData.address;
  }
  if (!existing.phone && osmData.phone) {
    merged.phone = osmData.phone;
  }
  if (!existing.website && osmData.website) {
    merged.website = osmData.website;
  }

  // A manually entered church that OSM also knows about becomes OSM-sourced.
  if (existing.source === 'manual') {
    merged.source = 'osm';
  }

  return merged;
}
/**
 * Build the update payload for merging Baidu Maps data into an existing church
 * record. Similar to `mergeChurchData`, but for the Baidu source. The existing
 * record is not mutated; only the fields to write are returned.
 *
 * Rules:
 * - Always set: baiduId, baiduLastSyncedAt.
 * - address, phone, city, state: taken from Baidu only when the existing field
 *   is empty and Baidu has a value (Baidu's `province` maps to `state`).
 * - latitude/longitude: replaced by the Baidu coordinates only when
 *   `calculateDistance` reports a difference greater than 0.05 (50 m, assuming
 *   the helper returns kilometres) AND the existing record has no osmId, because
 *   OSM coordinates are considered more reliable.
 *
 * @param existing - Church as currently stored.
 * @param baiduData - Matching church from Baidu Maps.
 * @returns Plain object containing only the fields that should be written.
 */
export function mergeBaiduData(
  existing: ExistingChurch,
  baiduData: BaiduChurch
): Record<string, unknown> {
  const merged: Record<string, unknown> = {
    baiduId: baiduData.baiduId,
    baiduLastSyncedAt: new Date(),
  };

  // Only move the pin when the sources disagree noticeably and there are no
  // OSM coordinates to trust instead.
  const coordDistance = calculateDistance(
    { lat: existing.latitude, lng: existing.longitude },
    { lat: baiduData.lat, lng: baiduData.lng }
  );
  if (coordDistance > 0.05 && !existing.osmId) {
    merged.latitude = baiduData.lat;
    merged.longitude = baiduData.lng;
  }

  // Fill gaps only; existing data always wins.
  if (!existing.address && baiduData.address) {
    merged.address = baiduData.address;
  }
  if (!existing.phone && baiduData.phone) {
    merged.phone = baiduData.phone;
  }

  // The guard must look at the EXISTING record. Checking `merged.city` (as this
  // code used to) is a no-op because `merged` never has a city at this point,
  // so an already-populated city/state was overwritten on every sync.
  // NOTE(review): ExistingChurch is declared outside this view; the widened
  // type below keeps this compiling whether or not it declares city/state.
  // If the loader does not select those columns they read as undefined and the
  // Baidu value is used, which matches the previous behaviour.
  const { city: existingCity, state: existingState } = existing as ExistingChurch & {
    city?: string | null;
    state?: string | null;
  };
  if (!existingCity && baiduData.city) {
    merged.city = baiduData.city;
  }
  if (!existingState && baiduData.province) {
    merged.state = baiduData.province;
  }

  return merged;
}