feat: add discovermassId to church-matcher ExistingChurch and ChurchCandidate
Add discovermassId field to ExistingChurch interface and ChurchCandidate type, insert a dedicated matching pass in findDuplicateChurch, and update all 15 importer push blocks plus 16 loadExistingChurches select queries to include the new field. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
325
scripts/import-baidu-churches.ts
Normal file
325
scripts/import-baidu-churches.ts
Normal file
@@ -0,0 +1,325 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches from Baidu Maps (China)
|
||||
* Usage:
|
||||
* npx tsx scripts/import-baidu-churches.ts
|
||||
* npx tsx scripts/import-baidu-churches.ts --dry-run
|
||||
* npx tsx scripts/import-baidu-churches.ts --resume-from-cell 100
|
||||
* npx tsx scripts/import-baidu-churches.ts --job-id <uuid>
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
// Load environment files in priority order. dotenv's default is to leave
// already-defined variables untouched, so `.env.local` is read before `.env`.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Connection string: DATABASE_URL when set (and non-empty), otherwise the local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Log the target with the password portion of the URL masked out.
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);

// Connection strings containing "neon" use TLS without certificate verification;
// every other target leaves `ssl` undefined.
// NOTE(review): `rejectUnauthorized: false` disables certificate validation — confirm this is intended.
const sslOptions = dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined;
const pool = new Pool({ connectionString: dbUrl, ssl: sslOptions });

// Prisma runs on top of the pg pool through the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { queryBaiduByGrid, type BaiduChurch } from '../src/lib/baidu-client';
|
||||
import { findDuplicateChurch, mergeBaiduData, type ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
/** Counters collected during one Baidu import run. */
interface ImportStats {
  /** Number of churches returned by the Baidu grid query. */
  baiduChurchesFound: number;
  /** Churches that had no match and were inserted as new rows. */
  newChurchesInserted: number;
  /** Matched churches that already carried the same baiduId. */
  existingUpdated: number;
  /** Matched churches that did not yet carry this baiduId. */
  existingLinked: number;
  /** Churches whose processing threw an error. */
  errors: number;
}
|
||||
|
||||
/**
 * Parse the command-line flags of the Baidu importer.
 *
 * Recognised flags:
 *   --dry-run                 report only, no database writes
 *   --resume-from-cell <n>    grid cell index to resume from (non-negative integer)
 *   --job-id <uuid>           background job row to track progress in
 *
 * Unknown flags, and value flags given without a value, are ignored.
 *
 * @returns The parsed options; `resumeFromCell` defaults to 0 and `jobId` to undefined.
 * @throws Error when `--resume-from-cell` is not a non-negative integer. Without this
 *   check the value would become `NaN` and be handed to the grid query as-is.
 */
function parseArgs(): { dryRun: boolean; resumeFromCell: number; jobId?: string } {
  const args = process.argv.slice(2);
  const result = {
    dryRun: false,
    resumeFromCell: 0,
    jobId: undefined as string | undefined,
  };

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    const value = args[i + 1];

    if (flag === '--dry-run') {
      result.dryRun = true;
    } else if (flag === '--resume-from-cell' && value) {
      // Digits only: rejects "abc", "-3", "1.5" and partially numeric input such as "12abc".
      if (!/^\d+$/.test(value)) {
        throw new Error(`Invalid value for --resume-from-cell: "${value}" (expected a non-negative integer)`);
      }
      result.resumeFromCell = Number.parseInt(value, 10);
      i++; // consume the value
    } else if (flag === '--job-id' && value) {
      result.jobId = value;
      i++; // consume the value
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Mark an existing background job as running.
 *
 * No job row is created here: when no `jobId` is supplied nothing is written
 * and `null` is returned.
 *
 * @param jobId - ID of the background job row to resume, if any.
 * @returns `jobId` once the row has been updated, or `null` when none was given.
 */
async function createOrResumeJob(jobId?: string): Promise<string | null> {
  if (!jobId) {
    return null;
  }

  const startedAt = new Date();
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Record the final state of a background job.
 *
 * The job is stored as 'failed' when a non-empty `error` is given and as
 * 'completed' otherwise. A failure to write the job row is logged, not thrown.
 *
 * @param jobId - Job row to update; nothing happens when it is null or empty.
 * @param error - Optional error description to store on the job.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (!jobId) return;

  const finalState = {
    status: error ? 'failed' : 'completed',
    error: error || null,
    completedAt: new Date(),
  };

  try {
    await prisma.backgroundJob.update({ where: { id: jobId }, data: finalState });
  } catch (updateError) {
    console.error(`Failed to update job ${jobId}:`, updateError);
  }
}
|
||||
|
||||
/**
 * Write the current progress counters to the background job row.
 *
 * A failed write is logged and otherwise ignored, so this never rejects
 * because of the database update.
 *
 * @param jobId - Job row to update; nothing happens when it is null or empty.
 * @param stats - Counters collected so far.
 * @param totalCells - Value stored as the job's `totalItems`.
 * @param currentCell - Value stored as the job's `processed`.
 */
async function updateJobProgress(jobId: string | null, stats: ImportStats, totalCells: number, currentCell: number): Promise<void> {
  if (!jobId) return;

  const succeeded = stats.newChurchesInserted + stats.existingUpdated + stats.existingLinked;
  const progress = {
    totalItems: totalCells,
    processed: currentCell,
    succeeded,
    failed: stats.errors,
    itemsFound: stats.baiduChurchesFound,
  };

  try {
    await prisma.backgroundJob.update({ where: { id: jobId }, data: progress });
  } catch (updateError) {
    // Progress reporting is best-effort: log and carry on.
    console.error(`Failed to update job progress:`, updateError);
  }
}
|
||||
|
||||
/**
 * Build the in-memory dedup record for a church that was just inserted from Baidu,
 * so later candidates in the same run can match against it.
 */
function toExistingChurchRecord(id: ExistingChurch['id'], baiduChurch: BaiduChurch): ExistingChurch {
  return {
    id,
    name: baiduChurch.name,
    latitude: baiduChurch.lat,
    longitude: baiduChurch.lng,
    osmId: null,
    baiduId: baiduChurch.baiduId,
    masstimesId: null,
    orarimesseId: null,
    massSchedulesPhId: null,
    philmassId: null,
    horariosMisasId: null,
    mszeInfoId: null,
    weekdayMassesId: null,
    messesInfoId: null,
    bohosluzbyId: null,
    miserendId: null,
    kerknetId: null,
    gottesdienstzeitenId: null,
    discovermassId: null,
    source: 'baidu',
    website: baiduChurch.website || null,
    phone: baiduChurch.phone || null,
    address: baiduChurch.address || null,
  };
}

/**
 * Query Baidu Maps for churches and merge the results into the database.
 *
 * Steps: (1) query the Baidu grid, (2) load existing churches with country 'CN',
 * (3) for each Baidu church either update the matched existing row or insert a new one.
 * In dry-run mode only step 1 runs and the first 20 results are printed.
 *
 * @param dryRun - When true, nothing is written to the church table.
 * @param resumeFromCell - Passed through to the grid query as the resume position.
 * @param jobId - Background job row to report progress to, or null for none.
 * @returns The counters collected during the run.
 * @throws Error when BAIDU_MAPS_API_KEY is not set; errors from the grid query and
 *   from loading existing churches propagate. Errors on individual churches are
 *   counted in `stats.errors` instead.
 */
async function importFromBaidu(dryRun: boolean, resumeFromCell: number, jobId: string | null): Promise<ImportStats> {
  const stats: ImportStats = {
    baiduChurchesFound: 0,
    newChurchesInserted: 0,
    existingUpdated: 0,
    existingLinked: 0,
    errors: 0,
  };

  const apiKey = process.env.BAIDU_MAPS_API_KEY;
  if (!apiKey) {
    throw new Error('Missing BAIDU_MAPS_API_KEY environment variable');
  }

  console.log(`\n${'='.repeat(60)}`);
  console.log(`Importing Catholic churches from Baidu Maps (China)`);
  console.log(`${'='.repeat(60)}\n`);

  // Step 1: Query Baidu API
  console.log('Step 1: Querying Baidu Maps API...');
  const baiduChurches = await queryBaiduByGrid(
    apiKey,
    (progress) => {
      // Deliberately not awaited: updateJobProgress logs its own failures,
      // and the callback should not hold up the grid query.
      void updateJobProgress(jobId, stats, progress.totalCells, progress.cellIndex);
    },
    resumeFromCell,
  );

  stats.baiduChurchesFound = baiduChurches.length;
  console.log(`\nFound ${baiduChurches.length} churches from Baidu Maps`);

  if (baiduChurches.length === 0) {
    console.log('No churches found');
    return stats;
  }

  if (dryRun) {
    console.log('\n[DRY RUN] Would import the following churches:');
    for (const church of baiduChurches.slice(0, 20)) {
      console.log(` - ${church.name} (${church.city || church.province || 'unknown'})`);
      console.log(` Baidu ID: ${church.baiduId}, Coords: ${church.lat.toFixed(4)}, ${church.lng.toFixed(4)}`);
    }
    if (baiduChurches.length > 20) {
      console.log(` ... and ${baiduChurches.length - 20} more`);
    }
    return stats;
  }

  // Step 2: Load existing churches in China for deduplication
  console.log('\nStep 2: Loading existing churches in China for deduplication...');
  const existingChurches: ExistingChurch[] = await prisma.church.findMany({
    where: { country: 'CN' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });
  console.log(`Found ${existingChurches.length} existing churches in China`);

  // Step 3: Process each Baidu church
  console.log('\nStep 3: Processing churches...');
  let processed = 0;

  for (const baiduChurch of baiduChurches) {
    try {
      const candidate = {
        name: baiduChurch.name,
        lat: baiduChurch.lat,
        lng: baiduChurch.lng,
        baiduId: baiduChurch.baiduId,
      };

      const duplicate = findDuplicateChurch(candidate, existingChurches);

      if (duplicate) {
        // Same write either way; only the counter differs. A match that already
        // carries this baiduId is an update, any other match is a new link.
        const alreadyLinked = duplicate.baiduId === baiduChurch.baiduId;
        const mergedData = mergeBaiduData(duplicate, baiduChurch);
        await prisma.church.update({
          where: { id: duplicate.id },
          data: mergedData,
        });
        if (alreadyLinked) {
          stats.existingUpdated++;
        } else {
          stats.existingLinked++;
        }
      } else {
        // New church — insert it
        const newChurch = await prisma.church.create({
          data: {
            name: baiduChurch.name,
            latitude: baiduChurch.lat,
            longitude: baiduChurch.lng,
            address: baiduChurch.address,
            city: baiduChurch.city,
            state: baiduChurch.province,
            country: 'CN',
            phone: baiduChurch.phone,
            website: baiduChurch.website,
            source: 'baidu',
            baiduId: baiduChurch.baiduId,
            baiduLastSyncedAt: new Date(),
            hasWebsite: !!baiduChurch.website,
          },
        });
        stats.newChurchesInserted++;

        // Add to existing churches list for dedup within this run
        existingChurches.push(toExistingChurchRecord(newChurch.id, baiduChurch));
      }
    } catch (error) {
      console.error(`Error processing church ${baiduChurch.name} (${baiduChurch.baiduId}):`, error);
      stats.errors++;
    }

    // Count every attempted church (failures included) so progress reaches the total.
    processed++;
    if (processed % 500 === 0) {
      console.log(`Progress: ${processed}/${baiduChurches.length} churches processed`);
      await updateJobProgress(jobId, stats, baiduChurches.length, processed);
    }
  }

  // Flush the final counters; otherwise the job row keeps the last multiple-of-500 checkpoint.
  await updateJobProgress(jobId, stats, baiduChurches.length, processed);

  console.log(`\nProcessed all ${baiduChurches.length} churches`);
  return stats;
}
|
||||
|
||||
/**
 * Print the end-of-run summary to stdout.
 *
 * Insert/update/link counts and the error count are printed only for real
 * runs; the error line additionally requires at least one error.
 *
 * @param stats - Counters collected during the import.
 * @param dryRun - Whether the run was a dry run.
 */
function printSummary(stats: ImportStats, dryRun: boolean) {
  const rule = '='.repeat(60);
  const lines: string[] = [
    `\n${rule}`,
    `Baidu Import Summary ${dryRun ? '(DRY RUN)' : ''}`,
    `${rule}`,
    `Baidu churches found: ${stats.baiduChurchesFound}`,
  ];

  if (!dryRun) {
    lines.push(
      `New churches inserted: ${stats.newChurchesInserted}`,
      `Existing churches updated: ${stats.existingUpdated} (matched by baiduId)`,
      `Existing churches linked: ${stats.existingLinked} (matched by proximity)`,
    );
    if (stats.errors > 0) {
      lines.push(`Errors encountered: ${stats.errors}`);
    }
  }

  lines.push(`${rule}\n`);

  for (const line of lines) {
    console.log(line);
  }
}
|
||||
|
||||
/**
 * Script entry point: parse flags, mark the job as running, run the import,
 * print the summary and record the job result.
 *
 * Every failure — including bad flags and a job row that cannot be updated —
 * is handled in one place: it is logged, stored on the job when one is known,
 * and reported through a non-zero exit code. The exit code is set via
 * `process.exitCode` rather than `process.exit()` so that the `finally` block
 * still runs and the database connections are released.
 */
async function main() {
  let jobId: string | null = null;

  try {
    const { dryRun, resumeFromCell, jobId: argJobId } = parseArgs();
    jobId = await createOrResumeJob(argJobId);

    if (dryRun) {
      console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
    }

    const stats = await importFromBaidu(dryRun, resumeFromCell, jobId);
    printSummary(stats, dryRun);
    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    // The pool was created by this script, so it is closed here as well;
    // otherwise idle connections can keep the process alive.
    await pool.end();
  }
}

main().catch((error) => {
  // Only reachable when cleanup itself fails.
  console.error('Fatal error:', error);
  process.exitCode = 1;
});
|
||||
641
scripts/import-bohosluzby.ts
Normal file
641
scripts/import-bohosluzby.ts
Normal file
@@ -0,0 +1,641 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from bohosluzby.cz (Czech Republic)
|
||||
*
|
||||
* bohosluzby.cz is the official Czech bishops' conference mass schedule finder.
|
||||
* It exposes a JSON API with two main endpoints:
|
||||
* - POST /index.php/apiWeb/allData — returns all churches (clustered by zoom level)
|
||||
* - GET /index.php/apiWeb/detailById?id={id} — returns mass schedule details
|
||||
*
|
||||
* The API requires no authentication. We fetch all churches at zoom=7 (covers
|
||||
* all of Czech Republic in one request with clustered results), then fetch
|
||||
* individual detail pages for mass schedules.
|
||||
*
|
||||
* Import strategy:
|
||||
* 1. Fetch all churches via allData endpoint (zoom=7, centered on Czech Republic)
|
||||
* 2. Flatten clustered results to get individual church records
|
||||
* 3. For each church, fetch detail to get mass schedules
|
||||
* 4. Match against existing Czech churches via church-matcher
|
||||
* 5. Upsert churches and mass schedules
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-bohosluzby.ts --all --dry-run
|
||||
* npx tsx scripts/import-bohosluzby.ts --all
|
||||
* npx tsx scripts/import-bohosluzby.ts --id 10009 --dry-run # Single church
|
||||
* npx tsx scripts/import-bohosluzby.ts --all --resume-from 500
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
// Load environment files in priority order. dotenv's default is to leave
// already-defined variables untouched, so `.env.local` is read before `.env`.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Connection string: DATABASE_URL when set (and non-empty), otherwise the local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Log the target with the password portion of the URL masked out.
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);

// Connection strings containing "neon" use TLS without certificate verification;
// every other target leaves `ssl` undefined.
// NOTE(review): `rejectUnauthorized: false` disables certificate validation — confirm this is intended.
const sslOptions = dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined;
const pool = new Pool({ connectionString: dbUrl, ssl: sslOptions });

// Prisma runs on top of the pg pool through the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Origin of the bohosluzby API; endpoint paths are appended to it. */
const BASE_URL = 'https://bohosluzby.cirkev.cz';

/** User-Agent header sent with API requests. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Pause before each request after the first one, in milliseconds (0.5s). */
const REQUEST_DELAY_MS = 500;

/** Pause before retrying a failed request, in milliseconds. */
const RETRY_DELAY_MS = 5000;

/** Maximum number of attempts per request. */
const MAX_RETRIES = 3;

// Map centre and zoom sent to the allData endpoint (centred on the Czech Republic).
const CZ_CENTER_LAT = 49.8;
const CZ_CENTER_LNG = 15.5;
const CZ_ZOOM = 7; // At this zoom the endpoint returns all churches, clustered into ~7 groups
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One church record as flattened from the allData response. */
interface BohosluzbyChurch {
  /** bohosluzby identifier; stored as `bohosluzbyId` and used to fetch details. */
  id: string;
  name: string;
  /** Street address, or null when the API gave none. */
  street: string | null;
  city: string | null;
  /** Zip code ("psc" in the API). */
  psc: string | null;
  latitude: number;
  longitude: number;
  /** Institution type such as KOSTEL or KAPLE. */
  type: string;
}
|
||||
|
||||
/** One weekly mass slot parsed from a church detail response. */
interface BohosluzbySchedule {
  /** Day of week with 0=Sunday, 1=Monday, ... 6=Saturday. */
  dayOfWeek: number;
  /** Start time as HH:MM. */
  time: string;
  /** Language name as reported by the API. */
  language: string;
  /** Service type, e.g. "mše sv." or "růženec". */
  type: string;
  /** Free-text note, or null when absent. */
  note: string | null;
}
|
||||
|
||||
/** Counters collected during one bohosluzby import run. */
interface ImportStats {
  /** Churches obtained from the API (before any resume offset is applied). */
  churchesFetched: number;
  /** Detail lookups performed (none in dry-run mode). */
  detailsFetched: number;
  /** Churches that matched an existing record. */
  churchesMatched: number;
  /** Churches inserted as new records (or that would be, in dry-run mode). */
  churchesCreated: number;
  /** Churches skipped, e.g. missing coordinates or a unique-constraint conflict. */
  churchesSkipped: number;
  /** Mass schedule rows written. */
  schedulesCreated: number;
  /** Churches or schedule writes that failed with an error. */
  errors: number;
}
|
||||
|
||||
/** Parsed command-line options of the bohosluzby importer. */
interface CLIArgs {
  /** `--all`: import every church. */
  all: boolean;
  /** `--dry-run`: report only, no database writes. */
  dryRun: boolean;
  /** `--resume-from <n>`: number of churches to skip at the start. */
  resumeFrom?: number;
  /** `--id <id>`: import only this bohosluzby church. */
  churchId?: string;
  /** `--job-id <uuid>`: background job row to report to. */
  jobId?: string;
}
|
||||
|
||||
// ─── HTTP Client ─────────────────────────────────────────────────────────────
|
||||
|
||||
/** Number of requests issued so far through fetchWithRetry (retries are not counted separately). */
let requestCount = 0;

/**
 * Resolve after the given number of milliseconds.
 *
 * @param ms - Time to wait.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Fetch a URL and parse the response body as JSON, with throttling and retries.
 *
 * - Waits REQUEST_DELAY_MS before every call except the very first one.
 * - Retries up to MAX_RETRIES attempts on HTTP 503/429 and on thrown errors
 *   (network failures as well as invalid JSON), pausing RETRY_DELAY_MS between attempts.
 * - Any other non-OK status is logged and not retried.
 *
 * Caller-supplied headers may be a plain object, a `Headers` instance or an array
 * of pairs; the default User-Agent is added only when the caller did not set one.
 *
 * @param url - Absolute URL to request.
 * @param options - Standard fetch options.
 * @returns The parsed JSON body, or `null` when the request ultimately failed.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- response shape is endpoint-specific
async function fetchWithRetry(url: string, options: RequestInit = {}): Promise<any | null> {
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  // Normalise through Headers: spreading `options.headers` only works for plain
  // objects and silently loses everything for a Headers instance or a tuple array.
  const headers = new Headers(options.headers);
  if (!headers.has('User-Agent')) {
    headers.set('User-Agent', USER_AGENT);
  }

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    const canRetry = attempt < MAX_RETRIES;

    try {
      const response = await fetch(url, { ...options, headers });

      if (response.status === 503 || response.status === 429) {
        if (canRetry) {
          console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
          await delay(RETRY_DELAY_MS);
          continue;
        }
        console.error(` HTTP ${response.status} after ${MAX_RETRIES} attempts`);
        return null;
      }

      if (!response.ok) {
        console.error(` HTTP ${response.status} from ${url}`);
        return null;
      }

      return await response.json();
    } catch (error) {
      if (canRetry) {
        console.log(` Network error — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
        await delay(RETRY_DELAY_MS);
        continue;
      }
      console.error(` API error after ${MAX_RETRIES} attempts: ${error instanceof Error ? error.message : error}`);
      return null;
    }
  }

  return null;
}
|
||||
|
||||
// ─── API Methods ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Convert one raw allData record (a cluster representative or a cluster member)
 * into a BohosluzbyChurch. Coordinates are parsed with parseFloat and may be NaN
 * when the API value is missing or malformed.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- raw API record, shape not declared in this file
function toBohosluzbyChurch(raw: any): BohosluzbyChurch {
  return {
    id: raw.id,
    name: raw.name,
    street: raw.street || null,
    city: raw.city || null,
    psc: raw.psc || null,
    latitude: parseFloat(raw.latitude),
    longitude: parseFloat(raw.longitude),
    type: raw.type || 'KOSTEL',
  };
}

/**
 * Fetch all churches from the allData endpoint.
 *
 * The endpoint returns clustered results at zoom=7. Each cluster contributes its
 * representative record plus every record in its `indices` array. A record whose
 * id has already been seen is dropped, so a church listed in both places (or in
 * two clusters) is only processed once.
 *
 * @returns The flattened church list, or an empty array when the request failed.
 */
async function fetchAllChurches(): Promise<BohosluzbyChurch[]> {
  console.log('Fetching all churches from allData endpoint...');

  const params = new URLSearchParams({
    institutionTypes: "'KOSTEL'",
    latitude: String(CZ_CENTER_LAT),
    longitude: String(CZ_CENTER_LNG),
    zoom: String(CZ_ZOOM),
  });

  const data = await fetchWithRetry(`${BASE_URL}/index.php/apiWeb/allData`, {
    method: 'POST',
    body: params,
    headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
  });

  if (!data) {
    console.error('Failed to fetch allData');
    return [];
  }

  const churches: BohosluzbyChurch[] = [];
  const seenIds = new Set<string>();

  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- raw API record
  const addChurch = (raw: any): void => {
    // Records without an id cannot be compared, so they are always kept.
    if (raw.id != null) {
      const key = String(raw.id);
      if (seenIds.has(key)) return;
      seenIds.add(key);
    }
    churches.push(toBohosluzbyChurch(raw));
  };

  // The response is keyed by the quoted institution type that was requested.
  const kostelData = data["'KOSTEL'"] || [];

  for (const cluster of kostelData) {
    // The cluster representative itself
    addChurch(cluster);

    // Churches from the indices array (sub-items in the cluster)
    if (Array.isArray(cluster.indices)) {
      for (const sub of cluster.indices) {
        addChurch(sub);
      }
    }
  }

  console.log(`Fetched ${churches.length} churches from allData`);
  return churches;
}
|
||||
|
||||
/**
 * Fetch mass schedule details for a single church.
 *
 * Only entries from `church.regular` are used. An entry is kept when its
 * `chst_name` is missing or contains "mše" (Holy Mass) and it has a time.
 * `periodic_days` is read digit by digit (1=Mon ... 6=Sat, 7=Sun) and expanded
 * into one schedule per day; identical day/time pairs are emitted once.
 *
 * @param churchId - bohosluzby church identifier.
 * @returns Parsed regular mass schedules; empty when the request failed or the
 *   response carries no church.
 */
async function fetchChurchDetail(churchId: string): Promise<BohosluzbySchedule[]> {
  const data = await fetchWithRetry(`${BASE_URL}/index.php/apiWeb/detailById?id=${encodeURIComponent(churchId)}`);
  if (!data || !data.church) return [];

  // Guard against a non-array payload, which would throw in the loop below.
  const regular = Array.isArray(data.church.regular) ? data.church.regular : [];

  const schedules: BohosluzbySchedule[] = [];
  const seenSlots = new Set<string>();

  for (const entry of regular) {
    if (!entry) continue;

    // Only import "mše sv." (Holy Mass) entries
    if (entry.chst_name && !entry.chst_name.includes('mše')) continue;

    const time = entry.cas; // Already in HH:MM format
    if (!time) continue;

    // periodic_days: "12345" = Mon-Fri, "6" = Sat, "7" = Sun.
    // String() keeps the digit loop working if the API ever sends a number.
    const periodicDays = String(entry.periodic_days ?? '');

    for (const dayChar of periodicDays) {
      const bohosluzbyDay = Number.parseInt(dayChar, 10);

      // Accept 0-7 only (0 keeps mapping to Sunday as before); 8 and 9 would
      // produce a dayOfWeek outside 0-6, and non-digits parse to NaN.
      if (!(bohosluzbyDay >= 0 && bohosluzbyDay <= 7)) continue;

      // bohosluzby: 1=Mon, 2=Tue, ..., 6=Sat, 7=Sun
      // Our format: 0=Sun, 1=Mon, ..., 6=Sat
      const dayOfWeek = bohosluzbyDay === 7 ? 0 : bohosluzbyDay;

      // Deduplicate within this church
      const key = `${dayOfWeek}:${time}`;
      if (seenSlots.has(key)) continue;
      seenSlots.add(key);

      schedules.push({
        dayOfWeek,
        time,
        language: entry.chsl_name || 'česky',
        type: entry.chst_name || 'mše sv.',
        note: entry.note || null,
      });
    }
  }

  return schedules;
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every church with country 'CZ', restricted to the fields the
 * church matcher works with, for use as the deduplication set.
 *
 * @returns The existing Czech churches.
 */
async function loadExistingCzechChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Czech churches for deduplication...');

  const czechChurches = await prisma.church.findMany({
    where: { country: 'CZ' },
    select: {
      // Identity and location
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External source identifiers
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  const loadedCount = czechChurches.length;
  console.log(`Loaded ${loadedCount} existing Czech churches`);
  return czechChurches;
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/** True when a thrown value looks like a unique-constraint violation (matched on the error message). */
function isUniqueConstraintError(error: unknown): boolean {
  return error instanceof Error && error.message.includes('Unique constraint');
}

/** Map parsed schedules to mass-schedule rows for the given church; the language is always stored as 'Czech'. */
function toMassScheduleRows(churchId: ExistingChurch['id'], schedules: BohosluzbySchedule[]) {
  return schedules.map((s) => ({
    churchId,
    dayOfWeek: s.dayOfWeek,
    time: s.time,
    language: 'Czech',
  }));
}

/**
 * Import one bohosluzby church: fetch its schedules, match it against the
 * existing churches, then either link the match or insert a new record.
 *
 * - Churches without usable coordinates (0/0 or non-finite) are skipped.
 * - Dry run: no detail request and no writes; only the matched/created counters change.
 * - Match: the existing row gets `bohosluzbyId` (and the street as address when it
 *   had none) and, when schedules were found, its schedules are replaced in one transaction.
 * - No match: a new church is created, added to `existingChurches` for in-run
 *   deduplication, and its schedules are inserted.
 *
 * @param church - Church record from the API.
 * @param existingChurches - Dedup set; appended to when a church is created.
 * @param dryRun - When true nothing is written.
 * @param stats - Counters, updated in place.
 * @throws Rethrows errors from linking a matched church other than unique-constraint
 *   conflicts; schedule and creation errors are counted in `stats.errors` instead.
 */
async function processChurch(
  church: BohosluzbyChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // parseFloat on a missing or malformed coordinate yields NaN, which the plain 0/0
  // check lets through; such a record can be neither matched by distance nor stored.
  const hasUsableCoordinates =
    Number.isFinite(church.latitude) &&
    Number.isFinite(church.longitude) &&
    !(church.latitude === 0 && church.longitude === 0);
  if (!hasUsableCoordinates) {
    stats.churchesSkipped++;
    return;
  }

  // Fetch mass schedules
  let schedules: BohosluzbySchedule[] = [];
  if (!dryRun) {
    schedules = await fetchChurchDetail(church.id);
    stats.detailsFetched++;
  }

  const candidate = {
    name: church.name,
    lat: church.latitude,
    lng: church.longitude,
    bohosluzbyId: church.id,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
    } else {
      stats.churchesCreated++;
    }
    return;
  }

  if (duplicate) {
    const updateData: Record<string, unknown> = { bohosluzbyId: church.id };
    if (!duplicate.address && church.street) updateData.address = church.street;

    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      if (isUniqueConstraintError(error)) {
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }

    // Counted only once the link succeeded, so a church is never both matched and skipped.
    stats.churchesMatched++;

    if (schedules.length > 0) {
      try {
        await prisma.$transaction(async (tx) => {
          // Replace rather than merge: delete the church's schedules, then insert the fetched ones.
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({
            data: toMassScheduleRows(duplicate.id, schedules),
          });
          await tx.church.update({
            where: { id: duplicate.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(` Error saving schedules for ${church.id}: ${error instanceof Error ? error.message : error}`);
      }
    }
    return;
  }

  try {
    const newChurch = await prisma.church.create({
      data: {
        name: church.name,
        latitude: church.latitude,
        longitude: church.longitude,
        address: church.street,
        zip: church.psc,
        city: church.city,
        country: 'CZ',
        bohosluzbyId: church.id,
        source: 'bohosluzby',
        websiteLanguage: 'cs',
      },
    });
    stats.churchesCreated++;

    // Make the new church visible to later candidates in this run.
    existingChurches.push({
      id: newChurch.id,
      name: church.name,
      latitude: church.latitude,
      longitude: church.longitude,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: null,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: church.id,
      miserendId: null,
      kerknetId: null,
      gottesdienstzeitenId: null,
      discovermassId: null,
      source: 'bohosluzby',
      website: null,
      phone: null,
      address: church.street,
    });

    if (schedules.length > 0) {
      await prisma.massSchedule.createMany({
        data: toMassScheduleRows(newChurch.id, schedules),
      });
      await prisma.church.update({
        where: { id: newChurch.id },
        data: { lastScrapedAt: new Date() },
      });
      stats.schedulesCreated += schedules.length;
    }
  } catch (error) {
    if (isUniqueConstraintError(error)) {
      stats.churchesSkipped++;
      return;
    }
    stats.errors++;
    console.error(` Error creating ${church.id}: ${error instanceof Error ? error.message : error}`);
  }
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line flags of the bohosluzby importer.
 *
 * Prints the usage text and exits with code 0 for `--help` / `-h`. Exits with
 * code 1 when neither `--all` nor `--id` is given, or when `--resume-from` is
 * missing its value or the value is not a non-negative integer (previously such
 * a value became NaN and the flag was silently ignored).
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const rawValue = args[++i];
        if (rawValue === undefined || !/^\d+$/.test(rawValue)) {
          console.error('Error: --resume-from requires a non-negative integer');
          process.exit(1);
        }
        result.resumeFrom = Number.parseInt(rawValue, 10);
        break;
      }
      case '--id':
        result.churchId = args[++i];
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-bohosluzby.ts [options]

Options:
--all Import all churches
--id <id> Import a single church by bohosluzby ID
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N churches
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-bohosluzby.ts --id 10009 --dry-run
npx tsx scripts/import-bohosluzby.ts --all --dry-run
npx tsx scripts/import-bohosluzby.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.churchId) {
    console.error('Error: specify --all or --id <bohosluzby_id>');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs",
 * dropping leading units that are zero. Fractions of a second are floored.
 *
 * @param ms - Duration in milliseconds.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const wholeHours = Math.floor(totalMinutes / 60);

  if (wholeHours > 0) {
    return `${wholeHours}h ${totalMinutes % 60}m ${totalSeconds % 60}s`;
  }
  return totalMinutes > 0 ? `${totalMinutes}m ${totalSeconds % 60}s` : `${totalSeconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
async function main() {
|
||||
const args = parseArgs();
|
||||
const startTime = Date.now();
|
||||
|
||||
console.log('\n' + '='.repeat(70));
|
||||
console.log('BOHOSLUZBY.CZ (CZECH REPUBLIC) IMPORTER');
|
||||
console.log('='.repeat(70));
|
||||
console.log(`Mode: ${args.churchId ? `Church ID ${args.churchId}` : 'All churches'}`);
|
||||
console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
|
||||
if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`);
|
||||
console.log(`Time: ${new Date().toISOString()}`);
|
||||
console.log('='.repeat(70) + '\n');
|
||||
|
||||
if (args.jobId) {
|
||||
try {
|
||||
await prisma.backgroundJob.update({
|
||||
where: { id: args.jobId },
|
||||
data: { status: 'running', startedAt: new Date() },
|
||||
});
|
||||
} catch { /* Job might not exist */ }
|
||||
}
|
||||
|
||||
const stats: ImportStats = {
|
||||
churchesFetched: 0,
|
||||
detailsFetched: 0,
|
||||
churchesMatched: 0,
|
||||
churchesCreated: 0,
|
||||
churchesSkipped: 0,
|
||||
schedulesCreated: 0,
|
||||
errors: 0,
|
||||
};
|
||||
|
||||
const existingChurches = await loadExistingCzechChurches();
|
||||
|
||||
let churches: BohosluzbyChurch[];
|
||||
|
||||
if (args.churchId) {
|
||||
// Single church mode — create a minimal record and fetch detail
|
||||
churches = [{
|
||||
id: args.churchId,
|
||||
name: `Church ${args.churchId}`,
|
||||
street: null,
|
||||
city: null,
|
||||
psc: null,
|
||||
latitude: 0,
|
||||
longitude: 0,
|
||||
type: 'KOSTEL',
|
||||
}];
|
||||
// Fetch detail to get actual data
|
||||
const detail = await fetchWithRetry(`${BASE_URL}/index.php/apiWeb/detailById?id=${args.churchId}`);
|
||||
if (detail?.church?.institution?.[0]) {
|
||||
const inst = detail.church.institution[0];
|
||||
churches[0].name = inst.name || churches[0].name;
|
||||
churches[0].street = inst.street || null;
|
||||
churches[0].city = inst.city || null;
|
||||
churches[0].latitude = parseFloat(inst.latitude) || 0;
|
||||
churches[0].longitude = parseFloat(inst.longitude) || 0;
|
||||
}
|
||||
} else {
|
||||
churches = await fetchAllChurches();
|
||||
}
|
||||
|
||||
stats.churchesFetched = churches.length;
|
||||
|
||||
if (args.resumeFrom) {
|
||||
churches = churches.slice(args.resumeFrom);
|
||||
console.log(`Resuming from index ${args.resumeFrom} (${churches.length} remaining)\n`);
|
||||
}
|
||||
|
||||
console.log(`Processing ${churches.length} churches\n`);
|
||||
|
||||
for (let i = 0; i < churches.length; i++) {
|
||||
const church = churches[i];
|
||||
if (i % 100 === 0) {
|
||||
const elapsed = formatDuration(Date.now() - startTime);
|
||||
console.log(`[${i + 1}/${churches.length}] Processing ${church.name} (${church.id}) [${elapsed} elapsed]`);
|
||||
}
|
||||
|
||||
try {
|
||||
await processChurch(church, existingChurches, args.dryRun, stats);
|
||||
} catch (error) {
|
||||
stats.errors++;
|
||||
console.error(` ERROR processing church ${church.id}: ${error instanceof Error ? error.message : error}`);
|
||||
}
|
||||
}
|
||||
|
||||
const totalTime = Date.now() - startTime;
|
||||
console.log('\n' + '='.repeat(70));
|
||||
console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
|
||||
console.log('='.repeat(70));
|
||||
console.log(`Churches fetched: ${stats.churchesFetched}`);
|
||||
console.log(`Details fetched: ${stats.detailsFetched}`);
|
||||
console.log(` Matched (existing): ${stats.churchesMatched}`);
|
||||
console.log(` Created (new): ${stats.churchesCreated}`);
|
||||
console.log(` Skipped: ${stats.churchesSkipped}`);
|
||||
console.log(`Schedules created: ${stats.schedulesCreated}`);
|
||||
console.log(`Errors: ${stats.errors}`);
|
||||
console.log(`Total time: ${formatDuration(totalTime)}`);
|
||||
console.log(`HTTP requests: ${requestCount}`);
|
||||
console.log('='.repeat(70) + '\n');
|
||||
|
||||
if (args.jobId) {
|
||||
try {
|
||||
await prisma.backgroundJob.update({
|
||||
where: { id: args.jobId },
|
||||
data: {
|
||||
status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
|
||||
completedAt: new Date(),
|
||||
processed: stats.churchesFetched,
|
||||
succeeded: stats.churchesCreated + stats.churchesMatched,
|
||||
failed: stats.errors,
|
||||
itemsFound: stats.schedulesCreated,
|
||||
},
|
||||
});
|
||||
} catch { /* Ignore */ }
|
||||
}
|
||||
}
|
||||
|
||||
// Entry point: run the import, report any fatal error, and always release the
// database resources afterwards.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    // Record the failure via exitCode instead of calling process.exit():
    // exit() terminates the process synchronously, which would skip the
    // cleanup registered in finally() below.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });
|
||||
834
scripts/import-gcatholic.ts
Normal file
834
scripts/import-gcatholic.ts
Normal file
@@ -0,0 +1,834 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches from GCatholic.org
|
||||
*
|
||||
* GCatholic is a comprehensive Catholic directory organized by diocese.
|
||||
* Each church page includes a Google Plus Code (→ lat/lng), address, phone, website, etc.
|
||||
* This script discovers churches via country → diocese → church page navigation.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-gcatholic.ts --country CN
|
||||
* npx tsx scripts/import-gcatholic.ts --country CN --dry-run
|
||||
* npx tsx scripts/import-gcatholic.ts --diocese peki0
|
||||
* npx tsx scripts/import-gcatholic.ts --all
|
||||
* npx tsx scripts/import-gcatholic.ts --all --limit 100
|
||||
* npx tsx scripts/import-gcatholic.ts --all --resume-from PL
|
||||
*/
|
||||
|
||||
// Load .env for database connection (before importing anything that uses process.env)
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Connection string from the environment, falling back to the local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Log the target with the password masked. The pattern is anchored to the URL
// authority (scheme://user:password@) and masks everything between the first
// colon after the user name and the last '@' before the path, so a password
// that itself contains ':' is not partially printed.
console.log(
  `Connecting to database: ${dbUrl.replace(/^([a-z][a-z0-9+.-]*:\/\/[^:/?#@]*):[^/?#]*@/i, '$1:***@')}`,
);

const pool = new Pool({
  connectionString: dbUrl,
  // URLs containing "neon" get SSL without certificate verification; all others use the pg default.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// Plus Code (Open Location Code) decoder.
// Loaded through require(), hence the lint suppression on the next line.
// eslint-disable-next-line @typescript-eslint/no-require-imports
const { OpenLocationCode } = require('open-location-code');

// Shared decoder instance; parseChurchPage calls olc.decode() on it.
const olc = new OpenLocationCode();
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Root of the GCatholic site; every page URL in this script is built from it. */
const BASE_URL = 'https://www.gcatholic.org';

/** Default pause between HTTP requests in milliseconds (overridable with --delay). */
const DEFAULT_DELAY_MS = 1500;

/** User-Agent header sent with every request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One church as extracted from a GCatholic church page by parseChurchPage. */
interface GCatholicChurch {
  /** Trailing numeric segment of the page URL; empty string when the URL has none. */
  gcatholicId: string;
  /** Text of the page's <h1>. */
  name: string;
  /** Text of the page's <h2>, when present. */
  localName?: string;

  /** Latitude of the centre of the decoded Plus Code area. */
  lat: number;
  /** Longitude of the centre of the decoded Plus Code area. */
  lng: number;
  /** Plus Code as found on the page. */
  plusCode: string;

  /** "Address" labelled field with HTML tags stripped. */
  address?: string;
  /** First component of the <h3> location line. */
  city?: string;
  /** Region component of the <h3> location line, when present. */
  state?: string;
  /** Two-letter country code: the caller-supplied one, else the first country link on the page. */
  country?: string;

  /** "Telephone" labelled field with HTML tags stripped. */
  phone?: string;
  /** "Website" link target; kept only when it starts with "http". */
  website?: string;
  /** "Jurisdiction" labelled field with HTML tags stripped. */
  diocese?: string;
  /** Text of the element following the "Type" label. */
  churchType?: string;

  /** URL of the page this record was parsed from. */
  sourceUrl: string;
}
|
||||
|
||||
/** Running counters for one import scope (a diocese, a country, or the whole run). */
interface ImportStats {
  /** Church pages that were fetched and parsed successfully. */
  churchesFound: number;
  /** Churches with no duplicate found (counted in dry runs too, where nothing is written). */
  newChurchesCreated: number;
  /** Churches matched to an existing record (in a real run: only when fields were actually updated). */
  existingChurchesMerged: number;
  /** Pages that could not be parsed, plus duplicates that needed no update. */
  skipped: number;
  /** Failed fetches and exceptions raised while processing a page. */
  errors: number;
  /** One human-readable message per counted error. */
  errorDetails: string[];
}
|
||||
|
||||
/** Command-line options as produced by parseArgs. */
interface CLIArgs {
  /** --all: process every country discovered on GCatholic. */
  all: boolean;
  /** --country <ISO2>: process a single country (upper-cased). */
  country?: string;
  /** --diocese <code>: process a single diocese. */
  diocese?: string;
  /** --resume-from <ISO2>: with --all, start at this country (upper-cased). */
  resumeFrom?: string;

  /** --dry-run: report what would happen without writing to the database. */
  dryRun: boolean;
  /** --limit <n>: maximum number of churches to import across the run. */
  limit?: number;
  /** --delay <ms>: pause between HTTP requests; defaults to DEFAULT_DELAY_MS. */
  delay: number;
}
|
||||
|
||||
// ─── HTTP Fetching ───────────────────────────────────────────────────────────
|
||||
|
||||
/** Number of HTTP requests issued so far; used for throttling and the final report. */
let requestCount = 0;

/**
 * Download a page as text.
 *
 * Every request after the first one waits `delayMs` before it is sent.
 *
 * @param url     Absolute URL to fetch.
 * @param delayMs Pause in milliseconds applied before the request.
 * @returns The response body, or null on a non-OK status or a fetch error.
 *          404 responses are expected for some pages and are not logged.
 */
async function fetchPage(url: string, delayMs: number): Promise<string | null> {
  const mustThrottle = requestCount > 0;
  if (mustThrottle) {
    await new Promise((done) => setTimeout(done, delayMs));
  }
  requestCount += 1;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'en-US,en;q=0.9',
      },
    });

    if (response.ok) {
      return await response.text();
    }

    if (response.status !== 404) {
      console.error(` HTTP ${response.status} for ${url}`);
    }
    return null;
  } catch (error) {
    const reason = error instanceof Error ? error.message : error;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
|
||||
|
||||
// ─── HTML Parsing ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * List the country codes linked from the GCatholic dioceses index page.
 *
 * Links of the form `country/{ISO2}` (optionally prefixed with `../` or
 * `/dioceses/`, optionally suffixed with `.htm`) are collected.
 *
 * @param delayMs Pause passed through to fetchPage.
 * @returns Unique two-letter codes in sorted order; empty when the page cannot be fetched.
 */
async function discoverCountries(delayMs: number): Promise<string[]> {
  console.log('Discovering countries from GCatholic...');

  const indexHtml = await fetchPage(`${BASE_URL}/dioceses/`, delayMs);
  if (!indexHtml) {
    console.error('Failed to fetch countries page');
    return [];
  }

  // Matches e.g. href="country/CN" or href="/dioceses/country/CN"
  const countryLink = /href="(?:\.\.\/|\/dioceses\/)?country\/([A-Z]{2})(?:\.htm)?"/g;
  const unique = new Set<string>();
  for (const hit of indexHtml.matchAll(countryLink)) {
    unique.add(hit[1]);
  }

  const codes = Array.from(unique).sort();
  console.log(`Found ${codes.length} countries`);
  return codes;
}
|
||||
|
||||
/**
 * List the dioceses linked from a country page.
 *
 * @param countryCode Two-letter code used to build the country page URL.
 * @param delayMs     Pause passed through to fetchPage.
 * @returns One `{ code, name }` entry per distinct diocese code, in page order
 *          (the first link text seen for a code wins); empty when the page cannot be fetched.
 */
async function discoverDioceses(countryCode: string, delayMs: number): Promise<{ code: string; name: string }[]> {
  const countryHtml = await fetchPage(`${BASE_URL}/dioceses/country/${countryCode}.htm`, delayMs);
  if (!countryHtml) {
    return [];
  }

  // Matches e.g. href="../diocese/peki0"; the link text that follows is the diocese name.
  const dioceseLink = /href="(?:\.\.\/)?(?:\.\.\/dioceses\/)?diocese\/([a-z0-9]+)(?:\.htm)?"[^>]*>([^<]+)</g;

  // A Map keeps insertion order, so the result preserves the order of first appearance.
  const nameByCode = new Map<string, string>();
  for (const hit of countryHtml.matchAll(dioceseLink)) {
    const code = hit[1];
    if (!nameByCode.has(code)) {
      nameByCode.set(code, hit[2].trim());
    }
  }

  return Array.from(nameByCode, ([code, name]) => ({ code, name }));
}
|
||||
|
||||
/**
 * Collect the church page URLs linked from a diocese page.
 *
 * Links of the form `churches/{region}/{id}` (with any number of leading `../`
 * and an optional `.htm`) are normalised to absolute `.htm` URLs.
 *
 * @param dioceseCode Code used to build the diocese page URL.
 * @param delayMs     Pause passed through to fetchPage.
 * @returns Distinct absolute URLs in page order; empty when the page cannot be fetched.
 */
async function discoverChurchLinks(dioceseCode: string, delayMs: number): Promise<string[]> {
  const dioceseHtml = await fetchPage(`${BASE_URL}/dioceses/diocese/${dioceseCode}.htm`, delayMs);
  if (!dioceseHtml) {
    return [];
  }

  // Matches e.g. href="../../churches/china/46492" or href="../../churches/asia/1893"
  const churchLink = /href="(?:\.\.\/)*churches\/([a-z0-9-]+\/\d+)(?:\.htm)?"/g;
  const absoluteUrls = Array.from(
    dioceseHtml.matchAll(churchLink),
    (hit) => `${BASE_URL}/churches/${hit[1]}.htm`,
  );

  // De-duplicate while keeping the order of first appearance.
  return Array.from(new Set(absoluteUrls));
}
|
||||
|
||||
/**
 * Parse a single church page and extract structured data.
 *
 * @param html        Raw HTML of the church page.
 * @param url         URL the page was fetched from; its trailing number becomes gcatholicId.
 * @param countryCode Country to record; when omitted, the first country link on the page is used.
 * @returns The parsed church, or null when the page has no <h1> name, no Plus
 *          Code can be found, or the Plus Code cannot be decoded.
 */
function parseChurchPage(html: string, url: string, countryCode?: string): GCatholicChurch | null {
  // Church name from <h1>; a page without one is not treated as a church page.
  const h1Match = html.match(/<h1>([^<]+)<\/h1>/);
  if (!h1Match) return null;
  const name = h1Match[1].trim();

  // Local-language name from <h2>, when present.
  const h2Match = html.match(/<h2>([^<]+)<\/h2>/);
  const localName = h2Match ? h2Match[1].trim() : undefined;

  // The Plus Code is looked up in three places, most specific first.
  let plusCode: string | null = null;

  // 1. The Google Maps link:
  //    onclick="window.open('https://www.google.com/maps/search/?api=1&query=PLUSCODE','_blank')"
  const plusCodeOnclickMatch = html.match(/onclick="window\.open\('https:\/\/www\.google\.com\/maps\/search\/\?api=1&(?:amp;)?query=([^']+)'/);
  if (plusCodeOnclickMatch) {
    try {
      plusCode = decodeURIComponent(plusCodeOnclickMatch[1]);
    } catch {
      // decodeURIComponent throws URIError on a malformed percent sequence.
      // Treat that as "not found here" so the fallbacks below still get a chance.
      plusCode = null;
    }
  }

  // 2. The link text of the element titled "Plus Code", e.g. >8PFRW9FF+C2<
  if (!plusCode) {
    const plusCodeTextMatch = html.match(/title="Plus Code">([A-Z0-9+]+)<\/a>/);
    if (plusCodeTextMatch) {
      plusCode = plusCodeTextMatch[1];
    }
  }

  // 3. A Plus-Code-shaped token following the "Location:" label.
  if (!plusCode) {
    const locationMatch = html.match(/Location:.*?>([2-9A-HJ-NP-Z][2-9A-HJ-NP-Z0-9]{3,7}\+[2-9A-HJ-NP-Z0-9]{2,3})</);
    if (locationMatch) {
      plusCode = locationMatch[1];
    }
  }

  if (!plusCode) {
    return null; // Can't geolocate without Plus Code
  }

  // Decode the Plus Code to the centre point of its area.
  let lat: number, lng: number;
  try {
    const decoded = olc.decode(plusCode);
    lat = decoded.latitudeCenter;
    lng = decoded.longitudeCenter;
  } catch {
    return null; // Invalid Plus Code
  }

  // GCatholic ID: trailing number of the URL (empty string when there is none).
  const idMatch = url.match(/\/(\d+)(?:\.htm)?$/);
  const gcatholicId = idMatch ? idMatch[1] : '';

  // Reads a labelled field of the form <span class="label">Label: </span>VALUE,
  // where VALUE runs up to the next </p> or <br and has its HTML tags stripped.
  const getField = (label: string): string | undefined => {
    const escaped = label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const regex = new RegExp(`<span class="label">${escaped}:?\\s*</span>\\s*(.+?)(?:</p>|<br)`, 's');
    const match = html.match(regex);
    if (!match) return undefined;
    return match[1].replace(/<[^>]+>/g, '').trim() || undefined;
  };

  const address = getField('Address');
  const phone = getField('Telephone');

  // Website: the href of the <a> right after the label; non-http targets are dropped.
  let website: string | undefined;
  const websiteMatch = html.match(/<span class="label">Website:?\s*<\/span>\s*<a\s+href="([^"]+)"/);
  if (websiteMatch) {
    website = websiteMatch[1];
    if (website && !website.startsWith('http')) {
      website = undefined;
    }
  }

  const diocese = getField('Jurisdiction');

  // Church type: text of the first element with a class like "chX" after the "Type" label.
  let churchType: string | undefined;
  const typeMatch = html.match(/<span class="label">Type:?\s*<\/span>.*?class="ch[a-z]">([^<]+)/);
  if (typeMatch) {
    churchType = typeMatch[1].trim();
  }

  // Country: the caller's value wins; otherwise take the first country link on the page.
  let country = countryCode;
  if (!country) {
    const countryMatch = html.match(/href="[^"]*country\/([A-Z]{2})(?:\.htm)?"/);
    if (countryMatch) {
      country = countryMatch[1];
    }
  }

  // City and region from the <h3> line: "City, Region, Country".
  // The city text is kept as-is because it may include local-language
  // characters (e.g. "Beijing 北京").
  let city: string | undefined;
  let state: string | undefined;
  const h3Match = html.match(/<h3>([^<]+?)(?:,\s*<span class="zregion">([^<]+)<\/span>)?(?:,\s*<a[^>]*class="zcountry"[^>]*>[^<]+<\/a>)?\s*<\/h3>/);
  if (h3Match) {
    city = h3Match[1].trim();
    state = h3Match[2]?.trim();
  }

  return {
    gcatholicId,
    name,
    localName,
    lat,
    lng,
    address,
    city,
    state,
    country,
    phone,
    website,
    diocese,
    churchType,
    plusCode,
    sourceUrl: url,
  };
}
|
||||
|
||||
// ─── CLI Argument Parsing ────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line flags in process.argv into a CLIArgs object.
 *
 * Unknown flags are ignored. A non-numeric or missing --limit is dropped (no
 * limit), and a non-numeric, missing, or negative --delay falls back to
 * DEFAULT_DELAY_MS; both cases print a warning. Falling back matters for
 * --delay because a NaN timeout makes setTimeout fire almost immediately,
 * which would silently disable rate limiting.
 *
 * @returns The parsed options, with defaults for everything not supplied.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    delay: DEFAULT_DELAY_MS,
  };

  // Parses a flag value as a base-10 integer; warns and yields undefined when it is not a number.
  const readInteger = (flag: string, raw: string | undefined): number | undefined => {
    const value = raw === undefined ? Number.NaN : parseInt(raw, 10);
    if (Number.isNaN(value)) {
      console.warn(`Ignoring invalid value for ${flag}: ${raw ?? '(missing)'}`);
      return undefined;
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--country':
        result.country = args[++i]?.toUpperCase();
        break;
      case '--all':
        result.all = true;
        break;
      case '--diocese':
        result.diocese = args[++i];
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--limit':
        result.limit = readInteger('--limit', args[++i]);
        break;
      case '--delay': {
        const delay = readInteger('--delay', args[++i]);
        if (delay !== undefined && delay < 0) {
          console.warn(`Ignoring negative value for --delay: ${delay}`);
        }
        result.delay = delay !== undefined && delay >= 0 ? delay : DEFAULT_DELAY_MS;
        break;
      }
      case '--resume-from':
        result.resumeFrom = args[++i]?.toUpperCase();
        break;
    }
  }

  return result;
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every church row with the fields needed for duplicate detection and
 * for the "fill in missing fields" merge.
 *
 * @returns All churches, restricted to the identifier, location, and contact columns selected below.
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing churches for deduplication...');

  const rows = await prisma.church.findMany({
    select: {
      // Identity and location
      id: true,
      name: true,
      latitude: true,
      longitude: true,

      // Per-source identifiers
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,

      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${rows.length} existing churches`);
  return rows;
}
|
||||
|
||||
/**
 * Import one parsed church: merge it into a matching existing record, or create a new one.
 *
 * In dry-run mode nothing is written; the decision is only logged and counted.
 * When merging, only fields that are empty on the existing record are filled in.
 *
 * @param church           Church parsed from GCatholic.
 * @param existingChurches In-memory list used for duplicate detection; new and
 *                         merged data is mirrored into it so later churches in
 *                         the same run see the current state.
 * @param dryRun           When true, log and count only.
 * @param stats            Counters updated in place.
 */
async function importChurch(
  church: GCatholicChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // findDuplicateChurch expects an OSM-shaped candidate, so a synthetic osmId
  // is derived from the GCatholic id.
  const candidate = {
    osmId: `gcatholic-${church.gcatholicId}`,
    name: church.name,
    lat: church.lat,
    lng: church.lng,
    address: church.address,
    city: church.city,
    state: church.state,
    country: church.country,
    phone: church.phone,
    website: church.website,
    diocese: church.diocese,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      console.log(` [MERGE] ${church.name} → existing: ${duplicate.name} (${duplicate.id})`);
      stats.existingChurchesMerged++;
    } else {
      console.log(` [NEW] ${church.name} (${church.lat.toFixed(4)}, ${church.lng.toFixed(4)})`);
      stats.newChurchesCreated++;
    }
    return;
  }

  if (duplicate) {
    // Merge: fill in missing fields only.
    const phone = !duplicate.phone ? church.phone : undefined;
    const website = !duplicate.website ? church.website : undefined;
    const address = !duplicate.address ? church.address : undefined;

    const updateData: Record<string, unknown> = {};
    if (phone) updateData.phone = phone;
    if (website) {
      updateData.website = website;
      updateData.hasWebsite = true;
    }
    if (address) updateData.address = address;

    // The in-memory record does not carry the diocese, so read it from the database.
    const dbRecord = await prisma.church.findUnique({
      where: { id: duplicate.id },
      select: { diocese: true },
    });
    if (dbRecord && !dbRecord.diocese && church.diocese) {
      updateData.diocese = church.diocese;
    }

    if (Object.keys(updateData).length > 0) {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });

      // Mirror the written values into the in-memory record. Without this, a
      // later GCatholic page matching the same church would still see the
      // fields as empty and overwrite the values that were just stored.
      // NOTE(review): assumes findDuplicateChurch returns the element of
      // existingChurches itself rather than a copy — confirm in church-matcher.
      if (phone) duplicate.phone = phone;
      if (website) duplicate.website = website;
      if (address) duplicate.address = address;

      stats.existingChurchesMerged++;
    } else {
      stats.skipped++;
    }
  } else {
    // No match: create a new church record.
    const newChurch = await prisma.church.create({
      data: {
        name: church.name,
        latitude: church.lat,
        longitude: church.lng,
        address: church.address,
        city: church.city,
        state: church.state,
        country: church.country,
        phone: church.phone,
        website: church.website,
        hasWebsite: !!church.website,
        source: 'gcatholic',
        diocese: church.diocese,
      },
    });
    stats.newChurchesCreated++;

    // Add to the in-memory list so later churches in this run can match against it.
    existingChurches.push({
      id: newChurch.id,
      name: church.name,
      latitude: church.lat,
      longitude: church.lng,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: null,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId: null,
      kerknetId: null,
      gottesdienstzeitenId: null,
      discovermassId: null,
      source: 'gcatholic',
      website: church.website || null,
      phone: church.phone || null,
      address: church.address || null,
    });
  }
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Import every church linked from one diocese page.
 *
 * Each church page is fetched, parsed, and handed to importChurch. Failures on
 * individual pages are counted and recorded but do not stop the diocese.
 * A one-line per-diocese summary is printed at the end, including when the
 * global limit cuts the diocese short.
 *
 * @param dioceseCode      GCatholic diocese code used to build the page URL.
 * @param dioceseName      Display name used in log output.
 * @param countryCode      Country recorded on each church; may be undefined.
 * @param existingChurches In-memory list used for duplicate detection.
 * @param args             CLI options (delay and dry-run are used here).
 * @param stats            Counters updated in place.
 * @param globalLimit      Shared budget of churches still allowed to be imported; decremented per imported church.
 */
async function importDiocese(
  dioceseCode: string,
  dioceseName: string,
  countryCode: string | undefined,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
  globalLimit?: { remaining: number },
): Promise<void> {
  const churchUrls = await discoverChurchLinks(dioceseCode, args.delay);

  if (churchUrls.length === 0) {
    return;
  }

  console.log(` Diocese ${dioceseName} (${dioceseCode}): ${churchUrls.length} church pages found`);

  let dioceseNew = 0;
  let dioceseMerged = 0;
  let dioceseSkipped = 0;
  let dioceseErrors = 0;

  for (const url of churchUrls) {
    // Stop at the global limit, but fall through to the summary below so the
    // work already done for this diocese is still reported.
    if (globalLimit && globalLimit.remaining <= 0) {
      console.log(` Limit reached, stopping`);
      break;
    }

    try {
      const html = await fetchPage(url, args.delay);
      if (!html) {
        stats.errors++;
        dioceseErrors++;
        stats.errorDetails.push(`Failed to fetch: ${url}`);
        continue;
      }

      const church = parseChurchPage(html, url, countryCode);
      if (!church) {
        stats.skipped++;
        dioceseSkipped++;
        continue;
      }

      stats.churchesFound++;

      // importChurch reports its outcome only through the shared counters, so
      // the per-diocese tallies are derived from the before/after difference.
      const prevNew = stats.newChurchesCreated;
      const prevMerged = stats.existingChurchesMerged;
      const prevSkipped = stats.skipped;

      await importChurch(church, existingChurches, args.dryRun, stats);

      if (stats.newChurchesCreated > prevNew) dioceseNew++;
      if (stats.existingChurchesMerged > prevMerged) dioceseMerged++;
      // Duplicates that needed no update are counted as skipped by importChurch.
      if (stats.skipped > prevSkipped) dioceseSkipped++;

      if (globalLimit) globalLimit.remaining--;
    } catch (error) {
      stats.errors++;
      dioceseErrors++;
      const msg = error instanceof Error ? error.message : String(error);
      stats.errorDetails.push(`${url}: ${msg}`);
      console.error(` Error processing ${url}: ${msg}`);
    }
  }

  const parts = [`${dioceseNew} new`, `${dioceseMerged} merged`];
  if (dioceseSkipped > 0) parts.push(`${dioceseSkipped} skipped`);
  if (dioceseErrors > 0) parts.push(`${dioceseErrors} errors`);
  console.log(` → ${parts.join(', ')}`);
}
|
||||
|
||||
/**
 * Import all dioceses of one country.
 *
 * @param countryCode      Two-letter country code.
 * @param existingChurches In-memory list used for duplicate detection.
 * @param args             CLI options.
 * @param globalLimit      Shared budget of churches still allowed to be imported; processing stops once it is used up.
 * @returns Counters for this country only.
 */
async function importCountry(
  countryCode: string,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  globalLimit?: { remaining: number },
): Promise<ImportStats> {
  const stats: ImportStats = {
    churchesFound: 0,
    newChurchesCreated: 0,
    existingChurchesMerged: 0,
    skipped: 0,
    errors: 0,
    errorDetails: [],
  };

  const banner = '='.repeat(60);
  console.log(`\n${banner}`);
  console.log(`Importing from GCatholic: ${countryCode}`);
  console.log(banner);

  const dioceses = await discoverDioceses(countryCode, args.delay);
  if (dioceses.length === 0) {
    console.log(`No dioceses found for ${countryCode}`);
    return stats;
  }
  console.log(`Found ${dioceses.length} dioceses in ${countryCode}`);

  for (const { code, name } of dioceses) {
    const limitUsedUp = globalLimit ? globalLimit.remaining <= 0 : false;
    if (limitUsedUp) break;

    await importDiocese(code, name, countryCode, existingChurches, args, stats, globalLimit);
  }

  return stats;
}
|
||||
|
||||
// ─── Summary Printing ────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Print the counters of one import scope as a framed block.
 *
 * @param label  Scope name shown in the heading (e.g. a country code).
 * @param stats  Counters to print; the error line appears only when errors > 0.
 * @param dryRun Adds a "(DRY RUN)" marker to the heading.
 */
function printSummary(label: string, stats: ImportStats, dryRun: boolean): void {
  const rule = '─'.repeat(60);

  const lines = [
    `\n${rule}`,
    `Summary: ${label} ${dryRun ? '(DRY RUN)' : ''}`,
    rule,
    `Churches found on GCatholic: ${stats.churchesFound}`,
    `New churches created: ${stats.newChurchesCreated}`,
    `Merged with existing: ${stats.existingChurchesMerged}`,
    `Skipped (no data/dup): ${stats.skipped}`,
  ];
  if (stats.errors > 0) {
    lines.push(`Errors: ${stats.errors}`);
  }
  lines.push(rule);

  lines.forEach((line) => console.log(line));
}
|
||||
|
||||
// ─── Job Tracking ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Mark the background job named by `--job-id <id>` as running.
 *
 * @param args Raw command-line arguments (without the node/script prefix).
 * @returns The job id, or null when no --job-id flag was given.
 * @throws Error when --job-id is present but not followed by a value, so that
 *         the database is never queried with an undefined id or with the next flag.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) {
    return null;
  }

  const jobId = args[jobIdIndex + 1];
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Record the final state of a background job.
 *
 * Best-effort: a failure to update the job is logged and not rethrown.
 *
 * @param jobId Job to update; nothing happens when it is null (or empty).
 * @param error Failure message; when non-empty the job is marked 'failed', otherwise 'completed'.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: error ? 'failed' : 'completed',
          error: error || null,
          completedAt: new Date(),
        },
      });
    } catch (updateError) {
      console.error(`Failed to update job ${jobId}:`, updateError);
    }
  }
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: parse the CLI options, run the selected import mode
 * (--diocese, --country, or --all), print summaries, and record the outcome
 * on the background job when --job-id was given.
 *
 * Failures set process.exitCode instead of calling process.exit(), because
 * exit() ends the process synchronously and would skip both the job update
 * and the database cleanup in the finally block.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  let jobId: string | null = null;

  const emptyStats = (): ImportStats => ({
    churchesFound: 0,
    newChurchesCreated: 0,
    existingChurchesMerged: 0,
    skipped: 0,
    errors: 0,
    errorDetails: [],
  });

  try {
    jobId = await createOrResumeJob(process.argv.slice(2));

    if (!args.country && !args.all && !args.diocese) {
      console.error('Error: Must specify --country <ISO2>, --diocese <code>, or --all');
      console.error('Usage:');
      console.error(' npx tsx scripts/import-gcatholic.ts --country CN');
      console.error(' npx tsx scripts/import-gcatholic.ts --country CN --dry-run');
      console.error(' npx tsx scripts/import-gcatholic.ts --diocese peki0');
      console.error(' npx tsx scripts/import-gcatholic.ts --all');
      console.error(' npx tsx scripts/import-gcatholic.ts --all --limit 500');
      console.error(' npx tsx scripts/import-gcatholic.ts --all --resume-from PL');
      // Do not leave a job that was just marked 'running' in that state.
      await completeJob(jobId, 'Must specify --country <ISO2>, --diocese <code>, or --all');
      process.exitCode = 1;
      return;
    }

    if (args.dryRun) {
      console.log('\n*** DRY RUN MODE — no changes will be written to database ***\n');
    }

    console.log(`Delay between requests: ${args.delay}ms`);
    if (args.limit) console.log(`Limit: ${args.limit} churches`);

    const existingChurches = await loadExistingChurches();
    // Shared, mutable budget so the limit applies across dioceses and countries.
    const globalLimit = args.limit ? { remaining: args.limit } : undefined;

    if (args.diocese) {
      // Single diocese mode
      const stats = emptyStats();
      await importDiocese(args.diocese, args.diocese, args.country, existingChurches, args, stats, globalLimit);
      printSummary(`Diocese ${args.diocese}`, stats, args.dryRun);
    } else if (args.country) {
      // Single country mode
      const stats = await importCountry(args.country, existingChurches, args, globalLimit);
      printSummary(args.country, stats, args.dryRun);
    } else if (args.all) {
      // All countries mode — discover from GCatholic
      let countries = await discoverCountries(args.delay);

      if (countries.length === 0) {
        throw new Error('Failed to discover countries');
      }

      // Handle --resume-from
      if (args.resumeFrom) {
        const idx = countries.indexOf(args.resumeFrom);
        if (idx === -1) {
          throw new Error(`Country ${args.resumeFrom} not found in GCatholic listing`);
        }
        console.log(`Resuming from ${args.resumeFrom} (skipping ${idx} countries)\n`);
        countries = countries.slice(idx);
      }

      console.log(`Will process ${countries.length} countries\n`);

      const totalStats = emptyStats();
      let countriesProcessed = 0;

      for (const countryCode of countries) {
        if (globalLimit && globalLimit.remaining <= 0) {
          console.log(`\nGlobal limit reached, stopping.`);
          break;
        }

        const stats = await importCountry(countryCode, existingChurches, args, globalLimit);
        printSummary(countryCode, stats, args.dryRun);

        // Aggregate
        totalStats.churchesFound += stats.churchesFound;
        totalStats.newChurchesCreated += stats.newChurchesCreated;
        totalStats.existingChurchesMerged += stats.existingChurchesMerged;
        totalStats.skipped += stats.skipped;
        totalStats.errors += stats.errors;
        totalStats.errorDetails.push(...stats.errorDetails);
        countriesProcessed++;

        // Small extra delay between countries
        await new Promise((resolve) => setTimeout(resolve, 2000));
      }

      // Overall summary
      console.log(`\n${'='.repeat(60)}`);
      console.log(`OVERALL SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
      console.log(`${'='.repeat(60)}`);
      console.log(`Countries processed: ${countriesProcessed}`);
      console.log(`Total churches found: ${totalStats.churchesFound}`);
      console.log(`Total new churches created: ${totalStats.newChurchesCreated}`);
      console.log(`Total merged with existing: ${totalStats.existingChurchesMerged}`);
      console.log(`Total skipped: ${totalStats.skipped}`);
      if (totalStats.errors > 0) {
        console.log(`Total errors: ${totalStats.errors}`);
      }
      console.log(`Total HTTP requests made: ${requestCount}`);
      console.log(`${'='.repeat(60)}\n`);

      // Print at most the first 50 error messages.
      const errorCount = totalStats.errorDetails.length;
      if (errorCount > 0) {
        console.log(errorCount <= 50 ? '\nError details:' : `\nFirst 50 errors (of ${errorCount}):`);
        totalStats.errorDetails.slice(0, 50).forEach((e) => console.log(` - ${e}`));
      }
    }

    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

// main() handles its own errors; this catch only covers a failure during cleanup.
main().catch((error) => {
  console.error('Fatal error:', error);
  process.exitCode = 1;
});
|
||||
686
scripts/import-gottesdienstzeiten.ts
Normal file
686
scripts/import-gottesdienstzeiten.ts
Normal file
@@ -0,0 +1,686 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from gottesdienstzeiten.de (Germany)
|
||||
*
|
||||
* gottesdienstzeiten.de is a German worship service directory with ~6,878 Catholic
|
||||
* churches. It runs on WordPress with a fully open REST API at /wp-json/wp/v2/posts.
|
||||
*
|
||||
* Data includes: church name, address, coordinates (Google Maps embed), diocese,
|
||||
* mass schedules (day/type/time table), website, email, phone.
|
||||
*
|
||||
* Import strategy:
|
||||
* 1. Fetch all Catholic diocese category IDs from WP API
|
||||
* 2. Paginate through posts per category (100 per page)
|
||||
* 3. Parse HTML content for coordinates, address, schedule table, info table
|
||||
* 4. Match against existing German churches via church-matcher
|
||||
* 5. Upsert churches and mass schedules
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-gottesdienstzeiten.ts --all --dry-run
|
||||
* npx tsx scripts/import-gottesdienstzeiten.ts --all
|
||||
* npx tsx scripts/import-gottesdienstzeiten.ts --diocese 129 --dry-run # Köln only
|
||||
* npx tsx scripts/import-gottesdienstzeiten.ts --all --resume-from 5
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
|
||||
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
|
||||
const pool = new Pool({
|
||||
connectionString: dbUrl,
|
||||
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
|
||||
});
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Root of the gottesdienstzeiten.de WordPress REST API (v2). */
const API_BASE = 'https://gottesdienstzeiten.de/wp-json/wp/v2';

/** User-Agent header sent with every request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Pause (ms) inserted before every fetchJson call after the first one. */
const REQUEST_DELAY_MS = 1_000;

/** Pause (ms) before retrying a failed request. */
const RETRY_DELAY_MS = 5_000;

/** Maximum number of attempts per URL. */
const MAX_RETRIES = 3;

/** Page size requested from the posts endpoint. */
const POSTS_PER_PAGE = 100;

/** Parent category ID whose child categories are fetched as the Catholic dioceses. */
const CATHOLIC_PARENT_CATEGORY = 4;
|
||||
|
||||
/**
 * German weekday names → dayOfWeek (0 = Sunday … 6 = Saturday).
 * Each weekday is registered twice: with the adverbial "-s" suffix
 * ("sonntags") and without it ("sonntag"), since some entries use the
 * bare form.
 */
const GERMAN_DAYS: Record<string, number> = Object.fromEntries(
  ['sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag'].flatMap(
    (weekday, index): Array<[string, number]> => [
      [`${weekday}s`, index],
      [weekday, index],
    ],
  ),
);
|
||||
|
||||
/**
 * Lower-cased service types that count as a Mass. Schedule rows whose type
 * is not in this set are only kept if they match one of the substring
 * checks in the schedule parser.
 */
const MASS_TYPES = new Set<string>([
  'messfeier',
  'vorabendmesse',
  'heilige messe',
  'hl. messe',
  'hochamt',
  'festmesse',
  'familienmesse',
  'kindergottesdienst',
  'jugendmesse',
  'abendmesse',
  'frühmesse',
  'werktagsmesse',
  'sonntagsmesse',
  'messe',
  'eucharistiefeier',
]);
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One recurring weekly Mass slot. */
interface ParsedSchedule {
  /** 0 = Sunday … 6 = Saturday. */
  dayOfWeek: number;
  /** Zero-padded "HH:MM". */
  time: string;
}

/** A WordPress category representing one diocese. */
interface DioceseCat {
  /** WordPress category ID, used as the `categories` filter when listing posts. */
  id: number;
  name: string;
  /** Post count reported for the category; used to compute the number of pages. */
  count: number;
}

/** Church data extracted from a single WordPress post. */
interface ParsedChurch {
  /** WordPress post ID. */
  wpId: number;
  slug: string;
  /** Post title with HTML removed and any leading "(City)" prefix stripped. */
  name: string;
  latitude: number;
  longitude: number;
  /** Street part of the address, or the full address text when no ZIP/city could be split off. */
  address: string | null;
  zip: string | null;
  city: string | null;
  diocese: string | null;
  website: string | null;
  email: string | null;
  phone: string | null;
  schedules: ParsedSchedule[];
}

/** Running counters reported in the import summary. */
interface ImportStats {
  diocesesProcessed: number;
  postsFound: number;
  churchesParsed: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  schedulesCreated: number;
  errors: number;
}

/** Parsed command-line options. */
interface CLIArgs {
  /** `--all`: import every diocese category. */
  all: boolean;
  /** `--dry-run`: no database writes. */
  dryRun: boolean;
  /** `--resume-from <n>`: skip the first N diocese categories. */
  resumeFrom?: number;
  /** `--diocese <catId>`: import a single diocese category. */
  diocese?: number;
  /** `--job-id <uuid>`: background job tracking ID. */
  jobId?: string;
}
|
||||
|
||||
// ─── HTTP Helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
// Number of fetchJson calls made so far (retries are not counted separately).
// fetchJson uses it to skip the throttle delay on the very first request, and
// main prints it in the import summary.
let requestCount = 0;
|
||||
|
||||
/**
 * Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires.
 *
 * @param ms - Timeout passed to `setTimeout`.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Fetches a URL and parses the response body as JSON, with throttling and retries.
 *
 * - Every call after the first waits REQUEST_DELAY_MS before sending.
 * - HTTP 429/503 and thrown errors (network failure, invalid JSON) are retried
 *   up to MAX_RETRIES attempts, waiting RETRY_DELAY_MS between attempts.
 * - Any other non-OK status is not retried.
 *
 * Every path that gives up now logs the reason. Callers treat `null` as
 * "no data" (the post pagination loop simply stops), so without a log line a
 * failed request would be indistinguishable from the end of the data.
 *
 * @param url - Absolute URL to fetch.
 * @returns The parsed JSON payload, or `null` when the request ultimately failed.
 */
async function fetchJson(url: string): Promise<any | null> {
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    const hasRetriesLeft = attempt < MAX_RETRIES;

    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': USER_AGENT },
      });

      if (response.status === 429 || response.status === 503) {
        if (hasRetriesLeft) {
          console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s`);
          await delay(RETRY_DELAY_MS);
          continue;
        }
        console.error(` HTTP ${response.status} after ${MAX_RETRIES} attempts, giving up: ${url}`);
        return null;
      }

      if (!response.ok) {
        // NOTE(review): the WordPress REST API presumably answers 400 for a page
        // number past the last page, so this can also fire at the normal end of
        // pagination when the caller over-estimates the page count — confirm.
        console.warn(` HTTP ${response.status} (not retried): ${url}`);
        return null;
      }

      return await response.json();
    } catch (error) {
      const reason = error instanceof Error ? error.message : error;
      if (hasRetriesLeft) {
        console.log(` Fetch error (${reason}) — retrying in ${RETRY_DELAY_MS / 1000}s`);
        await delay(RETRY_DELAY_MS);
        continue;
      }
      console.error(` Fetch error: ${reason}`);
      return null;
    }
  }

  return null;
}
|
||||
|
||||
// ─── Parsing ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Named HTML entities decoded by stripHtml; anything else is left untouched. */
const STRIP_HTML_NAMED_ENTITIES: Record<string, string> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: '\u00a0',
  auml: 'ä',
  ouml: 'ö',
  uuml: 'ü',
  Auml: 'Ä',
  Ouml: 'Ö',
  Uuml: 'Ü',
  szlig: 'ß',
};

/**
 * Converts an HTML fragment to plain text: removes tags, decodes HTML
 * entities and trims surrounding whitespace.
 *
 * Tags are removed before entities are decoded, so an escaped `&lt;b&gt;`
 * ends up as the literal text `<b>` rather than being stripped. Numeric
 * entities (`&#038;`, `&#x26;`) and the small set of named entities above are
 * decoded in a single pass (no double decoding); unknown entities are kept
 * verbatim. Non-breaking spaces are normalised to ordinary spaces.
 *
 * @param html - HTML fragment, e.g. a rendered title or a table cell.
 * @returns Plain text.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<[^>]+>/g, '')
    .replace(/&(#\d+|#[xX][0-9a-fA-F]+|[a-zA-Z]+);/g, (entity: string, body: string) => {
      if (body.startsWith('#')) {
        const isHex = body[1] === 'x' || body[1] === 'X';
        const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
        // String.fromCodePoint throws for values outside the Unicode range.
        return codePoint >= 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
      }
      return Object.prototype.hasOwnProperty.call(STRIP_HTML_NAMED_ENTITIES, body)
        ? STRIP_HTML_NAMED_ENTITIES[body]
        : entity;
    })
    .replace(/\u00a0/g, ' ')
    .trim();
}
|
||||
|
||||
/** Splits the first `<strong>` text ("Street, ZIP City") into street, ZIP and city. */
function parseGdzAddress(content: string): { address: string | null; zip: string | null; city: string | null } {
  const addrMatch = content.match(/<strong>([^<]+)<\/strong>/);
  if (!addrMatch) return { address: null, zip: null, city: null };

  const fullAddr = addrMatch[1].trim();
  const zipCityMatch = fullAddr.match(/,\s*(\d{5})\s+(.+)$/);
  if (!zipCityMatch) {
    // No ", 12345 City" tail: keep the whole text as the address.
    return { address: fullAddr, zip: null, city: null };
  }

  return {
    address: fullAddr.replace(/,\s*\d{5}\s+.+$/, '').trim(),
    zip: zipCityMatch[1],
    city: zipCityMatch[2],
  };
}

/** Extracts website, e-mail and phone from the info table markup. */
function parseGdzContactInfo(infoTable: string): { website: string | null; email: string | null; phone: string | null } {
  let website: string | null = null;
  let email: string | null = null;
  let phone: string | null = null;

  const websiteMatch = infoTable.match(/Website[\s\S]*?<a[^>]*href="([^"]+)"/);
  if (websiteMatch) website = websiteMatch[1];

  const emailMatch = infoTable.match(/E-Mail[\s\S]*?<td[^>]*>([\s\S]*?)<\/td>/);
  if (emailMatch) {
    const emailText = stripHtml(emailMatch[1]);
    if (emailText.includes('@')) email = emailText;
  }

  const phoneMatch = infoTable.match(/Telefon[\s\S]*?<td[^>]*>([\s\S]*?)<\/td>/);
  if (phoneMatch) {
    const phoneText = stripHtml(phoneMatch[1]);
    if (phoneText.length > 3) phone = phoneText;
  }

  return { website, email, phone };
}

/** Converts "09.00 Uhr" / "18:30 Uhr" to zero-padded "HH:MM"; null if it is not a valid clock time. */
function normalizeGermanTime(timeStr: string): string | null {
  const cleaned = timeStr
    .replace(/\s*Uhr\s*/i, '')
    .replace('.', ':')
    .trim();
  const parts = cleaned.match(/^(\d{1,2}):(\d{2})$/);
  if (!parts) return null;
  // Reject values that have the right shape but are not a time of day (e.g. "25:99").
  if (Number(parts[1]) > 23 || Number(parts[2]) > 59) return null;
  return `${parts[1].padStart(2, '0')}:${parts[2]}`;
}

/** Reads the schedule table and returns the de-duplicated weekly Mass slots. */
function parseGdzScheduleTable(schedTable: string): ParsedSchedule[] {
  const schedules: ParsedSchedule[] = [];
  const rows = schedTable.match(/<tr[^>]*>([\s\S]*?)<\/tr>/g) || [];

  // A day header applies to its own row and to all following rows until the next header.
  let currentDay = -1;
  const seen = new Set<string>();

  for (const row of rows) {
    // Day header: <th> containing <em>day name</em>.
    const dayMatch = row.match(/<th[^>]*>[\s\S]*?<em>([^<]*)<\/em>/);
    if (dayMatch && dayMatch[1].trim()) {
      const dayName = dayMatch[1].trim().toLowerCase();
      // Own-property check so that inherited keys such as "constructor" never match.
      if (Object.prototype.hasOwnProperty.call(GERMAN_DAYS, dayName)) {
        currentDay = GERMAN_DAYS[dayName];
      }
    }

    // Service type and time: the first two <td><em>…</em></td> cells of the row.
    const cells = row.match(/<td[^>]*>[\s\S]*?<em>([^<]*)<\/em>[\s\S]*?<\/td>/g);
    if (!cells || cells.length < 2 || currentDay < 0) continue;

    const typeMatch = cells[0].match(/<em>([^<]*)<\/em>/);
    const timeMatch = cells[1].match(/<em>([^<]*)<\/em>/);
    if (!typeMatch || !timeMatch) continue;

    const massType = typeMatch[1].trim().toLowerCase();
    const isMass =
      MASS_TYPES.has(massType) ||
      massType.includes('messe') ||
      massType.includes('messfeier') ||
      massType.includes('eucharistie');
    if (!isMass) continue;

    const normalizedTime = normalizeGermanTime(timeMatch[1].trim());
    if (normalizedTime === null) continue;

    const key = `${currentDay}:${normalizedTime}`;
    if (!seen.has(key)) {
      seen.add(key);
      schedules.push({ dayOfWeek: currentDay, time: normalizedTime });
    }
  }

  return schedules;
}

/**
 * Converts one WordPress post into a ParsedChurch.
 *
 * The rendered post content is expected to contain a Google Maps link with
 * `maps?q=<lat>,<lng>`, the address in the first `<strong>` tag, the schedule
 * as the first `<table>` and contact details as the second `<table>`.
 *
 * @param post - Raw post object from the REST API (`id`, `slug`,
 *   `title.rendered` and `content.rendered` are read).
 * @param dioceseName - Diocese name to attach to the result.
 * @returns The parsed church, or `null` when the post has no usable
 *   coordinates (missing, unparsable, 0/0, or outside the valid
 *   latitude/longitude range).
 */
function parsePost(post: any, dioceseName: string | null): ParsedChurch | null {
  const content: string = post.content?.rendered || '';
  const wpId: number = post.id;
  const slug: string = post.slug;

  // Title format: "(City) Church Name" — drop the "(City)" prefix when present.
  let name = stripHtml(post.title?.rendered || '');
  const nameMatch = name.match(/^\([^)]+\)\s*(.+)$/);
  if (nameMatch) name = nameMatch[1];

  const coordMatch = content.match(/maps\?q=([-\d.]+),([-\d.]+)/);
  if (!coordMatch) return null;

  const latitude = parseFloat(coordMatch[1]);
  const longitude = parseFloat(coordMatch[2]);
  if (isNaN(latitude) || isNaN(longitude) || (latitude === 0 && longitude === 0)) return null;
  if (Math.abs(latitude) > 90 || Math.abs(longitude) > 180) return null;

  const { address, zip, city } = parseGdzAddress(content);

  const tables = content.match(/<table[^>]*>([\s\S]*?)<\/table>/g) || [];
  const { website, email, phone } =
    tables.length >= 2
      ? parseGdzContactInfo(tables[1])
      : { website: null, email: null, phone: null };
  const schedules = tables.length >= 1 ? parseGdzScheduleTable(tables[0]) : [];

  return {
    wpId, slug, name, latitude, longitude,
    address, zip, city, diocese: dioceseName,
    website, email, phone, schedules,
  };
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Loads every church whose country is 'DE', selecting the fields passed to
 * the duplicate matcher (coordinates, name, all per-source IDs) and the
 * contact fields used when merging into a matched record.
 *
 * @returns The rows returned by the query.
 */
async function loadExistingGermanChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing German churches for deduplication...');

  const germanChurches = await prisma.church.findMany({
    where: { country: 'DE' },
    select: {
      // Identity and location
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // Per-source identifiers
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${germanChurches.length} existing German churches`);
  return germanChurches;
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fetches the child categories of CATHOLIC_PARENT_CATEGORY (one request,
 * `per_page=100`) and returns them sorted by post count, largest first.
 *
 * A failed request and a payload that is not an array are both reported as
 * "Failed to fetch categories". Previously a non-array payload would throw
 * on `.map` and abort the run.
 *
 * @returns The diocese categories, or an empty array on failure.
 */
async function fetchDioceseCategories(): Promise<DioceseCat[]> {
  console.log('Fetching Catholic diocese categories...');

  const data: unknown = await fetchJson(
    `${API_BASE}/categories?per_page=100&parent=${CATHOLIC_PARENT_CATEGORY}`
  );
  if (!Array.isArray(data)) {
    console.error('Failed to fetch categories');
    return [];
  }

  const cats: DioceseCat[] = data.map((c: any) => ({
    id: c.id, name: c.name, count: c.count,
  }));
  const total = cats.reduce((sum, c) => sum + c.count, 0);
  console.log(`Found ${cats.length} diocese categories with ${total} total posts\n`);

  // `cats` is a fresh local array, so sorting in place is safe.
  return cats.sort((a, b) => b.count - a.count);
}
|
||||
|
||||
/** True when a thrown value is an Error whose message mentions a unique-constraint violation. */
function isUniqueConstraintError(error: unknown): boolean {
  return error instanceof Error && error.message.includes('Unique constraint');
}

/** Maps parsed schedules to mass-schedule rows (language 'German') for the given church. */
function buildGermanScheduleRows<TId>(churchId: TId, schedules: ParsedSchedule[]) {
  return schedules.map((s) => ({
    churchId,
    dayOfWeek: s.dayOfWeek,
    time: s.time,
    language: 'German',
  }));
}

/**
 * Imports all posts of one diocese category.
 *
 * For every page of posts, each post is parsed and matched against
 * `existingChurches`:
 * - dry run: only the counters are updated;
 * - match: the existing church gets the gottesdienstzeiten ID plus any
 *   missing address/website/phone, and — if schedules were parsed — its
 *   mass schedules are replaced inside a transaction;
 * - no match: a new church (country 'DE') is created, appended to
 *   `existingChurches` so later posts can match it, and its schedules are
 *   inserted.
 *
 * Unique-constraint violations are counted as skipped. Any other database
 * error is logged, counted in `stats.errors`, and processing continues with
 * the next post. Previously such an error on the matched-church update was
 * rethrown and abandoned the rest of the diocese, unlike the create path.
 *
 * Pagination stops at the first page that yields no posts (including a
 * failed fetch).
 *
 * @param cat - Diocese category; `count` determines how many pages are requested.
 * @param existingChurches - In-memory dedup list; mutated when churches are created.
 * @param dryRun - When true, no database writes are performed.
 * @param stats - Counters, mutated in place.
 */
async function processDiocese(
  cat: DioceseCat,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  const totalPages = Math.ceil(cat.count / POSTS_PER_PAGE);

  for (let page = 1; page <= totalPages; page++) {
    const url = `${API_BASE}/posts?categories=${cat.id}&per_page=${POSTS_PER_PAGE}&page=${page}`;
    const posts = await fetchJson(url);
    if (!posts || !Array.isArray(posts) || posts.length === 0) break;

    stats.postsFound += posts.length;

    for (const post of posts) {
      const church = parsePost(post, cat.name);
      if (!church) {
        stats.churchesSkipped++;
        continue;
      }

      stats.churchesParsed++;
      const gdzId = String(church.wpId);

      const candidate = {
        name: church.name,
        lat: church.latitude,
        lng: church.longitude,
        gottesdienstzeitenId: gdzId,
      };
      const duplicate = findDuplicateChurch(candidate, existingChurches);

      if (dryRun) {
        if (duplicate) {
          stats.churchesMatched++;
        } else {
          stats.churchesCreated++;
        }
        stats.schedulesCreated += church.schedules.length;
        continue;
      }

      if (duplicate) {
        stats.churchesMatched++;

        // Always link the source ID; only fill contact fields that are still empty.
        const updateData: Record<string, unknown> = { gottesdienstzeitenId: gdzId };
        if (!duplicate.address && church.address) updateData.address = church.address;
        if (!duplicate.website && church.website) {
          updateData.website = church.website;
          updateData.hasWebsite = true;
        }
        if (!duplicate.phone && church.phone) updateData.phone = church.phone;

        try {
          await prisma.church.update({
            where: { id: duplicate.id },
            data: updateData,
          });
        } catch (error) {
          if (isUniqueConstraintError(error)) {
            stats.churchesSkipped++;
          } else {
            stats.errors++;
            console.error(` Error updating ${church.slug}: ${error instanceof Error ? error.message : error}`);
          }
          continue;
        }

        if (church.schedules.length > 0) {
          try {
            // Replace the church's schedules atomically.
            await prisma.$transaction(async (tx) => {
              await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
              await tx.massSchedule.createMany({
                data: buildGermanScheduleRows(duplicate.id, church.schedules),
              });
              await tx.church.update({
                where: { id: duplicate.id },
                data: { lastScrapedAt: new Date() },
              });
            });
            stats.schedulesCreated += church.schedules.length;
          } catch (error) {
            stats.errors++;
            console.error(` Error saving schedules for ${church.slug}: ${error instanceof Error ? error.message : error}`);
          }
        }
      } else {
        try {
          const newChurch = await prisma.church.create({
            data: {
              name: church.name,
              latitude: church.latitude,
              longitude: church.longitude,
              address: church.address,
              zip: church.zip,
              city: church.city,
              country: 'DE',
              diocese: church.diocese || undefined,
              website: church.website,
              hasWebsite: !!church.website,
              email: church.email,
              phone: church.phone,
              gottesdienstzeitenId: gdzId,
              source: 'gottesdienstzeiten',
              websiteLanguage: 'de',
            },
          });
          stats.churchesCreated++;

          // Make the new church visible to the matcher for the remaining posts.
          existingChurches.push({
            id: newChurch.id,
            name: church.name,
            latitude: church.latitude,
            longitude: church.longitude,
            osmId: null,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: gdzId,
            discovermassId: null,
            source: 'gottesdienstzeiten',
            website: church.website,
            phone: church.phone,
            address: church.address,
          });

          if (church.schedules.length > 0) {
            await prisma.massSchedule.createMany({
              data: buildGermanScheduleRows(newChurch.id, church.schedules),
            });
            await prisma.church.update({
              where: { id: newChurch.id },
              data: { lastScrapedAt: new Date() },
            });
            stats.schedulesCreated += church.schedules.length;
          }
        } catch (error) {
          if (isUniqueConstraintError(error)) {
            stats.churchesSkipped++;
            continue;
          }
          stats.errors++;
          console.error(` Error creating ${church.slug}: ${error instanceof Error ? error.message : error}`);
        }
      }
    }
  }

  stats.diocesesProcessed++;
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parses `process.argv` into CLIArgs.
 *
 * Numeric options are validated: `--resume-from` must be a non-negative
 * integer and `--diocese` a positive integer. A missing or malformed value
 * prints an error and exits with status 1 instead of silently becoming NaN
 * (which previously made `--resume-from` a no-op). `--job-id` likewise
 * requires a value. `--help` / `-h` prints usage and exits with status 0.
 * Unrecognised arguments are ignored.
 *
 * Exits with status 1 when neither `--all` nor `--diocese` is given.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Returns the value following a flag, or exits when it is missing.
  const requireValue = (flag: string, value: string | undefined): string => {
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  // Parses a decimal integer >= min, or exits with an explanatory message.
  const requireInt = (flag: string, value: string | undefined, min: number): number => {
    const raw = requireValue(flag, value);
    const parsed = /^\d+$/.test(raw) ? parseInt(raw, 10) : NaN;
    if (Number.isNaN(parsed) || parsed < min) {
      console.error(`Error: ${flag} expects an integer >= ${min}, got "${raw}"`);
      process.exit(1);
    }
    return parsed;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = requireInt('--resume-from', args[++i], 0);
        break;
      case '--diocese':
        result.diocese = requireInt('--diocese', args[++i], 1);
        break;
      case '--job-id':
        result.jobId = requireValue('--job-id', args[++i]);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-gottesdienstzeiten.ts [options]

Options:
--all Import all Catholic diocese categories
--diocese <catId> Import a single diocese category (e.g., 129 for Köln)
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N diocese categories
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-gottesdienstzeiten.ts --diocese 129 --dry-run
npx tsx scripts/import-gottesdienstzeiten.ts --all --dry-run
npx tsx scripts/import-gottesdienstzeiten.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.diocese) {
    console.error('Error: specify --all or --diocese <categoryId>');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
/**
 * Renders a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", using the
 * largest unit that is non-zero. Fractions of a second are dropped.
 *
 * @param ms - Duration in milliseconds.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const wholeHours = Math.floor(totalMinutes / 60);

  const secondsPart = `${totalSeconds % 60}s`;
  if (wholeHours > 0) {
    return [`${wholeHours}h`, `${totalMinutes % 60}m`, secondsPart].join(' ');
  }
  if (totalMinutes > 0) {
    return [`${totalMinutes}m`, secondsPart].join(' ');
  }
  return `${totalSeconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: parses the CLI options, prints a banner, marks the
 * background job (if any) as running, loads the existing German churches,
 * processes each selected diocese category, prints the summary and finally
 * records the outcome on the background job.
 *
 * A failure while processing one diocese is logged and counted; the
 * remaining dioceses are still processed. Failures updating the background
 * job are ignored.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const rule = '='.repeat(70);

  console.log('\n' + rule);
  console.log('GOTTESDIENSTZEITEN.DE (GERMANY) IMPORTER');
  console.log(rule);
  const modeLabel = args.diocese ? `Diocese category ${args.diocese}` : 'All dioceses';
  console.log(`Mode: ${modeLabel}`);
  const dryRunLabel = args.dryRun ? 'YES (no DB writes)' : 'NO';
  console.log(`Dry run: ${dryRunLabel}`);
  if (args.resumeFrom) {
    console.log(`Resume from: diocese index ${args.resumeFrom}`);
  }
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(rule + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      /* Job might not exist */
    }
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    postsFound: 0,
    churchesParsed: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  const existingChurches = await loadExistingGermanChurches();

  // Single-diocese mode uses a placeholder count of 1000; pagination stops
  // at the first page that returns no posts.
  let categories: DioceseCat[] = args.diocese
    ? [{ id: args.diocese, name: `Category ${args.diocese}`, count: 1000 }]
    : await fetchDioceseCategories();

  if (args.resumeFrom && !args.diocese) {
    categories = categories.slice(args.resumeFrom);
    console.log(`Resuming from diocese index ${args.resumeFrom} (${categories[0]?.name})\n`);
  }

  console.log(`Processing ${categories.length} diocese categories\n`);

  for (const [index, cat] of categories.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${index + 1}/${categories.length}] ${cat.name} (${cat.count} posts) [${elapsed} elapsed]`);

    try {
      await processDiocese(cat, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing ${cat.name}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + rule);
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`WP posts found: ${stats.postsFound}`);
  console.log(`Churches parsed: ${stats.churchesParsed}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped (no coords): ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log(rule + '\n');

  if (args.jobId) {
    const finalStatus = stats.errors > 0 ? 'completed_with_errors' : 'completed';
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: finalStatus,
          completedAt: new Date(),
          processed: stats.churchesParsed,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch {
      /* Ignore */
    }
  }
}
|
||||
|
||||
// Run the importer. On a fatal error the failure is recorded via
// process.exitCode rather than process.exit(1): calling process.exit inside
// the catch handler terminates the process immediately, so the cleanup in
// .finally() would never run. With exitCode the process still ends with
// status 1 once the database connections below have been closed.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });
|
||||
1028
scripts/import-horariosmisas.ts
Normal file
1028
scripts/import-horariosmisas.ts
Normal file
File diff suppressed because it is too large
Load Diff
697
scripts/import-kerknet.ts
Normal file
697
scripts/import-kerknet.ts
Normal file
@@ -0,0 +1,697 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from kerknet.be (Flanders, Belgium)
|
||||
*
|
||||
* Kerknet is the portal of the Catholic Church in Flanders (Dutch-speaking Belgium).
|
||||
* It has ~1,200 churches with structured data: name, address, coordinates (GeoJSON),
|
||||
* and date-specific celebration entries.
|
||||
*
|
||||
* Import strategy:
|
||||
* 1. Enumerate unique church slugs by paginating the celebration listing
|
||||
* 2. Scrape each /kerk/{slug} page for structured data (name, address, coords, nodeId)
|
||||
* 3. Fetch celebrations via AJAX endpoint per church
|
||||
* 4. Deduce recurring weekly schedules from date-specific celebrations
|
||||
* 5. Match against existing Belgian churches via church-matcher
|
||||
* 6. Upsert churches and mass schedules
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-kerknet.ts --all --dry-run
|
||||
* npx tsx scripts/import-kerknet.ts --all
|
||||
* npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run
|
||||
* npx tsx scripts/import-kerknet.ts --all --resume-from 100
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
|
||||
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
|
||||
const pool = new Pool({
|
||||
connectionString: dbUrl,
|
||||
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
|
||||
});
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Site root used to build listing URLs. */
const BASE_URL = 'https://www.kerknet.be';

/** User-Agent header sent with every request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Delay (ms) between listing pages (respecting crawl-delay spirit). */
const ENUM_DELAY_MS = 2_000;

/** Delay (ms) between church detail page fetches. */
const DETAIL_DELAY_MS = 3_000;

/** Delay (ms) between celebration AJAX calls. */
const CELEBRATION_DELAY_MS = 2_000;

/** Maximum number of attempts per URL. */
const MAX_RETRIES = 3;

/** Pause (ms) before retrying a failed request. */
const RETRY_DELAY_MS = 10_000;

/** Total celebration listing pages. */
const MAX_ENUM_PAGES = 2804;

/** Check every Nth listing page (5 → ~560 pages to check). */
const ENUM_SAMPLE_INTERVAL = 5;

/** Stop if N consecutive sampled pages yield no new slugs. */
const STALE_THRESHOLD = 10;
|
||||
|
||||
/**
 * Dutch two-letter weekday abbreviations → dayOfWeek
 * (0 = Sunday, 1 = Monday, …, 6 = Saturday). The list is in dayOfWeek order,
 * so the array index is the value.
 */
const DUTCH_DAYS: Record<string, number> = Object.fromEntries(
  ['zo', 'ma', 'di', 'wo', 'do', 'vr', 'za'].map(
    (abbrev, index): [string, number] => [abbrev, index],
  ),
);
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One recurring weekly Mass slot. */
interface ParsedSchedule {
  dayOfWeek: number;
  time: string;
}

/** One date-specific celebration scraped from a church's celebration list. */
interface CelebrationEntry {
  /** Lower-cased day abbreviation as shown on the page. */
  dayAbbrev: string;
  /** DD/MM; empty string when the block has no date. */
  date: string;
  /** HH.MM or HH:MM */
  time: string;
  /** Lower-cased celebration type: Eucharistie, Gebedsdienst, etc. */
  type: string;
}

/** Structured data extracted from a /kerk/{slug} page. */
interface ChurchData {
  slug: string;
  /** Node ID found in the page; falls back to the slug when absent. */
  nodeId: string;
  name: string;
  address: string | null;
  zip: string | null;
  city: string | null;
  latitude: number;
  longitude: number;
  website: string | null;
}

/** Running counters for the import. */
interface ImportStats {
  slugsEnumerated: number;
  churchesFetched: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  schedulesCreated: number;
  errors: number;
}

/** Parsed command-line options (see the usage notes in the file header). */
interface CLIArgs {
  all: boolean;
  dryRun: boolean;
  resumeFrom?: number;
  slug?: string;
  jobId?: string;
}
|
||||
|
||||
// ─── HTTP Helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
// Number of fetchPage calls made so far (retries are not counted separately).
// fetchPage uses it to skip the throttle delay on the very first request.
let requestCount = 0;
|
||||
|
||||
/**
 * Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires.
 *
 * @param ms - Timeout passed to `setTimeout`.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Fetches a page as text, with throttling and retries.
 *
 * Every call after the first waits `delayMs` before sending. Any non-OK
 * response and any thrown error is retried up to MAX_RETRIES attempts with
 * RETRY_DELAY_MS between attempts; only HTTP 429/503 and network errors are
 * logged while retrying.
 *
 * @param url - Absolute URL to fetch.
 * @param delayMs - Throttle delay applied before the request.
 * @returns The response body, or `null` when all attempts failed.
 */
async function fetchPage(url: string, delayMs: number): Promise<string | null> {
  if (requestCount > 0) {
    await delay(delayMs);
  }
  requestCount++;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    const hasRetriesLeft = attempt < MAX_RETRIES;

    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': USER_AGENT },
      });

      const isThrottled = response.status === 429 || response.status === 503;
      if (isThrottled || !response.ok) {
        if (!hasRetriesLeft) return null;
        if (isThrottled) {
          console.log(` HTTP ${response.status} — retrying in ${RETRY_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})`);
        }
        await delay(RETRY_DELAY_MS);
        continue;
      }

      return await response.text();
    } catch (error) {
      if (!hasRetriesLeft) {
        console.error(` Fetch failed after ${MAX_RETRIES} attempts: ${error instanceof Error ? error.message : error}`);
        return null;
      }
      console.log(` Network error — retrying (attempt ${attempt}/${MAX_RETRIES})`);
      await delay(RETRY_DELAY_MS);
    }
  }

  return null;
}
|
||||
|
||||
// ─── Phase 1: Enumerate Church Slugs ─────────────────────────────────────────
|
||||
|
||||
/**
 * Phase 1: collects the unique church slugs linked from the celebration
 * listing.
 *
 * Every ENUM_SAMPLE_INTERVAL-th listing page below MAX_ENUM_PAGES is fetched
 * and scanned for `/kerk/{slug}` links. A page that fails to load or yields
 * no new slug extends the "stale" streak; the scan stops once the streak
 * reaches STALE_THRESHOLD.
 *
 * @returns The slugs found, sorted.
 */
async function enumerateChurchSlugs(): Promise<string[]> {
  console.log('\nPhase 1: Enumerating church slugs from celebration listings...');
  const found = new Set<string>();
  let staleStreak = 0;

  for (let page = 0; page < MAX_ENUM_PAGES; page += ENUM_SAMPLE_INTERVAL) {
    const html = await fetchPage(`${BASE_URL}/zoeken/vieringen/lijst?page=${page}`, ENUM_DELAY_MS);

    if (!html) {
      staleStreak += 1;
      if (staleStreak >= STALE_THRESHOLD) {
        console.log(` Stopping enumeration: ${STALE_THRESHOLD} consecutive empty pages`);
        break;
      }
      continue;
    }

    // Only /kerk/{slug} links (church building pages, NOT org pages like /kerk-jette/artikel/).
    const sizeBefore = found.size;
    for (const link of html.matchAll(/href="\/kerk\/([^"/]+)"/g)) {
      found.add(link[1]);
    }

    const added = found.size - sizeBefore;
    staleStreak = added === 0 ? staleStreak + 1 : 0;

    if (page % 50 === 0 || added > 0) {
      console.log(` Page ${page}: ${found.size} unique churches found (+${added})`);
    }

    if (staleStreak >= STALE_THRESHOLD) {
      console.log(` Stopping enumeration: ${STALE_THRESHOLD} consecutive sampled pages with no new churches`);
      break;
    }
  }

  console.log(` Enumeration complete: ${found.size} unique church slugs found\n`);
  return Array.from(found).sort();
}
|
||||
|
||||
// ─── Phase 2: Scrape Church Detail Page ──────────────────────────────────────
|
||||
|
||||
/**
 * Phase 2: extracts structured church data from a /kerk/{slug} page.
 *
 * Coordinates come from the first `"coordinates":[lng,lat]` pair in the page
 * (GeoJSON order: longitude first). The name is taken from the first JSON
 * `"description"` value, falling back to the `<title>` text before the first
 * `|`, and finally to the slug.
 *
 * Changes from the previous version:
 * - coordinates that do not parse to finite numbers are rejected, instead of
 *   returning a church with NaN latitude/longitude;
 * - the description is a JSON string value, so JSON escapes (e.g. `\u0026`,
 *   `\/`) are decoded. If decoding fails the raw text is kept.
 *
 * @param html - Full page HTML.
 * @param slug - Slug the page was fetched for.
 * @returns The parsed data, or `null` when the page has no usable coordinates.
 */
function parseChurchPage(html: string, slug: string): ChurchData | null {
  const coordMatch = html.match(/"coordinates":\[(-?[\d.]+),(-?[\d.]+)\]/);
  if (!coordMatch) return null; // No coordinates = unusable

  const longitude = parseFloat(coordMatch[1]);
  const latitude = parseFloat(coordMatch[2]);
  if (!Number.isFinite(latitude) || !Number.isFinite(longitude)) return null;
  if (latitude === 0 && longitude === 0) return null;

  const nidMatch = html.match(/"currentNid":"(\d+)"/);
  const nodeId = nidMatch ? nidMatch[1] : slug;

  let name = slug;
  const descMatch = html.match(/"description":"([^"]+)"/);
  if (descMatch) {
    name = descMatch[1];
    try {
      // The capture contains no double quotes, so wrapping it in quotes yields
      // a JSON string literal whenever its escapes are well-formed.
      const decoded: unknown = JSON.parse(`"${descMatch[1]}"`);
      if (typeof decoded === 'string' && decoded.length > 0) name = decoded;
    } catch {
      // Malformed escape (e.g. the capture was cut at an escaped quote): keep the raw text.
    }
  } else {
    const titleMatch = html.match(/<title>([^|<]+)/);
    if (titleMatch) name = titleMatch[1].trim();
  }

  const streetMatch = html.match(/class="thoroughfare">([^<]+)</);
  const zipMatch = html.match(/class="postal-code">([^<]+)</);
  const cityMatch = html.match(/class="locality">([^<]+)</);

  const address = streetMatch ? streetMatch[1].trim() : null;
  const zip = zipMatch ? zipMatch[1].trim() : null;
  const city = cityMatch ? cityMatch[1].trim() : null;

  // Website: first link after a class="website" element, else after a
  // field-name-kn-website marker.
  let website: string | null = null;
  const websiteMatch = html.match(/class="website"[^>]*>.*?href="([^"]+)"/s);
  if (websiteMatch) {
    website = websiteMatch[1];
  } else {
    const knWebsiteMatch = html.match(/field-name-kn-website.*?href="([^"]+)"/s);
    if (knWebsiteMatch) website = knWebsiteMatch[1];
  }

  return { slug, nodeId, name, address, zip, city, latitude, longitude, website };
}
|
||||
|
||||
// ─── Phase 3: Parse Celebrations ─────────────────────────────────────────────
|
||||
|
||||
/**
 * Extract celebration entries from the celebrations HTML fragment.
 *
 * The markup is cut at every `<div class="celebration">` marker; a chunk is
 * kept only when it carries both a day abbreviation and a time.
 *
 * @param html - HTML returned by the celebrations endpoint.
 * @returns One entry per usable chunk. `date` defaults to `''` and `type`
 *   defaults to `'eucharistie'` when the chunk does not carry them.
 */
function parseCelebrations(html: string): CelebrationEntry[] {
  // Whatever precedes the first marker is not a celebration chunk.
  const [, ...chunks] = html.split('<div class="celebration">');
  const result: CelebrationEntry[] = [];

  chunks.forEach((chunk) => {
    const day = chunk.match(/celebration__date__day">\s*(\w+)\s*</);
    const time = chunk.match(/celebration__time">\s*([\d.]+)\s*</);
    if (!day || !time) return;

    // Date (DD/MM) and type are optional.
    const date = chunk.match(/celebration__date__date">\s*([\d/]+)\s*</);
    const kind = chunk.match(/celebration__info__type">\s*([^<]+)\s*</);

    result.push({
      dayAbbrev: day[1].toLowerCase().trim(),
      date: date ? date[1].trim() : '',
      time: time[1].trim(),
      type: kind ? kind[1].trim().toLowerCase() : 'eucharistie',
    });
  });

  return result;
}
|
||||
|
||||
/**
 * Collapse dated celebrations into a list of weekly mass slots.
 *
 * Only entries typed `eucharistie` or `eucharistieviering` are considered.
 * Entries whose day abbreviation is not in `DUTCH_DAYS` are dropped, and each
 * (day, time) pair is emitted once, in first-seen order.
 *
 * @param celebrations - Entries produced by `parseCelebrations`.
 * @returns De-duplicated `{ dayOfWeek, time }` slots.
 */
function deduceSchedules(celebrations: CelebrationEntry[]): ParsedSchedule[] {
  const emitted = new Set<string>();
  const result: ParsedSchedule[] = [];

  for (const entry of celebrations) {
    const isMass = entry.type === 'eucharistie' || entry.type === 'eucharistieviering';
    if (!isMass) continue;

    const dayOfWeek = DUTCH_DAYS[entry.dayAbbrev];
    if (dayOfWeek === undefined) continue;

    // Normalize time: "15.00" → "15:00", and zero-pad a single-digit hour.
    const time = entry.time.replace('.', ':').replace(/^(\d):/, '0$1:');

    const slot = `${dayOfWeek}:${time}`;
    if (emitted.has(slot)) continue;
    emitted.add(slot);
    result.push({ dayOfWeek, time });
  }

  return result;
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every church with country `BE`, selecting the fields used for
 * duplicate matching (name, coordinates, all per-source IDs, contact fields).
 *
 * @returns The rows in the `ExistingChurch` shape.
 */
async function loadExistingBelgianChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Belgian churches for deduplication...');

  const rows = await prisma.church.findMany({
    where: { country: 'BE' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // Per-source identifiers.
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details.
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${rows.length} existing Belgian churches`);
  return rows;
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Import a single kerknet.be church: fetch and parse its detail page and
 * celebrations, match it against the known churches, then update the match or
 * create a new row together with its mass schedules.
 *
 * @param slug - kerknet.be church slug.
 * @param existingChurches - In-memory dedup list; a newly created church is
 *   appended so later slugs in the same run can match it.
 * @param dryRun - When true, only the counters are updated; nothing is written.
 * @param stats - Run counters, mutated in place.
 * @throws Re-throws a failed update of a matched church unless its message
 *   contains "Unique constraint".
 */
async function processChurch(
  slug: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Fetch church detail page
  const detailHtml = await fetchPage(`${BASE_URL}/kerk/${slug}`, DETAIL_DELAY_MS);
  if (!detailHtml) {
    stats.errors++;
    return;
  }

  const church = parseChurchPage(detailHtml, slug);
  if (!church) {
    stats.churchesSkipped++;
    return;
  }
  stats.churchesFetched++;

  // Fetch celebrations via AJAX; a failed fetch simply yields no celebrations.
  const celebHtml = await fetchPage(
    `${BASE_URL}/kerknet-celebration/churches/ajax/load-more/0/${church.nodeId}`,
    CELEBRATION_DELAY_MS,
  );
  const celebrations: CelebrationEntry[] = celebHtml ? parseCelebrations(celebHtml) : [];
  const schedules = deduceSchedules(celebrations);

  const kerknetId = `kerknet-${church.nodeId}`;
  const probe = {
    name: church.name,
    lat: church.latitude,
    lng: church.longitude,
    kerknetId,
  };
  const duplicate = findDuplicateChurch(probe, existingChurches);

  // Builds the massSchedule rows for a given church ID.
  const toRows = <T>(churchId: T) =>
    schedules.map((s) => ({
      churchId,
      dayOfWeek: s.dayOfWeek,
      time: s.time,
      language: 'Dutch',
    }));

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
    } else {
      stats.churchesCreated++;
    }
    stats.schedulesCreated += schedules.length;
    return;
  }

  if (!duplicate) {
    // No match: create the church, register it for in-run dedup, then add schedules.
    try {
      const created = await prisma.church.create({
        data: {
          name: church.name,
          latitude: church.latitude,
          longitude: church.longitude,
          address: church.address,
          zip: church.zip,
          city: church.city,
          country: 'BE',
          website: church.website,
          hasWebsite: !!church.website,
          kerknetId,
          source: 'kerknet',
          websiteLanguage: 'nl',
        },
      });
      stats.churchesCreated++;

      existingChurches.push({
        id: created.id,
        name: church.name,
        latitude: church.latitude,
        longitude: church.longitude,
        osmId: null,
        baiduId: null,
        masstimesId: null,
        orarimesseId: null,
        massSchedulesPhId: null,
        philmassId: null,
        horariosMisasId: null,
        mszeInfoId: null,
        weekdayMassesId: null,
        messesInfoId: null,
        bohosluzbyId: null,
        miserendId: null,
        kerknetId,
        gottesdienstzeitenId: null,
        discovermassId: null,
        source: 'kerknet',
        website: church.website,
        phone: null,
        address: church.address,
      });

      if (schedules.length > 0) {
        await prisma.massSchedule.createMany({ data: toRows(created.id) });
        await prisma.church.update({
          where: { id: created.id },
          data: { lastScrapedAt: new Date() },
        });
        stats.schedulesCreated += schedules.length;
      }
    } catch (error) {
      if (error instanceof Error && error.message.includes('Unique constraint')) {
        stats.churchesSkipped++;
        return;
      }
      stats.errors++;
      console.error(` Error creating ${slug}: ${error instanceof Error ? error.message : error}`);
    }
    return;
  }

  // Match found: attach the kerknet ID and fill only the blanks.
  stats.churchesMatched++;
  const patch: Record<string, unknown> = { kerknetId };
  if (!duplicate.address && church.address) patch.address = church.address;
  if (!duplicate.website && church.website) patch.website = church.website;

  try {
    await prisma.church.update({
      where: { id: duplicate.id },
      data: patch,
    });
  } catch (error) {
    if (error instanceof Error && error.message.includes('Unique constraint')) {
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }

  if (schedules.length === 0) return;

  // Replace the church's schedules atomically and stamp the scrape time.
  try {
    await prisma.$transaction(async (tx) => {
      await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
      await tx.massSchedule.createMany({ data: toRows(duplicate.id) });
      await tx.church.update({
        where: { id: duplicate.id },
        data: { lastScrapedAt: new Date() },
      });
    });
    stats.schedulesCreated += schedules.length;
  } catch (error) {
    stats.errors++;
    console.error(` Error saving schedules for ${slug}: ${error instanceof Error ? error.message : error}`);
  }
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line flags of the kerknet importer from `process.argv`.
 *
 * Unknown flags are ignored. `--help`/`-h` prints usage and exits with code 0.
 * The process exits with code 1 when neither `--all` nor `--slug` is given, or
 * when `--resume-from` is not followed by a non-negative integer.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        const parsed = Number.parseInt(raw, 10);
        // A missing or non-numeric value would otherwise become NaN and be
        // silently ignored; a negative one would make slice() count from the end.
        if (!Number.isInteger(parsed) || parsed < 0) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parsed;
        break;
      }
      case '--slug':
        result.slug = args[++i];
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-kerknet.ts [options]

Options:
--all Import all churches from kerknet.be
--slug <slug> Import a single church (e.g., o-l-vrouw-kerk-scherpenheuvel)
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N churches (after enumeration)
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-kerknet.ts --slug o-l-vrouw-kerk-scherpenheuvel --dry-run
npx tsx scripts/import-kerknet.ts --all --dry-run
npx tsx scripts/import-kerknet.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.slug) {
    console.error('Error: specify --all or --slug <slug>');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
/**
 * Render a millisecond duration as `Xh Ym Zs`, `Ym Zs` or `Zs`, omitting
 * leading units that are zero.
 *
 * @param ms - Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);

  if (totalHours > 0) {
    return `${totalHours}h ${totalMinutes % 60}m ${totalSeconds % 60}s`;
  }
  if (totalMinutes > 0) {
    return `${totalMinutes}m ${totalSeconds % 60}s`;
  }
  return `${totalSeconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run the kerknet.be import: print the banner, mark the background job as
 * running (when `--job-id` is given), load the known Belgian churches, resolve
 * the slug list, process each slug, print the summary and record the final
 * job status.
 *
 * A failure on one slug is counted and logged and does not stop the run.
 * Background-job updates are best effort; their failures are ignored.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const rule = '='.repeat(70);

  console.log('\n' + rule);
  console.log('KERKNET.BE (BELGIUM/FLANDERS) IMPORTER');
  console.log(rule);
  console.log(`Mode: ${args.slug ? `Single: ${args.slug}` : 'All churches'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(rule + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch { /* Job might not exist */ }
  }

  const stats: ImportStats = {
    slugsEnumerated: 0,
    churchesFetched: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  const existingChurches = await loadExistingBelgianChurches();

  // Get list of church slugs: the single requested one, or the full enumeration.
  let slugs: string[];
  if (args.slug) {
    slugs = [args.slug];
  } else {
    slugs = await enumerateChurchSlugs();
    stats.slugsEnumerated = slugs.length;
  }

  // Resuming only applies to a full run.
  if (args.resumeFrom && !args.slug) {
    slugs = slugs.slice(args.resumeFrom);
    console.log(`Resuming from church index ${args.resumeFrom} (${slugs[0]})\n`);
  }

  console.log(`Processing ${slugs.length} churches\n`);

  for (const [index, slug] of slugs.entries()) {
    // Progress line every 50 churches, or for every church on short runs.
    if (index % 50 === 0 || slugs.length <= 10) {
      const elapsed = formatDuration(Date.now() - startTime);
      console.log(`[${index + 1}/${slugs.length}] ${slug} [${elapsed} elapsed, ${stats.churchesCreated} new, ${stats.churchesMatched} matched]`);
    }

    try {
      await processChurch(slug, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing ${slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + rule);
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log(rule);
  console.log(`Slugs enumerated: ${stats.slugsEnumerated}`);
  console.log(`Churches fetched: ${stats.churchesFetched}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log(rule + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          processed: stats.churchesFetched,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch { /* Ignore */ }
  }
}
|
||||
|
||||
// Entry point: run the importer, report a fatal error through the exit code,
// and always release the database connections afterwards.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit(1): exiting here would
    // end the process before the cleanup in .finally() below gets to run.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });
|
||||
695
scripts/import-mass-schedules-ph.ts
Normal file
695
scripts/import-mass-schedules-ph.ts
Normal file
@@ -0,0 +1,695 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from mass-schedules.com (Philippines)
|
||||
*
|
||||
* mass-schedules.com has been operating since 2008 and covers ~1,500 Philippine
|
||||
* churches with weekly mass schedule tables and coordinates on separate map pages.
|
||||
*
|
||||
* Import strategy:
|
||||
* 1. Fetch sitemap XML → extract all /catholic-church/{id}/ URLs
|
||||
* 2. For each church: fetch page HTML, parse name/address/schedule, fetch map
|
||||
* page for coordinates, match against existing PH churches, upsert
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-mass-schedules-ph.ts --all
|
||||
* npx tsx scripts/import-mass-schedules-ph.ts --all --dry-run
|
||||
* npx tsx scripts/import-mass-schedules-ph.ts --church-id 34
|
||||
* npx tsx scripts/import-mass-schedules-ph.ts --all --resume-from 500
|
||||
* npx tsx scripts/import-mass-schedules-ph.ts --all --skip-schedules
|
||||
* npx tsx scripts/import-mass-schedules-ph.ts --all --job-id {uuid}
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Connection string: DATABASE_URL when set, otherwise a local default database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
|
||||
// Log the target with the `:password@` part of the URL replaced by `:***@`.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
|
||||
const pool = new Pool({
|
||||
connectionString: dbUrl,
|
||||
// Any URL containing "neon" gets SSL with certificate verification disabled;
// every other URL gets no explicit SSL setting.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
|
||||
});
|
||||
// Prisma client running on top of the pg pool through the PrismaPg adapter.
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
// Origin of the source site; church and map URLs are built from it.
const SITE_BASE = 'https://www.mass-schedules.com';
|
||||
// Sitemap that is scanned for /catholic-church/{id}/{slug}.html URLs.
const SITEMAP_URL = `${SITE_BASE}/sitemaps/sitemap02272021.xml`;
|
||||
// User-Agent header sent with every request made by fetchPage.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
|
||||
// Pause, in milliseconds, that fetchPage inserts before every request after the first.
const REQUEST_DELAY_MS = 1500;
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** A church entry discovered in the sitemap. */
interface SitemapChurch {
|
||||
// Numeric ID segment of the church URL, kept as a string.
id: string;
|
||||
// Slug segment of the church URL (file name without ".html").
slug: string;
|
||||
// Absolute URL of the church page.
url: string;
|
||||
}
|
||||
|
||||
/** Fields extracted from a church page by parseChurchPage. */
interface ParsedChurch {
|
||||
// Page heading with a trailing "Mass Schedule" removed; '' when no heading was found.
name: string;
|
||||
// Address text with HTML tags removed and whitespace collapsed.
address: string | null;
|
||||
// Text of the first breadcrumb link pointing at /locations/{id}/.
region: string | null;
|
||||
// Text of the first breadcrumb link pointing at /catholic-churches/{id}/.
city: string | null;
|
||||
// Trimmed content of the element with id="TELEPHONE".
phone: string | null;
|
||||
// Absolute URL of the location-map page linked from the address block.
mapUrl: string | null;
|
||||
}
|
||||
|
||||
/** One weekly mass slot: a day of the week plus a 24-hour start time. */
interface ParsedSchedule {
|
||||
dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
|
||||
time: string; // "05:00", "18:30"
|
||||
}
|
||||
|
||||
/** Counters accumulated over one import run; processChurch mutates them in place. */
interface ImportStats {
|
||||
// Sitemap entries handed to processChurch.
churchesFound: number;
|
||||
// Entries that matched an existing church.
churchesMatched: number;
|
||||
// Entries created as new churches (or that would be, in a dry run).
churchesCreated: number;
|
||||
// Entries dropped: no name, no coordinates, or a unique-constraint conflict.
churchesSkipped: number;
|
||||
// Churches for which a schedule set was written (or would be, in a dry run).
schedulesProcessed: number;
|
||||
// Total mass schedule rows written (or that would be, in a dry run).
massSchedulesCreated: number;
|
||||
// Failed church-page fetches and failed schedule writes.
errors: number;
|
||||
}
|
||||
|
||||
/** Command-line options, as produced by parseArgs. */
interface CLIArgs {
|
||||
// --all
all: boolean;
|
||||
// --church-id <id>
churchId?: string;
|
||||
// --dry-run
dryRun: boolean;
|
||||
// --skip-schedules
skipSchedules: boolean;
|
||||
// --resume-from <id>; the help text describes it as a church-ID threshold
// (how it is applied is outside this section — confirm in main)
resumeFrom?: number;
|
||||
// --job-id <uuid>
jobId?: string;
|
||||
}
|
||||
|
||||
// ─── HTTP Client ─────────────────────────────────────────────────────────────
|
||||
|
||||
// Number of requests started by fetchPage so far.
let requestCount = 0;

/**
 * Wait for the given number of milliseconds.
 *
 * @param ms - Time to wait, in milliseconds.
 * @returns A promise that resolves once the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Fetch a URL and return its body as text.
 *
 * Every call after the first waits `REQUEST_DELAY_MS` before sending, and
 * every call increments `requestCount`.
 *
 * @param url - Absolute URL to fetch.
 * @returns The response body, or `null` on a non-OK status or a fetch error
 *   (both are logged).
 */
async function fetchPage(url: string): Promise<string | null> {
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const response = await fetch(url, { headers });
    if (response.ok) {
      return await response.text();
    }
    console.error(` HTTP ${response.status} for ${url}`);
    return null;
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}
|
||||
|
||||
// ─── Sitemap Parser ──────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Download the sitemap and list the church pages it references.
 *
 * @returns One entry per distinct church ID (the first occurrence wins),
 *   sorted by numeric ID.
 * @throws Error when the sitemap cannot be fetched.
 */
async function fetchChurchUrlsFromSitemap(): Promise<SitemapChurch[]> {
  console.log(`Fetching sitemap: ${SITEMAP_URL}`);
  const xml = await fetchPage(SITEMAP_URL);
  if (!xml) {
    throw new Error('Failed to fetch sitemap');
  }

  // Extract /catholic-church/{id}/{slug}.html URLs, keyed by ID because the
  // sitemap has duplicates.
  const byId = new Map<string, SitemapChurch>();
  for (const [, id, slug] of xml.matchAll(/\/catholic-church\/(\d+)\/([\w-]+)\.html/g)) {
    if (byId.has(id)) continue;
    byId.set(id, {
      id,
      slug,
      url: `${SITE_BASE}/catholic-church/${id}/${slug}.html`,
    });
  }

  // Sort by ID for predictable ordering
  return [...byId.values()].sort((a, b) => parseInt(a.id) - parseInt(b.id));
}
|
||||
|
||||
// ─── HTML Parsers ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Extract name, address, phone, region, city and map link from a church page.
 *
 * @param html - Raw HTML of the church page.
 * @returns The parsed fields; `name` is `''` and the others are `null` when the
 *   corresponding markup is absent.
 */
function parseChurchPage(html: string): ParsedChurch {
  // Trimmed text of the first link matching a breadcrumb pattern, if any.
  const firstLinkText = (pattern: RegExp): string | null => {
    const [first] = html.matchAll(pattern);
    return first ? first[1].trim() : null;
  };

  // Name from <h1 class="page_title">...</h1>, minus the " Mass Schedule" suffix.
  const heading = html.match(/<h1[^>]*class="page_title"[^>]*>([\s\S]*?)<\/h1>/i);
  const name = (heading ? heading[1].trim() : '')
    .replace(/\s*Mass\s*Schedule\s*$/i, '')
    .trim();

  // Address from <label>address:</label> ... <p class="data">...</p>
  let address: string | null = null;
  let mapUrl: string | null = null;
  const addressBlock = html.match(/<label>address:<\/label>\s*<p class="data">([\s\S]*?)<\/p>/i);
  if (addressBlock) {
    const rawAddress = addressBlock[1];

    // The map link lives inside the address markup, so read it before the tags go.
    const mapLink = rawAddress.match(/href="(\/location-map\/[^"]+)"/);
    if (mapLink) {
      mapUrl = `${SITE_BASE}${mapLink[1]}`;
    }

    // Strip tags and the map-link label, then collapse whitespace.
    const cleaned = rawAddress
      .replace(/<[^>]+>/g, '')
      .replace(/\(show location map\)/i, '')
      .replace(/\s+/g, ' ')
      .trim();
    address = cleaned || null;
  }

  // Phone from the element with id="TELEPHONE"
  const phoneBlock = html.match(/id="TELEPHONE"[^>]*>([\s\S]*?)<\/p>/i);
  const phone = phoneBlock ? phoneBlock[1].trim() || null : null;

  // Region and city from breadcrumbs
  const region = firstLinkText(/class="normal"\s+href="[^"]*\/locations\/\d+\/[^"]*"[^>]*>([^<]+)<\/a>/gi);
  const city = firstLinkText(/class="normal"\s+href="[^"]*\/catholic-churches\/\d+\/[^"]*"[^>]*>([^<]+)<\/a>/gi);

  return { name, address, region, city, phone, mapUrl };
}
|
||||
|
||||
/**
 * Read the weekly schedule table of a church page.
 *
 * Only the first `<tbody>` is scanned. Within each `<tr>`, the first seven
 * `<td>` cells are treated as Sunday (0) through Saturday (6), and the first
 * `<p class="schedule">` start time in a cell is converted to 24-hour form.
 *
 * @param html - Raw HTML of the church page.
 * @returns Unique `{ dayOfWeek, time }` slots in table order; empty when no
 *   table body or rows are found.
 */
function parseScheduleTable(html: string): ParsedSchedule[] {
  const result: ParsedSchedule[] = [];

  const body = html.match(/<tbody>([\s\S]*?)<\/tbody>/i);
  const rows = body ? body[1].match(/<tr>([\s\S]*?)<\/tr>/gi) : null;
  if (!rows) return result;

  const emitted = new Set<string>();

  for (const row of rows) {
    const cells = row.match(/<td>([\s\S]*?)<\/td>/gi) || [];

    // The column index doubles as the day of week: 0=Sun, 1=Mon, ..., 6=Sat.
    cells.slice(0, 7).forEach((cell, dayOfWeek) => {
      // Start time from <p class="schedule">5:00 AM - 6:00 AM</p>
      const start = cell.match(/<p class="schedule">\s*(\d{1,2}:\d{2}\s*[AP]M)/i);
      if (!start) return;

      const time = convertTo24Hour(start[1].trim());
      if (!time) return;

      const slot = `${dayOfWeek}:${time}`;
      if (emitted.has(slot)) return;
      emitted.add(slot);

      result.push({ dayOfWeek, time });
    });
  }

  return result;
}
|
||||
|
||||
/**
 * Convert a 12-hour clock time to zero-padded 24-hour `HH:MM`.
 *
 * Examples: "5:00 AM" → "05:00", "6:30 PM" → "18:30", "12:00 AM" → "00:00".
 *
 * @param timeStr - Time such as "5:00 AM"; the AM/PM marker is case-insensitive
 *   and the space before it is optional.
 * @returns The 24-hour time, or `null` when the input does not match the
 *   expected format or is not a real clock time (hour above 12, minutes above 59).
 */
function convertTo24Hour(timeStr: string): string | null {
  const match = timeStr.match(/^(\d{1,2}):(\d{2})\s*(AM|PM)$/i);
  if (!match) return null;

  let hours = parseInt(match[1], 10);
  const minutes = match[2];
  const period = match[3].toUpperCase();

  // The pattern accepts any two digits, so "13:00 PM" or "7:75 AM" would
  // otherwise come out as "25:00" / "07:75".
  if (hours > 12 || parseInt(minutes, 10) > 59) return null;

  // 12 AM is midnight; 12 PM stays 12; every other PM hour moves to the afternoon.
  if (period === 'AM' && hours === 12) hours = 0;
  if (period === 'PM' && hours !== 12) hours += 12;

  return `${String(hours).padStart(2, '0')}:${minutes}`;
}
|
||||
|
||||
/**
 * Read the church coordinates from the map page's inline script, where they
 * appear as `ms.ui.church.params.lat = '14.598815'` (and likewise for `lng`).
 *
 * @param html - Raw HTML of the location-map page.
 * @returns `{ lat, lng }`, or `null` when either value is missing, not a
 *   number, or exactly 0.
 */
function parseCoordinates(html: string): { lat: number; lng: number } | null {
  const latText = html.match(/ms\.ui\.church\.params\.lat\s*=\s*'([^']+)'/);
  const lngText = html.match(/ms\.ui\.church\.params\.lng\s*=\s*'([^']+)'/);
  if (!(latText && lngText)) return null;

  const lat = parseFloat(latText[1]);
  const lng = parseFloat(lngText[1]);

  const usable = !isNaN(lat) && !isNaN(lng) && lat !== 0 && lng !== 0;
  return usable ? { lat, lng } : null;
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every church with country `PH`, selecting the fields used for
 * duplicate matching (name, coordinates, all per-source IDs, contact fields).
 *
 * @returns The rows in the `ExistingChurch` shape.
 */
async function loadExistingPhilippineChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Philippine churches for deduplication...');

  const rows = await prisma.church.findMany({
    where: { country: 'PH' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // Per-source identifiers.
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details.
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${rows.length} existing Philippine churches`);
  return rows;
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Import a single mass-schedules.com church: fetch and parse its page and map
 * page, match it against the known churches, then update the match or create a
 * new row together with its mass schedules.
 *
 * @param sitemapEntry - Church reference taken from the sitemap.
 * @param existingChurches - In-memory dedup list; a newly created church is
 *   appended so later entries in the same run can match it.
 * @param dryRun - When true, only counters and log lines are produced; nothing
 *   is written.
 * @param skipSchedules - When true, the schedule table is neither parsed nor written.
 * @param stats - Run counters, mutated in place.
 * @throws Re-throws database errors from the church update/create unless their
 *   message contains "Unique constraint".
 */
async function processChurch(
  sitemapEntry: SitemapChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  skipSchedules: boolean,
  stats: ImportStats,
): Promise<void> {
  stats.churchesFound++;

  // Fetch church page
  const churchHtml = await fetchPage(sitemapEntry.url);
  if (!churchHtml) {
    stats.errors++;
    return;
  }

  const parsed = parseChurchPage(churchHtml);
  if (!parsed.name) {
    console.log(` Skipping ${sitemapEntry.id}: no name found`);
    stats.churchesSkipped++;
    return;
  }

  // Fetch coordinates from map page
  const mapHtml = parsed.mapUrl ? await fetchPage(parsed.mapUrl) : null;
  const coords = mapHtml ? parseCoordinates(mapHtml) : null;
  if (!coords) {
    console.log(` Skipping ${sitemapEntry.id} (${parsed.name}): no coordinates`);
    stats.churchesSkipped++;
    return;
  }

  // Parse schedule (always empty when schedules are skipped).
  const schedules = skipSchedules ? [] : parseScheduleTable(churchHtml);
  const hasSchedules = schedules.length > 0 && !skipSchedules;

  // Builds the massSchedule rows for a given church ID.
  const toRows = <T>(churchId: T) =>
    schedules.map((s) => ({
      churchId,
      dayOfWeek: s.dayOfWeek,
      time: s.time,
      language: 'English',
    }));

  // Build candidate for dedup
  const probe = {
    name: parsed.name,
    lat: coords.lat,
    lng: coords.lng,
    massSchedulesPhId: sitemapEntry.id,
  };
  const duplicate = findDuplicateChurch(probe, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
      console.log(` [MATCH] ${sitemapEntry.id}: "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`);
    } else {
      stats.churchesCreated++;
      console.log(` [NEW] ${sitemapEntry.id}: "${parsed.name}" at ${coords.lat},${coords.lng}`);
    }
    if (hasSchedules) {
      stats.schedulesProcessed++;
      stats.massSchedulesCreated += schedules.length;
    }
    return;
  }

  if (!duplicate) {
    // Create new church
    try {
      const created = await prisma.church.create({
        data: {
          name: parsed.name,
          latitude: coords.lat,
          longitude: coords.lng,
          address: parsed.address,
          city: parsed.city || null,
          state: parsed.region || null,
          country: 'PH',
          phone: parsed.phone,
          hasWebsite: false,
          massSchedulesPhId: sitemapEntry.id,
          source: 'mass-schedules-ph',
        },
      });
      stats.churchesCreated++;

      // Add to in-memory array for within-run dedup
      existingChurches.push({
        id: created.id,
        name: parsed.name,
        latitude: coords.lat,
        longitude: coords.lng,
        osmId: null,
        baiduId: null,
        masstimesId: null,
        orarimesseId: null,
        massSchedulesPhId: sitemapEntry.id,
        philmassId: null,
        horariosMisasId: null,
        mszeInfoId: null,
        weekdayMassesId: null,
        messesInfoId: null,
        bohosluzbyId: null,
        miserendId: null,
        kerknetId: null,
        gottesdienstzeitenId: null,
        discovermassId: null,
        source: 'mass-schedules-ph',
        website: null,
        phone: parsed.phone,
        address: parsed.address,
      });

      // Create mass schedules
      if (hasSchedules) {
        await prisma.massSchedule.createMany({ data: toRows(created.id) });
        await prisma.church.update({
          where: { id: created.id },
          data: { lastScrapedAt: new Date() },
        });
        stats.schedulesProcessed++;
        stats.massSchedulesCreated += schedules.length;
      }
    } catch (error) {
      if (error instanceof Error && error.message.includes('Unique constraint')) {
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }
    return;
  }

  // Update existing church: attach the source ID and fill only the blanks.
  stats.churchesMatched++;
  const patch: Record<string, unknown> = {
    massSchedulesPhId: sitemapEntry.id,
  };
  if (!duplicate.address && parsed.address) patch.address = parsed.address;
  if (!duplicate.phone && parsed.phone) patch.phone = parsed.phone;

  // City and state are not part of the in-memory record, so read them from the DB.
  const dbRecord = await prisma.church.findUnique({
    where: { id: duplicate.id },
    select: { city: true, state: true },
  });
  if (dbRecord && !dbRecord.city && parsed.city) patch.city = parsed.city;
  if (dbRecord && !dbRecord.state && parsed.region) patch.state = parsed.region;

  try {
    await prisma.church.update({
      where: { id: duplicate.id },
      data: patch,
    });
  } catch (error) {
    if (error instanceof Error && error.message.includes('Unique constraint')) {
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }

  if (!hasSchedules) return;

  // Replace mass schedules atomically and stamp the scrape time.
  try {
    await prisma.$transaction(async (tx) => {
      await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
      await tx.massSchedule.createMany({ data: toRows(duplicate.id) });
      await tx.church.update({
        where: { id: duplicate.id },
        data: { lastScrapedAt: new Date() },
      });
    });
    stats.schedulesProcessed++;
    stats.massSchedulesCreated += schedules.length;
  } catch (error) {
    stats.errors++;
    console.error(` Error saving schedules for ${sitemapEntry.id}: ${error instanceof Error ? error.message : error}`);
  }
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line flags of the mass-schedules.com importer from
 * `process.argv`.
 *
 * Unknown flags are ignored. `--help`/`-h` prints usage and exits with code 0.
 * The process exits with code 1 when neither `--all` nor `--church-id` is
 * given, or when `--resume-from` is not followed by a non-negative integer.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    skipSchedules: false,
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--church-id':
        result.churchId = args[++i];
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--skip-schedules':
        result.skipSchedules = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        const parsed = Number.parseInt(raw, 10);
        // A missing or non-numeric value would otherwise become NaN and be
        // carried into the run unnoticed.
        if (!Number.isInteger(parsed) || parsed < 0) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parsed;
        break;
      }
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-mass-schedules-ph.ts [options]

Options:
--all Import all churches from sitemap
--church-id <id> Import a single church by ID (e.g. "34")
--dry-run No database writes, just report what would happen
--skip-schedules Skip mass schedule import (churches only)
--resume-from <id> Skip churches with ID less than this value
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-mass-schedules-ph.ts --church-id 34 --dry-run
npx tsx scripts/import-mass-schedules-ph.ts --all
npx tsx scripts/import-mass-schedules-ph.ts --all --skip-schedules
npx tsx scripts/import-mass-schedules-ph.ts --all --resume-from 500
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.churchId) {
    console.error('Error: specify --all or --church-id <id>');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Render a millisecond duration as a compact "1h 2m 3s" / "2m 3s" / "3s" string.
 * Larger units are only shown when they are non-zero.
 *
 * @param ms elapsed time in milliseconds
 * @returns the formatted duration
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const wholeHours = Math.floor(totalMinutes / 60);

  if (wholeHours > 0) {
    return [`${wholeHours}h`, `${totalMinutes % 60}m`, `${totalSeconds % 60}s`].join(' ');
  }
  if (totalMinutes > 0) {
    return [`${totalMinutes}m`, `${totalSeconds % 60}s`].join(' ');
  }
  return `${totalSeconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Entry point of the mass-schedules.com (Philippines) importer.
 *
 * Steps: parse CLI flags, print a banner, mark the background job as running
 * (when --job-id is given), load existing churches for deduplication, build the
 * list of churches to process (single ID or full sitemap), apply --resume-from,
 * process each church, print a summary and record the job result.
 *
 * A failure while processing one church is counted in `stats.errors` and does
 * not stop the run. Background-job updates are best-effort: a failure is logged
 * as a warning instead of being swallowed silently.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const startTime = Date.now();
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  console.log('\n' + '='.repeat(70));
  console.log('MASS-SCHEDULES.COM (PHILIPPINES) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.all ? 'All churches from sitemap' : `Single church: ${args.churchId}`}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  console.log(`Skip schedules: ${args.skipSchedules ? 'YES' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from ID: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (error) {
      // Job might not exist yet — keep going, but leave a trace.
      console.warn(`Warning: could not mark job ${args.jobId} as running: ${describeError(error)}`);
    }
  }

  // Load existing Philippine churches for dedup
  const existingChurches = await loadExistingPhilippineChurches();

  // Build church list: skip sitemap for single-church mode
  let churchesToProcess: SitemapChurch[];
  if (args.churchId) {
    // Single church: construct URL directly, no sitemap needed
    churchesToProcess = [{
      id: args.churchId,
      slug: 'church',
      url: `${SITE_BASE}/catholic-church/${args.churchId}/church.html`,
    }];
    console.log(`Single church mode: ID ${args.churchId}\n`);
  } else {
    // Full mode: fetch sitemap
    const allChurches = await fetchChurchUrlsFromSitemap();
    console.log(`Found ${allChurches.length} unique church URLs in sitemap\n`);
    churchesToProcess = allChurches;
  }

  // Handle --resume-from: keep only churches whose numeric ID is >= the given value
  const resumeFrom = args.resumeFrom;
  if (resumeFrom) {
    const before = churchesToProcess.length;
    churchesToProcess = churchesToProcess.filter((c) => Number.parseInt(c.id, 10) >= resumeFrom);
    console.log(`Resuming from ID ${resumeFrom} (skipping ${before - churchesToProcess.length} churches)\n`);
  }

  const stats: ImportStats = {
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // Process each church
  for (let i = 0; i < churchesToProcess.length; i++) {
    const church = churchesToProcess[i];
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${i + 1}/${churchesToProcess.length}] Church ID ${church.id} [${elapsed} elapsed]`);

    try {
      await processChurch(church, existingChurches, args.dryRun, args.skipSchedules, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing church ${church.id}: ${describeError(error)}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      // Best-effort: the import itself is done, so only warn.
      console.warn(`Warning: could not record result for job ${args.jobId}: ${describeError(error)}`);
    }
  }
}
|
||||
|
||||
// Script entry: run the importer, report a fatal error with exit code 1,
// and close the Prisma client and the pg pool once main() has settled.
void (async () => {
  try {
    await main();
  } catch (error) {
    console.error('Fatal error:', error);
    process.exit(1);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
})();
|
||||
672
scripts/import-masstimes-api.ts
Normal file
672
scripts/import-masstimes-api.ts
Normal file
@@ -0,0 +1,672 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules globally from masstimes.org API
|
||||
*
|
||||
* masstimes.org has ~121,000 churches worldwide. This script queries their
|
||||
* geo-search API with a grid of coordinates covering world landmass, then
|
||||
* deduplicates and imports the results.
|
||||
*
|
||||
* API: GET https://masstimes.org/Churchs/?lat={lat}&long={lng}&pg={page}
|
||||
* - Requires Referer header
|
||||
* - Returns 30 results per page within 100-mile (~160km) radius
|
||||
* - Paginate until empty array
|
||||
*
|
||||
* Grid strategy:
|
||||
* - 2.5° latitude spacing (~278km), longitude adjusted for latitude
|
||||
* - Continental bounding boxes to skip oceans
|
||||
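* - 100-mile radius means ~322km diameter — 2.5° spacing makes neighbouring search circles overlap (grid-cell centres can still lie ~187km from the nearest point, just outside the ~161km radius)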
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-masstimes-api.ts --all
|
||||
* npx tsx scripts/import-masstimes-api.ts --all --dry-run
|
||||
* npx tsx scripts/import-masstimes-api.ts --region europe
|
||||
* npx tsx scripts/import-masstimes-api.ts --all --skip-us
|
||||
* npx tsx scripts/import-masstimes-api.ts --all --job-id {uuid}
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Connection string: DATABASE_URL when set, otherwise the local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Echo the target with the password segment masked.
console.log('Connecting to database: ' + dbUrl.replace(/:[^:@]+@/, ':***@'));

// URLs containing "neon" get TLS without certificate verification; others use the pg default.
const pool = new Pool({
  connectionString: dbUrl,
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});

// Prisma runs on top of the pg pool through the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Geo-search endpoint queried with `lat`, `long` and `pg` parameters. */
const API_BASE = 'https://masstimes.org/Churchs/';

/** Referer header value sent with every request. */
const REFERER = 'https://masstimes.org/map';

/** User-Agent header identifying this importer. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Pause between requests, in milliseconds (2 seconds — respectful rate). */
const RATE_LIMIT_MS = 2 * 1000;

/** Number of results in a full page; a shorter page is treated as the last one. */
const PAGE_SIZE = 30;

/** Latitude step between grid rows, in degrees (~278km). */
const LAT_SPACING = 2.5;

/** Target distance between grid columns, in kilometres. */
const TARGET_LNG_SPACING_KM = 250;
|
||||
|
||||
// Country name → ISO code mapping for masstimes country names
|
||||
/**
 * Lower-cased country name → two-letter country code.
 * Several countries are listed under more than one spelling
 * (e.g. "czech republic" / "czechia", "east timor" / "timor-leste").
 */
const COUNTRY_CODE_MAP: Record<string, string> = {
  // North America
  'united states': 'US', 'canada': 'CA', 'mexico': 'MX',

  // Europe
  'united kingdom': 'GB', 'ireland': 'IE', 'france': 'FR', 'germany': 'DE',
  'spain': 'ES', 'italy': 'IT', 'portugal': 'PT', 'netherlands': 'NL',
  'belgium': 'BE', 'luxembourg': 'LU', 'switzerland': 'CH', 'austria': 'AT',
  'poland': 'PL', 'czech republic': 'CZ', 'czechia': 'CZ', 'slovakia': 'SK',
  'hungary': 'HU', 'croatia': 'HR', 'slovenia': 'SI', 'romania': 'RO',
  'bulgaria': 'BG', 'serbia': 'RS', 'bosnia and herzegovina': 'BA',
  'montenegro': 'ME', 'north macedonia': 'MK', 'albania': 'AL', 'kosovo': 'XK',
  'greece': 'GR', 'cyprus': 'CY', 'malta': 'MT', 'denmark': 'DK',
  'sweden': 'SE', 'norway': 'NO', 'finland': 'FI', 'iceland': 'IS',
  'estonia': 'EE', 'latvia': 'LV', 'lithuania': 'LT',
  'ukraine': 'UA', 'russia': 'RU', 'belarus': 'BY', 'moldova': 'MD',
  'georgia': 'GE', 'armenia': 'AM', 'azerbaijan': 'AZ',

  // Middle East and North Africa
  'turkey': 'TR', 'israel': 'IL', 'jordan': 'JO', 'lebanon': 'LB',
  'egypt': 'EG', 'morocco': 'MA', 'tunisia': 'TN', 'algeria': 'DZ',

  // Asia
  'india': 'IN', 'sri lanka': 'LK', 'pakistan': 'PK', 'bangladesh': 'BD',
  'nepal': 'NP', 'myanmar': 'MM', 'thailand': 'TH', 'vietnam': 'VN',
  'cambodia': 'KH', 'laos': 'LA', 'malaysia': 'MY', 'singapore': 'SG',
  'indonesia': 'ID', 'philippines': 'PH', 'china': 'CN', 'japan': 'JP',
  'south korea': 'KR', 'korea, south': 'KR', 'taiwan': 'TW',
  'hong kong': 'HK', 'macau': 'MO', 'mongolia': 'MN',

  // Oceania
  'australia': 'AU', 'new zealand': 'NZ', 'fiji': 'FJ',
  'papua new guinea': 'PG', 'samoa': 'WS', 'tonga': 'TO', 'guam': 'GU',

  // Sub-Saharan Africa
  'nigeria': 'NG', 'ghana': 'GH', 'kenya': 'KE', 'tanzania': 'TZ',
  'uganda': 'UG', 'south africa': 'ZA', 'cameroon': 'CM', 'senegal': 'SN',
  'ethiopia': 'ET', 'madagascar': 'MG', 'mozambique': 'MZ',
  'zambia': 'ZM', 'zimbabwe': 'ZW', 'malawi': 'MW', 'rwanda': 'RW',
  'burundi': 'BI', 'congo, democratic republic of the': 'CD',
  'congo, republic of the': 'CG', "côte d'ivoire": 'CI', 'ivory coast': 'CI',
  'burkina faso': 'BF', 'mali': 'ML', 'niger': 'NE', 'chad': 'TD',
  'central african republic': 'CF', 'gabon': 'GA', 'equatorial guinea': 'GQ',
  'angola': 'AO', 'namibia': 'NA', 'botswana': 'BW', 'lesotho': 'LS',
  'eswatini': 'SZ', 'swaziland': 'SZ', 'mauritius': 'MU',

  // South America
  'brazil': 'BR', 'argentina': 'AR', 'colombia': 'CO', 'peru': 'PE',
  'chile': 'CL', 'venezuela': 'VE', 'ecuador': 'EC', 'bolivia': 'BO',
  'paraguay': 'PY', 'uruguay': 'UY', 'guyana': 'GY', 'suriname': 'SR',

  // Caribbean and Central America
  'trinidad and tobago': 'TT', 'jamaica': 'JM', 'barbados': 'BB',
  'bahamas': 'BS', 'bahamas, the': 'BS', 'haiti': 'HT',
  'dominican republic': 'DO', 'cuba': 'CU', 'puerto rico': 'PR',
  'guatemala': 'GT', 'honduras': 'HN', 'el salvador': 'SV',
  'nicaragua': 'NI', 'costa rica': 'CR', 'panama': 'PA', 'belize': 'BZ',
  'grenada': 'GD', 'saint lucia': 'LC', 'dominica': 'DM',
  'saint vincent and the grenadines': 'VC', 'antigua and barbuda': 'AG',
  'saint kitts and nevis': 'KN', 'bermuda': 'BM', 'cayman islands': 'KY',
  'aruba': 'AW', 'curaçao': 'CW', 'curacao': 'CW',

  // Gulf states, Iran, Afghanistan
  'united arab emirates': 'AE', 'saudi arabia': 'SA', 'qatar': 'QA',
  'bahrain': 'BH', 'kuwait': 'KW', 'oman': 'OM', 'iraq': 'IQ',
  'iran': 'IR', 'afghanistan': 'AF',

  // Central Asia
  'kazakhstan': 'KZ', 'uzbekistan': 'UZ', 'kyrgyzstan': 'KG',
  'tajikistan': 'TJ', 'turkmenistan': 'TM',

  // European microstates
  'liechtenstein': 'LI', 'monaco': 'MC', 'andorra': 'AD', 'san marino': 'SM',
  'vatican city': 'VA', 'holy see (vatican city)': 'VA',

  // Timor-Leste
  'east timor': 'TL', 'timor-leste': 'TL',
};
|
||||
|
||||
// Continental bounding boxes (lat_min, lat_max, lng_min, lng_max)
|
||||
/**
 * Named regions, each covered by one or more bounding boxes.
 * Every box is a tuple of (lat_min, lat_max, lng_min, lng_max) in degrees.
 */
const REGIONS: Record<string, Array<[latMin: number, latMax: number, lngMin: number, lngMax: number]>> = {
  'north-america': [
    [7, 72, -170, -50],
  ],
  'central-america': [
    [7, 24, -120, -60],
  ],
  'south-america': [
    [-56, 13, -82, -34],
  ],
  'europe': [
    [35, 72, -12, 45],
  ],
  'eastern-europe': [
    [40, 70, 20, 60],
  ],
  'africa': [
    [-36, 38, -20, 55],
  ],
  'middle-east': [
    [12, 42, 25, 65],
  ],
  'south-asia': [
    [5, 38, 60, 98],
  ],
  'east-asia': [
    [18, 55, 95, 150],
  ],
  'southeast-asia': [
    [-12, 22, 92, 142],
  ],
  // The only region made of two boxes.
  'oceania': [
    [-48, -8, 110, 180],
    [-22, 0, 160, 180],
  ],
  'central-asia': [
    [35, 55, 45, 90],
  ],
};
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One church record as returned by the masstimes.org geo-search API. */
interface MasstimesChurch {
  /** Identifier on masstimes.org; stored as `masstimesId` by this importer. */
  id: string;
  name: string;
  /** Decimal degrees as text; parsed with `parseFloat` before use. */
  latitude: string;
  /** Decimal degrees as text; parsed with `parseFloat` before use. */
  longitude: string;

  // Postal address parts
  church_address_street_address: string;
  church_address_city_name: string;
  /** Saved into the church's `state` column by this importer. */
  church_address_providence_name: string;
  church_address_postal_code: string;
  /** Country name; translated to a code through COUNTRY_CODE_MAP. */
  church_address_country_territory_name: string;
  church_address_county: string | null;

  // Parish details
  diocese_name: string;
  phone_number: string;
  email: string;
  /** Parish website URL. */
  url: string;
  pastors_name: string;

  /** Service times attached to the church. */
  church_worship_times: MasstimesWorshipTime[];
  distance: string;
  wheel_chair_access: boolean;
}

/** One service-time entry of a masstimes.org church. */
interface MasstimesWorshipTime {
  /** Day label; matched case-insensitively against DAY_MAP. */
  day_of_week: string;
  /** Start time in "HH:MM:SS" form. */
  time_start: string;
  time_end: string;
  /** Language of the service; a missing value is imported as 'Unknown'. */
  language: string | null;
  /** Service category; only 'Weekend' and 'Week Days' entries are imported. */
  service_typename: string;
  /** Free-text remark; imported as the schedule's notes. */
  comment: string;
  is_perpetual: boolean;
}
|
||||
|
||||
/** Running counters of one import run; printed in the final summary. */
interface ImportStats {
  /** Grid points iterated so far (incremented before the resume check). */
  gridPoints: number;
  /** Page requests issued to the API. */
  apiRequests: number;
  /** Distinct masstimes IDs seen across all grid points. */
  churchesDiscovered: number;
  /** Churches that matched an existing row (by masstimes ID or duplicate detection). */
  churchesMatched: number;
  /** New church rows created (or that would be created in a dry run). */
  churchesCreated: number;
  /** Writes rejected by a unique constraint. */
  churchesSkipped: number;
  /** Mass schedule rows created together with new churches. */
  massSchedulesCreated: number;
  /** Failed grid points plus failed church creations. */
  errors: number;
}
|
||||
|
||||
/** Options accepted on the command line of the masstimes importer. */
interface CLIArgs {
  /** --all: query every region in REGIONS. */
  all: boolean;
  /** --region <name>: query a single region. */
  region?: string;
  /** --dry-run: no database writes. */
  dryRun: boolean;
  /** --skip-us: leave out grid points inside the US bounding box. */
  skipUs: boolean;
  /** --resume-from <n>: number of leading grid points to skip (0 = start). */
  resumeFrom: number;
  /** --job-id <uuid>: background job to report progress to. */
  jobId?: string;
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line flags of the masstimes.org importer.
 *
 * Recognised flags: --all, --region <name>, --dry-run, --skip-us,
 * --resume-from <n>, --job-id <uuid>, --help.
 *
 * Exits with code 0 after printing help. Exits with code 1 when neither --all
 * nor --region is given, when --resume-from is not a number, or when the
 * requested region (used without --all) is not a key of REGIONS.
 *
 * @returns the parsed options
 */
function parseArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false, skipUs: false, resumeFrom: 0 };

  for (let i = 0; i < argv.length; i++) {
    switch (argv[i]) {
      case '--all': result.all = true; break;
      case '--region': result.region = argv[++i]; break;
      case '--dry-run': result.dryRun = true; break;
      case '--skip-us': result.skipUs = true; break;
      case '--resume-from': {
        // NaN would make every `i < resumeFrom` comparison false, i.e. the flag
        // would be ignored without any hint to the operator.
        const raw = argv[++i];
        const parsed = Number.parseInt(raw ?? '', 10);
        if (Number.isNaN(parsed)) {
          console.error(`Error: --resume-from expects a number, got "${raw ?? ''}"`);
          process.exit(1);
        }
        result.resumeFrom = parsed;
        break;
      }
      case '--job-id': result.jobId = argv[++i]; break;
      case '--help':
        console.log(`Usage: npx tsx scripts/import-masstimes-api.ts [options]
--all Query all regions globally
--region <name> Query specific region: ${Object.keys(REGIONS).join(', ')}
--skip-us Skip US grid points (already well-covered)
--dry-run No database writes
--resume-from <n> Skip first N grid points
--job-id <uuid> Background job tracking`);
        process.exit(0);
    }
  }

  if (!result.all && !result.region) {
    console.error('Error: specify --all or --region <name>');
    process.exit(1);
  }

  // Fail fast on a misspelled region instead of running an import over zero grid points.
  if (!result.all && result.region && !Object.prototype.hasOwnProperty.call(REGIONS, result.region)) {
    console.error(`Error: unknown region "${result.region}". Valid regions: ${Object.keys(REGIONS).join(', ')}`);
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
// ─── Grid Generation ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Build the list of query coordinates for the requested regions.
 *
 * Rows are LAT_SPACING degrees apart. Within a row the longitude step is
 * widened with latitude so that columns stay roughly TARGET_LNG_SPACING_KM
 * apart, but never closer than LAT_SPACING degrees. Coordinates are rounded to
 * one decimal and each rounded coordinate is emitted only once, even when
 * region boxes overlap.
 *
 * @param regions names of REGIONS entries; unknown names are reported and skipped
 * @param skipUs  when true, drop points with lat 24..50 and lng -125..-66
 * @returns grid points in generation order
 */
function generateGridPoints(regions: string[], skipUs: boolean): Array<{ lat: number; lng: number }> {
  const grid: Array<{ lat: number; lng: number }> = [];
  const emitted = new Set<string>();
  const toTenth = (deg: number): number => Math.round(deg * 10) / 10;

  for (const regionName of regions) {
    const boxes = REGIONS[regionName];
    if (!boxes) {
      console.error(`Unknown region: ${regionName}`);
      continue;
    }

    for (const [latMin, latMax, lngMin, lngMax] of boxes) {
      for (let lat = latMin; lat <= latMax; lat += LAT_SPACING) {
        // A degree of longitude shrinks with the cosine of the latitude.
        const kmPerDegreeLng = 111.32 * Math.cos((lat * Math.PI) / 180);
        let lngStep = LAT_SPACING;
        if (kmPerDegreeLng > 0) {
          lngStep = Math.max(LAT_SPACING, TARGET_LNG_SPACING_KM / kmPerDegreeLng);
        }

        for (let lng = lngMin; lng <= lngMax; lng += lngStep) {
          const gridLat = toTenth(lat);
          const gridLng = toTenth(lng);
          const key = `${gridLat},${gridLng}`;

          if (emitted.has(key)) continue;

          // Skip US continental bounding box if requested
          const insideUsBox = gridLat >= 24 && gridLat <= 50 && gridLng >= -125 && gridLng <= -66;
          if (skipUs && insideUsBox) continue;

          emitted.add(key);
          grid.push({ lat: gridLat, lng: gridLng });
        }
      }
    }
  }

  return grid;
}
|
||||
|
||||
// ─── API ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Wait for the given number of milliseconds.
 *
 * @param ms delay in milliseconds
 * @returns a promise that resolves once the timer fires
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Fetch one page of churches around a coordinate from the masstimes.org API.
 *
 * On HTTP 429 the call waits 30 seconds and tries again, at most `maxRetries`
 * times. The previous implementation recursed without a limit, so a persistent
 * 429 kept the importer waiting forever.
 *
 * @param lat        query latitude
 * @param lng        query longitude
 * @param page       page number passed as `pg`
 * @param maxRetries how many times a 429 response is retried before giving up
 * @returns the churches of that page (an empty array when there are none)
 * @throws Error on any other non-OK status, on a 429 after the retries are used
 *         up, or when the response body is not a JSON array
 */
async function fetchPage(lat: number, lng: number, page: number, maxRetries = 3): Promise<MasstimesChurch[]> {
  const url = `${API_BASE}?lat=${lat}&long=${lng}&pg=${page}`;

  for (let attempt = 0; ; attempt++) {
    const response = await fetch(url, {
      headers: {
        'Referer': REFERER,
        'User-Agent': USER_AGENT,
        'Accept': 'application/json',
      },
    });

    if (response.ok) {
      // Validate the shape instead of asserting it: callers read `.length` and spread the result.
      const payload: unknown = await response.json();
      if (!Array.isArray(payload)) {
        throw new Error(`Unexpected non-array response for ${url}`);
      }
      return payload as MasstimesChurch[];
    }

    if (response.status === 429 && attempt < maxRetries) {
      console.error(` Rate limited (429) — backing off 30s`);
      await sleep(30000);
      continue;
    }

    throw new Error(`HTTP ${response.status} for ${url}`);
  }
}
|
||||
|
||||
/**
 * Collect every page of results for one grid point.
 *
 * Pages are requested from 1 upwards until a page is empty or shorter than
 * PAGE_SIZE. Each request increments `stats.apiRequests`, and consecutive
 * requests are separated by RATE_LIMIT_MS.
 *
 * @param lat   query latitude
 * @param lng   query longitude
 * @param stats run counters; only `apiRequests` is modified here
 * @returns all churches returned for the point, in page order
 */
async function fetchAllForPoint(
  lat: number,
  lng: number,
  stats: ImportStats,
): Promise<MasstimesChurch[]> {
  const collected: MasstimesChurch[] = [];

  for (let page = 1; ; page++) {
    // Throttle before every page except the first.
    if (page > 1) {
      await sleep(RATE_LIMIT_MS);
    }

    stats.apiRequests++;
    const batch = await fetchPage(lat, lng, page);
    if (batch.length === 0) {
      break;
    }

    collected.push(...batch);

    // A short page is the last page.
    if (batch.length < PAGE_SIZE) {
      break;
    }
  }

  return collected;
}
|
||||
|
||||
// ─── Data Conversion ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Translate a country name into its code via COUNTRY_CODE_MAP.
 * The lookup ignores case and surrounding whitespace.
 *
 * @param countryName country name as delivered by the API
 * @returns the mapped code, or 'XX' when the name is empty or not in the map
 */
function resolveCountryCode(countryName: string): string {
  if (!countryName) {
    return 'XX';
  }
  const isoCode = COUNTRY_CODE_MAP[countryName.trim().toLowerCase()];
  return isoCode ? isoCode : 'XX';
}
|
||||
|
||||
/**
 * Lower-cased day label → day-of-week numbers (0 = Sunday … 6 = Saturday).
 * "weekdays" expands to Monday through Friday.
 */
const DAY_MAP: Record<string, number[]> = {
  sunday: [0],
  monday: [1],
  tuesday: [2],
  wednesday: [3],
  thursday: [4],
  friday: [5],
  saturday: [6],
  weekdays: [1, 2, 3, 4, 5],
};

/**
 * Convert masstimes worship-time entries into mass schedule rows.
 *
 * Only 'Weekend' and 'Week Days' services are kept. Entries without a start
 * time, with a start time of "00:00:00", or with a day label missing from
 * DAY_MAP produce no rows. An entry whose label maps to several days yields
 * one row per day.
 *
 * @param times worship-time entries of one church
 * @returns schedule rows with "HH:MM" times; `massType` is always null
 */
function parseWorshipTimes(times: MasstimesWorshipTime[]): Array<{
  dayOfWeek: number;
  time: string;
  language: string;
  notes: string | null;
  massType: string | null;
}> {
  const massServiceTypes = ['Weekend', 'Week Days'];

  return times.flatMap((entry) => {
    // Only import mass services (Weekend = Sun/Sat, Week Days = weekday masses)
    if (!massServiceTypes.includes(entry.service_typename)) return [];

    const rawStart = entry.time_start?.trim();
    if (!rawStart || rawStart === '00:00:00') return [];

    // "HH:MM:SS" → "HH:MM"
    const [hourPart, minutePart] = rawStart.split(':');
    const time = `${hourPart.padStart(2, '0')}:${minutePart || '00'}`;

    const language = entry.language?.trim() || 'Unknown';
    const notes = entry.comment?.trim() || null;

    const dayKey = entry.day_of_week?.trim().toLowerCase();
    const days = DAY_MAP[dayKey];
    if (!days) return [];

    return days.map((dayOfWeek) => ({ dayOfWeek, time, language, notes, massType: null }));
  });
}
|
||||
|
||||
// ─── Database ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Read every church row with the columns used for deduplication:
 * identity, coordinates, the per-source external IDs, and the contact fields
 * that a match may later fill in.
 *
 * @returns all existing churches, unfiltered
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing churches for deduplication...');

  const rows = await prisma.church.findMany({
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,

      // External IDs, one per upstream source
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,

      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });

  console.log(`Loaded ${rows.length} existing churches`);
  return rows;
}
|
||||
|
||||
/**
 * Copy the current counters onto the background job row.
 * A failed update is logged and otherwise ignored, so progress reporting never
 * interrupts the import.
 *
 * @param jobId id of the background job to update
 * @param stats current run counters
 */
async function updateJobProgress(jobId: string, stats: ImportStats): Promise<void> {
  const progress = {
    processed: stats.gridPoints,
    succeeded: stats.churchesMatched + stats.churchesCreated,
    failed: stats.errors,
    itemsFound: stats.churchesDiscovered,
  };

  try {
    await prisma.backgroundJob.update({ where: { id: jobId }, data: progress });
  } catch (err) {
    console.error(`Failed to update job progress:`, err);
  }
}
|
||||
|
||||
// ─── Main Import ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Entry point of the masstimes.org importer.
 *
 * Builds the coordinate grid for the selected regions, queries the API for
 * every grid point, and for each church not seen before in this run either
 * counts it as already known (same masstimes ID), links it to an existing row
 * found by findDuplicateChurch, or creates a new church with its mass
 * schedules. In dry-run mode nothing is written to the church tables and the
 * counters show what would have been created.
 *
 * An error at one grid point is counted and logged, and the run continues with
 * the next point. Connections are closed at the end of a successful run.
 */
async function main() {
  const args = parseArgs();

  // --all expands to every configured region; otherwise --region is required by parseArgs.
  const regionNames: string[] = args.all ? Object.keys(REGIONS) : [args.region!];
  const gridPoints = generateGridPoints(regionNames, args.skipUs);

  console.log(`\n${'='.repeat(70)}`);
  console.log('MASSTIMES.ORG API GLOBAL IMPORTER');
  console.log('='.repeat(70));
  console.log(`Regions: ${regionNames.join(', ')}`);
  console.log(`Grid points: ${gridPoints.length}`);
  console.log(`Skip US: ${args.skipUs ? 'YES' : 'NO'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES' : 'NO'}`);
  console.log(`Rate limit: ${RATE_LIMIT_MS}ms between requests`);
  console.log(`Resume from: ${args.resumeFrom || 'start'}`);
  const estHours = Math.round(gridPoints.length * 2 * RATE_LIMIT_MS / 1000 / 3600 * 10) / 10;
  console.log(`Est. time: ~${estHours} hours (est. 2 pages/point avg)`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70));

  const existingChurches = await loadExistingChurches();

  // masstimes IDs already stored in the database, for a fast first-pass dedup
  const masstimesIdSet = new Set<string>();
  existingChurches.forEach((known) => {
    if (known.masstimesId) masstimesIdSet.add(known.masstimesId);
  });

  // IDs already handled in this run; neighbouring grid points return the same churches
  const discoveredIds = new Set<string>();

  const stats: ImportStats = {
    gridPoints: 0,
    apiRequests: 0,
    churchesDiscovered: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  const jobId = args.jobId;
  if (jobId) {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: { status: 'running', startedAt: new Date(), totalItems: gridPoints.length },
    });
  }

  const startTime = Date.now();

  for (let i = 0; i < gridPoints.length; i++) {
    const { lat, lng } = gridPoints[i];
    stats.gridPoints++;

    // Points before the resume index are counted but not queried.
    if (i < args.resumeFrom) continue;

    try {
      const fetched = await fetchAllForPoint(lat, lng, stats);
      let newInPoint = 0;

      for (const remote of fetched) {
        if (discoveredIds.has(remote.id)) continue;
        discoveredIds.add(remote.id);
        stats.churchesDiscovered++;

        // Already in DB by masstimesId
        if (masstimesIdSet.has(remote.id)) {
          stats.churchesMatched++;
          continue;
        }

        // Records without usable coordinates (unparseable or exactly 0,0) are dropped.
        const churchLat = parseFloat(remote.latitude);
        const churchLng = parseFloat(remote.longitude);
        if (isNaN(churchLat) || isNaN(churchLng) || (churchLat === 0 && churchLng === 0)) continue;

        const country = resolveCountryCode(remote.church_address_country_territory_name);
        const addressParts = [
          remote.church_address_street_address,
          remote.church_address_city_name,
          remote.church_address_providence_name,
          remote.church_address_postal_code,
        ].filter((part) => part?.trim());
        const address = addressParts.join(', ').trim() || null;

        // Proximity + name match
        const match = findDuplicateChurch(
          { name: remote.name, lat: churchLat, lng: churchLng },
          existingChurches,
        );

        if (match) {
          stats.churchesMatched++;

          if (!args.dryRun) {
            // Link the masstimes ID and fill in only the fields the existing row lacks.
            const updateData: Record<string, unknown> = { masstimesId: remote.id };
            if (!match.phone && remote.phone_number?.trim()) {
              updateData.phone = remote.phone_number.trim();
            }
            if (!match.website && remote.url?.trim()) {
              updateData.website = remote.url.trim();
              updateData.hasWebsite = true;
            }
            if (!match.address && address) {
              updateData.address = address;
            }
            if (match.country === 'XX' && country !== 'XX') {
              updateData.country = country;
            }

            try {
              await prisma.church.update({ where: { id: match.id }, data: updateData });
              masstimesIdSet.add(remote.id);
            } catch (error) {
              const isUniqueViolation = error instanceof Error && error.message.includes('Unique constraint');
              if (!isUniqueViolation) throw error;
              stats.churchesSkipped++;
            }
          }
          continue;
        }

        // Dry run: only count what would have been created.
        if (args.dryRun) {
          stats.churchesCreated++;
          stats.massSchedulesCreated += parseWorshipTimes(remote.church_worship_times || []).length;
          newInPoint++;
          continue;
        }

        // Create new church
        const schedules = parseWorshipTimes(remote.church_worship_times || []);
        try {
          const newChurch = await prisma.church.create({
            data: {
              name: remote.name,
              latitude: churchLat,
              longitude: churchLng,
              address,
              city: remote.church_address_city_name?.trim() || null,
              state: remote.church_address_providence_name?.trim() || null,
              zip: remote.church_address_postal_code?.trim() || null,
              country,
              phone: remote.phone_number?.trim() || null,
              website: remote.url?.trim() || null,
              email: remote.email?.trim() || null,
              hasWebsite: !!remote.url?.trim(),
              masstimesId: remote.id,
              source: 'masstimes',
              diocese: remote.diocese_name?.trim() || null,
              pastorName: remote.pastors_name?.trim() || null,
              wheelchairAccess: remote.wheel_chair_access || false,
              massSchedules: schedules.length > 0 ? {
                create: schedules.map((s) => ({
                  dayOfWeek: s.dayOfWeek,
                  time: s.time,
                  language: s.language,
                  notes: s.notes,
                  massType: s.massType,
                  isActive: true,
                })),
              } : undefined,
            },
          });

          stats.churchesCreated++;
          stats.massSchedulesCreated += schedules.length;
          newInPoint++;
          masstimesIdSet.add(remote.id);

          // Make the new row visible to duplicate detection for the rest of this run.
          existingChurches.push({
            id: newChurch.id,
            name: remote.name,
            latitude: churchLat,
            longitude: churchLng,
            osmId: null,
            baiduId: null,
            masstimesId: remote.id,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: null,
            discovermassId: null,
            source: 'masstimes',
            website: remote.url?.trim() || null,
            phone: remote.phone_number?.trim() || null,
            address,
            country,
          });
        } catch (error) {
          if (error instanceof Error && error.message.includes('Unique constraint')) {
            stats.churchesSkipped++;
          } else {
            stats.errors++;
            console.error(` Error creating ${remote.name}: ${error instanceof Error ? error.message : error}`);
          }
        }
      }

      if (newInPoint > 0) {
        console.log(` Grid ${i + 1}/${gridPoints.length} (${lat},${lng}): ${fetched.length} found, ${newInPoint} new`);
      }

      await sleep(RATE_LIMIT_MS);
    } catch (error) {
      stats.errors++;
      console.error(` Error at grid ${i + 1} (${lat},${lng}): ${error instanceof Error ? error.message : error}`);
      // Wait twice as long after a failure.
      await sleep(RATE_LIMIT_MS * 2);
    }

    // Progress every 50 points
    if ((i + 1) % 50 === 0 || i === gridPoints.length - 1) {
      const elapsed = (Date.now() - startTime) / 1000;
      const rate = elapsed > 0 ? Math.round(stats.apiRequests / elapsed * 3600) : 0;
      console.log(` Progress: ${i + 1}/${gridPoints.length} grid points, ${stats.churchesDiscovered} discovered, ${stats.churchesCreated} new, ${stats.apiRequests} API calls [${Math.round(elapsed)}s, ~${rate}/hr]`);
    }

    // Job row is refreshed every 20 points
    if (jobId && (i + 1) % 20 === 0) {
      await updateJobProgress(jobId, stats);
    }
  }

  if (jobId) {
    await updateJobProgress(jobId, stats);
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: { status: 'completed', completedAt: new Date() },
    });
  }

  const elapsed = (Date.now() - startTime) / 1000;
  console.log(`\n${'='.repeat(70)}`);
  console.log('MASSTIMES API IMPORT SUMMARY');
  console.log('='.repeat(70));
  console.log(`Grid points queried: ${stats.gridPoints}`);
  console.log(`API requests: ${stats.apiRequests}`);
  console.log(`Churches discovered: ${stats.churchesDiscovered}`);
  console.log(`Churches matched: ${stats.churchesMatched} (already in DB)`);
  console.log(`Churches created: ${stats.churchesCreated}`);
  console.log(`Churches skipped: ${stats.churchesSkipped} (duplicates)`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Elapsed: ${Math.round(elapsed)}s (${(elapsed / 3600).toFixed(1)}h)`);
  console.log('='.repeat(70));

  await prisma.$disconnect();
  await pool.end();
}
|
||||
|
||||
// Script entry point: run the import and, if it rejects, report the failure
// and terminate with a non-zero exit status.
main().catch((error: unknown) => {
  // A rejection value is not guaranteed to be an Error instance, so guard the
  // `.message` access the same way the rest of this script does.
  console.error(`Fatal error: ${error instanceof Error ? error.message : error}`);
  process.exit(1);
});
|
||||
681
scripts/import-messesinfo.ts
Normal file
681
scripts/import-messesinfo.ts
Normal file
@@ -0,0 +1,681 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from messes.info (France)
|
||||
*
|
||||
* messes.info is the official French bishops' conference (CEF) mass schedule
|
||||
* database. It exposes a GWT-RPC API returning structured JSON with parish
|
||||
* data including name, address, coordinates, diocese, and celebration times.
|
||||
*
|
||||
* The API requires no authentication. We enumerate all French dioceses using
|
||||
* the "community:{diocese_code}" query prefix, which returns all parishes
|
||||
* within each diocese.
|
||||
*
|
||||
* Import strategy:
|
||||
* 1. Query each of ~93 diocese codes via GWT-RPC API
|
||||
* 2. Parse response: extract localities (churches) + celebrations (mass times)
|
||||
* 3. Deduce recurring weekly schedule from date-specific celebration entries
|
||||
* 4. Match against existing French churches via church-matcher
|
||||
* 5. Upsert churches and mass schedules
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-messesinfo.ts --all --dry-run
|
||||
* npx tsx scripts/import-messesinfo.ts --all
|
||||
* npx tsx scripts/import-messesinfo.ts --diocese pa --dry-run # Paris only
|
||||
* npx tsx scripts/import-messesinfo.ts --all --resume-from 20
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
|
||||
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
|
||||
const pool = new Pool({
|
||||
connectionString: dbUrl,
|
||||
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
|
||||
});
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Endpoint every diocese query is POSTed to. */
const API_URL = 'https://messes.info/gwtRequest';

/** Value of the User-Agent header sent with each API request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Pause, in milliseconds, inserted before every API request except the first. */
const REQUEST_DELAY_MS = 3000;

/** Pause, in milliseconds, before retrying a failed or throttled request. */
const RETRY_DELAY_MS = 10000;

/** Maximum number of attempts made for a single diocese query. */
const MAX_RETRIES = 3;

/** Page size passed to the API for each diocese query. */
const RESULTS_PER_QUERY = 2000;
|
||||
|
||||
// Diocese codes discovered from the API. Each code maps to a diocese in France.
|
||||
// The query "community:{code}" returns all parishes within that diocese.
|
||||
// Codes are short abbreviations, mostly two letters (e.g., pa=Paris, ly=Lyon, st=Strasbourg).
|
||||
/**
 * Diocese codes to enumerate, one row per initial letter.
 * Each code is sent to the API as the query "community:{code}".
 */
const DIOCESE_CODES: string[] = [
  /* a */ 'a', 'aa', 'ac', 'ad', 'ag', 'al', 'am', 'an', 'ar', 'au', 'av', 'ay',
  /* b */ 'ba', 'bb', 'be', 'bl', 'bm', 'bo', 'br', 'bs', 'bv', 'by',
  /* c */ 'ca', 'cb', 'cc', 'cd', 'ch', 'cl', 'cm', 'cn', 'cr', 'cs',
  /* d */ 'da', 'di', 'dj', 'dn',
  /* e */ 'et', 'ex', 'ey',
  /* f */ 'ft',
  /* g */ 'ga', 'gr',
  /* l */ 'lg', 'lh', 'li', 'lm', 'lp', 'lr', 'ls', 'lu', 'lv', 'ly',
  /* m */ 'ma', 'md', 'me', 'ml', 'mp', 'mt', 'mx',
  /* n */ 'na', 'nc', 'ni', 'nt', 'nv', 'ny',
  /* o */ 'or',
  /* p */ 'pa', 'pm', 'po', 'ps', 'pt',
  /* q */ 'qu',
  /* r */ 're', 'rn', 'ro', 'rv',
  /* s */ 'sl', 'ss', 'st', 'sz',
  /* t */ 'tl', 'to', 'ts', 'tu',
  /* v */ 'va', 'vd', 've', 'vl', 'vv',
];
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One church ("locality") extracted from a messes.info API response. */
interface LocalityData {
  /** Identifier the importer stores as the church's `messesInfoId`. */
  idfixe: string;
  name: string;
  address: string | null;
  city: string | null;
  zipcode: string | null;
  /** 0 when the API supplied no latitude. */
  latitude: number;
  /** 0 when the API supplied no longitude. */
  longitude: number;
  sector: string | null;
  communityId: string | null;
  /** The locality's `id` in the response, e.g. "75/paris-04/saint-louis-en-l-ile". */
  localityId: string;
}
|
||||
|
||||
/** One dated celebration (mass) entry attached to a locality. */
interface CelebrationData {
  /** Calendar date of the celebration, as delivered by the API. */
  date: string;
  /** Start time normalized to "HH:MM". */
  time: string;
  /** Recurrence marker from the API; the importer treats 1 as "weekly". */
  recurrenceCategory: number;
}
|
||||
|
||||
/** A weekly schedule slot deduced from dated celebrations. */
interface ParsedSchedule {
  /** Day index as returned by `Date#getUTCDay()` (0 = Sunday). */
  dayOfWeek: number;
  /** Start time in "HH:MM". */
  time: string;
}
|
||||
|
||||
/** Running counters accumulated over one importer run. */
interface ImportStats {
  /** Dioceses whose API response was parsed. */
  diocesesProcessed: number;
  /** Unique localities seen across all parsed responses. */
  localitiesFound: number;
  /** Localities matched to a church already in the database. */
  churchesMatched: number;
  /** Localities inserted as new churches (or that would be, in a dry run). */
  churchesCreated: number;
  /** Localities skipped (no coordinates, or a unique-constraint conflict). */
  churchesSkipped: number;
  /** Mass schedule rows written (or that would be, in a dry run). */
  schedulesCreated: number;
  /** Failures recorded while fetching or writing. */
  errors: number;
}
|
||||
|
||||
/** Command-line options accepted by the importer. */
interface CLIArgs {
  /** `--all`: import every known diocese. */
  all: boolean;
  /** `--dry-run`: report what would happen without writing to the database. */
  dryRun: boolean;
  /** `--resume-from <n>`: number of leading dioceses to skip. */
  resumeFrom?: number;
  /** `--diocese <code>`: import only this diocese. */
  diocese?: string;
  /** `--job-id <uuid>`: background job row to update with progress. */
  jobId?: string;
}
|
||||
|
||||
// ─── HTTP Client ─────────────────────────────────────────────────────────────
|
||||
|
||||
let requestCount = 0;
|
||||
|
||||
/** Returns a promise that resolves once `ms` milliseconds have elapsed. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Normalize a messes.info time string to zero-padded "HH:MM".
 *
 * Accepts "18h00" / "9h30" as well as "18:00" / "9:30". Any other input is
 * returned unchanged.
 */
function normalizeTime(messesTime: string): string {
  // The two accepted formats differ only in the separator ("h" or ":").
  const parsed = /^(\d{1,2})[h:](\d{2})$/.exec(messesTime);
  if (parsed === null) {
    return messesTime;
  }
  const [, hours, minutes] = parsed;
  return `${hours.padStart(2, '0')}:${minutes}`;
}
|
||||
|
||||
/**
 * Fetch the raw API response for one diocese.
 *
 * Every call after the first waits REQUEST_DELAY_MS before sending. The
 * request is attempted up to MAX_RETRIES times: HTTP 503/429 responses and
 * thrown errors (network or JSON decoding) are retried after RETRY_DELAY_MS,
 * while any other non-OK status gives up immediately.
 *
 * @param dioceseCode Code placed after the "community:" query prefix.
 * @returns The decoded JSON body, or null when no usable response was obtained.
 */
async function fetchDioceseData(dioceseCode: string): Promise<any | null> {
  // Throttle: space out consecutive requests.
  if (requestCount > 0) await delay(REQUEST_DELAY_MS);
  requestCount += 1;

  const payload = JSON.stringify({
    F: 'cef.kephas.shared.request.AppRequestFactory',
    I: [{
      O: 'Bzv0wi60qgwcW5aKiRKrtgNaLKo=',
      P: [`community:${dioceseCode}`, 0, RESULTS_PER_QUERY, 1, null, '48.86:2.35', ''],
      R: ['listCelebrationTime.locality'],
    }],
  });
  const retryDelaySeconds = RETRY_DELAY_MS / 1000;

  let attempt = 0;
  while (attempt < MAX_RETRIES) {
    attempt += 1;
    const hasRetriesLeft = attempt < MAX_RETRIES;

    try {
      const response = await fetch(API_URL, {
        method: 'POST',
        headers: {
          'User-Agent': USER_AGENT,
          'Content-Type': 'application/json',
          'Accept': 'application/json',
        },
        body: payload,
      });

      const throttled = response.status === 503 || response.status === 429;
      if (throttled && hasRetriesLeft) {
        console.log(` HTTP ${response.status} — retrying in ${retryDelaySeconds}s (attempt ${attempt}/${MAX_RETRIES})`);
        await delay(RETRY_DELAY_MS);
        continue;
      }
      if (throttled) {
        console.error(` HTTP ${response.status} after ${MAX_RETRIES} attempts`);
        return null;
      }

      if (!response.ok) {
        console.error(` HTTP ${response.status} from API`);
        return null;
      }

      return await response.json();
    } catch (error) {
      if (!hasRetriesLeft) {
        console.error(` API error after ${MAX_RETRIES} attempts: ${error instanceof Error ? error.message : error}`);
        return null;
      }
      console.log(` Network error — retrying in ${retryDelaySeconds}s (attempt ${attempt}/${MAX_RETRIES})`);
      await delay(RETRY_DELAY_MS);
    }
  }

  return null;
}
|
||||
|
||||
// ─── Response Parser ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Turn an API response into a map keyed by locality `idfixe`.
 *
 * The response's `O` array mixes several kinds of objects, each carrying its
 * fields under `P`:
 *   - localities: have `idfixe` and `name` (plus address, coordinates, ...)
 *   - celebrations: have `date`, `time` and `localityId`
 *   - anything else is ignored
 *
 * A celebration's `localityId` refers to a locality's `id`. Because the two
 * kinds are interleaved, localities are indexed in a first pass and
 * celebrations attached in a second.
 *
 * @param data Decoded API response (shape is not validated beyond `O`).
 * @returns Map of idfixe to the locality and its celebrations; empty when `O`
 *          is missing or not an array.
 */
function parseApiResponse(data: any): Map<string, { locality: LocalityData; celebrations: CelebrationData[] }> {
  const byIdfixe = new Map<string, { locality: LocalityData; celebrations: CelebrationData[] }>();

  const records: any[] = Array.isArray(data?.O) ? data.O : [];
  const payloads = records
    .map((record) => record.P)
    .filter((p) => p && typeof p === 'object');

  // Pass 1: index localities by their response-level id.
  const localitiesById = new Map<string, LocalityData>();
  for (const p of payloads) {
    if (!(p.idfixe && p.name)) continue;

    const locality: LocalityData = {
      idfixe: p.idfixe,
      name: p.name,
      address: p.address || null,
      city: p.city || null,
      zipcode: p.zipcode || null,
      latitude: p.latitude || 0,
      longitude: p.longitude || 0,
      sector: p.sector || null,
      communityId: p.communityId || null,
      localityId: p.id || '',
    };
    localitiesById.set(p.id, locality);

    // The first occurrence of an idfixe wins; later duplicates are not re-added.
    if (!byIdfixe.has(p.idfixe)) {
      byIdfixe.set(p.idfixe, { locality, celebrations: [] });
    }
  }

  // Pass 2: attach each celebration to the locality it references.
  for (const p of payloads) {
    if (!(p.date && p.time && p.localityId)) continue;

    const owner = localitiesById.get(p.localityId);
    const entry = owner ? byIdfixe.get(owner.idfixe) : undefined;
    if (!entry) continue;

    entry.celebrations.push({
      date: p.date,
      time: normalizeTime(p.time),
      recurrenceCategory: p.recurrenceCategory ?? 0,
    });
  }

  return byIdfixe;
}
|
||||
|
||||
// ─── Schedule Deduction ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Deduce a weekly schedule (unique day-of-week + time slots) from dated
 * celebration entries.
 *
 * Entries flagged as weekly (`recurrenceCategory === 1`) are preferred. Only
 * when they yield no slot at all is the schedule deduced from every entry.
 * Entries whose `date` cannot be parsed are ignored rather than producing a
 * slot with a NaN day of week.
 *
 * @param celebrations Dated entries with `time` already in "HH:MM".
 * @returns Slots in first-seen order, without duplicates.
 */
function deduceSchedules(celebrations: CelebrationData[]): ParsedSchedule[] {
  const collectSlots = (entries: CelebrationData[]): ParsedSchedule[] => {
    const seen = new Set<string>();
    const slots: ParsedSchedule[] = [];

    for (const { date, time } of entries) {
      // Noon UTC keeps the calendar day stable regardless of the host timezone.
      const dayOfWeek = new Date(`${date}T12:00:00Z`).getUTCDay();
      // An unparseable date gives an Invalid Date, whose getUTCDay() is NaN.
      if (Number.isNaN(dayOfWeek)) continue;

      const key = `${dayOfWeek}:${time}`;
      if (seen.has(key)) continue;
      seen.add(key);
      slots.push({ dayOfWeek, time });
    }

    return slots;
  };

  const weeklySlots = collectSlots(celebrations.filter((c) => c.recurrenceCategory === 1));
  return weeklySlots.length > 0 ? weeklySlots : collectSlots(celebrations);
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every church stored with country 'FR', selecting the fields needed for
 * duplicate matching (name, coordinates, per-source ids, contact details).
 */
async function loadExistingFrenchChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing French churches for deduplication...');

  const frenchChurches = await prisma.church.findMany({
    where: { country: 'FR' },
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // Per-source identifiers
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${frenchChurches.length} existing French churches`);
  return frenchChurches;
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Import every locality of one diocese.
 *
 * Fetches and parses the diocese response, then for each locality with
 * coordinates:
 *   - dry run: only counts what would be matched/created;
 *   - matched to an existing church: stores the messes.info id, fills in a
 *     missing address/coordinates, and replaces the church's mass schedules
 *     when any were deduced;
 *   - otherwise: creates the church, records it in `existingChurches` so later
 *     localities can match it, and inserts its schedules.
 *
 * A failure on one locality is counted and logged, and processing continues
 * with the next one.
 *
 * @param dioceseCode Diocese code to query.
 * @param existingChurches In-memory match list; appended to on creation.
 * @param dryRun When true, nothing is written to the database.
 * @param stats Counters updated in place.
 */
async function processDiocese(
  dioceseCode: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  const data = await fetchDioceseData(dioceseCode);
  if (!data) {
    stats.errors++;
    return;
  }

  // The response flags a failed invocation with S[0] === false.
  if (data.S && data.S[0] === false) {
    console.log(` API error for diocese ${dioceseCode}`);
    stats.errors++;
    return;
  }

  const localities = parseApiResponse(data);
  console.log(` Found ${localities.size} unique localities`);
  stats.localitiesFound += localities.size;
  stats.diocesesProcessed++;

  for (const [idfixe, { locality, celebrations }] of localities) {
    // (0, 0) means the API supplied no coordinates; such entries cannot be placed.
    if (locality.latitude === 0 && locality.longitude === 0) {
      stats.churchesSkipped++;
      continue;
    }

    const schedules = deduceSchedules(celebrations);

    const candidate = {
      name: locality.name,
      lat: locality.latitude,
      lng: locality.longitude,
      messesInfoId: idfixe,
    };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (dryRun) {
      if (duplicate) {
        stats.churchesMatched++;
      } else {
        stats.churchesCreated++;
      }
      stats.schedulesCreated += schedules.length;
      continue;
    }

    if (duplicate) {
      stats.churchesMatched++;

      const updateData: Record<string, unknown> = { messesInfoId: idfixe };
      if (!duplicate.address && locality.address) updateData.address = locality.address;
      if (duplicate.latitude === 0 && duplicate.longitude === 0 && locality.latitude !== 0) {
        updateData.latitude = locality.latitude;
        updateData.longitude = locality.longitude;
      }

      try {
        await prisma.church.update({
          where: { id: duplicate.id },
          data: updateData,
        });
      } catch (error) {
        if (error instanceof Error && error.message.includes('Unique constraint')) {
          // Another church already carries this messes.info id.
          stats.churchesSkipped++;
        } else {
          // Log and move on, as the create path does, instead of aborting the
          // remaining localities of this diocese.
          stats.errors++;
          console.error(` Error updating ${idfixe}: ${error instanceof Error ? error.message : error}`);
        }
        continue;
      }

      if (schedules.length > 0) {
        try {
          // Replace the church's schedules atomically.
          await prisma.$transaction(async (tx) => {
            await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.massSchedule.createMany({
              data: schedules.map((s) => ({
                churchId: duplicate.id,
                dayOfWeek: s.dayOfWeek,
                time: s.time,
                language: 'French',
              })),
            });
            await tx.church.update({
              where: { id: duplicate.id },
              data: { lastScrapedAt: new Date() },
            });
          });
          stats.schedulesCreated += schedules.length;
        } catch (error) {
          stats.errors++;
          console.error(` Error saving schedules for ${idfixe}: ${error instanceof Error ? error.message : error}`);
        }
      }
    } else {
      // Every locality is stored under 'FR', including the overseas
      // departments (zip codes 971-976).
      const country = 'FR';

      try {
        const newChurch = await prisma.church.create({
          data: {
            name: locality.name,
            latitude: locality.latitude,
            longitude: locality.longitude,
            address: locality.address,
            zip: locality.zipcode,
            city: locality.city,
            country,
            diocese: locality.sector || undefined,
            messesInfoId: idfixe,
            source: 'messes-info',
            websiteLanguage: 'fr',
          },
        });
        stats.churchesCreated++;

        // Make the new church visible to the matcher for the rest of the run.
        existingChurches.push({
          id: newChurch.id,
          name: locality.name,
          latitude: locality.latitude,
          longitude: locality.longitude,
          osmId: null,
          baiduId: null,
          masstimesId: null,
          orarimesseId: null,
          massSchedulesPhId: null,
          philmassId: null,
          horariosMisasId: null,
          mszeInfoId: null,
          weekdayMassesId: null,
          messesInfoId: idfixe,
          bohosluzbyId: null,
          miserendId: null,
          kerknetId: null,
          gottesdienstzeitenId: null,
          discovermassId: null,
          source: 'messes-info',
          website: null,
          phone: null,
          address: locality.address,
        });

        if (schedules.length > 0) {
          await prisma.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: newChurch.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'French',
            })),
          });
          await prisma.church.update({
            where: { id: newChurch.id },
            data: { lastScrapedAt: new Date() },
          });
          stats.schedulesCreated += schedules.length;
        }
      } catch (error) {
        if (error instanceof Error && error.message.includes('Unique constraint')) {
          stats.churchesSkipped++;
          continue;
        }
        stats.errors++;
        console.error(` Error creating ${idfixe}: ${error instanceof Error ? error.message : error}`);
      }
    }
  }
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the script's command-line flags from `process.argv`.
 *
 * Prints usage and exits 0 on `--help`/`-h`. Exits 1 when neither `--all` nor
 * `--diocese <code>` is given, or when `--resume-from` is not followed by a
 * non-negative integer (previously such a value became NaN and was silently
 * ignored, restarting the import from the first diocese).
 *
 * @returns The parsed options; optional fields are absent unless their flag was given.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        const skipCount = Number.parseInt(raw ?? '', 10);
        if (!Number.isInteger(skipCount) || skipCount < 0) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw ?? ''}"`);
          process.exit(1);
        }
        result.resumeFrom = skipCount;
        break;
      }
      case '--diocese':
        result.diocese = args[++i];
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-messesinfo.ts [options]

Options:
  --all              Import all dioceses
  --diocese <code>   Import a single diocese (e.g., pa for Paris)
  --dry-run          No database writes, just report what would happen
  --resume-from <n>  Skip first N dioceses
  --job-id <uuid>    Background job tracking ID
  --help, -h         Show this help message

Examples:
  npx tsx scripts/import-messesinfo.ts --diocese pa --dry-run
  npx tsx scripts/import-messesinfo.ts --all --dry-run
  npx tsx scripts/import-messesinfo.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.diocese) {
    console.error('Error: specify --all or --diocese <code>');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", dropping the
 * leading units that are zero. Fractions of a second are truncated.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);

  if (totalHours > 0) {
    return `${totalHours}h ${totalMinutes % 60}m ${totalSeconds % 60}s`;
  }
  if (totalMinutes > 0) {
    return `${totalMinutes}m ${totalSeconds % 60}s`;
  }
  return `${totalSeconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Importer entry point: parse CLI flags, print a banner, mark the background
 * job (if any) as running, load the existing French churches, process each
 * selected diocese in turn, print a summary, and record the final job status.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const rule = '='.repeat(70);

  const bannerLines = [
    '\n' + rule,
    'MESSES.INFO (FRANCE) IMPORTER',
    rule,
    `Mode: ${args.diocese ? `Diocese ${args.diocese}` : 'All dioceses'}`,
    `Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`,
    ...(args.resumeFrom ? [`Resume from: diocese index ${args.resumeFrom}`] : []),
    `Time: ${new Date().toISOString()}`,
    rule + '\n',
  ];
  for (const line of bannerLines) {
    console.log(line);
  }

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      /* Job might not exist */
    }
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    localitiesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  const existingChurches = await loadExistingFrenchChurches();

  // Either the single requested diocese or a copy of the full list.
  let dioceses = args.diocese ? [args.diocese] : [...DIOCESE_CODES];

  if (args.diocese && !DIOCESE_CODES.includes(args.diocese)) {
    console.log(`Warning: diocese "${args.diocese}" not in known list, trying anyway...`);
  }

  // --resume-from only applies when walking the full list.
  if (args.resumeFrom && !args.diocese) {
    dioceses = dioceses.slice(args.resumeFrom);
    console.log(`Resuming from diocese index ${args.resumeFrom} (${dioceses[0]})\n`);
  }

  console.log(`Processing ${dioceses.length} dioceses\n`);

  for (const [index, code] of dioceses.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${index + 1}/${dioceses.length}] Diocese "${code}" [${elapsed} elapsed]`);

    try {
      await processDiocese(code, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing diocese ${code}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  const summaryLines = [
    '\n' + rule,
    `IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`,
    rule,
    `Dioceses processed: ${stats.diocesesProcessed}`,
    `Localities found: ${stats.localitiesFound}`,
    ` Matched (existing): ${stats.churchesMatched}`,
    ` Created (new): ${stats.churchesCreated}`,
    ` Skipped: ${stats.churchesSkipped}`,
    `Schedules created: ${stats.schedulesCreated}`,
    `Errors: ${stats.errors}`,
    `Total time: ${formatDuration(totalTime)}`,
    `HTTP requests: ${requestCount}`,
    rule + '\n',
  ];
  for (const line of summaryLines) {
    console.log(line);
  }

  if (args.jobId) {
    const finalStatus = stats.errors > 0 ? 'completed_with_errors' : 'completed';
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: finalStatus,
          completedAt: new Date(),
          processed: stats.localitiesFound,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch {
      /* Ignore */
    }
  }
}
|
||||
|
||||
// Script entry point. On failure, report the error and flag a non-zero exit
// status; in every case, release the database connections afterwards.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit(1): exiting here would
    // end the process before the cleanup in .finally() gets a chance to run.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });
|
||||
579
scripts/import-miserend.ts
Normal file
579
scripts/import-miserend.ts
Normal file
@@ -0,0 +1,579 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from miserend.hu (Hungary)
|
||||
*
|
||||
* miserend.hu is the Hungarian Catholic mass schedule database, maintained by
|
||||
* the community with ~5,055 churches (mostly Hungary, some Romania/Slovakia).
|
||||
* It publishes a daily-updated SQLite database at:
|
||||
* https://miserend.hu/fajlok/sqlite/miserend_v4.sqlite3
|
||||
*
|
||||
* The SQLite contains:
|
||||
* - templomok: churches (tid, nev, lat, lng, varos, cim, orszag, megye)
|
||||
* - misek: date-specific mass entries (tid, ido, datumtol, datumig, nyelv)
|
||||
* - kepek: church photos
|
||||
*
|
||||
* Import strategy:
|
||||
* 1. Download the SQLite database
|
||||
* 2. Extract all churches with coordinates
|
||||
* 3. Deduce weekly recurring schedules from date-specific entries
|
||||
* 4. Match against existing churches via church-matcher
|
||||
* 5. Upsert churches and mass schedules
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-miserend.ts --all --dry-run
|
||||
* npx tsx scripts/import-miserend.ts --all
|
||||
* npx tsx scripts/import-miserend.ts --id 37 --dry-run # Single church
|
||||
* npx tsx scripts/import-miserend.ts --all --resume-from 500
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
import fs from 'fs';
|
||||
import { execFileSync } from 'child_process';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
|
||||
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
|
||||
const pool = new Pool({
|
||||
connectionString: dbUrl,
|
||||
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
|
||||
});
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Download location of the published miserend SQLite database. */
const SQLITE_URL = 'https://miserend.hu/fajlok/sqlite/miserend_v4.sqlite3';

/** Local file the database is downloaded to and queried from. */
const SQLITE_PATH = '/tmp/miserend_v4.sqlite3';
|
||||
|
||||
/**
 * Maps the country names found in the miserend data (written in various
 * languages) to ISO country codes.
 */
const COUNTRY_MAP: Record<string, string> = {
  // Hungary
  'Magyarország': 'HU',
  // Romania
  'România': 'RO',
  // Slovakia
  'Slovensko': 'SK',
  'Szlovákia': 'SK',
  // Serbia
  'Szerbia-Montenegro': 'RS',
  'Србија': 'RS',
  // Ukraine
  'Ukrajna': 'UA',
  'Україна': 'UA',
  // Western Europe
  'Österreich': 'AT',
  'Schweiz/Suisse/Svizzera/Svizra': 'CH',
  'België / Belgique / Belgien': 'BE',
  'Éire / Ireland': 'IE',
  // Russia
  'Россия': 'RU',
};
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One row of the `templomok` (churches) table. */
interface MiserendChurch {
  /** Church id; stored (as a string) as the church's `miserendId`. */
  tid: number;
  /** Church name, used as the candidate name for matching. */
  nev: string;
  /** NOTE(review): presumably a commonly known / alternative name — confirm. */
  ismertnev: string | null;
  /** Country name, translated to an ISO code via COUNTRY_MAP. */
  orszag: string | null;
  /** NOTE(review): presumably the county — confirm. */
  megye: string | null;
  /** NOTE(review): presumably the city — confirm. */
  varos: string | null;
  /** NOTE(review): presumably the street address — confirm. */
  cim: string | null;
  lat: number;
  lng: number;
}
|
||||
|
||||
/** One row of the `misek` (masses) table. */
interface MiserendMass {
  /** Mass entry id. */
  mid: number;
  /** Id of the church the entry belongs to. */
  tid: number;
  /** Start date in MMDD form (e.g. 104 = Jan 4). */
  datumtol: number;
  /** End date; NOTE(review): presumably also MMDD — confirm. */
  datumig: number;
  /** Time of day as "HH:MM:SS". */
  ido: string;
  /** NOTE(review): presumably the language of the mass — confirm. */
  nyelv: string | null;
}
|
||||
|
||||
/** A weekly schedule slot deduced from mass entries. */
interface ParsedSchedule {
  /** Day index as returned by `Date#getDay()` (0 = Sunday ... 6 = Saturday). */
  dayOfWeek: number;
  /** Start time in "HH:MM". */
  time: string;
}
|
||||
|
||||
/** Running counters accumulated over one importer run. */
interface ImportStats {
  churchesFetched: number;
  /** Churches matched to a record already in the database. */
  churchesMatched: number;
  /** Churches that had no match (counted as created, also in a dry run). */
  churchesCreated: number;
  churchesSkipped: number;
  schedulesCreated: number;
  errors: number;
}
|
||||
|
||||
/** Command-line options accepted by the importer. */
interface CLIArgs {
  /** Import every church. */
  all: boolean;
  /** Report what would happen without writing to the database. */
  dryRun: boolean;
  /** NOTE(review): presumably the number of leading churches to skip — confirm. */
  resumeFrom?: number;
  /** NOTE(review): presumably the single church selected with `--id` — confirm. */
  churchId?: string;
  /** Background job tracking id. */
  jobId?: string;
}
|
||||
|
||||
// ─── SQLite Helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run one SQL statement against the downloaded database through the `sqlite3`
 * command-line tool and return its trimmed standard output.
 *
 * Failures are best-effort: the function still returns an empty string, but
 * now logs the cause so that a missing `sqlite3` binary or a broken database
 * file is not mistaken for an empty table.
 *
 * @param query SQL text passed to `sqlite3` as a single argument.
 * @returns The command's output, or '' when the command failed.
 */
function sqliteQuery(query: string): string {
  try {
    return execFileSync('sqlite3', [SQLITE_PATH, query], {
      encoding: 'utf-8',
      maxBuffer: 100 * 1024 * 1024, // 100MB
    }).trim();
  } catch (error) {
    console.error(`sqlite3 query failed: ${error instanceof Error ? error.message : error}`);
    return '';
  }
}
|
||||
|
||||
/**
 * Download the miserend SQLite database to SQLITE_PATH using `curl`, then log
 * the size of the downloaded file.
 *
 * `--fail` makes curl exit non-zero on an HTTP error status, so an error page
 * is never saved in place of the database; `-S` keeps curl's error message
 * visible despite `-s`. A failed or timed-out download throws.
 */
function downloadSqlite(): void {
  console.log('Downloading miserend SQLite database...');
  execFileSync('curl', ['-fsSL', '-o', SQLITE_PATH, SQLITE_URL], { timeout: 120000 });

  const sizeInBytes = fs.statSync(SQLITE_PATH).size;
  console.log(`Downloaded ${(sizeInBytes / 1024 / 1024).toFixed(1)}MB`);
}
|
||||
|
||||
/**
 * Read every church with non-null, non-zero coordinates from the `templomok`
 * table.
 *
 * The query output is parsed as one row per line with '|' between columns.
 * Rows whose id or coordinates do not parse as numbers are dropped.
 */
function loadChurches(): MiserendChurch[] {
  const output = sqliteQuery(
    "SELECT tid, nev, ismertnev, orszag, megye, varos, cim, lat, lng FROM templomok WHERE lat IS NOT NULL AND lng IS NOT NULL AND lat != 0 AND lng != 0;"
  );

  const churches: MiserendChurch[] = [];
  if (!output) return churches;

  for (const row of output.split('\n')) {
    const [tid, nev, ismertnev, orszag, megye, varos, cim, lat, lng] = row.split('|');
    const church: MiserendChurch = {
      tid: parseInt(tid),
      nev: nev || '',
      ismertnev: ismertnev || null,
      orszag: orszag || null,
      megye: megye || null,
      varos: varos || null,
      cim: cim || null,
      lat: parseFloat(lat),
      lng: parseFloat(lng),
    };

    const numericFieldsValid = !isNaN(church.tid) && !isNaN(church.lat) && !isNaN(church.lng);
    if (numericFieldsValid) {
      churches.push(church);
    }
  }

  return churches;
}
|
||||
|
||||
/**
 * Read all `misek` rows belonging to one church.
 *
 * The query output is parsed as one row per line with '|' between columns.
 * Rows without a numeric id or without a time are dropped.
 *
 * @param tid Church id to look up.
 */
function loadMassesForChurch(tid: number): MiserendMass[] {
  const output = sqliteQuery(
    `SELECT mid, tid, datumtol, datumig, ido, nyelv FROM misek WHERE tid=${tid};`
  );

  const masses: MiserendMass[] = [];
  if (!output) return masses;

  for (const row of output.split('\n')) {
    const [mid, tidStr, datumtol, datumig, ido, nyelv] = row.split('|');
    const mass: MiserendMass = {
      mid: parseInt(mid),
      tid: parseInt(tidStr),
      datumtol: parseInt(datumtol),
      datumig: parseInt(datumig),
      ido: ido || '',
      nyelv: nyelv || null,
    };

    if (isNaN(mass.mid) || !mass.ido) continue;
    masses.push(mass);
  }

  return masses;
}
|
||||
|
||||
// ─── Schedule Deduction ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Deduce a weekly schedule (unique day-of-week + time slots) from mass entries.
 *
 * Each entry's `datumtol` is an MMDD number (e.g. 104 = Jan 4). It is placed
 * in the current year to obtain a day of week, and unique day+time
 * combinations are collected in first-seen order.
 *
 * Entries are skipped when the time is empty or "00:00", or when `datumtol`
 * does not decode to a month 1-12 and a day 1-31. This includes a NaN
 * `datumtol` (e.g. parsed from an empty column), which previously slipped
 * through the range check and produced a slot with a NaN day of week.
 *
 * @param masses Mass rows of a single church.
 * @returns Deduced slots without duplicates.
 */
function deduceSchedules(masses: MiserendMass[]): ParsedSchedule[] {
  // Use the current year for the MMDD -> weekday conversion.
  const year = new Date().getFullYear();
  const seen = new Set<string>();
  const schedules: ParsedSchedule[] = [];

  for (const { ido, datumtol } of masses) {
    const time = ido.substring(0, 5); // HH:MM from HH:MM:SS
    if (!time || time === '00:00') continue;

    const month = Math.floor(datumtol / 100);
    const day = datumtol % 100;
    // Written as a positive test so that NaN (for which every comparison is
    // false) is rejected as well.
    const isPlausibleDate = month >= 1 && month <= 12 && day >= 1 && day <= 31;
    if (!isPlausibleDate) continue;

    const dayOfWeek = new Date(year, month - 1, day).getDay(); // 0=Sun ... 6=Sat

    const key = `${dayOfWeek}:${time}`;
    if (seen.has(key)) continue;
    seen.add(key);
    schedules.push({ dayOfWeek, time });
  }

  return schedules;
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load the churches already stored for the given countries so that incoming
 * records can be matched against them.
 *
 * For every row the query selects the identity and coordinates, each
 * per-source external ID, and the contact fields.
 *
 * @param countryCodes Values matched against the `country` column.
 * @returns The matching church rows.
 */
async function loadExistingChurches(countryCodes: string[]): Promise<ExistingChurch[]> {
  const countryList = countryCodes.join(', ');
  console.log(`Loading existing churches for countries: ${countryList}...`);

  const rows = await prisma.church.findMany({
    where: { country: { in: countryCodes } },
    select: {
      // Identity and location
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External IDs, one per import source
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${rows.length} existing churches`);
  return rows;
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Import a single miserend church.
 *
 * The church is matched against `existingChurches` with `findDuplicateChurch`.
 * A match is updated in place (its `miserendId` is set and a missing address
 * is filled) and its mass schedules are replaced; otherwise a new church row
 * is created, appended to `existingChurches` for within-run deduplication,
 * and its schedules are inserted.
 *
 * Counters:
 *  - `churchesMatched` is incremented only once the update of the matched row
 *    succeeded (or, in a dry run, when a match was found);
 *  - `churchesSkipped` is incremented when a write is rejected by a unique
 *    constraint, so such a church is never counted as matched as well;
 *  - `errors` is incremented when saving schedules or creating a church fails
 *    for any other reason.
 *
 * @param church Source record; `tid`, `nev`, `lat`, `lng`, `cim`, `varos`,
 *   `megye` and `orszag` are read.
 * @param existingChurches In-memory list of known churches; mutated when a
 *   new church is created.
 * @param dryRun When true nothing is written and no masses are loaded; only
 *   the matched/created counters are updated.
 * @param stats Counters updated in place.
 * @throws Rethrows any non-unique-constraint error from updating a matched church.
 */
async function processChurch(
  church: MiserendChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Prisma reports unique-constraint violations with error code P2002; the
  // message check is kept as a fallback for errors that carry no code.
  const isUniqueViolation = (error: unknown): boolean => {
    if (!(error instanceof Error)) return false;
    const { code } = error as Error & { code?: unknown };
    return code === 'P2002' || error.message.includes('Unique constraint');
  };

  const miserendId = String(church.tid);
  const country = church.orszag ? (COUNTRY_MAP[church.orszag] || 'HU') : 'HU';

  const candidate = {
    name: church.nev,
    lat: church.lat,
    lng: church.lng,
    miserendId,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
    } else {
      stats.churchesCreated++;
    }
    return;
  }

  // Schedules are only needed when we are actually going to write.
  const schedules: ParsedSchedule[] = deduceSchedules(loadMassesForChurch(church.tid));

  if (duplicate) {
    const updateData: Record<string, unknown> = { miserendId };
    if (!duplicate.address && church.cim) updateData.address = church.cim;

    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      if (isUniqueViolation(error)) {
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }

    // Count the match only now that the row was really updated.
    stats.churchesMatched++;

    if (schedules.length > 0) {
      try {
        // Replace the schedules atomically so a failure keeps the old ones.
        await prisma.$transaction(async (tx) => {
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: duplicate.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'Hungarian',
            })),
          });
          await tx.church.update({
            where: { id: duplicate.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(` Error saving schedules for ${miserendId}: ${error instanceof Error ? error.message : error}`);
      }
    }
    return;
  }

  try {
    const newChurch = await prisma.church.create({
      data: {
        name: church.nev,
        latitude: church.lat,
        longitude: church.lng,
        address: church.cim,
        city: church.varos,
        state: church.megye,
        country,
        miserendId,
        source: 'miserend',
        websiteLanguage: 'hu',
      },
    });
    stats.churchesCreated++;

    // Make the new church visible to the matcher for the rest of this run.
    existingChurches.push({
      id: newChurch.id,
      name: church.nev,
      latitude: church.lat,
      longitude: church.lng,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: null,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId,
      kerknetId: null,
      gottesdienstzeitenId: null,
      discovermassId: null,
      source: 'miserend',
      website: null,
      phone: null,
      address: church.cim,
    });

    if (schedules.length > 0) {
      await prisma.massSchedule.createMany({
        data: schedules.map((s) => ({
          churchId: newChurch.id,
          dayOfWeek: s.dayOfWeek,
          time: s.time,
          language: 'Hungarian',
        })),
      });
      await prisma.church.update({
        where: { id: newChurch.id },
        data: { lastScrapedAt: new Date() },
      });
      stats.schedulesCreated += schedules.length;
    }
  } catch (error) {
    if (isUniqueViolation(error)) {
      stats.churchesSkipped++;
      return;
    }
    stats.errors++;
    console.error(` Error creating ${miserendId}: ${error instanceof Error ? error.message : error}`);
  }
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line arguments of the miserend importer.
 *
 * Recognised flags: `--all`, `--id <tid>`, `--dry-run`, `--resume-from <n>`,
 * `--job-id <uuid>` and `--help` / `-h`. Unrecognised arguments are ignored.
 *
 * The process exits with status 0 after printing the help text, and with
 * status 1 when neither `--all` nor `--id` is given or when `--resume-from`
 * is not followed by a non-negative integer (previously such a value became
 * NaN and the flag was silently ignored).
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        // Digits only: rejects a missing value, negatives and trailing junk.
        if (raw === undefined || !/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw ?? ''}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--id':
        result.churchId = args[++i];
        break;
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-miserend.ts [options]

Options:
--all Import all churches
--id <tid> Import a single church by miserend ID
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N churches
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-miserend.ts --id 37 --dry-run
npx tsx scripts/import-miserend.ts --all --dry-run
npx tsx scripts/import-miserend.ts --all
`);
        process.exit(0);
    }
  }

  if (!result.all && !result.churchId) {
    console.error('Error: specify --all or --id <miserend_tid>');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
/**
 * Render a millisecond duration as `Xh Ym Zs`, `Ym Zs` or `Zs`, using the
 * largest unit that is non-zero. Fractions of a second are dropped.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const hours = Math.floor(totalMinutes / 60);

  if (hours > 0) {
    return `${hours}h ${totalMinutes % 60}m ${totalSeconds % 60}s`;
  }
  return totalMinutes > 0
    ? `${totalMinutes}m ${totalSeconds % 60}s`
    : `${totalSeconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run the miserend import end to end.
 *
 * Prints a banner, marks the optional background job as running, downloads
 * the SQLite database and loads its churches, optionally narrows them to a
 * single ID or skips the first N, processes each church (counting failures
 * instead of aborting), prints a summary and finally records the outcome on
 * the background job. Failures to update the background job are ignored.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();
  const divider = '='.repeat(70);
  const modeLabel = args.churchId ? `Church ID ${args.churchId}` : 'All churches';
  const dryRunLabel = args.dryRun ? 'YES (no DB writes)' : 'NO';

  console.log('\n' + divider);
  console.log('MISEREND.HU (HUNGARY) IMPORTER');
  console.log(divider);
  console.log(`Mode: ${modeLabel}`);
  console.log(`Dry run: ${dryRunLabel}`);
  if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(divider + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch { /* Job might not exist */ }
  }

  const stats: ImportStats = {
    churchesFetched: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  // Fetch the SQLite dump, then read every church it contains.
  downloadSqlite();
  let churches = loadChurches();
  stats.churchesFetched = churches.length;
  console.log(`Found ${churches.length} churches with coordinates in SQLite\n`);

  if (args.churchId) {
    churches = churches.filter((c) => String(c.tid) === args.churchId);
    if (churches.length === 0) {
      console.error(`Church ID ${args.churchId} not found in SQLite database`);
      return;
    }
  }

  // Only existing churches from countries present in the data are needed.
  const countrySet = new Set(
    churches.map((c) => (c.orszag ? (COUNTRY_MAP[c.orszag] || 'HU') : 'HU')),
  );
  const existingChurches = await loadExistingChurches([...countrySet]);

  if (args.resumeFrom) {
    churches = churches.slice(args.resumeFrom);
    console.log(`Resuming from index ${args.resumeFrom} (${churches.length} remaining)\n`);
  }

  console.log(`Processing ${churches.length} churches\n`);

  for (const [index, church] of churches.entries()) {
    // Progress line every 200 churches.
    if (index % 200 === 0) {
      const elapsed = formatDuration(Date.now() - startTime);
      console.log(`[${index + 1}/${churches.length}] Processing ${church.nev} (${church.tid}) [${elapsed} elapsed]`);
    }

    try {
      await processChurch(church, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing church ${church.tid}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + divider);
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log(divider);
  console.log(`Churches in SQLite: ${stats.churchesFetched}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(divider + '\n');

  if (args.jobId) {
    const finalStatus = stats.errors > 0 ? 'completed_with_errors' : 'completed';
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: finalStatus,
          completedAt: new Date(),
          processed: stats.churchesFetched,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch { /* Ignore */ }
  }
}
|
||||
|
||||
// Script entry point: run the import, report a fatal failure through the
// exit code, and always release the Prisma client and the pg pool.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    // Set the exit code rather than calling process.exit() here:
    // process.exit() would terminate immediately and the cleanup in
    // .finally() below would never run.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });
|
||||
746
scripts/import-msze-info.ts
Normal file
746
scripts/import-msze-info.ts
Normal file
@@ -0,0 +1,746 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from msze.info (Poland)
|
||||
*
|
||||
* msze.info is a Polish directory of Catholic parishes with mass schedules.
|
||||
* The site uses numbered sitemaps (Churches1.xml through Churches11.xml)
|
||||
* with ~500 URLs each, containing both /kosciol/{id} (church pages) and
|
||||
* /msze-online/{slug} (livestream pages).
|
||||
*
|
||||
* Import strategy:
|
||||
* 1. Fetch all 11 sitemaps → extract /kosciol/{id} URLs (skip /msze-online/)
|
||||
* 2. For each church: fetch HTML, parse name/address/phone/website/schedule
|
||||
* 3. Extract coordinates from embedded tomtom_codeAddress() JS call
|
||||
* 4. Match against existing PL churches, upsert
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-msze-info.ts --all
|
||||
* npx tsx scripts/import-msze-info.ts --all --dry-run
|
||||
* npx tsx scripts/import-msze-info.ts --all --resume-from 500
|
||||
* npx tsx scripts/import-msze-info.ts --all --job-id {uuid}
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
// Load environment variables from .env.local first, then from .env.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

// Connection string from DATABASE_URL, falling back to a local database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password part of the URL masked.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
  connectionString: dbUrl,
  // SSL without certificate verification is enabled only when the URL contains "neon".
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client backed by the pg pool through the PrismaPg adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Base URL that sitemap and church page URLs are built from. */
const SITE_BASE = 'https://www.msze.info';
/** Number of numbered sitemaps (Churches1.xml ... Churches11.xml) to fetch. */
const SITEMAP_COUNT = 11;
/** User-Agent header sent with every request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
/** Default pause, in milliseconds, inserted before each request after the first. */
const REQUEST_DELAY_MS = 1500;
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Church details extracted from a single msze.info church page. */
interface ParsedChurch {
  /** Church name from the page's `<h1>`; empty string when no heading was found. */
  name: string;
  address: string | null;
  city: string | null;
  /** Polish postal code in XX-XXX form, taken from the address. */
  zip: string | null;
  phone: string | null;
  website: string | null;
  /** Always null from the page parser: the site does not expose e-mail reliably. */
  email: string | null;
  /** 0 when no usable coordinates were found on the page. */
  latitude: number;
  /** 0 when no usable coordinates were found on the page. */
  longitude: number;
}
|
||||
|
||||
/** One recurring weekly mass: a day of the week plus a start time. */
interface ParsedSchedule {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  time: string; // zero-padded 24h HH:MM, e.g. "05:00", "18:30"
}
|
||||
|
||||
/** Counters accumulated over one import run and updated in place. */
interface ImportStats {
  /** Church IDs for which processing was started. */
  churchesFound: number;
  /** Churches that matched an already stored church. */
  churchesMatched: number;
  /** Churches inserted as new rows (or that would be, in a dry run). */
  churchesCreated: number;
  /** Churches skipped: no name found, or a write hit a unique constraint. */
  churchesSkipped: number;
  /** Churches for which at least one mass schedule was handled. */
  schedulesProcessed: number;
  /** Total number of individual mass schedule entries. */
  massSchedulesCreated: number;
  /** Failed page fetches and failed schedule saves. */
  errors: number;
}
|
||||
|
||||
/** Command-line options accepted by this script. */
interface CLIArgs {
  /** `--all`: import all churches from the sitemaps (required). */
  all: boolean;
  /** `--dry-run`: report what would happen without writing to the database. */
  dryRun: boolean;
  /** `--resume-from <n>`: skip the first N churches. */
  resumeFrom?: number;
  /** `--job-id <uuid>`: background job whose status is updated during the run. */
  jobId?: string;
}
|
||||
|
||||
// ─── HTTP Client ─────────────────────────────────────────────────────────────
|
||||
|
||||
/** Requests issued so far; lets the first request go out without a delay. */
let requestCount = 0;

/**
 * Wait for the given time.
 *
 * @param ms Milliseconds to wait.
 * @returns A promise that resolves (with no value) once the time has elapsed.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Fetch a page as text, throttled and bounded by a timeout.
 *
 * Every call except the very first waits `delayMs` before sending the
 * request. The request, including reading the body, is aborted after
 * `timeoutMs` so that a stalled connection cannot hang the whole import.
 *
 * @param url Absolute URL to fetch.
 * @param delayMs Pause before the request, in milliseconds.
 * @param timeoutMs Maximum time allowed for the request, in milliseconds.
 * @returns The response body, or null on a non-2xx status, a network error
 *   or a timeout (each is logged to stderr).
 */
async function fetchPage(
  url: string,
  delayMs: number = REQUEST_DELAY_MS,
  timeoutMs: number = 30000,
): Promise<string | null> {
  if (requestCount > 0) {
    await delay(delayMs);
  }
  requestCount++;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      // Aborts the request if it has not completed within timeoutMs.
      signal: AbortSignal.timeout(timeoutMs),
    });

    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }

    return await response.text();
  } catch (error) {
    // Timeouts surface here as well, as an abort/timeout error.
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}
|
||||
|
||||
// ─── Sitemap Parser ──────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Collect the numeric church IDs listed in the site's numbered sitemaps.
 *
 * Sitemaps `Churches1.xml` through `Churches{SITEMAP_COUNT}.xml` are fetched
 * one after another; a sitemap that cannot be fetched is logged and skipped.
 * Only `/kosciol/{id}` entries are kept, each ID once.
 *
 * @returns The unique IDs, sorted numerically in ascending order.
 */
async function fetchChurchUrlsFromSitemaps(): Promise<string[]> {
  // Matches /kosciol/{id} entries only, which leaves out /msze-online/ pages.
  const churchLoc = /<loc>https?:\/\/(?:www\.)?msze\.info\/kosciol\/(\d+)<\/loc>/g;
  const uniqueIds = new Set<string>();

  for (let page = 1; page <= SITEMAP_COUNT; page++) {
    const sitemapUrl = `${SITE_BASE}/sitemap/Churches${page}.xml`;
    console.log(` Fetching ${sitemapUrl}...`);

    const xml = await fetchPage(sitemapUrl);
    if (!xml) {
      console.error(` Failed to fetch ${sitemapUrl}`);
      continue;
    }

    for (const hit of xml.matchAll(churchLoc)) {
      uniqueIds.add(hit[1]);
    }
  }

  // Numeric sort gives a deterministic processing order.
  const sortedIds = [...uniqueIds].sort((a, b) => parseInt(a) - parseInt(b));

  console.log(`Found ${sortedIds.length} unique church IDs from ${SITEMAP_COUNT} sitemaps`);
  return sortedIds;
}
|
||||
|
||||
// ─── HTML Parsers ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Extract church details from the HTML of a church page.
 *
 * - Name and city come from the `<h1>` ("Church Name, City"; the city is the
 *   part after the last comma).
 * - The address comes from the "Adres:" label, falling back to the first
 *   argument of the embedded `tomtom_codeAddress(...)` call. The postal code
 *   (XX-XXX) and, when the heading gave no city, the city are derived from it.
 * - Coordinates come from the last two arguments of
 *   `tomtom_codeAddress('addr', zoom, 'name', null, lat, lng)`. The address
 *   and name arguments may be quoted strings that themselves contain commas;
 *   these are matched as whole strings so such a call still yields its
 *   coordinates.
 * - Phone comes from the first `tel:` link, website from the link labelled
 *   "Witryna" or, failing that, a link whose text starts with "www.".
 *
 * @param html Raw HTML of the church page.
 * @returns The parsed fields. `name` is '' when no heading exists,
 *   `latitude`/`longitude` are 0 when no usable coordinates exist, and
 *   `email` is always null.
 */
function parseChurchPage(html: string): ParsedChurch {
  const stripTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, '');

  // Name: from <h1>Church Name, City</h1>
  let name = '';
  let city: string | null = null;
  const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  if (h1Match) {
    const raw = stripTags(h1Match[1]).trim();
    // City is the last comma-separated part of the heading.
    const lastComma = raw.lastIndexOf(',');
    if (lastComma > 0) {
      name = raw.substring(0, lastComma).trim();
      city = raw.substring(lastComma + 1).trim();
    } else {
      name = raw;
    }
  }

  // Address: <span class="highlight">Adres:</span> <strong>Street, City</strong>
  let address: string | null = null;
  const addressMatch = html.match(/Adres:<\/span>\s*(?:<strong>)?([\s\S]*?)(?:<\/strong>|<br|<\/p)/i);
  if (addressMatch) {
    address = stripTags(addressMatch[1]).replace(/\s+/g, ' ').trim() || null;
  }

  // Fallback address: first argument of tomtom_codeAddress().
  if (!address) {
    const tomtomAddrMatch = html.match(/tomtom_codeAddress\s*\(\s*'([^']+)'/);
    if (tomtomAddrMatch) {
      address = tomtomAddrMatch[1].trim() || null;
    }
  }

  let zip: string | null = null;
  if (address) {
    // Polish postal code (XX-XXX format)
    const zipMatch = address.match(/\b(\d{2}-\d{3})\b/);
    if (zipMatch) {
      zip = zipMatch[1];
    }

    // City from the address (text after the last comma) if the heading had none.
    if (!city) {
      const parts = address.split(',');
      if (parts.length > 1) {
        city = parts[parts.length - 1].replace(/\d{2}-\d{3}/, '').trim() || null;
      }
    }
  }

  // Coordinates: tomtom_codeAddress('addr', zoom, 'name', null, lat, lng).
  // An argument is either a complete quoted string (which may contain commas
  // and backslash escapes) or, as before, any run of non-comma characters.
  const arg = String.raw`(?:'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|[^,]+)`;
  const coordRegex = new RegExp(
    String.raw`tomtom_codeAddress\s*\(\s*${arg}\s*,\s*\d+\s*,\s*${arg}\s*,\s*(?:null|'[^']*')\s*,\s*(-?[\d.]+)\s*,\s*(-?[\d.]+)\s*\)`,
  );
  let latitude = 0;
  let longitude = 0;
  const coordMatch = coordRegex.exec(html);
  if (coordMatch) {
    const lat = parseFloat(coordMatch[1]);
    const lng = parseFloat(coordMatch[2]);
    // Treat unparsable or zero values as "no coordinates".
    if (!isNaN(lat) && !isNaN(lng) && lat !== 0 && lng !== 0) {
      latitude = lat;
      longitude = lng;
    }
  }

  // Phone: <a href="tel:...">
  let phone: string | null = null;
  const phoneMatch = html.match(/<a\s+href="tel:([^"]+)"/i);
  if (phoneMatch) {
    phone = phoneMatch[1].trim() || null;
  }

  // Website: external link whose text contains "Witryna" ...
  let website: string | null = null;
  const websiteMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>[^<]*Witryna/i);
  if (websiteMatch) {
    website = websiteMatch[1].trim() || null;
  }
  // ... or a link whose text looks like a URL (www.xxx).
  if (!website) {
    const wwwMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>www\.[^<]+<\/a>/i);
    if (wwwMatch) {
      website = wwwMatch[1].trim() || null;
    }
  }

  // Email: not reliably available (Cloudflare-protected)
  const email: string | null = null;

  return { name, address, city, zip, phone, website, email, latitude, longitude };
}
|
||||
|
||||
/**
 * Extract the weekly mass schedule from a church page.
 *
 * The page is split into sections, each an `<h2>`-`<h4>` heading plus the
 * markup up to the next heading, `<footer>`, `<script>` or the end. Only
 * sections whose heading text starts with "MSZE" and resolves to at least
 * one weekday are used; every time found in such a section is paired with
 * every day the heading covers.
 *
 * @param html Raw HTML of the church page.
 * @returns Unique day/time pairs in order of first appearance.
 */
function parseMassSchedule(html: string): ParsedSchedule[] {
  // e.g. <h2>MSZE NIEDZIELE I ŚWIĘTA - Church Name</h2> followed by "godz. ..."
  //      <h3>MSZE DNI POWSZEDNIE - Church Name</h3> followed by "godz. ..."
  const sectionPattern = /<h[2-4][^>]*>([\s\S]*?)<\/h[2-4]>([\s\S]*?)(?=<h[2-4]|<footer|<script|$)/gi;
  const emitted = new Set<string>();
  const result: ParsedSchedule[] = [];

  for (const [, rawHeading, body] of html.matchAll(sectionPattern)) {
    const heading = rawHeading.replace(/<[^>]+>/g, '').trim().toUpperCase();
    if (!heading.startsWith('MSZE')) continue;

    const days = resolvePolishDays(heading);
    if (days.length === 0) continue;

    const times = extractTimes(body);
    for (const dayOfWeek of days) {
      for (const time of times) {
        const key = `${dayOfWeek}:${time}`;
        if (emitted.has(key)) continue;
        emitted.add(key);
        result.push({ dayOfWeek, time });
      }
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Map an upper-cased Polish schedule heading to the weekdays it covers.
 *
 * Rules are tried in order and the first one with a matching fragment wins:
 * Sundays/holidays, then weekdays (Mon-Sat), then individual day names.
 *
 * @param heading Heading text, already upper-cased by the caller.
 * @returns Day numbers (0=Sun ... 6=Sat), or an empty array when no rule matches.
 */
function resolvePolishDays(heading: string): number[] {
  const rules: ReadonlyArray<readonly [fragments: readonly string[], days: readonly number[]]> = [
    [['NIEDZIEL'], [0]], // "NIEDZIELE I ŚWIĘTA" → Sunday
    [['DNI POWSZEDNIE', 'POWSZEDNI'], [1, 2, 3, 4, 5, 6]], // weekdays, Mon-Sat
    [['PONIEDZIA'], [1]], // poniedziałek
    [['WTOREK', 'WTORK'], [2]],
    [['ŚRODA', 'SRODA', 'ŚROD'], [3]],
    [['CZWARTEK', 'CZWART'], [4]],
    [['PIĄTEK', 'PIATEK', 'PIĄT'], [5]],
    [['SOBOT'], [6]],
  ];

  for (const [fragments, days] of rules) {
    if (fragments.some((fragment) => heading.includes(fragment))) {
      return [...days];
    }
  }
  return [];
}
|
||||
|
||||
/**
 * Extract mass times that follow a "godz." marker.
 *
 * Text before the first "godz." is ignored. After each marker, only the text
 * up to the next `<p`, `<div`, `<br>` or `<h2>`-`<h4>` tag is scanned for
 * `H:MM` / `HH:MM` values, e.g. "godz. 6:30, 8:00, 9:30". Values outside
 * 00:00-23:59 are dropped; duplicates are kept.
 *
 * @param text HTML fragment of one schedule section.
 * @returns Times as zero-padded `HH:MM`, in order of appearance.
 */
function extractTimes(text: string): string[] {
  const pad = (value: number): string => String(value).padStart(2, '0');

  // Everything before the first "godz." is discarded.
  const [, ...afterMarker] = text.split(/godz\.\s*/i);
  const found: string[] = [];

  for (const chunk of afterMarker) {
    // Keep only the text up to the next section break.
    const [listing] = chunk.split(/<(?:p|div|br\s*\/?>|h[2-4])/i);

    for (const [, hh, mm] of listing.matchAll(/(\d{1,2}):(\d{2})/g)) {
      const hours = parseInt(hh);
      const mins = parseInt(mm);
      if (hours < 0 || hours > 23 || mins < 0 || mins > 59) continue;
      found.push(`${pad(hours)}:${pad(mins)}`);
    }
  }

  return found;
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every church already stored with country `PL`, for deduplication.
 *
 * For every row the query selects the identity and coordinates, each
 * per-source external ID, and the contact fields.
 *
 * @returns The stored Polish church rows.
 */
async function loadExistingPolishChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Polish churches for deduplication...');

  const rows = await prisma.church.findMany({
    where: { country: 'PL' },
    select: {
      // Identity and location
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External IDs, one per import source
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${rows.length} existing Polish churches`);
  return rows;
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Import a single msze.info church by its site ID.
 *
 * The church page is fetched and parsed, then matched against
 * `existingChurches` with `findDuplicateChurch`. A match gets its
 * `mszeInfoId` set, its missing contact/location fields filled and its mass
 * schedules replaced; otherwise a new church row is created, appended to
 * `existingChurches` for within-run deduplication, and its schedules are
 * inserted.
 *
 * Counters:
 *  - `churchesFound` is incremented for every call;
 *  - `errors` is incremented when the page cannot be fetched or when saving
 *    the schedules of a matched church fails;
 *  - `churchesMatched` is incremented only once the update of the matched
 *    row succeeded (or, in a dry run, when a match was found);
 *  - `churchesSkipped` is incremented when no name is found or a write is
 *    rejected by a unique constraint, so such a church is never counted as
 *    matched as well.
 *
 * @param churchId msze.info church ID (the `{id}` in `/kosciol/{id}`).
 * @param existingChurches In-memory list of known churches; mutated when a
 *   new church is created.
 * @param dryRun When true nothing is written; outcomes are logged and counted only.
 * @param stats Counters updated in place.
 * @throws Rethrows any non-unique-constraint error from updating or creating a church.
 */
async function processChurch(
  churchId: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Prisma reports unique-constraint violations with error code P2002; the
  // message check is kept as a fallback for errors that carry no code.
  const isUniqueViolation = (error: unknown): boolean => {
    if (!(error instanceof Error)) return false;
    const { code } = error as Error & { code?: unknown };
    return code === 'P2002' || error.message.includes('Unique constraint');
  };

  stats.churchesFound++;

  const url = `${SITE_BASE}/kosciol/${churchId}`;
  const churchHtml = await fetchPage(url);
  if (!churchHtml) {
    stats.errors++;
    return;
  }

  const parsed = parseChurchPage(churchHtml);
  if (!parsed.name) {
    console.log(` Skipping ${churchId}: no name found`);
    stats.churchesSkipped++;
    return;
  }

  const schedules = parseMassSchedule(churchHtml);

  // Build candidate for dedup
  const candidate = {
    name: parsed.name,
    lat: parsed.latitude,
    lng: parsed.longitude,
    mszeInfoId: churchId,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
      console.log(` [MATCH] "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`);
    } else {
      stats.churchesCreated++;
      console.log(` [NEW] "${parsed.name}" (${parsed.city || 'unknown city'})`);
    }
    if (schedules.length > 0) {
      stats.schedulesProcessed++;
      stats.massSchedulesCreated += schedules.length;
    }
    return;
  }

  if (duplicate) {
    // Update existing church: always link the ID, fill only missing fields.
    const updateData: Record<string, unknown> = {
      mszeInfoId: churchId,
    };

    if (!duplicate.address && parsed.address) updateData.address = parsed.address;
    if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
    if (!duplicate.website && parsed.website) {
      updateData.website = parsed.website;
      updateData.hasWebsite = true;
    }

    // Update coordinates if existing has none and we have them
    if (duplicate.latitude === 0 && duplicate.longitude === 0 && parsed.latitude !== 0) {
      updateData.latitude = parsed.latitude;
      updateData.longitude = parsed.longitude;
    }

    // city/zip/email are not part of the in-memory record, so read them from the DB.
    const dbRecord = await prisma.church.findUnique({
      where: { id: duplicate.id },
      select: { city: true, zip: true, email: true },
    });
    if (dbRecord && !dbRecord.city && parsed.city) updateData.city = parsed.city;
    if (dbRecord && !dbRecord.zip && parsed.zip) updateData.zip = parsed.zip;
    if (dbRecord && !dbRecord.email && parsed.email) updateData.email = parsed.email;

    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      if (isUniqueViolation(error)) {
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }

    // Count the match only now that the row was really updated.
    stats.churchesMatched++;

    // Replace mass schedules atomically so a failure keeps the old ones.
    if (schedules.length > 0) {
      try {
        await prisma.$transaction(async (tx) => {
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: duplicate.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'Polish',
            })),
          });
          await tx.church.update({
            where: { id: duplicate.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesProcessed++;
        stats.massSchedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(` Error saving schedules for ${churchId}: ${error instanceof Error ? error.message : error}`);
      }
    }
    return;
  }

  // Create new church
  try {
    const newChurch = await prisma.church.create({
      data: {
        name: parsed.name,
        latitude: parsed.latitude,
        longitude: parsed.longitude,
        address: parsed.address,
        zip: parsed.zip,
        city: parsed.city,
        country: 'PL',
        phone: parsed.phone,
        website: parsed.website,
        email: parsed.email,
        hasWebsite: !!parsed.website,
        mszeInfoId: churchId,
        source: 'msze-info',
      },
    });
    stats.churchesCreated++;

    // Add to in-memory array for within-run dedup
    existingChurches.push({
      id: newChurch.id,
      name: parsed.name,
      latitude: parsed.latitude,
      longitude: parsed.longitude,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: churchId,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId: null,
      kerknetId: null,
      gottesdienstzeitenId: null,
      discovermassId: null,
      source: 'msze-info',
      website: parsed.website,
      phone: parsed.phone,
      address: parsed.address,
    });

    // Create mass schedules
    if (schedules.length > 0) {
      await prisma.massSchedule.createMany({
        data: schedules.map((s) => ({
          churchId: newChurch.id,
          dayOfWeek: s.dayOfWeek,
          time: s.time,
          language: 'Polish',
        })),
      });
      await prisma.church.update({
        where: { id: newChurch.id },
        data: { lastScrapedAt: new Date() },
      });
      stats.schedulesProcessed++;
      stats.massSchedulesCreated += schedules.length;
    }
  } catch (error) {
    if (isUniqueViolation(error)) {
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line arguments of the msze.info importer.
 *
 * Recognised flags: `--all`, `--dry-run`, `--resume-from <n>`,
 * `--job-id <uuid>` and `--help` / `-h`. Unrecognised arguments are ignored.
 *
 * The process exits with status 0 after printing the help text, and with
 * status 1 when `--all` is missing or when `--resume-from` is not followed
 * by a non-negative integer (previously such a value became NaN and the flag
 * was silently ignored).
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        // Digits only: rejects a missing value, negatives and trailing junk.
        if (raw === undefined || !/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw ?? ''}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = args[++i];
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-msze-info.ts [options]

Options:
--all Import all churches from sitemaps
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N churches
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-msze-info.ts --all --dry-run
npx tsx scripts/import-msze-info.ts --all
npx tsx scripts/import-msze-info.ts --all --resume-from 500
`);
        process.exit(0);
    }
  }

  if (!result.all) {
    console.error('Error: specify --all');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Render a millisecond duration as `Xh Ym Zs`, `Ym Zs` or `Zs`, using the
 * largest unit that is non-zero. Fractions of a second are dropped.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const secondsPart = `${totalSeconds % 60}s`;

  if (totalHours > 0) {
    return `${totalHours}h ${totalMinutes % 60}m ${secondsPart}`;
  }
  return totalMinutes > 0 ? `${totalMinutes}m ${secondsPart}` : `${totalSeconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run the msze.info import end to end.
 *
 * Steps: parse CLI flags, print a banner, mark the background job (if any) as
 * running, load the existing Polish churches for deduplication, fetch the
 * church IDs from the sitemaps, process every church (a failure is counted in
 * `stats.errors` and does not stop the run), print a summary and finally store
 * the outcome on the background job.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('MSZE.INFO (POLAND) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: All churches from sitemaps`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      // Job might not exist yet
    }
  }

  const stats: ImportStats = {
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // Load existing Polish churches for dedup
  const existingChurches = await loadExistingPolishChurches();

  // Fetch church IDs from sitemaps
  console.log('Fetching church URLs from sitemaps...');
  let churchIds = await fetchChurchUrlsFromSitemaps();

  // Handle --resume-from. The number of entries actually skipped is kept so
  // the progress counter below shows absolute positions; that way the number
  // printed for a failing church can be passed straight back to --resume-from.
  const totalChurches = churchIds.length;
  let skippedCount = 0;
  if (args.resumeFrom) {
    churchIds = churchIds.slice(args.resumeFrom);
    skippedCount = totalChurches - churchIds.length;
    console.log(`Resuming from index ${args.resumeFrom} (skipping ${skippedCount} churches)\n`);
  } else {
    console.log(`Processing ${churchIds.length} churches\n`);
  }

  // Process each church
  for (let i = 0; i < churchIds.length; i++) {
    const id = churchIds[i];
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${skippedCount + i + 1}/${totalChurches}] kosciol/${id} [${elapsed} elapsed]`);

    try {
      await processChurch(id, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing ${id}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      // Still best-effort, but leave a trace: otherwise the job stays in
      // "running" with no hint as to why.
      console.warn(
        `Warning: could not update background job ${args.jobId}: ${error instanceof Error ? error.message : error}`,
      );
    }
  }
}
|
||||
|
||||
// Script entry point. A fatal error is reported through process.exitCode
// instead of process.exit(): exit() would end the process immediately and the
// cleanup in finally() (Prisma disconnect, pool shutdown) would never run.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    try {
      await prisma.$disconnect();
    } finally {
      // Close the pg pool even if the Prisma disconnect fails.
      await pool.end();
    }
  });
|
||||
771
scripts/import-orarimesse.ts
Normal file
771
scripts/import-orarimesse.ts
Normal file
@@ -0,0 +1,771 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from OrariMesse.it
|
||||
*
|
||||
* OrariMesse.it is the official CEI (Italian Bishops' Conference) platform for
|
||||
* mass times in Italy. It provides a public REST API organized by diocese.
|
||||
*
|
||||
* Import strategy:
|
||||
* Pass 1: For each diocese, fetch all churches → match against existing DB
|
||||
* records (by ICSC code or proximity+name) → upsert
|
||||
* Pass 2: For churches with active schedules, fetch detail endpoint →
|
||||
* convert 8-day rolling schedule to recurring → replace mass schedules
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-orarimesse.ts --all
|
||||
* npx tsx scripts/import-orarimesse.ts --diocese roma
|
||||
* npx tsx scripts/import-orarimesse.ts --all --dry-run
|
||||
* npx tsx scripts/import-orarimesse.ts --all --schedules-only
|
||||
* npx tsx scripts/import-orarimesse.ts --all --resume-from napoli
|
||||
* npx tsx scripts/import-orarimesse.ts --all --job-id {uuid}
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Connection string: DATABASE_URL from the environment, otherwise a local
// default database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Log the target with the password portion replaced by "***".
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);

// URLs containing "neon" get an SSL config that skips certificate
// verification; every other URL gets no explicit SSL config.
const poolSsl = dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined;
const pool = new Pool({
  connectionString: dbUrl,
  ssl: poolSsl,
});

// Script-local Prisma client running on top of the pg pool above.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Base URL of the OrariMesse.it API; endpoint names are appended to it. */
const API_BASE = 'https://orarimesse.it/api';

/** Value of the User-Agent header sent with every API request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Pause in milliseconds used for the diocese list and per-diocese church list requests. */
const DIOCESE_DELAY_MS = 2000;

/** Pause in milliseconds used for church detail requests. */
const DETAIL_DELAY_MS = 1000;

// ─── Italian Day Map ─────────────────────────────────────────────────────────

/**
 * Lower-case Italian weekday name → day-of-week number (0 = domenica …
 * 6 = sabato). Accented names are listed with and without the grave accent.
 */
const ITALIAN_DAY_MAP: Record<string, number> = {
  'domenica': 0,
  'lunedì': 1,
  'lunedi': 1,
  'martedì': 2,
  'martedi': 2,
  'mercoledì': 3,
  'mercoledi': 3,
  'giovedì': 4,
  'giovedi': 4,
  'venerdì': 5,
  'venerdi': 5,
  'sabato': 6,
};
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * One diocese as returned by the `getDiocesi` endpoint.
 *
 * This script reads `slug` (passed as the `diocesi` request parameter and
 * compared with `--diocese` / `--resume-from`) and `title` (progress log).
 * The remaining fields are declared but not read here.
 */
interface OrariMesseDiocese {
  codice_cei: string;
  /** Display name printed in the progress log. */
  title: string;
  /** Identifier used to request the diocese's church list. */
  slug: string;
  url: string;
  countChiese: number;
}
|
||||
|
||||
/**
 * One church entry of a diocese's `listaChiese` array.
 *
 * Field usage in this script is noted per field; fields without a note are
 * declared but not read here.
 */
interface OrariMesseChurch {
  /** API-side church id; key of the idchurch → DB id map and of detail requests. */
  idchurch: number;
  /** Copied to the DB `address` column when present. */
  address: string;
  /** Church name used for matching and for new records. */
  name: string;
  conosciutaCome: string;
  isopen: boolean;
  /** Truthy when the church is treated as having active masses (Pass 2 filter). */
  nextmass: string;
  /** Latitude as a string; parsed with parseFloat. */
  lat: string;
  /** Longitude as a string; parsed with parseFloat. */
  lon: string;
  /** Copied to the DB `website` column when present. */
  sito: string;
  /** Copied to the DB `email` column when present. */
  emailLdc: string;
  /** ICSC code; stored as `orarimesseId` and used for matching. */
  icsc: string;
  /** Copied to the DB `city` column when present. */
  comune: string;
  tipologia: string;
  /** Copied to `wheelchairAccess` on newly created churches. */
  accessibile: boolean;
}
|
||||
|
||||
/**
 * Element of the array returned by the `getListaChiese` endpoint: diocese
 * header fields plus the churches of that diocese in `listaChiese`.
 */
interface OrariMesseDioceseResponse {
  codice_cei: string;
  title: string;
  slug: string;
  countChiese: number;
  /** Churches of the diocese; iterated by both import passes. */
  listaChiese: OrariMesseChurch[];
}
|
||||
|
||||
/** A single mass inside one day of a church detail response. */
interface OrariMesseMass {
  idmass: number;
  /** Time written with a dot, e.g. "07.00"; converted by convertTime. */
  time: string;
  /** Free-text note; stored as the schedule's notes, an empty value becomes null. */
  noteOrarioMessa: string;
}
|
||||
|
||||
/** One calendar day of a church detail response together with its masses. */
interface OrariMesseDay {
  /** Day label such as "Giovedì 26 Febbraio"; its first word is the weekday. */
  day: string;
  /** Masses celebrated on that day. */
  mass: OrariMesseMass[];
}
|
||||
|
||||
/**
 * Church detail as returned by the `getDettaglioMessa` endpoint. This script
 * only reads `days`; the other fields are declared but not read here.
 */
interface OrariMesseDetail {
  idchurch: number;
  name: string;
  address: string;
  lat: string;
  lon: string;
  icsc: string;
  comune: string;
  diocesi: string;
  parroco: string;
  telefono: string;
  email: string;
  sito: string;
  /** Rolling window of days with their masses, converted into a weekly schedule. */
  days: OrariMesseDay[];
}
|
||||
|
||||
/** Counters accumulated over one importer run and printed in the summary. */
interface ImportStats {
  /** Dioceses whose processing finished, including those without churches. */
  diocesesProcessed: number;
  /** Churches seen in the API church lists. */
  churchesFound: number;
  /** Churches matched to an existing DB record. */
  churchesMatched: number;
  /** Churches created as new DB records (or that would be, in a dry run). */
  churchesCreated: number;
  /** Churches skipped (unusable coordinates or unique-constraint conflict). */
  churchesSkipped: number;
  /** Churches for which a detail response with days was received. */
  schedulesProcessed: number;
  /** Recurring mass times written (or that would be, in a dry run). */
  massSchedulesCreated: number;
  /** Failures that were counted and logged without aborting the run. */
  errors: number;
}
|
||||
|
||||
/** Options produced by parseArgs from the command line. */
interface CLIArgs {
  /** `--all`: process every diocese. */
  all: boolean;
  /** `--diocese <slug>`: process a single diocese. */
  diocese?: string;
  /** `--dry-run`: report only, no database writes. */
  dryRun: boolean;
  /** `--schedules-only`: skip the church upsert pass. */
  schedulesOnly: boolean;
  /** `--resume-from <slug>`: start at this diocese. */
  resumeFrom?: string;
  /** `--job-id <uuid>`: background job whose status is updated. */
  jobId?: string;
}
|
||||
|
||||
// ─── API Client ──────────────────────────────────────────────────────────────
|
||||
|
||||
/** Number of API requests started so far; also printed in the run summary. */
let requestCount = 0;

/**
 * Wait for the given number of milliseconds.
 *
 * @param ms Time to wait in milliseconds.
 * @returns A promise that resolves once the timer has fired.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Call one OrariMesse.it API endpoint and unwrap its response envelope.
 *
 * Every call except the very first one waits `delayMs` before the request is
 * sent. Failures (non-2xx status, an envelope that is not
 * `status === true` / `code === 'OK'`, or a thrown fetch/parse error) are
 * logged and reported as `null`; nothing is thrown for them.
 *
 * @param endpoint Endpoint name appended to the API base URL.
 * @param params Query-string parameters.
 * @param delayMs Pause in milliseconds before the request.
 * @returns The envelope's `data` value, or `null` on any failure.
 */
async function fetchApi<T>(endpoint: string, params: Record<string, string> = {}, delayMs: number): Promise<T | null> {
  const mustWait = requestCount > 0;
  if (mustWait) {
    await delay(delayMs);
  }
  requestCount += 1;

  const url = new URL(`${API_BASE}/${endpoint}`);
  Object.entries(params).forEach(([name, value]) => {
    url.searchParams.set(name, value);
  });

  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'application/json',
  };

  try {
    const response = await fetch(url.toString(), { headers });
    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }

    const envelope = await response.json() as { status: boolean; code: string; data: T };
    const succeeded = envelope.status === true && envelope.code === 'OK';
    if (!succeeded) {
      console.error(` API error for ${url}: ${JSON.stringify(envelope).substring(0, 200)}`);
      return null;
    }
    return envelope.data;
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Fetch the diocese list from the `getDiocesi` endpoint.
 *
 * @returns The dioceses, or an empty array when the request failed.
 */
async function fetchDioceses(): Promise<OrariMesseDiocese[]> {
  const dioceses = await fetchApi<OrariMesseDiocese[]>('getDiocesi', {}, DIOCESE_DELAY_MS);
  return dioceses ? dioceses : [];
}
|
||||
|
||||
/**
 * Fetch the compact church list of one diocese from `getListaChiese`.
 *
 * @param slug Diocese slug sent as the `diocesi` parameter.
 * @returns The first element of the response array, or `null` when the
 *   request failed or the array is empty.
 */
async function fetchChurchesInDiocese(slug: string): Promise<OrariMesseDioceseResponse | null> {
  const query = { diocesi: slug, type: 'compact' };
  const entries = await fetchApi<OrariMesseDioceseResponse[]>('getListaChiese', query, DIOCESE_DELAY_MS);

  // Response is an array with a single diocese object
  if (!entries || !(entries.length > 0)) {
    return null;
  }
  return entries[0];
}
|
||||
|
||||
/**
 * Fetch the detail record (including the schedule days) of one church from
 * the `getDettaglioMessa` endpoint.
 *
 * @param idchurch API-side church id.
 * @returns The detail record, or `null` when the request failed.
 */
async function fetchChurchDetail(idchurch: number): Promise<OrariMesseDetail | null> {
  const query = { idchurch: String(idchurch) };
  return fetchApi<OrariMesseDetail>('getDettaglioMessa', query, DETAIL_DELAY_MS);
}
|
||||
|
||||
// ─── Day/Time Conversion ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Extract the weekday number from an API day label.
 *
 * The first whitespace-separated word of the label (e.g. "Giovedì" in
 * "Giovedì 26 Febbraio") is lower-cased and looked up in ITALIAN_DAY_MAP.
 * Surrounding whitespace is ignored, and the word is normalised to NFC so
 * that an accented letter sent as base letter + combining accent still
 * matches the map keys.
 *
 * @param dayString Day label from the API.
 * @returns The weekday number from ITALIAN_DAY_MAP, or `null` when the first
 *   word is not a known weekday name.
 */
function parseItalianDay(dayString: string): number | null {
  const [firstWord = ''] = dayString.trim().split(/\s+/);
  const key = firstWord.normalize('NFC').toLowerCase();

  // Own-property check: a plain-object lookup would otherwise return
  // inherited members (e.g. "constructor") instead of null.
  const known = Object.prototype.hasOwnProperty.call(ITALIAN_DAY_MAP, key);
  const dayOfWeek = known ? ITALIAN_DAY_MAP[key] : undefined;
  return dayOfWeek ?? null;
}
|
||||
|
||||
/**
 * Turn an API time such as "07.00" into "07:00" by replacing the first dot
 * with a colon. A value without a dot is returned unchanged.
 *
 * @param time Time string from the API.
 * @returns The time with its first dot replaced by a colon.
 */
function convertTime(time: string): string {
  const dotIndex = time.indexOf('.');
  if (dotIndex < 0) {
    return time;
  }
  return `${time.slice(0, dotIndex)}:${time.slice(dotIndex + 1)}`;
}
|
||||
|
||||
/** One entry of the weekly schedule derived from the API's day list. */
interface RecurringMass {
  /** Weekday number as defined by ITALIAN_DAY_MAP. */
  dayOfWeek: number;
  /** Time as produced by convertTime, e.g. "07:00". */
  time: string;
  /** Note attached to the mass, or null when the API sent none. */
  notes: string | null;
}
|
||||
|
||||
/**
 * Collapse the API's rolling list of days into a weekly recurring schedule.
 *
 * The same weekday can appear twice in the window (e.g. Thursday this week
 * and Thursday next week), so entries are de-duplicated by weekday + time;
 * the first occurrence wins, including its note. Days whose label is not a
 * known weekday are skipped. Days without a `mass` array and masses without a
 * string `time` are skipped too, so a single malformed entry in the
 * unvalidated API payload does not throw and abort the caller's processing.
 *
 * @param days Day entries from a church detail response.
 * @returns Unique (weekday, time) entries in order of first appearance.
 */
function convertScheduleToRecurring(days: OrariMesseDay[]): RecurringMass[] {
  const seen = new Set<string>();
  const result: RecurringMass[] = [];

  for (const day of days) {
    const dayOfWeek = parseItalianDay(day.day);
    if (dayOfWeek === null) continue;

    // The payload is only cast to its type, never validated.
    const masses = Array.isArray(day.mass) ? day.mass : [];
    for (const mass of masses) {
      if (typeof mass?.time !== 'string') continue;

      const time = convertTime(mass.time);
      const key = `${dayOfWeek}:${time}`;
      if (seen.has(key)) continue;
      seen.add(key);

      result.push({
        dayOfWeek,
        time,
        notes: mass.noteOrarioMessa || null,
      });
    }
  }

  return result;
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every church with country `IT` from the database, selecting the
 * fields the duplicate matcher works with (name, coordinates, the per-source
 * external ids, source and contact fields).
 *
 * @returns The selected church rows.
 */
async function loadExistingItalianChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Italian churches for deduplication...');

  const italianChurches = await prisma.church.findMany({
    where: { country: 'IT' },
    select: {
      // Identity and location
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External ids, one per import source
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${italianChurches.length} existing Italian churches`);
  return italianChurches;
}
|
||||
|
||||
// ─── Pass 1: Church Upsert ──────────────────────────────────────────────────
|
||||
|
||||
/**
 * Pass 1: match every church of one diocese against the known churches and
 * update the matching record or create a new one.
 *
 * Churches with unparsable or zero coordinates are skipped. In a dry run only
 * the counters (and the id map for matched churches) are updated. A
 * unique-constraint violation on update or create is counted as skipped;
 * every other error is re-thrown to the caller.
 *
 * `churchesMatched` is incremented only after the update succeeded, so a
 * church skipped because of a constraint conflict is counted once (as
 * skipped), not as both matched and skipped.
 *
 * @param dioceseSlug Slug stored as `diocese` on new records and on matched
 *   records that have none.
 * @param churches Churches of the diocese as returned by the API.
 * @param existingChurches In-memory list used for matching; newly created
 *   churches are appended so later entries of the same run can match them.
 * @param idchurchToDbId Filled with API church id → DB id for Pass 2.
 * @param dryRun When true, no database writes are performed.
 * @param stats Run counters, updated in place.
 */
async function processChurchesForDiocese(
  dioceseSlug: string,
  churches: OrariMesseChurch[],
  existingChurches: ExistingChurch[],
  idchurchToDbId: Map<number, string>,
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Prisma reports unique-constraint violations with error code P2002; the
  // message check is kept as a fallback.
  const isUniqueViolation = (error: unknown): boolean =>
    error instanceof Error &&
    ((error as { code?: unknown }).code === 'P2002' || error.message.includes('Unique constraint'));

  for (const church of churches) {
    stats.churchesFound++;

    // Parse coordinates
    const lat = parseFloat(church.lat);
    const lon = parseFloat(church.lon);
    if (isNaN(lat) || isNaN(lon) || lat === 0 || lon === 0) {
      stats.churchesSkipped++;
      continue;
    }

    // Build candidate for dedup
    const candidate = {
      name: church.name,
      lat,
      lng: lon,
      orarimesseId: church.icsc || undefined,
    };

    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (dryRun) {
      if (duplicate) {
        stats.churchesMatched++;
        // Track idchurch for Pass 2 even in dry run
        idchurchToDbId.set(church.idchurch, duplicate.id);
      } else {
        stats.churchesCreated++;
      }
      continue;
    }

    if (duplicate) {
      // Update existing church: set orarimesseId, fill missing fields
      const updateData: Record<string, unknown> = {
        orarimesseId: church.icsc || undefined,
        orarimesseLastSyncedAt: new Date(),
      };

      if (!duplicate.address && church.address) updateData.address = church.address;
      if (!duplicate.website && church.sito) {
        updateData.website = church.sito;
        updateData.hasWebsite = true;
      }

      // Check diocese on the actual DB record (not in ExistingChurch)
      const dbRecord = await prisma.church.findUnique({
        where: { id: duplicate.id },
        select: { diocese: true, city: true, email: true },
      });
      if (dbRecord && !dbRecord.diocese && dioceseSlug) {
        updateData.diocese = dioceseSlug;
      }
      if (dbRecord && !dbRecord.city && church.comune) {
        updateData.city = church.comune;
      }
      if (dbRecord && !dbRecord.email && church.emailLdc) {
        updateData.email = church.emailLdc;
      }

      try {
        await prisma.church.update({
          where: { id: duplicate.id },
          data: updateData,
        });
      } catch (error) {
        // Unique constraint violation on orarimesseId — another church already has this ICSC
        if (isUniqueViolation(error)) {
          stats.churchesSkipped++;
          continue;
        }
        throw error;
      }

      stats.churchesMatched++;
      idchurchToDbId.set(church.idchurch, duplicate.id);
    } else {
      // Create new church
      try {
        const newChurch = await prisma.church.create({
          data: {
            name: church.name,
            latitude: lat,
            longitude: lon,
            address: church.address || null,
            city: church.comune || null,
            country: 'IT',
            diocese: dioceseSlug,
            website: church.sito || null,
            email: church.emailLdc || null,
            hasWebsite: !!church.sito,
            orarimesseId: church.icsc || null,
            orarimesseLastSyncedAt: new Date(),
            source: 'orarimesse',
            wheelchairAccess: church.accessibile || false,
          },
        });
        stats.churchesCreated++;

        // Add to in-memory array for within-run dedup
        existingChurches.push({
          id: newChurch.id,
          name: church.name,
          latitude: lat,
          longitude: lon,
          osmId: null,
          baiduId: null,
          masstimesId: null,
          orarimesseId: church.icsc || null,
          massSchedulesPhId: null,
          philmassId: null,
          horariosMisasId: null,
          mszeInfoId: null,
          weekdayMassesId: null,
          messesInfoId: null,
          bohosluzbyId: null,
          miserendId: null,
          kerknetId: null,
          gottesdienstzeitenId: null,
          discovermassId: null,
          source: 'orarimesse',
          website: church.sito || null,
          phone: null,
          address: church.address || null,
        });

        idchurchToDbId.set(church.idchurch, newChurch.id);
      } catch (error) {
        if (isUniqueViolation(error)) {
          stats.churchesSkipped++;
          continue;
        }
        throw error;
      }
    }
  }
}
|
||||
|
||||
// ─── Pass 2: Mass Schedules ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Pass 2: for every church of the diocese that has a truthy `nextmass` and a
 * known DB id, fetch the detail record, derive the weekly schedule and
 * replace the church's stored mass schedules with it.
 *
 * Delete, insert and the `lastScrapedAt` update run in one transaction per
 * church. A failing transaction is logged and counted in `stats.errors`;
 * processing continues with the next church. In a dry run the would-be number
 * of schedules is counted and nothing is written.
 *
 * @param churches Churches of the diocese as returned by the API.
 * @param idchurchToDbId API church id → DB id map built beforehand.
 * @param dryRun When true, no database writes are performed.
 * @param stats Run counters, updated in place.
 */
async function processSchedulesForDiocese(
  churches: OrariMesseChurch[],
  idchurchToDbId: Map<number, string>,
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Filter to churches with active schedules
  const activeChurches = churches.filter((entry) => entry.nextmass);
  if (activeChurches.length === 0) {
    return;
  }

  console.log(` Pass 2: Fetching schedules for ${activeChurches.length} churches with active masses...`);

  for (const church of activeChurches) {
    // Church not in our DB (skipped in Pass 1)
    const dbId = idchurchToDbId.get(church.idchurch);
    if (!dbId) continue;

    const detail = await fetchChurchDetail(church.idchurch);
    const days = detail ? detail.days : undefined;
    if (!days || days.length === 0) continue;

    stats.schedulesProcessed++;

    const weekly = convertScheduleToRecurring(days);
    if (weekly.length === 0) continue;

    if (dryRun) {
      stats.massSchedulesCreated += weekly.length;
      continue;
    }

    const rows = weekly.map((entry) => ({
      churchId: dbId,
      dayOfWeek: entry.dayOfWeek,
      time: entry.time,
      language: 'Italian',
      notes: entry.notes,
    }));

    try {
      await prisma.$transaction(async (tx) => {
        // Replace whatever is stored for this church with the new weekly rows
        await tx.massSchedule.deleteMany({ where: { churchId: dbId } });
        await tx.massSchedule.createMany({ data: rows });

        // Mark church as scraped
        await tx.church.update({
          where: { id: dbId },
          data: { lastScrapedAt: new Date() },
        });
      });

      stats.massSchedulesCreated += weekly.length;
    } catch (error) {
      stats.errors++;
      console.error(` Error saving schedules for idchurch=${church.idchurch}: ${error instanceof Error ? error.message : error}`);
    }
  }
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line flags of the OrariMesse.it importer.
 *
 * Recognised flags: `--all`, `--diocese <slug>`, `--dry-run`,
 * `--schedules-only`, `--resume-from <slug>`, `--job-id <uuid>` and
 * `--help` / `-h` (prints usage, exits with status 0). Unknown arguments are
 * reported with a warning and otherwise ignored.
 *
 * The process exits with status 1 when neither `--all` nor `--diocese` is
 * given, or when a flag that takes a value has none. The latter matters for
 * safety: without the check, `--job-id --dry-run` would consume `--dry-run`
 * as the job id and the run would write to the database.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    schedulesOnly: false,
  };

  // A flag that needs a value must not silently consume nothing (end of the
  // argument list) or the next flag.
  const requireValue = (flag: string, value: string | undefined): string => {
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--all':
        result.all = true;
        break;
      case '--diocese':
        result.diocese = requireValue(arg, args[++i]);
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--schedules-only':
        result.schedulesOnly = true;
        break;
      case '--resume-from':
        result.resumeFrom = requireValue(arg, args[++i]);
        break;
      case '--job-id':
        result.jobId = requireValue(arg, args[++i]);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-orarimesse.ts [options]

Options:
--all Import from all 77 dioceses
--diocese <slug> Import from a single diocese (e.g. "roma")
--dry-run No database writes, just report what would happen
--schedules-only Skip Pass 1 (church upsert), only fetch schedules
--resume-from <slug> Skip dioceses until reaching this slug
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-orarimesse.ts --diocese roma --dry-run
npx tsx scripts/import-orarimesse.ts --all
npx tsx scripts/import-orarimesse.ts --all --schedules-only
npx tsx scripts/import-orarimesse.ts --all --resume-from napoli
`);
        process.exit(0);
      default:
        // Make typos such as "--dryrun" visible instead of dropping them.
        console.warn(`Warning: ignoring unknown argument "${arg}"`);
    }
  }

  if (!result.all && !result.diocese) {
    console.error('Error: specify --all or --diocese <slug>');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Render a millisecond duration as `Xh Ym Zs`, `Ym Zs` or `Zs`, using the
 * largest unit that is non-zero. Fractions of a second are dropped.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const secondsPart = `${totalSeconds % 60}s`;

  if (totalHours > 0) {
    return `${totalHours}h ${totalMinutes % 60}m ${secondsPart}`;
  }
  return totalMinutes > 0 ? `${totalMinutes}m ${secondsPart}` : `${totalSeconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run the OrariMesse.it import end to end.
 *
 * Steps: parse CLI flags, print a banner, mark the background job (if any) as
 * running, load the existing Italian churches, fetch the diocese list, narrow
 * it with `--diocese` / `--resume-from`, then for each diocese run Pass 1
 * (church upsert, or in schedules-only mode a lookup of already linked
 * churches by ICSC code) and Pass 2 (mass schedules). A failing diocese is
 * logged and counted in `stats.errors` without stopping the run. Finally a
 * summary is printed and the outcome is stored on the background job.
 *
 * The process exits with status 1 when the requested or resume diocese is not
 * in the list returned by the API.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('ORARIMESSE.IT IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.all ? 'All dioceses' : `Single diocese: ${args.diocese}`}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  console.log(`Schedules only: ${args.schedulesOnly ? 'YES' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      // Job might not exist yet, that's fine
    }
  }

  // Load existing Italian churches for dedup
  const existingChurches = await loadExistingItalianChurches();

  // Fetch diocese list
  console.log('Fetching diocese list from OrariMesse.it...');
  const allDioceses = await fetchDioceses();
  console.log(`Found ${allDioceses.length} dioceses\n`);

  // Filter to requested dioceses
  let diocesesToProcess: OrariMesseDiocese[];
  if (args.diocese) {
    const found = allDioceses.find((d) => d.slug === args.diocese);
    if (!found) {
      console.error(`Diocese "${args.diocese}" not found. Available: ${allDioceses.map((d) => d.slug).join(', ')}`);
      process.exit(1);
    }
    diocesesToProcess = [found];
  } else {
    diocesesToProcess = allDioceses;
  }

  // Handle --resume-from
  if (args.resumeFrom) {
    const idx = diocesesToProcess.findIndex((d) => d.slug === args.resumeFrom);
    if (idx === -1) {
      console.error(`Resume diocese "${args.resumeFrom}" not found.`);
      process.exit(1);
    }
    console.log(`Resuming from diocese "${args.resumeFrom}" (skipping ${idx} dioceses)\n`);
    diocesesToProcess = diocesesToProcess.slice(idx);
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // Map OrariMesse idchurch → our DB id (for Pass 2 schedule lookups)
  const idchurchToDbId = new Map<number, string>();

  // In schedules-only mode, report how many churches are already linked. The
  // idchurch map itself is built per diocese below, because it needs the
  // API's idchurch values.
  if (args.schedulesOnly) {
    console.log('Schedules-only mode: loading existing orarimesseId mappings...');
    const mapped = await prisma.church.findMany({
      where: { orarimesseId: { not: null } },
      select: { id: true, orarimesseId: true },
    });
    console.log(`Found ${mapped.length} churches with orarimesseId in DB\n`);
  }

  // Process each diocese
  for (let i = 0; i < diocesesToProcess.length; i++) {
    const diocese = diocesesToProcess[i];
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${i + 1}/${diocesesToProcess.length}] Diocese: ${diocese.title} (${diocese.slug}) [${elapsed} elapsed]`);

    try {
      // Fetch churches in this diocese
      const dioceseData = await fetchChurchesInDiocese(diocese.slug);
      if (!dioceseData || !dioceseData.listaChiese || dioceseData.listaChiese.length === 0) {
        console.log(` No churches found, skipping`);
        stats.diocesesProcessed++;
        continue;
      }

      const churches = dioceseData.listaChiese;
      console.log(` Found ${churches.length} churches (${churches.filter((c) => c.nextmass).length} with active masses)`);

      // Pass 1: Upsert churches
      if (!args.schedulesOnly) {
        const prevMatched = stats.churchesMatched;
        const prevCreated = stats.churchesCreated;
        const prevSkipped = stats.churchesSkipped;

        await processChurchesForDiocese(
          diocese.slug, churches, existingChurches, idchurchToDbId,
          args.dryRun, stats
        );

        const matched = stats.churchesMatched - prevMatched;
        const created = stats.churchesCreated - prevCreated;
        const skipped = stats.churchesSkipped - prevSkipped;
        console.log(` Pass 1: ${matched} matched, ${created} created, ${skipped} skipped`);
      } else {
        // In schedules-only mode, still need to build idchurch → dbId map
        for (const church of churches) {
          if (church.icsc) {
            const existing = existingChurches.find((e) => e.orarimesseId === church.icsc);
            if (existing) {
              idchurchToDbId.set(church.idchurch, existing.id);
            }
          }
        }
      }

      // Pass 2: Import schedules. Both numbers in the log line are deltas for
      // this diocese, in line with the Pass 1 line above; the run totals are
      // printed in the summary.
      const prevProcessed = stats.schedulesProcessed;
      const prevSchedules = stats.massSchedulesCreated;
      await processSchedulesForDiocese(churches, idchurchToDbId, args.dryRun, stats);
      const processedHere = stats.schedulesProcessed - prevProcessed;
      const newSchedules = stats.massSchedulesCreated - prevSchedules;
      if (newSchedules > 0) {
        console.log(` Pass 2: ${processedHere} churches processed, ${newSchedules} mass times created`);
      }

      stats.diocesesProcessed++;
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing diocese ${diocese.slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`API requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      // Still best-effort, but leave a trace: otherwise the job stays in
      // "running" with no hint as to why.
      console.warn(
        `Warning: could not update background job ${args.jobId}: ${error instanceof Error ? error.message : error}`,
      );
    }
  }
}
|
||||
|
||||
// Script entry point. A fatal error is reported through process.exitCode
// instead of process.exit(): exit() would end the process immediately and the
// cleanup in finally() (Prisma disconnect, pool shutdown) would never run.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    try {
      await prisma.$disconnect();
    } finally {
      // Close the pg pool even if the Prisma disconnect fails.
      await pool.end();
    }
  });
|
||||
616
scripts/import-osm-churches.ts
Normal file
616
scripts/import-osm-churches.ts
Normal file
@@ -0,0 +1,616 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches from OpenStreetMap
|
||||
* Usage:
|
||||
* npx tsx scripts/import-osm-churches.ts --country US
|
||||
* npx tsx scripts/import-osm-churches.ts --all
|
||||
* npx tsx scripts/import-osm-churches.ts --country MX --dry-run
|
||||
* npx tsx scripts/import-osm-churches.ts --all --sort-by-count
|
||||
*/
|
||||
|
||||
// Load .env for database connection (before importing anything that uses process.env)
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
// Load .env.local first (production Neon URL), then .env (local fallback)
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Connection string: DATABASE_URL from the environment, otherwise a local
// default database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Log the target with the password portion replaced by "***".
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);

// URLs containing "neon" get an SSL config that skips certificate
// verification; every other URL gets no explicit SSL config.
const poolSsl = dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined;
const pool = new Pool({
  connectionString: dbUrl,
  ssl: poolSsl
});

// Script-local Prisma client running on top of the pg pool above.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
import { queryOverpassByCountryWithFallback, type OSMChurch } from '../src/lib/overpass-client';
|
||||
import { findDuplicateChurch, mergeChurchData } from '../src/lib/church-matcher';
|
||||
import { parseServiceTimes } from '../src/lib/service-times-parser';
|
||||
|
||||
// Country codes to import, grouped into three priority tiers.
// The order inside each tier is the processing order used by --all / --priority
// and the order that --resume-from searches, so it must be kept stable.
const CATHOLIC_COUNTRIES = {
  // Priority 1: largest Catholic populations
  priority1: [
    // North America
    'US', 'MX', 'CA',
    // South America, Central America & Caribbean
    'BR', 'AR', 'CO', 'PE', 'VE', 'CL', 'EC', 'GT', 'CU', 'BO', 'DO',
    'HT', 'HN', 'PY', 'SV', 'NI', 'CR', 'PA', 'UY', 'GY', 'SR', 'GF',
    // Europe
    'IT', 'FR', 'ES', 'PL', 'DE', 'PT', 'BE', 'CZ', 'AT', 'HU', 'IE', 'HR', 'GB',
    // Asia, Oceania & Africa (largest only)
    'PH', 'AU', 'NG', 'CD',
  ],

  // Priority 2: medium Catholic populations
  priority2: [
    // Rest of Europe
    'NL', 'SK', 'SI', 'LT', 'CH', 'LU', 'MT',
    'UA', 'RO', 'LV', 'BY',
    // Africa
    'AO', 'UG', 'TZ', 'KE', 'CM', 'RW', 'BI', 'MG', 'MW', 'ZM', 'ZW',
    'MZ', 'BJ', 'TG', 'CI', 'BF', 'ML', 'NE', 'SN', 'GN', 'LR', 'SL',
    'GH', 'GA', 'CG', 'CF', 'TD', 'SD', 'ET', 'ER', 'SO',
    // Asia
    'IN', 'TL', 'VN', 'KR', 'JP', 'ID', 'MY', 'SG', 'TH', 'LA', 'KH', 'MM', 'CN', 'LK', 'BD', 'PK',
    // Middle East
    'LB', 'IL', 'PS', 'JO', 'SY', 'IQ',
    // Oceania
    'NZ', 'PG', 'FJ', 'NC', 'PF',
  ],

  // Priority 3: smaller Catholic presence
  priority3: [
    // Caribbean (smaller islands)
    'BS', 'BB', 'JM', 'TT', 'GD', 'LC', 'VC', 'AG', 'DM', 'KN',
    // Europe (microstates, Balkans, Baltics)
    'MC', 'SM', 'VA', 'LI', 'AD',
    'RS', 'BA', 'MK', 'AL', 'EE',
    // Caucasus + Russia
    'GE', 'AM', 'RU',
    // Africa (rest)
    'NA', 'BW', 'LS', 'SZ', 'MU', 'SC', 'KM', 'CV', 'ST', 'GQ', 'DJ', 'GM',
    // Asia (rest)
    'BT', 'NP', 'AF', 'KZ', 'UZ', 'TM', 'TJ', 'KG', 'MN', 'BN', 'MV',
    // Oceania (rest)
    'WS', 'TO', 'VU', 'SB', 'KI', 'NR', 'TV', 'FM', 'MH', 'PW',
  ],
};
|
||||
|
||||
/** Counters collected while importing OSM churches for one country (or summed over a run). */
interface ImportStats {
  /** Number of churches returned by the Overpass query. */
  osmChurchesFound: number;
  /** Churches that had no match and were inserted as new rows. */
  newChurchesInserted: number;
  /** Matches whose stored osmId equals the incoming osmId. */
  existingUpdated: number;
  /** Matches found by other means (reported as "matched by proximity"). */
  existingLinked: number;
  /** Processed churches that carry a website. */
  churchesWithWebsites: number;
  /** Processed churches without a website. */
  churchesWithoutWebsites: number;
  /** Churches for which schedule rows were created from the service_times tag. */
  churchesWithServiceTimes: number;
  /** Total schedule rows created from service_times tags. */
  scheduleEntriesCreated: number;
  /** Number of failures that were logged and skipped. */
  errors: number;
}
|
||||
|
||||
/**
 * Parse command line arguments from `process.argv`.
 *
 * Recognised flags:
 *   --country <CODE>      single country (upper-cased)
 *   --all                 every country in every priority tier
 *   --priority <1|2|3>    one priority tier
 *   --resume-from <CODE>  skip countries before this one (upper-cased)
 *   --dry-run             report only, no database changes
 *   --sort-by-count       process countries with the fewest OSM churches first
 *
 * Unknown arguments (for example `--job-id <uuid>`, which is handled elsewhere)
 * are ignored. A `--country` / `--resume-from` flag whose value is missing, or
 * whose "value" is itself another `--flag`, is ignored rather than swallowing
 * the next flag. An invalid `--priority` value prints an error and exits with
 * status 1.
 *
 * @returns The parsed options; value options that were not supplied are `undefined`.
 */
function parseArgs(): { country?: string; all: boolean; dryRun: boolean; resumeFrom?: string; priority?: number; sortByCount: boolean } {
  const args = process.argv.slice(2);
  const result = {
    country: undefined as string | undefined,
    all: false,
    dryRun: false,
    resumeFrom: undefined as string | undefined,
    priority: undefined as number | undefined,
    sortByCount: false,
  };

  // The value of a flag is the following argument, unless there is none or it is another flag.
  const valueAfter = (index: number): string | undefined => {
    const next = args[index + 1];
    return next && !next.startsWith('--') ? next : undefined;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--country': {
        const value = valueAfter(i);
        if (value) {
          result.country = value.toUpperCase();
          i++;
        }
        break;
      }
      case '--resume-from': {
        const value = valueAfter(i);
        if (value) {
          result.resumeFrom = value.toUpperCase();
          i++;
        }
        break;
      }
      case '--priority': {
        const raw = args[i + 1];
        if (raw) {
          // Strict match: parseInt() would silently accept input such as "2abc" or "2.7".
          if (!/^[1-3]$/.test(raw)) {
            console.error('Error: --priority must be 1, 2, or 3');
            process.exit(1);
          }
          result.priority = Number(raw);
          i++;
        }
        break;
      }
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--sort-by-count':
        result.sortByCount = true;
        break;
      default:
        break;
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Import churches from a single country.
 *
 * Queries Overpass for the country, then processes the results one by one:
 *  - a match whose stored osmId equals the incoming one is counted as "updated";
 *  - any other match is counted as "linked";
 *  - everything else is inserted as a new church.
 * In all three cases schedule rows are created from the OSM service_times tag
 * when available (for matched churches only if they have no schedules yet).
 *
 * @param countryCode Country code passed to the Overpass query; also the fallback `country` of new rows.
 * @param dryRun When true, only a sample is logged and websites are counted; the database is not touched.
 * @returns Counters describing the run. Failures are logged and counted in `errors` rather than thrown.
 */
async function importFromOSM(countryCode: string, dryRun: boolean = false): Promise<ImportStats> {
  const stats: ImportStats = {
    osmChurchesFound: 0,
    newChurchesInserted: 0,
    existingUpdated: 0,
    existingLinked: 0,
    churchesWithWebsites: 0,
    churchesWithoutWebsites: 0,
    churchesWithServiceTimes: 0,
    scheduleEntriesCreated: 0,
    errors: 0,
  };

  console.log(`\n${'='.repeat(60)}`);
  console.log(`Importing Catholic churches from ${countryCode}`);
  console.log(`${'='.repeat(60)}\n`);

  try {
    // Query Overpass API (with automatic fallback to regional bounding boxes)
    const osmChurches = await queryOverpassByCountryWithFallback(countryCode);
    stats.osmChurchesFound = osmChurches.length;

    if (osmChurches.length === 0) {
      console.log(`No churches found in ${countryCode}`);
      return stats;
    }

    console.log(`Found ${osmChurches.length} Catholic churches in ${countryCode}`);

    if (dryRun) {
      console.log('\n[DRY RUN] Would import the following churches:');
      osmChurches.slice(0, 10).forEach((church) => {
        console.log(` - ${church.name} (${church.city || 'unknown city'})`);
        console.log(` OSM ID: ${church.osmId}, Website: ${church.website || 'none'}`);
      });
      if (osmChurches.length > 10) {
        console.log(` ... and ${osmChurches.length - 10} more`);
      }

      // Count websites
      stats.churchesWithWebsites = osmChurches.filter((c) => c.website).length;
      stats.churchesWithoutWebsites = osmChurches.length - stats.churchesWithWebsites;

      return stats;
    }

    // Fetch all existing churches for deduplication.
    // For large datasets this could be narrowed to churches in the same country/region.
    console.log('Fetching existing churches for deduplication...');
    const existingChurches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        latitude: true,
        longitude: true,
        osmId: true,
        baiduId: true,
        masstimesId: true,
        orarimesseId: true,
        massSchedulesPhId: true,
        philmassId: true,
        horariosMisasId: true,
        mszeInfoId: true,
        weekdayMassesId: true,
        messesInfoId: true,
        bohosluzbyId: true,
        miserendId: true,
        kerknetId: true,
        gottesdienstzeitenId: true,
        discovermassId: true,
        source: true,
        website: true,
        phone: true,
        address: true,
      },
    });
    console.log(`Found ${existingChurches.length} existing churches in database`);

    // Types derived from values already in scope, so the helpers below follow the schema.
    type ChurchId = (typeof existingChurches)[number]['id'];
    type ServiceTimes = Parameters<typeof parseServiceTimes>[0];

    // Creates schedule rows from a service_times tag; returns true when at least one row was created.
    const createSchedules = async (churchId: ChurchId, serviceTimes: ServiceTimes): Promise<boolean> => {
      const scheduleEntries = parseServiceTimes(serviceTimes);
      if (scheduleEntries.length === 0) return false;

      await prisma.massSchedule.createMany({
        data: scheduleEntries.map((entry) => ({
          churchId,
          dayOfWeek: entry.dayOfWeek,
          time: entry.time,
          massType: entry.dayOfWeek === 0 ? 'Sunday' :
            entry.dayOfWeek === 6 ? 'Saturday' : 'Daily',
          language: 'Unknown',
          notes: 'From OSM service_times tag',
        })),
      });
      stats.churchesWithServiceTimes++;
      stats.scheduleEntriesCreated += scheduleEntries.length;
      return true;
    };

    // Tallies one processed church into the with/without website counters.
    const countWebsite = (hasWebsite: boolean): void => {
      if (hasWebsite) stats.churchesWithWebsites++;
      else stats.churchesWithoutWebsites++;
    };

    // Process churches one by one (no batch transactions to avoid rollbacks)
    let processed = 0;

    for (const osmChurch of osmChurches) {
      try {
        // Check for duplicate
        const duplicate = findDuplicateChurch(osmChurch, existingChurches);

        if (duplicate) {
          // Decide the match kind before merging, in case merging touches the record.
          const matchedByOsmId = duplicate.osmId === osmChurch.osmId;
          const mergedData = mergeChurchData(duplicate, osmChurch);

          // Verify the church exists in the database (not just in our temp list from this run).
          // Only the id is selected: the row content is not needed for an existence check.
          const existsInDb = await prisma.church.findUnique({
            where: { id: duplicate.id },
            select: { id: true },
          });
          if (existsInDb) {
            await prisma.church.update({
              where: { id: duplicate.id },
              data: mergedData,
            });
          }

          // Counted whether or not the row was found in the database.
          if (matchedByOsmId) stats.existingUpdated++;
          else stats.existingLinked++;

          // Import service_times only for churches that don't have schedules yet
          if (existsInDb && osmChurch.serviceTimes) {
            const existingSchedules = await prisma.massSchedule.count({ where: { churchId: duplicate.id } });
            if (existingSchedules === 0) {
              await createSchedules(duplicate.id, osmChurch.serviceTimes);
            }
          }

          countWebsite(!!osmChurch.website);
        } else {
          // New church - insert it and capture the real ID
          const newChurch = await prisma.church.create({
            data: {
              name: osmChurch.name,
              latitude: osmChurch.lat,
              longitude: osmChurch.lng,
              address: osmChurch.address,
              city: osmChurch.city,
              state: osmChurch.state,
              zip: osmChurch.zip,
              country: osmChurch.country || countryCode,
              phone: osmChurch.phone,
              website: osmChurch.website,
              diocese: osmChurch.diocese,
              wheelchairAccess: osmChurch.wheelchairAccess ?? false,
              source: 'osm',
              osmId: osmChurch.osmId,
              hasWebsite: !!osmChurch.website,
              osmLastSyncedAt: new Date(),
            },
          });
          stats.newChurchesInserted++;

          countWebsite(!!osmChurch.website);

          // Parse service_times tag and create mass schedules
          if (osmChurch.serviceTimes) {
            const created = await createSchedules(newChurch.id, osmChurch.serviceTimes);
            if (created) {
              // Mark as scraped since we have schedule data
              await prisma.church.update({
                where: { id: newChurch.id },
                data: { lastScrapedAt: new Date() },
              });
            }
          }

          // Add to existing churches list for future deduplication in this run (use real DB ID)
          existingChurches.push({
            id: newChurch.id,
            name: osmChurch.name,
            latitude: osmChurch.lat,
            longitude: osmChurch.lng,
            osmId: osmChurch.osmId,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: null,
            discovermassId: null,
            source: 'osm',
            website: osmChurch.website || null,
            phone: osmChurch.phone || null,
            address: osmChurch.address || null,
          });
        }

        processed++;

        // Log progress every 500 churches
        if (processed % 500 === 0) {
          console.log(`Progress: ${processed}/${osmChurches.length} churches processed`);
        }
      } catch (error) {
        // One bad church must not abort the whole country.
        console.error(`Error processing church ${osmChurch.name}:`, error);
        stats.errors++;
      }
    }

    console.log(`\nProcessed all ${osmChurches.length} churches from ${countryCode}`);
  } catch (error) {
    console.error(`Failed to import from ${countryCode}:`, error);
    stats.errors++;
  }

  return stats;
}
|
||||
|
||||
/**
 * Print the import summary for one country.
 *
 * Database-related counters (inserted / updated / linked, service times and
 * errors) are only shown for real runs; service times and errors additionally
 * only when they are non-zero.
 *
 * @param countryCode Country the statistics belong to.
 * @param stats Counters returned by the import.
 * @param dryRun Whether the import ran in dry-run mode.
 */
function printSummary(countryCode: string, stats: ImportStats, dryRun: boolean) {
  const rule = '='.repeat(60);
  const realRun = !dryRun;

  const lines: string[] = [
    `\n${rule}`,
    `Import Summary for ${countryCode} ${dryRun ? '(DRY RUN)' : ''}`,
    `${rule}`,
    `OSM churches found: ${stats.osmChurchesFound}`,
  ];

  if (realRun) {
    lines.push(
      `New churches inserted: ${stats.newChurchesInserted}`,
      `Existing churches updated: ${stats.existingUpdated} (matched by osmId)`,
      `Existing churches linked: ${stats.existingLinked} (matched by proximity)`,
    );
  }

  lines.push(
    `Churches with websites: ${stats.churchesWithWebsites}`,
    `Churches without websites: ${stats.churchesWithoutWebsites}`,
  );

  if (realRun && stats.churchesWithServiceTimes > 0) {
    lines.push(
      `Churches with service_times: ${stats.churchesWithServiceTimes}`,
      `Schedule entries created: ${stats.scheduleEntriesCreated}`,
    );
  }

  if (realRun && stats.errors > 0) {
    lines.push(`Errors encountered: ${stats.errors}`);
  }

  lines.push(`${rule}\n`);

  // One console.log call per line, exactly as many calls as lines.
  for (const line of lines) {
    console.log(line);
  }
}
|
||||
|
||||
/**
 * Mark the background job named by `--job-id <id>` as running.
 *
 * Despite the name, no job is created here: when `--job-id` is present the
 * existing record is updated (status 'running', startedAt = now).
 *
 * @param args Command-line arguments without the node/script prefix.
 * @returns The job id when `--job-id` was supplied, otherwise `null`.
 * @throws Error when `--job-id` is given without a value (previously an
 *   undefined id was passed on to the database update). The update itself may
 *   also reject, e.g. presumably when no job with that id exists.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) {
    return null;
  }

  const jobId = args[jobIdIndex + 1];
  // A missing value, or another flag in its place, is a usage error.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Record the final state of a background job.
 *
 * Does nothing when there is no job id. Otherwise the job is marked 'failed'
 * (when an error message is given) or 'completed', with completedAt = now.
 * A failure to write the record is logged and swallowed so that bookkeeping
 * problems never mask the outcome of the import itself.
 *
 * @param jobId Id of the job to finish, or `null` when the run is not tied to a job.
 * @param error Error message; its presence marks the job as failed.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: error ? 'failed' : 'completed',
          error: error || null,
          completedAt: new Date(),
        },
      });
    } catch (updateError) {
      console.error(`Failed to update job ${jobId}:`, updateError);
    }
  }
}
|
||||
|
||||
/**
 * Script entry point.
 *
 * Validates the arguments, optionally marks the `--job-id` job as running,
 * imports either one country or a list of countries (all tiers or a single
 * priority tier, optionally sorted by existing church count and resumed from a
 * given country), prints the summaries and records the job outcome.
 *
 * Errors are logged, recorded on the job and turned into exit code 1; the
 * database connections are closed in every case.
 */
async function main() {
  const { country, all, dryRun, resumeFrom, priority, sortByCount } = parseArgs();

  // Validate before touching the job record, so a usage error cannot leave a job stuck in 'running'.
  if (!country && !all && !priority) {
    console.error('Error: Must specify --country <CODE>, --all, or --priority <1|2|3>');
    console.error('Usage:');
    console.error(' npx tsx scripts/import-osm-churches.ts --country US');
    console.error(' npx tsx scripts/import-osm-churches.ts --all');
    console.error(' npx tsx scripts/import-osm-churches.ts --priority 1');
    console.error(' npx tsx scripts/import-osm-churches.ts --all --resume-from IT');
    console.error(' npx tsx scripts/import-osm-churches.ts --country MX --dry-run');
    console.error(' npx tsx scripts/import-osm-churches.ts --all --sort-by-count');
    process.exit(1);
  }

  if (dryRun) {
    console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
  }

  let jobId: string | null = null;

  try {
    // Inside the try block so that a bad --job-id is reported like any other fatal error.
    jobId = await createOrResumeJob(process.argv.slice(2));

    if (country) {
      // Import single country
      const stats = await importFromOSM(country, dryRun);
      printSummary(country, stats, dryRun);
    } else if (all || priority !== undefined) {
      // Import all countries or specific priority
      let allCountries: string[];

      if (priority !== undefined) {
        // Import only specified priority level.
        // Copied because the list is sorted below and the shared constant must stay untouched.
        const priorityKey = `priority${priority}` as keyof typeof CATHOLIC_COUNTRIES;
        allCountries = [...CATHOLIC_COUNTRIES[priorityKey]];
        console.log(`Importing Priority ${priority} countries (${allCountries.length} countries)...\n`);
      } else {
        // Import all priorities
        console.log('Importing all Catholic countries by priority...\n');
        allCountries = [
          ...CATHOLIC_COUNTRIES.priority1,
          ...CATHOLIC_COUNTRIES.priority2,
          ...CATHOLIC_COUNTRIES.priority3,
        ];
      }

      // Sort by existing OSM church count (least first) if requested
      if (sortByCount) {
        console.log('Querying DB for current OSM church counts per country...');
        const countRows = await prisma.$queryRawUnsafe<Array<{ country: string; count: bigint }>>(
          `SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country IS NOT NULL GROUP BY country`
        );
        const countMap = new Map<string, number>();
        for (const row of countRows) {
          countMap.set(row.country, Number(row.count));
        }

        allCountries.sort((a, b) => (countMap.get(a) || 0) - (countMap.get(b) || 0));

        console.log('Country processing order (least OSM churches first):');
        for (const c of allCountries) {
          console.log(` ${c}: ${countMap.get(c) || 0} existing OSM churches`);
        }
        console.log('');
      }

      // Handle --resume-from flag
      if (resumeFrom) {
        const resumeIndex = allCountries.indexOf(resumeFrom);
        if (resumeIndex === -1) {
          // Thrown (not process.exit) so the job is marked failed and connections are closed.
          throw new Error(`Country ${resumeFrom} not found in the list`);
        }
        console.log(`Resuming from ${resumeFrom} (skipping first ${resumeIndex} countries)...\n`);
        allCountries = allCountries.slice(resumeIndex);
      }

      const totalStats: ImportStats = {
        osmChurchesFound: 0,
        newChurchesInserted: 0,
        existingUpdated: 0,
        existingLinked: 0,
        churchesWithWebsites: 0,
        churchesWithoutWebsites: 0,
        churchesWithServiceTimes: 0,
        scheduleEntriesCreated: 0,
        errors: 0,
      };

      for (const countryCode of allCountries) {
        const stats = await importFromOSM(countryCode, dryRun);
        printSummary(countryCode, stats, dryRun);

        // Aggregate stats
        totalStats.osmChurchesFound += stats.osmChurchesFound;
        totalStats.newChurchesInserted += stats.newChurchesInserted;
        totalStats.existingUpdated += stats.existingUpdated;
        totalStats.existingLinked += stats.existingLinked;
        totalStats.churchesWithWebsites += stats.churchesWithWebsites;
        totalStats.churchesWithoutWebsites += stats.churchesWithoutWebsites;
        totalStats.churchesWithServiceTimes += stats.churchesWithServiceTimes;
        totalStats.scheduleEntriesCreated += stats.scheduleEntriesCreated;
        totalStats.errors += stats.errors;

        // Small delay between countries to be respectful (rate limiting is also in the client)
        await new Promise((resolve) => setTimeout(resolve, 2000));
      }

      // Print overall summary
      console.log(`\n${'='.repeat(60)}`);
      console.log(`OVERALL SUMMARY ${dryRun ? '(DRY RUN)' : ''}`);
      console.log(`${'='.repeat(60)}`);
      console.log(`Total countries processed: ${allCountries.length}`);
      console.log(`Total OSM churches found: ${totalStats.osmChurchesFound}`);

      if (!dryRun) {
        console.log(`Total new churches inserted: ${totalStats.newChurchesInserted}`);
        console.log(`Total churches updated: ${totalStats.existingUpdated}`);
        console.log(`Total churches linked: ${totalStats.existingLinked}`);
      }

      console.log(`Total with websites: ${totalStats.churchesWithWebsites}`);
      console.log(`Total without websites: ${totalStats.churchesWithoutWebsites}`);

      // These totals were aggregated above but never reported.
      if (!dryRun && totalStats.churchesWithServiceTimes > 0) {
        console.log(`Total with service_times: ${totalStats.churchesWithServiceTimes}`);
        console.log(`Total schedule entries created: ${totalStats.scheduleEntriesCreated}`);
      }

      if (!dryRun && totalStats.errors > 0) {
        console.log(`Total errors: ${totalStats.errors}`);
      }

      console.log(`${'='.repeat(60)}\n`);
    }

    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    // Not process.exit(1): that would end the process before the finally block runs.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    // The pool was created by this script, so it is closed here as well;
    // otherwise idle connections can keep the process alive.
    await pool.end();
  }
}
|
||||
|
||||
// Start the import. Anything that escapes main() is reported and turned into a
// non-zero exit status instead of being left as an unhandled promise rejection.
main().catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
|
||||
346
scripts/import-osm-region.ts
Normal file
346
scripts/import-osm-region.ts
Normal file
@@ -0,0 +1,346 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches from a specific region of a country
|
||||
* Usage:
|
||||
* npx tsx scripts/import-osm-region.ts --country GB --region "England South"
|
||||
* npx tsx scripts/import-osm-region.ts --country IT --region "North" --dry-run
|
||||
*/
|
||||
|
||||
// Load .env for database connection (before importing anything that uses process.env)
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
// Load .env.local first (production Neon URL), then .env (local fallback)
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
// Create a fresh Prisma client for this script (don't use cached pool from lib/db)
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
|
||||
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
|
||||
const pool = new Pool({
|
||||
connectionString: dbUrl,
|
||||
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined
|
||||
});
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { COUNTRY_BOUNDING_BOXES, queryOverpassByBoundingBox, type OSMChurch } from '../src/lib/overpass-client';
|
||||
import { findDuplicateChurch, mergeChurchData } from '../src/lib/church-matcher';
|
||||
|
||||
/** Counters collected while importing OSM churches for one region. */
interface ImportStats {
  /** Number of churches returned by the Overpass query. */
  osmChurchesFound: number;
  /** Churches that had no match and were inserted as new rows. */
  newChurchesInserted: number;
  /** Matches whose stored osmId equals the incoming osmId. */
  existingUpdated: number;
  /** Matches found by other means (reported as "matched by proximity"). */
  existingLinked: number;
  /** Processed churches that carry a website. */
  churchesWithWebsites: number;
  /** Processed churches without a website. */
  churchesWithoutWebsites: number;
  /** Number of failures that were logged and skipped. */
  errors: number;
}
|
||||
|
||||
/**
 * Parse command line arguments from `process.argv`.
 *
 * Recognised flags:
 *   --country <CODE>   country code (upper-cased)
 *   --region <NAME>    region name, used verbatim
 *   --dry-run          report only, no database changes
 *
 * Unknown arguments are ignored. A `--country` / `--region` flag whose value is
 * missing, or whose "value" is itself another `--flag`, is ignored rather than
 * swallowing the next flag (so `--region --dry-run` still enables the dry run).
 *
 * @returns The parsed options; `country` / `region` are `undefined` when not supplied.
 */
function parseArgs(): { country?: string; region?: string; dryRun: boolean } {
  const args = process.argv.slice(2);
  const result = {
    country: undefined as string | undefined,
    region: undefined as string | undefined,
    dryRun: false,
  };

  // The value of a flag is the following argument, unless there is none or it is another flag.
  const valueAfter = (index: number): string | undefined => {
    const next = args[index + 1];
    return next && !next.startsWith('--') ? next : undefined;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--country': {
        const value = valueAfter(i);
        if (value) {
          result.country = value.toUpperCase();
          i++;
        }
        break;
      }
      case '--region': {
        const value = valueAfter(i);
        if (value) {
          result.region = value;
          i++;
        }
        break;
      }
      case '--dry-run':
        result.dryRun = true;
        break;
      default:
        break;
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Import churches from a single region of a country.
 *
 * Looks up the region's bounding box (exiting with status 1 when the country
 * or region is unknown), queries Overpass for it and then processes every
 * church: matches are refreshed with merged data, everything else is inserted.
 *
 * @param countryCode Key into COUNTRY_BOUNDING_BOXES; also the fallback `country` of new rows.
 * @param regionName Exact name of the region within that country.
 * @param dryRun When true, only a sample is logged and websites are counted; the database is not touched.
 * @returns Counters describing the run. Import failures are logged and counted in `errors`.
 */
async function importFromRegion(countryCode: string, regionName: string, dryRun: boolean = false): Promise<ImportStats> {
  const stats: ImportStats = {
    osmChurchesFound: 0,
    newChurchesInserted: 0,
    existingUpdated: 0,
    existingLinked: 0,
    churchesWithWebsites: 0,
    churchesWithoutWebsites: 0,
    errors: 0,
  };

  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log(`Importing from ${countryCode} - ${regionName}`);
  console.log(`${divider}\n`);

  // Resolve the bounding box for the requested region
  const regions = COUNTRY_BOUNDING_BOXES[countryCode];
  if (!regions) {
    console.error(`Error: No bounding boxes defined for country ${countryCode}`);
    console.error('Available countries:', Object.keys(COUNTRY_BOUNDING_BOXES).join(', '));
    process.exit(1);
  }

  const region = regions.find((candidate) => candidate.name === regionName);
  if (!region) {
    console.error(`Error: Region "${regionName}" not found for ${countryCode}`);
    console.error('Available regions:', regions.map((candidate) => candidate.name).join(', '));
    process.exit(1);
  }

  try {
    // Ask Overpass for just this region
    console.log(`Querying bounding box: (${region.south}, ${region.west}, ${region.north}, ${region.east})`);
    const osmChurches = await queryOverpassByBoundingBox(region.south, region.west, region.north, region.east);
    const total = osmChurches.length;
    stats.osmChurchesFound = total;

    if (total === 0) {
      console.log(`No churches found in ${regionName}`);
      return stats;
    }

    console.log(`Found ${total} Catholic churches in ${regionName}`);

    if (dryRun) {
      console.log('\n[DRY RUN] Would import the following churches:');
      for (const church of osmChurches.slice(0, 10)) {
        console.log(` - ${church.name} (${church.city || 'unknown city'})`);
        console.log(` OSM ID: ${church.osmId}, Website: ${church.website || 'none'}`);
      }
      if (total > 10) {
        console.log(` ... and ${total - 10} more`);
      }

      // Website tally for the report
      let withWebsite = 0;
      for (const church of osmChurches) {
        if (church.website) withWebsite++;
      }
      stats.churchesWithWebsites = withWebsite;
      stats.churchesWithoutWebsites = total - withWebsite;

      return stats;
    }

    // Load every known church once; matching happens in memory
    console.log('Fetching existing churches for deduplication...');
    const existingChurches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        latitude: true,
        longitude: true,
        osmId: true,
        baiduId: true,
        masstimesId: true,
        orarimesseId: true,
        massSchedulesPhId: true,
        philmassId: true,
        horariosMisasId: true,
        mszeInfoId: true,
        weekdayMassesId: true,
        messesInfoId: true,
        bohosluzbyId: true,
        miserendId: true,
        kerknetId: true,
        gottesdienstzeitenId: true,
        discovermassId: true,
        source: true,
        website: true,
        phone: true,
        address: true,
      },
    });
    console.log(`Found ${existingChurches.length} existing churches in database`);

    // One church at a time: a failure affects only that church
    let processed = 0;

    for (const osmChurch of osmChurches) {
      try {
        const match = findDuplicateChurch(osmChurch, existingChurches);

        if (match) {
          // Same osmId means "update"; any other match means "link".
          // Decided before merging, as in the original branch conditions.
          const sameOsmId = match.osmId === osmChurch.osmId;
          const mergedData = mergeChurchData(match, osmChurch);

          // Only write when the row really exists in the database
          const dbRow = await prisma.church.findUnique({ where: { id: match.id } });
          if (dbRow) {
            await prisma.church.update({
              where: { id: match.id },
              data: mergedData,
            });
          }

          // Counted in both cases (found in the database or not)
          if (sameOsmId) {
            stats.existingUpdated++;
          } else {
            stats.existingLinked++;
          }

          if (osmChurch.website) {
            stats.churchesWithWebsites++;
          } else {
            stats.churchesWithoutWebsites++;
          }
        } else {
          // Unknown church: insert it and keep the generated id
          const inserted = await prisma.church.create({
            data: {
              name: osmChurch.name,
              latitude: osmChurch.lat,
              longitude: osmChurch.lng,
              address: osmChurch.address,
              city: osmChurch.city,
              state: osmChurch.state,
              zip: osmChurch.zip,
              country: osmChurch.country || countryCode,
              phone: osmChurch.phone,
              website: osmChurch.website,
              diocese: osmChurch.diocese,
              wheelchairAccess: osmChurch.wheelchairAccess ?? false,
              source: 'osm',
              osmId: osmChurch.osmId,
              hasWebsite: !!osmChurch.website,
              osmLastSyncedAt: new Date(),
            },
          });
          stats.newChurchesInserted++;

          if (osmChurch.website) {
            stats.churchesWithWebsites++;
          } else {
            stats.churchesWithoutWebsites++;
          }

          // Make the new row visible to later matches in this same run
          existingChurches.push({
            id: inserted.id,
            name: osmChurch.name,
            latitude: osmChurch.lat,
            longitude: osmChurch.lng,
            osmId: osmChurch.osmId,
            baiduId: null,
            masstimesId: null,
            orarimesseId: null,
            massSchedulesPhId: null,
            philmassId: null,
            horariosMisasId: null,
            mszeInfoId: null,
            weekdayMassesId: null,
            messesInfoId: null,
            bohosluzbyId: null,
            miserendId: null,
            kerknetId: null,
            gottesdienstzeitenId: null,
            discovermassId: null,
            source: 'osm',
            website: osmChurch.website || null,
            phone: osmChurch.phone || null,
            address: osmChurch.address || null,
          });
        }

        processed++;

        // Progress line every 100 churches
        if (processed % 100 === 0) {
          console.log(`Progress: ${processed}/${osmChurches.length} churches processed`);
        }
      } catch (error) {
        console.error(`Error processing church ${osmChurch.name}:`, error);
        stats.errors++;
      }
    }

    console.log(`\nProcessed all ${osmChurches.length} churches from ${regionName}`);
  } catch (error) {
    console.error(`Failed to import from ${regionName}:`, error);
    stats.errors++;
  }

  return stats;
}
|
||||
|
||||
/**
 * Print the import summary for one region.
 *
 * Database-related counters (inserted / updated / linked and errors) are only
 * shown for real runs; errors additionally only when non-zero.
 *
 * @param countryCode Country the region belongs to.
 * @param regionName Region the statistics belong to.
 * @param stats Counters returned by the import.
 * @param dryRun Whether the import ran in dry-run mode.
 */
function printSummary(countryCode: string, regionName: string, stats: ImportStats, dryRun: boolean) {
  const rule = '='.repeat(60);
  const realRun = !dryRun;

  const lines: string[] = [
    `\n${rule}`,
    `Import Summary for ${countryCode} - ${regionName} ${dryRun ? '(DRY RUN)' : ''}`,
    `${rule}`,
    `OSM churches found: ${stats.osmChurchesFound}`,
  ];

  if (realRun) {
    lines.push(
      `New churches inserted: ${stats.newChurchesInserted}`,
      `Existing churches updated: ${stats.existingUpdated} (matched by osmId)`,
      `Existing churches linked: ${stats.existingLinked} (matched by proximity)`,
    );
  }

  lines.push(
    `Churches with websites: ${stats.churchesWithWebsites}`,
    `Churches without websites: ${stats.churchesWithoutWebsites}`,
  );

  if (realRun && stats.errors > 0) {
    lines.push(`Errors encountered: ${stats.errors}`);
  }

  lines.push(`${rule}\n`);

  // One console.log call per line, exactly as many calls as lines.
  for (const line of lines) {
    console.log(line);
  }
}
|
||||
|
||||
/**
 * Main function
 *
 * Parses the CLI arguments, runs the region import and prints the summary.
 * Exits with status 1 on a usage error or when the import throws.
 */
async function main() {
  const { country, region, dryRun } = parseArgs();

  if (!country || !region) {
    console.error('Error: Must specify both --country <CODE> and --region <NAME>');
    console.error('Usage:');
    console.error(' npx tsx scripts/import-osm-region.ts --country GB --region "England South"');
    console.error(' npx tsx scripts/import-osm-region.ts --country IT --region "North" --dry-run');
    console.error('\nAvailable countries:', Object.keys(COUNTRY_BOUNDING_BOXES).join(', '));
    process.exit(1);
  }

  if (dryRun) {
    console.log('\n*** DRY RUN MODE - No changes will be made to database ***\n');
  }

  // Record the failure instead of calling process.exit() inside the catch:
  // process.exit() terminates immediately, which would skip the finally block
  // and leave the Prisma connection open.
  let failed = false;

  try {
    const stats = await importFromRegion(country, region, dryRun);
    printSummary(country, region, stats, dryRun);
  } catch (error) {
    console.error('Fatal error:', error);
    failed = true;
  } finally {
    await prisma.$disconnect();
  }

  if (failed) {
    process.exit(1);
  }
}

// Anything that escapes main() (e.g. a failing disconnect) is reported here
// rather than surfacing as an unhandled promise rejection.
main().catch((error: unknown) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
|
||||
742
scripts/import-philmass.ts
Normal file
742
scripts/import-philmass.ts
Normal file
@@ -0,0 +1,742 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from Philmass.com
|
||||
*
|
||||
* Philmass.com provides rich Schema.org-annotated mass schedule data for
|
||||
* Philippine churches. It has no coordinates, so we match against existing
|
||||
* churches (OSM + mass-schedules.com) and only update matched records.
|
||||
* Unmatched churches are logged for manual review.
|
||||
*
|
||||
* Discovery strategy:
|
||||
* 1. Fetch Philippines page → extract province URLs
|
||||
* 2. For each province → extract city listing URLs
|
||||
* 3. For each city listing → extract church mass-schedule URLs
|
||||
* 4. Deduplicate all church URLs globally
|
||||
* 5. For each church: parse JSON-LD + Schema.org Events, match, upsert
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-philmass.ts --all
|
||||
* npx tsx scripts/import-philmass.ts --all --dry-run
|
||||
* npx tsx scripts/import-philmass.ts --province Metro-Manila
|
||||
* npx tsx scripts/import-philmass.ts --all --resume-from Cebu
|
||||
* npx tsx scripts/import-philmass.ts --all --job-id {uuid}
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
|
||||
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
|
||||
const pool = new Pool({
|
||||
connectionString: dbUrl,
|
||||
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
|
||||
});
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
const SITE_BASE = 'https://www.philmass.com';
|
||||
const PHILIPPINES_URL = `${SITE_BASE}/Asia/Philippines.html`;
|
||||
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
|
||||
const REQUEST_DELAY_MS = 2000;
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** A province page discovered on the Philippines index page. */
interface ProvinceInfo {
  /** Province segment taken from the page URL (e.g. "Metro-Manila"). */
  name: string;
  /** Absolute URL of the province page. */
  url: string;
}

/** A church mass-schedule page discovered on a city listing page. */
interface ChurchUrl {
  /** Absolute URL of the church's mass-schedule page. */
  url: string;
  slug: string; // URL slug used as philmassId
  /** Province segment of the URL. */
  province: string;
  /** City segment of the URL. */
  city: string;
}

/** Church details extracted from a page's JSON-LD block. */
interface ParsedPhilmassChurch {
  /** Church name; callers treat a falsy value as "missing" and fall back to the <h1>. */
  name: string;
  streetAddress: string | null;
  city: string | null;
  region: string | null;
}

/** One recurring mass slot. */
interface ParsedSchedule {
  /** Day of week, 0=Sun, 1=Mon, ..., 6=Sat. */
  dayOfWeek: number;
  /** Time of day as "HH:MM", as written on the page. */
  time: string;
}

/** Counters accumulated over one importer run. */
interface ImportStats {
  /** Provinces whose city listings were scanned without error. */
  provincesProcessed: number;
  /** City listing pages scanned. */
  citiesProcessed: number;
  /** Unique church URLs found during discovery. */
  churchUrlsDiscovered: number;
  /** Church pages handed to processChurch(). */
  churchesProcessed: number;
  /** Church pages matched to an existing church. */
  churchesMatched: number;
  /** Church pages with no matching existing church (logged for manual review). */
  churchesUnmatched: number;
  /** Church pages skipped (no name found, or unique-constraint conflict on update). */
  churchesSkipped: number;
  /** Churches whose mass schedules were replaced. */
  schedulesUpdated: number;
  /** Individual mass schedule rows created. */
  massSchedulesCreated: number;
  /** Errors encountered (fetch failures, DB errors, ...). */
  errors: number;
}

/** Parsed command-line options. */
interface CLIArgs {
  /** --all: import every province. */
  all: boolean;
  /** --province <name>: import a single province. */
  province?: string;
  /** --dry-run: report only, no church/schedule writes. */
  dryRun: boolean;
  /** --resume-from <province>: skip provinces before this one. */
  resumeFrom?: string;
  /** --job-id <uuid>: background job row to update. */
  jobId?: string;
}
|
||||
|
||||
// ─── HTTP Client ─────────────────────────────────────────────────────────────
|
||||
|
||||
let requestCount = 0;
|
||||
|
||||
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param ms Time to wait, in milliseconds.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Fetch a page as text, politely rate-limited.
 *
 * Every request after the first waits REQUEST_DELAY_MS before being sent.
 * Failures (non-2xx status, network error, timeout) are logged and reported
 * as `null` rather than thrown, so one bad page does not abort the crawl.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body, or `null` when the page could not be fetched.
 */
async function fetchPage(url: string): Promise<string | null> {
  // Without a timeout a single stalled connection would hang the whole
  // (strictly sequential) import indefinitely.
  const requestTimeoutMs = 30000;

  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      signal: AbortSignal.timeout(requestTimeoutMs),
    });

    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }

    return await response.text();
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}
|
||||
|
||||
// ─── Discovery: Province → City → Church URLs ───────────────────────────────
|
||||
|
||||
/**
 * Collect the province pages linked from the Philippines index page.
 *
 * @returns One entry per distinct province name, in page order.
 * @throws Error when the index page cannot be fetched.
 */
async function fetchProvinceUrls(): Promise<ProvinceInfo[]> {
  console.log(`Fetching Philippines page: ${PHILIPPINES_URL}`);
  const indexHtml = await fetchPage(PHILIPPINES_URL);
  if (!indexHtml) throw new Error('Failed to fetch Philippines page');

  // Pattern: href="https://www.philmass.com/Asia/Philippines/{Province}.html"
  const provinceLink = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\.html)"/g;

  // A Map keeps first-seen order while de-duplicating by province name.
  const byName = new Map<string, ProvinceInfo>();
  for (const [, url, name] of indexHtml.matchAll(provinceLink)) {
    // Skip non-province pages (weekly-sunday, etc.)
    const isNonProvincePage = name.includes('weekly') || name.includes('Roman-Catholic');
    if (isNonProvincePage || byName.has(name)) continue;
    byName.set(name, { name, url });
  }

  return [...byName.values()];
}
|
||||
|
||||
/**
 * Decode the HTML character references that occur in scraped attribute values.
 *
 * Handles decimal (`#NNN`) and hexadecimal (`#xHH`) numeric references and the
 * named references amp, lt, gt, quot and apos. Decoding happens in a single
 * pass, so an escaped ampersand followed by "lt;" yields the literal text of
 * an lt reference rather than being decoded a second time into "<".
 * Unknown or out-of-range references are left untouched.
 *
 * @param str Text that may contain character references.
 * @returns The decoded text.
 */
function decodeHtmlEntities(str: string): string {
  const namedEntities: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };
  const maxCodePoint = 0x10ffff;

  return str.replace(
    /&(#\d+|#[xX][0-9a-fA-F]+|amp|lt|gt|quot|apos);/g,
    (whole: string, body: string): string => {
      if (!body.startsWith('#')) {
        return namedEntities[body] ?? whole;
      }
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // fromCodePoint (unlike fromCharCode) handles characters beyond the BMP,
      // but throws on values past U+10FFFF, so keep those references verbatim.
      return codePoint <= maxCodePoint ? String.fromCodePoint(codePoint) : whole;
    },
  );
}
|
||||
|
||||
/**
 * Collect the city listing pages linked from a province page.
 *
 * @param provinceUrl  URL of the province page.
 * @param provinceName Province name (currently unused by the body).
 * @returns Distinct listing URLs in page order; empty when the page cannot be fetched.
 */
async function fetchCityListingUrls(provinceUrl: string, provinceName: string): Promise<string[]> {
  const provinceHtml = await fetchPage(provinceUrl);
  if (!provinceHtml) return [];

  // Pattern: href=".../{Province}/{City}/Roman-Catholic-Churches-in-{City}...html"
  const listingLink = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/[^"]*\/Roman-Catholic-Churches-in-[^"]*\.html)"/g;

  // A Set de-duplicates while preserving first-seen order.
  const listingUrls = new Set<string>();
  for (const [, rawUrl] of provinceHtml.matchAll(listingLink)) {
    listingUrls.add(decodeHtmlEntities(rawUrl));
  }

  return [...listingUrls];
}
|
||||
|
||||
/**
 * Collect the church mass-schedule pages linked from a city listing page.
 *
 * @param cityUrl      URL of the city listing page.
 * @param provinceName Province name (currently unused; the province is taken from each link).
 * @returns Distinct church links in page order; empty when the page cannot be fetched.
 */
async function fetchChurchUrlsFromCityPage(cityUrl: string, provinceName: string): Promise<ChurchUrl[]> {
  const html = await fetchPage(cityUrl);
  if (!html) return [];

  // A malformed percent-escape makes decodeURIComponent throw; fall back to the
  // undecoded segment so one odd link cannot discard the whole page.
  const decodeSegment = (segment: string): string => {
    const unescaped = decodeHtmlEntities(segment);
    try {
      return decodeURIComponent(unescaped);
    } catch {
      return unescaped;
    }
  };

  // Pattern: href=".../Roman-Catholic-Churches/{Church-Name}/mass-schedule.html"
  // Path segments exclude `"` as well as `/` so that a capture can never run
  // past the end of one href attribute into the following markup.
  const churchLink = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\/([^/"]+)\/Roman-Catholic-Churches\/([^/"]+)\/mass-schedule\.html)"/g;

  // Keyed by URL: de-duplicates while preserving first-seen order.
  const byUrl = new Map<string, ChurchUrl>();
  for (const [, rawUrl, rawProvince, rawCity, rawSlug] of html.matchAll(churchLink)) {
    const url = decodeHtmlEntities(rawUrl);
    if (byUrl.has(url)) continue;

    byUrl.set(url, {
      url,
      slug: decodeSegment(rawSlug),
      province: decodeSegment(rawProvince),
      city: decodeSegment(rawCity),
    });
  }

  return [...byUrl.values()];
}
|
||||
|
||||
// ─── HTML Parsers ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Extract church details from the page's JSON-LD.
 *
 * Every `<script type="application/ld+json">` block is inspected (in page
 * order) and the first one whose `mainEntityOfPage` is a `PlaceOfWorship`
 * wins. Blocks that are not valid JSON or describe something else are skipped.
 *
 * @param html Raw HTML of a church page.
 * @returns The parsed church, or `null` when no usable JSON-LD block exists.
 *          `name` is an empty string when the block carries no name, so the
 *          caller's `jsonLd?.name || fallback` check keeps working.
 */
function parseChurchJsonLd(html: string): ParsedPhilmassChurch | null {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value);
  const nonEmptyString = (value: unknown): string | null =>
    typeof value === 'string' && value !== '' ? value : null;

  // Tolerates extra attributes and either quote style on the script tag.
  const scriptRegex = /<script\b[^>]*\btype=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

  for (const [, rawJson] of html.matchAll(scriptRegex)) {
    let data: unknown;
    try {
      data = JSON.parse(rawJson);
    } catch {
      continue; // malformed block: try the next one
    }
    if (!isRecord(data)) continue;

    const church = data.mainEntityOfPage;
    if (!isRecord(church)) continue;

    // JSON-LD allows @type to be a single value or a list of values.
    const type = church['@type'];
    const isPlaceOfWorship = Array.isArray(type)
      ? type.includes('PlaceOfWorship')
      : type === 'PlaceOfWorship';
    if (!isPlaceOfWorship) continue;

    const address: Record<string, unknown> = isRecord(church.address) ? church.address : {};
    // Drop the trailing comma the site appends to street addresses.
    const street = nonEmptyString(address.streetAddress)?.replace(/,\s*$/, '').trim();

    return {
      name: nonEmptyString(church.name) ?? '',
      streetAddress: street || null,
      city: nonEmptyString(address.addressLocality),
      region: nonEmptyString(address.addressRegion),
    };
  }

  return null;
}
|
||||
|
||||
/**
 * Fallback name extraction from the page's first `<h1>`.
 *
 * Example heading:
 *   <h1>Quiapo Church mass schedule 2026 - Minor Basilica of the Black Nazarene</h1>
 * yields "Quiapo Church - Minor Basilica of the Black Nazarene".
 *
 * @param html Raw HTML of a church page.
 * @returns The cleaned-up name, or `null` when there is no usable heading.
 */
function parseChurchNameFromH1(html: string): string | null {
  // Allow attributes on the tag (<h1 class="...">), not just a bare <h1>.
  const h1Match = html.match(/<h1\b[^>]*>([^<]+)<\/h1>/i);
  if (!h1Match) return null;

  const name = h1Match[1]
    // Replace the "mass schedule YYYY" marker with a space rather than nothing,
    // otherwise the words on either side are glued together ("Church- Minor").
    .replace(/\s*mass\s+schedule\s+\d{4}\s*/i, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    // Drop a separator dash left dangling at either end.
    .replace(/^-\s*/, '')
    .replace(/\s*-$/, '')
    .trim();

  return name || null;
}
|
||||
|
||||
/**
 * Derive the weekly mass schedule from Schema.org Event microdata.
 *
 * Reads every `itemprop="startDate" content="2026-02-22T05:00:00+08:00"`
 * attribute pair and reduces it to a (day of week, wall-clock time) slot.
 * The time is taken exactly as written on the page; the UTC offset is not
 * applied. Repeated slots are reported once, in first-seen order.
 * Impossible dates or times (month 13, 30 February, 25:00, ...) are skipped.
 *
 * @param html Raw HTML of a church page.
 * @returns Distinct schedule slots; `dayOfWeek` is 0=Sun ... 6=Sat.
 */
function parseScheduleFromStartDates(html: string): ParsedSchedule[] {
  const startDateRegex = /itemprop="startDate"\s+content="(\d{4}-\d{2}-\d{2})T(\d{2}):(\d{2}):\d{2}[^"]*"/g;

  // Keyed by "day:time": de-duplicates while preserving first-seen order.
  const slots = new Map<string, ParsedSchedule>();

  for (const [, dateStr, hours, minutes] of html.matchAll(startDateRegex)) {
    if (Number(hours) > 23 || Number(minutes) > 59) continue;

    // Build the calendar date in UTC so the weekday never depends on the
    // machine's time zone, then confirm it round-trips: Date silently rolls
    // impossible dates over into the next month instead of rejecting them.
    const [year, month, day] = dateStr.split('-').map(Number);
    const calendarDate = new Date(Date.UTC(year, month - 1, day, 12));
    if (calendarDate.getUTCMonth() !== month - 1 || calendarDate.getUTCDate() !== day) continue;

    const dayOfWeek = calendarDate.getUTCDay(); // 0=Sun, 1=Mon, ..., 6=Sat
    const time = `${hours}:${minutes}`;
    const key = `${dayOfWeek}:${time}`;

    if (!slots.has(key)) {
      slots.set(key, { dayOfWeek, time });
    }
  }

  return [...slots.values()];
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every church stored with country 'PH', restricted to the columns the
 * matcher's ExistingChurch shape needs (identity, coordinates, per-source
 * external IDs and contact fields).
 *
 * @returns The existing Philippine churches used for deduplication.
 */
async function loadExistingPhilippineChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Philippine churches for deduplication...');

  const philippineChurches = await prisma.church.findMany({
    where: { country: 'PH' },
    select: {
      // Identity and location
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External IDs, one per upstream data source
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${philippineChurches.length} existing Philippine churches`);
  return philippineChurches;
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/** Normalize a church name for containment matching: lower-case, "st"/"st." → "saint", generic words and punctuation removed. */
function normalizePhilmassName(name: string): string {
  return name.toLowerCase()
    .replace(/\bst\.\s/g, 'saint ')
    .replace(/\bst\s/g, 'saint ')
    .replace(/\bcatholic church\b/g, '')
    .replace(/\bparish\b/g, '')
    .replace(/\bchurch\b/g, '')
    .replace(/[^\w\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

/** Find an existing church, not yet linked to any Philmass page, whose normalized name contains (or is contained in) the given name; same-city candidates are tried first. */
function findPhilmassNameMatch(
  churchName: string,
  cityName: string,
  existingChurches: ExistingChurch[],
): ExistingChurch | null {
  const targetName = normalizePhilmassName(churchName);

  // A church that already carries a philmassId belongs to a different Philmass
  // page (an exact ID match is tried before this function is called).
  // Re-linking it would overwrite that link and replace its schedules.
  const unlinked = existingChurches.filter((c) => !c.philmassId);

  // An empty city name would "match" every address, so skip the city pass then.
  const cityNeedle = cityName.toLowerCase();
  const inCity = cityNeedle
    ? unlinked.filter((c) => c.address != null && c.address.toLowerCase().includes(cityNeedle))
    : [];

  // Search in-city candidates first, then all unlinked PH churches.
  const searchPools = inCity.length > 0 ? [inCity, unlinked] : [unlinked];

  for (const searchPool of searchPools) {
    for (const candidate of searchPool) {
      const candidateName = normalizePhilmassName(candidate.name);

      // Guard against overly generic names ("chapel", "holy", etc.) by requiring
      // that the shorter name is at least 8 chars after normalization.
      if (Math.min(targetName.length, candidateName.length) < 8) continue;

      if (targetName.includes(candidateName) || candidateName.includes(targetName)) {
        return candidate;
      }
    }
  }

  return null;
}

/**
 * Fetch one Philmass church page, match it to an existing church and, unless
 * this is a dry run, link the church and replace its mass schedules.
 *
 * Philmass pages carry no coordinates, so matching is by philmassId first and
 * by normalized name second. Unmatched churches are only logged.
 *
 * @param churchUrl        Discovered church page.
 * @param existingChurches In-memory list of PH churches; a matched entry is
 *                         updated in place so later pages in the same run see the link.
 * @param unmatchedLog     Receives one "name | city | url" line per unmatched church.
 * @param dryRun           When true, nothing is written to the database.
 * @param stats            Run counters, updated in place.
 * @throws Re-throws database errors from the church update other than unique-constraint conflicts.
 */
async function processChurch(
  churchUrl: ChurchUrl,
  existingChurches: ExistingChurch[],
  unmatchedLog: string[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  stats.churchesProcessed++;

  const html = await fetchPage(churchUrl.url);
  if (!html) {
    stats.errors++;
    return;
  }

  // Parse church info from JSON-LD, falling back to the <h1> for the name
  const jsonLd = parseChurchJsonLd(html);
  const churchName = jsonLd?.name || parseChurchNameFromH1(html);

  if (!churchName) {
    console.log(` Skipping ${churchUrl.slug}: no name found`);
    stats.churchesSkipped++;
    return;
  }

  // Parse schedules from Schema.org startDate attributes
  const schedules = parseScheduleFromStartDates(html);

  // Match by philmassId first, then by name (findDuplicateChurch() cannot be
  // used here because Philmass provides no coordinates).
  const matched =
    existingChurches.find((c) => c.philmassId === churchUrl.slug) ??
    findPhilmassNameMatch(
      churchName,
      jsonLd?.city || churchUrl.city.replace(/-/g, ' '),
      existingChurches,
    );

  if (!matched) {
    stats.churchesUnmatched++;
    unmatchedLog.push(`${churchName} | ${jsonLd?.city || churchUrl.city} | ${churchUrl.url}`);
    if (dryRun) {
      console.log(` [UNMATCHED] "${churchName}" in ${jsonLd?.city || churchUrl.city}`);
    }
    return;
  }

  stats.churchesMatched++;

  if (dryRun) {
    console.log(` [MATCH] "${churchName}" → existing "${matched.name}" (${matched.id})`);
    // Mirror what a real run would do, so later pages cannot claim the same
    // church and schedules are only counted for churches that would get them.
    matched.philmassId = churchUrl.slug;
    stats.massSchedulesCreated += schedules.length;
    return;
  }

  // Update existing church: set philmassId, fill missing fields
  const updateData: Record<string, unknown> = {
    philmassId: churchUrl.slug,
  };

  if (!matched.address && jsonLd?.streetAddress) {
    updateData.address = [jsonLd.streetAddress, jsonLd.city, jsonLd.region]
      .filter(Boolean)
      .join(', ');
  }

  // Fill city/state from JSON-LD or URL, but only where the DB has none yet
  const dbRecord = await prisma.church.findUnique({
    where: { id: matched.id },
    select: { city: true, state: true },
  });
  if (dbRecord && !dbRecord.city && (jsonLd?.city || churchUrl.city)) {
    updateData.city = jsonLd?.city || churchUrl.city.replace(/-/g, ' ');
  }
  if (dbRecord && !dbRecord.state && (jsonLd?.region || churchUrl.province)) {
    updateData.state = jsonLd?.region || churchUrl.province.replace(/-/g, ' ');
  }

  try {
    await prisma.church.update({
      where: { id: matched.id },
      data: updateData,
    });
  } catch (error) {
    // Another row already owns this philmassId: leave both rows untouched.
    const isUniqueConflict =
      error instanceof Error &&
      (error.message.includes('Unique constraint') || ('code' in error && error.code === 'P2002'));
    if (isUniqueConflict) {
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }

  // Keep the in-memory list in step with the database for the rest of the run.
  matched.philmassId = churchUrl.slug;
  if (typeof updateData.address === 'string') {
    matched.address = updateData.address;
  }

  // Replace mass schedules if we have any
  if (schedules.length > 0) {
    const churchId = matched.id;
    try {
      // Delete + insert + timestamp in one transaction so a failure cannot
      // leave the church without any schedule.
      await prisma.$transaction(async (tx) => {
        await tx.massSchedule.deleteMany({ where: { churchId } });
        await tx.massSchedule.createMany({
          data: schedules.map((s) => ({
            churchId,
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            language: 'English',
          })),
        });
        await tx.church.update({
          where: { id: churchId },
          data: { lastScrapedAt: new Date() },
        });
      });
      stats.schedulesUpdated++;
      stats.massSchedulesCreated += schedules.length;
    } catch (error) {
      stats.errors++;
      console.error(` Error saving schedules for ${churchUrl.slug}: ${error instanceof Error ? error.message : error}`);
    }
  }
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line arguments from `process.argv`.
 *
 * Exits the process with status 0 after printing help (`--help` / `-h`), and
 * with status 1 when neither `--all` nor `--province` is given or when a flag
 * that takes a value is missing one. Unrecognized arguments are ignored with
 * a warning.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
  };

  // Returns the value following the flag at `index`. A flag given last, or
  // directly followed by another flag, is an error: otherwise the option would
  // silently become undefined or swallow the next flag.
  const valueAfter = (index: number): string => {
    const value = args[index + 1];
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${args[index]} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--province':
        result.province = valueAfter(i);
        i++;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = valueAfter(i);
        i++;
        break;
      case '--job-id':
        result.jobId = valueAfter(i);
        i++;
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-philmass.ts [options]

Options:
--all Import from all provinces
--province <name> Import from a single province (e.g. "Metro-Manila")
--dry-run No database writes, just report what would happen
--resume-from <province> Skip provinces until reaching this one
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-philmass.ts --province Metro-Manila --dry-run
npx tsx scripts/import-philmass.ts --all
npx tsx scripts/import-philmass.ts --all --resume-from Cebu
`);
        process.exit(0);
        break;
      default:
        // Still ignored, but no longer silently: a typo such as "--dryrun"
        // would otherwise go unnoticed and run a real import.
        console.warn(`Warning: ignoring unrecognized argument "${args[i]}"`);
    }
  }

  if (!result.all && !result.province) {
    console.error('Error: specify --all or --province <name>');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", using the
 * largest unit that is non-zero. Fractions of a second are dropped.
 *
 * @param ms Duration in milliseconds.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const wholeHours = Math.floor(totalMinutes / 60);

  if (wholeHours > 0) {
    const parts = [`${wholeHours}h`, `${totalMinutes % 60}m`, `${totalSeconds % 60}s`];
    return parts.join(' ');
  }

  return totalMinutes > 0
    ? `${totalMinutes}m ${totalSeconds % 60}s`
    : `${totalSeconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run the Philmass import.
 *
 * Phase 1 walks province → city → church pages to collect the unique church
 * URLs; phase 2 processes each church page. A summary and the list of
 * unmatched churches are printed at the end. When `--job-id` is given, the
 * background job row is marked running at the start and completed at the end;
 * failures of those bookkeeping updates are logged but never abort the import.
 *
 * NOTE(review): if this function throws, or exits early on an unknown
 * province, the background job is left in the 'running' state.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('PHILMASS.COM IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.all ? 'All provinces' : `Single province: ${args.province}`}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (error) {
      // Best effort (the job might not exist yet), but leave a trace of why.
      console.warn(`Could not mark job ${args.jobId} as running: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Load existing Philippine churches for dedup
  const existingChurches = await loadExistingPhilippineChurches();

  // ─── Phase 1: Discover all church URLs ───────────────────────────────────

  console.log('=== Phase 1: Discovering church URLs ===\n');

  const allProvinces = await fetchProvinceUrls();
  console.log(`Found ${allProvinces.length} provinces\n`);

  // Filter to requested provinces
  let provincesToProcess: ProvinceInfo[];
  if (args.province) {
    const found = allProvinces.find((p) => p.name === args.province);
    if (!found) {
      console.error(`Province "${args.province}" not found. Available: ${allProvinces.map((p) => p.name).join(', ')}`);
      process.exit(1);
    }
    provincesToProcess = [found];
  } else {
    provincesToProcess = allProvinces;
  }

  // Handle --resume-from
  if (args.resumeFrom) {
    const idx = provincesToProcess.findIndex((p) => p.name === args.resumeFrom);
    if (idx === -1) {
      console.error(`Resume province "${args.resumeFrom}" not found.`);
      process.exit(1);
    }
    console.log(`Resuming from province "${args.resumeFrom}" (skipping ${idx} provinces)\n`);
    provincesToProcess = provincesToProcess.slice(idx);
  }

  // Collect all unique church URLs across all provinces/cities
  const allChurchUrls = new Map<string, ChurchUrl>(); // keyed by URL to deduplicate

  const stats: ImportStats = {
    provincesProcessed: 0,
    citiesProcessed: 0,
    churchUrlsDiscovered: 0,
    churchesProcessed: 0,
    churchesMatched: 0,
    churchesUnmatched: 0,
    churchesSkipped: 0,
    schedulesUpdated: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  for (const [pi, province] of provincesToProcess.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${pi + 1}/${provincesToProcess.length}] Province: ${province.name} [${elapsed} elapsed]`);

    try {
      // Get city listing URLs from province page
      const cityUrls = await fetchCityListingUrls(province.url, province.name);
      console.log(` Found ${cityUrls.length} city listing pages`);

      for (const cityUrl of cityUrls) {
        const churchUrls = await fetchChurchUrlsFromCityPage(cityUrl, province.name);
        stats.citiesProcessed++;

        for (const church of churchUrls) {
          if (!allChurchUrls.has(church.url)) {
            allChurchUrls.set(church.url, church);
          }
        }
      }

      stats.provincesProcessed++;
      console.log(` Total unique churches so far: ${allChurchUrls.size}`);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR discovering ${province.name}: ${error instanceof Error ? error.message : error}`);
    }
  }

  stats.churchUrlsDiscovered = allChurchUrls.size;
  console.log(`\nDiscovery complete: ${allChurchUrls.size} unique church URLs across ${stats.citiesProcessed} city pages\n`);

  // ─── Phase 2: Process each church ─────────────────────────────────────────

  console.log('=== Phase 2: Processing churches ===\n');

  const churchList = [...allChurchUrls.values()];
  const unmatchedLog: string[] = [];

  for (const [i, church] of churchList.entries()) {
    // Progress line for the first church and every 50th thereafter; the
    // elapsed time is only formatted when it is actually printed.
    if ((i + 1) % 50 === 0 || i === 0) {
      const elapsed = formatDuration(Date.now() - startTime);
      console.log(`[${i + 1}/${churchList.length}] Processing churches... [${elapsed} elapsed]`);
    }

    try {
      await processChurch(church, existingChurches, unmatchedLog, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing ${church.slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Provinces processed: ${stats.provincesProcessed}`);
  console.log(`Cities processed: ${stats.citiesProcessed}`);
  console.log(`Church URLs discovered: ${stats.churchUrlsDiscovered}`);
  console.log(`Churches processed: ${stats.churchesProcessed}`);
  console.log(` Matched (updated): ${stats.churchesMatched}`);
  console.log(` Unmatched (skipped): ${stats.churchesUnmatched}`);
  console.log(` Skipped (other): ${stats.churchesSkipped}`);
  console.log(`Schedules updated: ${stats.schedulesUpdated}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70));

  // Log unmatched churches for manual review
  if (unmatchedLog.length > 0) {
    console.log(`\nUnmatched churches (${unmatchedLog.length}):`);
    console.log('-'.repeat(70));
    for (const line of unmatchedLog) {
      console.log(` ${line}`);
    }
    console.log('-'.repeat(70));
  }

  console.log('');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      // Best effort: the import itself is done, so only report the failure.
      console.warn(`Could not record completion of job ${args.jobId}: ${error instanceof Error ? error.message : error}`);
    }
  }
}
|
||||
|
||||
// Entry point. A fatal error sets the exit code instead of calling
// process.exit(): exiting inside .catch() would terminate the process before
// the .finally() cleanup below could close the Prisma client and the pg pool.
main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  })
  .catch((error: unknown) => {
    // Cleanup itself failed; report it rather than leaving an unhandled rejection.
    console.error('Error while closing database connections:', error);
    process.exitCode = 1;
  });
|
||||
1121
scripts/import-weekdaymasses.ts
Normal file
1121
scripts/import-weekdaymasses.ts
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user