feat: add discovermassId to church-matcher ExistingChurch and ChurchCandidate
Add discovermassId field to ExistingChurch interface and ChurchCandidate type, insert a dedicated matching pass in findDuplicateChurch, and update all 15 importer push blocks plus 16 loadExistingChurches select queries to include the new field. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
579
scripts/import-miserend.ts
Normal file
579
scripts/import-miserend.ts
Normal file
@@ -0,0 +1,579 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from miserend.hu (Hungary)
|
||||
*
|
||||
* miserend.hu is the Hungarian Catholic mass schedule database, maintained by
|
||||
* the community with ~5,055 churches (mostly Hungary, some Romania/Slovakia).
|
||||
* It publishes a daily-updated SQLite database at:
|
||||
* https://miserend.hu/fajlok/sqlite/miserend_v4.sqlite3
|
||||
*
|
||||
* The SQLite contains:
|
||||
* - templomok: churches (tid, nev, lat, lng, varos, cim, orszag, megye)
|
||||
* - misek: date-specific mass entries (tid, ido, datumtol, datumig, nyelv)
|
||||
* - kepek: church photos
|
||||
*
|
||||
* Import strategy:
|
||||
* 1. Download the SQLite database
|
||||
* 2. Extract all churches with coordinates
|
||||
* 3. Deduce weekly recurring schedules from date-specific entries
|
||||
* 4. Match against existing churches via church-matcher
|
||||
* 5. Upsert churches and mass schedules
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-miserend.ts --all --dry-run
|
||||
* npx tsx scripts/import-miserend.ts --all
|
||||
* npx tsx scripts/import-miserend.ts --id 37 --dry-run # Single church
|
||||
* npx tsx scripts/import-miserend.ts --all --resume-from 500
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
import fs from 'fs';
|
||||
import { execFileSync } from 'child_process';
|
||||
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Connection string: DATABASE_URL when set and non-empty, otherwise a local default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Mask the ":password@" portion of the URL before echoing it to the console.
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);

// URLs containing "neon" get TLS with certificate verification disabled;
// every other URL leaves `ssl` undefined.
const sslOptions = dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined;
const pool = new Pool({ connectionString: dbUrl, ssl: sslOptions });

// Prisma talks to Postgres through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Remote location of the miserend SQLite dump. */
const SQLITE_URL = 'https://miserend.hu/fajlok/sqlite/miserend_v4.sqlite3';

/** Local file the dump is downloaded to and queried from. */
const SQLITE_PATH = '/tmp/miserend_v4.sqlite3';

/**
 * Country names as they appear in the `orszag` column, mapped to two-letter
 * country codes. Names missing from this table are treated as 'HU' by the
 * lookups in this file.
 */
const COUNTRY_MAP: Record<string, string> = {
  // Hungary
  'Magyarország': 'HU',

  // Romania
  'România': 'RO',

  // Slovakia (Slovak and Hungarian spellings)
  'Slovensko': 'SK',
  'Szlovákia': 'SK',

  // Serbia
  'Szerbia-Montenegro': 'RS',
  'Србија': 'RS',

  // Ukraine
  'Ukrajna': 'UA',
  'Україна': 'UA',

  // Other countries
  'Österreich': 'AT',
  'Schweiz/Suisse/Svizzera/Svizra': 'CH',
  'België / Belgique / Belgien': 'BE',
  'Éire / Ireland': 'IE',
  'Россия': 'RU',
};
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One row of the `templomok` table as parsed by `loadChurches`. */
interface MiserendChurch {
  /** miserend church identifier; its string form is stored as `miserendId`. */
  tid: number;
  /** Church name; empty string when the column is empty. */
  nev: string;
  /** Selected from SQLite but not used elsewhere in this file (presumably an alternate/common name — TODO confirm). */
  ismertnev: string | null;
  /** Country name, translated to a code through `COUNTRY_MAP`. */
  orszag: string | null;
  /** Written to the church's `state` field on creation. */
  megye: string | null;
  /** Written to the church's `city` field on creation. */
  varos: string | null;
  /** Written to the church's `address` field. */
  cim: string | null;
  /** Latitude; rows with a NULL, zero or unparsable value are excluded. */
  lat: number;
  /** Longitude; rows with a NULL, zero or unparsable value are excluded. */
  lng: number;
}
|
||||
|
||||
/** One row of the `misek` table as parsed by `loadMassesForChurch`. */
interface MiserendMass {
  /** Mass entry identifier. */
  mid: number;
  /** Identifier of the church the entry belongs to. */
  tid: number;
  /** Start date in MMDD format (e.g. 104 = Jan 4). */
  datumtol: number;
  /** Parsed but not used in this file (presumably the end date in the same MMDD format — TODO confirm). */
  datumig: number;
  /** Time of day as HH:MM:SS. */
  ido: string;
  /** Parsed but not used in this file (presumably a language marker — TODO confirm). */
  nyelv: string | null;
}
|
||||
|
||||
/** A weekly recurring mass slot produced by `deduceSchedules`. */
interface ParsedSchedule {
  /** Day of week as returned by `Date.prototype.getDay()`: 0 = Sunday … 6 = Saturday. */
  dayOfWeek: number;
  /** Time of day as "HH:MM". */
  time: string;
}
|
||||
|
||||
/** Running counters for one import run; printed in the final summary. */
interface ImportStats {
  /** Churches read from the SQLite dump. */
  churchesFetched: number;
  /** Churches matched to a row that already exists in our database. */
  churchesMatched: number;
  /** Churches inserted as new rows (or that would be, in dry-run mode). */
  churchesCreated: number;
  /** Churches skipped because a write hit a unique constraint. */
  churchesSkipped: number;
  /** Mass schedule rows written. */
  schedulesCreated: number;
  /** Failures that were logged rather than aborting the run. */
  errors: number;
}
|
||||
|
||||
/** Command-line options accepted by this script (see `parseArgs`). */
interface CLIArgs {
  /** `--all`: import every church. */
  all: boolean;
  /** `--dry-run`: only report what would happen. */
  dryRun: boolean;
  /** `--resume-from <n>`: number of leading churches to skip. */
  resumeFrom?: number;
  /** `--id <tid>`: restrict the run to one miserend church id. */
  churchId?: string;
  /** `--job-id <uuid>`: background job row to update with progress. */
  jobId?: string;
}
|
||||
|
||||
// ─── SQLite Helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run one SQL statement against the downloaded miserend database with the
 * `sqlite3` command-line tool and return its trimmed standard output.
 *
 * The tool is invoked through `execFileSync` (no shell), so the query is
 * passed as a single argument. Callers in this file split the output on
 * newlines and `|`.
 *
 * @param query SQL text handed to `sqlite3`.
 * @returns Trimmed stdout, or `''` when the command fails or prints nothing.
 */
function sqliteQuery(query: string): string {
  try {
    return execFileSync('sqlite3', [SQLITE_PATH, query], {
      encoding: 'utf-8',
      maxBuffer: 100 * 1024 * 1024, // 100MB
    }).trim();
  } catch (error) {
    // Stay best-effort (callers treat '' as "no rows"), but make the failure
    // visible: a missing sqlite3 binary or a corrupt download would otherwise
    // look exactly like an empty table.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`sqlite3 query failed: ${reason}`);
    return '';
  }
}
|
||||
|
||||
/**
 * Download the miserend SQLite dump from `SQLITE_URL` to `SQLITE_PATH` using
 * `curl`, then log the size of the downloaded file.
 *
 * @throws If curl exits non-zero (including HTTP error responses), if it
 *   exceeds the 120 s timeout, or if the resulting file is empty.
 */
function downloadSqlite(): void {
  console.log('Downloading miserend SQLite database...');

  // -f: treat HTTP errors as failures instead of saving the error page as the
  //     database; -sS: no progress meter but still report errors; -L: follow
  //     redirects.
  execFileSync('curl', ['-fsSL', '-o', SQLITE_PATH, SQLITE_URL], { timeout: 120000 });

  const size = fs.statSync(SQLITE_PATH).size;
  if (size === 0) {
    throw new Error(`Downloaded file ${SQLITE_PATH} is empty`);
  }
  console.log(`Downloaded ${(size / 1024 / 1024).toFixed(1)}MB`);
}
|
||||
|
||||
/**
 * Read every church that has usable coordinates from the `templomok` table.
 *
 * Each output line of `sqliteQuery` is split on `|` into the nine selected
 * columns. Rows whose id or coordinates do not parse as numbers are dropped.
 * NOTE(review): a text column containing `|` or a newline would misalign the
 * fields — confirm the data never contains those characters.
 *
 * @returns Parsed churches; an empty array when the query returns nothing.
 */
function loadChurches(): MiserendChurch[] {
  const output = sqliteQuery(
    "SELECT tid, nev, ismertnev, orszag, megye, varos, cim, lat, lng FROM templomok WHERE lat IS NOT NULL AND lng IS NOT NULL AND lat != 0 AND lng != 0;"
  );
  if (!output) return [];

  const churches: MiserendChurch[] = [];
  for (const row of output.split('\n')) {
    const cols = row.split('|');
    const tid = parseInt(cols[0]);
    const lat = parseFloat(cols[7]);
    const lng = parseFloat(cols[8]);

    // Skip rows without a numeric id or numeric coordinates.
    if (isNaN(tid) || isNaN(lat) || isNaN(lng)) continue;

    churches.push({
      tid,
      nev: cols[1] || '',
      ismertnev: cols[2] || null,
      orszag: cols[3] || null,
      megye: cols[4] || null,
      varos: cols[5] || null,
      cim: cols[6] || null,
      lat,
      lng,
    });
  }
  return churches;
}
|
||||
|
||||
/**
 * Read all `misek` rows that belong to one church.
 *
 * Each output line of `sqliteQuery` is split on `|` into the six selected
 * columns. Rows without a numeric `mid` or without a time value are dropped.
 *
 * @param tid miserend church id to look up.
 * @returns Parsed mass entries; an empty array when the query returns nothing.
 */
function loadMassesForChurch(tid: number): MiserendMass[] {
  const output = sqliteQuery(
    `SELECT mid, tid, datumtol, datumig, ido, nyelv FROM misek WHERE tid=${tid};`
  );
  if (!output) return [];

  const masses: MiserendMass[] = [];
  for (const row of output.split('\n')) {
    const cols = row.split('|');
    const mid = parseInt(cols[0]);
    const ido = cols[4] || '';

    // Keep only rows that carry both an id and a time.
    if (isNaN(mid) || !ido) continue;

    masses.push({
      mid,
      tid: parseInt(cols[1]),
      datumtol: parseInt(cols[2]),
      datumig: parseInt(cols[3]),
      ido,
      nyelv: cols[5] || null,
    });
  }
  return masses;
}
|
||||
|
||||
// ─── Schedule Deduction ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Deduce a weekly recurring schedule from date-specific mass entries.
 *
 * Each entry's `datumtol` is an MMDD number (e.g. 104 = Jan 4). It is placed
 * in the current calendar year to obtain a day of week, and every distinct
 * day-of-week + time combination is emitted once, in first-seen order.
 *
 * Entries are skipped when:
 *  - the time is not a valid HH:MM (a one-digit hour is zero-padded), or is
 *    exactly 00:00;
 *  - `datumtol` is not an integer (e.g. NaN from an empty column);
 *  - the month/day pair is not a real date in the current year (e.g. Feb 30),
 *    which `Date` would otherwise silently roll over into the next month.
 *
 * @param masses Mass entries for a single church.
 * @returns Unique `{ dayOfWeek, time }` pairs, `dayOfWeek` 0 = Sunday … 6 = Saturday.
 */
function deduceSchedules(masses: MiserendMass[]): ParsedSchedule[] {
  const seen = new Set<string>();
  const schedules: ParsedSchedule[] = [];

  // Hours (1–2 digits) and minutes, followed by ":SS" or end of string.
  const timePattern = /^(\d{1,2}):([0-5]\d)(?::|$)/;

  // Use current year for date conversion
  const year = new Date().getFullYear();

  for (const mass of masses) {
    const match = timePattern.exec(mass.ido);
    if (!match) continue;
    const hour = Number(match[1]);
    if (hour > 23) continue;
    const time = `${String(hour).padStart(2, '0')}:${match[2]}`;
    if (time === '00:00') continue;

    // Convert MMDD to a Date to get day of week
    const mmdd = mass.datumtol;
    // NaN would slip through the range comparisons below (they are all false).
    if (!Number.isInteger(mmdd)) continue;
    const month = Math.floor(mmdd / 100);
    const day = mmdd % 100;
    if (month < 1 || month > 12 || day < 1 || day > 31) continue;

    const date = new Date(year, month - 1, day);
    // Reject dates that overflowed into another month (Feb 30, Apr 31, ...).
    if (date.getMonth() !== month - 1 || date.getDate() !== day) continue;
    const dayOfWeek = date.getDay(); // 0=Sun, 1=Mon, ..., 6=Sat

    const key = `${dayOfWeek}:${time}`;
    if (!seen.has(key)) {
      seen.add(key);
      schedules.push({ dayOfWeek, time });
    }
  }

  return schedules;
}
|
||||
|
||||
// ─── Database Operations ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fetch the churches already stored for the given countries, selecting the
 * id, name, coordinates, per-source external ids, source and contact fields.
 * The rows are returned as the `ExistingChurch` list passed to
 * `findDuplicateChurch`.
 *
 * @param countryCodes Country codes to load.
 * @returns All stored churches whose `country` is one of `countryCodes`.
 */
async function loadExistingChurches(countryCodes: string[]): Promise<ExistingChurch[]> {
  const countryList = countryCodes.join(', ');
  console.log(`Loading existing churches for countries: ${countryList}...`);

  const rows = await prisma.church.findMany({
    where: { country: { in: countryCodes } },
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,

      // External ids, one per import source
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,

      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${rows.length} existing churches`);
  return rows;
}
|
||||
|
||||
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Import a single miserend church: match it against the known churches, then
 * either update the matched row or create a new one, and store its deduced
 * weekly schedule.
 *
 * Behaviour:
 *  - Dry run: only counts the church as matched or created; nothing is read
 *    from `misek` and nothing is written.
 *  - Match found: sets `miserendId` on the matched row (and `address` when the
 *    row has none). If that update hits a unique constraint the church is
 *    counted as skipped only. When schedules were deduced, the row's existing
 *    schedules are replaced inside one transaction.
 *  - No match: creates the church, appends it to `existingChurches` so later
 *    candidates in the same run can match it, then stores its schedules inside
 *    one transaction. A schedule failure is reported as a schedule error and
 *    the created church is kept.
 *
 * @param church Church row read from the miserend SQLite dump.
 * @param existingChurches Known churches; mutated when a new church is created.
 * @param dryRun When true, no database writes are performed.
 * @param stats Counters updated in place.
 * @throws Re-throws any error from the matched-row update that is not a
 *   unique-constraint violation.
 */
async function processChurch(
  church: MiserendChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  const miserendId = String(church.tid);
  // Unknown or missing country names fall back to Hungary.
  const country = church.orszag ? (COUNTRY_MAP[church.orszag] || 'HU') : 'HU';

  const candidate = {
    name: church.nev,
    lat: church.lat,
    lng: church.lng,
    miserendId,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
    } else {
      stats.churchesCreated++;
    }
    return;
  }

  // Deduce schedules (only needed when we are going to write).
  const schedules = deduceSchedules(loadMassesForChurch(church.tid));

  if (duplicate) {
    const updateData: Record<string, unknown> = { miserendId };

    // Only fill in the address when the existing row has none.
    if (!duplicate.address && church.cim) updateData.address = church.cim;

    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      if (error instanceof Error && error.message.includes('Unique constraint')) {
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }

    // Count the match only once the row was actually updated, so a skipped
    // church is not reported as both matched and skipped.
    stats.churchesMatched++;

    if (schedules.length > 0) {
      try {
        await prisma.$transaction(async (tx) => {
          // Replace whatever schedules the matched church had before.
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: duplicate.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'Hungarian',
            })),
          });
          await tx.church.update({
            where: { id: duplicate.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(` Error saving schedules for ${miserendId}: ${error instanceof Error ? error.message : error}`);
      }
    }
    return;
  }

  try {
    const newChurch = await prisma.church.create({
      data: {
        name: church.nev,
        latitude: church.lat,
        longitude: church.lng,
        address: church.cim,
        city: church.varos,
        state: church.megye,
        country,
        miserendId,
        source: 'miserend',
        websiteLanguage: 'hu',
      },
    });
    stats.churchesCreated++;

    // Make the new church visible to the matcher for the rest of this run.
    existingChurches.push({
      id: newChurch.id,
      name: church.nev,
      latitude: church.lat,
      longitude: church.lng,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: null,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId,
      kerknetId: null,
      gottesdienstzeitenId: null,
      discovermassId: null,
      source: 'miserend',
      website: null,
      phone: null,
      address: church.cim,
    });

    if (schedules.length > 0) {
      // Handled separately from church creation: the church already exists at
      // this point, so a failure here must not be reported as a creation
      // error or as a skipped church.
      try {
        await prisma.$transaction(async (tx) => {
          await tx.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: newChurch.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'Hungarian',
            })),
          });
          await tx.church.update({
            where: { id: newChurch.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(` Error saving schedules for ${miserendId}: ${error instanceof Error ? error.message : error}`);
      }
    }
  } catch (error) {
    if (error instanceof Error && error.message.includes('Unique constraint')) {
      stats.churchesSkipped++;
      return;
    }
    stats.errors++;
    console.error(` Error creating ${miserendId}: ${error instanceof Error ? error.message : error}`);
  }
}
|
||||
|
||||
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the command-line arguments of this script from `process.argv`.
 *
 * Recognised flags: `--all`, `--dry-run`, `--resume-from <n>`, `--id <tid>`,
 * `--job-id <uuid>`, `--help` / `-h`.
 *
 * The process exits with status 1 when a value-taking flag has no value, when
 * `--resume-from` is not a non-negative integer, or when neither `--all` nor
 * `--id` is given. `--help` prints usage and exits with status 0.
 * Unrecognised arguments are reported with a warning and ignored.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Return the value that follows `flag`, or exit when it is missing.
  const requireValue = (flag: string, value: string | undefined): string => {
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    switch (flag) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = requireValue(flag, args[++i]);
        // A non-numeric value would become NaN and be silently ignored later.
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--id':
        result.churchId = requireValue(flag, args[++i]);
        break;
      case '--job-id':
        result.jobId = requireValue(flag, args[++i]);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-miserend.ts [options]

Options:
  --all               Import all churches
  --id <tid>          Import a single church by miserend ID
  --dry-run           No database writes, just report what would happen
  --resume-from <n>   Skip first N churches
  --job-id <uuid>     Background job tracking ID
  --help, -h          Show this help message

Examples:
  npx tsx scripts/import-miserend.ts --id 37 --dry-run
  npx tsx scripts/import-miserend.ts --all --dry-run
  npx tsx scripts/import-miserend.ts --all
`);
        process.exit(0);
        break;
      default:
        console.warn(`Warning: ignoring unrecognized argument "${flag}"`);
        break;
    }
  }

  if (!result.all && !result.churchId) {
    console.error('Error: specify --all or --id <miserend_tid>');
    process.exit(1);
  }

  return result;
}
|
||||
|
||||
/**
 * Format a duration in milliseconds as `"Xh Ym Zs"`, `"Ym Zs"` or `"Zs"`,
 * omitting leading units that are zero. Fractions of a second are dropped.
 *
 * Negative or non-finite input (possible when the wall clock is adjusted
 * between two `Date.now()` readings) is reported as `"0s"` rather than
 * `"-1s"` or `"NaNs"`.
 *
 * @param ms Duration in milliseconds.
 * @returns Human-readable duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Number.isFinite(ms) && ms > 0 ? Math.floor(ms / 1000) : 0;

  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds / 60) % 60;
  const seconds = totalSeconds % 60;

  if (hours > 0) return `${hours}h ${minutes}m ${seconds}s`;
  if (minutes > 0) return `${minutes}m ${seconds}s`;
  return `${seconds}s`;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Entry point of the importer.
 *
 * Steps: parse CLI arguments, print a banner, mark the background job (if any)
 * as running, download the SQLite dump, load its churches, optionally narrow
 * to one church (`--id`) or skip a prefix (`--resume-from`), load the existing
 * churches for the countries involved, process each church, print a summary
 * and record the outcome on the background job.
 *
 * A failure while processing one church is counted and logged; the loop
 * continues with the next church.
 *
 * @throws If the dump cannot be downloaded or no church can be read from it.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('MISEREND.HU (HUNGARY) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.churchId ? `Church ID ${args.churchId}` : 'All churches'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: church index ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (error) {
      // Best-effort: the job row might not exist. Keep going, but say so.
      console.warn(`Could not mark job ${args.jobId} as running: ${error instanceof Error ? error.message : error}`);
    }
  }

  const stats: ImportStats = {
    churchesFetched: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesCreated: 0,
    errors: 0,
  };

  // Download SQLite database
  downloadSqlite();

  // Load churches from SQLite
  let churches = loadChurches();
  stats.churchesFetched = churches.length;
  console.log(`Found ${churches.length} churches with coordinates in SQLite\n`);

  // An empty result means the download or the sqlite3 query failed; do not
  // carry on and report an empty import as a success.
  if (churches.length === 0) {
    throw new Error(`No churches could be read from ${SQLITE_PATH}`);
  }

  if (args.churchId) {
    churches = churches.filter(c => String(c.tid) === args.churchId);
    if (churches.length === 0) {
      console.error(`Church ID ${args.churchId} not found in SQLite database`);
      // Signal the failure to the caller instead of exiting with status 0.
      process.exitCode = 1;
      return;
    }
  }

  // Get unique country codes from the data
  const countryCodes = [...new Set(churches.map(c => {
    return c.orszag ? (COUNTRY_MAP[c.orszag] || 'HU') : 'HU';
  }))];
  const existingChurches = await loadExistingChurches(countryCodes);

  if (args.resumeFrom) {
    churches = churches.slice(args.resumeFrom);
    console.log(`Resuming from index ${args.resumeFrom} (${churches.length} remaining)\n`);
  }

  console.log(`Processing ${churches.length} churches\n`);

  for (let i = 0; i < churches.length; i++) {
    const church = churches[i];

    // Progress line every 200 churches.
    if (i % 200 === 0) {
      const elapsed = formatDuration(Date.now() - startTime);
      console.log(`[${i + 1}/${churches.length}] Processing ${church.nev} (${church.tid}) [${elapsed} elapsed]`);
    }

    try {
      await processChurch(church, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing church ${church.tid}: ${error instanceof Error ? error.message : error}`);
    }
  }

  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Churches in SQLite: ${stats.churchesFetched}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules created: ${stats.schedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log('='.repeat(70) + '\n');

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          processed: stats.churchesFetched,
          succeeded: stats.churchesCreated + stats.churchesMatched,
          failed: stats.errors,
          itemsFound: stats.schedulesCreated,
        },
      });
    } catch (error) {
      // Best-effort: the import itself is done; only the bookkeeping failed.
      console.warn(`Could not record the result on job ${args.jobId}: ${error instanceof Error ? error.message : error}`);
    }
  }
}
|
||||
|
||||
// Run the importer. On a fatal error, log it and set a failing exit code
// rather than calling process.exit(): exiting inside the catch handler would
// end the process before the cleanup below has a chance to run.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    // Close the Prisma client and the pg pool so the process can exit.
    await prisma.$disconnect();
    await pool.end();
  });
|
||||
Reference in New Issue
Block a user