Files
ScraperControl/scripts/import-orarimesse.ts

770 lines
24 KiB
TypeScript
Raw Normal View History

#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from OrariMesse.it
*
* OrariMesse.it is the official CEI (Italian Bishops' Conference) platform for
* mass times in Italy. It provides a public REST API organized by diocese.
*
* Import strategy:
* Pass 1: For each diocese, fetch all churches, match them against existing DB
* records (by ICSC code or proximity+name), then upsert.
* Pass 2: For churches with active schedules, fetch the detail endpoint,
* convert the 8-day rolling schedule to a recurring weekly one, then replace the stored mass schedules.
*
* Usage:
* npx tsx scripts/import-orarimesse.ts --all
* npx tsx scripts/import-orarimesse.ts --diocese roma
* npx tsx scripts/import-orarimesse.ts --all --dry-run
* npx tsx scripts/import-orarimesse.ts --all --schedules-only
* npx tsx scripts/import-orarimesse.ts --all --resume-from napoli
* npx tsx scripts/import-orarimesse.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL when set, otherwise a local default database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password portion of the URL replaced by "***".
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// TLS (without certificate verification) is switched on only when the URL contains "neon".
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma talks to the database through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Base URL that every endpoint name is appended to.
const API_BASE = 'https://orarimesse.it/api';
// Sent as the User-Agent header on every API request.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Pause (ms) applied before diocese-level requests (diocese list, per-diocese church list).
const DIOCESE_DELAY_MS = 2000;
// Pause (ms) applied before each per-church detail request.
const DETAIL_DELAY_MS = 1000;
// ─── Italian Day Map ─────────────────────────────────────────────────────────
/**
 * Lowercase Italian weekday name → day-of-week number (0 = domenica … 6 = sabato).
 * Each weekday lists every accepted spelling (with and without the accent);
 * the position of the group in the list is the day-of-week number.
 */
const ITALIAN_DAY_MAP: Record<string, number> = Object.fromEntries(
  [
    ['domenica'],
    ['lunedì', 'lunedi'],
    ['martedì', 'martedi'],
    ['mercoledì', 'mercoledi'],
    ['giovedì', 'giovedi'],
    ['venerdì', 'venerdi'],
    ['sabato'],
  ].flatMap((spellings, dayOfWeek) =>
    spellings.map((spelling): [string, number] => [spelling, dayOfWeek]),
  ),
);
// ─── Types ───────────────────────────────────────────────────────────────────
/** One diocese entry from the `getDiocesi` endpoint. */
interface OrariMesseDiocese {
// NOTE(review): presumably the diocese's CEI code; not read in this file.
codice_cei: string;
// Display name; printed in the per-diocese progress log.
title: string;
// Identifier sent as the `diocesi` query parameter and compared with --diocese / --resume-from.
slug: string;
// Not read in this file.
url: string;
// NOTE(review): presumably the number of churches in the diocese; not read in this file.
countChiese: number;
}
/** One church entry from the `listaChiese` array of the `getListaChiese` response. */
interface OrariMesseChurch {
// API-side church id; sent as `idchurch` to the detail endpoint and used as the key of the idchurch → DB id map.
idchurch: number;
// Copied to the DB `address` field when creating, or when the matched record has none.
address: string;
// Church name; used for duplicate matching and stored on newly created records.
name: string;
// Not read in this file. NOTE(review): presumably an alternative "known as" name.
conosciutaCome: string;
// Not read in this file.
isopen: boolean;
// Only tested for truthiness: a non-empty value marks the church as having an active schedule (Pass 2 filter).
nextmass: string;
// Latitude / longitude as strings; parsed with parseFloat. Churches with NaN or 0 coordinates are skipped.
lat: string;
lon: string;
// Website URL; copied to `website` (and sets `hasWebsite`) when present.
sito: string;
// Copied to the DB `email` field when present.
emailLdc: string;
// ICSC code; stored as `orarimesseId` and used for duplicate matching.
icsc: string;
// Copied to the DB `city` field when present.
comune: string;
// Not read in this file.
tipologia: string;
// Stored as `wheelchairAccess` on newly created churches.
accessibile: boolean;
}
/**
* Element of the array returned by the `getListaChiese` endpoint.
* Only `listaChiese` is read in this file.
*/
interface OrariMesseDioceseResponse {
codice_cei: string;
title: string;
slug: string;
countChiese: number;
// Churches of the diocese; input to both import passes.
listaChiese: OrariMesseChurch[];
}
/** One mass inside a day of the detail response. */
interface OrariMesseMass {
// Not read in this file.
idmass: number;
// Time with a dot separator, e.g. "07.00"; converted to "07:00" before storing.
time: string;
// Free-text note; stored as the schedule's `notes` (null when empty).
noteOrarioMessa: string;
}
/** One calendar day of the rolling schedule in the detail response. */
interface OrariMesseDay {
// Day label such as "Giovedì 26 Febbraio"; only its first word (the weekday) is interpreted.
day: string;
// Masses listed for that day.
mass: OrariMesseMass[];
}
/**
* Payload of the `getDettaglioMessa` endpoint for a single church.
* Only `days` is read in this file; the remaining fields are declared but unused here.
*/
interface OrariMesseDetail {
idchurch: number;
name: string;
address: string;
lat: string;
lon: string;
icsc: string;
comune: string;
diocesi: string;
parroco: string;
telefono: string;
email: string;
sito: string;
// Rolling schedule (described in this file as an 8-day window) that is turned into weekly recurring masses.
days: OrariMesseDay[];
}
/** Running counters for one importer run; printed in the final summary and stored as the job result. */
interface ImportStats {
// Dioceses whose iteration finished without throwing (including those with no churches).
diocesesProcessed: number;
// Every church seen in Pass 1, before any filtering.
churchesFound: number;
// Churches matched to an existing DB record.
churchesMatched: number;
// Churches inserted as new records (in dry-run: churches that would be inserted).
churchesCreated: number;
// Churches dropped for unusable coordinates or a unique-constraint conflict.
churchesSkipped: number;
// Churches whose detail response contained at least one day.
schedulesProcessed: number;
// Recurring mass rows written (in dry-run: rows that would be written).
massSchedulesCreated: number;
// Diocese iterations that threw plus schedule saves that failed.
errors: number;
}
/** Parsed command-line options. */
interface CLIArgs {
// --all: process every diocese returned by the API.
all: boolean;
// --diocese <slug>: process only the diocese with this slug.
diocese?: string;
// --dry-run: no database writes; counters report what would happen.
dryRun: boolean;
// --schedules-only: skip Pass 1 and only import schedules.
schedulesOnly: boolean;
// --resume-from <slug>: skip dioceses that come before this slug in the list.
resumeFrom?: string;
// --job-id <uuid>: backgroundJob record to update with status and result.
jobId?: string;
}
// ─── API Client ──────────────────────────────────────────────────────────────
// Running total of API requests attempted in this run.
let requestCount = 0;

/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * GETs `${API_BASE}/${endpoint}` with the given query parameters and unwraps
 * the `data` field of the JSON envelope.
 *
 * The first request of the run is sent immediately; every later request first
 * waits `delayMs` milliseconds.
 *
 * @param endpoint Endpoint name appended to the API base URL.
 * @param params   Query-string parameters.
 * @param delayMs  Pause applied before the request (skipped for the first request).
 * @returns The envelope's `data`, or null on a non-2xx status, an envelope that is
 *          not `status === true` / `code === 'OK'`, or any thrown fetch/parse error
 *          (each case is logged to stderr).
 */
async function fetchApi<T>(endpoint: string, params: Record<string, string> = {}, delayMs: number): Promise<T | null> {
  const hasPriorRequest = requestCount > 0;
  if (hasPriorRequest) {
    await delay(delayMs);
  }
  requestCount += 1;

  const url = new URL(`${API_BASE}/${endpoint}`);
  Object.entries(params).forEach(([name, value]) => {
    url.searchParams.set(name, value);
  });

  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'application/json',
  };

  try {
    const response = await fetch(url.toString(), { headers });
    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }

    const envelope = await response.json() as { status: boolean; code: string; data: T };
    const succeeded = envelope.status === true && envelope.code === 'OK';
    if (!succeeded) {
      // Log only the first 200 characters of the unexpected payload.
      console.error(` API error for ${url}: ${JSON.stringify(envelope).substring(0, 200)}`);
      return null;
    }
    return envelope.data;
  } catch (err) {
    const reason = err instanceof Error ? err.message : err;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
/** Fetches the diocese list (`getDiocesi`); yields an empty array when the request fails. */
async function fetchDioceses(): Promise<OrariMesseDiocese[]> {
  const dioceses = await fetchApi<OrariMesseDiocese[]>('getDiocesi', {}, DIOCESE_DELAY_MS);
  if (!dioceses) {
    return [];
  }
  return dioceses;
}
/**
 * Fetches the compact church list of one diocese (`getListaChiese`).
 *
 * @param slug Diocese slug sent as the `diocesi` parameter.
 * @returns The first element of the response array, or null when the request
 *          failed or the array is empty.
 */
async function fetchChurchesInDiocese(slug: string): Promise<OrariMesseDioceseResponse | null> {
  const query = { diocesi: slug, type: 'compact' };
  const payload = await fetchApi<OrariMesseDioceseResponse[]>('getListaChiese', query, DIOCESE_DELAY_MS);

  // The API wraps the single diocese object in an array.
  if (!payload || payload.length === 0) {
    return null;
  }
  return payload[0];
}
/**
 * Fetches the detail record (`getDettaglioMessa`) of one church.
 *
 * @param idchurch API-side church id.
 * @returns The detail payload, or null when the request fails.
 */
async function fetchChurchDetail(idchurch: number): Promise<OrariMesseDetail | null> {
  const query = { idchurch: String(idchurch) };
  return fetchApi<OrariMesseDetail>('getDettaglioMessa', query, DETAIL_DELAY_MS);
}
// ─── Day/Time Conversion ─────────────────────────────────────────────────────
/**
 * Extracts the weekday from a day label such as "Giovedì 26 Febbraio".
 *
 * The label is Unicode-normalised (so a decomposed "i" + combining grave still
 * matches "ì"), trimmed, lowercased and split on any whitespace; the first word
 * is then looked up in ITALIAN_DAY_MAP.
 *
 * @param dayString Day label from the API.
 * @returns Day of week (0 = Sunday … 6 = Saturday), or null when the first word
 *          is not a known Italian weekday name.
 */
function parseItalianDay(dayString: string): number | null {
  const [firstWord = ''] = dayString.normalize('NFC').trim().toLowerCase().split(/\s+/);

  // Own-property check: a plain index lookup would also return inherited members
  // of Object.prototype (e.g. "constructor"), which are not day numbers.
  if (!Object.prototype.hasOwnProperty.call(ITALIAN_DAY_MAP, firstWord)) {
    return null;
  }
  return ITALIAN_DAY_MAP[firstWord] ?? null;
}
/**
 * Converts an API time such as "07.00" into "HH:MM" form ("07:00").
 *
 * Surrounding whitespace is removed and a single-digit hour is zero-padded, so
 * "7.30" and "07.30" yield the same value (and therefore the same dedupe key).
 * Input that is not `H.MM` / `HH.MM` (dot or colon separator) falls back to the
 * previous best-effort behaviour: the first "." is replaced by ":".
 *
 * @param time Time string from the API.
 * @returns The normalised time string.
 */
function convertTime(time: string): string {
  const trimmed = time.trim();
  const match = /^(\d{1,2})[.:](\d{2})$/.exec(trimmed);
  if (!match) {
    return trimmed.replace('.', ':');
  }
  const [, hours = '', minutes = ''] = match;
  return `${hours.padStart(2, '0')}:${minutes}`;
}
/** One weekly recurring mass derived from the API's rolling schedule. */
interface RecurringMass {
// Day of week as produced by parseItalianDay (0 = domenica … 6 = sabato).
dayOfWeek: number;
// Time as returned by convertTime, e.g. "07:00".
time: string;
// The API's free-text note for the mass, or null when it is empty.
notes: string | null;
}
/**
 * Collapses the API's rolling day list into a weekly recurring schedule.
 *
 * The rolling window can contain the same weekday twice (e.g. this Thursday and
 * next Thursday), so entries are de-duplicated on dayOfWeek + time; the first
 * occurrence (and its note) wins.
 *
 * Malformed records are skipped instead of throwing: days with an unparseable
 * label, days whose `mass` is not an array, and masses without a usable `time`.
 *
 * @param days Day entries from the detail response.
 * @returns Unique recurring masses in first-seen order.
 */
function convertScheduleToRecurring(days: OrariMesseDay[]): RecurringMass[] {
  const seen = new Set<string>();
  const result: RecurringMass[] = [];

  for (const day of days ?? []) {
    if (!day || typeof day.day !== 'string') continue;

    const dayOfWeek = parseItalianDay(day.day);
    if (dayOfWeek === null) continue;

    // The payload is untyped JSON: a day without masses may carry null/undefined here.
    const masses = Array.isArray(day.mass) ? day.mass : [];
    for (const mass of masses) {
      if (!mass || typeof mass.time !== 'string' || mass.time.trim() === '') continue;

      const time = convertTime(mass.time);
      const key = `${dayOfWeek}:${time}`;
      if (seen.has(key)) continue;
      seen.add(key);

      result.push({
        dayOfWeek,
        time,
        notes: mass.noteOrarioMessa || null,
      });
    }
  }
  return result;
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Reads every church whose country is 'IT', selecting the fields the duplicate
 * matcher works with (ids from the various sources, name, coordinates, contact data).
 *
 * @returns The selected rows, used as the in-memory dedup set for the run.
 */
async function loadExistingItalianChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Italian churches for deduplication...');

  const italianChurches = await prisma.church.findMany({
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
    where: { country: 'IT' },
  });

  console.log(`Loaded ${italianChurches.length} existing Italian churches`);
  return italianChurches;
}
// ─── Pass 1: Church Upsert ──────────────────────────────────────────────────
/**
 * Tells whether `error` is a unique-constraint violation.
 * Checks Prisma's documented error code (P2002) first and keeps the previous
 * message-text match as a fallback.
 */
function isUniqueConstraintError(error: unknown): boolean {
  if (typeof error !== 'object' || error === null) return false;
  if ((error as { code?: unknown }).code === 'P2002') return true;
  return error instanceof Error && error.message.includes('Unique constraint');
}

/**
 * Pass 1: matches each API church of one diocese against the known churches and
 * either updates the matched record or creates a new one.
 *
 * Side effects: updates `stats`, fills `idchurchToDbId` (API idchurch → DB id) for
 * Pass 2, appends newly created churches to `existingChurches` so later churches
 * in the same run can match them, and (unless `dryRun`) writes to the database.
 *
 * @param dioceseSlug      Slug stored as `diocese` on created records / records lacking one.
 * @param churches         Churches of the diocese as returned by the API.
 * @param existingChurches In-memory dedup set; mutated when a church is created.
 * @param idchurchToDbId   Output map consumed by Pass 2.
 * @param dryRun           When true nothing is written; counters reflect what would happen.
 * @param stats            Run counters, updated in place.
 * @throws Any database error other than a unique-constraint violation.
 */
async function processChurchesForDiocese(
  dioceseSlug: string,
  churches: OrariMesseChurch[],
  existingChurches: ExistingChurch[],
  idchurchToDbId: Map<number, string>,
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  for (const church of churches) {
    stats.churchesFound++;

    // Parse coordinates; missing, unparseable or zero coordinates make the church unusable.
    const lat = parseFloat(church.lat);
    const lon = parseFloat(church.lon);
    if (Number.isNaN(lat) || Number.isNaN(lon) || lat === 0 || lon === 0) {
      stats.churchesSkipped++;
      continue;
    }

    // Build candidate for dedup
    const candidate = {
      name: church.name,
      lat,
      lng: lon,
      orarimesseId: church.icsc || undefined,
    };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (dryRun) {
      if (duplicate) {
        stats.churchesMatched++;
        // Track idchurch for Pass 2 even in dry run (only possible for existing records).
        idchurchToDbId.set(church.idchurch, duplicate.id);
      } else {
        stats.churchesCreated++;
      }
      continue;
    }

    if (duplicate) {
      // Update existing church: set orarimesseId, fill missing fields
      stats.churchesMatched++;
      const updateData: Record<string, unknown> = {
        orarimesseId: church.icsc || undefined,
        orarimesseLastSyncedAt: new Date(),
      };
      if (!duplicate.address && church.address) updateData.address = church.address;
      if (!duplicate.website && church.sito) {
        updateData.website = church.sito;
        updateData.hasWebsite = true;
      }

      // diocese / city / email are not part of ExistingChurch, so read them from the DB record.
      const dbRecord = await prisma.church.findUnique({
        where: { id: duplicate.id },
        select: { diocese: true, city: true, email: true },
      });
      if (dbRecord && !dbRecord.diocese && dioceseSlug) {
        updateData.diocese = dioceseSlug;
      }
      if (dbRecord && !dbRecord.city && church.comune) {
        updateData.city = church.comune;
      }
      if (dbRecord && !dbRecord.email && church.emailLdc) {
        updateData.email = church.emailLdc;
      }

      try {
        await prisma.church.update({
          where: { id: duplicate.id },
          data: updateData,
        });
      } catch (error) {
        // Unique constraint violation on orarimesseId — another church already has this ICSC
        if (isUniqueConstraintError(error)) {
          stats.churchesSkipped++;
          continue;
        }
        throw error;
      }
      idchurchToDbId.set(church.idchurch, duplicate.id);
    } else {
      // Create new church
      try {
        const newChurch = await prisma.church.create({
          data: {
            name: church.name,
            latitude: lat,
            longitude: lon,
            address: church.address || null,
            city: church.comune || null,
            country: 'IT',
            diocese: dioceseSlug,
            website: church.sito || null,
            email: church.emailLdc || null,
            hasWebsite: !!church.sito,
            orarimesseId: church.icsc || null,
            orarimesseLastSyncedAt: new Date(),
            source: 'orarimesse',
            wheelchairAccess: church.accessibile || false,
          },
        });
        stats.churchesCreated++;

        // Add to in-memory array for within-run dedup
        existingChurches.push({
          id: newChurch.id,
          name: church.name,
          latitude: lat,
          longitude: lon,
          osmId: null,
          baiduId: null,
          masstimesId: null,
          orarimesseId: church.icsc || null,
          massSchedulesPhId: null,
          philmassId: null,
          horariosMisasId: null,
          mszeInfoId: null,
          weekdayMassesId: null,
          messesInfoId: null,
          bohosluzbyId: null,
          miserendId: null,
          kerknetId: null,
          gottesdienstzeitenId: null,
          source: 'orarimesse',
          website: church.sito || null,
          phone: null,
          address: church.address || null,
        });
        idchurchToDbId.set(church.idchurch, newChurch.id);
      } catch (error) {
        // A record with the same unique key already exists: count as skipped, keep going.
        if (isUniqueConstraintError(error)) {
          stats.churchesSkipped++;
          continue;
        }
        throw error;
      }
    }
  }
}
// ─── Pass 2: Mass Schedules ─────────────────────────────────────────────────
/**
 * Pass 2: for every church of the diocese that reports a next mass and is known
 * in our DB, fetches the detail record, converts its rolling schedule into weekly
 * recurring masses and replaces the church's stored mass schedules.
 *
 * Failures are isolated per church: a schedule that cannot be converted or saved
 * is logged and counted in `stats.errors`, and processing continues with the next church.
 *
 * @param churches       Churches of the diocese as returned by the API.
 * @param idchurchToDbId API idchurch → DB id map built in Pass 1 (or schedules-only mode).
 * @param dryRun         When true nothing is written; `massSchedulesCreated` counts what would be written.
 * @param stats          Run counters, updated in place.
 */
async function processSchedulesForDiocese(
  churches: OrariMesseChurch[],
  idchurchToDbId: Map<number, string>,
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  // Filter to churches with active schedules
  const churchesWithMass = churches.filter((c) => c.nextmass);
  if (churchesWithMass.length === 0) return;

  console.log(` Pass 2: Fetching schedules for ${churchesWithMass.length} churches with active masses...`);

  for (const church of churchesWithMass) {
    const dbId = idchurchToDbId.get(church.idchurch);
    if (!dbId) continue; // Church not in our DB (skipped in Pass 1)

    const detail = await fetchChurchDetail(church.idchurch);
    if (!detail || !detail.days || detail.days.length === 0) {
      continue;
    }
    stats.schedulesProcessed++;

    // The detail payload is untyped JSON; a malformed record must not abort the
    // remaining churches of the diocese.
    let recurring: RecurringMass[];
    try {
      recurring = convertScheduleToRecurring(detail.days);
    } catch (error) {
      stats.errors++;
      console.error(` Error converting schedule for idchurch=${church.idchurch}: ${error instanceof Error ? error.message : error}`);
      continue;
    }
    if (recurring.length === 0) continue;

    if (dryRun) {
      stats.massSchedulesCreated += recurring.length;
      continue;
    }

    try {
      // Delete + insert + timestamp run in one transaction so a failure leaves the old schedule intact.
      await prisma.$transaction(async (tx) => {
        // Delete existing mass schedules for this church
        await tx.massSchedule.deleteMany({ where: { churchId: dbId } });
        // Create new recurring schedules
        await tx.massSchedule.createMany({
          data: recurring.map((m) => ({
            churchId: dbId,
            dayOfWeek: m.dayOfWeek,
            time: m.time,
            language: 'Italian',
            notes: m.notes,
          })),
        });
        // Mark church as scraped
        await tx.church.update({
          where: { id: dbId },
          data: { lastScrapedAt: new Date() },
        });
      });
      stats.massSchedulesCreated += recurring.length;
    } catch (error) {
      stats.errors++;
      console.error(` Error saving schedules for idchurch=${church.idchurch}: ${error instanceof Error ? error.message : error}`);
    }
  }
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parses the command-line options.
 *
 * Exits the process with status 1 when an argument is unknown, when a
 * value-taking option has no value (or is followed by another `--flag`), or when
 * neither --all nor --diocese is given. Rejecting unknown arguments matters here:
 * a misspelt `--dry-run` must not silently turn into a live import.
 * `--help` / `-h` prints the usage text and exits with status 0.
 *
 * @param argv Arguments to parse; defaults to the process arguments after the script path.
 * @returns The parsed options.
 */
function parseArgs(argv: string[] = process.argv.slice(2)): CLIArgs {
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    schedulesOnly: false,
  };

  let i = 0;
  // Consumes and returns the value that follows a value-taking option.
  const takeValue = (flag: string): string => {
    const value = argv[i + 1];
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    i++;
    return value;
  };

  for (; i < argv.length; i++) {
    const arg = argv[i];
    switch (arg) {
      case '--all':
        result.all = true;
        break;
      case '--diocese':
        result.diocese = takeValue(arg);
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--schedules-only':
        result.schedulesOnly = true;
        break;
      case '--resume-from':
        result.resumeFrom = takeValue(arg);
        break;
      case '--job-id':
        result.jobId = takeValue(arg);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-orarimesse.ts [options]
Options:
--all Import from all 77 dioceses
--diocese <slug> Import from a single diocese (e.g. "roma")
--dry-run No database writes, just report what would happen
--schedules-only Skip Pass 1 (church upsert), only fetch schedules
--resume-from <slug> Skip dioceses until reaching this slug
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-orarimesse.ts --diocese roma --dry-run
npx tsx scripts/import-orarimesse.ts --all
npx tsx scripts/import-orarimesse.ts --all --schedules-only
npx tsx scripts/import-orarimesse.ts --all --resume-from napoli
`);
        process.exit(0);
      default:
        console.error(`Error: unknown argument "${arg}" (use --help for usage)`);
        process.exit(1);
    }
  }

  if (!result.all && !result.diocese) {
    console.error('Error: specify --all or --diocese <slug>');
    process.exit(1);
  }
  return result;
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Renders a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", dropping the
 * larger units when they are zero. Sub-second remainders are truncated.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);

  if (totalHours > 0) {
    return `${totalHours}h ${totalMinutes % 60}m ${totalSeconds % 60}s`;
  }
  return totalMinutes > 0
    ? `${totalMinutes}m ${totalSeconds % 60}s`
    : `${totalSeconds}s`;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Runs the import: parses the CLI options, loads the dedup set, fetches the
 * diocese list, then for each selected diocese runs Pass 1 (church upsert, unless
 * --schedules-only) and Pass 2 (schedule import), and finally prints a summary.
 *
 * When --job-id is given, the matching backgroundJob record is updated on a
 * best-effort basis at start and at completion.
 *
 * Errors inside one diocese are logged and counted; the run continues with the
 * next diocese. An empty diocese list is counted as an error so that an API
 * failure is not reported as a clean run.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('ORARIMESSE.IT IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.all ? 'All dioceses' : `Single diocese: ${args.diocese}`}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  console.log(`Schedules only: ${args.schedulesOnly ? 'YES' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      // Job might not exist yet, that's fine
    }
  }

  const stats: ImportStats = {
    diocesesProcessed: 0,
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // Load existing Italian churches for dedup
  const existingChurches = await loadExistingItalianChurches();

  // Fetch diocese list
  console.log('Fetching diocese list from OrariMesse.it...');
  const allDioceses = await fetchDioceses();
  console.log(`Found ${allDioceses.length} dioceses\n`);
  if (allDioceses.length === 0) {
    // fetchDioceses() turns a failed request into an empty list; surface that as an error.
    stats.errors++;
    console.error('Diocese list is empty (the API request failed or returned no data); nothing can be imported.');
  }

  // Filter to requested dioceses
  let diocesesToProcess: OrariMesseDiocese[];
  if (args.diocese) {
    const found = allDioceses.find((d) => d.slug === args.diocese);
    if (!found) {
      console.error(`Diocese "${args.diocese}" not found. Available: ${allDioceses.map((d) => d.slug).join(', ')}`);
      process.exit(1);
    }
    diocesesToProcess = [found];
  } else {
    diocesesToProcess = allDioceses;
  }

  // Handle --resume-from
  if (args.resumeFrom) {
    const idx = diocesesToProcess.findIndex((d) => d.slug === args.resumeFrom);
    if (idx === -1) {
      console.error(`Resume diocese "${args.resumeFrom}" not found.`);
      process.exit(1);
    }
    console.log(`Resuming from diocese "${args.resumeFrom}" (skipping ${idx} dioceses)\n`);
    diocesesToProcess = diocesesToProcess.slice(idx);
  }

  // Map OrariMesse idchurch → our DB id (for Pass 2 schedule lookups)
  const idchurchToDbId = new Map<number, string>();

  // Schedules-only mode: index the loaded churches by orarimesseId once, so each
  // API church is resolved with a map lookup instead of a scan of the whole list.
  const dbIdByOrarimesseId = new Map<string, string>();
  if (args.schedulesOnly) {
    console.log('Schedules-only mode: loading existing orarimesseId mappings...');
    const mappedCount = await prisma.church.count({
      where: { orarimesseId: { not: null } },
    });
    for (const existing of existingChurches) {
      // Keep the first record per orarimesseId (same choice a linear find() would make).
      if (existing.orarimesseId && !dbIdByOrarimesseId.has(existing.orarimesseId)) {
        dbIdByOrarimesseId.set(existing.orarimesseId, existing.id);
      }
    }
    // The idchurch map itself is built per diocese, since the idchurch values come from the API.
    console.log(`Found ${mappedCount} churches with orarimesseId in DB\n`);
  }

  // Process each diocese
  for (let i = 0; i < diocesesToProcess.length; i++) {
    const diocese = diocesesToProcess[i];
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${i + 1}/${diocesesToProcess.length}] Diocese: ${diocese.title} (${diocese.slug}) [${elapsed} elapsed]`);

    try {
      // Fetch churches in this diocese
      const dioceseData = await fetchChurchesInDiocese(diocese.slug);
      if (!dioceseData || !dioceseData.listaChiese || dioceseData.listaChiese.length === 0) {
        console.log(` No churches found, skipping`);
        stats.diocesesProcessed++;
        continue;
      }
      const churches = dioceseData.listaChiese;
      console.log(` Found ${churches.length} churches (${churches.filter((c) => c.nextmass).length} with active masses)`);

      // Pass 1: Upsert churches
      if (!args.schedulesOnly) {
        const prevMatched = stats.churchesMatched;
        const prevCreated = stats.churchesCreated;
        const prevSkipped = stats.churchesSkipped;
        await processChurchesForDiocese(
          diocese.slug, churches, existingChurches, idchurchToDbId,
          args.dryRun, stats
        );
        const matched = stats.churchesMatched - prevMatched;
        const created = stats.churchesCreated - prevCreated;
        const skipped = stats.churchesSkipped - prevSkipped;
        console.log(` Pass 1: ${matched} matched, ${created} created, ${skipped} skipped`);
      } else {
        // In schedules-only mode, still need to build idchurch → dbId map
        for (const church of churches) {
          if (!church.icsc) continue;
          const dbId = dbIdByOrarimesseId.get(church.icsc);
          if (dbId) {
            idchurchToDbId.set(church.idchurch, dbId);
          }
        }
      }

      // Pass 2: Import schedules (both figures in the log line are per diocese)
      const prevSchedules = stats.massSchedulesCreated;
      const prevProcessed = stats.schedulesProcessed;
      await processSchedulesForDiocese(churches, idchurchToDbId, args.dryRun, stats);
      const newSchedules = stats.massSchedulesCreated - prevSchedules;
      const newlyProcessed = stats.schedulesProcessed - prevProcessed;
      if (newSchedules > 0) {
        console.log(` Pass 2: ${newlyProcessed} churches processed, ${newSchedules} mass times created`);
      }
      stats.diocesesProcessed++;
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing diocese ${diocese.slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Dioceses processed: ${stats.diocesesProcessed}`);
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`API requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch {
      // Best effort: a missing job record must not fail the import.
    }
  }
}
// Entry point: run the importer, report a fatal error through the exit code,
// and always release the database connections.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit(): exiting here would
    // terminate the process before the cleanup in finally() gets to run.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });