Files
ScraperControl/scripts/import-horariosmisas.ts

1027 lines
32 KiB
TypeScript
Raw Permalink Normal View History

#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from horariosmisas.com (Spain)
*
* horariosmisas.com is a Spanish directory of Catholic parishes with mass
* schedules organized by province and city. The site uses a WordPress sitemap
* structure with ~20 post-sitemap files.
*
* Import strategy:
* 1. Fetch sitemap index → extract post-sitemap*.xml URLs
* 2. Fetch each post sitemap → extract church URLs (3 path segments)
* 3. Filter out non-church URLs (blog, legal pages, daily readings)
* 4. For each church: fetch HTML, parse name/address/phone/website/schedule
* 5. Match against existing ES churches, upsert
* 6. Optional geocoding pass via Nominatim
*
* Usage:
* npx tsx scripts/import-horariosmisas.ts --all
* npx tsx scripts/import-horariosmisas.ts --all --dry-run
* npx tsx scripts/import-horariosmisas.ts --province madrid
* npx tsx scripts/import-horariosmisas.ts --all --geocode
* npx tsx scripts/import-horariosmisas.ts --geocode-only
* npx tsx scripts/import-horariosmisas.ts --all --resume-from 500
* npx tsx scripts/import-horariosmisas.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string comes from DATABASE_URL (loaded from .env.local / .env above),
// falling back to a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target database with the password portion of the URL masked.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// When the URL contains "neon", enable TLS without certificate verification; otherwise leave ssl unset.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma runs on top of the pg pool through the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Root of the scraped site; the sitemap index URL is derived from it.
const SITE_BASE = 'https://horariosmisas.com';
const SITEMAP_INDEX_URL = `${SITE_BASE}/sitemap_index.xml`;
// Sent as the User-Agent header on both site requests and Nominatim requests.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Default pause (ms) before every site request after the first one.
const REQUEST_DELAY_MS = 1500;
// Pause (ms) before every Nominatim query.
const NOMINATIM_DELAY_MS = 1100;
const NOMINATIM_URL = 'https://nominatim.openstreetmap.org/search';
// ─── Types ───────────────────────────────────────────────────────────────────
/** A church page discovered in the sitemaps; fields are taken from the URL path /{province}/{city}/{slug}/. */
interface SitemapChurch {
// First path segment.
province: string;
// Second path segment.
city: string;
// Third path segment; also stored as the church's horariosMisasId.
slug: string;
// Page URL, always ending in a trailing slash.
url: string;
}
/** Contact details extracted from a church page by parseChurchPage. */
interface ParsedChurch {
// Heading text of the page; empty string when no <h1> was found.
name: string;
// Address text following the pin marker, or null when absent.
address: string | null;
// 5-digit postal code found inside the address, or null.
zip: string | null;
// Text following the postal code inside the address, or null.
city: string | null;
// Link text following the "Teléfono:" label, or null.
phone: string | null;
// href following the "Página Web:" label, or null.
website: string | null;
}
/** One weekly mass slot: a single day of the week paired with a single start time. */
interface ParsedSchedule {
dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
time: string; // "05:00", "18:30"
}
/** Run counters, mutated in place during import and geocoding and printed in the final summary. */
interface ImportStats {
// Church pages attempted (incremented at the start of processChurch).
churchesFound: number;
// Pages matched to an already-known church.
churchesMatched: number;
// Pages that resulted in (or, in dry-run, would result in) a new church row.
churchesCreated: number;
// Pages skipped: no name parsed, or a unique-constraint violation on write.
churchesSkipped: number;
// Churches for which a non-empty schedule was saved (or counted in dry-run).
schedulesProcessed: number;
// Total individual day/time entries saved (or counted in dry-run).
massSchedulesCreated: number;
// Churches for which Nominatim returned coordinates.
geocoded: number;
// Churches for which no geocoding query returned a result.
geocodeFailed: number;
// Failed page fetches, failed schedule saves, and exceptions caught per church in main.
errors: number;
}
/** Command-line options as produced by parseArgs. */
interface CLIArgs {
// --all: import every church found in the sitemaps.
all: boolean;
// --province <name>: only import churches whose province path segment equals this value.
province?: string;
// --dry-run: report what would happen without writing to the database.
dryRun: boolean;
// --geocode: run the geocoding pass after the import.
geocode: boolean;
// --geocode-only: skip the import and only run the geocoding pass.
geocodeOnly: boolean;
// --resume-from <n>: number of churches to skip at the start of the list.
resumeFrom?: number;
// --job-id <uuid>: backgroundJob row to update with status and result.
jobId?: string;
}
// ─── Spanish Day Mapping ─────────────────────────────────────────────────────
// Lowercase Spanish day labels (accented and unaccented, singular and plural)
// mapped to day-of-week numbers, where 0 = Sunday and 6 = Saturday.
// resolveDays also scans these keys in insertion order for its substring fallback,
// so the longer "domingos y festivos" label is listed before "domingos".
const DAY_MAP: Record<string, number[]> = {
'domingos y festivos': [0],
'domingos': [0],
'domingo': [0],
'lunes': [1],
'martes': [2],
'miércoles': [3],
'miercoles': [3],
'jueves': [4],
'viernes': [5],
'sábado': [6],
'sabado': [6],
'sábados': [6],
'sabados': [6],
};
// Singular Spanish day names indexed by day-of-week number (index 0 = Sunday); searched by findDayIndex.
const DAY_ORDER = ['domingo', 'lunes', 'martes', 'miércoles', 'jueves', 'viernes', 'sábado'];
// URL patterns to exclude (not church pages).
// Each pattern is tested against the full sitemap URL and requires the
// section name to appear as a complete path segment followed by a slash.
const EXCLUDE_PATTERNS = [
/\/misas-diarias\//,
/\/santos-del-dia\//,
/\/oraciones\//,
/\/noticias\//,
/\/blog\//,
/\/contacto\//,
/\/aviso-legal\//,
/\/politica-de-privacidad\//,
/\/politica-de-cookies\//,
];
// ─── HTTP Client ─────────────────────────────────────────────────────────────
// Number of fetchPage calls made so far: lets the first request skip the
// politeness delay and is printed in the final summary as "HTTP requests".
let requestCount = 0;
/**
 * Pause for the given number of milliseconds.
 *
 * @param ms Time to wait, in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Download a URL as text, throttling every request after the first.
 *
 * @param url Absolute URL to fetch.
 * @param delayMs Pause applied before the request when at least one request was already made.
 * @returns The response body, or null on a non-2xx status or a network error (both are logged).
 */
async function fetchPage(url: string, delayMs: number = REQUEST_DELAY_MS): Promise<string | null> {
  // The very first request goes out immediately; all later ones wait first.
  const isFirstRequest = requestCount === 0;
  if (!isFirstRequest) {
    await delay(delayMs);
  }
  requestCount += 1;

  const requestHeaders = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const res = await fetch(url, { headers: requestHeaders });
    if (res.ok) {
      return await res.text();
    }
    console.error(` HTTP ${res.status} for ${url}`);
    return null;
  } catch (err) {
    const reason = err instanceof Error ? err.message : err;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
// ─── Sitemap Parser ──────────────────────────────────────────────────────────
/**
 * Collect church page URLs by walking the site's sitemap index.
 *
 * Reads the sitemap index, downloads every post-sitemap it lists, keeps URLs
 * whose path has exactly three segments (/{province}/{city}/{slug}/) and that
 * match none of EXCLUDE_PATTERNS, drops repeated slugs (first occurrence wins),
 * and returns the result sorted by province, city, then slug.
 *
 * @returns The filtered, de-duplicated, sorted list of church pages.
 * @throws Error when the sitemap index itself cannot be fetched. Individual
 *         post-sitemaps that fail are logged and skipped.
 */
async function fetchChurchUrlsFromSitemaps(): Promise<SitemapChurch[]> {
  console.log(`Fetching sitemap index: ${SITEMAP_INDEX_URL}`);
  const indexBody = await fetchPage(SITEMAP_INDEX_URL);
  if (!indexBody) {
    throw new Error('Failed to fetch sitemap index');
  }

  // Post-sitemap files referenced by the index
  const postSitemaps = Array.from(
    indexBody.matchAll(/<loc>(https:\/\/horariosmisas\.com\/post-sitemap\d*\.xml)<\/loc>/g),
    (hit) => hit[1],
  );
  console.log(`Found ${postSitemaps.length} post-sitemap files`);

  // Every <loc> on the site's own domain, across all post-sitemaps
  const pageUrls: string[] = [];
  for (const sitemapUrl of postSitemaps) {
    console.log(` Fetching ${sitemapUrl}...`);
    const body = await fetchPage(sitemapUrl);
    if (!body) {
      console.error(` Failed to fetch ${sitemapUrl}`);
      continue;
    }
    for (const hit of body.matchAll(/<loc>(https:\/\/horariosmisas\.com\/[^<]+)<\/loc>/g)) {
      pageUrls.push(hit[1]);
    }
  }
  console.log(`Extracted ${pageUrls.length} total URLs from sitemaps`);

  const knownSlugs = new Set<string>();
  const result: SitemapChurch[] = [];
  for (const pageUrl of pageUrls) {
    // Path without its leading/trailing slash, split into segments
    const segments = new URL(pageUrl).pathname.replace(/^\/|\/$/g, '').split('/');
    if (segments.length !== 3) continue;
    if (EXCLUDE_PATTERNS.some((re) => re.test(pageUrl))) continue;

    const [province, city, slug] = segments;
    // The slug alone is the identity; later pages reusing it are ignored
    if (knownSlugs.has(slug)) continue;
    knownSlugs.add(slug);

    const normalizedUrl = pageUrl.endsWith('/') ? pageUrl : `${pageUrl}/`;
    result.push({ province, city, slug, url: normalizedUrl });
  }

  result.sort(
    (left, right) =>
      left.province.localeCompare(right.province) ||
      left.city.localeCompare(right.city) ||
      left.slug.localeCompare(right.slug),
  );
  console.log(`Found ${result.length} unique church URLs after filtering`);
  return result;
}
// ─── HTML Parsers ────────────────────────────────────────────────────────────
/**
 * Extract a church's name and contact details from its page HTML.
 *
 * - name: text of the first <h1>, with tags and a trailing "(...)" removed;
 *   empty string when the page has no <h1>.
 * - address: text of the <strong> that follows the pushpin marker (the emoji
 *   itself or its hex/decimal numeric entity), with a trailing "(...)" removed
 *   and whitespace collapsed.
 * - zip / city: the first standalone 5-digit number in the address and the
 *   text that follows it.
 * - phone: link text after a "Teléfono:" label (accent optional).
 * - website: href of the link after a "Página Web:" label (accent optional),
 *   regardless of where href sits among the link's attributes.
 *
 * @param html Raw HTML of the church page.
 * @returns Parsed fields; every field except `name` is null when not found.
 */
function parseChurchPage(html: string): ParsedChurch {
  const stripTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, '');

  // Name from <h1>Church Name (City)</h1>
  const h1Match = /<h1[^>]*>([\s\S]*?)<\/h1>/i.exec(html);
  const name = h1Match
    ? stripTags(h1Match[1]).replace(/\s*\([^)]*\)\s*$/, '').trim()
    : '';

  let address: string | null = null;
  let zip: string | null = null;
  let city: string | null = null;

  // The pin may appear as the literal emoji or as a hex / decimal entity.
  const addressMatch = /(?:\u{1F4CC}|&#x1f4cc;|&#128204;)\s*<strong>([\s\S]*?)<\/strong>/iu.exec(html);
  if (addressMatch) {
    address =
      stripTags(addressMatch[1])
        .replace(/\s*\([^)]*\)\s*$/, '') // Strip (Province) suffix
        .replace(/\s+/g, ' ')
        .trim() || null;

    if (address) {
      // Spanish postal codes are 5 digits; require word boundaries on both sides.
      const pcMatch = /\b(\d{5})\b/.exec(address);
      if (pcMatch) {
        zip = pcMatch[1];
        // Slice from where the postal code actually matched. Searching for the
        // digits again could land inside an earlier, longer number.
        const afterPc = address.slice(pcMatch.index + zip.length).trim();
        // Remove leading comma, dash, space
        city = afterPc.replace(/^[,\-\s]+/, '').trim() || null;
      }
    }
  }

  // Phone: Teléfono:</strong> <a ...>number</a>
  let phone: string | null = null;
  const phoneMatch = /Tel[eé]fono:<\/strong>\s*<a[^>]*>([\s\S]*?)<\/a>/i.exec(html);
  if (phoneMatch) {
    phone = stripTags(phoneMatch[1]).trim() || null;
  }

  // Website: Página Web:</strong> <a ... href="url" ...>
  // href must be a whole attribute name (preceded by whitespace), but other
  // attributes such as target/rel may come before it.
  let website: string | null = null;
  const websiteMatch = /P[aá]gina\s+Web:<\/strong>\s*<a\s(?:[^>]*?\s)?href="([^"]+)"/i.exec(html);
  if (websiteMatch) {
    website = websiteMatch[1].trim() || null;
  }

  return { name, address, zip, city, phone, website };
}
/**
 * Extract the weekly mass schedule from a church page.
 *
 * When the page has both a "verano" (summer) and an "invierno" (winter)
 * heading, only the section for the season containing `now` is parsed
 * (June through September counts as summer); otherwise the whole page is
 * used. Within the chosen HTML, every table mentioning "día" or "horario" is
 * scanned row by row: the first cell is resolved to days of the week and the
 * second cell to times, and one entry is emitted per unique day/time pair.
 *
 * @param html Raw HTML of the church page.
 * @param now Date used to pick the season; defaults to the current date.
 *            Passing it explicitly makes the result reproducible.
 * @returns Unique schedule entries in document order (empty when none found).
 */
function parseScheduleTable(html: string, now: Date = new Date()): ParsedSchedule[] {
  const schedules: ParsedSchedule[] = [];
  const seen = new Set<string>();

  // Oct-May = winter, Jun-Sep = summer (getMonth() is 0-indexed)
  const month = now.getMonth();
  const isSummer = month >= 5 && month <= 8;

  let relevantHtml = html;
  if (/verano/i.test(html) && /invierno/i.test(html)) {
    // First heading (<h2>-<h4> or <strong>) naming each season
    const veranoMatch = /(?:<h[2-4][^>]*>|<strong>)[^<]*verano[^<]*(?:<\/h[2-4]>|<\/strong>)/i.exec(html);
    const inviernoMatch = /(?:<h[2-4][^>]*>|<strong>)[^<]*invierno[^<]*(?:<\/h[2-4]>|<\/strong>)/i.exec(html);
    if (veranoMatch && inviernoMatch) {
      const [current, other] = isSummer
        ? [veranoMatch, inviernoMatch]
        : [inviernoMatch, veranoMatch];
      // The section runs up to the other season's heading if that comes later,
      // otherwise to the end of the document.
      const endIdx = other.index > current.index ? other.index : html.length;
      relevantHtml = html.substring(current.index, endIdx);
    }
  }

  // Cell text: drop tags, turn non-breaking-space entities into plain spaces
  // and collapse whitespace so labels like "Lunes&nbsp;a&nbsp;Viernes" or
  // labels broken across lines still resolve.
  const cellText = (raw: string): string =>
    raw
      .replace(/<[^>]+>/g, '')
      .replace(/&nbsp;|&#160;|&#xa0;/gi, ' ')
      .replace(/\s+/g, ' ')
      .trim();

  for (const tableMatch of relevantHtml.matchAll(/<table[^>]*>([\s\S]*?)<\/table>/gi)) {
    const tableHtml = tableMatch[1];
    // Only schedule tables (those mentioning DÍA or HORARIO) are of interest
    if (!/d[ií]a/i.test(tableHtml) && !/horario/i.test(tableHtml)) {
      continue;
    }

    for (const rowMatch of tableHtml.matchAll(/<tr[^>]*>([\s\S]*?)<\/tr>/gi)) {
      const row = rowMatch[1];
      // Skip header rows
      if (/<th/i.test(row)) continue;

      const cells = Array.from(row.matchAll(/<td[^>]*>([\s\S]*?)<\/td>/gi), (cell) => cellText(cell[1]));
      if (cells.length < 2) continue;

      const days = resolveDays(cells[0]);
      if (days.length === 0) continue;
      const times = extractTimes(cells[1]);

      for (const day of days) {
        for (const time of times) {
          const key = `${day}:${time}`;
          if (seen.has(key)) continue;
          seen.add(key);
          schedules.push({ dayOfWeek: day, time });
        }
      }
    }
  }

  return schedules;
}
/**
 * Turn a Spanish day label into day-of-week numbers (0 = Sunday .. 6 = Saturday).
 *
 * Resolution order:
 *  1. exact DAY_MAP key ("domingos y festivos", "sábado", ...);
 *  2. a range "<start> a <end>", optionally prefixed with "de", including
 *     accented names ("lunes a sábado") and ranges that wrap past Saturday
 *     ("viernes a domingo" -> 5, 6, 0);
 *  3. a list split on commas and " y ", where each item may itself be an exact
 *     key, a range, or a name understood by findDayIndex;
 *  4. the first DAY_MAP key contained anywhere in the label.
 *
 * @param dayText Label as it appears in the schedule table.
 * @returns Matching day numbers, or an empty array when nothing is recognised.
 */
function resolveDays(dayText: string): number[] {
  const DAYS_IN_WEEK = 7;
  const normalized = dayText.toLowerCase().trim();

  // Own-property lookup so labels such as "constructor" cannot hit Object.prototype.
  const lookup = (label: string): number[] | undefined =>
    Object.prototype.hasOwnProperty.call(DAY_MAP, label) ? DAY_MAP[label] : undefined;

  // Expand "[de] <start> a <end>" into the inclusive list of days, wrapping
  // around the end of the week when <end> comes before <start>.
  // \p{L} is used instead of \w because \w does not match accented letters.
  const expandRange = (text: string): number[] | null => {
    const m = /^(?:de\s+)?(\p{L}+)\s+a\s+(\p{L}+)$/u.exec(text);
    if (!m) return null;
    const start = findDayIndex(m[1]);
    const end = findDayIndex(m[2]);
    if (start === -1 || end === -1) return null;
    const span: number[] = [];
    for (let d = start; ; d = (d + 1) % DAYS_IN_WEEK) {
      span.push(d);
      if (d === end) break;
    }
    return span;
  };

  // 1. Exact match in DAY_MAP
  const exact = lookup(normalized);
  if (exact) return [...exact];

  // 2. Range: "Lunes a Viernes"
  const range = expandRange(normalized);
  if (range) return range;

  // 3. Compound: "Lunes, Miércoles y Viernes"
  const parts = normalized
    .split(/[,]\s*/)
    .flatMap((part) => part.split(/\s+y\s+/))
    .map((p) => p.trim())
    .filter((p) => p.length > 0);
  if (parts.length > 1) {
    const collected = new Set<number>();
    for (const part of parts) {
      const mapped = lookup(part) ?? expandRange(part);
      if (mapped) {
        mapped.forEach((d) => collected.add(d));
        continue;
      }
      const idx = findDayIndex(part);
      if (idx !== -1) collected.add(idx);
    }
    if (collected.size > 0) return [...collected];
  }

  // 4. Partial match: first DAY_MAP key that occurs inside the label
  for (const [key, value] of Object.entries(DAY_MAP)) {
    if (normalized.includes(key)) {
      return [...value];
    }
  }
  return [];
}
/**
 * Find the day-of-week index (position in DAY_ORDER, 0 = Sunday) for a Spanish
 * day name.
 *
 * Matching ignores case and accents, tolerates a plural "s" and accepts any
 * non-empty prefix of a day name, so abbreviations work; the first day in
 * DAY_ORDER with that prefix wins.
 *
 * @param dayName Day name or abbreviation, e.g. "Sábados", "miercoles", "lun".
 * @returns Index into DAY_ORDER, or -1 when the name is empty or unknown.
 */
function findDayIndex(dayName: string): number {
  // Decompose accented letters and drop the combining marks, so both
  // precomposed ("á") and decomposed ("a" + U+0301) input fold to plain ASCII.
  const fold = (text: string): string =>
    text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');

  let wanted = fold(dayName.trim().toLowerCase());
  // Drop a plural "s", but never reduce a lone "s" to the empty string.
  if (wanted.length > 1) {
    wanted = wanted.replace(/s$/, '');
  }
  // Every string starts with "", so an empty needle would wrongly match Sunday.
  if (wanted.length === 0) {
    return -1;
  }

  return DAY_ORDER.findIndex((day) => fold(day).startsWith(wanted));
}
/**
 * Pull every valid clock time out of a free-text cell.
 *
 * Recognises H:MM and HH:MM (an optional trailing "h" is simply ignored),
 * requires hours 0-23 and minutes 0-59, and refuses matches that are part of a
 * longer digit run such as "123:45" or "10:300".
 *
 * @param text Cell text, e.g. "9:00 h, 12:30h y 19:30".
 * @returns Unique times as zero-padded "HH:MM" strings, in order of appearance.
 */
function extractTimes(text: string): string[] {
  const times: string[] = [];
  // Lookarounds keep the match from starting or ending inside a larger number.
  const timeRegex = /(?<!\d)(\d{1,2}):(\d{2})(?!\d)/g;
  for (const match of text.matchAll(timeRegex)) {
    const hours = Number.parseInt(match[1], 10);
    const minutes = Number.parseInt(match[2], 10);
    if (hours > 23 || minutes > 59) continue;
    const formatted = `${String(hours).padStart(2, '0')}:${match[2]}`;
    if (!times.includes(formatted)) {
      times.push(formatted);
    }
  }
  return times;
}
// ─── Geocoding ───────────────────────────────────────────────────────────────
/**
 * Look up coordinates for a church through Nominatim.
 *
 * Queries are tried from most to least specific: the full address, then
 * "<zip> <city>, Spain", then "<city>, Spain". Each query is preceded by a
 * pause of NOMINATIM_DELAY_MS. A non-OK response, an empty result list,
 * unparsable coordinates or a thrown error all just move on to the next query.
 *
 * @param address Street address, if known.
 * @param zip Postal code, if known.
 * @param city City name, if known.
 * @returns The first usable coordinate pair, or null when no query succeeds.
 */
async function forwardGeocode(
  address: string | null,
  zip: string | null,
  city: string | null,
): Promise<{ lat: number; lng: number } | null> {
  // Candidate queries, most specific first; missing pieces are dropped
  const candidates = [
    address,
    zip && city ? `${zip} ${city}, Spain` : null,
    city ? `${city}, Spain` : null,
  ].filter((candidate): candidate is string => Boolean(candidate));

  for (const q of candidates) {
    await delay(NOMINATIM_DELAY_MS);
    try {
      const search = new URLSearchParams({
        q,
        countrycodes: 'es',
        format: 'json',
        limit: '1',
      });
      const res = await fetch(`${NOMINATIM_URL}?${search}`, {
        headers: { 'User-Agent': USER_AGENT },
      });
      if (!res.ok) continue;

      const hits = (await res.json()) as Array<{ lat: string; lon: string }>;
      if (!(hits.length > 0)) continue;

      const lat = parseFloat(hits[0].lat);
      const lng = parseFloat(hits[0].lon);
      if (isNaN(lat) || isNaN(lng)) continue;
      return { lat, lng };
    } catch {
      // Best effort: fall through to the next, less specific query
    }
  }
  return null;
}
// ─── Database Operations ─────────────────────────────────────────────────────
/**
 * Load every church with country 'ES' from the database for duplicate detection.
 *
 * Only the columns listed in `select` are fetched: the name, coordinates, the
 * per-source external IDs, and the contact fields that processChurch checks
 * before filling them in.
 *
 * @returns The rows, typed as the matcher's ExistingChurch.
 */
async function loadExistingSpanishChurches(): Promise<ExistingChurch[]> {
console.log('Loading existing Spanish churches for deduplication...');
const churches = await prisma.church.findMany({
where: { country: 'ES' },
select: {
id: true,
name: true,
latitude: true,
longitude: true,
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
source: true,
website: true,
phone: true,
address: true,
},
});
console.log(`Loaded ${churches.length} existing Spanish churches`);
return churches;
}
/**
 * Geocode Spanish churches that still have placeholder coordinates.
 *
 * Selects churches with country 'ES', latitude 0, longitude 0 and a non-null
 * address, runs forwardGeocode for each, and stores the coordinates it finds.
 *
 * @param dryRun When true, lookups are still performed and counted, but nothing is written.
 * @param stats Counters updated in place (`geocoded`, `geocodeFailed`).
 */
async function geocodeUnmatchedChurches(dryRun: boolean, stats: ImportStats): Promise<void> {
  console.log('\n--- Geocoding Phase ---');
  const pending = await prisma.church.findMany({
    where: {
      country: 'ES',
      latitude: 0,
      longitude: 0,
      address: { not: null },
    },
    select: {
      id: true,
      name: true,
      address: true,
      zip: true,
      city: true,
    },
  });
  const total = pending.length;
  console.log(`Found ${total} Spanish churches needing geocoding`);

  for (const [position, church] of pending.entries()) {
    console.log(` [${position + 1}/${total}] Geocoding "${church.name}"...`);
    const coords = await forwardGeocode(church.address, church.zip, church.city);

    if (!coords) {
      console.log(` No results`);
      stats.geocodeFailed++;
      continue;
    }

    console.log(` Found: ${coords.lat}, ${coords.lng}`);
    stats.geocoded++;
    if (dryRun) continue;

    await prisma.church.update({
      where: { id: church.id },
      data: {
        latitude: coords.lat,
        longitude: coords.lng,
        reverseGeocodedAt: new Date(),
      },
    });
  }
}
// ─── Import Logic ────────────────────────────────────────────────────────────
/**
 * Import a single church page: fetch and parse it, match it against the known
 * churches, then update the matching row or create a new one.
 *
 * @param sitemapEntry Church URL plus the province/city/slug taken from it.
 * @param existingChurches In-memory list used for duplicate detection; a newly
 *        created church is appended so later pages in the same run can match it.
 * @param dryRun When true the page is still fetched and parsed, but only
 *        counters and log lines are produced; no database call is made.
 * @param stats Counters mutated in place.
 * @throws Re-throws errors from the church update/create calls (and from the
 *         schedule insert for a new church) unless their message contains
 *         'Unique constraint'.
 */
async function processChurch(
sitemapEntry: SitemapChurch,
existingChurches: ExistingChurch[],
dryRun: boolean,
stats: ImportStats,
): Promise<void> {
stats.churchesFound++;
// Fetch church page
const churchHtml = await fetchPage(sitemapEntry.url);
if (!churchHtml) {
// fetchPage already logged the HTTP status or network error
stats.errors++;
return;
}
const parsed = parseChurchPage(churchHtml);
if (!parsed.name) {
console.log(` Skipping ${sitemapEntry.slug}: no name found`);
stats.churchesSkipped++;
return;
}
// Parse schedule
const schedules = parseScheduleTable(churchHtml);
// Build candidate for dedup — use lat: 0, lng: 0 since we rely on horariosMisasId match
const candidate = {
name: parsed.name,
lat: 0,
lng: 0,
horariosMisasId: sitemapEntry.slug,
};
const duplicate = findDuplicateChurch(candidate, existingChurches);
// Dry run: count what would happen and return before any database access
if (dryRun) {
if (duplicate) {
stats.churchesMatched++;
console.log(` [MATCH] "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`);
} else {
stats.churchesCreated++;
console.log(` [NEW] "${parsed.name}" (${sitemapEntry.province}/${sitemapEntry.city})`);
}
if (schedules.length > 0) {
stats.schedulesProcessed++;
stats.massSchedulesCreated += schedules.length;
}
return;
}
if (duplicate) {
// Update existing church
stats.churchesMatched++;
// Always stamp the source ID; contact fields are only filled when currently empty
const updateData: Record<string, unknown> = {
horariosMisasId: sitemapEntry.slug,
};
if (!duplicate.address && parsed.address) updateData.address = parsed.address;
if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
if (!duplicate.website && parsed.website) {
updateData.website = parsed.website;
updateData.hasWebsite = true;
}
// Fill city/state/zip if not set
// (these columns are not part of the in-memory record, so read them from the database)
const dbRecord = await prisma.church.findUnique({
where: { id: duplicate.id },
select: { city: true, state: true, zip: true },
});
if (dbRecord && !dbRecord.city && parsed.city) updateData.city = parsed.city;
// state is filled with the province path segment from the URL
if (dbRecord && !dbRecord.state) updateData.state = sitemapEntry.province;
if (dbRecord && !dbRecord.zip && parsed.zip) updateData.zip = parsed.zip;
try {
await prisma.church.update({
where: { id: duplicate.id },
data: updateData,
});
} catch (error) {
// A unique-constraint violation is counted as a skip; anything else propagates.
// NOTE(review): churchesMatched was already incremented above, so this page is counted in both totals.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
return;
}
throw error;
}
// Replace mass schedules
// Existing schedules are only deleted when the page yielded at least one entry,
// and delete + insert + timestamp run in a single transaction.
if (schedules.length > 0) {
try {
await prisma.$transaction(async (tx) => {
await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
await tx.massSchedule.createMany({
data: schedules.map((s) => ({
churchId: duplicate.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
language: 'Spanish',
})),
});
await tx.church.update({
where: { id: duplicate.id },
data: { lastScrapedAt: new Date() },
});
});
stats.schedulesProcessed++;
stats.massSchedulesCreated += schedules.length;
} catch (error) {
// Schedule failures are logged and counted, but do not abort the run
stats.errors++;
console.error(` Error saving schedules for ${sitemapEntry.slug}: ${error instanceof Error ? error.message : error}`);
}
}
} else {
// Create new church
try {
// Coordinates start at 0/0; the geocoding pass selects rows by exactly that value
const newChurch = await prisma.church.create({
data: {
name: parsed.name,
latitude: 0,
longitude: 0,
address: parsed.address,
zip: parsed.zip,
city: parsed.city || null,
state: sitemapEntry.province || null,
country: 'ES',
phone: parsed.phone,
website: parsed.website,
hasWebsite: !!parsed.website,
horariosMisasId: sitemapEntry.slug,
source: 'horariosmisas',
},
});
stats.churchesCreated++;
// Add to in-memory array for within-run dedup
existingChurches.push({
id: newChurch.id,
name: parsed.name,
latitude: 0,
longitude: 0,
osmId: null,
baiduId: null,
masstimesId: null,
orarimesseId: null,
massSchedulesPhId: null,
philmassId: null,
horariosMisasId: sitemapEntry.slug,
mszeInfoId: null,
weekdayMassesId: null,
messesInfoId: null,
bohosluzbyId: null,
miserendId: null,
kerknetId: null,
gottesdienstzeitenId: null,
source: 'horariosmisas',
website: parsed.website,
phone: parsed.phone,
address: parsed.address,
});
// Create mass schedules
// NOTE(review): unlike the update path, these writes are not wrapped in a transaction.
if (schedules.length > 0) {
await prisma.massSchedule.createMany({
data: schedules.map((s) => ({
churchId: newChurch.id,
dayOfWeek: s.dayOfWeek,
time: s.time,
language: 'Spanish',
})),
});
await prisma.church.update({
where: { id: newChurch.id },
data: { lastScrapedAt: new Date() },
});
stats.schedulesProcessed++;
stats.massSchedulesCreated += schedules.length;
}
} catch (error) {
// Unique-constraint violations from any write in this branch count as a skip; other errors propagate.
if (error instanceof Error && error.message.includes('Unique constraint')) {
stats.churchesSkipped++;
return;
}
throw error;
}
}
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse the command-line flags from process.argv.
 *
 * Prints usage and exits with status 0 on --help / -h. Exits with status 1
 * when a flag that needs a value (--province, --resume-from, --job-id) has
 * none, when --resume-from is not a non-negative integer, or when none of
 * --all, --province or --geocode-only was given. Unrecognised arguments are
 * ignored.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    geocode: false,
    geocodeOnly: false,
  };

  // Read the value that follows a flag; a missing value or another flag in
  // its place is a usage error rather than a silent `undefined`.
  const valueFor = (flag: string, index: number): string => {
    const value = args[index];
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    switch (flag) {
      case '--all':
        result.all = true;
        break;
      case '--province':
        result.province = valueFor(flag, ++i);
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--geocode':
        result.geocode = true;
        break;
      case '--geocode-only':
        result.geocodeOnly = true;
        break;
      case '--resume-from': {
        const raw = valueFor(flag, ++i);
        // Digits only: a NaN here would otherwise be silently treated as "no resume"
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = Number.parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = valueFor(flag, ++i);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-horariosmisas.ts [options]
Options:
--all Import all churches from sitemaps
--province <name> Filter by province slug (e.g. "madrid")
--dry-run No database writes, just report what would happen
--geocode Geocode churches after import (Nominatim)
--geocode-only Only geocode existing churches (skip import)
--resume-from <n> Skip first N churches
--job-id <uuid> Background job tracking ID
--help, -h Show this help message
Examples:
npx tsx scripts/import-horariosmisas.ts --all --dry-run
npx tsx scripts/import-horariosmisas.ts --all
npx tsx scripts/import-horariosmisas.ts --province madrid
npx tsx scripts/import-horariosmisas.ts --all --geocode
npx tsx scripts/import-horariosmisas.ts --geocode-only
npx tsx scripts/import-horariosmisas.ts --all --resume-from 500
`);
        process.exit(0);
    }
  }

  // At least one mode must be selected
  if (!result.all && !result.province && !result.geocodeOnly) {
    console.error('Error: specify --all, --province <name>, or --geocode-only');
    process.exit(1);
  }
  return result;
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Render a millisecond duration as a short human-readable string.
 *
 * Produces "Xh Ym Zs" once a full hour has elapsed, "Ym Zs" once a full minute
 * has elapsed, and "Zs" otherwise. Fractions of a second are dropped.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const hours = Math.floor(totalMinutes / 60);
  const secondsPart = `${totalSeconds % 60}s`;

  if (hours > 0) {
    return `${hours}h ${totalMinutes % 60}m ${secondsPart}`;
  }
  return totalMinutes > 0 ? `${totalMinutes}m ${secondsPart}` : `${totalSeconds}s`;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Script entry point: parse the CLI flags, run the import phase (unless
 * --geocode-only), run the geocoding phase when requested, print a summary,
 * and record the outcome on the background job when --job-id was given.
 *
 * NOTE(review): the background job is only moved out of 'running' at the very
 * end of this function. If an exception escapes earlier (for example when the
 * sitemap index cannot be fetched), the job row is left in 'running'.
 */
async function main() {
const args = parseArgs();
const startTime = Date.now();
console.log('\n' + '='.repeat(70));
console.log('HORARIOSMISAS.COM (SPAIN) IMPORTER');
console.log('='.repeat(70));
console.log(`Mode: ${args.geocodeOnly ? 'Geocode only' : args.all ? 'All churches from sitemaps' : `Province: ${args.province}`}`);
console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
console.log(`Geocode: ${args.geocode || args.geocodeOnly ? 'YES' : 'NO'}`);
if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
console.log(`Time: ${new Date().toISOString()}`);
console.log('='.repeat(70) + '\n');
// Update background job status if provided
// (this write is not gated on --dry-run)
if (args.jobId) {
try {
await prisma.backgroundJob.update({
where: { id: args.jobId },
data: { status: 'running', startedAt: new Date() },
});
} catch {
// Job might not exist yet
}
}
const stats: ImportStats = {
churchesFound: 0,
churchesMatched: 0,
churchesCreated: 0,
churchesSkipped: 0,
schedulesProcessed: 0,
massSchedulesCreated: 0,
geocoded: 0,
geocodeFailed: 0,
errors: 0,
};
if (!args.geocodeOnly) {
// Load existing Spanish churches for dedup
const existingChurches = await loadExistingSpanishChurches();
// Fetch church URLs from sitemaps
const allChurches = await fetchChurchUrlsFromSitemaps();
// Filter by province if specified
let churchesToProcess = allChurches;
if (args.province) {
// Exact comparison against the province path segment of each URL
churchesToProcess = allChurches.filter((c) => c.province === args.province);
console.log(`Filtered to ${churchesToProcess.length} churches in province "${args.province}"\n`);
} else {
console.log(`Processing ${churchesToProcess.length} churches\n`);
}
// Handle --resume-from
// The offset applies to the sorted (and, if requested, province-filtered) list
if (args.resumeFrom) {
const before = churchesToProcess.length;
churchesToProcess = churchesToProcess.slice(args.resumeFrom);
console.log(`Resuming from index ${args.resumeFrom} (skipping ${before - churchesToProcess.length} churches)\n`);
}
// Process each church
for (let i = 0; i < churchesToProcess.length; i++) {
const church = churchesToProcess[i];
const elapsed = formatDuration(Date.now() - startTime);
console.log(`[${i + 1}/${churchesToProcess.length}] ${church.province}/${church.city}/${church.slug} [${elapsed} elapsed]`);
try {
await processChurch(church, existingChurches, args.dryRun, stats);
} catch (error) {
// One failing church must not stop the run: count it, log it, move on
stats.errors++;
console.error(` ERROR processing ${church.slug}: ${error instanceof Error ? error.message : error}`);
}
}
}
// Geocode phase
if (args.geocode || args.geocodeOnly) {
await geocodeUnmatchedChurches(args.dryRun, stats);
}
// Print summary
const totalTime = Date.now() - startTime;
console.log('\n' + '='.repeat(70));
console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
console.log('='.repeat(70));
console.log(`Churches found: ${stats.churchesFound}`);
console.log(` Matched (existing): ${stats.churchesMatched}`);
console.log(` Created (new): ${stats.churchesCreated}`);
console.log(` Skipped: ${stats.churchesSkipped}`);
console.log(`Schedules processed: ${stats.schedulesProcessed}`);
console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
if (args.geocode || args.geocodeOnly) {
console.log(`Geocoded: ${stats.geocoded}`);
console.log(`Geocode failed: ${stats.geocodeFailed}`);
}
console.log(`Errors: ${stats.errors}`);
console.log(`Total time: ${formatDuration(totalTime)}`);
console.log(`HTTP requests: ${requestCount}`);
console.log('='.repeat(70) + '\n');
// Update background job
// Status reflects whether any error was counted; the full stats object is stored as JSON
if (args.jobId) {
try {
await prisma.backgroundJob.update({
where: { id: args.jobId },
data: {
status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
completedAt: new Date(),
result: JSON.stringify(stats),
},
});
} catch {
// Ignore
}
}
}
// Run the importer. On a fatal error the exit code is set rather than calling
// process.exit(), which would end the process before the cleanup below runs
// and leave the Prisma client and pg pool open.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    try {
      await prisma.$disconnect();
    } finally {
      // Close the pool even if disconnecting Prisma failed
      await pool.end();
    }
  });