#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from horariosmisas.com (Spain)
 *
 * horariosmisas.com is a Spanish directory of Catholic parishes with mass
 * schedules organized by province and city. The site uses a WordPress sitemap
 * structure with ~20 post-sitemap files.
 *
 * Import strategy:
 *   1. Fetch sitemap index → extract post-sitemap*.xml URLs
 *   2. Fetch each post sitemap → extract church URLs (3 path segments)
 *   3. Filter out non-church URLs (blog, legal pages, daily readings)
 *   4. For each church: fetch HTML, parse name/address/phone/website/schedule
 *   5. Match against existing ES churches, upsert
 *   6. Optional geocoding pass via Nominatim
 *
 * Usage:
 *   npx tsx scripts/import-horariosmisas.ts --all
 *   npx tsx scripts/import-horariosmisas.ts --all --dry-run
 *   npx tsx scripts/import-horariosmisas.ts --province madrid
 *   npx tsx scripts/import-horariosmisas.ts --all --geocode
 *   npx tsx scripts/import-horariosmisas.ts --geocode-only
 *   npx tsx scripts/import-horariosmisas.ts --all --resume-from 500
 *   npx tsx scripts/import-horariosmisas.ts --all --job-id {uuid}
 *
 * NOTE(review): this file was restored from a copy in which line breaks and
 * HTML-like tokens (generic type arguments, tag fragments inside regexes) had
 * been stripped. Places where a pattern had to be reconstructed are marked
 * with NOTE(review) and should be confirmed against the live page markup.
 */

import dotenv from 'dotenv';
import path from 'path';

// .env.local is loaded first so its values win over .env (dotenv never
// overwrites a variable that is already set).
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Mask the password portion before echoing the connection string.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);

const pool = new Pool({
  connectionString: dbUrl,
  // NOTE(review): certificate verification is disabled for URLs containing
  // "neon". That permits MITM on the DB connection; confirm it is intentional.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';

// ─── Constants ───────────────────────────────────────────────────────────────

const SITE_BASE = 'https://horariosmisas.com';
const SITEMAP_INDEX_URL = `${SITE_BASE}/sitemap_index.xml`;
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
const REQUEST_DELAY_MS = 1500;
/** Upper bound for a single HTTP request so one stalled socket cannot hang the run. */
const REQUEST_TIMEOUT_MS = 30_000;
const NOMINATIM_DELAY_MS = 1100;
const NOMINATIM_URL = 'https://nominatim.openstreetmap.org/search';

// ─── Types ───────────────────────────────────────────────────────────────────

interface SitemapChurch {
  province: string;
  city: string;
  slug: string;
  url: string;
}

interface ParsedChurch {
  name: string;
  address: string | null;
  zip: string | null;
  city: string | null;
  phone: string | null;
  website: string | null;
}

interface ParsedSchedule {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  time: string; // "05:00", "18:30"
}

interface ImportStats {
  churchesFound: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  schedulesProcessed: number;
  massSchedulesCreated: number;
  geocoded: number;
  geocodeFailed: number;
  errors: number;
}

interface CLIArgs {
  all: boolean;
  province?: string;
  dryRun: boolean;
  geocode: boolean;
  geocodeOnly: boolean;
  resumeFrom?: number;
  jobId?: string;
}

/** Non-null result of the shared duplicate matcher. */
type MatchedChurch = NonNullable<ReturnType<typeof findDuplicateChurch>>;

// ─── Spanish Day Mapping ─────────────────────────────────────────────────────

const DAY_MAP: Record<string, number[]> = {
  'domingos y festivos': [0],
  'domingos': [0],
  'domingo': [0],
  'lunes': [1],
  'martes': [2],
  'miércoles': [3],
  'miercoles': [3],
  'jueves': [4],
  'viernes': [5],
  'sábado': [6],
  'sabado': [6],
  'sábados': [6],
  'sabados': [6],
};

// Index in this array is the dayOfWeek value (0=Sun ... 6=Sat).
const DAY_ORDER = ['domingo', 'lunes', 'martes', 'miércoles', 'jueves', 'viernes', 'sábado'];

// URL patterns to exclude (not church pages)
const EXCLUDE_PATTERNS = [
  /\/misas-diarias\//,
  /\/santos-del-dia\//,
  /\/oraciones\//,
  /\/noticias\//,
  /\/blog\//,
  /\/contacto\//,
  /\/aviso-legal\//,
  /\/politica-de-privacidad\//,
  /\/politica-de-cookies\//,
];

// ─── HTTP Client ─────────────────────────────────────────────────────────────

let requestCount = 0;

/** Resolve after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Fetch a page as text, pausing `delayMs` before every request except the
 * first one of the run.
 *
 * @returns The response body, or `null` on a non-2xx status, network error or
 *          timeout (the failure is logged, never thrown).
 */
async function fetchPage(url: string, delayMs: number = REQUEST_DELAY_MS): Promise<string | null> {
  if (requestCount > 0) {
    await delay(delayMs);
  }
  requestCount++;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
    });

    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }

    return await response.text();
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}

// ─── Sitemap Parser ──────────────────────────────────────────────────────────

/**
 * Walk the sitemap index and every post sitemap, returning the de-duplicated,
 * sorted list of church pages (URLs with exactly three path segments that do
 * not match an exclusion pattern).
 *
 * @throws Error when the sitemap index itself cannot be fetched.
 */
async function fetchChurchUrlsFromSitemaps(): Promise<SitemapChurch[]> {
  console.log(`Fetching sitemap index: ${SITEMAP_INDEX_URL}`);
  const indexXml = await fetchPage(SITEMAP_INDEX_URL);
  if (!indexXml) {
    throw new Error('Failed to fetch sitemap index');
  }

  // Extract post-sitemap URLs
  const sitemapUrlRegex = /<loc>(https:\/\/horariosmisas\.com\/post-sitemap\d*\.xml)<\/loc>/g;
  const sitemapUrls = Array.from(indexXml.matchAll(sitemapUrlRegex), (m) => m[1]);
  console.log(`Found ${sitemapUrls.length} post-sitemap files`);

  // Fetch each sitemap and extract church URLs
  const locRegex = /<loc>(https:\/\/horariosmisas\.com\/[^<]+)<\/loc>/g;
  const allUrls: string[] = [];
  for (const sitemapUrl of sitemapUrls) {
    console.log(` Fetching ${sitemapUrl}...`);
    const sitemapXml = await fetchPage(sitemapUrl);
    if (!sitemapXml) {
      console.error(` Failed to fetch ${sitemapUrl}`);
      continue;
    }
    for (const locMatch of sitemapXml.matchAll(locRegex)) {
      allUrls.push(locMatch[1].trim());
    }
  }
  console.log(`Extracted ${allUrls.length} total URLs from sitemaps`);

  // Filter to church URLs: exactly 3 path segments (/{province}/{city}/{slug}/)
  const seen = new Set<string>();
  const churches: SitemapChurch[] = [];

  for (const url of allUrls) {
    let pathname: string;
    try {
      pathname = new URL(url).pathname;
    } catch {
      // A single malformed <loc> must not abort the whole import.
      console.error(` Skipping malformed sitemap URL: ${url}`);
      continue;
    }

    const pathSegments = pathname.replace(/^\/|\/$/g, '').split('/');

    // Must have exactly 3 segments
    if (pathSegments.length !== 3) continue;

    // Exclude non-church patterns
    if (EXCLUDE_PATTERNS.some((pattern) => pattern.test(url))) continue;

    const [province, city, slug] = pathSegments;

    // Deduplicate by slug (the slug is also stored as horariosMisasId)
    if (seen.has(slug)) continue;
    seen.add(slug);

    churches.push({
      province,
      city,
      slug,
      url: url.endsWith('/') ? url : `${url}/`,
    });
  }

  // Sort alphabetically by province, then city, then slug so that
  // --resume-from indexes are stable between runs.
  churches.sort(
    (a, b) =>
      a.province.localeCompare(b.province) ||
      a.city.localeCompare(b.city) ||
      a.slug.localeCompare(b.slug),
  );

  console.log(`Found ${churches.length} unique church URLs after filtering`);
  return churches;
}

// ─── HTML Parsers ────────────────────────────────────────────────────────────

/** Named entities worth decoding for Spanish-language parish pages. */
const NAMED_ENTITIES: Record<string, string> = {
  amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ',
  aacute: 'á', eacute: 'é', iacute: 'í', oacute: 'ó', uacute: 'ú', ntilde: 'ñ', uuml: 'ü',
  Aacute: 'Á', Eacute: 'É', Iacute: 'Í', Oacute: 'Ó', Uacute: 'Ú', Ntilde: 'Ñ', Uuml: 'Ü',
  ordm: 'º', ordf: 'ª',
};

/** Decode numeric and a small set of named HTML entities; unknown ones are left untouched. */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (entity: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      const valid = Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff;
      return valid ? String.fromCodePoint(codePoint) : entity;
    }
    return NAMED_ENTITIES[body] ?? entity;
  });
}

/** Strip tags, decode entities, collapse whitespace and trim an HTML fragment. */
function htmlToText(fragment: string): string {
  // Tags are removed before entities are decoded so that an encoded "&lt;b&gt;"
  // is kept as literal text instead of being treated as markup.
  return decodeHtmlEntities(fragment.replace(/<[^>]+>/g, '')).replace(/\s+/g, ' ').trim();
}

/**
 * Extract name, address, postal code, city, phone and website from a church page.
 * `name` is an empty string when no <h1> is found; every other field is `null`
 * when it cannot be located.
 */
function parseChurchPage(html: string): ParsedChurch {
  // Name from <h1>Church Name (City)</h1>
  const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  const name = h1Match
    ? htmlToText(h1Match[1]).replace(/\s*\([^)]*\)\s*$/, '').trim() // strip "(City)" suffix
    : '';

  // Address: pin emoji followed by <strong>...</strong>.
  // Handles the literal emoji as well as its hex / decimal HTML entity.
  // NOTE(review): the entity alternatives and the <strong> opener were
  // reconstructed; confirm against the live markup.
  let address: string | null = null;
  let zip: string | null = null;
  let city: string | null = null;

  const addressMatch = html.match(
    /(?:\u{1F4CC}|&#x1F4CC;|&#128204;)\s*<strong[^>]*>([\s\S]*?)<\/strong>/iu,
  );
  if (addressMatch) {
    address = htmlToText(addressMatch[1]).replace(/\s*\([^)]*\)\s*$/, '').trim() || null; // strip "(Province)"

    if (address) {
      // Extract 5-digit Spanish postal code; the city is whatever follows it.
      const pcMatch = address.match(/\b(\d{5})\b/);
      if (pcMatch && pcMatch.index !== undefined) {
        zip = pcMatch[1];
        const afterPc = address.substring(pcMatch.index + zip.length);
        // Remove leading comma, dash, space
        city = afterPc.replace(/^[,\-\s]+/, '').trim() || null;
      }
    }
  }

  // Phone: <strong>Teléfono:</strong> <a ...>number</a> (accented or unaccented)
  // NOTE(review): the <a ...> opener was reconstructed.
  let phone: string | null = null;
  const phoneMatch = html.match(/Tel[eé]fono:<\/strong>\s*<a[^>]*>([\s\S]*?)<\/a>/i);
  if (phoneMatch) {
    phone = htmlToText(phoneMatch[1]) || null;
  }

  // Website: <strong>Página Web:</strong> <a href="...">
  // NOTE(review): everything after "Web:</strong>" was missing from the copy
  // this file was restored from; the href capture below is a reconstruction.
  let website: string | null = null;
  const websiteMatch = html.match(/P[aá]gina\s+Web:<\/strong>\s*<a[^>]*href=["']([^"']+)["']/i);
  if (websiteMatch) {
    website = decodeHtmlEntities(websiteMatch[1]).trim() || null;
  }

  return { name, address, zip, city, phone, website };
}

/**
 * Parse the DÍA/HORARIO tables of a church page into (dayOfWeek, time) pairs.
 *
 * When the page has both a "verano" and an "invierno" heading, only the section
 * for the current season (Jun–Sep = summer, otherwise winter) is used.
 * Duplicate (day, time) pairs are emitted once.
 */
function parseScheduleTable(html: string): ParsedSchedule[] {
  const schedules: ParsedSchedule[] = [];
  const seen = new Set<string>();

  // Determine current season: Oct-May = winter, Jun-Sep = summer
  const month = new Date().getMonth(); // 0-indexed
  const isSummer = month >= 5 && month <= 8; // Jun(5) through Sep(8)

  let relevantHtml = html;

  if (/verano/i.test(html) && /invierno/i.test(html)) {
    // NOTE(review): the heading openers (<h2>-<h4> / <strong>) were reconstructed.
    const veranoMatch =
      /(?:<h[2-4][^>]*>|<strong[^>]*>)[^<]*verano[^<]*(?:<\/h[2-4]>|<\/strong>)/i.exec(html);
    const inviernoMatch =
      /(?:<h[2-4][^>]*>|<strong[^>]*>)[^<]*invierno[^<]*(?:<\/h[2-4]>|<\/strong>)/i.exec(html);

    if (veranoMatch && inviernoMatch) {
      const current = isSummer ? veranoMatch : inviernoMatch;
      const other = isSummer ? inviernoMatch : veranoMatch;
      // The section runs from the current season's heading to the other
      // season's heading, or to the end of the page if the other comes first.
      const endIdx = other.index > current.index ? other.index : html.length;
      relevantHtml = html.substring(current.index, endIdx);
    }
  }

  // Find all <table> elements with DÍA/HORARIO headers
  for (const tableMatch of relevantHtml.matchAll(/<table[^>]*>([\s\S]*?)<\/table>/gi)) {
    const tableHtml = tableMatch[1];

    // Check if this looks like a schedule table (has DÍA or HORARIO headers)
    if (!/d[ií]a/i.test(tableHtml) && !/horario/i.test(tableHtml)) continue;

    for (const rowMatch of tableHtml.matchAll(/<tr[^>]*>([\s\S]*?)<\/tr>/gi)) {
      const row = rowMatch[1];

      // Skip header rows
      if (/<th[\s>]/i.test(row)) continue;

      const cells = Array.from(row.matchAll(/<td[^>]*>([\s\S]*?)<\/td>/gi), (m) => htmlToText(m[1]));
      if (cells.length < 2) continue;

      const days = resolveDays(cells[0]);
      if (days.length === 0) continue;

      const times = extractTimes(cells[1]);

      for (const day of days) {
        for (const time of times) {
          const key = `${day}:${time}`;
          if (seen.has(key)) continue;
          seen.add(key);
          schedules.push({ dayOfWeek: day, time });
        }
      }
    }
  }

  return schedules;
}

/** Remove combining accents so "miércoles" and "miercoles" compare equal. */
function stripDiacritics(text: string): string {
  return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
}

/**
 * Resolve one day expression that contains no list separators: an exact
 * DAY_MAP key, a range ("lunes a sábado", "de viernes a domingo") or a single
 * day name / prefix.
 */
function resolveDayPart(part: string): number[] {
  const exact = DAY_MAP[part];
  if (exact) return [...exact];

  // \S+ rather than \w+: \w is ASCII-only and would reject "sábado"/"miércoles".
  const rangeMatch = part.match(/^(?:de\s+)?(\S+)\s+al?\s+(\S+)$/);
  if (rangeMatch) {
    const startDay = findDayIndex(rangeMatch[1]);
    const endDay = findDayIndex(rangeMatch[2]);
    if (startDay !== -1 && endDay !== -1) {
      // Walk forward modulo 7 so week-wrapping ranges such as
      // "sábado a domingo" yield [6, 0] instead of nothing.
      const days: number[] = [];
      for (let day = startDay; ; day = (day + 1) % DAY_ORDER.length) {
        days.push(day);
        if (day === endDay) break;
      }
      return days;
    }
  }

  const single = findDayIndex(part);
  return single === -1 ? [] : [single];
}

/**
 * Translate a Spanish day label into dayOfWeek numbers (0=Sun ... 6=Sat).
 *
 * Handles exact names, ranges, comma/"y"-separated lists (whose items may
 * themselves be ranges) and, as a last resort, any DAY_MAP key contained in
 * the text. Returns an empty array when nothing is recognised.
 */
function resolveDays(dayText: string): number[] {
  const normalized = dayText.normalize('NFC').toLowerCase().replace(/\s+/g, ' ').trim();
  if (!normalized) return [];

  // 1–2. Exact match, range or single day for the whole label
  const whole = resolveDayPart(normalized);
  if (whole.length > 0) return whole;

  // 3. Compound: "Lunes, Miércoles y Viernes" — split by comma and "y"
  const parts = normalized
    .split(/,\s*/)
    .flatMap((part) => part.split(/\s+y\s+/))
    .map((p) => p.trim())
    .filter((p) => p.length > 0);

  if (parts.length > 1) {
    const days = [...new Set(parts.flatMap(resolveDayPart))];
    if (days.length > 0) return days;
  }

  // 4. Partial match: first DAY_MAP key found anywhere in the text
  for (const [key, value] of Object.entries(DAY_MAP)) {
    if (normalized.includes(key)) return [...value];
  }

  return [];
}

/**
 * Map a (possibly plural, abbreviated or unaccented) Spanish day name to its
 * index in DAY_ORDER, or -1 when it matches no day.
 */
function findDayIndex(dayName: string): number {
  const base = stripDiacritics(dayName.toLowerCase()).replace(/[^a-z]/g, '');
  // An empty needle would otherwise "prefix-match" domingo.
  if (!base) return -1;

  // Drop a plural "s" ("sabados" → "sabado"), but never reduce the needle to
  // nothing ("s" stays "s" and resolves to sábado).
  const needle = base.replace(/s$/, '') || base;

  return DAY_ORDER.findIndex((day) => stripDiacritics(day).startsWith(needle));
}

/**
 * Pull every valid HH:MM time out of free text, zero-padded to "HH:MM".
 * Values with hours > 23 or minutes > 59 are ignored.
 */
function extractTimes(text: string): string[] {
  const times: string[] = [];
  // Digit guards stop "118:30" or "18:300" from yielding a bogus time.
  for (const match of text.matchAll(/(?<!\d)(\d{1,2}):(\d{2})(?!\d)/g)) {
    const hours = Number.parseInt(match[1], 10);
    const minutes = Number.parseInt(match[2], 10);
    if (hours <= 23 && minutes <= 59) {
      times.push(`${String(hours).padStart(2, '0')}:${match[2]}`);
    }
  }
  return times;
}

// ─── Geocoding ───────────────────────────────────────────────────────────────

/**
 * Forward-geocode through Nominatim, trying progressively less specific
 * queries (full address → "zip city" → city). Waits NOMINATIM_DELAY_MS before
 * every request.
 *
 * @returns The first usable coordinate pair, or `null` if no query succeeds.
 */
async function forwardGeocode(
  address: string | null,
  zip: string | null,
  city: string | null,
): Promise<{ lat: number; lng: number } | null> {
  // Try queries in order of specificity
  const queries: string[] = [];
  if (address) queries.push(address);
  if (zip && city) queries.push(`${zip} ${city}, Spain`);
  if (city) queries.push(`${city}, Spain`);

  for (const query of queries) {
    await delay(NOMINATIM_DELAY_MS);

    try {
      const params = new URLSearchParams({
        q: query,
        countrycodes: 'es',
        format: 'json',
        limit: '1',
      });
      const response = await fetch(`${NOMINATIM_URL}?${params}`, {
        headers: { 'User-Agent': USER_AGENT },
        signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
      });
      if (!response.ok) {
        console.error(` Nominatim HTTP ${response.status} for "${query}"`);
        continue;
      }

      const results: unknown = await response.json();
      if (!Array.isArray(results) || results.length === 0) continue;

      const first = results[0] as { lat?: unknown; lon?: unknown };
      const lat = Number.parseFloat(String(first.lat));
      const lng = Number.parseFloat(String(first.lon));
      if (!Number.isNaN(lat) && !Number.isNaN(lng)) {
        return { lat, lng };
      }
    } catch (error) {
      // Best effort: report and fall through to the next, broader query.
      console.error(` Nominatim error for "${query}": ${error instanceof Error ? error.message : error}`);
    }
  }

  return null;
}

// ─── Database Operations ─────────────────────────────────────────────────────

/** Load every church with country "ES" in the shape the duplicate matcher expects. */
async function loadExistingSpanishChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Spanish churches for deduplication...');
  const churches = await prisma.church.findMany({
    where: { country: 'ES' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });
  console.log(`Loaded ${churches.length} existing Spanish churches`);
  return churches;
}

/**
 * Geocode every Spanish church that still sits at (0, 0) and has an address.
 * In dry-run mode Nominatim is still queried but nothing is written.
 */
async function geocodeUnmatchedChurches(dryRun: boolean, stats: ImportStats): Promise<void> {
  console.log('\n--- Geocoding Phase ---');

  const churches = await prisma.church.findMany({
    where: {
      country: 'ES',
      latitude: 0,
      longitude: 0,
      address: { not: null },
    },
    select: {
      id: true,
      name: true,
      address: true,
      zip: true,
      city: true,
    },
  });
  console.log(`Found ${churches.length} Spanish churches needing geocoding`);

  for (const [index, church] of churches.entries()) {
    console.log(` [${index + 1}/${churches.length}] Geocoding "${church.name}"...`);

    const coords = await forwardGeocode(church.address, church.zip, church.city);
    if (!coords) {
      console.log(` No results`);
      stats.geocodeFailed++;
      continue;
    }

    console.log(` Found: ${coords.lat}, ${coords.lng}`);
    stats.geocoded++;
    if (!dryRun) {
      await prisma.church.update({
        where: { id: church.id },
        data: {
          latitude: coords.lat,
          longitude: coords.lng,
          reverseGeocodedAt: new Date(),
        },
      });
    }
  }
}

// ─── Import Logic ────────────────────────────────────────────────────────────

/** True for the error Prisma raises when a unique index would be violated. */
function isUniqueConstraintError(error: unknown): boolean {
  return error instanceof Error && error.message.includes('Unique constraint');
}

/** Build massSchedule rows for one church; every mass is recorded as Spanish. */
function toScheduleRows(churchId: MatchedChurch['id'], schedules: ParsedSchedule[]) {
  return schedules.map((s) => ({
    churchId,
    dayOfWeek: s.dayOfWeek,
    time: s.time,
    language: 'Spanish',
  }));
}

/** Fill in missing fields on an already-known church and replace its schedules. */
async function updateExistingChurch(
  duplicate: MatchedChurch,
  sitemapEntry: SitemapChurch,
  parsed: ParsedChurch,
  schedules: ParsedSchedule[],
  stats: ImportStats,
): Promise<void> {
  stats.churchesMatched++;

  // NOTE(review): the type arguments of this Record were lost in the restored
  // copy; Record<string, unknown> is a reconstruction.
  const updateData: Record<string, unknown> = {
    horariosMisasId: sitemapEntry.slug,
  };

  // Only fill gaps — never overwrite data that is already present.
  if (!duplicate.address && parsed.address) updateData.address = parsed.address;
  if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
  if (!duplicate.website && parsed.website) {
    updateData.website = parsed.website;
    updateData.hasWebsite = true;
  }

  // city/state/zip are not part of the in-memory dedup record, so read them.
  const dbRecord = await prisma.church.findUnique({
    where: { id: duplicate.id },
    select: { city: true, state: true, zip: true },
  });
  if (dbRecord) {
    if (!dbRecord.city && parsed.city) updateData.city = parsed.city;
    if (!dbRecord.state) updateData.state = sitemapEntry.province;
    if (!dbRecord.zip && parsed.zip) updateData.zip = parsed.zip;
  }

  try {
    await prisma.church.update({
      where: { id: duplicate.id },
      data: updateData,
    });
  } catch (error) {
    if (isUniqueConstraintError(error)) {
      // Another record already owns this horariosMisasId.
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }

  if (schedules.length === 0) return;

  // Replace mass schedules atomically so a failure never leaves the church
  // with its old schedules deleted and no new ones written.
  try {
    await prisma.$transaction(async (tx) => {
      await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
      await tx.massSchedule.createMany({ data: toScheduleRows(duplicate.id, schedules) });
      await tx.church.update({
        where: { id: duplicate.id },
        data: { lastScrapedAt: new Date() },
      });
    });
    stats.schedulesProcessed++;
    stats.massSchedulesCreated += schedules.length;
  } catch (error) {
    stats.errors++;
    console.error(` Error saving schedules for ${sitemapEntry.slug}: ${error instanceof Error ? error.message : error}`);
  }
}

/** Insert a new church (at 0,0 until geocoded) together with its schedules. */
async function createNewChurch(
  sitemapEntry: SitemapChurch,
  parsed: ParsedChurch,
  schedules: ParsedSchedule[],
  existingChurches: ExistingChurch[],
  stats: ImportStats,
): Promise<void> {
  try {
    const newChurch = await prisma.church.create({
      data: {
        name: parsed.name,
        latitude: 0,
        longitude: 0,
        address: parsed.address,
        zip: parsed.zip,
        city: parsed.city || null,
        state: sitemapEntry.province || null,
        country: 'ES',
        phone: parsed.phone,
        website: parsed.website,
        hasWebsite: !!parsed.website,
        horariosMisasId: sitemapEntry.slug,
        source: 'horariosmisas',
      },
    });
    stats.churchesCreated++;

    // Add to in-memory array for within-run dedup
    existingChurches.push({
      id: newChurch.id,
      name: parsed.name,
      latitude: 0,
      longitude: 0,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: sitemapEntry.slug,
      mszeInfoId: null,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId: null,
      kerknetId: null,
      gottesdienstzeitenId: null,
      source: 'horariosmisas',
      website: parsed.website,
      phone: parsed.phone,
      address: parsed.address,
    });

    if (schedules.length > 0) {
      await prisma.massSchedule.createMany({ data: toScheduleRows(newChurch.id, schedules) });
      await prisma.church.update({
        where: { id: newChurch.id },
        data: { lastScrapedAt: new Date() },
      });
      stats.schedulesProcessed++;
      stats.massSchedulesCreated += schedules.length;
    }
  } catch (error) {
    if (isUniqueConstraintError(error)) {
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }
}

/**
 * Fetch, parse and upsert one church page.
 *
 * Updates `stats` in place and appends newly created churches to
 * `existingChurches`. In dry-run mode only the counters are touched.
 * Unexpected database errors are re-thrown for the caller to count.
 */
async function processChurch(
  sitemapEntry: SitemapChurch,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  stats.churchesFound++;

  // Fetch church page
  const churchHtml = await fetchPage(sitemapEntry.url);
  if (!churchHtml) {
    stats.errors++;
    return;
  }

  const parsed = parseChurchPage(churchHtml);
  if (!parsed.name) {
    console.log(` Skipping ${sitemapEntry.slug}: no name found`);
    stats.churchesSkipped++;
    return;
  }

  const schedules = parseScheduleTable(churchHtml);

  // Build candidate for dedup — use lat: 0, lng: 0 since we rely on horariosMisasId match
  const candidate = {
    name: parsed.name,
    lat: 0,
    lng: 0,
    horariosMisasId: sitemapEntry.slug,
  };
  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
      console.log(` [MATCH] "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`);
    } else {
      stats.churchesCreated++;
      console.log(` [NEW] "${parsed.name}" (${sitemapEntry.province}/${sitemapEntry.city})`);
    }
    if (schedules.length > 0) {
      stats.schedulesProcessed++;
      stats.massSchedulesCreated += schedules.length;
    }
    return;
  }

  if (duplicate) {
    await updateExistingChurch(duplicate, sitemapEntry, parsed, schedules, stats);
  } else {
    await createNewChurch(sitemapEntry, parsed, schedules, existingChurches, stats);
  }
}

// ─── CLI ─────────────────────────────────────────────────────────────────────

/** Print a CLI usage error and terminate with exit code 1. */
function exitWithUsageError(message: string): never {
  console.error(`Error: ${message}`);
  process.exit(1);
}

/**
 * Parse process.argv into CLIArgs.
 * Exits with code 0 after printing help and with code 1 on invalid usage.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
    geocode: false,
    geocodeOnly: false,
  };

  let i = 0;
  // Consume the value that follows a flag, rejecting a missing value or
  // another flag in its place.
  const takeValue = (flag: string): string => {
    const value = args[++i];
    if (value === undefined || value.startsWith('--')) {
      exitWithUsageError(`${flag} requires a value`);
    }
    return value;
  };

  for (; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--province':
        result.province = takeValue('--province');
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--geocode':
        result.geocode = true;
        break;
      case '--geocode-only':
        result.geocodeOnly = true;
        break;
      case '--resume-from': {
        const n = Number(takeValue('--resume-from'));
        if (!Number.isInteger(n) || n < 0) {
          exitWithUsageError('--resume-from expects a non-negative integer');
        }
        result.resumeFrom = n;
        break;
      }
      case '--job-id':
        result.jobId = takeValue('--job-id');
        break;
      case '--help':
      case '-h':
        // NOTE(review): the <placeholder> names below were stripped from the
        // restored copy and have been reconstructed.
        console.log(`
Usage: npx tsx scripts/import-horariosmisas.ts [options]

Options:
  --all               Import all churches from sitemaps
  --province <slug>   Filter by province slug (e.g. "madrid")
  --dry-run           No database writes, just report what would happen
  --geocode           Geocode churches after import (Nominatim)
  --geocode-only      Only geocode existing churches (skip import)
  --resume-from <n>   Skip first N churches
  --job-id <id>       Background job tracking ID
  --help, -h          Show this help message

Examples:
  npx tsx scripts/import-horariosmisas.ts --all --dry-run
  npx tsx scripts/import-horariosmisas.ts --all
  npx tsx scripts/import-horariosmisas.ts --province madrid
  npx tsx scripts/import-horariosmisas.ts --all --geocode
  npx tsx scripts/import-horariosmisas.ts --geocode-only
  npx tsx scripts/import-horariosmisas.ts --all --resume-from 500
`);
        process.exit(0);
        break;
      default:
        console.warn(`Ignoring unknown argument: ${args[i]}`);
    }
  }

  if (!result.all && !result.province && !result.geocodeOnly) {
    exitWithUsageError('specify --all, --province <slug>, or --geocode-only');
  }

  return result;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/** Format a millisecond duration as "1h 2m 3s", "2m 3s" or "3s". */
function formatDuration(ms: number): string {
  const seconds = Math.floor(ms / 1000);
  const minutes = Math.floor(seconds / 60);
  const hours = Math.floor(minutes / 60);

  if (hours > 0) return `${hours}h ${minutes % 60}m ${seconds % 60}s`;
  if (minutes > 0) return `${minutes}m ${seconds % 60}s`;
  return `${seconds}s`;
}

// ─── Main ────────────────────────────────────────────────────────────────────

/** Run the import and/or geocoding phases selected on the command line. */
async function main(): Promise<void> {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('HORARIOSMISAS.COM (SPAIN) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.geocodeOnly ? 'Geocode only' : args.all ? 'All churches from sitemaps' : `Province: ${args.province}`}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  console.log(`Geocode: ${args.geocode || args.geocodeOnly ? 'YES' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (error) {
      // Job might not exist yet — tracking is best effort and never blocks the import.
      console.warn(`Could not mark job ${args.jobId} as running: ${error instanceof Error ? error.message : error}`);
    }
  }

  const stats: ImportStats = {
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    geocoded: 0,
    geocodeFailed: 0,
    errors: 0,
  };

  if (!args.geocodeOnly) {
    // Load existing Spanish churches for dedup
    const existingChurches = await loadExistingSpanishChurches();

    // Fetch church URLs from sitemaps
    const allChurches = await fetchChurchUrlsFromSitemaps();

    // Filter by province if specified
    let churchesToProcess = allChurches;
    if (args.province) {
      churchesToProcess = allChurches.filter((c) => c.province === args.province);
      console.log(`Filtered to ${churchesToProcess.length} churches in province "${args.province}"\n`);
    } else {
      console.log(`Processing ${churchesToProcess.length} churches\n`);
    }

    // Handle --resume-from
    if (args.resumeFrom) {
      const before = churchesToProcess.length;
      churchesToProcess = churchesToProcess.slice(args.resumeFrom);
      console.log(`Resuming from index ${args.resumeFrom} (skipping ${before - churchesToProcess.length} churches)\n`);
    }

    // Process each church; one failure is counted and never stops the run.
    for (const [index, church] of churchesToProcess.entries()) {
      const elapsed = formatDuration(Date.now() - startTime);
      console.log(`[${index + 1}/${churchesToProcess.length}] ${church.province}/${church.city}/${church.slug} [${elapsed} elapsed]`);

      try {
        await processChurch(church, existingChurches, args.dryRun, stats);
      } catch (error) {
        stats.errors++;
        console.error(` ERROR processing ${church.slug}: ${error instanceof Error ? error.message : error}`);
      }
    }
  }

  // Geocode phase
  if (args.geocode || args.geocodeOnly) {
    await geocodeUnmatchedChurches(args.dryRun, stats);
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  if (args.geocode || args.geocodeOnly) {
    console.log(`Geocoded: ${stats.geocoded}`);
    console.log(`Geocode failed: ${stats.geocodeFailed}`);
  }
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      // Best effort: the import itself has already finished.
      console.warn(`Could not mark job ${args.jobId} as completed: ${error instanceof Error ? error.message : error}`);
    }
  }
}

main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit(): exiting here would
    // terminate the process before the cleanup in finally() gets to run.
    process.exitCode = 1;
  })
  .finally(async () => {
    try {
      await prisma.$disconnect();
      await pool.end();
    } catch (error) {
      console.error('Error while closing database connections:', error);
      process.exitCode = 1;
    }
  });