#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from the BuscarMisas network.
 *
 * A group of 5 identical WordPress-based directories covering Latin America:
 *   - horariosmissa.com.br  (Brazil,    ~4,732 churches)
 *   - buscarmisas.com.mx    (Mexico,    ~3,950 churches)
 *   - horariosmisa.com.ar   (Argentina, ~3,012 churches)
 *   - buscarmisas.co        (Colombia,  ~2,665 churches)
 *   - horariomisa.cl        (Chile,     ~935 churches)
 *
 * Usage:
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --resume-from 500
 *   npx tsx scripts/import-buscarmisas-network.ts --all
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --dry-run
 *
 * Additional flags handled by the CLI parser:
 *   --limit <n>     process at most n not-yet-imported URLs per domain
 *   --job-id <id>   backgroundJob row to update with progress/status
 */

import dotenv from 'dotenv';
import path from 'path';

// .env.local is loaded first; with dotenv's default (non-overriding) behaviour
// its values take precedence over those in .env.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
import { getDayNamesForCountry, buildDayPatterns } from '../src/scrapers/i18n/day-names';

const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Mask the password portion of the connection string before logging it.
console.log(`Connecting to: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);

const pool = new Pool({
  connectionString: dbUrl,
  // NOTE(review): TLS certificate verification is disabled whenever the URL
  // contains "neon" — confirm this is intentional for that provider.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// ─── Site Config ─────────────────────────────────────────────────────────────

/** Per-site settings that drive parsing and sitemap discovery. */
interface SiteConfig {
  country: string; // ISO 3166-1 alpha-2
  language: 'pt' | 'es';
  sitemapType: 'page' | 'post';
}

const NETWORK_SITES: Record<string, SiteConfig> = {
  'horariosmissa.com.br': { country: 'BR', language: 'pt', sitemapType: 'page' },
  'buscarmisas.com.mx': { country: 'MX', language: 'es', sitemapType: 'page' },
  'horariosmisa.com.ar': { country: 'AR', language: 'es', sitemapType: 'page' },
  'buscarmisas.co': { country: 'CO', language: 'es', sitemapType: 'page' },
  'horariomisa.cl': { country: 'CL', language: 'es', sitemapType: 'post' },
};

/** True only for domains declared in NETWORK_SITES (ignores inherited keys such as "constructor"). */
function isKnownDomain(domain: string): boolean {
  return Object.prototype.hasOwnProperty.call(NETWORK_SITES, domain);
}

// ─── Types ────────────────────────────────────────────────────────────────────

interface ParsedChurch {
  name: string;
  address: string | null;
  city: string | null;
  state: string | null;
  phone: string | null;
  lat: number;
  lng: number;
  externalId: string;
  country: string;
}

interface ParsedMass {
  dayOfWeek: number; // 0 = Sunday, 6 = Saturday
  time: string; // HH:MM 24-hour
}

interface CLIArgs {
  domain: string | null;
  all: boolean;
  dryRun: boolean;
  resumeFrom: number;
  limit: number | null;
  jobId: string | null;
}

interface ImportStats {
  total: number;
  created: number;
  updated: number;
  skipped: number;
  errors: number;
  massSchedulesCreated: number;
}

/** Fresh, zeroed counters for one import run. */
function emptyStats(): ImportStats {
  return { total: 0, created: 0, updated: 0, skipped: 0, errors: 0, massSchedulesCreated: 0 };
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

// Map (not a plain object) so that lookups like "constructor" cannot hit Object.prototype.
const NAMED_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/**
 * Decode numeric HTML entities (decimal and hex) and a handful of common named
 * ones. Unknown entities are left untouched.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : entity;
    }
    return NAMED_ENTITIES.get(body.toLowerCase()) ?? entity;
  });
}

/** Language label stored on mass-schedule rows for a given site. */
function massLanguageLabel(config: SiteConfig): string {
  return config.language === 'pt' ? 'Portuguese' : 'Spanish';
}

/** Turn a URL slug ("sao-paulo") into a display label ("sao paulo"), percent-decoding when possible. */
function slugToLabel(slug: string): string {
  const spaced = slug.replace(/-/g, ' ');
  try {
    return decodeURIComponent(spaced);
  } catch {
    // Malformed percent-escape: keep the undecoded text rather than failing the whole page.
    return spaced;
  }
}

/**
 * Extract the text of the table cell that follows a bold label cell.
 *
 * NOTE(review): assumes markup of the form
 * `<td><strong>LABEL</strong></td><td>VALUE</td>` — confirm against a live page.
 *
 * @param html         full page HTML
 * @param labelPattern regex source matching the label text (e.g. `Endere[çc]o`)
 * @returns the trimmed, entity-decoded cell text, or null when absent/empty
 */
function extractLabelledCell(html: string, labelPattern: string): string | null {
  const cellRegex = new RegExp(
    `${labelPattern}\\s*:?\\s*<\\/strong>\\s*<\\/td>\\s*<td[^>]*>([^<]+)<\\/td>`,
    'i',
  );
  const raw = cellRegex.exec(html)?.[1];
  if (raw === undefined) return null;
  const text = decodeHtmlEntities(raw).trim();
  return text === '' ? null : text;
}

/**
 * Build external ID for a church URL.
 * Format: "{domain-slug}/{church-slug}"
 * e.g. "horariosmissa-com-br/paroquia-nossa-senhora-dos-remedios"
 */
export function buildExternalId(domain: string, churchUrl: string): string {
  const domainSlug = domain.replace(/\./g, '-');
  // URL path: /{region}/{city}/{church-slug}/
  const segments = churchUrl.replace(/\/$/, '').split('/').filter(Boolean);
  const churchSlug = segments[segments.length - 1] || '';
  return `${domainSlug}/${churchSlug}`;
}

/**
 * Parse church data from a church page HTML string.
 * Returns null if name or coordinates cannot be extracted.
 *
 * @param html      full HTML of the church page
 * @param domain    site domain, used for the external ID
 * @param churchUrl absolute URL of the page; state/city are derived from its path
 * @param config    site configuration (language selects the label names)
 */
export function parseChurchPage(
  html: string,
  domain: string,
  churchUrl: string,
  config: SiteConfig,
): ParsedChurch | null {
  // Name: cell after "Nome" (PT) or "Nombre" (ES)
  const name = extractLabelledCell(html, config.language === 'pt' ? 'Nome' : 'Nombre');
  if (!name) return null;

  // Coordinates: Google Maps iframe center= parameter ("lat%2Clng" or "lat,lng")
  const coordMatch = /center=([-\d.]+)(?:%2C|,)([-\d.]+)/i.exec(html);
  if (!coordMatch) return null;
  const lat = Number(coordMatch[1]);
  const lng = Number(coordMatch[2]);
  if (!Number.isFinite(lat) || !Number.isFinite(lng) || Math.abs(lat) > 90 || Math.abs(lng) > 180) {
    return null;
  }

  // Address: cell after "Endereço" (PT) or "Dirección" (ES)
  const address = extractLabelledCell(
    html,
    config.language === 'pt' ? 'Endere[çc]o' : 'Direcci[oó]n',
  );

  // Phone: first tel: href on the page
  const phone = /href="tel:([^"]+)"/i.exec(html)?.[1]?.trim() ?? null;

  // City and state from URL path segments
  // URL form: https://{domain}/{state}/{city}/{slug}/
  let state: string | null = null;
  let city: string | null = null;
  try {
    const [stateSlug, citySlug] = new URL(churchUrl).pathname.split('/').filter(Boolean);
    state = stateSlug ? slugToLabel(stateSlug) : null;
    city = citySlug ? slugToLabel(citySlug) : null;
  } catch {
    // Not an absolute URL: leave state/city unknown instead of discarding the church.
  }

  return {
    name,
    address,
    city,
    state,
    phone,
    lat,
    lng,
    externalId: buildExternalId(domain, churchUrl),
    country: config.country,
  };
}

/** Validate an "H:MM"/"HH:MM" string and return it zero-padded, or null when invalid. */
function normalizeTime(raw: string): string | null {
  const match = /^(\d{1,2}):(\d{2})$/.exec(raw.trim());
  if (!match) return null;
  const hours = Number(match[1]);
  const minutes = Number(match[2]);
  if (hours > 23 || minutes > 59) return null;
  return `${String(hours).padStart(2, '0')}:${match[2]}`;
}

/**
 * Parse the weekly mass schedule table from church page HTML.
 * Table format: day-name cell | time cell (comma-separated times, "-" = no mass)
 *
 * Every `<td>` in the page is scanned; a cell whose text is a known day name
 * is paired with the cell immediately after it. Anchoring on the day cell
 * (instead of a fixed stride of two) keeps the pairing correct when other
 * tables contribute an odd number of cells before the schedule.
 *
 * @param html        full HTML of the church page
 * @param countryCode ISO country code used to pick the day-name vocabulary
 * @returns one entry per (day, time); times are zero-padded HH:MM
 */
export function parseMassSchedule(html: string, countryCode: string): ParsedMass[] {
  const dayPatterns = buildDayPatterns(getDayNamesForCountry(countryCode));
  const results: ParsedMass[] = [];

  // Plain text of every table cell, in document order.
  const cells = [...html.matchAll(/<td[^>]*>(.*?)<\/td>/gis)].map(m =>
    decodeHtmlEntities(m[1].replace(/<[^>]+>/g, '')).trim(),
  );

  let i = 0;
  while (i + 1 < cells.length) {
    const dayOfWeek = dayPatterns[cells[i].toLowerCase()];
    // typeof guard: also rejects inherited properties if dayPatterns is a plain object.
    if (typeof dayOfWeek !== 'number') {
      i += 1;
      continue;
    }

    const timeCell = cells[i + 1];
    i += 2;
    if (!timeCell || timeCell === '-') continue;

    // Split comma-separated times: "10:00, 18:00" → ["10:00", "18:00"]
    for (const rawTime of timeCell.split(',')) {
      const time = normalizeTime(rawTime);
      if (time !== null) results.push({ dayOfWeek, time });
    }
  }

  return results;
}

// ─── HTTP Helpers ─────────────────────────────────────────────────────────────

const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
const REQUEST_DELAY_MS = 2_000;
const DOMAIN_DELAY_MS = 5_000;
const REQUEST_TIMEOUT_MS = 30_000;
/** HTTP statuses worth retrying after a back-off. */
const RETRYABLE_STATUSES = new Set([429, 502, 503, 504]);

/** Non-2xx HTTP response; carries the status so callers need not parse the message. */
class HttpError extends Error {
  readonly status: number;

  constructor(status: number, url: string) {
    super(`HTTP ${status} for ${url}`);
    this.name = 'HttpError';
    this.status = status;
  }
}

/**
 * GET a URL and return the response body as text.
 * @throws HttpError on a non-2xx response; fetch's own error on network failure or timeout
 */
async function fetchText(url: string): Promise<string> {
  const res = await fetch(url, {
    headers: { 'User-Agent': USER_AGENT },
    signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
  });
  if (!res.ok) throw new HttpError(res.status, url);
  return res.text();
}

/** Rate-limit/gateway statuses and non-HTTP failures (network, timeout) are treated as transient. */
function isRetryableError(err: unknown): boolean {
  if (err instanceof HttpError) return RETRYABLE_STATUSES.has(err.status);
  return true;
}

/**
 * Fetch a URL, retrying transient failures with a linear back-off.
 *
 * @param url     URL to fetch
 * @param retries maximum number of attempts (default 3)
 * @throws the last error once attempts are exhausted, or immediately for
 *         non-retryable HTTP statuses (e.g. 404)
 */
async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      return await fetchText(url);
    } catch (err) {
      if (attempt === retries || !isRetryableError(err)) throw err;
      const msg = err instanceof Error ? err.message : String(err);
      const backoff = attempt * 30_000; // 30s, 60s, 90s
      console.warn(`  [retry ${attempt}/${retries}] ${msg} — waiting ${backoff / 1000}s`);
      await sleep(backoff);
    }
  }
  // Only reachable when retries < 1.
  throw new Error('unreachable');
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// ─── DB Helpers ───────────────────────────────────────────────────────────────

/** Load every church of a country with the fields passed to the duplicate matcher. */
async function loadExistingChurches(country: string): Promise<ExistingChurch[]> {
  console.log(`Loading existing ${country} churches from DB...`);
  const churches = await prisma.church.findMany({
    where: { country },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      buscarmisasNetworkId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });
  console.log(`  Loaded ${churches.length} existing ${country} churches`);
  return churches as ExistingChurch[];
}

/** Build massSchedule rows for one church. */
function toScheduleRows(churchId: ExistingChurch['id'], masses: ParsedMass[], config: SiteConfig) {
  const language = massLanguageLabel(config);
  return masses.map(m => ({
    churchId,
    dayOfWeek: m.dayOfWeek,
    time: m.time,
    language,
    notes: null,
  }));
}

// ─── Church Processing ────────────────────────────────────────────────────────

/**
 * Fetch, parse and persist a single church page.
 *
 * A church matched by `findDuplicateChurch` is updated in place: it is linked
 * to this network, missing phone/coordinates are filled in, and its schedules
 * are replaced when the page lists any. Otherwise a new church is created.
 * All failures are counted in `stats.errors` and logged; nothing is thrown.
 *
 * @param existingChurches in-memory matcher cache; mutated to reflect writes
 * @param stats            counters, mutated in place
 */
async function processChurch(
  url: string,
  domain: string,
  config: SiteConfig,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
): Promise<void> {
  stats.total++;
  try {
    const html = await fetchWithRetry(url);
    const parsed = parseChurchPage(html, domain, url, config);
    if (!parsed) {
      console.log(`  [skip] No name/coords: ${url}`);
      stats.skipped++;
      return;
    }

    const masses = parseMassSchedule(html, config.country);

    if (args.dryRun) {
      console.log(`  [dry-run] ${parsed.name} — ${masses.length} masses`);
      return;
    }

    const candidate = {
      name: parsed.name,
      lat: parsed.lat,
      lng: parsed.lng,
      buscarmisasNetworkId: parsed.externalId,
    };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (duplicate) {
      const updateData: Record<string, unknown> = {
        buscarmisasNetworkId: parsed.externalId,
        lastScrapedAt: new Date(),
      };
      if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
      // Only overwrite coordinates that were never set (stored as 0).
      if (parsed.lat !== 0 && duplicate.latitude === 0) {
        updateData.latitude = parsed.lat;
        updateData.longitude = parsed.lng;
      }

      await prisma.$transaction(async (tx) => {
        await tx.church.update({ where: { id: duplicate.id }, data: updateData });
        // Replace schedules only when the page yielded some, so an unparseable
        // page never wipes existing data.
        if (masses.length > 0) {
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({ data: toScheduleRows(duplicate.id, masses, config) });
        }
      });

      duplicate.buscarmisasNetworkId = parsed.externalId;
      stats.updated++;
    } else {
      // Church and schedules are written atomically: a church row without its
      // schedules would be skipped as "already imported" on every later run.
      const church = await prisma.$transaction(async (tx) => {
        const created = await tx.church.create({
          data: {
            name: parsed.name,
            address: parsed.address,
            city: parsed.city,
            state: parsed.state,
            country: parsed.country,
            phone: parsed.phone,
            latitude: parsed.lat,
            longitude: parsed.lng,
            buscarmisasNetworkId: parsed.externalId,
            source: 'buscarmisas-network',
            hasWebsite: false,
            // lastScrapedAt is only stamped when schedules were actually found.
            ...(masses.length > 0 ? { lastScrapedAt: new Date() } : {}),
          },
        });
        if (masses.length > 0) {
          await tx.massSchedule.createMany({ data: toScheduleRows(created.id, masses, config) });
        }
        return created;
      });

      // Keep the in-memory cache current so later pages can match this church.
      existingChurches.push({
        id: church.id,
        name: parsed.name,
        latitude: parsed.lat,
        longitude: parsed.lng,
        osmId: null,
        baiduId: null,
        masstimesId: null,
        orarimesseId: null,
        massSchedulesPhId: null,
        philmassId: null,
        horariosMisasId: null,
        mszeInfoId: null,
        weekdayMassesId: null,
        messesInfoId: null,
        bohosluzbyId: null,
        miserendId: null,
        kerknetId: null,
        gottesdienstzeitenId: null,
        discovermassId: null,
        buscarmisasNetworkId: parsed.externalId,
        source: 'buscarmisas-network',
        website: null,
        phone: parsed.phone,
        address: parsed.address,
        country: parsed.country,
      });

      stats.created++;
    }

    stats.massSchedulesCreated += masses.length;
    console.log(
      `  [${duplicate ? 'update' : 'create'}] ${parsed.name} — ${masses.length} masses — ` +
        `${stats.total} total (${stats.created}↑ ${stats.updated}↻ ${stats.errors}✗)`
    );
  } catch (err) {
    stats.errors++;
    console.error(`  [error] ${url}: ${err instanceof Error ? err.message : err}`);
  }
}

// ─── Sitemap Discovery ────────────────────────────────────────────────────────

/**
 * Fetch all church page URLs for a domain from its sitemap.
 * Church URLs have exactly 3 path segments: /{region}/{city}/{slug}/
 *
 * @returns de-duplicated URLs in sitemap order
 * @throws when the sitemap index or a child sitemap cannot be fetched
 */
export async function getChurchUrls(domain: string, config: SiteConfig): Promise<string[]> {
  const indexUrl = `https://${domain}/sitemap_index.xml`;
  console.log(`Fetching sitemap index: ${indexUrl}`);
  const indexXml = await fetchWithRetry(indexUrl);

  // Extract child sitemap URLs matching the sitemapType. The optional number
  // covers paginated sitemaps (page-sitemap2.xml, post-sitemap2.xml, …).
  const childPattern = config.sitemapType === 'page'
    ? /https:\/\/[^<]*\/page-sitemap\d*\.xml/g
    : /https:\/\/[^<]*\/post-sitemap\d*\.xml/g;
  const childUrls = [...indexXml.matchAll(childPattern)].map(m => m[0]);
  console.log(`  Found ${childUrls.length} child sitemaps`);

  const churchUrls: string[] = [];
  for (const sitemapUrl of childUrls) {
    const xml = await fetchWithRetry(sitemapUrl);
    // XML escapes "&" as "&amp;", so decode before treating the text as a URL.
    const locs = [...xml.matchAll(/<loc>([^<]+)<\/loc>/g)].map(m => decodeHtmlEntities(m[1].trim()));
    for (const loc of locs) {
      // Church URLs: exactly 3 non-empty path segments after the domain
      try {
        const segments = new URL(loc).pathname.split('/').filter(Boolean);
        if (segments.length === 3) churchUrls.push(loc);
      } catch {
        /* skip malformed URLs */
      }
    }
  }

  // Deduplicate
  const unique = [...new Set(churchUrls)];
  console.log(`  Total church URLs: ${unique.length}`);
  return unique;
}

// ─── CLI ──────────────────────────────────────────────────────────────────────

/** Parse a non-negative decimal integer; NaN for anything else (including a missing value). */
function parseNonNegativeInt(raw: string | undefined): number {
  return raw !== undefined && /^\d+$/.test(raw) ? Number(raw) : NaN;
}

/**
 * Read flags from process.argv. Malformed numeric values are stored as NaN and
 * rejected later by validateArgs; unknown flags are reported and ignored.
 */
function parseCLIArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = {
    domain: null,
    all: false,
    dryRun: false,
    resumeFrom: 0,
    limit: null,
    jobId: null,
  };
  for (let i = 0; i < argv.length; i++) {
    switch (argv[i]) {
      case '--domain':
        result.domain = argv[++i] ?? null;
        break;
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = parseNonNegativeInt(argv[++i]);
        break;
      case '--limit':
        result.limit = parseNonNegativeInt(argv[++i]);
        break;
      case '--job-id':
        result.jobId = argv[++i] ?? null;
        break;
      default:
        console.warn(`Ignoring unknown argument: ${argv[i]}`);
    }
  }
  return result;
}

/** Exit with status 1 and a message when the arguments are unusable. */
function validateArgs(args: CLIArgs): void {
  if (!args.domain && !args.all) {
    console.error('Usage:');
    console.error('  npx tsx scripts/import-buscarmisas-network.ts --domain <domain>');
    console.error('  npx tsx scripts/import-buscarmisas-network.ts --all');
    console.error('\nValid domains:', Object.keys(NETWORK_SITES).join(', '));
    process.exit(1);
  }
  if (args.domain && !isKnownDomain(args.domain)) {
    console.error(`Unknown domain: ${args.domain}`);
    console.error('Valid domains:', Object.keys(NETWORK_SITES).join(', '));
    process.exit(1);
  }
  if (!Number.isInteger(args.resumeFrom) || args.resumeFrom < 0) {
    console.error('--resume-from requires a non-negative integer.');
    process.exit(1);
  }
  if (args.limit !== null && (!Number.isInteger(args.limit) || args.limit < 0)) {
    console.error('--limit requires a non-negative integer.');
    process.exit(1);
  }
  if (args.all && args.resumeFrom > 0) {
    console.error('--resume-from cannot be used with --all. Use --domain to resume a specific site.');
    process.exit(1);
  }
}

/**
 * Import one domain: discover URLs, skip those already linked to a church,
 * then process the remainder sequentially with a delay between requests.
 */
async function runDomain(domain: string, config: SiteConfig, args: CLIArgs): Promise<ImportStats> {
  const stats = emptyStats();

  const allUrls = await getChurchUrls(domain, config);
  const existingChurches = await loadExistingChurches(config.country);

  // Build set of already-imported IDs for fast skip
  const importedIds = new Set<string>();
  for (const church of existingChurches) {
    if (church.buscarmisasNetworkId) importedIds.add(church.buscarmisasNetworkId);
  }

  // --resume-from indexes into the full sitemap list, before the skip filter.
  let candidateUrls = allUrls
    .slice(args.resumeFrom)
    .filter(url => !importedIds.has(buildExternalId(domain, url)));
  if (args.limit !== null) candidateUrls = candidateUrls.slice(0, args.limit);

  console.log(`\n${domain}: ${allUrls.length} total | ${importedIds.size} already imported | ${candidateUrls.length} to process\n`);

  for (let i = 0; i < candidateUrls.length; i++) {
    const url = candidateUrls[i];
    console.log(`[${i + 1}/${candidateUrls.length}] ${url}`);
    await processChurch(url, domain, config, existingChurches, args, stats);
    // Throttle between requests; no need to wait after the last one.
    if (i < candidateUrls.length - 1) await sleep(REQUEST_DELAY_MS);
  }

  return stats;
}

// ─── Main ─────────────────────────────────────────────────────────────────────

/**
 * Entry point: run the selected domain(s), print a summary, record the outcome
 * on the background job (when --job-id is given) and close DB connections.
 */
async function main(): Promise<void> {
  const args = parseCLIArgs();
  validateArgs(args);

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch {
      /* job may not exist yet */
    }
  }

  const domainsToRun: [string, SiteConfig][] = args.all
    ? Object.entries(NETWORK_SITES)
    : [[args.domain!, NETWORK_SITES[args.domain!]]];

  const totalStats = emptyStats();
  // Set when the run aborts with an exception (e.g. sitemap fetch failure), so
  // the job is not recorded as "completed" merely because no per-church errors
  // were counted.
  let aborted = false;

  try {
    for (let d = 0; d < domainsToRun.length; d++) {
      const [domain, config] = domainsToRun[d];
      console.log(`\n${'─'.repeat(60)}`);
      console.log(`Domain ${d + 1}/${domainsToRun.length}: ${domain} (${config.country})`);
      console.log('─'.repeat(60));

      const stats = await runDomain(domain, config, args);
      totalStats.total += stats.total;
      totalStats.created += stats.created;
      totalStats.updated += stats.updated;
      totalStats.skipped += stats.skipped;
      totalStats.errors += stats.errors;
      totalStats.massSchedulesCreated += stats.massSchedulesCreated;

      if (d < domainsToRun.length - 1) await sleep(DOMAIN_DELAY_MS);
    }
  } catch (err) {
    aborted = true;
    throw err;
  } finally {
    console.log('\n─── Import Complete ───────────────────────────────────────');
    console.log(`Total processed: ${totalStats.total}`);
    console.log(`Created: ${totalStats.created}`);
    console.log(`Updated: ${totalStats.updated}`);
    console.log(`Skipped: ${totalStats.skipped}`);
    console.log(`Errors: ${totalStats.errors}`);
    console.log(`Mass schedules: ${totalStats.massSchedulesCreated}`);

    if (args.jobId) {
      // Failed when the run aborted or more than 10% of processed URLs errored.
      const status = aborted || totalStats.errors > totalStats.total * 0.1 ? 'failed' : 'completed';
      try {
        await prisma.backgroundJob.update({
          where: { id: args.jobId },
          data: {
            status,
            completedAt: new Date(),
            processed: totalStats.total,
            succeeded: totalStats.created + totalStats.updated,
            failed: totalStats.errors,
            itemsFound: totalStats.massSchedulesCreated,
          },
        });
      } catch (err) {
        // Best-effort: the import outcome must not depend on the job row.
        console.warn(`Could not update job ${args.jobId}: ${err instanceof Error ? err.message : err}`);
      }
    }

    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}

main().catch(err => {
  console.error('Fatal error:', err);
  process.exit(1);
});