#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from the BuscarMisas network.
 *
 * A group of 5 identical WordPress-based directories covering Latin America:
 *   - horariosmissa.com.br  (Brazil,    ~4,732 churches)
 *   - buscarmisas.com.mx    (Mexico,    ~3,950 churches)
 *   - horariosmisa.com.ar   (Argentina, ~3,012 churches)
 *   - buscarmisas.co        (Colombia,  ~2,665 churches)
 *   - horariomisa.cl        (Chile,     ~935 churches)
 *
 * Usage:
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --resume-from 500
 *   npx tsx scripts/import-buscarmisas-network.ts --all
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --dry-run
 */

import dotenv from 'dotenv';
import path from 'path';
// .env.local is loaded first; by default dotenv does not overwrite variables
// that are already set, so its values take precedence over those in .env.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
import { getDayNamesForCountry, buildDayPatterns } from '../src/scrapers/i18n/day-names';

// ─── Site Config ─────────────────────────────────────────────────────────────

/** Per-domain settings that differ between the otherwise identical sites. */
interface SiteConfig {
  country: string; // ISO 3166-1 alpha-2
  language: 'pt' | 'es';
  /** Which sitemap family lists the church pages: `page-sitemap*.xml` or `post-sitemap*.xml`. */
  sitemapType: 'page' | 'post';
}

/** Supported sites, keyed by domain name. */
const NETWORK_SITES: Record<string, SiteConfig> = {
  'horariosmissa.com.br': { country: 'BR', language: 'pt', sitemapType: 'page' },
  'buscarmisas.com.mx': { country: 'MX', language: 'es', sitemapType: 'page' },
  'horariosmisa.com.ar': { country: 'AR', language: 'es', sitemapType: 'page' },
  'buscarmisas.co': { country: 'CO', language: 'es', sitemapType: 'page' },
  'horariomisa.cl': { country: 'CL', language: 'es', sitemapType: 'post' },
};

// ─── Types ────────────────────────────────────────────────────────────────────

/** Church record extracted from a single church page. */
interface ParsedChurch {
  name: string;
  address: string | null;
  city: string | null;
  state: string | null;
  phone: string | null;
  lat: number;
  lng: number;
  externalId: string;
  country: string;
}

/** One mass occurrence in the weekly schedule. */
interface ParsedMass {
  dayOfWeek: number; // 0 = Sunday, 6 = Saturday
  time: string; // HH:MM 24-hour
}

/** Parsed command-line options (see the usage notes in the file header). */
interface CLIArgs {
  domain: string | null;
  all: boolean;
  dryRun: boolean;
  resumeFrom: number;
  limit: number | null;
  jobId: string | null;
}

/** Counters accumulated over an import run. */
interface ImportStats {
  total: number;
  created: number;
  updated: number;
  skipped: number;
  errors: number;
  massSchedulesCreated: number;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/**
 * Build external ID for a church URL.
 * Format: "{domain-slug}/{church-slug}"
 * e.g. "horariosmissa-com-br/paroquia-nossa-senhora-dos-remedios"
 *
 * @param domain    Site domain; every "." is replaced by "-".
 * @param churchUrl Church page URL or path; its last non-empty "/"-separated
 *                  segment is used as the church slug (empty string if none).
 */
export function buildExternalId(domain: string, churchUrl: string): string {
  const domainSlug = domain.replace(/\./g, '-');
  // URL path: /{region}/{city}/{church-slug}/
  const segments = churchUrl.replace(/\/$/, '').split('/').filter(Boolean);
  const churchSlug = segments[segments.length - 1] || '';
  return `${domainSlug}/${churchSlug}`;
}

/**
 * Return the trimmed text of the table cell that follows a bold label cell,
 * i.e. `…LABEL</strong></td><td>VALUE</td>`, or null when absent or empty.
 *
 * `labelPattern` is inserted into a regular expression verbatim, so it may
 * contain character classes such as `Endere[çc]o`.
 */
function extractLabeledCell(html: string, labelPattern: string): string | null {
  const match = html.match(
    new RegExp(
      // The opening <td …> of the value cell is optional so that input whose
      // tags were already stripped between the two cells still matches.
      `${labelPattern}\\s*<\\/strong>\\s*<\\/td>\\s*(?:<td[^>]*>\\s*)?([^<]+)<\\/td>`,
      'i',
    ),
  );
  const value = match?.[1]?.trim();
  return value ? value : null;
}

/**
 * Turn a URL path segment such as "sao-paulo" into display text ("sao paulo").
 * Falls back to the undecoded text when the segment contains a malformed
 * percent-escape, because decodeURIComponent throws URIError in that case.
 */
function decodePathSegment(segment: string): string {
  const spaced = segment.replace(/-/g, ' ');
  try {
    return decodeURIComponent(spaced);
  } catch {
    return spaced;
  }
}

/**
 * Parse church data from a church page HTML string.
 * Returns null if name or coordinates cannot be extracted.
 *
 * @param html      Raw HTML of the church page.
 * @param domain    Site domain, used for the external ID.
 * @param churchUrl Absolute URL of the page; state and city are taken from its
 *                  first two path segments.
 * @param config    Site configuration (label language and country code).
 * @throws TypeError if `churchUrl` is not a valid absolute URL (from `new URL`).
 */
export function parseChurchPage(
  html: string,
  domain: string,
  churchUrl: string,
  config: SiteConfig,
): ParsedChurch | null {
  const isPortuguese = config.language === 'pt';

  // Name: cell after Nome (PT) or Nombre (ES)
  const name = extractLabeledCell(html, isPortuguese ? 'Nome' : 'Nombre');
  if (!name) return null;

  // Coordinates: Google Maps iframe center= parameter, "lat,lng" with the
  // comma either percent-encoded (%2C) or literal.
  const coordMatch = html.match(/center=([-\d.]+)(?:%2C|,)([-\d.]+)/i);
  if (!coordMatch) return null;
  const lat = parseFloat(coordMatch[1]);
  const lng = parseFloat(coordMatch[2]);
  if (
    !Number.isFinite(lat) ||
    !Number.isFinite(lng) ||
    Math.abs(lat) > 90 ||
    Math.abs(lng) > 180
  ) {
    return null;
  }

  // Address: cell after Endereço (PT) or Dirección (ES); accents are optional.
  const address = extractLabeledCell(html, isPortuguese ? 'Endere[çc]o' : 'Direcci[oó]n');

  // Phone: first tel: href on the page
  const phoneMatch = html.match(/href="tel:([^"]+)"/i);
  const phone = phoneMatch?.[1]?.trim() || null;

  // City and state from URL path segments
  // URL form: https://{domain}/{state}/{city}/{slug}/
  const pathSegments = new URL(churchUrl).pathname.split('/').filter(Boolean);
  const state = pathSegments[0] ? decodePathSegment(pathSegments[0]) : null;
  const city = pathSegments[1] ? decodePathSegment(pathSegments[1]) : null;

  return {
    name,
    address,
    city,
    state,
    phone,
    lat,
    lng,
    externalId: buildExternalId(domain, churchUrl),
    country: config.country,
  };
}

/**
 * Parse the weekly mass schedule table from church page HTML.
 * Table format: day-name cell | time cell (comma-separated times, "-" = no mass)
 *
 * Every <td> whose lower-cased text is a known day name is treated as a day
 * cell and the <td> immediately after it as its time cell. Entries that are
 * not valid H:MM / HH:MM 24-hour times are ignored; accepted times are
 * zero-padded to HH:MM.
 *
 * @param html        Raw HTML of the church page.
 * @param countryCode Country code handed to `getDayNamesForCountry` to select
 *                    the day-name vocabulary.
 */
export function parseMassSchedule(html: string, countryCode: string): ParsedMass[] {
  const dayPatterns = buildDayPatterns(getDayNamesForCountry(countryCode));
  const results: ParsedMass[] = [];

  // Text content of every table cell, in document order, with inner tags removed.
  const cells = [...html.matchAll(/<td[^>]*>(.*?)<\/td>/gis)].map(m =>
    m[1].replace(/<[^>]+>/g, '').trim()
  );

  // Slide over the cells one at a time instead of pairing by index parity, so
  // a stray odd cell earlier in the page cannot shift every day/time pair.
  for (let i = 0; i + 1 < cells.length; i++) {
    const dayOfWeek = dayPatterns[cells[i].toLowerCase()];
    // typeof check also rejects inherited keys such as "constructor".
    if (typeof dayOfWeek !== 'number') continue;

    const timeCell = cells[i + 1];
    if (timeCell === '-' || !timeCell) continue;

    // Split comma-separated times: "10:00, 18:00" → ["10:00", "18:00"]
    for (const rawTime of timeCell.split(',')) {
      const parts = /^(\d{1,2}):(\d{2})$/.exec(rawTime.trim());
      if (!parts) continue;
      if (Number(parts[1]) > 23 || Number(parts[2]) > 59) continue;
      results.push({ dayOfWeek, time: `${parts[1].padStart(2, '0')}:${parts[2]}` });
    }
  }

  return results;
}

// ─── HTTP Helpers ─────────────────────────────────────────────────────────────

const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Delays in milliseconds. NOTE(review): presumably the pause between requests
// and between domains in the import loop — confirm at the call sites.
const REQUEST_DELAY_MS = 2_000;
const DOMAIN_DELAY_MS = 5_000;

/**
 * GET a URL and return the response body as text.
 *
 * @throws Error with message "HTTP {status} for {url}" on a non-2xx response.
 */
async function fetchText(url: string): Promise<string> {
  const res = await fetch(url, { headers: { 'User-Agent': USER_AGENT } });
  if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`);
  return res.text();
}

/**
 * Fetch a URL as text, retrying when the server answers 429 or 503.
 *
 * Waits `attempt * 30s` before each retry (30s, 60s, …). Any other failure,
 * or a retryable failure on the last attempt, is rethrown unchanged.
 *
 * @param url     URL to fetch.
 * @param retries Maximum number of attempts; values below 1 are treated as 1.
 */
async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  const maxAttempts = Math.max(1, Math.floor(retries));
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fetchText(url);
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      // Anchor on the status prefix produced by fetchText: the message also
      // contains the URL, which may itself include "429" or "503".
      const isRetryable = /^HTTP (?:429|503) /.test(msg);
      if (attempt >= maxAttempts || !isRetryable) throw err;
      const backoff = attempt * 30_000; // linear back-off: 30s, 60s, …
      console.warn(`  [retry ${attempt}/${maxAttempts}] ${msg} — waiting ${backoff / 1000}s`);
      await sleep(backoff);
    }
  }
  // Every loop iteration returns or throws; this only satisfies the type checker.
  throw new Error('unreachable');
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// ─── Sitemap Discovery ────────────────────────────────────────────────────────

/**
 * Fetch all church page URLs for a domain from its sitemap.
 * Church URLs have exactly 3 path segments: /{region}/{city}/{slug}/
 *
 * Reads https://{domain}/sitemap_index.xml, follows every child sitemap named
 * `{sitemapType}-sitemap.xml` or `{sitemapType}-sitemapN.xml`, and collects
 * the <loc> entries that look like church pages.
 *
 * @param domain Site domain, e.g. "horariosmissa.com.br".
 * @param config Site configuration; `sitemapType` selects the child sitemaps.
 * @returns Deduplicated church page URLs in discovery order.
 * @throws Whatever `fetchWithRetry` throws when a sitemap cannot be fetched.
 */
export async function getChurchUrls(domain: string, config: SiteConfig): Promise<string[]> {
  const indexUrl = `https://${domain}/sitemap_index.xml`;
  console.log(`Fetching sitemap index: ${indexUrl}`);
  const indexXml = await fetchWithRetry(indexUrl);

  // Extract child sitemap URLs matching the sitemapType. The optional number
  // is accepted for both types so that split sitemaps are not missed.
  const childPattern = new RegExp(
    `https:\\/\\/[^<]*\\/${config.sitemapType}-sitemap\\d*\\.xml`,
    'g',
  );
  const childUrls = [...new Set([...indexXml.matchAll(childPattern)].map(m => m[0]))];
  console.log(`  Found ${childUrls.length} child sitemaps`);

  const churchUrls: string[] = [];
  for (const sitemapUrl of childUrls) {
    const xml = await fetchWithRetry(sitemapUrl);
    const locs = [...xml.matchAll(/<loc>([^<]+)<\/loc>/g)].map(m => m[1].trim());
    for (const loc of locs) {
      // Church URLs: exactly 3 non-empty path segments after the domain
      try {
        const segments = new URL(loc).pathname.split('/').filter(Boolean);
        if (segments.length === 3) {
          churchUrls.push(loc);
        }
      } catch {
        /* skip malformed URLs */
      }
    }
  }

  // Deduplicate
  const unique = [...new Set(churchUrls)];
  console.log(`  Total church URLs: ${unique.length}`);
  return unique;
}