#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from the BuscarMisas network.
 *
 * A group of 5 identical WordPress-based directories covering Latin America:
 *   - horariosmissa.com.br  (Brazil, ~4,732 churches)
 *   - buscarmisas.com.mx    (Mexico, ~3,950 churches)
 *   - horariosmisa.com.ar   (Argentina, ~3,012 churches)
 *   - buscarmisas.co        (Colombia, ~2,665 churches)
 *   - horariomisa.cl        (Chile, ~935 churches)
 *
 * Usage:
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --resume-from 500
 *   npx tsx scripts/import-buscarmisas-network.ts --all
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --dry-run
 */

import dotenv from 'dotenv';
import path from 'path';
// `.env.local` is loaded before `.env`.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
import { getDayNamesForCountry, buildDayPatterns } from '../src/scrapers/i18n/day-names';

// ─── Site Config ─────────────────────────────────────────────────────────────

/** Per-site settings for one directory in the network. */
interface SiteConfig {
  country: string; // ISO 3166-1 alpha-2
  /** Language of the table labels on church pages ("Nome" vs. "Nombre", ...). */
  language: 'pt' | 'es';
  // NOTE(review): presumably selects which WordPress sitemap lists the church URLs — confirm against the crawler.
  sitemapType: 'page' | 'post';
}

/** Supported sites, keyed by bare domain name (the value passed to `--domain`). */
const NETWORK_SITES: Record<string, SiteConfig> = {
  'horariosmissa.com.br': { country: 'BR', language: 'pt', sitemapType: 'page' },
  'buscarmisas.com.mx': { country: 'MX', language: 'es', sitemapType: 'page' },
  'horariosmisa.com.ar': { country: 'AR', language: 'es', sitemapType: 'page' },
  'buscarmisas.co': { country: 'CO', language: 'es', sitemapType: 'page' },
  'horariomisa.cl': { country: 'CL', language: 'es', sitemapType: 'post' },
};

// ─── Types ────────────────────────────────────────────────────────────────────

/** Church record extracted from a single church page. */
interface ParsedChurch {
  name: string;
  address: string | null;
  city: string | null;
  state: string | null;
  phone: string | null;
  lat: number;
  lng: number;
  /** See {@link buildExternalId} for the format. */
  externalId: string;
  country: string;
}

/** One mass occurrence in the weekly schedule. */
interface ParsedMass {
  dayOfWeek: number; // 0 = Sunday, 6 = Saturday
  time: string; // HH:MM 24-hour
}

/**
 * Parsed command-line options. `--domain`, `--all`, `--dry-run` and
 * `--resume-from` are shown in the usage header above.
 * NOTE(review): `limit` and `jobId` presumably map to further flags — confirm
 * against the argument parser.
 */
interface CLIArgs {
  domain: string | null;
  all: boolean;
  dryRun: boolean;
  resumeFrom: number;
  limit: number | null;
  jobId: string | null;
}

/** Counters for an import run (they are updated outside this section of the file). */
interface ImportStats {
  total: number;
  created: number;
  updated: number;
  skipped: number;
  errors: number;
  massSchedulesCreated: number;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/**
 * Build external ID for a church URL.
 * Format: "{domain-slug}/{church-slug}"
 * e.g. "horariosmissa-com-br/paroquia-nossa-senhora-dos-remedios"
 *
 * @param domain    Bare site domain; every "." becomes "-".
 * @param churchUrl Church page URL (or path); its last non-empty path segment is the slug.
 * @returns The external ID; the slug part is empty when the URL has no segments.
 */
export function buildExternalId(domain: string, churchUrl: string): string {
  const domainSlug = domain.replace(/\./g, '-');
  // Drop any query string / fragment so it can never end up inside the slug.
  const pathOnly = churchUrl.split(/[?#]/, 1)[0] ?? '';
  // URL path: /{region}/{city}/{church-slug}/ — filtering empties also handles the trailing slash.
  const segments = pathOnly.split('/').filter(Boolean);
  const churchSlug = segments[segments.length - 1] ?? '';
  return `${domainSlug}/${churchSlug}`;
}

/**
 * Return the trimmed text of the table cell that follows a bold label cell,
 * i.e. `<strong>LABEL</strong></td> <td>VALUE</td>`.
 *
 * The opening `<td ...>` of the value cell is optional so that markup without
 * it (the only form the previous pattern accepted) still matches.
 *
 * @param labelPattern Regex source for the label text (may contain character classes).
 * @returns The cell text, or null when the label is absent or the cell is blank.
 */
function extractLabeledCell(html: string, labelPattern: string): string | null {
  const match = html.match(
    new RegExp(
      `${labelPattern}<\\/strong><\\/td>\\s*(?:<td\\b[^>]*>\\s*)?([^<]+)<\\/td>`,
      'i',
    ),
  );
  const value = match?.[1]?.trim();
  return value ? value : null;
}

/**
 * Split the path of a church URL into its non-empty segments.
 * Relative URLs are resolved against the site domain; an unparseable URL
 * yields an empty list instead of throwing.
 */
function churchPathSegments(domain: string, churchUrl: string): string[] {
  try {
    return new URL(churchUrl, `https://${domain}`).pathname.split('/').filter(Boolean);
  } catch {
    return [];
  }
}

/**
 * Turn a URL path segment into display text: hyphens become spaces and
 * percent-escapes are decoded. A malformed escape sequence leaves the text
 * undecoded rather than raising URIError.
 */
function humanizePathSegment(segment: string | undefined): string | null {
  if (!segment) return null;
  const spaced = segment.replace(/-/g, ' ');
  try {
    return decodeURIComponent(spaced);
  } catch {
    return spaced;
  }
}

/**
 * Parse church data from a church page HTML string.
 * Returns null if name or coordinates cannot be extracted.
 *
 * @param html      Raw HTML of the church page.
 * @param domain    Site domain the page belongs to (used for the external ID).
 * @param churchUrl URL of the church page; state and city are read from its path.
 * @param config    Site settings; `language` selects the table labels to look for.
 */
export function parseChurchPage(
  html: string,
  domain: string,
  churchUrl: string,
  config: SiteConfig,
): ParsedChurch | null {
  // Name: cell after Nome (PT) or Nombre (ES)
  const nameLabel = config.language === 'pt' ? 'Nome' : 'Nombre';
  const name = extractLabeledCell(html, nameLabel);
  if (!name) return null;

  // Coordinates: Google Maps iframe center= parameter ("lat%2Clng" or "lat,lng").
  // The strict numeric pattern rejects junk such as "1.2.3" instead of truncating it.
  const coordMatch = html.match(/center=(-?\d+(?:\.\d+)?)(?:%2C|,)(-?\d+(?:\.\d+)?)/i);
  if (!coordMatch) return null;
  const lat = Number(coordMatch[1]);
  const lng = Number(coordMatch[2]);
  if (!Number.isFinite(lat) || !Number.isFinite(lng)) return null;
  if (Math.abs(lat) > 90 || Math.abs(lng) > 180) return null;

  // Address: cell after Endereço (PT) or Dirección (ES); accent-less spellings are accepted too.
  const addrLabel = config.language === 'pt' ? 'Endere[çc]o' : 'Direcci[oó]n';
  const address = extractLabeledCell(html, addrLabel);

  // Phone: tel: href
  const phoneMatch = html.match(/href="tel:([^"]+)"/i);
  const phone = phoneMatch?.[1]?.trim() || null;

  // City and state from URL path segments
  // URL form: https://{domain}/{state}/{city}/{slug}/
  // NOTE(review): a URL with fewer than three segments still fills state/city
  // from whatever segments exist — confirm every site follows the form above.
  const urlPath = churchPathSegments(domain, churchUrl);
  const state = humanizePathSegment(urlPath[0]);
  const city = humanizePathSegment(urlPath[1]);

  return {
    name,
    address,
    city,
    state,
    phone,
    lat,
    lng,
    externalId: buildExternalId(domain, churchUrl),
    country: config.country,
  };
}

/**
 * Validate a single "H:MM" / "HH:MM" token and return it zero-padded to
 * "HH:MM", or null when it is not a valid 24-hour time.
 */
function normalizeMassTime(raw: string): string | null {
  const match = /^(\d{1,2}):(\d{2})$/.exec(raw.trim());
  if (!match) return null;
  const hour = Number(match[1]);
  const minute = Number(match[2]);
  if (hour > 23 || minute > 59) return null;
  return `${String(hour).padStart(2, '0')}:${match[2]}`;
}

/**
 * Parse the weekly mass schedule table from church page HTML.
 * Table format: day-name cell | time cell (comma-separated times, "-" = no mass)
 *
 * @param html        Raw HTML of the church page.
 * @param countryCode Country code used to look up the localized day names.
 * @returns One entry per (day, time) pair, in page order; times are "HH:MM".
 */
export function parseMassSchedule(html: string, countryCode: string): ParsedMass[] {
  const dayPatterns = buildDayPatterns(getDayNamesForCountry(countryCode));
  const results: ParsedMass[] = [];

  // Text content of every <td> on the page, with nested markup stripped.
  const cells = [...html.matchAll(/<td\b[^>]*>(.*?)<\/td>/gis)].map((m) =>
    (m[1] ?? '').replace(/<[^>]+>/g, '').trim(),
  );

  // Scan for a day-name cell and read the cell right after it. Scanning (rather
  // than pairing cells at fixed even/odd positions) keeps the schedule intact
  // when an unrelated table earlier on the page has an odd number of cells.
  let i = 0;
  while (i + 1 < cells.length) {
    const dayOfWeek = dayPatterns[(cells[i] ?? '').toLowerCase()];
    if (dayOfWeek === undefined) {
      i += 1;
      continue;
    }

    const timeCell = cells[i + 1] ?? '';
    i += 2; // the time cell belongs to this day; do not re-read it as a label

    if (!timeCell || timeCell === '-') continue;

    // Split comma-separated times: "10:00, 18:00" → ["10:00", "18:00"]
    for (const rawTime of timeCell.split(',')) {
      const time = normalizeMassTime(rawTime);
      if (time !== null) {
        results.push({ dayOfWeek, time });
      }
    }
  }

  return results;
}