#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from the BuscarMisas network.
 *
 * A group of 5 identical WordPress-based directories covering Latin America:
 *   - horariosmissa.com.br (Brazil, ~4,732 churches)
 *   - buscarmisas.com.mx (Mexico, ~3,950 churches)
 *   - horariosmisa.com.ar (Argentina, ~3,012 churches)
 *   - buscarmisas.co (Colombia, ~2,665 churches)
 *   - horariomisa.cl (Chile, ~935 churches)
 *
 * Usage:
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --resume-from 500
 *   npx tsx scripts/import-buscarmisas-network.ts --all
 *   npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --dry-run
 */

import dotenv from 'dotenv';
import path from 'path';

// Load `.env.local` first, then `.env`.
// NOTE(review): under native ES modules, import declarations are hoisted and
// evaluated before these calls run, so modules imported below would not see
// these variables at load time — confirm the runtime preserves statement order.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
import { getDayNamesForCountry, buildDayPatterns } from '../src/scrapers/i18n/day-names';

// ─── Site Config ─────────────────────────────────────────────────────────────

/** Static settings for one site of the network. */
interface SiteConfig {
  country: string; // ISO 3166-1 alpha-2
  /** Page language; selects the Portuguese or Spanish field labels when parsing. */
  language: 'pt' | 'es';
  // NOTE(review): presumably which WordPress sitemap (pages vs. posts) lists the
  // church URLs — not used by the code in this section; confirm against the crawler.
  sitemapType: 'page' | 'post';
}

/**
 * Supported network sites, keyed by domain name.
 *
 * `Record` must be given both type arguments; a bare `Record` annotation does
 * not type-check.
 */
const NETWORK_SITES: Record<string, SiteConfig> = {
  'horariosmissa.com.br': { country: 'BR', language: 'pt', sitemapType: 'page' },
  'buscarmisas.com.mx': { country: 'MX', language: 'es', sitemapType: 'page' },
  'horariosmisa.com.ar': { country: 'AR', language: 'es', sitemapType: 'page' },
  'buscarmisas.co': { country: 'CO', language: 'es', sitemapType: 'page' },
  'horariomisa.cl': { country: 'CL', language: 'es', sitemapType: 'post' },
};

// ─── Types ────────────────────────────────────────────────────────────────────

/** Church fields extracted from a single church page. */
interface ParsedChurch {
  name: string;
  address: string | null;
  city: string | null;
  state: string | null;
  phone: string | null;
  lat: number;
  lng: number;
  /** "{domain-slug}/{church-slug}" identifier derived from the page URL. */
  externalId: string;
  country: string;
}

/** One scheduled mass: a weekday plus a time of day. */
interface ParsedMass {
  dayOfWeek: number; // 0 = Sunday, 6 = Saturday
  time: string; // HH:MM 24-hour
}

/**
 * Command-line options for the importer.
 * NOTE(review): the argument parser is not part of this section; the fields
 * presumably mirror the flags shown under "Usage" above — confirm.
 */
interface CLIArgs {
  domain: string | null;
  all: boolean;
  dryRun: boolean;
  resumeFrom: number;
  limit: number | null;
  jobId: string | null;
}

/**
 * Tallies for an import run.
 * NOTE(review): the code that updates these counters is not part of this section.
 */
interface ImportStats {
  total: number;
  created: number;
  updated: number;
  skipped: number;
  errors: number;
  massSchedulesCreated: number;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Build the external ID for a church URL.
 *
 * Format: "{domain-slug}/{church-slug}", where the domain slug is `domain`
 * with every dot replaced by a hyphen and the church slug is the last
 * non-empty path segment of `churchUrl`,
 * e.g. "horariosmissa-com-br/paroquia-nossa-senhora-dos-remedios".
 *
 * @param domain    Site domain, e.g. "horariosmissa.com.br".
 * @param churchUrl Church page URL of the form /{region}/{city}/{church-slug}/.
 * @returns The external ID; the slug part is empty when the URL has no segments.
 */
export function buildExternalId(domain: string, churchUrl: string): string {
  const domainSlug = domain.replace(/\./g, '-');
  // Drop any query string / fragment so "…/slug/?utm=x" still yields "slug".
  const withoutQuery = churchUrl.split(/[?#]/, 1)[0] ?? '';
  const segments = withoutQuery.split('/').filter(Boolean);
  const churchSlug = segments[segments.length - 1] || '';
  return `${domainSlug}/${churchSlug}`;
}

/** Named entities decoded by {@link decodeHtmlEntities}; `&nbsp;` becomes a plain space. */
const NAMED_HTML_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/** Decode numeric character references and a few common named entities; unknown ones are left as-is. */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      const isValid = Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff;
      return isValid ? String.fromCodePoint(codePoint) : entity;
    }
    return NAMED_HTML_ENTITIES.get(body.toLowerCase()) ?? entity;
  });
}

/**
 * Return the text of the table cell that follows a `<strong>`-wrapped label cell,
 * or null when the label is absent or the value is blank.
 */
function extractLabeledCell(html: string, labelPattern: string): string | null {
  // The value cell's opening <td …> is optional so that both
  // "…</strong></td><td>value</td>" and "…</strong></td>value</td>" match.
  const cellRe = new RegExp(
    `${labelPattern}\\s*:?\\s*<\\/strong>\\s*<\\/td>\\s*(?:<td[^>]*>\\s*)?([^<]+)<\\/td>`,
    'i',
  );
  const raw = cellRe.exec(html)?.[1];
  if (raw === undefined) return null;
  return decodeHtmlEntities(raw).trim() || null;
}

/** Split a URL (absolute or path-only) into its non-empty path segments without throwing. */
function extractPathSegments(churchUrl: string): string[] {
  let pathname: string;
  try {
    pathname = new URL(churchUrl).pathname;
  } catch {
    // Not an absolute URL: treat the input as a path and drop query/fragment.
    pathname = churchUrl.split(/[?#]/, 1)[0] ?? '';
  }
  return pathname.split('/').filter(Boolean);
}

/** Turn a URL slug into display text ("sao-paulo" → "sao paulo"); null for a missing segment. */
function slugToLabel(segment: string | undefined): string | null {
  if (!segment) return null;
  const spaced = segment.replace(/-/g, ' ');
  try {
    return decodeURIComponent(spaced);
  } catch {
    // Malformed percent-escape: keep the raw text instead of aborting the parse.
    return spaced;
  }
}

/** Map `center=` parameter: "lat%2Clng" or "lat,lng". */
const MAP_CENTER_RE = /center=(-?(?:\d+\.?\d*|\.\d+))(?:%2C|,)(-?(?:\d+\.?\d*|\.\d+))/i;

/** First `tel:` link on the page. */
const PHONE_HREF_RE = /href="tel:([^"]+)"/i;

/**
 * Parse church data from a church page HTML string.
 *
 * - Name: cell after the "Nome" (PT) / "Nombre" (ES) label cell.
 * - Coordinates: the map iframe's `center=` parameter.
 * - Address: cell after the "Endereço" (PT) / "Dirección" (ES) label cell.
 * - Phone: the first `tel:` href.
 * - State and city: first and second path segments of `churchUrl`
 *   (URL form: https://{domain}/{state}/{city}/{slug}/), hyphens turned into spaces.
 *
 * @param html      Full HTML of the church page.
 * @param domain    Site domain, used for the external ID.
 * @param churchUrl URL of the church page.
 * @param config    Site settings; `language` selects the labels, `country` is copied through.
 * @returns The parsed church, or null if the name or valid coordinates
 *          (|lat| ≤ 90, |lng| ≤ 180) cannot be extracted. Never throws on a
 *          malformed `churchUrl`.
 */
export function parseChurchPage(
  html: string,
  domain: string,
  churchUrl: string,
  config: SiteConfig,
): ParsedChurch | null {
  const isPortuguese = config.language === 'pt';

  const name = extractLabeledCell(html, isPortuguese ? 'Nome' : 'Nombre');
  if (name === null) return null;

  const centerMatch = MAP_CENTER_RE.exec(html);
  if (!centerMatch) return null;
  const lat = parseFloat(centerMatch[1] ?? '');
  const lng = parseFloat(centerMatch[2] ?? '');
  if (!isFinite(lat) || !isFinite(lng) || Math.abs(lat) > 90 || Math.abs(lng) > 180) return null;

  const address = extractLabeledCell(html, isPortuguese ? 'Endere[çc]o' : 'Direcci[oó]n');
  const phone = PHONE_HREF_RE.exec(html)?.[1]?.trim() || null;

  const [stateSlug, citySlug] = extractPathSegments(churchUrl);

  return {
    name,
    address,
    city: slugToLabel(citySlug),
    state: slugToLabel(stateSlug),
    phone,
    lat,
    lng,
    externalId: buildExternalId(domain, churchUrl),
    country: config.country,
  };
}

/** Every `<td>` cell, with or without attributes; `s` lets a cell span lines. */
const TABLE_CELL_RE = /<td[^>]*>(.*?)<\/td>/gis;

/** A single clock time such as "9:00" or "18:30". */
const CLOCK_TIME_RE = /^(\d{1,2}):(\d{2})$/;

/** Validate one time token and zero-pad it to HH:MM; null if it is not a valid 24-hour time. */
function normalizeMassTime(rawTime: string): string | null {
  const match = CLOCK_TIME_RE.exec(rawTime.trim());
  if (!match) return null;
  const hours = Number(match[1]);
  const minutes = Number(match[2]);
  if (hours > 23 || minutes > 59) return null;
  return `${String(hours).padStart(2, '0')}:${String(minutes).padStart(2, '0')}`;
}

/**
 * Parse the weekly mass schedule table from church page HTML.
 * Table format: day-name cell | time cell (comma-separated times, "-" = no mass).
 *
 * @param html        Full HTML of the church page.
 * @param countryCode Country code passed to `getDayNamesForCountry` to pick the day names.
 * @returns One entry per (day, time), in document order, with times zero-padded
 *          to HH:MM; tokens that are not valid 24-hour times are skipped.
 */
export function parseMassSchedule(html: string, countryCode: string): ParsedMass[] {
  const dayPatterns = buildDayPatterns(getDayNamesForCountry(countryCode));
  const results: ParsedMass[] = [];

  // Text of every <td> on the page: inner tags stripped, entities decoded.
  const cells = [...html.matchAll(TABLE_CELL_RE)].map(m =>
    decodeHtmlEntities((m[1] ?? '').replace(/<[^>]+>/g, '')).trim()
  );

  // Cells are consumed as consecutive [day, time] pairs from the first cell on,
  // so this relies on every preceding table row contributing an even number of cells.
  for (let i = 0; i + 1 < cells.length; i += 2) {
    const [dayCell = '', timeCell = ''] = cells.slice(i, i + 2);

    const dayOfWeek = dayPatterns[dayCell.toLowerCase()];
    // typeof guard (rather than `=== undefined`) also rejects inherited object
    // members such as "constructor" when the lookup table is a plain object.
    if (typeof dayOfWeek !== 'number') continue;
    if (timeCell === '-' || !timeCell) continue;

    // Split comma-separated times: "10:00, 18:00" → ["10:00", "18:00"]
    for (const rawTime of timeCell.split(',')) {
      const time = normalizeMassTime(rawTime);
      if (time !== null) results.push({ dayOfWeek, time });
    }
  }
  return results;
}