Files
ScraperControl/scripts/import-buscarmisas-network.ts

260 lines
9.4 KiB
TypeScript
Raw Normal View History

#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from the BuscarMisas network.
*
* A group of 5 identical WordPress-based directories covering Latin America:
* - horariosmissa.com.br (Brazil, ~4,732 churches)
* - buscarmisas.com.mx (Mexico, ~3,950 churches)
* - horariosmisa.com.ar (Argentina, ~3,012 churches)
* - buscarmisas.co (Colombia, ~2,665 churches)
* - horariomisa.cl (Chile, ~935 churches)
*
* Usage:
* npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br
* npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --resume-from 500
* npx tsx scripts/import-buscarmisas-network.ts --all
* npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --dry-run
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
import { getDayNamesForCountry, buildDayPatterns } from '../src/scrapers/i18n/day-names';
// ─── Site Config ─────────────────────────────────────────────────────────────
/**
 * Per-domain settings for one site of the network.
 */
interface SiteConfig {
country: string; // ISO 3166-1 alpha-2
// Language of the site's field labels; parseChurchPage picks the name/address labels from it.
language: 'pt' | 'es';
// Which kind of child sitemap getChurchUrls looks for in the sitemap index.
sitemapType: 'page' | 'post';
}
/**
 * Supported sites of the network, keyed by domain.
 * Only horariomisa.cl is configured with the 'post' sitemap type; the other four use 'page'.
 */
const NETWORK_SITES: Record<string, SiteConfig> = {
'horariosmissa.com.br': { country: 'BR', language: 'pt', sitemapType: 'page' },
'buscarmisas.com.mx': { country: 'MX', language: 'es', sitemapType: 'page' },
'horariosmisa.com.ar': { country: 'AR', language: 'es', sitemapType: 'page' },
'buscarmisas.co': { country: 'CO', language: 'es', sitemapType: 'page' },
'horariomisa.cl': { country: 'CL', language: 'es', sitemapType: 'post' },
};
// ─── Types ────────────────────────────────────────────────────────────────────
/** Church record produced by parseChurchPage from a single church page. */
interface ParsedChurch {
name: string;
address: string | null;
// city and state are taken from the church URL's path segments, not from the page body.
city: string | null;
state: string | null;
phone: string | null;
// Coordinates read from the page's map embed; parseChurchPage rejects |lat| > 90 or |lng| > 180.
lat: number;
lng: number;
// "{domain-slug}/{church-slug}", as built by buildExternalId.
externalId: string;
// Copied from SiteConfig.country.
country: string;
}
/** One scheduled mass: a day of the week plus a time of day, as returned by parseMassSchedule. */
interface ParsedMass {
dayOfWeek: number; // 0 = Sunday, 6 = Saturday
time: string; // HH:MM 24-hour
}
/**
 * Parsed command-line options.
 * NOTE(review): the argument parser is not visible in this part of the file; the flag
 * names below are taken from the usage examples in the file header where available.
 */
interface CLIArgs {
// Value of --domain, or null when not given.
domain: string | null;
// --all
all: boolean;
// --dry-run
dryRun: boolean;
// Value of --resume-from.
resumeFrom: number;
// presumably a cap on the number of churches processed; flag not shown in the usage text — confirm
limit: number | null;
// presumably an identifier of an external job tracking this run — confirm against the parser
jobId: string | null;
}
/**
 * Counters describing the outcome of an import run.
 * NOTE(review): the code that updates these counters is not visible in this part of the
 * file; meanings are inferred from the field names — confirm at the update sites.
 */
interface ImportStats {
total: number;
created: number;
updated: number;
skipped: number;
errors: number;
massSchedulesCreated: number;
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Derive the external identifier of a church from its page URL.
 *
 * The identifier has the form "{domain-slug}/{church-slug}": the domain slug is
 * the domain with every dot replaced by a hyphen, and the church slug is the
 * last non-empty "/"-separated piece of the URL,
 * e.g. "horariosmissa-com-br/paroquia-nossa-senhora-dos-remedios".
 *
 * @param domain - Site domain, e.g. "horariosmissa.com.br".
 * @param churchUrl - Church page URL, shaped like /{region}/{city}/{church-slug}/.
 * @returns The external ID; the church-slug part is empty when the URL has no
 *   non-empty piece.
 */
export function buildExternalId(domain: string, churchUrl: string): string {
  const dashedDomain = domain.split('.').join('-');
  // Empty pieces (such as the one after a trailing slash) are discarded, so the
  // last piece left over is the church slug.
  const pieces = churchUrl.split('/').filter((piece) => piece.length > 0);
  const slug = pieces.length > 0 ? pieces[pieces.length - 1] : '';
  return `${dashedDomain}/${slug}`;
}
/**
 * Extract a church record from the HTML of a church page.
 *
 * The name and address are read from table cells that follow a bold label
 * ("Nome"/"Endereço" on Portuguese sites, "Nombre"/"Dirección" on Spanish
 * ones), the coordinates from a `center=<lat>%2C<lng>` parameter in the page,
 * the phone from the first `tel:` link, and state/city from the first two
 * path segments of the church URL (hyphens turned into spaces).
 *
 * @param html - Full HTML of the church page.
 * @param domain - Site domain, used to build the external ID.
 * @param churchUrl - Absolute URL of the page: https://{domain}/{state}/{city}/{slug}/.
 * @param config - Settings of the site the page belongs to.
 * @returns The parsed church, or null when the name is missing or the
 *   coordinates are missing or out of range.
 */
export function parseChurchPage(
  html: string,
  domain: string,
  churchUrl: string,
  config: SiteConfig,
): ParsedChurch | null {
  const isPortuguese = config.language === 'pt';

  // Trimmed text of the <td> that directly follows a <strong>{label}</strong> cell.
  const valueAfterLabel = (labelPattern: string): string | undefined => {
    const cellRe = new RegExp(
      `<strong>${labelPattern}<\\/strong><\\/td>\\s*<td>([^<]+)<\\/td>`,
      'i',
    );
    return cellRe.exec(html)?.[1]?.trim();
  };

  // "my-state" -> "my state", percent-decoded; null when the segment is absent.
  const humanize = (segment: string | undefined): string | null =>
    segment ? decodeURIComponent(segment.split('-').join(' ')) : null;

  const name = valueAfterLabel(isPortuguese ? 'Nome' : 'Nombre');
  if (!name) {
    return null;
  }

  // Coordinates come from the map embed's center= parameter (comma is URL-encoded).
  const coords = /center=([-\d.]+)%2C([-\d.]+)/i.exec(html);
  if (coords === null) {
    return null;
  }
  const lat = parseFloat(coords[1]);
  const lng = parseFloat(coords[2]);
  const withinBounds =
    Number.isFinite(lat) &&
    Number.isFinite(lng) &&
    Math.abs(lat) <= 90 &&
    Math.abs(lng) <= 180;
  if (!withinBounds) {
    return null;
  }

  const address = valueAfterLabel(isPortuguese ? 'Endere[çc]o' : 'Direcci[oó]n') ?? null;
  const phone = /href="tel:([^"]+)"/i.exec(html)?.[1]?.trim() ?? null;

  const [stateSegment, citySegment] = new URL(churchUrl).pathname
    .split('/')
    .filter((segment) => segment.length > 0);
  const state = humanize(stateSegment);
  const city = humanize(citySegment);

  return {
    name,
    address,
    city,
    state,
    phone,
    lat,
    lng,
    externalId: buildExternalId(domain, churchUrl),
    country: config.country,
  };
}
/**
 * Parse the weekly mass schedule from church page HTML.
 *
 * The schedule is a table whose rows hold a day-name cell followed by a time
 * cell with comma-separated times ("-" or an empty cell means no mass).
 * Every <td> of the page is scanned, and each cell whose text is a known day
 * name is paired with the cell right after it. Pairing by content instead of
 * by even/odd position means an extra <td> earlier in the page cannot shift
 * the pairs and silently drop the whole schedule.
 *
 * @param html - Full HTML of a church page.
 * @param countryCode - Country code used to select the day names to recognise.
 * @returns One entry per (day, time). Times are zero-padded to "HH:MM"; tokens
 *   that are not a valid 24-hour time (hour > 23 or minute > 59) are dropped.
 */
export function parseMassSchedule(html: string, countryCode: string): ParsedMass[] {
  const dayPatterns = buildDayPatterns(getDayNamesForCountry(countryCode));
  const results: ParsedMass[] = [];

  // Text content of every <td>, with nested tags stripped.
  const cells = [...html.matchAll(/<td[^>]*>(.*?)<\/td>/gis)].map(m =>
    m[1].replace(/<[^>]+>/g, '').trim()
  );

  for (let i = 0; i + 1 < cells.length; i++) {
    const dayOfWeek = dayPatterns[cells[i].toLowerCase()];
    // Guards against unknown labels and against anything non-numeric the lookup may return.
    if (typeof dayOfWeek !== 'number') continue;

    const timeCell = cells[i + 1];
    i++; // the time cell belongs to this day; never re-read it as a day label
    if (!timeCell || timeCell === '-') continue;

    // Split comma-separated times: "10:00, 18:00" → ["10:00", "18:00"]
    for (const rawTime of timeCell.split(',')) {
      const parts = /^(\d{1,2}):(\d{2})$/.exec(rawTime.trim());
      if (!parts) continue;
      if (Number(parts[1]) > 23 || Number(parts[2]) > 59) continue;
      // Normalise "7:00" to "07:00" so the value matches the HH:MM contract of ParsedMass.
      results.push({ dayOfWeek, time: `${parts[1].padStart(2, '0')}:${parts[2]}` });
    }
  }
  return results;
}
// ─── HTTP Helpers ─────────────────────────────────────────────────────────────
// User-Agent header sent by fetchText with every request.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// NOTE(review): not referenced in the visible part of the file — presumably the pause
// between consecutive page requests; confirm at the call site.
const REQUEST_DELAY_MS = 2_000;
// NOTE(review): not referenced in the visible part of the file — presumably the pause
// between domains when several are imported in one run; confirm at the call site.
const DOMAIN_DELAY_MS = 5_000;
/**
 * GET a URL with the importer's User-Agent and return the response body as text.
 *
 * @param url - Absolute URL to request.
 * @returns The response body.
 * @throws Error with message "HTTP {status} for {url}" when the response is not ok;
 *   fetchWithRetry inspects this message to decide whether to retry.
 */
async function fetchText(url: string): Promise<string> {
  const response = await fetch(url, { headers: { 'User-Agent': USER_AGENT } });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/**
 * Fetch a URL as text, retrying when the server answers 429 or 503.
 *
 * After a retryable failure the function waits attempt × 30 s before trying
 * again. Any other failure, and the failure of the final attempt, is rethrown
 * unchanged.
 *
 * @param url - Absolute URL to request.
 * @param retries - Maximum number of attempts (default 3). Values below 1 are
 *   treated as 1 so that the request is always made at least once.
 * @returns The response body.
 * @throws Whatever fetchText threw on the last attempt or on a non-retryable failure.
 */
async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  const maxAttempts = Math.max(1, Math.floor(retries));
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fetchText(url);
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      if (attempt === maxAttempts) throw err;
      // Look only at the status prefix written by fetchText ("HTTP 429 for …").
      // Searching the whole message would also match URLs that merely contain
      // "429" or "503" and retry e.g. a 404 with long sleeps.
      const isRetryable = /^HTTP (?:429|503) /.test(msg);
      if (!isRetryable) throw err;
      const backoff = attempt * 30_000; // 30s after the 1st attempt, 60s after the 2nd, …
      console.warn(` [retry ${attempt}/${maxAttempts}] ${msg} — waiting ${backoff / 1000}s`);
      await sleep(backoff);
    }
  }
  throw new Error('unreachable');
}
/**
 * Pause for a given time.
 *
 * @param ms - Delay in milliseconds.
 * @returns A promise that resolves (with no value) once the delay has elapsed.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
// ─── Sitemap Discovery ────────────────────────────────────────────────────────
/**
 * Collect all church page URLs of a domain from its sitemaps.
 *
 * Reads https://{domain}/sitemap_index.xml, follows every child sitemap of the
 * configured type (page-sitemap*.xml or post-sitemap*.xml, including numbered
 * continuation files such as post-sitemap2.xml), and keeps the <loc> entries
 * whose path has exactly 3 non-empty segments: /{region}/{city}/{slug}/.
 *
 * @param domain - Site domain, e.g. "horariosmissa.com.br".
 * @param config - Site settings; `sitemapType` selects which child sitemaps are read.
 * @returns De-duplicated church URLs in first-seen order.
 * @throws Whatever fetchWithRetry throws when a sitemap cannot be downloaded.
 */
export async function getChurchUrls(domain: string, config: SiteConfig): Promise<string[]> {
  const indexUrl = `https://${domain}/sitemap_index.xml`;
  console.log(`Fetching sitemap index: ${indexUrl}`);
  const indexXml = await fetchWithRetry(indexUrl);

  // Extract child sitemap URLs matching the sitemapType. Both kinds may be split
  // into numbered files once a sitemap grows, so the optional number is accepted
  // for 'post' just as it is for 'page'.
  const childPattern = config.sitemapType === 'page'
    ? /https:\/\/[^<]*\/page-sitemap\d*\.xml/g
    : /https:\/\/[^<]*\/post-sitemap\d*\.xml/g;
  // A Set avoids downloading the same child sitemap twice if the index repeats it.
  const childUrls = [...new Set([...indexXml.matchAll(childPattern)].map(m => m[0]))];
  console.log(` Found ${childUrls.length} child sitemaps`);

  const churchUrls = new Set<string>();
  for (const sitemapUrl of childUrls) {
    const xml = await fetchWithRetry(sitemapUrl);
    for (const match of xml.matchAll(/<loc>([^<]+)<\/loc>/g)) {
      const loc = match[1].trim();
      // Church URLs: exactly 3 non-empty path segments after the domain
      try {
        const segments = new URL(loc).pathname.split('/').filter(Boolean);
        if (segments.length === 3) {
          churchUrls.add(loc);
        }
      } catch { /* skip malformed URLs — best-effort discovery */ }
    }
  }

  const unique = [...churchUrls];
  console.log(` Total church URLs: ${unique.length}`);
  return unique;
}