feat: add buscarmisas-network importer — parsing functions
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
186
scripts/import-buscarmisas-network.ts
Normal file
186
scripts/import-buscarmisas-network.ts
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
#!/usr/bin/env tsx
|
||||||
|
/**
|
||||||
|
* Import Catholic churches and mass schedules from the BuscarMisas network.
|
||||||
|
*
|
||||||
|
* A group of 5 identical WordPress-based directories covering Latin America:
|
||||||
|
* - horariosmissa.com.br (Brazil, ~4,732 churches)
|
||||||
|
* - buscarmisas.com.mx (Mexico, ~3,950 churches)
|
||||||
|
* - horariosmisa.com.ar (Argentina, ~3,012 churches)
|
||||||
|
* - buscarmisas.co (Colombia, ~2,665 churches)
|
||||||
|
* - horariomisa.cl (Chile, ~935 churches)
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br
|
||||||
|
* npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --resume-from 500
|
||||||
|
* npx tsx scripts/import-buscarmisas-network.ts --all
|
||||||
|
* npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --dry-run
|
||||||
|
*/
|
||||||
|
|
||||||
|
import dotenv from 'dotenv';
|
||||||
|
import path from 'path';
|
||||||
|
|
||||||
|
// Load environment files from the current working directory, in this order.
// `.env.local` goes first so that, with dotenv's default of not overriding
// variables that are already set, its values win over `.env`.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}
|
||||||
|
|
||||||
|
import { Pool } from 'pg';
|
||||||
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||||||
|
import { PrismaClient } from '@prisma/client';
|
||||||
|
|
||||||
|
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||||
|
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||||
|
import { getDayNamesForCountry, buildDayPatterns } from '../src/scrapers/i18n/day-names';
|
||||||
|
|
||||||
|
// ─── Site Config ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Per-site settings for one directory of the BuscarMisas network. */
interface SiteConfig {
  /** ISO 3166-1 alpha-2 country code; copied onto every church parsed from the site. */
  country: string;

  /** Page language; selects the Portuguese or Spanish field labels when parsing church pages. */
  language: 'pt' | 'es';

  /**
   * Kind of sitemap the site uses.
   * NOTE(review): presumably which WordPress sitemap (pages vs. posts) lists
   * the church URLs — confirm against the sitemap-fetching code.
   */
  sitemapType: 'page' | 'post';
}
|
||||||
|
|
||||||
|
/** Known sites of the network, keyed by domain name. */
const NETWORK_SITES: Record<string, SiteConfig> = {
  // Brazil
  'horariosmissa.com.br': {
    country: 'BR',
    language: 'pt',
    sitemapType: 'page',
  },
  // Mexico
  'buscarmisas.com.mx': {
    country: 'MX',
    language: 'es',
    sitemapType: 'page',
  },
  // Argentina
  'horariosmisa.com.ar': {
    country: 'AR',
    language: 'es',
    sitemapType: 'page',
  },
  // Colombia
  'buscarmisas.co': {
    country: 'CO',
    language: 'es',
    sitemapType: 'page',
  },
  // Chile — the only site configured with the 'post' sitemap type
  'horariomisa.cl': {
    country: 'CL',
    language: 'es',
    sitemapType: 'post',
  },
};
|
||||||
|
|
||||||
|
// ─── Types ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Church record extracted from a single church page. */
interface ParsedChurch {
  /** Church name taken from the page's name cell; never empty. */
  name: string;

  /** Text of the address cell, or null when the page has none. */
  address: string | null;

  /** City derived from the second URL path segment, or null when absent. */
  city: string | null;

  /** State/region derived from the first URL path segment, or null when absent. */
  state: string | null;

  /** Phone number taken from the page's first `tel:` link, or null when absent. */
  phone: string | null;

  /** Latitude in decimal degrees, read from the embedded map's `center=` parameter. */
  lat: number;

  /** Longitude in decimal degrees, read from the embedded map's `center=` parameter. */
  lng: number;

  /** Identifier of the form "{domain-slug}/{church-slug}". */
  externalId: string;

  /** Country code copied from the site configuration. */
  country: string;
}
|
||||||
|
|
||||||
|
/** One weekly mass occurrence: a weekday plus a start time. */
interface ParsedMass {
  /** Day of the week, where 0 = Sunday and 6 = Saturday. */
  dayOfWeek: number;

  /** Time of day in 24-hour HH:MM form. */
  time: string;
}
|
||||||
|
|
||||||
|
/**
 * Command-line options accepted by the importer.
 *
 * NOTE(review): the argument parser is not part of this section; the flag
 * names below are taken from the usage examples in the file header where
 * available and are otherwise assumptions to confirm.
 */
interface CLIArgs {
  /** Domain given with `--domain`; presumably null when the flag is omitted. */
  domain: string | null;

  /** Set by `--all`; presumably imports every known site. */
  all: boolean;

  /** Set by `--dry-run`; presumably parses without writing — confirm. */
  dryRun: boolean;

  /** Value of `--resume-from`; presumably the index at which to resume — confirm. */
  resumeFrom: number;

  /** Optional cap on how many items are processed — TODO confirm flag name and meaning. */
  limit: number | null;

  /** Optional job identifier — TODO confirm what it refers to. */
  jobId: string | null;
}
|
||||||
|
|
||||||
|
/**
 * Counters describing the outcome of an import run.
 *
 * NOTE(review): the code that updates these counters is not part of this
 * section; the per-field meanings are inferred from the names — confirm.
 */
interface ImportStats {
  /** Number of items considered. */
  total: number;

  /** Number of churches newly created. */
  created: number;

  /** Number of existing churches updated. */
  updated: number;

  /** Number of items skipped. */
  skipped: number;

  /** Number of items that failed. */
  errors: number;

  /** Number of mass schedule rows created. */
  massSchedulesCreated: number;
}
|
||||||
|
|
||||||
|
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Build the external ID for a church URL.
 *
 * Format: "{domain-slug}/{church-slug}", where the domain slug is the domain
 * with every "." replaced by "-" and the church slug is the last non-empty
 * "/"-separated segment of the URL,
 * e.g. "horariosmissa-com-br/paroquia-nossa-senhora-dos-remedios".
 *
 * A query string or fragment on the URL is ignored, so links such as
 * ".../church-slug/?utm_source=x" still produce the church slug.
 *
 * @param domain - Site domain, e.g. "horariosmissa.com.br".
 * @param churchUrl - Church page URL or path of the form
 *   /{region}/{city}/{church-slug}/.
 * @returns The external ID; the slug part is empty when the URL contains no
 *   non-empty segment.
 */
export function buildExternalId(domain: string, churchUrl: string): string {
  const domainSlug = domain.replace(/\./g, '-');

  // Cut "?query" / "#fragment" before splitting: otherwise they would be
  // picked up as (or glued onto) the last path segment.
  const [pathPart = ''] = churchUrl.split(/[?#]/, 1);

  // filter(Boolean) drops the empty pieces produced by leading, trailing and
  // doubled slashes, so no separate trailing-slash handling is needed.
  const segments = pathPart.split('/').filter(Boolean);
  const churchSlug = segments[segments.length - 1] ?? '';

  return `${domainSlug}/${churchSlug}`;
}
|
||||||
|
|
||||||
|
/** Named HTML entities decoded by decodeHtmlText; numeric entities are handled separately. */
const HTML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', '\u00a0'],
]);

/** Decode numeric and the basic named HTML entities in a text fragment; unknown entities are left as-is. */
function decodeHtmlText(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws for values outside the Unicode range.
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
    return HTML_NAMED_ENTITIES.get(body.toLowerCase()) ?? entity;
  });
}

/** Return the decoded, trimmed text of the table cell that follows a `<strong>label</strong>` cell, or null. */
function extractLabelledCell(html: string, labelPattern: string): string | null {
  const match = html.match(
    new RegExp(`<strong>${labelPattern}<\\/strong><\\/td>\\s*<td>([^<]+)<\\/td>`, 'i'),
  );
  const rawText = match?.[1];
  if (rawText === undefined) return null;
  // A whitespace-only cell counts as "no value".
  return decodeHtmlText(rawText).trim() || null;
}

/** Turn a URL path segment such as "sao-paulo" into display text ("sao paulo"), or null when absent. */
function pathSegmentToText(segment: string | undefined): string | null {
  if (!segment) return null;
  const spaced = segment.replace(/-/g, ' ');
  try {
    return decodeURIComponent(spaced);
  } catch {
    // Malformed percent-escape: keep the raw text instead of aborting the
    // whole page over a cosmetic field.
    return spaced;
  }
}

/**
 * Parse church data from a church page HTML string.
 *
 * The name and address are read from the table cell following the
 * `<strong>` label cell (Nome / Endereço for Portuguese sites, Nombre /
 * Dirección for Spanish sites); HTML entities in those cells are decoded.
 * Coordinates come from the `center=` parameter of the embedded map URL
 * (comma either literal or encoded as %2C), the phone from the first `tel:`
 * link, and state/city from the first two URL path segments.
 *
 * @param html - Full HTML of the church page.
 * @param domain - Site domain, used to build the external ID.
 * @param churchUrl - Absolute URL of the page: https://{domain}/{state}/{city}/{slug}/.
 * @param config - Configuration of the site the page belongs to.
 * @returns The parsed church, or null if the name or valid coordinates
 *   cannot be extracted.
 * @throws TypeError if `churchUrl` is not an absolute URL (raised by `new URL`
 *   once name and coordinates have been found).
 */
export function parseChurchPage(
  html: string,
  domain: string,
  churchUrl: string,
  config: SiteConfig,
): ParsedChurch | null {
  const isPortuguese = config.language === 'pt';

  // Name is mandatory.
  const name = extractLabelledCell(html, isPortuguese ? 'Nome' : 'Nombre');
  if (name === null) return null;

  // Coordinates are mandatory and must be a plausible latitude/longitude.
  const coordMatch = html.match(/center=([-\d.]+)(?:%2C|,)([-\d.]+)/i);
  if (!coordMatch) return null;
  const lat = parseFloat(coordMatch[1] ?? '');
  const lng = parseFloat(coordMatch[2] ?? '');
  const coordsValid =
    Number.isFinite(lat) && Number.isFinite(lng) && Math.abs(lat) <= 90 && Math.abs(lng) <= 180;
  if (!coordsValid) return null;

  // The label patterns tolerate the unaccented spellings as well.
  const address = extractLabelledCell(html, isPortuguese ? 'Endere[çc]o' : 'Direcci[oó]n');

  const phoneMatch = html.match(/href="tel:([^"]+)"/i);
  const phone = phoneMatch?.[1]?.trim() ?? null;

  // URL form: https://{domain}/{state}/{city}/{slug}/
  const pathSegments = new URL(churchUrl).pathname.split('/').filter(Boolean);

  return {
    name,
    address,
    city: pathSegmentToText(pathSegments[1]),
    state: pathSegmentToText(pathSegments[0]),
    phone,
    lat,
    lng,
    externalId: buildExternalId(domain, churchUrl),
    country: config.country,
  };
}
|
||||||
|
|
||||||
|
/** Validate a "H:MM" / "HH:MM" time and return it zero-padded as "HH:MM", or null when it is not a valid time. */
function normalizeMassTime(rawTime: string): string | null {
  const match = /^(\d{1,2}):(\d{2})$/.exec(rawTime.trim());
  if (!match) return null;
  const hours = Number(match[1]);
  const minutes = Number(match[2]);
  if (hours > 23 || minutes > 59) return null;
  return `${String(hours).padStart(2, '0')}:${String(minutes).padStart(2, '0')}`;
}

/**
 * Parse the weekly mass schedule table from church page HTML.
 *
 * Table format: day-name cell | time cell (comma-separated times, "-" = no mass).
 *
 * Cells are paired within each table row, so a row with an odd number of
 * cells (for example a heading spanning both columns) cannot shift the
 * pairing of the rows after it. HTML without any `<tr>` is treated as one row.
 * Times are emitted zero-padded ("7:30" becomes "07:30"); entries that are not
 * a valid 24-hour time are ignored.
 *
 * @param html - Full HTML of the church page.
 * @param countryCode - Country code used to look up the local day names.
 * @returns One entry per (day, time) found, in document order.
 */
export function parseMassSchedule(html: string, countryCode: string): ParsedMass[] {
  const dayPatterns = buildDayPatterns(getDayNamesForCountry(countryCode));
  const results: ParsedMass[] = [];

  // Each chunk holds the markup of (at most) one table row.
  for (const rowHtml of html.split(/<tr\b/i)) {
    // Text content of every <td> in the row, with nested tags stripped.
    const cells = [...rowHtml.matchAll(/<td[^>]*>(.*?)<\/td>/gis)].map(cellMatch =>
      (cellMatch[1] ?? '').replace(/<[^>]+>/g, '').trim(),
    );

    for (let i = 0; i + 1 < cells.length; i += 2) {
      const dayCell = (cells[i] ?? '').toLowerCase();
      const timeCell = cells[i + 1];

      const dayOfWeek = dayPatterns[dayCell];
      // typeof (rather than an undefined check) also rejects inherited object
      // properties such as "constructor" when the cell text happens to match.
      if (typeof dayOfWeek !== 'number') continue;
      if (!timeCell || timeCell === '-') continue;

      // Split comma-separated times: "10:00, 18:00" → ["10:00", "18:00"]
      for (const rawTime of timeCell.split(',')) {
        const time = normalizeMassTime(rawTime);
        if (time !== null) {
          results.push({ dayOfWeek, time });
        }
      }
    }
  }

  return results;
}
|
||||||
Reference in New Issue
Block a user