#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from mass-schedules.com (Philippines)
 *
 * mass-schedules.com has been operating since 2008 and covers ~1,500 Philippine
 * churches with weekly mass schedule tables and coordinates on separate map pages.
 *
 * Import strategy:
 *   1. Fetch sitemap XML → extract all /catholic-church/{id}/ URLs
 *   2. For each church: fetch page HTML, parse name/address/schedule, fetch map
 *      page for coordinates, match against existing PH churches, upsert
 *
 * Usage:
 *   npx tsx scripts/import-mass-schedules-ph.ts --all
 *   npx tsx scripts/import-mass-schedules-ph.ts --all --dry-run
 *   npx tsx scripts/import-mass-schedules-ph.ts --church-id 34
 *   npx tsx scripts/import-mass-schedules-ph.ts --all --resume-from 500
 *   npx tsx scripts/import-mass-schedules-ph.ts --all --skip-schedules
 *   npx tsx scripts/import-mass-schedules-ph.ts --all --job-id {uuid}
 */

import dotenv from 'dotenv';
import path from 'path';

// Load .env.local first, then .env, before DATABASE_URL is read below.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

const dbUrl =
  process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Mask the password portion of the connection string before logging it.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);

const pool = new Pool({
  connectionString: dbUrl,
  // SSL options are only supplied when the connection string mentions "neon".
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';

// ─── Constants ───────────────────────────────────────────────────────────────

const SITE_BASE = 'https://www.mass-schedules.com';
const SITEMAP_URL = `${SITE_BASE}/sitemaps/sitemap02272021.xml`;
const USER_AGENT =
  'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
/** Pause inserted before every request after the first one. */
const REQUEST_DELAY_MS = 1500;
/** Upper bound for a single HTTP request so one stalled connection cannot hang the import. */
const REQUEST_TIMEOUT_MS = 30_000;

// ─── Types ───────────────────────────────────────────────────────────────────

interface SitemapChurch {
  id: string;
  slug: string;
  url: string;
}

interface ParsedChurch {
  name: string;
  address: string | null;
  region: string | null;
  city: string | null;
  phone: string | null;
  mapUrl: string | null;
}

interface ParsedSchedule {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  time: string; // "05:00", "18:30"
}

interface ImportStats {
  churchesFound: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  schedulesProcessed: number;
  massSchedulesCreated: number;
  errors: number;
}

interface CLIArgs {
  all: boolean;
  churchId?: string;
  dryRun: boolean;
  skipSchedules: boolean;
  resumeFrom?: number;
  jobId?: string;
}

// ─── HTTP Client ─────────────────────────────────────────────────────────────

/** Number of requests issued so far; used to skip the delay before the very first one. */
let requestCount = 0;

/**
 * Resolve after `ms` milliseconds.
 *
 * @param ms - Time to wait, in milliseconds.
 */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Fetch a URL as text, waiting `REQUEST_DELAY_MS` before every request except
 * the first.
 *
 * @param url - Absolute URL to fetch.
 * @returns The response body, or `null` when the response status is not OK,
 *   the request throws, or it exceeds `REQUEST_TIMEOUT_MS`. Failures are
 *   logged to stderr rather than thrown.
 */
async function fetchPage(url: string): Promise<string | null> {
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      // Abort stalled requests; the rejection is handled by the catch below.
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
    });

    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }

    return await response.text();
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}

// ─── Sitemap Parser ──────────────────────────────────────────────────────────

/**
 * Download the sitemap and collect one entry per church ID.
 *
 * @returns Churches found in the sitemap, de-duplicated by ID (first
 *   occurrence wins) and sorted by numeric ID in ascending order.
 * @throws Error if the sitemap could not be fetched.
 */
async function fetchChurchUrlsFromSitemap(): Promise<SitemapChurch[]> {
  console.log(`Fetching sitemap: ${SITEMAP_URL}`);
  const xml = await fetchPage(SITEMAP_URL);
  if (!xml) {
    throw new Error('Failed to fetch sitemap');
  }

  // Extract /catholic-church/{id}/{slug}.html URLs
  const urlRegex = /\/catholic-church\/(\d+)\/([\w-]+)\.html/g;
  const seen = new Set<string>();
  const churches: SitemapChurch[] = [];

  for (const [, id, slug] of xml.matchAll(urlRegex)) {
    // Both groups always participate in a match; this guard only narrows the types.
    if (!id || !slug) continue;
    if (seen.has(id)) continue; // Sitemap has duplicates
    seen.add(id);
    churches.push({
      id,
      slug,
      url: `${SITE_BASE}/catholic-church/${id}/${slug}.html`,
    });
  }

  // Sort by ID for predictable ordering
  churches.sort((a, b) => parseInt(a.id, 10) - parseInt(b.id, 10));

  return churches;
}

// ─── HTML Parsers ────────────────────────────────────────────────────────────

function parseChurchPage(html: string): ParsedChurch {
  // Name from

...

const h1Match = html.match(/]*class="page_title"[^>]*>([\s\S]*?)<\/h1>/i); let name = h1Match ? h1Match[1].trim() : ''; // Remove " Mass Schedule" suffix name = name.replace(/\s*Mass\s*Schedule\s*$/i, '').trim(); // Address from ...

...

const addressMatch = html.match(/