#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from Philmass.com
 *
 * Philmass.com provides rich Schema.org-annotated mass schedule data for
 * Philippine churches. It has no coordinates, so we match against existing
 * churches (OSM + mass-schedules.com) and only update matched records.
 * Unmatched churches are logged for manual review.
 *
 * Discovery strategy:
 *   1. Fetch Philippines page → extract province URLs
 *   2. For each province → extract city listing URLs
 *   3. For each city listing → extract church mass-schedule URLs
 *   4. Deduplicate all church URLs globally
 *   5. For each church: parse JSON-LD + Schema.org Events, match, upsert
 *
 * Usage:
 *   npx tsx scripts/import-philmass.ts --all
 *   npx tsx scripts/import-philmass.ts --all --dry-run
 *   npx tsx scripts/import-philmass.ts --province Metro-Manila
 *   npx tsx scripts/import-philmass.ts --all --resume-from Cebu
 *   npx tsx scripts/import-philmass.ts --all --job-id {uuid}
 */

import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Mask the password portion of the connection string before logging it.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);

const pool = new Pool({
  connectionString: dbUrl,
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';

// ─── Constants ───────────────────────────────────────────────────────────────

const SITE_BASE = 'https://www.philmass.com';
const PHILIPPINES_URL = `${SITE_BASE}/Asia/Philippines.html`;
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
/** Pause inserted before every HTTP request except the first. */
const REQUEST_DELAY_MS = 2000;
/** Abort a request that has not completed within this time so one hung socket cannot stall the run. */
const REQUEST_TIMEOUT_MS = 30_000;
/** Names shorter than this (after normalization) are too generic to match on. */
const MIN_MATCH_NAME_LENGTH = 8;

// href="https://www.philmass.com/Asia/Philippines/{Province}.html"
const PROVINCE_LINK_REGEX =
  /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\.html)"/g;
// href=".../{Province}/{City}/Roman-Catholic-Churches-in-{City}...html"
const CITY_LISTING_LINK_REGEX =
  /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/[^"]*\/Roman-Catholic-Churches-in-[^"]*\.html)"/g;
// href=".../{Province}/{City}/Roman-Catholic-Churches/{Church-Name}/mass-schedule.html"
// Path segments exclude `"` as well as `/` so a match can never run past the end of the attribute.
const CHURCH_LINK_REGEX =
  /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\/([^/"]+)\/Roman-Catholic-Churches\/([^/"]+)\/mass-schedule\.html)"/g;
// <script type="application/ld+json"> ... </script>
const JSON_LD_SCRIPT_REGEX =
  /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
// itemprop="startDate" content="2026-02-22T05:00:00+08:00"
const START_DATE_REGEX =
  /itemprop="startDate"\s+content="(\d{4}-\d{2}-\d{2})T(\d{2}):(\d{2}):\d{2}[^"]*"/g;

const NAMED_HTML_ENTITIES: Readonly<Record<string, string>> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
};

// ─── Types ───────────────────────────────────────────────────────────────────

interface ProvinceInfo {
  name: string;
  url: string;
}

interface ChurchUrl {
  url: string;
  slug: string; // URL slug used as philmassId
  province: string;
  city: string;
}

interface ParsedPhilmassChurch {
  name: string | null;
  streetAddress: string | null;
  city: string | null;
  region: string | null;
}

interface ParsedSchedule {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  time: string; // "HH:MM"
}

interface ImportStats {
  provincesProcessed: number;
  citiesProcessed: number;
  churchUrlsDiscovered: number;
  churchesProcessed: number;
  churchesMatched: number;
  churchesUnmatched: number;
  churchesSkipped: number;
  schedulesUpdated: number;
  massSchedulesCreated: number;
  errors: number;
}

interface CLIArgs {
  all: boolean;
  province?: string;
  dryRun: boolean;
  resumeFrom?: string;
  jobId?: string;
}

// ─── HTTP Client ─────────────────────────────────────────────────────────────

let requestCount = 0;

/** Resolves after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Fetches a page as text, pausing REQUEST_DELAY_MS before every request after
 * the first.
 *
 * @returns The response body, or `null` on a non-2xx status, a network error
 *          or a timeout (the failure is logged, never thrown).
 */
async function fetchPage(url: string): Promise<string | null> {
  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
    });

    if (!response.ok) {
      console.error(`  HTTP ${response.status} for ${url}`);
      return null;
    }

    return await response.text();
  } catch (error) {
    console.error(`  Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}

// ─── Discovery: Province → City → Church URLs ───────────────────────────────

/**
 * Extracts the province pages linked from the Philippines index page.
 *
 * @throws Error when the index page cannot be fetched.
 */
async function fetchProvinceUrls(): Promise<ProvinceInfo[]> {
  console.log(`Fetching Philippines page: ${PHILIPPINES_URL}`);
  const html = await fetchPage(PHILIPPINES_URL);
  if (!html) throw new Error('Failed to fetch Philippines page');

  const provinces: ProvinceInfo[] = [];
  const seen = new Set<string>();

  for (const match of html.matchAll(PROVINCE_LINK_REGEX)) {
    const url = match[1];
    const name = match[2];
    // Skip non-province pages (weekly-sunday, etc.) and repeated links
    if (name.includes('weekly') || name.includes('Roman-Catholic') || seen.has(name)) continue;
    seen.add(name);
    provinces.push({ name, url });
  }

  return provinces;
}

/**
 * Decodes numeric (`&#38;`, `&#x26;`) and the common named (`&amp;`, `&lt;`,
 * `&gt;`, `&quot;`, `&apos;`) HTML entities.
 *
 * Decoding happens in a single pass so already-decoded output is never decoded
 * again (`&amp;lt;` becomes `&lt;`, not `<`). Unknown or out-of-range entities
 * are left untouched.
 */
function decodeHtmlEntities(str: string): string {
  return str.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
    (entity: string, decimal?: string, hex?: string, name?: string) => {
      if (name !== undefined) {
        return NAMED_HTML_ENTITIES[name] ?? entity;
      }
      const codePoint =
        decimal !== undefined ? Number.parseInt(decimal, 10) : Number.parseInt(hex ?? '', 16);
      // fromCodePoint throws on values above U+10FFFF; keep such input verbatim.
      return Number.isInteger(codePoint) && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : entity;
    },
  );
}

/** `decodeURIComponent` that returns the input unchanged instead of throwing on a malformed `%` escape. */
function safeDecodeURIComponent(value: string): string {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
}

/**
 * Extracts the unique "Roman-Catholic-Churches-in-{City}" listing URLs from a
 * province page.
 *
 * @param provinceUrl  Province page to scan.
 * @param provinceName Currently unused; kept so existing callers stay valid.
 * @returns Listing URLs in page order; empty when the page cannot be fetched.
 */
async function fetchCityListingUrls(provinceUrl: string, provinceName: string): Promise<string[]> {
  const html = await fetchPage(provinceUrl);
  if (!html) return [];

  const urls = new Set<string>();
  for (const match of html.matchAll(CITY_LISTING_LINK_REGEX)) {
    urls.add(decodeHtmlEntities(match[1]));
  }

  return [...urls];
}

/**
 * Extracts the unique church mass-schedule URLs from a city listing page.
 * Province, city and slug are taken from the URL path segments.
 *
 * @param cityUrl      City listing page to scan.
 * @param provinceName Currently unused; the province is read from each URL.
 * @returns Church URLs in page order; empty when the page cannot be fetched.
 */
async function fetchChurchUrlsFromCityPage(cityUrl: string, provinceName: string): Promise<ChurchUrl[]> {
  const html = await fetchPage(cityUrl);
  if (!html) return [];

  const churches: ChurchUrl[] = [];
  const seen = new Set<string>();

  for (const match of html.matchAll(CHURCH_LINK_REGEX)) {
    const url = decodeHtmlEntities(match[1]);
    if (seen.has(url)) continue;
    seen.add(url);

    churches.push({
      url,
      slug: safeDecodeURIComponent(decodeHtmlEntities(match[4])),
      province: safeDecodeURIComponent(decodeHtmlEntities(match[2])),
      city: safeDecodeURIComponent(decodeHtmlEntities(match[3])),
    });
  }

  return churches;
}

// ─── HTML Parsers ────────────────────────────────────────────────────────────

function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

function nonEmptyString(value: unknown): string | null {
  return typeof value === 'string' && value !== '' ? value : null;
}

/**
 * Reads church details from the page's JSON-LD.
 *
 * Every `<script type="application/ld+json">` block is tried in order; the
 * first one whose `mainEntityOfPage` is a `PlaceOfWorship` wins. Blocks that
 * are not valid JSON are skipped.
 *
 * @returns The parsed church, or `null` when no suitable block exists.
 */
function parseChurchJsonLd(html: string): ParsedPhilmassChurch | null {
  for (const scriptMatch of html.matchAll(JSON_LD_SCRIPT_REGEX)) {
    let data: unknown;
    try {
      data = JSON.parse(scriptMatch[1]);
    } catch {
      continue;
    }

    if (!isRecord(data)) continue;
    const church = data.mainEntityOfPage;
    if (!isRecord(church) || church['@type'] !== 'PlaceOfWorship') continue;

    const address = isRecord(church.address) ? church.address : {};
    const street = nonEmptyString(address.streetAddress);

    return {
      name: nonEmptyString(church.name),
      // Drop a trailing comma left over from the site's address formatting.
      streetAddress: street ? street.replace(/,\s*$/, '').trim() || null : null,
      city: nonEmptyString(address.addressLocality),
      region: nonEmptyString(address.addressRegion),
    };
  }

  return null;
}

/**
 * Fallback name source when JSON-LD is missing, e.g.
 * `<h1>Quiapo Church mass schedule 2026 - Minor Basilica of the Black Nazarene</h1>`.
 *
 * @returns The heading text without the "mass schedule YYYY" phrase and
 *          without a leading/trailing " - ", or `null` if nothing remains.
 */
function parseChurchNameFromH1(html: string): string | null {
  const h1Match = html.match(/<h1[^>]*>([^<]+)<\/h1>/i);
  if (!h1Match) return null;

  const name = h1Match[1]
    .trim()
    // Remove "mass schedule YYYY" and a leading/trailing " - "
    .replace(/\s*mass\s+schedule\s+\d{4}\s*/i, '')
    .replace(/^\s*-\s*/, '')
    .replace(/\s*-\s*$/, '')
    .trim();

  return name || null;
}

/**
 * Collects the distinct (day-of-week, time) pairs from Schema.org Event
 * `startDate` microdata. The wall-clock time is taken verbatim from the
 * timestamp; its UTC offset is ignored.
 *
 * Entries with an unparseable date or an out-of-range time are skipped.
 */
function parseScheduleFromStartDates(html: string): ParsedSchedule[] {
  const schedules: ParsedSchedule[] = [];
  const seen = new Set<string>();

  for (const match of html.matchAll(START_DATE_REGEX)) {
    const [, dateStr, hours, minutes] = match;
    if (Number(hours) > 23 || Number(minutes) > 59) continue;

    // Derive dayOfWeek from the calendar date; noon avoids TZ edge cases
    const dayOfWeek = new Date(`${dateStr}T12:00:00`).getDay(); // 0=Sun, 1=Mon, ..., 6=Sat
    if (Number.isNaN(dayOfWeek)) continue;

    const time = `${hours}:${minutes}`;
    const key = `${dayOfWeek}:${time}`;
    if (seen.has(key)) continue;
    seen.add(key);

    schedules.push({ dayOfWeek, time });
  }

  return schedules;
}

// ─── Database Operations ─────────────────────────────────────────────────────

/** Loads every church with `country: 'PH'` for matching. */
async function loadExistingPhilippineChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Philippine churches for deduplication...');

  const churches = await prisma.church.findMany({
    where: { country: 'PH' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${churches.length} existing Philippine churches`);
  return churches;
}

// ─── Matching ────────────────────────────────────────────────────────────────

/** Churches already linked to a Philmass page during this run. */
const claimedChurchIds = new Set<ExistingChurch['id']>();

/** Normalized names of existing churches, computed once per record. */
const normalizedNameCache = new WeakMap<ExistingChurch, string>();

/**
 * Canonical form used for name comparison: lower-cased, "St"/"St." expanded to
 * "saint", the words "catholic church", "parish" and "church" removed,
 * punctuation stripped and whitespace collapsed.
 */
function normalizeChurchName(name: string): string {
  return name
    .toLowerCase()
    .replace(/\bst\.\s/g, 'saint ')
    .replace(/\bst\s/g, 'saint ')
    .replace(/\bcatholic church\b/g, '')
    .replace(/\bparish\b/g, '')
    .replace(/\bchurch\b/g, '')
    .replace(/[^\w\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

function normalizedExistingName(church: ExistingChurch): string {
  let normalized = normalizedNameCache.get(church);
  if (normalized === undefined) {
    normalized = normalizeChurchName(church.name);
    normalizedNameCache.set(church, normalized);
  }
  return normalized;
}

/**
 * Name-only matching (Philmass has no coordinates, so findDuplicateChurch()
 * cannot be used).
 *
 * A candidate matches when one normalized name contains the other and the
 * shorter of the two is at least MIN_MATCH_NAME_LENGTH characters, which
 * guards against generic names ("chapel", "holy", ...). Churches whose address
 * mentions the city are searched first, then every remaining church.
 *
 * Churches that already carry a philmassId, or were claimed earlier in this
 * run, are excluded: they belong to a different Philmass page and must not be
 * re-linked on a fuzzy name match.
 */
function findChurchByName(
  churchName: string,
  cityName: string,
  existingChurches: ExistingChurch[],
): ExistingChurch | null {
  const target = normalizeChurchName(churchName);
  // The shorter of the two names can never reach the minimum length.
  if (target.length < MIN_MATCH_NAME_LENGTH) return null;

  const available = existingChurches.filter(
    (c) => !c.philmassId && !claimedChurchIds.has(c.id),
  );
  const cityNeedle = cityName.toLowerCase();
  const inCity = available.filter(
    (c) => Boolean(c.address) && c.address!.toLowerCase().includes(cityNeedle),
  );

  const searchPools = inCity.length > 0 ? [inCity, available] : [available];
  for (const candidates of searchPools) {
    for (const existing of candidates) {
      const existingNorm = normalizedExistingName(existing);
      if (existingNorm.length < MIN_MATCH_NAME_LENGTH) continue;
      if (target.includes(existingNorm) || existingNorm.includes(target)) {
        return existing;
      }
    }
  }

  return null;
}

// ─── Import Logic ────────────────────────────────────────────────────────────

/**
 * Fetches one church page, matches it to an existing record and, unless
 * `dryRun` is set, links the record to Philmass, fills missing
 * address/city/state and replaces its mass schedules.
 *
 * Matching order: exact philmassId (the URL slug), then name matching.
 * Unmatched churches are appended to `unmatchedLog`; `stats` is updated in
 * place. In a dry run the schedule counters report what a real run would
 * write for matched churches.
 *
 * NOTE(review): the slug is only the church path segment, so two churches in
 * different cities may share one — confirm whether philmassId should include
 * the province/city.
 */
async function processChurch(
  churchUrl: ChurchUrl,
  existingChurches: ExistingChurch[],
  unmatchedLog: string[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  stats.churchesProcessed++;

  const html = await fetchPage(churchUrl.url);
  if (!html) {
    stats.errors++;
    return;
  }

  // Parse church info from JSON-LD, falling back to the page heading
  const jsonLd = parseChurchJsonLd(html);
  const churchName = jsonLd?.name || parseChurchNameFromH1(html);
  if (!churchName) {
    console.log(`  Skipping ${churchUrl.slug}: no name found`);
    stats.churchesSkipped++;
    return;
  }

  // Parse schedules from Schema.org startDate attributes
  const schedules = parseScheduleFromStartDates(html);

  const cityName = jsonLd?.city || churchUrl.city.replace(/-/g, ' ');
  const displayCity = jsonLd?.city || churchUrl.city;

  // Try philmassId first; a record already claimed in this run belongs to
  // another page that happens to share the slug.
  const matched =
    existingChurches.find(
      (c) => c.philmassId === churchUrl.slug && !claimedChurchIds.has(c.id),
    ) ?? findChurchByName(churchName, cityName, existingChurches);

  if (!matched) {
    stats.churchesUnmatched++;
    unmatchedLog.push(`${churchName} | ${displayCity} | ${churchUrl.url}`);
    if (dryRun) {
      console.log(`  [UNMATCHED] "${churchName}" in ${displayCity}`);
    }
    return;
  }

  stats.churchesMatched++;
  const churchId = matched.id;

  if (dryRun) {
    claimedChurchIds.add(churchId);
    console.log(`  [MATCH] "${churchName}" → existing "${matched.name}" (${churchId})`);
    if (schedules.length > 0) {
      stats.schedulesUpdated++;
      stats.massSchedulesCreated += schedules.length;
    }
    return;
  }

  // Update existing church: set philmassId, fill missing fields
  const updateData: { philmassId: string; address?: string; city?: string; state?: string } = {
    philmassId: churchUrl.slug,
  };

  if (!matched.address && jsonLd?.streetAddress) {
    updateData.address = [jsonLd.streetAddress, jsonLd.city, jsonLd.region]
      .filter(Boolean)
      .join(', ');
  }

  // Fill city/state from JSON-LD or URL, only where the record has none
  const dbRecord = await prisma.church.findUnique({
    where: { id: churchId },
    select: { city: true, state: true },
  });
  if (dbRecord && !dbRecord.city && (jsonLd?.city || churchUrl.city)) {
    updateData.city = jsonLd?.city || churchUrl.city.replace(/-/g, ' ');
  }
  if (dbRecord && !dbRecord.state && (jsonLd?.region || churchUrl.province)) {
    updateData.state = jsonLd?.region || churchUrl.province.replace(/-/g, ' ');
  }

  try {
    await prisma.church.update({
      where: { id: churchId },
      data: updateData,
    });
  } catch (error) {
    // Another record already holds this philmassId; leave both untouched.
    if (error instanceof Error && error.message.includes('Unique constraint')) {
      console.log(`  Skipping ${churchUrl.slug}: philmassId already in use`);
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }
  claimedChurchIds.add(churchId);

  // Replace mass schedules if we have any
  if (schedules.length === 0) return;

  try {
    await prisma.$transaction(async (tx) => {
      await tx.massSchedule.deleteMany({ where: { churchId } });
      await tx.massSchedule.createMany({
        data: schedules.map((s) => ({
          churchId,
          dayOfWeek: s.dayOfWeek,
          time: s.time,
          language: 'English',
        })),
      });
      await tx.church.update({
        where: { id: churchId },
        data: { lastScrapedAt: new Date() },
      });
    });
    stats.schedulesUpdated++;
    stats.massSchedulesCreated += schedules.length;
  } catch (error) {
    stats.errors++;
    console.error(`  Error saving schedules for ${churchUrl.slug}: ${error instanceof Error ? error.message : error}`);
  }
}

// ─── CLI ─────────────────────────────────────────────────────────────────────

/**
 * Parses `process.argv`.
 *
 * Exits with status 0 after printing help, and with status 1 when a flag is
 * missing its value or neither `--all` nor `--province` is given. Unknown
 * arguments are ignored with a warning.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
  };

  const requireValue = (flag: string, value: string | undefined): string => {
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--all':
        result.all = true;
        break;
      case '--province':
        result.province = requireValue(arg, args[++i]);
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = requireValue(arg, args[++i]);
        break;
      case '--job-id':
        result.jobId = requireValue(arg, args[++i]);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-philmass.ts [options]

Options:
  --all                     Import from all provinces
  --province <name>         Import from a single province (e.g. "Metro-Manila")
  --dry-run                 No database writes, just report what would happen
  --resume-from <province>  Skip provinces until reaching this one
  --job-id <id>             Background job tracking ID
  --help, -h                Show this help message

Examples:
  npx tsx scripts/import-philmass.ts --province Metro-Manila --dry-run
  npx tsx scripts/import-philmass.ts --all
  npx tsx scripts/import-philmass.ts --all --resume-from Cebu
`);
        process.exit(0);
        break;
      default:
        console.warn(`Warning: ignoring unknown argument "${arg}"`);
    }
  }

  if (!result.all && !result.province) {
    console.error('Error: specify --all or --province <name>');
    process.exit(1);
  }

  return result;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/** Formats a millisecond duration as "1h 2m 3s", "2m 3s" or "3s". */
function formatDuration(ms: number): string {
  const seconds = Math.floor(ms / 1000);
  const minutes = Math.floor(seconds / 60);
  const hours = Math.floor(minutes / 60);
  if (hours > 0) return `${hours}h ${minutes % 60}m ${seconds % 60}s`;
  if (minutes > 0) return `${minutes}m ${seconds % 60}s`;
  return `${seconds}s`;
}

// ─── Main ────────────────────────────────────────────────────────────────────

/**
 * Runs discovery (province → city → church URLs) and then processes every
 * discovered church. Invalid `--province` / `--resume-from` values set a
 * non-zero exit code and return, so the database connections are still
 * closed by the caller.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('PHILMASS.COM IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: ${args.all ? 'All provinces' : `Single province: ${args.province}`}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (error) {
      // Best effort: the job might not exist yet
      console.warn(`  Could not mark job ${args.jobId} as running: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Load existing Philippine churches for dedup
  const existingChurches = await loadExistingPhilippineChurches();

  // ─── Phase 1: Discover all church URLs ───────────────────────────────────

  console.log('=== Phase 1: Discovering church URLs ===\n');

  const allProvinces = await fetchProvinceUrls();
  console.log(`Found ${allProvinces.length} provinces\n`);

  // Filter to requested provinces
  let provincesToProcess: ProvinceInfo[];
  if (args.province) {
    const found = allProvinces.find((p) => p.name === args.province);
    if (!found) {
      console.error(`Province "${args.province}" not found. Available: ${allProvinces.map((p) => p.name).join(', ')}`);
      process.exitCode = 1;
      return;
    }
    provincesToProcess = [found];
  } else {
    provincesToProcess = allProvinces;
  }

  // Handle --resume-from
  if (args.resumeFrom) {
    const idx = provincesToProcess.findIndex((p) => p.name === args.resumeFrom);
    if (idx === -1) {
      console.error(`Resume province "${args.resumeFrom}" not found.`);
      process.exitCode = 1;
      return;
    }
    console.log(`Resuming from province "${args.resumeFrom}" (skipping ${idx} provinces)\n`);
    provincesToProcess = provincesToProcess.slice(idx);
  }

  // Collect all unique church URLs across all provinces/cities
  const allChurchUrls = new Map<string, ChurchUrl>(); // keyed by URL to deduplicate
  const stats: ImportStats = {
    provincesProcessed: 0,
    citiesProcessed: 0,
    churchUrlsDiscovered: 0,
    churchesProcessed: 0,
    churchesMatched: 0,
    churchesUnmatched: 0,
    churchesSkipped: 0,
    schedulesUpdated: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  for (const [pi, province] of provincesToProcess.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${pi + 1}/${provincesToProcess.length}] Province: ${province.name} [${elapsed} elapsed]`);

    try {
      // Get city listing URLs from province page
      const cityUrls = await fetchCityListingUrls(province.url, province.name);
      console.log(`  Found ${cityUrls.length} city listing pages`);

      for (const cityUrl of cityUrls) {
        const churchUrls = await fetchChurchUrlsFromCityPage(cityUrl, province.name);
        stats.citiesProcessed++;

        for (const church of churchUrls) {
          if (!allChurchUrls.has(church.url)) {
            allChurchUrls.set(church.url, church);
          }
        }
      }

      stats.provincesProcessed++;
      console.log(`  Total unique churches so far: ${allChurchUrls.size}`);
    } catch (error) {
      stats.errors++;
      console.error(`  ERROR discovering ${province.name}: ${error instanceof Error ? error.message : error}`);
    }
  }

  stats.churchUrlsDiscovered = allChurchUrls.size;
  console.log(`\nDiscovery complete: ${allChurchUrls.size} unique church URLs across ${stats.citiesProcessed} city pages\n`);

  // ─── Phase 2: Process each church ─────────────────────────────────────────

  console.log('=== Phase 2: Processing churches ===\n');

  const churchList = [...allChurchUrls.values()];
  const unmatchedLog: string[] = [];

  for (const [i, church] of churchList.entries()) {
    // Progress line for the first church and every 50th thereafter
    if ((i + 1) % 50 === 0 || i === 0) {
      const elapsed = formatDuration(Date.now() - startTime);
      console.log(`[${i + 1}/${churchList.length}] Processing churches... [${elapsed} elapsed]`);
    }

    try {
      await processChurch(church, existingChurches, unmatchedLog, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(`  ERROR processing ${church.slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Provinces processed:     ${stats.provincesProcessed}`);
  console.log(`Cities processed:        ${stats.citiesProcessed}`);
  console.log(`Church URLs discovered:  ${stats.churchUrlsDiscovered}`);
  console.log(`Churches processed:      ${stats.churchesProcessed}`);
  console.log(`  Matched (updated):     ${stats.churchesMatched}`);
  console.log(`  Unmatched (skipped):   ${stats.churchesUnmatched}`);
  console.log(`  Skipped (other):       ${stats.churchesSkipped}`);
  console.log(`Schedules updated:       ${stats.schedulesUpdated}`);
  console.log(`Mass schedules created:  ${stats.massSchedulesCreated}`);
  console.log(`Errors:                  ${stats.errors}`);
  console.log(`Total time:              ${formatDuration(totalTime)}`);
  console.log(`HTTP requests:           ${requestCount}`);
  console.log('='.repeat(70));

  // Log unmatched churches for manual review
  if (unmatchedLog.length > 0) {
    console.log(`\nUnmatched churches (${unmatchedLog.length}):`);
    console.log('-'.repeat(70));
    for (const line of unmatchedLog) {
      console.log(`  ${line}`);
    }
    console.log('-'.repeat(70));
  }

  console.log('');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      // Best effort: the import itself has already finished
      console.warn(`  Could not record result for job ${args.jobId}: ${error instanceof Error ? error.message : error}`);
    }
  }
}

main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit() so the cleanup
    // below still runs before the process ends.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });