#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from Philmass.com
 *
 * Philmass.com provides rich Schema.org-annotated mass schedule data for
 * Philippine churches. It has no coordinates, so we match against existing
 * churches (OSM + mass-schedules.com) and only update matched records.
 * Unmatched churches are logged for manual review.
 *
 * Discovery strategy:
 *   1. Fetch Philippines page → extract province URLs
 *   2. For each province → extract city listing URLs
 *   3. For each city listing → extract church mass-schedule URLs
 *   4. Deduplicate all church URLs globally
 *   5. For each church: parse JSON-LD + Schema.org Events, match, upsert
 *
 * Usage:
 *   npx tsx scripts/import-philmass.ts --all
 *   npx tsx scripts/import-philmass.ts --all --dry-run
 *   npx tsx scripts/import-philmass.ts --province Metro-Manila
 *   npx tsx scripts/import-philmass.ts --all --resume-from Cebu
 *   npx tsx scripts/import-philmass.ts --all --job-id <uuid>
 */
|
||
|
|
|
||
|
|
import dotenv from 'dotenv';
|
||
|
|
import path from 'path';
|
||
|
|
|
||
|
|
// Load environment files from the current working directory, `.env.local`
// before `.env`. NOTE(review): dotenv's documented default is to keep values
// that are already set, which would make `.env.local` win — confirm if the
// precedence matters.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
|
||
|
|
// Connection string for the import target; falls back to a local database
// when DATABASE_URL is unset or empty.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Mask the `:password@` portion before echoing the URL to the console.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
  connectionString: dbUrl,
  // NOTE(review): certificate verification is disabled whenever the URL
  // contains "neon" — confirm this is intentional for that host.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma talks to Postgres through the shared `pg` pool created above.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||
|
|
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||
|
|
|
||
|
|
// ─── Constants ───────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Origin of the scraped site; every discovered link is expected to start with it. */
const SITE_BASE = 'https://www.philmass.com';
/** Country index page from which province pages are discovered. */
const PHILIPPINES_URL = `${SITE_BASE}/Asia/Philippines.html`;
/** User-Agent header sent with every request, identifying the importer and a contact address. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
/** Pause, in milliseconds, inserted before every request after the first. */
const REQUEST_DELAY_MS = 2000;
|
||
|
|
|
||
|
|
// ─── Types ───────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** A province page discovered on the Philippines index page. */
interface ProvinceInfo {
  /** Province name exactly as it appears in the URL path (e.g. "Metro-Manila"). */
  name: string;
  /** Absolute URL of the province page. */
  url: string;
}
|
||
|
|
|
||
|
|
/** A church mass-schedule page discovered on a city listing page. */
interface ChurchUrl {
  /** Absolute URL of the church's mass-schedule page. */
  url: string;
  /** Church segment of the URL path; stored on matched churches as `philmassId`. */
  slug: string;
  /** Province segment of the URL path (hyphenated, as in the URL). */
  province: string;
  /** City segment of the URL path (hyphenated, as in the URL). */
  city: string;
}
|
||
|
|
|
||
|
|
/** Church details extracted from a page's JSON-LD `PlaceOfWorship` entity. */
interface ParsedPhilmassChurch {
  /** Church name from JSON-LD; callers treat a falsy value as "missing" and fall back to the page heading. */
  name: string;
  /** Street address with any trailing comma removed, or null when absent. */
  streetAddress: string | null;
  /** JSON-LD `addressLocality`, or null when absent. */
  city: string | null;
  /** JSON-LD `addressRegion`, or null when absent. */
  region: string | null;
}
|
||
|
|
|
||
|
|
/** One recurring mass slot derived from a Schema.org `startDate`. */
interface ParsedSchedule {
  /** Day of week, 0 = Sunday … 6 = Saturday. */
  dayOfWeek: number;
  /** Time of day as "HH:MM", taken verbatim from the timestamp on the page. */
  time: string;
}
|
||
|
|
|
||
|
|
/** Counters accumulated over one importer run and printed in the final summary. */
interface ImportStats {
  /** Province pages whose city listings were fully walked. */
  provincesProcessed: number;
  /** City listing pages fetched. */
  citiesProcessed: number;
  /** Unique church URLs found during discovery. */
  churchUrlsDiscovered: number;
  /** Church pages attempted in phase 2. */
  churchesProcessed: number;
  /** Churches paired with an existing database record. */
  churchesMatched: number;
  /** Churches with no existing record to pair with. */
  churchesUnmatched: number;
  /** Churches skipped for other reasons (no name found, unique-constraint conflict). */
  churchesSkipped: number;
  /** Churches whose schedule set was replaced. */
  schedulesUpdated: number;
  /** Individual mass schedule rows written (or that would be written in a dry run). */
  massSchedulesCreated: number;
  /** Failed fetches and caught exceptions. */
  errors: number;
}
|
||
|
|
|
||
|
|
/** Parsed command-line options. */
interface CLIArgs {
  /** `--all`: import every province. */
  all: boolean;
  /** `--province <name>`: restrict the run to one province (name as in the URL). */
  province?: string;
  /** `--dry-run`: report matches without writing church or schedule rows. */
  dryRun: boolean;
  /** `--resume-from <province>`: skip provinces listed before this one. */
  resumeFrom?: string;
  /** `--job-id <uuid>`: background job row to update with run status. */
  jobId?: string;
}
|
||
|
|
|
||
|
|
// ─── HTTP Client ─────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
// Number of HTTP requests issued so far. fetchPage increments it and uses it
// to skip the throttle delay before the very first request; main prints it
// in the final summary.
let requestCount = 0;
|
||
|
|
|
||
|
|
/**
 * Wait for a timer of `ms` milliseconds.
 *
 * @param ms Timer duration in milliseconds.
 * @returns A promise that resolves (with no value) when the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||
|
|
|
||
|
|
/**
 * Fetch a page as text, throttled to one request per REQUEST_DELAY_MS.
 *
 * Every request after the first waits REQUEST_DELAY_MS before being sent.
 * Failures never throw: they are logged and reported as `null` so callers can
 * skip the page and continue.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body, or `null` on a non-2xx status, network error or timeout.
 */
async function fetchPage(url: string): Promise<string | null> {
  // Upper bound for one request (headers + body) so a single stalled
  // connection cannot hang the whole import.
  const requestTimeoutMs = 30000;

  if (requestCount > 0) {
    await delay(REQUEST_DELAY_MS);
  }
  requestCount++;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      signal: AbortSignal.timeout(requestTimeoutMs),
    });

    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }

    return await response.text();
  } catch (error) {
    // Covers network failures as well as the timeout abort above.
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}
|
||
|
|
|
||
|
|
// ─── Discovery: Province → City → Church URLs ───────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Discover province pages linked from the Philippines index page.
 *
 * @returns One entry per distinct province name, in page order.
 * @throws Error when the index page cannot be fetched.
 */
async function fetchProvinceUrls(): Promise<ProvinceInfo[]> {
  console.log(`Fetching Philippines page: ${PHILIPPINES_URL}`);
  const html = await fetchPage(PHILIPPINES_URL);
  if (!html) throw new Error('Failed to fetch Philippines page');

  // Pattern: href="https://www.philmass.com/Asia/Philippines/{Province}.html"
  const linkPattern = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\.html)"/g;
  const byName = new Map<string, ProvinceInfo>();

  for (const [, url, name] of html.matchAll(linkPattern)) {
    if (!url || !name) continue;
    // Skip non-province pages (weekly-sunday, etc.)
    if (name.includes('weekly') || name.includes('Roman-Catholic')) continue;
    // First occurrence of a province name wins.
    if (!byName.has(name)) {
      byName.set(name, { name, url });
    }
  }

  return [...byName.values()];
}
|
||
|
|
|
||
|
|
/**
 * Decode the HTML character references that occur in scraped attribute values.
 *
 * Handles decimal and hexadecimal numeric references plus the named
 * references amp, lt, gt, quot and apos. Decoding is done in a single pass,
 * so text produced by one reference is never decoded a second time (an
 * escaped ampersand followed by "lt;" stays literal text).
 *
 * @param str Text that may contain character references.
 * @returns The text with recognised references replaced; anything else is left as is.
 */
function decodeHtmlEntities(str: string): string {
  const namedEntities: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };
  const entityPattern = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g;

  return str.replace(entityPattern, (entity: string, dec?: string, hex?: string, name?: string) => {
    if (name !== undefined) {
      return namedEntities[name] ?? entity;
    }
    const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
    // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
    if (!Number.isFinite(codePoint) || codePoint > 0x10ffff) {
      return entity;
    }
    return String.fromCodePoint(codePoint);
  });
}
|
||
|
|
|
||
|
|
/**
 * Collect the city listing pages linked from a province page.
 *
 * @param provinceUrl Absolute URL of the province page.
 * @param provinceName Province name; currently unused, kept for call-site symmetry.
 * @returns Distinct listing URLs in page order, or an empty array when the page cannot be fetched.
 */
async function fetchCityListingUrls(provinceUrl: string, provinceName: string): Promise<string[]> {
  const html = await fetchPage(provinceUrl);
  if (!html) return [];

  // Pattern: href=".../{Province}/{City}/Roman-Catholic-Churches-in-{City}...html"
  const linkPattern = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/[^"]*\/Roman-Catholic-Churches-in-[^"]*\.html)"/g;

  // A Set keeps insertion order, so it both deduplicates and preserves page order.
  const listingUrls = new Set<string>();
  for (const [, rawUrl] of html.matchAll(linkPattern)) {
    if (rawUrl) {
      listingUrls.add(decodeHtmlEntities(rawUrl));
    }
  }

  return [...listingUrls];
}
|
||
|
|
|
||
|
|
/**
 * Collect church mass-schedule pages linked from a city listing page.
 *
 * Province, city and slug are taken from the link's own path segments.
 *
 * @param cityUrl Absolute URL of the city listing page.
 * @param provinceName Province name; currently unused, kept for call-site symmetry.
 * @returns Distinct church links in page order, or an empty array when the page cannot be fetched.
 */
async function fetchChurchUrlsFromCityPage(cityUrl: string, provinceName: string): Promise<ChurchUrl[]> {
  const html = await fetchPage(cityUrl);
  if (!html) return [];

  // Pattern: href=".../Roman-Catholic-Churches/{Church-Name}/mass-schedule.html"
  // Segments exclude `"` as well as `/` so a match cannot run past the end of
  // the href attribute into the following markup.
  const linkPattern = /href="(https:\/\/www\.philmass\.com\/Asia\/Philippines\/([^/"]+)\/([^/"]+)\/Roman-Catholic-Churches\/([^/"]+)\/mass-schedule\.html)"/g;

  const churchesByUrl = new Map<string, ChurchUrl>();
  for (const [, rawUrl, rawProvince, rawCity, rawSlug] of html.matchAll(linkPattern)) {
    if (!rawUrl || !rawProvince || !rawCity || !rawSlug) continue;

    const url = decodeHtmlEntities(rawUrl);
    if (churchesByUrl.has(url)) continue;

    churchesByUrl.set(url, {
      url,
      slug: decodeUrlSegment(rawSlug),
      province: decodeUrlSegment(rawProvince),
      city: decodeUrlSegment(rawCity),
    });
  }

  return [...churchesByUrl.values()];
}

/** Entity- and percent-decode one path segment, keeping the entity-decoded text when the percent-encoding is malformed. */
function decodeUrlSegment(segment: string): string {
  const text = decodeHtmlEntities(segment);
  try {
    return decodeURIComponent(text);
  } catch {
    // decodeURIComponent throws URIError on a stray "%"; one bad link must
    // not abort discovery for the whole page.
    return text;
  }
}
|
||
|
|
|
||
|
|
// ─── HTML Parsers ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Extract church details from the page's JSON-LD.
 *
 * Every `<script type="application/ld+json">` element is inspected in page
 * order; the first one whose `mainEntityOfPage` is a `PlaceOfWorship` wins.
 * Scripts that fail to parse or describe something else are skipped.
 *
 * @param html Full HTML of a church page.
 * @returns The parsed details, or `null` when no usable JSON-LD is present.
 *   `name` is an empty string when the entity has no name, so callers can
 *   fall back with `||`.
 */
function parseChurchJsonLd(html: string): ParsedPhilmassChurch | null {
  const scriptPattern = /<script\b[^>]*\btype=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

  for (const [, payload] of html.matchAll(scriptPattern)) {
    if (!payload) continue;

    let data: unknown;
    try {
      data = JSON.parse(payload);
    } catch {
      continue; // malformed JSON in this script; try the next one
    }
    if (!isJsonObject(data)) continue;

    const church = data.mainEntityOfPage;
    if (!isJsonObject(church) || church['@type'] !== 'PlaceOfWorship') continue;

    const address = isJsonObject(church.address) ? church.address : {};
    const street = nonEmptyString(address.streetAddress);

    return {
      name: nonEmptyString(church.name) ?? '',
      // The site's street addresses may end with a dangling comma.
      streetAddress: street ? street.replace(/,\s*$/, '').trim() || null : null,
      city: nonEmptyString(address.addressLocality),
      region: nonEmptyString(address.addressRegion),
    };
  }

  return null;
}

/** True for plain JSON objects (not null, not arrays). */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Return `value` when it is a non-empty string, otherwise null. */
function nonEmptyString(value: unknown): string | null {
  return typeof value === 'string' && value !== '' ? value : null;
}
|
||
|
|
|
||
|
|
/**
 * Fallback name extraction from the page heading.
 *
 * Example heading:
 *   <h1>Quiapo Church mass schedule 2026 - Minor Basilica of the Black Nazarene</h1>
 * yields "Quiapo Church - Minor Basilica of the Black Nazarene".
 *
 * @param html Full HTML of a church page.
 * @returns The heading text without the "mass schedule YYYY" phrase and without
 *   leading/trailing dashes, or `null` when there is no text-only `<h1>` or
 *   nothing remains after cleaning.
 */
function parseChurchNameFromH1(html: string): string | null {
  // Allow attributes on the tag (<h1 class="...">); the content must be plain text.
  const heading = /<h1\b[^>]*>([^<]+)<\/h1>/i.exec(html);
  const rawText = heading?.[1];
  if (!rawText) return null;

  const cleaned = rawText
    .trim()
    // Replace the phrase with a space so the words on either side stay separated.
    .replace(/\s*mass\s+schedule\s+\d{4}\s*/i, ' ')
    .replace(/^\s*-\s*/, '')
    .replace(/\s*-\s*$/, '')
    .trim();

  return cleaned || null;
}
|
||
|
|
|
||
|
|
/**
 * Derive weekly mass slots from Schema.org Event microdata.
 *
 * Looks for attributes of the form
 *   itemprop="startDate" content="2026-02-22T05:00:00+08:00"
 * and reduces each timestamp to (day of week, "HH:MM"). The date and time are
 * read as written on the page, ignoring the UTC offset. Repeated
 * (day, time) pairs are collapsed; the first occurrence fixes the order.
 * Timestamps with an impossible date or time are skipped.
 *
 * @param html Full HTML of a church page.
 * @returns Distinct slots in page order (0 = Sunday … 6 = Saturday).
 */
function parseScheduleFromStartDates(html: string): ParsedSchedule[] {
  const startDatePattern = /itemprop="startDate"\s+content="(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):\d{2}[^"]*"/g;
  const slotsByKey = new Map<string, ParsedSchedule>();

  for (const [, yearText, monthText, dayText, hours = '', minutes = ''] of html.matchAll(startDatePattern)) {
    const year = Number(yearText);
    const month = Number(monthText);
    const day = Number(dayText);

    // Build the calendar date in UTC so the weekday never depends on the
    // machine's time zone.
    const date = new Date(Date.UTC(year, month - 1, day));
    // Date.UTC silently rolls impossible dates over (month 13, Feb 30);
    // a round-trip mismatch identifies them.
    const isRealDate =
      date.getUTCFullYear() === year && date.getUTCMonth() === month - 1 && date.getUTCDate() === day;
    if (!isRealDate || Number(hours) > 23 || Number(minutes) > 59) continue;

    const dayOfWeek = date.getUTCDay(); // 0=Sun, 1=Mon, ..., 6=Sat
    const time = `${hours}:${minutes}`;
    const key = `${dayOfWeek}:${time}`;

    if (!slotsByKey.has(key)) {
      slotsByKey.set(key, { dayOfWeek, time });
    }
  }

  return [...slotsByKey.values()];
}
|
||
|
|
|
||
|
|
// ─── Database Operations ─────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Load every church with country code "PH" for in-memory matching.
 *
 * Only the columns listed in `select` are read: id, name, coordinates, the
 * per-source external ids, and the source/website/phone/address fields.
 *
 * @returns All matching rows.
 */
async function loadExistingPhilippineChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Philippine churches for deduplication...');

  const churches = await prisma.church.findMany({
    where: { country: 'PH' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External ids, one per upstream data source.
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${churches.length} existing Philippine churches`);
  return churches;
}
|
||
|
|
|
||
|
|
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Ids of existing churches already paired with a Philmass page during this
 * run, so that two different pages cannot both claim the same record by name.
 */
const claimedChurchIds = new Set<ExistingChurch['id']>();

/**
 * Fetch one Philmass church page, pair it with an existing church and, unless
 * this is a dry run, store the Philmass id, fill missing address fields and
 * replace the church's mass schedules.
 *
 * Matching order: exact `philmassId` first, then name containment (in-city
 * candidates before all Philippine churches). Churches with no counterpart
 * are appended to `unmatchedLog` and never created.
 *
 * @param churchUrl Discovered page to process.
 * @param existingChurches Churches loaded up front for matching.
 * @param unmatchedLog Receives one "name | city | url" line per unmatched church.
 * @param dryRun When true, nothing is written; only counters and console output change.
 * @param stats Run counters, updated in place.
 * @throws Rethrows database errors from the church update other than unique-constraint conflicts.
 */
async function processChurch(
  churchUrl: ChurchUrl,
  existingChurches: ExistingChurch[],
  unmatchedLog: string[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  stats.churchesProcessed++;

  const html = await fetchPage(churchUrl.url);
  if (!html) {
    stats.errors++;
    return;
  }

  // Parse church info from JSON-LD, falling back to the page heading for the name
  const jsonLd = parseChurchJsonLd(html);
  const churchName = jsonLd?.name || parseChurchNameFromH1(html);
  if (!churchName) {
    console.log(` Skipping ${churchUrl.slug}: no name found`);
    stats.churchesSkipped++;
    return;
  }

  // Parse schedules from Schema.org startDate attributes
  const schedules = parseScheduleFromStartDates(html);

  const cityFromUrl = churchUrl.city.replace(/-/g, ' ');
  const displayCity = jsonLd?.city || churchUrl.city;

  // An exact philmassId match wins; otherwise fall back to name matching.
  // findDuplicateChurch() is not usable here because Philmass has no coordinates.
  const matched =
    existingChurches.find((c) => c.philmassId === churchUrl.slug) ??
    findChurchByName(churchName, jsonLd?.city || cityFromUrl, existingChurches);

  if (!matched) {
    stats.churchesUnmatched++;
    unmatchedLog.push(`${churchName} | ${displayCity} | ${churchUrl.url}`);
    if (dryRun) {
      console.log(` [UNMATCHED] "${churchName}" in ${displayCity}`);
    }
    return;
  }

  claimedChurchIds.add(matched.id);
  stats.churchesMatched++;

  if (dryRun) {
    console.log(` [MATCH] "${churchName}" → existing "${matched.name}" (${matched.id})`);
    // Only matched churches would receive schedules in a real run.
    stats.massSchedulesCreated += schedules.length;
    return;
  }

  // Update existing church: set philmassId, fill missing fields
  const updateData: Record<string, unknown> = {
    philmassId: churchUrl.slug,
  };

  if (!matched.address && jsonLd?.streetAddress) {
    updateData.address = [jsonLd.streetAddress, jsonLd.city, jsonLd.region].filter(Boolean).join(', ');
  }

  // city/state are not part of the preloaded rows, so read them here and only
  // fill them in when the record has none.
  const dbRecord = await prisma.church.findUnique({
    where: { id: matched.id },
    select: { city: true, state: true },
  });
  const cityValue = jsonLd?.city || cityFromUrl;
  const stateValue = jsonLd?.region || churchUrl.province.replace(/-/g, ' ');
  if (dbRecord && !dbRecord.city && cityValue) {
    updateData.city = cityValue;
  }
  if (dbRecord && !dbRecord.state && stateValue) {
    updateData.state = stateValue;
  }

  try {
    await prisma.church.update({
      where: { id: matched.id },
      data: updateData,
    });
  } catch (error) {
    // Another record already carries this philmassId: leave both untouched.
    if (isUniqueConstraintError(error)) {
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }

  // Replace mass schedules if we have any
  if (schedules.length === 0) return;

  const churchId = matched.id;
  try {
    // Delete + insert + timestamp in one transaction so a failure cannot
    // leave the church without schedules.
    await prisma.$transaction(async (tx) => {
      await tx.massSchedule.deleteMany({ where: { churchId } });
      await tx.massSchedule.createMany({
        data: schedules.map((slot) => ({
          churchId,
          dayOfWeek: slot.dayOfWeek,
          time: slot.time,
          language: 'English',
        })),
      });
      await tx.church.update({
        where: { id: churchId },
        data: { lastScrapedAt: new Date() },
      });
    });
    stats.schedulesUpdated++;
    stats.massSchedulesCreated += schedules.length;
  } catch (error) {
    stats.errors++;
    console.error(` Error saving schedules for ${churchUrl.slug}: ${error instanceof Error ? error.message : error}`);
  }
}

/** Lower-case a church name and strip generic words and punctuation so names from different sources compare equal. */
function normalizeChurchName(name: string): string {
  return name
    .toLowerCase()
    .replace(/\bst\.\s/g, 'saint ')
    .replace(/\bst\s/g, 'saint ')
    .replace(/\bcatholic church\b/g, '')
    .replace(/\bparish\b/g, '')
    .replace(/\bchurch\b/g, '')
    .replace(/[^\w\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

/** Find an unclaimed existing church whose normalised name contains, or is contained in, the given name; in-city candidates are tried first. */
function findChurchByName(
  churchName: string,
  cityName: string,
  existingChurches: ExistingChurch[],
): ExistingChurch | null {
  const wanted = normalizeChurchName(churchName);
  const cityNeedle = cityName.toLowerCase();

  // A church already linked to a Philmass page (in the database or earlier in
  // this run) must not be re-linked by a fuzzy name match: that would
  // overwrite the other page's id and schedules.
  const available = existingChurches.filter((c) => !c.philmassId && !claimedChurchIds.has(c.id));
  const inCity = available.filter((c) => (c.address ? c.address.toLowerCase().includes(cityNeedle) : false));

  const isNameMatch = (candidate: ExistingChurch): boolean => {
    const existingNorm = normalizeChurchName(candidate.name);
    // Guard against overly generic names ("chapel", "holy", etc.) by requiring
    // the shorter name to be at least 8 chars after normalization.
    if (Math.min(wanted.length, existingNorm.length) < 8) return false;
    return wanted.includes(existingNorm) || existingNorm.includes(wanted);
  };

  return inCity.find(isNameMatch) ?? available.find(isNameMatch) ?? null;
}

/** True when a database error reports a unique-constraint violation (Prisma code P2002 or matching message text). */
function isUniqueConstraintError(error: unknown): boolean {
  if (typeof error === 'object' && error !== null && 'code' in error) {
    if ((error as { code?: unknown }).code === 'P2002') return true;
  }
  return error instanceof Error && error.message.includes('Unique constraint');
}
|
||
|
|
|
||
|
|
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Parse `process.argv` into importer options.
 *
 * Exits the process with status 0 after printing help, and with status 1
 * when a value-taking flag has no value or when neither `--all` nor
 * `--province` is given. Unknown arguments are ignored with a warning.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
  };

  // Read the value following a flag; a missing value or another flag in its
  // place is a usage error rather than a silently undefined option.
  const valueFor = (flag: string, index: number): string => {
    const value = argv[index];
    if (value === undefined || value.startsWith('-')) {
      console.error(`Error: ${flag} requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    switch (flag) {
      case '--all':
        result.all = true;
        break;
      case '--province':
        result.province = valueFor(flag, ++i);
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from':
        result.resumeFrom = valueFor(flag, ++i);
        break;
      case '--job-id':
        result.jobId = valueFor(flag, ++i);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-philmass.ts [options]

Options:
--all Import from all provinces
--province <name> Import from a single province (e.g. "Metro-Manila")
--dry-run No database writes, just report what would happen
--resume-from <province> Skip provinces until reaching this one
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-philmass.ts --province Metro-Manila --dry-run
npx tsx scripts/import-philmass.ts --all
npx tsx scripts/import-philmass.ts --all --resume-from Cebu
`);
        process.exit(0);
        break;
      default:
        // Still ignored, but a typo such as "--dryrun" is now visible.
        console.warn(`Warning: ignoring unknown argument "${flag}"`);
    }
  }

  if (!result.all && !result.province) {
    console.error('Error: specify --all or --province <name>');
    process.exit(1);
  }

  return result;
}
|
||
|
|
|
||
|
|
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Format a duration as "1h 2m 3s", "2m 3s" or "3s", dropping leading zero units.
 *
 * @param ms Duration in milliseconds; negative or non-finite values are shown as "0s".
 * @returns The formatted duration, truncated to whole seconds.
 */
function formatDuration(ms: number): string {
  const safeMs = Number.isFinite(ms) ? Math.max(0, ms) : 0;
  const totalSeconds = Math.floor(safeMs / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const hours = Math.floor(totalMinutes / 60);
  const seconds = totalSeconds % 60;

  if (hours > 0) {
    return `${hours}h ${totalMinutes % 60}m ${seconds}s`;
  }
  if (totalMinutes > 0) {
    return `${totalMinutes}m ${seconds}s`;
  }
  return `${totalSeconds}s`;
}
|
||
|
|
|
||
|
|
// ─── Main ────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Run the importer: discover church pages (phase 1), process each one
 * (phase 2), print a summary and, when `--job-id` was given, record the
 * outcome on the background job row.
 *
 * Exits the process with status 1 when the requested or resume province is
 * not among the discovered provinces.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const startTime = Date.now();
  const banner = '='.repeat(70);

  console.log('\n' + banner);
  console.log('PHILMASS.COM IMPORTER');
  console.log(banner);
  // --province narrows the run even when --all is also passed, so report that.
  console.log(`Mode: ${args.province ? `Single province: ${args.province}` : 'All provinces'}`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(banner + '\n');

  // Update background job status if provided (best effort)
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (error) {
      // Job might not exist yet; the import itself must still run.
      console.warn(`Could not mark job ${args.jobId} as running: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Load existing Philippine churches for dedup
  const existingChurches = await loadExistingPhilippineChurches();

  // ─── Phase 1: Discover all church URLs ───────────────────────────────────

  console.log('=== Phase 1: Discovering church URLs ===\n');

  const allProvinces = await fetchProvinceUrls();
  console.log(`Found ${allProvinces.length} provinces\n`);

  // Filter to requested provinces
  let provincesToProcess: ProvinceInfo[];
  if (args.province) {
    const found = allProvinces.find((p) => p.name === args.province);
    if (!found) {
      console.error(`Province "${args.province}" not found. Available: ${allProvinces.map((p) => p.name).join(', ')}`);
      process.exit(1);
    }
    provincesToProcess = [found];
  } else {
    provincesToProcess = allProvinces;
  }

  // Handle --resume-from
  if (args.resumeFrom) {
    const resumeIndex = provincesToProcess.findIndex((p) => p.name === args.resumeFrom);
    if (resumeIndex === -1) {
      console.error(`Resume province "${args.resumeFrom}" not found.`);
      process.exit(1);
    }
    console.log(`Resuming from province "${args.resumeFrom}" (skipping ${resumeIndex} provinces)\n`);
    provincesToProcess = provincesToProcess.slice(resumeIndex);
  }

  // Collect all unique church URLs across all provinces/cities
  const allChurchUrls = new Map<string, ChurchUrl>(); // keyed by URL to deduplicate

  const stats: ImportStats = {
    provincesProcessed: 0,
    citiesProcessed: 0,
    churchUrlsDiscovered: 0,
    churchesProcessed: 0,
    churchesMatched: 0,
    churchesUnmatched: 0,
    churchesSkipped: 0,
    schedulesUpdated: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  for (const [index, province] of provincesToProcess.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    console.log(`[${index + 1}/${provincesToProcess.length}] Province: ${province.name} [${elapsed} elapsed]`);

    try {
      // Get city listing URLs from province page
      const cityUrls = await fetchCityListingUrls(province.url, province.name);
      console.log(` Found ${cityUrls.length} city listing pages`);

      for (const cityUrl of cityUrls) {
        const churchUrls = await fetchChurchUrlsFromCityPage(cityUrl, province.name);
        stats.citiesProcessed++;

        for (const church of churchUrls) {
          // The same church can be linked from several listings; keep the first.
          if (!allChurchUrls.has(church.url)) {
            allChurchUrls.set(church.url, church);
          }
        }
      }

      stats.provincesProcessed++;
      console.log(` Total unique churches so far: ${allChurchUrls.size}`);
    } catch (error) {
      // One failing province must not stop discovery of the others.
      stats.errors++;
      console.error(` ERROR discovering ${province.name}: ${error instanceof Error ? error.message : error}`);
    }
  }

  stats.churchUrlsDiscovered = allChurchUrls.size;
  console.log(`\nDiscovery complete: ${allChurchUrls.size} unique church URLs across ${stats.citiesProcessed} city pages\n`);

  // ─── Phase 2: Process each church ─────────────────────────────────────────

  console.log('=== Phase 2: Processing churches ===\n');

  const churchList = [...allChurchUrls.values()];
  const unmatchedLog: string[] = [];

  for (const [index, church] of churchList.entries()) {
    // Progress line for the first church and every 50th after that.
    if (index === 0 || (index + 1) % 50 === 0) {
      const elapsed = formatDuration(Date.now() - startTime);
      console.log(`[${index + 1}/${churchList.length}] Processing churches... [${elapsed} elapsed]`);
    }

    try {
      await processChurch(church, existingChurches, unmatchedLog, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing ${church.slug}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + banner);
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log(banner);
  console.log(`Provinces processed: ${stats.provincesProcessed}`);
  console.log(`Cities processed: ${stats.citiesProcessed}`);
  console.log(`Church URLs discovered: ${stats.churchUrlsDiscovered}`);
  console.log(`Churches processed: ${stats.churchesProcessed}`);
  console.log(` Matched (updated): ${stats.churchesMatched}`);
  console.log(` Unmatched (skipped): ${stats.churchesUnmatched}`);
  console.log(` Skipped (other): ${stats.churchesSkipped}`);
  console.log(`Schedules updated: ${stats.schedulesUpdated}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log(banner);

  // Log unmatched churches for manual review
  if (unmatchedLog.length > 0) {
    const rule = '-'.repeat(70);
    console.log(`\nUnmatched churches (${unmatchedLog.length}):`);
    console.log(rule);
    for (const line of unmatchedLog) {
      console.log(` ${line}`);
    }
    console.log(rule);
  }

  console.log('');

  // Update background job (best effort)
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      // The import already finished; only the bookkeeping failed.
      console.warn(`Could not record result for job ${args.jobId}: ${error instanceof Error ? error.message : error}`);
    }
  }
}
|
||
|
|
|
||
|
|
// Entry point. A fatal error sets the exit code instead of calling
// process.exit(), which would end the process before the cleanup in
// `.finally` has a chance to run.
main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    // Close each resource independently so a failure in one does not skip the
    // other or surface as an unhandled rejection.
    try {
      await prisma.$disconnect();
    } catch (error) {
      console.error('Error disconnecting Prisma:', error);
    }
    try {
      await pool.end();
    } catch (error) {
      console.error('Error closing connection pool:', error);
    }
  });
|