#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from discovermass.com (USA)
 *
 * discovermass.com is a US Catholic church directory with 20,284 churches.
 * Data includes name, address, phone, website, coordinates, mass times,
 * confessions, and adoration schedules.
 *
 * robots.txt specifies Crawl-delay: 10 — this importer follows that rule.
 *
 * Usage:
 *   npx tsx scripts/import-discovermass.ts --all
 *   npx tsx scripts/import-discovermass.ts --all --dry-run
 *   npx tsx scripts/import-discovermass.ts --all --resume-from 5000
 *   npx tsx scripts/import-discovermass.ts --all --job-id {uuid}
 */

import dotenv from 'dotenv';
import path from 'path';

// Environment files are loaded before DATABASE_URL is read below: `.env.local` first, then `.env`.
// NOTE(review): presumably `.env.local` is meant to take precedence — confirm against dotenv's override rules.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

// Falls back to a local Postgres URL when DATABASE_URL is unset or empty.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// The ":<password>@" portion of the URL is replaced with ":***@" before it is logged.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);

// One shared pg pool backs the Prisma client through the pg driver adapter.
const pool = new Pool({
  connectionString: dbUrl,
  // URLs containing "neon" get an ssl option with rejectUnauthorized disabled; all others pass no ssl option.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';

// ─── Constants ───────────────────────────────────────────────────────────────

/** Origin of the scraped site; used to build the sitemap URLs. */
const SITE_BASE = 'https://discovermass.com';
/** Number of `wp-sitemap-posts-item-N.xml` files requested (N runs from 1 to this value). */
const SITEMAP_COUNT = 11;
/** User-Agent header sent with every HTTP request made by this script. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
/** Pause, in milliseconds, between consecutive church page requests. */
const REQUEST_DELAY_MS = 10_000; // Crawl-delay: 10 from robots.txt

// ─── Types ───────────────────────────────────────────────────────────────────

/** Church metadata extracted from one listing page. */
interface ParsedChurch {
  name: string;
  address: string | null;
  city: string | null;
  state: string | null;
  zip: string | null;
  phone: string | null;
  website: string | null;
  // NOTE(review): the update path compares lat against 0, so 0 looks like the "unknown" sentinel — confirm.
  lat: number;
  lng: number;
}

/** One mass time on one day of the week. */
interface ParsedMass {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  time: string; // HH:MM 24-hour
  language: string;
  notes?: string;
}

/** One confession window on one day of the week. */
interface ParsedConf {
  dayOfWeek: number;
  startTime: string; // HH:MM 24-hour
  endTime: string; // HH:MM 24-hour
  notes?: string;
}

/** One adoration window on one day of the week (same shape as ParsedConf). */
interface ParsedAdoration {
  dayOfWeek: number;
  startTime: string; // HH:MM 24-hour
  endTime: string; // HH:MM 24-hour
  notes?: string;
}

/** Running counters for one import run; printed at the end and written to the background job row. */
interface ImportStats {
  total: number;
  created: number;
  updated: number;
  skipped: number;
  errors: number;
  massSchedulesCreated: number;
  confessionSchedulesCreated: number;
  adorationSchedulesCreated: number;
}

/** Parsed command-line flags. */
interface CLIArgs {
  all: boolean; // --all
  dryRun: boolean; // --dry-run
  resumeFrom?: number; // --resume-from N (index into the sitemap URL list)
  jobId?: string; // --job-id UUID (background job row to update)
  testParse?: boolean; // --test-parse
}

// ─── Day Mappings ─────────────────────────────────────────────────────────────

// Full day names used in mass schedule
  // labels — the full English day names that the mass-schedule parser looks up.
// The table has no prototype, so a scraped label such as "constructor" or
// "toString" resolves to undefined instead of an inherited Object member.
const FULL_DAY_NAMES: Record<string, number> = Object.assign(Object.create(null), {
  Sunday: 0,
  Monday: 1,
  Tuesday: 2,
  Wednesday: 3,
  Thursday: 4,
  Friday: 5,
  Saturday: 6,
});

// Abbreviated day prefixes used in confession/adoration serviceTime text.
// "Thr" and "Thu" both map to Thursday. Prototype-less for the same reason as above.
const ABBREV_DAY_NAMES: Record<string, number[]> = Object.assign(Object.create(null), {
  Sun: [0],
  Mon: [1],
  Tue: [2],
  Wed: [3],
  Thr: [4],
  Thu: [4],
  Fri: [5],
  Sat: [6],
  Weekdays: [1, 2, 3, 4, 5],
  Daily: [0, 1, 2, 3, 4, 5, 6],
});

// ─── Time Utilities ───────────────────────────────────────────────────────────

/**
 * Convert a 12-hour clock time to "HH:MM" 24-hour format.
 *
 * Accepts "5:00pm", "11:00am", "12:00pm", "12:00am", and also tolerates
 * surrounding whitespace, upper case, a space before the meridiem ("5:00 PM")
 * and omitted minutes ("5pm").
 *
 * @param timeStr Raw time text.
 * @returns The converted time, or the trimmed, lower-cased input when it is
 *   not a recognisable 12-hour time (including an hour above 12 or minutes
 *   above 59, which would otherwise produce an impossible clock value).
 */
function convertTo24h(timeStr: string): string {
  const cleaned = timeStr.trim().toLowerCase();
  const m = /^(\d{1,2})(?::(\d{2}))?\s*(am|pm)$/.exec(cleaned);
  if (!m) return cleaned;

  const hour12 = Number(m[1]);
  const mins = m[2] ?? '00';
  if (hour12 > 12 || Number(mins) > 59) return cleaned;

  // 12am is midnight (00) and 12pm is noon (12): fold 12 to 0, then add 12 for pm.
  const hours = (hour12 % 12) + (m[3] === 'pm' ? 12 : 0);
  return `${String(hours).padStart(2, '0')}:${mins}`;
}

/** Characters treated as the separator of a time range: hyphen, en dash, em dash. */
const RANGE_DASHES = '-\u2013\u2014';

/**
 * Locate the dash that separates the two halves of a time range.
 * A dash after the first colon wins; if there is none, the first dash anywhere
 * is used so ranges whose start has no minutes ("5pm-6:30pm") still split.
 * Returns -1 when the text contains no dash.
 */
function findRangeSeparator(text: string): number {
  const afterColon = text.indexOf(':') + 1;
  const isDash = (i: number): boolean => RANGE_DASHES.includes(text.charAt(i));
  for (let i = afterColon; i < text.length; i++) {
    if (isDash(i)) return i;
  }
  for (let i = 0; i < afterColon; i++) {
    if (isDash(i)) return i;
  }
  return -1;
}

/**
 * Parse "8:30am-9:00am" → ["08:30", "09:00"].
 *
 * A string without a separator is treated as a single time and returned as
 * both start and end. Each half goes through convertTo24h, so an
 * unrecognised half comes back as its trimmed, lower-cased text.
 */
function parseTimeRange(rangeStr: string): [string, string] {
  const sep = findRangeSeparator(rangeStr);
  if (sep === -1) {
    const single = convertTo24h(rangeStr);
    return [single, single];
  }
  return [convertTo24h(rangeStr.slice(0, sep)), convertTo24h(rangeStr.slice(sep + 1))];
}

/**
 * Expand abbreviated day prefix to array of dayOfWeek integers.
 * Unknown prefixes yield an empty array. The result is a fresh array, so
 * callers cannot mutate the shared lookup table.
 */
function expandDayAbbrev(prefix: string): number[] {
  return [...(ABBREV_DAY_NAMES[prefix] ?? [])];
}

// ─── Address Parsing ──────────────────────────────────────────────────────────

/**
 * Parse "14085 Peyton Drive, Chino Hills, CA 91709" into components.
 *
 * The input is split on the literal ", " separator. When there are fewer than
 * three parts, or the last part is not "ST 12345" / "ST 12345-6789", the whole
 * raw string is returned as `address` and city/state/zip are null.
 */
function parseAddress(raw: string): { address: string | null; city: string | null; state: string | null; zip: string | null } {
  const parts = raw.split(', ');
  if (parts.length < 3) return { address: raw, city: null, state: null, zip: null };

  // The final part must be a two-letter upper-case state followed by a 5-digit or ZIP+4 code.
  const last = parts[parts.length - 1].trim();
  const stateZipMatch = last.match(/^([A-Z]{2})\s+(\d{5}(?:-\d{4})?)$/);
  if (!stateZipMatch) return { address: raw, city: null, state: null, zip: null };

  return {
    // Everything before the city part is re-joined, so a street part that itself contains ", " is kept whole.
    address: parts.slice(0, parts.length - 2).join(', ').trim(),
    city: parts[parts.length - 2].trim(),
    state: stateZipMatch[1],
    zip: stateZipMatch[2],
  };
}

// ─── HTML Parsing ─────────────────────────────────────────────────────────────

/**
 * Parse church metadata from page HTML.
 * Returns null if the page doesn't look like a valid church listing.
 *
 * NOTE(review): this function is incomplete in the text under review. Every
 * tag-shaped fragment appears to have been stripped, which removed the name
 * pattern, the declarations of address/city/state/zip, the phone and website
 * patterns, any coordinate extraction and the return statement. What remains
 * shows only that the address is read from the element with id="theaddress"
 * and handed to parseAddress, with a fallback that scans for a street-address
 * pattern. Restore the body from version control; do not reconstruct it from
 * this remnant.
 */
function parseChurch(html: string): ParsedChurch | null {
  const nameMatch = html.match(/ element first (most reliable)
  const addrElemMatch = html.match(/id="theaddress"[^>]*>([^<]+)<\/h2>/);
  if (addrElemMatch) {
    const parsed = parseAddress(addrElemMatch[1].trim());
    address = parsed.address;
    city = parsed.city;
    state = parsed.state;
    zip = parsed.zip;
  } else {
    // Fallback: scan for street address pattern in text
    const addrMatch = html.match(/>\s*(\d+\s[^<\n,]{5,}),\s*([^<,\n]+),\s*([A-Z]{2})\s+(\d{5}(?:-\d{4})?)\s*([^<]+)<\/span>/);
    const phone = phoneMatch ? phoneMatch[1].trim() : null;
    const websiteMatch = html.match(/ block. */
// Extracts the mass schedule from the block that contains the "Mass Times" heading.
// Only the first 100,000 characters of the page are searched. Returns an empty
// array when that block is not found. Entries whose label is not a key of
// FULL_DAY_NAMES are skipped, times go through convertTo24h, and the language
// defaults to 'English' when no parenthesised language is matched.
//
// NOTE(review): the opening tags inside the block pattern, the list-item split
// pattern, the label/time/comment patterns and the split delimiter (now an
// empty string) all appear to have lost their tag text. As written they do not
// express the intended matching; restore them from version control.
function parseMassTimes(html: string): ParsedMass[] {
  const safeHtml = html.length > 100_000 ? html.slice(0, 100_000) : html;
  const massUlMatch = safeHtml.match(/
      \s*
    • \s*
      Mass Times<\/h5>[\s\S]*?<\/ul>/);
  if (!massUlMatch) return [];

  const massUl = massUlMatch[0];
  const results: ParsedMass[] = [];

  // Index 0 is the text before the first split point, so iteration starts at 1.
  const liParts = massUl.split(/]*>/);
  for (let i = 1; i < liParts.length; i++) {
    const li = liParts[i];
    const labelMatch = li.match(/([^<]+)<\/span>/);
    if (!labelMatch) continue;

    const dayLabel = labelMatch[1].trim();
    const dayOfWeek = FULL_DAY_NAMES[dayLabel];
    if (dayOfWeek === undefined) continue;

    // One part per service time within this day's entry; part 0 precedes the first delimiter.
    const serviceTimeParts = li.split("");
    for (let j = 1; j < serviceTimeParts.length; j++) {
      const st = serviceTimeParts[j];
      const timeMatch = st.match(/([^<]+)<\/span>/);
      if (!timeMatch) continue;

      const time = convertTo24h(timeMatch[1].trim());
      // The language is the text inside parentheses directly before a closing span.
      const langMatch = st.match(/\(([^)]+)\)<\/span>/);
      const language = langMatch ? langMatch[1].trim() : 'English';
      const commentMatch = st.match(/([^<]+)<\/span>/);
      const notes = commentMatch ? commentMatch[1].trim() : undefined;

      results.push({ dayOfWeek, time, language, notes });
    }
  }

  return results;
}

/**
 * Parse confessions and adoration from the "Other Services"
        block.
 *
 * Only the first 100,000 characters of the page are searched. Returns empty
 * arrays when the block is not found. Each service-time fragment must start
 * with "<Day>: <time or range>"; the day prefix is expanded with
 * expandDayAbbrev (one output item per day) and the time text is parsed with
 * parseTimeRange. Fragments with an unknown day prefix are skipped.
 *
 * NOTE(review): the opening tags inside the block pattern, the split delimiter
 * (now an empty string), the comment pattern and the two patterns that select
 * the confession and adoration list items all appear to have lost their tag
 * text. As written they do not express the intended matching; restore them
 * from version control.
 */
function parseOtherServices(html: string): { confessions: ParsedConf[]; adorations: ParsedAdoration[] } {
  const safeHtml = html.length > 100_000 ? html.slice(0, 100_000) : html;
  const otherUlMatch = safeHtml.match(/
          \s*
        • \s*
          Other Services<\/h5>[\s\S]*?<\/ul>/);
  if (!otherUlMatch) return { confessions: [], adorations: [] };

  const otherUl = otherUlMatch[0];

  // Shared by confessions and adoration: turns one list item's HTML into per-day time windows.
  function parseServiceItems(liHtml: string): Array<{ dayOfWeek: number; startTime: string; endTime: string; notes?: string }> {
    const items: Array<{ dayOfWeek: number; startTime: string; endTime: string; notes?: string }> = [];
    // Part 0 is whatever precedes the first delimiter, so iteration starts at 1.
    const stParts = liHtml.split("");
    for (let i = 1; i < stParts.length; i++) {
      const st = stParts[i];
      // Anchored at the start of the fragment: "<letters>: <text up to the closing span>".
      const dayTimeMatch = st.match(/^([A-Za-z]+):\s*([^<]+)<\/span>/);
      if (!dayTimeMatch) continue;

      const days = expandDayAbbrev(dayTimeMatch[1].trim());
      if (days.length === 0) continue;

      const [startTime, endTime] = parseTimeRange(dayTimeMatch[2]);
      const commentMatch = st.match(/([^<]+)<\/span>/);
      const notes = commentMatch ? commentMatch[1].trim() : undefined;

      // Multi-day prefixes such as "Weekdays" fan out into one item per day.
      for (const dayOfWeek of days) {
        items.push({ dayOfWeek, startTime, endTime, notes });
      }
    }
    return items;
  }

  const confessions: ParsedConf[] = [];
  const adorations: ParsedAdoration[] = [];

  const confMatch = otherUl.match(/
        • [\s\S]*?<\/li>/);
  if (confMatch) confessions.push(...parseServiceItems(confMatch[0]));

  const adorMatch = otherUl.match(/
        • [\s\S]*?<\/li>/);
  if (adorMatch) adorations.push(...parseServiceItems(adorMatch[0]));

  return { confessions, adorations };
}

// ─── HTTP Helpers ─────────────────────────────────────────────────────────────

/**
 * GET a URL with the importer's User-Agent and return the response body as text.
 * Throws an Error carrying the status code and URL when the response is not ok.
 *
 * NOTE(review): the return type reads `Promise` with no type argument, which
 * looks like extraction damage; the body returns `res.text()`.
 */
async function fetchHtml(url: string): Promise {
  const res = await fetch(url, { headers: { 'User-Agent': USER_AGENT } });
  if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`);
  return res.text();
}

/**
 * Resolve after `ms` milliseconds.
 *
 * NOTE(review): the return type reads `Promise` with no type argument.
 */
function sleep(ms: number): Promise {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// ─── Sitemap Enumeration ──────────────────────────────────────────────────────

/**
 * Collect every church page URL from the site's numbered sitemap files
 * (`wp-sitemap-posts-item-1.xml` through `-SITEMAP_COUNT.xml`), in sitemap order.
 * A failed sitemap request rejects the whole call.
 *
 * NOTE(review): the sitemap requests are issued back to back; there is no
 * sleep between them, unlike the church page requests.
 * NOTE(review): the URL pattern ends at a closing `loc` tag but shows no
 * opening one, and the return type reads `Promise` with no type argument;
 * both look like extraction damage.
 */
async function getAllChurchUrls(): Promise {
  const urls: string[] = [];
  for (let i = 1; i <= SITEMAP_COUNT; i++) {
    const sitemapUrl = `${SITE_BASE}/wp-sitemap-posts-item-${i}.xml`;
    console.log(`Fetching sitemap ${i}/${SITEMAP_COUNT}...`);
    const xml = await fetchHtml(sitemapUrl);
    // Capture group 1 is the church page URL.
    const matches = xml.matchAll(/(https:\/\/discovermass\.com\/church\/[^<]+)<\/loc>/g);
    for (const match of matches) {
      urls.push(match[1]);
    }
  }
  console.log(`Total church URLs: ${urls.length}`);
  return urls;
}

// ─── DB Helpers ───────────────────────────────────────────────────────────────

/**
 * Load every church with country 'US', selecting the columns listed below,
 * for use as the duplicate-matching candidate list.
 *
 * NOTE(review): the return type reads `Promise` with no type argument; the
 * result is cast to ExistingChurch[] rather than checked against it.
 */
async function loadExistingChurches(): Promise {
  console.log('Loading existing US churches from DB...');
  const churches = await prisma.church.findMany({
    where: { country: 'US' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });
  console.log(`Loaded ${churches.length} existing US churches`);
  return churches as ExistingChurch[];
}

// ─── Church Processing ────────────────────────────────────────────────────────

// Fetches one church page, parses it, and creates or updates the church and its
// schedules. Per-URL failures are logged and counted in `stats.errors` instead
// of being thrown to the caller.
async function
processChurch(
  url: string,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
): Promise<void> {
  // Import one church page.
  //
  // url              Church page URL taken from the sitemap.
  // existingChurches In-memory candidate list for duplicate matching; a newly
  //                  created church is appended so later pages can match it.
  // args             CLI flags; with dryRun set, nothing is written.
  // stats            Run counters, mutated in place.
  //
  // Never rejects: every failure is logged and counted in stats.errors.
  const slug = url.replace(`${SITE_BASE}/church/`, '').replace(/\/$/, '');
  stats.total++;

  try {
    const html = await fetchHtml(url);
    const parsed = parseChurch(html);
    if (!parsed) {
      console.log(` [skip] Could not parse: ${slug}`);
      stats.skipped++;
      return;
    }

    const masses = parseMassTimes(html);
    const { confessions, adorations } = parseOtherServices(html);

    if (args.dryRun) {
      console.log(` [dry-run] ${parsed.name} — ${masses.length} masses, ${confessions.length} confessions, ${adorations.length} adorations`);
      return;
    }

    const candidate = { name: parsed.name, lat: parsed.lat, lng: parsed.lng, discovermassId: slug };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    try {
      if (duplicate) {
        // Only fill gaps on the existing row; never overwrite data that is already present.
        const updateData: Record<string, unknown> = { discovermassId: slug, lastScrapedAt: new Date() };
        if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
        if (!duplicate.website && parsed.website) {
          updateData.website = parsed.website;
          updateData.hasWebsite = true;
        }
        if (parsed.lat !== 0 && duplicate.latitude === 0) {
          updateData.latitude = parsed.lat;
          updateData.longitude = parsed.lng;
        }

        await prisma.$transaction(async (tx) => {
          await tx.church.update({ where: { id: duplicate.id }, data: updateData });
          // A schedule type is replaced only when the page yielded entries for it,
          // so a page without that section does not wipe existing rows.
          if (masses.length > 0) {
            await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.massSchedule.createMany({ data: toMassRows(duplicate.id, masses) });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.confessionSchedule.createMany({ data: toTimedRows(duplicate.id, confessions) });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.adorationSchedule.createMany({ data: toTimedRows(duplicate.id, adorations) });
          }
        });

        duplicate.discovermassId = slug;
        stats.updated++;
      } else {
        // The church and its schedules are written atomically: a failed schedule
        // insert must not leave a schedule-less church behind.
        const church = await prisma.$transaction(async (tx) => {
          const created = await tx.church.create({
            data: {
              name: parsed.name,
              address: parsed.address,
              city: parsed.city,
              state: parsed.state,
              zip: parsed.zip,
              country: 'US',
              phone: parsed.phone,
              website: parsed.website,
              hasWebsite: !!parsed.website,
              latitude: parsed.lat,
              longitude: parsed.lng,
              discovermassId: slug,
              source: 'discovermass',
              lastScrapedAt: new Date(),
            },
          });
          if (masses.length > 0) {
            await tx.massSchedule.createMany({ data: toMassRows(created.id, masses) });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.createMany({ data: toTimedRows(created.id, confessions) });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.createMany({ data: toTimedRows(created.id, adorations) });
          }
          return created;
        });

        // Registered only after the commit, so the cache never holds a rolled-back church.
        existingChurches.push({
          id: church.id,
          name: parsed.name,
          latitude: parsed.lat,
          longitude: parsed.lng,
          osmId: null,
          baiduId: null,
          masstimesId: null,
          orarimesseId: null,
          massSchedulesPhId: null,
          philmassId: null,
          horariosMisasId: null,
          mszeInfoId: null,
          weekdayMassesId: null,
          messesInfoId: null,
          bohosluzbyId: null,
          miserendId: null,
          kerknetId: null,
          gottesdienstzeitenId: null,
          discovermassId: slug,
          source: 'discovermass',
          website: parsed.website,
          phone: parsed.phone,
          address: parsed.address,
          country: 'US',
        });
        stats.created++;
      }
    } catch (err) {
      // A unique-constraint violation counts as a skip, not an error.
      if (isUniqueConstraintError(err)) {
        stats.skipped++;
        return;
      }
      throw err;
    }

    stats.massSchedulesCreated += masses.length;
    stats.confessionSchedulesCreated += confessions.length;
    stats.adorationSchedulesCreated += adorations.length;

    console.log(
      ` [${duplicate ? 'update' : 'create'}] ${parsed.name} — ` +
      `${masses.length}M ${confessions.length}C ${adorations.length}A — ` +
      `${stats.total} total (${stats.created} new, ${stats.updated} upd, ${stats.errors} err)`
    );
  } catch (err) {
    stats.errors++;
    console.error(` [error] ${slug}: ${err instanceof Error ? err.message : err}`);
  }
}

/** Build mass-schedule rows for `createMany`; a missing note becomes null. */
function toMassRows<TId>(churchId: TId, masses: ParsedMass[]) {
  return masses.map((m) => ({
    churchId,
    dayOfWeek: m.dayOfWeek,
    time: m.time,
    language: m.language,
    notes: m.notes ?? null,
  }));
}

/** Build confession/adoration rows for `createMany`; a missing note becomes null. */
function toTimedRows<TId>(churchId: TId, items: Array<ParsedConf | ParsedAdoration>) {
  return items.map((s) => ({
    churchId,
    dayOfWeek: s.dayOfWeek,
    startTime: s.startTime,
    endTime: s.endTime,
    notes: s.notes ?? null,
  }));
}

/**
 * True when `err` is a unique-constraint violation: either it carries the
 * Prisma error code P2002 or its message mentions "Unique constraint".
 */
function isUniqueConstraintError(err: unknown): boolean {
  if (!(err instanceof Error)) return false;
  // `code` is not declared on Error, so it is read through a structural view.
  const code = (err as { code?: unknown }).code;
  return code === 'P2002' || err.message.includes('Unique constraint');
}

// ─── CLI Parsing ──────────────────────────────────────────────────────────────

/**
 * Parse process.argv into CLIArgs. Unrecognised arguments are ignored.
 *
 * @throws Error when `--resume-from` is not followed by a non-negative
 *   integer, or `--job-id` is not followed by a value.
 */
function parseCLIArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        const value = Number.parseInt(raw ?? '', 10);
        // NaN would print "[NaN/…]" progress lines and a negative value would
        // make slice() count from the end of the URL list.
        if (!Number.isInteger(value) || value < 0) {
          throw new Error(`--resume-from expects a non-negative integer, got "${raw ?? ''}"`);
        }
        result.resumeFrom = value;
        break;
      }
      case '--job-id': {
        const raw = args[++i];
        if (raw === undefined) throw new Error('--job-id expects a value');
        result.jobId = raw;
        break;
      }
      case '--test-parse':
        result.testParse = true;
        break;
    }
  }

  return result;
}

// ─── Test Parse ───────────────────────────────────────────────────────────────

/**
 * Fetch one known church page, print what each parser extracts from it, then
 * close the pool and exit with status 0. Writes nothing to the database.
 */
async function runTestParse(): Promise<void> {
  const testUrl = 'https://discovermass.com/church/st-paul-the-apostle-chino-hills/';
  console.log(`Fetching test page: ${testUrl}`);
  const html = await fetchHtml(testUrl);

  const church = parseChurch(html);
  const masses = parseMassTimes(html);
  const { confessions, adorations } = parseOtherServices(html);

  console.log('Church:', JSON.stringify(church, null, 2));
  console.log(`Masses (${masses.length}):`, JSON.stringify(masses, null, 2));
  console.log(`Confessions (${confessions.length}):`, JSON.stringify(confessions, null, 2));
  console.log(`Adorations (${adorations.length}):`, JSON.stringify(adorations, null, 2));

  await pool.end();
  process.exit(0);
}

// ─── Main ─────────────────────────────────────────────────────────────────────

/**
 * Entry point: enumerate church URLs, import them one at a time with the
 * crawl delay between requests, print a summary, and record the outcome on
 * the background job row when `--job-id` was given.
 *
 * The job is recorded as 'failed' when the run aborted with an error or when
 * more than 10% of processed churches errored; otherwise 'completed'.
 */
async function main(): Promise<void> {
  const args = parseCLIArgs();

  if (args.testParse) {
    await runTestParse();
    return;
  }

  if (!args.all) {
    console.error('Usage: npx tsx scripts/import-discovermass.ts --all [--dry-run] [--resume-from N] [--job-id UUID]');
    process.exit(1);
  }

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (err) {
      // Best effort: the job row might not exist yet. Report it, but keep importing.
      console.warn(`Could not mark job ${args.jobId} as running: ${err instanceof Error ? err.message : String(err)}`);
    }
  }

  const stats: ImportStats = {
    total: 0,
    created: 0,
    updated: 0,
    skipped: 0,
    errors: 0,
    massSchedulesCreated: 0,
    confessionSchedulesCreated: 0,
    adorationSchedulesCreated: 0,
  };
  // Set when the run dies before finishing, so the job is not reported as completed.
  let aborted = false;

  try {
    const urls = await getAllChurchUrls();
    const existingChurches = await loadExistingChurches();

    const startIdx = args.resumeFrom ?? 0;
    const churchUrls = urls.slice(startIdx);
    console.log(`\nProcessing ${churchUrls.length} churches (starting from index ${startIdx})...\n`);

    for (const [i, url] of churchUrls.entries()) {
      // Wait before every page request, including the first: the sitemap
      // requests above have only just hit the same host.
      await sleep(REQUEST_DELAY_MS);

      const overallIdx = startIdx + i;
      console.log(`[${overallIdx + 1}/${urls.length}] ${url}`);
      await processChurch(url, existingChurches, args, stats);
    }
  } catch (err) {
    aborted = true;
    throw err;
  } finally {
    console.log('\n─── Import Complete ───────────────────────────────────────');
    console.log(`Total processed: ${stats.total}`);
    console.log(`Created: ${stats.created}`);
    console.log(`Updated: ${stats.updated}`);
    console.log(`Skipped: ${stats.skipped}`);
    console.log(`Errors: ${stats.errors}`);
    console.log(`Mass schedules: ${stats.massSchedulesCreated}`);
    console.log(`Confession sched: ${stats.confessionSchedulesCreated}`);
    console.log(`Adoration sched: ${stats.adorationSchedulesCreated}`);

    if (args.jobId) {
      const status = aborted || stats.errors > stats.total * 0.1 ? 'failed' : 'completed';
      try {
        await prisma.backgroundJob.update({
          where: { id: args.jobId },
          data: {
            status,
            completedAt: new Date(),
            processed: stats.total,
            succeeded: stats.created + stats.updated,
            failed: stats.errors,
            itemsFound: stats.massSchedulesCreated,
          },
        });
      } catch (err) {
        // Best effort: failing to record the outcome must not mask the import result.
        console.warn(`Could not record final status for job ${args.jobId}: ${err instanceof Error ? err.message : String(err)}`);
      }
    }

    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});