diff --git a/scripts/import-discovermass.ts b/scripts/import-discovermass.ts new file mode 100644 index 0000000..947a327 --- /dev/null +++ b/scripts/import-discovermass.ts @@ -0,0 +1,616 @@ +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from discovermass.com (USA) + * + * discovermass.com is a US Catholic church directory with 20,284 churches. + * Data includes name, address, phone, website, coordinates, mass times, + * confessions, and adoration schedules. + * + * robots.txt specifies Crawl-delay: 10 — this importer follows that rule. + * + * Usage: + * npx tsx scripts/import-discovermass.ts --all + * npx tsx scripts/import-discovermass.ts --all --dry-run + * npx tsx scripts/import-discovermass.ts --all --resume-from 5000 + * npx tsx scripts/import-discovermass.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const SITE_BASE = 'https://discovermass.com'; +const SITEMAP_COUNT = 11; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const REQUEST_DELAY_MS = 10_000; // Crawl-delay: 10 from robots.txt + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface ParsedChurch { + name: string; + address: string | null; + city: string | null; + state: string | null; + zip: string | null; + phone: string | null; + website: string | null; + lat: number; + lng: number; +} + +interface ParsedMass { + dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat + time: string; // HH:MM 24-hour + language: string; + notes?: string; +} + +interface ParsedConf { + dayOfWeek: number; + startTime: string; // HH:MM 24-hour + endTime: string; // HH:MM 24-hour + notes?: string; +} + +interface ParsedAdoration { + dayOfWeek: number; + startTime: string; // HH:MM 24-hour + endTime: string; // HH:MM 24-hour + notes?: string; +} + +interface ImportStats { + total: number; + created: number; + updated: number; + skipped: number; + errors: number; + massSchedulesCreated: number; + confessionSchedulesCreated: number; + adorationSchedulesCreated: number; +} + +interface CLIArgs { + all: boolean; + dryRun: boolean; + resumeFrom?: number; + jobId?: string; + testParse?: boolean; +} + +// ─── Day Mappings ───────────────────────────────────────────────────────────── + +// Full day names used in mass schedule
  • labels +const FULL_DAY_NAMES: Record = { + Sunday: 0, Monday: 1, Tuesday: 2, Wednesday: 3, + Thursday: 4, Friday: 5, Saturday: 6, +}; + +// Abbreviated day prefixes used in confession/adoration serviceTime text +const ABBREV_DAY_NAMES: Record = { + Sun: [0], Mon: [1], Tue: [2], Wed: [3], + Thr: [4], Thu: [4], Fri: [5], Sat: [6], + Weekdays: [1, 2, 3, 4, 5], + Daily: [0, 1, 2, 3, 4, 5, 6], +}; + +// ─── Time Utilities ─────────────────────────────────────────────────────────── + +/** + * Convert "5:00pm", "11:00am", "12:00pm", "12:00am" to "HH:MM" 24-hour format. + * Returns the original string unchanged if it doesn't match expected format. + */ +function convertTo24h(timeStr: string): string { + const cleaned = timeStr.trim().toLowerCase(); + const m = cleaned.match(/^(\d{1,2}):(\d{2})(am|pm)$/); + if (!m) return cleaned; + let hours = parseInt(m[1], 10); + const mins = m[2]; + const meridiem = m[3]; + if (meridiem === 'pm' && hours !== 12) hours += 12; + if (meridiem === 'am' && hours === 12) hours = 0; + return `${String(hours).padStart(2, '0')}:${mins}`; +} + +/** + * Parse "8:30am-9:00am" → ["08:30", "09:00"]. + */ +function parseTimeRange(rangeStr: string): [string, string] { + const hyphenIdx = rangeStr.indexOf('-', rangeStr.indexOf(':') + 1); + if (hyphenIdx === -1) { + const t = convertTo24h(rangeStr.trim()); + return [t, t]; + } + const start = convertTo24h(rangeStr.slice(0, hyphenIdx).trim()); + const end = convertTo24h(rangeStr.slice(hyphenIdx + 1).trim()); + return [start, end]; +} + +/** + * Expand abbreviated day prefix to array of dayOfWeek integers. + */ +function expandDayAbbrev(prefix: string): number[] { + return ABBREV_DAY_NAMES[prefix] ?? []; +} + +// ─── Address Parsing ────────────────────────────────────────────────────────── + +/** + * Parse "14085 Peyton Drive, Chino Hills, CA 91709" into components. 
+ */ +function parseAddress(raw: string): { address: string | null; city: string | null; state: string | null; zip: string | null } { + const parts = raw.split(', '); + if (parts.length < 3) return { address: raw, city: null, state: null, zip: null }; + const last = parts[parts.length - 1].trim(); + const stateZipMatch = last.match(/^([A-Z]{2})\s+(\d{5}(?:-\d{4})?)$/); + if (!stateZipMatch) return { address: raw, city: null, state: null, zip: null }; + return { + address: parts.slice(0, parts.length - 2).join(', ').trim(), + city: parts[parts.length - 2].trim(), + state: stateZipMatch[1], + zip: stateZipMatch[2], + }; +} + +// ─── HTML Parsing ───────────────────────────────────────────────────────────── + +/** + * Parse church metadata from page HTML. + * Returns null if the page doesn't look like a valid church listing. + */ +function parseChurch(html: string): ParsedChurch | null { + const nameMatch = html.match(/ element first (most reliable) + const addrElemMatch = html.match(/id="theaddress"[^>]*>([^<]+)<\/h2>/); + if (addrElemMatch) { + const parsed = parseAddress(addrElemMatch[1].trim()); + address = parsed.address; + city = parsed.city; + state = parsed.state; + zip = parsed.zip; + } else { + // Fallback: scan for street address pattern in text + const addrMatch = html.match(/>\s*(\d+\s[^<\n,]{5,}),\s*([^<,\n]+),\s*([A-Z]{2})\s+(\d{5}(?:-\d{4})?)\s*([^<]+)<\/span>/); + const phone = phoneMatch ? phoneMatch[1].trim() : null; + + const websiteMatch = html.match(/ block. + */ +function parseMassTimes(html: string): ParsedMass[] { + const safeHtml = html.length > 100_000 ? html.slice(0, 100_000) : html; + const massUlMatch = safeHtml.match(/
      \s*
    • \s*
      Mass Times<\/h5>[\s\S]*?<\/ul>/); + if (!massUlMatch) return []; + const massUl = massUlMatch[0]; + + const results: ParsedMass[] = []; + const liParts = massUl.split(/]*>/); + for (let i = 1; i < liParts.length; i++) { + const li = liParts[i]; + const labelMatch = li.match(/([^<]+)<\/span>/); + if (!labelMatch) continue; + const dayLabel = labelMatch[1].trim(); + const dayOfWeek = FULL_DAY_NAMES[dayLabel]; + if (dayOfWeek === undefined) continue; + + const serviceTimeParts = li.split(""); + for (let j = 1; j < serviceTimeParts.length; j++) { + const st = serviceTimeParts[j]; + const timeMatch = st.match(/([^<]+)<\/span>/); + if (!timeMatch) continue; + const time = convertTo24h(timeMatch[1].trim()); + const langMatch = st.match(/\(([^)]+)\)<\/span>/); + const language = langMatch ? langMatch[1].trim() : 'English'; + const commentMatch = st.match(/([^<]+)<\/span>/); + const notes = commentMatch ? commentMatch[1].trim() : undefined; + results.push({ dayOfWeek, time, language, notes }); + } + } + return results; +} + +/** + * Parse confessions and adoration from the "Other Services"
        block. + */ +function parseOtherServices(html: string): { confessions: ParsedConf[]; adorations: ParsedAdoration[] } { + const safeHtml = html.length > 100_000 ? html.slice(0, 100_000) : html; + const otherUlMatch = safeHtml.match(/
          \s*
        • \s*
          Other Services<\/h5>[\s\S]*?<\/ul>/); + if (!otherUlMatch) return { confessions: [], adorations: [] }; + const otherUl = otherUlMatch[0]; + + function parseServiceItems(liHtml: string): Array<{ dayOfWeek: number; startTime: string; endTime: string; notes?: string }> { + const items: Array<{ dayOfWeek: number; startTime: string; endTime: string; notes?: string }> = []; + const stParts = liHtml.split(""); + for (let i = 1; i < stParts.length; i++) { + const st = stParts[i]; + const dayTimeMatch = st.match(/^([A-Za-z]+):\s*([^<]+)<\/span>/); + if (!dayTimeMatch) continue; + const days = expandDayAbbrev(dayTimeMatch[1].trim()); + if (days.length === 0) continue; + const [startTime, endTime] = parseTimeRange(dayTimeMatch[2]); + const commentMatch = st.match(/([^<]+)<\/span>/); + const notes = commentMatch ? commentMatch[1].trim() : undefined; + for (const dayOfWeek of days) { + items.push({ dayOfWeek, startTime, endTime, notes }); + } + } + return items; + } + + const confessions: ParsedConf[] = []; + const adorations: ParsedAdoration[] = []; + const confMatch = otherUl.match(/
        • [\s\S]*?<\/li>/); + if (confMatch) confessions.push(...parseServiceItems(confMatch[0])); + const adorMatch = otherUl.match(/
        • [\s\S]*?<\/li>/); + if (adorMatch) adorations.push(...parseServiceItems(adorMatch[0])); + return { confessions, adorations }; +} + +// ─── HTTP Helpers ───────────────────────────────────────────────────────────── + +async function fetchHtml(url: string): Promise { + const res = await fetch(url, { headers: { 'User-Agent': USER_AGENT } }); + if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`); + return res.text(); +} + +function sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +// ─── Sitemap Enumeration ────────────────────────────────────────────────────── + +async function getAllChurchUrls(): Promise { + const urls: string[] = []; + for (let i = 1; i <= SITEMAP_COUNT; i++) { + const sitemapUrl = `${SITE_BASE}/wp-sitemap-posts-item-${i}.xml`; + console.log(`Fetching sitemap ${i}/${SITEMAP_COUNT}...`); + const xml = await fetchHtml(sitemapUrl); + const matches = xml.matchAll(/(https:\/\/discovermass\.com\/church\/[^<]+)<\/loc>/g); + for (const match of matches) { + urls.push(match[1]); + } + } + console.log(`Total church URLs: ${urls.length}`); + return urls; +} + +// ─── DB Helpers ─────────────────────────────────────────────────────────────── + +async function loadExistingChurches(): Promise { + console.log('Loading existing US churches from DB...'); + const churches = await prisma.church.findMany({ + where: { country: 'US' }, + select: { + id: true, name: true, latitude: true, longitude: true, + osmId: true, baiduId: true, masstimesId: true, + orarimesseId: true, massSchedulesPhId: true, philmassId: true, + horariosMisasId: true, mszeInfoId: true, weekdayMassesId: true, + messesInfoId: true, bohosluzbyId: true, miserendId: true, + kerknetId: true, gottesdienstzeitenId: true, discovermassId: true, + source: true, website: true, phone: true, address: true, country: true, + }, + }); + console.log(`Loaded ${churches.length} existing US churches`); + return churches as ExistingChurch[]; +} + +// ─── 
// Church Processing ────────────────────────────────────────────────────────

/**
 * Report whether `err` is a unique-constraint violation.
 *
 * Checks the Prisma error code `P2002` first, then falls back to matching the
 * message text.
 */
function isUniqueConstraintError(err: unknown): boolean {
  if (typeof err !== 'object' || err === null) return false;
  if ((err as { code?: unknown }).code === 'P2002') return true;
  return err instanceof Error && err.message.includes('Unique constraint');
}

/**
 * Fetch one church page, parse it, and create or update the matching DB rows.
 *
 * - A page that cannot be parsed is counted in `stats.skipped`.
 * - In dry-run mode the parsed counts are logged and nothing is written.
 * - When `findDuplicateChurch` returns a match, missing phone/website/coordinates
 *   are filled in and any schedule type for which the page yielded rows is
 *   replaced, all inside one transaction.
 * - Otherwise a new church and its schedules are created inside one
 *   transaction, so a failure cannot leave a church without its schedules.
 * - A unique-constraint violation is counted as skipped.
 *
 * This function does not throw: any other failure is logged and counted in
 * `stats.errors`.
 *
 * @param url Church page URL taken from the sitemap.
 * @param existingChurches In-memory list used for duplicate matching; new and
 *   updated churches are reflected here so later pages can match them.
 * @param args Parsed CLI options (only `dryRun` is read here).
 * @param stats Running counters, mutated in place.
 */
async function processChurch(
  url: string,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
): Promise<void> {
  const slug = url.replace('https://discovermass.com/church/', '').replace(/\/$/, '');
  stats.total++;

  try {
    const html = await fetchHtml(url);
    const parsed = parseChurch(html);
    if (!parsed) {
      console.log(`  [skip] Could not parse: ${slug}`);
      stats.skipped++;
      return;
    }

    const masses = parseMassTimes(html);
    const { confessions, adorations } = parseOtherServices(html);

    if (args.dryRun) {
      console.log(`  [dry-run] ${parsed.name} — ${masses.length} masses, ${confessions.length} confessions, ${adorations.length} adorations`);
      return;
    }

    // Row builders shared by the update and create paths.
    const massRows = <T>(churchId: T) =>
      masses.map(m => ({ churchId, dayOfWeek: m.dayOfWeek, time: m.time, language: m.language, notes: m.notes ?? null }));
    const confessionRows = <T>(churchId: T) =>
      confessions.map(c => ({ churchId, dayOfWeek: c.dayOfWeek, startTime: c.startTime, endTime: c.endTime, notes: c.notes ?? null }));
    const adorationRows = <T>(churchId: T) =>
      adorations.map(a => ({ churchId, dayOfWeek: a.dayOfWeek, startTime: a.startTime, endTime: a.endTime, notes: a.notes ?? null }));

    const candidate = { name: parsed.name, lat: parsed.lat, lng: parsed.lng, discovermassId: slug };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (duplicate) {
      // Only fill fields the existing record lacks; never overwrite known data.
      const fillsCoordinates = parsed.lat !== 0 && duplicate.latitude === 0;
      const updateData = {
        discovermassId: slug,
        lastScrapedAt: new Date(),
        ...(!duplicate.phone && parsed.phone ? { phone: parsed.phone } : {}),
        ...(!duplicate.website && parsed.website ? { website: parsed.website, hasWebsite: true } : {}),
        ...(fillsCoordinates ? { latitude: parsed.lat, longitude: parsed.lng } : {}),
      };

      try {
        await prisma.$transaction(async (tx) => {
          await tx.church.update({ where: { id: duplicate.id }, data: updateData });
          // A schedule type is replaced only when the page yielded rows for it,
          // so an empty parse never wipes existing data.
          if (masses.length > 0) {
            await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.massSchedule.createMany({ data: massRows(duplicate.id) });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.confessionSchedule.createMany({ data: confessionRows(duplicate.id) });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.adorationSchedule.createMany({ data: adorationRows(duplicate.id) });
          }
        });
        duplicate.discovermassId = slug;
        stats.updated++;
      } catch (err) {
        if (isUniqueConstraintError(err)) {
          stats.skipped++;
          return;
        }
        throw err;
      }
    } else {
      try {
        // Church and schedules are written atomically; the in-memory cache is
        // updated only after the transaction has committed.
        const church = await prisma.$transaction(async (tx) => {
          const created = await tx.church.create({
            data: {
              name: parsed.name,
              address: parsed.address,
              city: parsed.city,
              state: parsed.state,
              zip: parsed.zip,
              country: 'US',
              phone: parsed.phone,
              website: parsed.website,
              hasWebsite: !!parsed.website,
              latitude: parsed.lat,
              longitude: parsed.lng,
              discovermassId: slug,
              source: 'discovermass',
              lastScrapedAt: new Date(),
            },
          });
          if (masses.length > 0) {
            await tx.massSchedule.createMany({ data: massRows(created.id) });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.createMany({ data: confessionRows(created.id) });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.createMany({ data: adorationRows(created.id) });
          }
          return created;
        });

        existingChurches.push({
          id: church.id, name: parsed.name, latitude: parsed.lat, longitude: parsed.lng,
          osmId: null, baiduId: null, masstimesId: null, orarimesseId: null,
          massSchedulesPhId: null, philmassId: null, horariosMisasId: null,
          mszeInfoId: null, weekdayMassesId: null, messesInfoId: null,
          bohosluzbyId: null, miserendId: null, kerknetId: null,
          gottesdienstzeitenId: null, discovermassId: slug,
          source: 'discovermass', website: parsed.website, phone: parsed.phone,
          address: parsed.address, country: 'US',
        });
        stats.created++;
      } catch (err) {
        if (isUniqueConstraintError(err)) {
          stats.skipped++;
          return;
        }
        throw err;
      }
    }

    stats.massSchedulesCreated += masses.length;
    stats.confessionSchedulesCreated += confessions.length;
    stats.adorationSchedulesCreated += adorations.length;

    console.log(
      `  [${duplicate ? 'update' : 'create'}] ${parsed.name} — ` +
      `${masses.length}M ${confessions.length}C ${adorations.length}A — ` +
      `${stats.total} total (${stats.created} new, ${stats.updated} upd, ${stats.errors} err)`
    );
  } catch (err) {
    stats.errors++;
    console.error(`  [error] ${slug}: ${err instanceof Error ? err.message : err}`);
  }
}

// ─── CLI Parsing ──────────────────────────────────────────────────────────────

/**
 * Parse `process.argv` into a {@link CLIArgs} object.
 *
 * Recognised flags: `--all`, `--dry-run`, `--resume-from N`, `--job-id UUID`,
 * `--test-parse`. Unrecognised arguments are ignored with a warning, so a
 * mistyped flag (e.g. `--dryrun`) is visible before a real import starts.
 *
 * @throws Error when `--resume-from` or `--job-id` has no value, or when the
 *   `--resume-from` value is not a non-negative integer.
 */
function parseCLIArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Read the value that must follow a flag, rejecting a missing value or another flag.
  const valueFor = (flag: string, index: number): string => {
    const value = argv[index];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    switch (flag) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = valueFor(flag, ++i);
        const index = Number(raw);
        // NaN or a negative index would silently change which slice of URLs runs.
        if (!Number.isInteger(index) || index < 0) {
          throw new Error(`--resume-from expects a non-negative integer, got "${raw}"`);
        }
        result.resumeFrom = index;
        break;
      }
      case '--job-id':
        result.jobId = valueFor(flag, ++i);
        break;
      case '--test-parse':
        result.testParse = true;
        break;
      default:
        console.warn(`Ignoring unrecognised argument: ${flag}`);
    }
  }
  return result;
}

// ─── Test Parse ───────────────────────────────────────────────────────────────

/**
 * Fetch one hard-coded church page, run all parsers on it, print the results
 * as JSON, close the pool and exit the process with code 0.
 */
async function runTestParse(): Promise<void> {
  const testUrl = 'https://discovermass.com/church/st-paul-the-apostle-chino-hills/';
  console.log(`Fetching test page: ${testUrl}`);
  const html = await fetchHtml(testUrl);
  const church = parseChurch(html);
  const masses = parseMassTimes(html);
  const { confessions, adorations } = parseOtherServices(html);
  console.log('Church:', JSON.stringify(church, null, 2));
  console.log(`Masses (${masses.length}):`, JSON.stringify(masses, null, 2));
  console.log(`Confessions (${confessions.length}):`, JSON.stringify(confessions, null, 2));
  console.log(`Adorations (${adorations.length}):`, JSON.stringify(adorations, null, 2));
  await pool.end();
  process.exit(0);
}

// ─── Main ─────────────────────────────────────────────────────────────────────

/**
 * Entry point: enumerate church URLs from the sitemaps, process them one at a
 * time with `REQUEST_DELAY_MS` between requests, print a summary, and record
 * the outcome on the background job when `--job-id` was given.
 *
 * The job is marked `failed` when the run aborts with a fatal error or when
 * more than 10% of the processed churches errored; otherwise `completed`.
 * Job-row updates are best-effort: a failure there is logged, not thrown.
 */
async function main(): Promise<void> {
  const args = parseCLIArgs();

  if (args.testParse) {
    await runTestParse();
    return;
  }

  if (!args.all) {
    console.error('Usage: npx tsx scripts/import-discovermass.ts --all [--dry-run] [--resume-from N] [--job-id UUID]');
    process.exit(1);
  }

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (err) {
      // Job might not exist yet — keep going, but leave a trace.
      console.warn(`Could not mark job ${args.jobId} as running: ${err instanceof Error ? err.message : err}`);
    }
  }

  const stats: ImportStats = {
    total: 0, created: 0, updated: 0, skipped: 0, errors: 0,
    massSchedulesCreated: 0, confessionSchedulesCreated: 0, adorationSchedulesCreated: 0,
  };
  // Set when the run aborts outside processChurch (e.g. a sitemap fetch fails),
  // so the job is not reported as completed with zero errors.
  let aborted = false;

  try {
    const urls = await getAllChurchUrls();
    const existingChurches = await loadExistingChurches();
    const startIdx = args.resumeFrom ?? 0;
    const churchUrls = urls.slice(startIdx);
    console.log(`\nProcessing ${churchUrls.length} churches (starting from index ${startIdx})...\n`);

    for (let i = 0; i < churchUrls.length; i++) {
      const url = churchUrls[i];
      const overallIdx = startIdx + i;
      console.log(`[${overallIdx + 1}/${urls.length}] ${url}`);
      await processChurch(url, existingChurches, args, stats);
      // Honour the crawl delay between pages; no need to wait after the last one.
      if (i < churchUrls.length - 1) {
        await sleep(REQUEST_DELAY_MS);
      }
    }
  } catch (err) {
    aborted = true;
    throw err;
  } finally {
    console.log(aborted
      ? '\n─── Import Aborted ────────────────────────────────────────'
      : '\n─── Import Complete ───────────────────────────────────────');
    console.log(`Total processed:  ${stats.total}`);
    console.log(`Created:          ${stats.created}`);
    console.log(`Updated:          ${stats.updated}`);
    console.log(`Skipped:          ${stats.skipped}`);
    console.log(`Errors:           ${stats.errors}`);
    console.log(`Mass schedules:   ${stats.massSchedulesCreated}`);
    console.log(`Confession sched: ${stats.confessionSchedulesCreated}`);
    console.log(`Adoration sched:  ${stats.adorationSchedulesCreated}`);

    if (args.jobId) {
      const status = aborted || stats.errors > stats.total * 0.1 ? 'failed' : 'completed';
      try {
        await prisma.backgroundJob.update({
          where: { id: args.jobId },
          data: {
            status,
            completedAt: new Date(),
            processed: stats.total,
            succeeded: stats.created + stats.updated,
            failed: stats.errors,
            itemsFound: stats.massSchedulesCreated,
          },
        });
      } catch (err) {
        console.warn(`Could not record final status for job ${args.jobId}: ${err instanceof Error ? err.message : err}`);
      }
    }

    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((err: unknown) => {
  console.error('Fatal error:', err);
  process.exit(1);
});