#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from discovermass.com (USA)
*
* discovermass.com is a US Catholic church directory with 20,284 churches.
* Data includes name, address, phone, website, coordinates, mass times,
* confessions, and adoration schedules.
*
* robots.txt specifies Crawl-delay: 10 — this importer follows that rule.
*
* Usage:
* npx tsx scripts/import-discovermass.ts --all
* npx tsx scripts/import-discovermass.ts --all --dry-run
* npx tsx scripts/import-discovermass.ts --all --resume-from 5000
* npx tsx scripts/import-discovermass.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
// Load environment files from the current working directory before
// process.env.DATABASE_URL is read further down. .env.local is loaded first,
// then .env.
// NOTE(review): dotenv by default leaves already-set variables untouched, so
// values from .env.local should win over .env — confirm against the installed
// dotenv version.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL from the environment, or a local development
// database. `||` also falls back when DATABASE_URL is set but empty.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Log the target with the password masked. The pattern is anchored to the
// userinfo section (scheme://user:password@) and consumes everything up to the
// last "@" before the host, so passwords containing ":" or "@" are hidden in
// full and URLs that carry no password are left untouched.
console.log(
  `Connecting to database: ${dbUrl.replace(/^([a-z][a-z0-9+.-]*:\/\/[^:/?#@]*):[^/?#]*@/i, '$1:***@')}`,
);

const pool = new Pool({
  connectionString: dbUrl,
  // Connection strings containing "neon" get TLS with certificate verification
  // turned off; every other target keeps the pg default.
  // NOTE(review): rejectUnauthorized: false accepts any server certificate —
  // confirm this is still required for the hosted database.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});

// Prisma talks to Postgres through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Root URL of the site this importer reads from.
const SITE_BASE = 'https://discovermass.com';
// NOTE(review): presumably the number of sitemap files the site publishes —
// confirm against the code that uses it.
const SITEMAP_COUNT = 11;
// Identifies this importer and gives a contact address.
// NOTE(review): presumably sent as the User-Agent header on every request — confirm.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Pause of 10 seconds, expressed in milliseconds.
const REQUEST_DELAY_MS = 10_000; // Crawl-delay: 10 from robots.txt
// ─── Types ───────────────────────────────────────────────────────────────────
/**
 * Church metadata extracted from a listing page (the result type of
 * parseChurch). Only the name and coordinates are required; every other field
 * is null when it could not be determined.
 */
interface ParsedChurch {
name: string;
// Street portion of the postal address; parseAddress stores the whole raw
// string here when it cannot split the address into parts.
address: string | null;
city: string | null;
// Two-letter uppercase state code as matched by parseAddress.
state: string | null;
// ZIP code as matched by parseAddress: 5 digits, optionally followed by -4 digits.
zip: string | null;
phone: string | null;
website: string | null;
// NOTE(review): presumably latitude/longitude in decimal degrees — confirm.
lat: number;
lng: number;
}
/**
 * One scheduled Mass: a day of the week plus a start time.
 */
interface ParsedMass {
dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
time: string; // HH:MM 24-hour
// Language of the Mass; parseMassTimes uses 'English' when none is found.
language: string;
// Optional free-text remark attached to this entry.
notes?: string;
}
/**
 * One confession time window on a given day of the week.
 */
interface ParsedConf {
// NOTE(review): presumably the same 0=Sun ... 6=Sat numbering as ParsedMass — confirm.
dayOfWeek: number;
startTime: string; // HH:MM 24-hour
endTime: string; // HH:MM 24-hour
// Optional free-text remark attached to this entry.
notes?: string;
}
/**
 * One adoration time window on a given day of the week. Has the same shape as
 * ParsedConf.
 */
interface ParsedAdoration {
// NOTE(review): presumably the same 0=Sun ... 6=Sat numbering as ParsedMass — confirm.
dayOfWeek: number;
startTime: string; // HH:MM 24-hour
endTime: string; // HH:MM 24-hour
// Optional free-text remark attached to this entry.
notes?: string;
}
/**
 * Numeric counters describing an import.
 * NOTE(review): the code that updates these is outside this view; the meanings
 * below are read from the field names — confirm against the import loop.
 */
interface ImportStats {
// Presumably the number of churches considered.
total: number;
// Presumably churches newly inserted / changed / left alone / failed.
created: number;
updated: number;
skipped: number;
errors: number;
// Presumably the number of schedule rows created, per schedule kind.
massSchedulesCreated: number;
confessionSchedulesCreated: number;
adorationSchedulesCreated: number;
}
/**
 * Parsed command-line options.
 * NOTE(review): the argument parser is outside this view; the flag names below
 * are taken from the usage examples in the file header — confirm the mapping.
 */
interface CLIArgs {
// Presumably set by --all.
all: boolean;
// Presumably set by --dry-run.
dryRun: boolean;
// Presumably the numeric value given to --resume-from (e.g. 5000).
resumeFrom?: number;
// Presumably the UUID given to --job-id.
jobId?: string;
}
// ─── Day Mappings ─────────────────────────────────────────────────────────────
// Full day names used in mass schedule labels, mapped to their dayOfWeek
// number (0=Sun, 1=Mon, ..., 6=Sat). Labels that are not listed here look up
// as undefined.
const FULL_DAY_NAMES: Record<string, number> = {
  Sunday: 0,
  Monday: 1,
  Tuesday: 2,
  Wednesday: 3,
  Thursday: 4,
  Friday: 5,
  Saturday: 6,
};
// Abbreviated day prefixes used in confession/adoration serviceTime text,
// mapped to the dayOfWeek numbers (0=Sun ... 6=Sat) each one stands for.
// Thursday is listed under both "Thr" and "Thu"; "Weekdays" and "Daily" expand
// to several days.
const ABBREV_DAY_NAMES: Record<string, number[]> = {
  Sun: [0],
  Mon: [1],
  Tue: [2],
  Wed: [3],
  Thr: [4],
  Thu: [4],
  Fri: [5],
  Sat: [6],
  Weekdays: [1, 2, 3, 4, 5],
  Daily: [0, 1, 2, 3, 4, 5, 6],
};
// ─── Time Utilities ───────────────────────────────────────────────────────────
/**
 * Convert a 12-hour clock time such as "5:00pm", "11:00am", "12:00pm" or
 * "12:00am" to "HH:MM" 24-hour format.
 *
 * Matching ignores case and surrounding whitespace, and also accepts a space
 * before the meridiem ("5:00 PM"), dotted meridiems ("5:00 p.m.") and omitted
 * minutes ("5pm").
 *
 * @param timeStr - Raw time text.
 * @returns The "HH:MM" time, or the trimmed, lower-cased input when the text
 *   is not a recognisable 12-hour time (including out-of-range values such as
 *   "13:00pm" or "5:75am").
 */
function convertTo24h(timeStr: string): string {
  const cleaned = timeStr.trim().toLowerCase();
  const m = /^(\d{1,2})(?::(\d{2}))?\s*([ap])\.?m\.?$/.exec(cleaned);
  if (!m) return cleaned;

  // An unmatched minutes group is undefined, so "5pm" defaults to ":00".
  const [, hourText = '', minuteText = '00', meridiemLetter = ''] = m;
  let hours = parseInt(hourText, 10);

  // Refuse to fabricate impossible times like "25:00" or "05:75".
  if (hours > 12 || parseInt(minuteText, 10) > 59) return cleaned;

  if (meridiemLetter === 'p' && hours !== 12) hours += 12; // 1pm-11pm -> 13-23
  if (meridiemLetter === 'a' && hours === 12) hours = 0; // 12am is midnight
  return `${String(hours).padStart(2, '0')}:${minuteText}`;
}
/**
 * Split a time range such as "8:30am-9:00am" into 24-hour start and end
 * times, e.g. ["08:30", "09:00"].
 *
 * The separator is the first "-" found after the first ":" in the text. When
 * there is no such separator the whole text is converted once and returned as
 * both start and end.
 */
function parseTimeRange(rangeStr: string): [string, string] {
  const firstColonAt = rangeStr.indexOf(':');
  const separatorAt = rangeStr.indexOf('-', firstColonAt + 1);

  if (separatorAt < 0) {
    const single = convertTo24h(rangeStr.trim());
    return [single, single];
  }

  const startText = rangeStr.substring(0, separatorAt).trim();
  const endText = rangeStr.substring(separatorAt + 1).trim();
  return [convertTo24h(startText), convertTo24h(endText)];
}
/**
 * Expand an abbreviated day prefix (a key of ABBREV_DAY_NAMES, e.g. "Mon",
 * "Weekdays", "Daily") to the dayOfWeek integers it stands for.
 *
 * @param prefix - Day prefix exactly as it appears in the source text.
 * @returns A fresh array of dayOfWeek numbers, or an empty array when the
 *   prefix is not a known abbreviation.
 */
function expandDayAbbrev(prefix: string): number[] {
  // The prefix comes from scraped text. An own-property check keeps inherited
  // object members (e.g. "constructor", "toString") from being returned as if
  // they were day lists.
  if (!Object.prototype.hasOwnProperty.call(ABBREV_DAY_NAMES, prefix)) return [];
  // Copy so callers cannot mutate the shared lookup table.
  return [...(ABBREV_DAY_NAMES[prefix] ?? [])];
}
// ─── Address Parsing ──────────────────────────────────────────────────────────
/**
 * Parse a one-line US address such as
 * "14085 Peyton Drive, Chino Hills, CA 91709" into its components.
 *
 * The text is split on commas (whitespace around each comma is ignored). The
 * last part must be "ST 12345" or "ST 12345-6789", the part before it is the
 * city, and everything earlier is the street address (re-joined with ", ").
 *
 * @param raw - Address text as found on the page.
 * @returns The parsed components. When the text has fewer than three parts or
 *   does not end in a state and ZIP, `address` carries `raw` as given and the
 *   other fields are null. Blank input and empty components yield null rather
 *   than an empty string.
 */
function parseAddress(raw: string): { address: string | null; city: string | null; state: string | null; zip: string | null } {
  if (raw.trim() === '') return { address: null, city: null, state: null, zip: null };

  const parts = raw.split(',').map((part) => part.trim());
  const stateZipMatch =
    parts.length >= 3
      ? /^([A-Z]{2})\s+(\d{5}(?:-\d{4})?)$/.exec(parts[parts.length - 1] ?? '')
      : null;
  if (!stateZipMatch) return { address: raw, city: null, state: null, zip: null };

  const street = parts.slice(0, -2).join(', ');
  const city = parts[parts.length - 2] ?? '';
  return {
    address: street === '' ? null : street,
    city: city === '' ? null : city,
    state: stateZipMatch[1] ?? null,
    zip: stateZipMatch[2] ?? null,
  };
}
// ─── HTML Parsing ─────────────────────────────────────────────────────────────
/**
* Parse church metadata from page HTML.
* Returns null if the page doesn't look like a valid church listing.
*/
function parseChurch(html: string): ParsedChurch | null {
const nameMatch = html.match(/ element first (most reliable)
const addrElemMatch = html.match(/id="theaddress"[^>]*>([^<]+)<\/h2>/);
if (addrElemMatch) {
const parsed = parseAddress(addrElemMatch[1].trim());
address = parsed.address;
city = parsed.city;
state = parsed.state;
zip = parsed.zip;
} else {
// Fallback: scan for street address pattern in text
const addrMatch = html.match(/>\s*(\d+\s[^<\n,]{5,}),\s*([^<,\n]+),\s*([A-Z]{2})\s+(\d{5}(?:-\d{4})?)\s*);
if (addrMatch) {
const raw = `${addrMatch[1].trim()}, ${addrMatch[2].trim()}, ${addrMatch[3]} ${addrMatch[4]}`;
const parsed = parseAddress(raw);
address = parsed.address;
city = parsed.city;
state = parsed.state;
zip = parsed.zip;
}
}
const phoneMatch = html.match(/([^<]+)<\/span>/);
const phone = phoneMatch ? phoneMatch[1].trim() : null;
const websiteMatch = html.match(/ block.
*/
function parseMassTimes(html: string): ParsedMass[] {
const safeHtml = html.length > 100_000 ? html.slice(0, 100_000) : html;
const massUlMatch = safeHtml.match(/\s*- \s*
Mass Times<\/h5>[\s\S]*?<\/ul>/);
if (!massUlMatch) return [];
const massUl = massUlMatch[0];
const results: ParsedMass[] = [];
const liParts = massUl.split(/
- ]*>/);
for (let i = 1; i < liParts.length; i++) {
const li = liParts[i];
const labelMatch = li.match(/([^<]+)<\/span>/);
if (!labelMatch) continue;
const dayLabel = labelMatch[1].trim();
const dayOfWeek = FULL_DAY_NAMES[dayLabel];
if (dayOfWeek === undefined) continue;
const serviceTimeParts = li.split("");
for (let j = 1; j < serviceTimeParts.length; j++) {
const st = serviceTimeParts[j];
const timeMatch = st.match(/([^<]+)<\/span>/);
if (!timeMatch) continue;
const time = convertTo24h(timeMatch[1].trim());
const langMatch = st.match(/\(([^)]+)\)<\/span>/);
const language = langMatch ? langMatch[1].trim() : 'English';
const commentMatch = st.match(/