Files
ScraperControl/scripts/import-discovermass.ts
albertfj114 033f805965 fix: clean up church-matcher types and add HK OSM bounding box
- Remove discovermassId/buscarmisasNetworkId from findDuplicateChurch match
  passes (importers now do their own pre-check dedup); restore as optional
  fields on ExistingChurch to keep type/runtime in sync
- Add HK bounding box to COUNTRY_BOUNDING_BOXES; fix silent 0-result
  fallback when country query returns empty from mirror server
- discovermass importer: add --limit flag and skip-already-imported
  pre-check using importedSlugs set
- Import scripts: remove discovermassId from ExistingChurch select/stubs
  (field not needed in shared matcher context)
- Schema: reorder discovermassId/kerknetId/gottesdienstzeitenId fields

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 22:20:45 -04:00

606 lines
24 KiB
TypeScript

#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from discovermass.com (USA)
*
* discovermass.com is a US Catholic church directory with 20,284 churches.
* Data includes name, address, phone, website, coordinates, mass times,
* confessions, and adoration schedules.
*
* robots.txt specifies Crawl-delay: 10 — this importer follows that rule.
*
* Usage:
* npx tsx scripts/import-discovermass.ts --all
* npx tsx scripts/import-discovermass.ts --all --dry-run
* npx tsx scripts/import-discovermass.ts --all --limit 100
* npx tsx scripts/import-discovermass.ts --all --resume-from 5000
* npx tsx scripts/import-discovermass.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string comes from DATABASE_URL (populated by the dotenv calls above);
// when it is unset, a local Postgres instance is used.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target, replacing the ":password@" portion of the URL with ":***@".
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
connectionString: dbUrl,
// URLs containing "neon" connect over TLS with certificate verification disabled;
// every other URL leaves the ssl option undefined.
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma runs on top of the pg pool through the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Origin used to build the sitemap URLs.
const SITE_BASE = 'https://discovermass.com';
// Number of wp-sitemap-posts-item-N.xml files fetched (N = 1..SITEMAP_COUNT).
const SITEMAP_COUNT = 11;
// Sent as the User-Agent header on every request made by fetchHtml.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Pause between consecutive church page requests, in milliseconds.
const REQUEST_DELAY_MS = 10_000; // Crawl-delay: 10 from robots.txt
// ─── Types ───────────────────────────────────────────────────────────────────
/** Church metadata extracted from one listing page by parseChurch. */
interface ParsedChurch {
name: string; // content of the page's og:title meta tag, trimmed
address: string | null; // street part, or the whole raw address when it could not be split
city: string | null;
state: string | null; // two-letter upper-case code
zip: string | null; // 5-digit ZIP, optionally with a -NNNN suffix
phone: string | null;
website: string | null;
lat: number; // 0 when the page has no usable coordinates
lng: number; // 0 when the page has no usable coordinates
}
/** One mass time on one day of the week, as produced by parseMassTimes. */
interface ParsedMass {
dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
time: string; // HH:MM 24-hour
language: string; // 'English' when the page gives no language for the entry
notes?: string; // text of the entry's comment span, when present
}
/** One confession slot on one day of the week, as produced by parseOtherServices. */
interface ParsedConf {
dayOfWeek: number; // 0=Sun ... 6=Sat, same numbering as ParsedMass
startTime: string; // HH:MM 24-hour
endTime: string; // HH:MM 24-hour
notes?: string; // text of the entry's comment span, when present
}
/** One adoration slot on one day of the week, as produced by parseOtherServices. */
interface ParsedAdoration {
dayOfWeek: number; // 0=Sun ... 6=Sat, same numbering as ParsedMass
startTime: string; // HH:MM 24-hour
endTime: string; // HH:MM 24-hour
notes?: string; // text of the entry's comment span, when present
}
/** Running counters for one import run; mutated by processChurch and reported by main. */
interface ImportStats {
total: number; // incremented once per processChurch call
created: number; // new church rows inserted
updated: number; // existing church rows matched and updated
skipped: number; // unparseable pages plus unique-constraint conflicts
errors: number; // churches whose processing threw
massSchedulesCreated: number; // parsed mass rows written
confessionSchedulesCreated: number; // parsed confession rows written
adorationSchedulesCreated: number; // parsed adoration rows written
}
/** Command-line options recognised by parseCLIArgs. */
interface CLIArgs {
all: boolean; // --all; main exits with a usage message when this is false
dryRun: boolean; // --dry-run; processChurch fetches and parses but skips its database writes
resumeFrom?: number; // --resume-from N; index into the sitemap URL list to start from
limit?: number; // --limit N; cap on the number of not-yet-imported URLs processed
jobId?: string; // --job-id UUID; backgroundJob row whose status is updated by main
}
// ─── Day Mappings ─────────────────────────────────────────────────────────────
// Full day names used in mass schedule <li> labels,
// mapped to dayOfWeek numbers (0=Sunday ... 6=Saturday).
const FULL_DAY_NAMES: Record<string, number> = {
Sunday: 0, Monday: 1, Tuesday: 2, Wednesday: 3,
Thursday: 4, Friday: 5, Saturday: 6,
};
// Abbreviated day prefixes used in confession/adoration serviceTime text,
// mapped to the dayOfWeek numbers they cover (0=Sunday ... 6=Saturday).
// Both "Thr" and "Thu" map to Thursday; "Weekdays" is Mon-Fri and "Daily" is all seven days.
const ABBREV_DAY_NAMES: Record<string, number[]> = {
Sun: [0], Mon: [1], Tue: [2], Wed: [3],
Thr: [4], Thu: [4], Fri: [5], Sat: [6],
Weekdays: [1, 2, 3, 4, 5],
Daily: [0, 1, 2, 3, 4, 5, 6],
};
// ─── Time Utilities ───────────────────────────────────────────────────────────
/**
 * Convert a 12-hour clock time to "HH:MM" 24-hour format.
 *
 * Accepted shapes (case-insensitive, surrounding whitespace ignored):
 * "5:00pm", "11:00 AM", "7pm", "6:30 p.m.", "12:00am".
 *
 * @param timeStr Time text as it appears on the page.
 * @returns The "HH:MM" string, or the trimmed, lower-cased input when it is
 *          not a recognisable clock time (bad shape, hour > 23, minutes > 59).
 */
function convertTo24h(timeStr: string): string {
  const cleaned = timeStr.trim().toLowerCase();
  // Minutes and the dots in "a.m."/"p.m." are optional; whitespace may precede the meridiem.
  const m = cleaned.match(/^(\d{1,2})(?::(\d{2}))?\s*([ap])\.?m\.?$/);
  if (!m) return cleaned;

  let hours = parseInt(m[1], 10);
  const mins = m[2] ?? '00';
  const isPm = m[3] === 'p';

  // Never emit an impossible clock value.
  if (hours > 23 || parseInt(mins, 10) > 59) return cleaned;

  // Hours 13-23 are already on the 24-hour clock, so the meridiem is ignored for them
  // instead of producing values such as "25:00".
  if (hours <= 12) {
    if (isPm && hours !== 12) hours += 12;
    if (!isPm && hours === 12) hours = 0;
  }
  return `${String(hours).padStart(2, '0')}:${mins}`;
}
/**
 * Parse a time range such as "8:30am-9:00am" into ["08:30", "09:00"].
 *
 * En/em dashes (literal or as &ndash;/&mdash; entities) are treated like a
 * hyphen. A string without a separator, or with one empty side, is treated
 * as a single time and returned as both start and end.
 *
 * @param rangeStr Raw text of the time span.
 * @returns [start, end], each converted with convertTo24h.
 */
function parseTimeRange(rangeStr: string): [string, string] {
  const normalized = rangeStr
    .replace(/&(?:ndash|mdash|#8211|#8212);/gi, '-')
    .replace(/[\u2010-\u2015\u2212]/g, '-');

  // Prefer the first hyphen after the first colon (the separator in "8:30am-9:00am");
  // fall back to the first hyphen anywhere so "8am-9:00am" is still split.
  let dashIdx = normalized.indexOf('-', normalized.indexOf(':') + 1);
  if (dashIdx === -1) dashIdx = normalized.indexOf('-');

  if (dashIdx === -1) {
    const single = convertTo24h(normalized.trim());
    return [single, single];
  }

  const startText = normalized.slice(0, dashIdx).trim();
  const endText = normalized.slice(dashIdx + 1).trim();

  // "8:30am-" or "-9:00am": only one usable side, so report it as a single time.
  if (startText === '' || endText === '') {
    const single = convertTo24h(startText || endText);
    return [single, single];
  }
  return [convertTo24h(startText), convertTo24h(endText)];
}
/**
 * Expand an abbreviated day prefix ("Sat", "Weekdays", "Daily", ...) to the
 * dayOfWeek integers it covers.
 *
 * Matching ignores case and surrounding whitespace. Only the table's own keys
 * are consulted, so inherited object members such as "constructor" never match.
 *
 * @param prefix Day label taken from the serviceTime text.
 * @returns A fresh array of dayOfWeek numbers; empty when the prefix is unknown.
 */
function expandDayAbbrev(prefix: string): number[] {
  const wanted = prefix.trim().toLowerCase();
  for (const [label, days] of Object.entries(ABBREV_DAY_NAMES)) {
    // Return a copy so callers cannot mutate the shared lookup table.
    if (label.toLowerCase() === wanted) return [...days];
  }
  return [];
}
// ─── Address Parsing ──────────────────────────────────────────────────────────
/**
 * Parse "14085 Peyton Drive, Chino Hills, CA 91709" into components.
 *
 * The last comma-separated part must be "ST 12345" or "ST 12345-6789"; the
 * part before it is the city and everything earlier is the street address.
 * Whitespace around commas is optional.
 *
 * @param raw Address text as it appears on the page.
 * @returns The split components. When the text does not have at least three
 *          parts ending in a state and ZIP, `address` is the raw input and the
 *          other fields are null.
 */
function parseAddress(raw: string): { address: string | null; city: string | null; state: string | null; zip: string | null } {
  const unparsed = { address: raw, city: null, state: null, zip: null };

  // Split on commas regardless of the spacing around them ("a,b" and "a ,  b" both work).
  const parts = raw.trim().split(/\s*,\s*/);
  if (parts.length < 3) return unparsed;

  const stateZipMatch = parts[parts.length - 1].match(/^([A-Z]{2})\s+(\d{5}(?:-\d{4})?)$/);
  if (!stateZipMatch) return unparsed;

  // A street may itself contain commas ("1 Main St, Suite 4"), so rejoin everything before the city.
  const street = parts.slice(0, -2).join(', ');
  const city = parts[parts.length - 2];
  return {
    address: street || null,
    city: city || null,
    state: stateZipMatch[1],
    zip: stateZipMatch[2],
  };
}
// ─── HTML Parsing ─────────────────────────────────────────────────────────────
/**
 * Decode numeric (&#8217; / &#x2019;) and common named (&amp;, &quot;, ...)
 * HTML character references. Unknown references are left untouched.
 */
function decodeHtmlEntities(text: string): string {
  const named = new Map<string, string>([
    ['amp', '&'], ['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"], ['nbsp', ' '],
    ['ndash', '\u2013'], ['mdash', '\u2014'],
    ['lsquo', '\u2018'], ['rsquo', '\u2019'], ['ldquo', '\u201C'], ['rdquo', '\u201D'],
  ]);
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (whole: string, ref: string) => {
    if (ref.startsWith('#')) {
      const isHex = ref[1] === 'x' || ref[1] === 'X';
      const codePoint = isHex ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10);
      // String.fromCodePoint throws outside the Unicode range, so guard first.
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    }
    return named.get(ref.toLowerCase()) ?? whole;
  });
}

/**
 * Parse church metadata from page HTML.
 *
 * Name, address, phone and website are taken from the markup and have their
 * HTML character references decoded, so "St. Mary&#8217;s" is stored as
 * "St. Mary’s". Coordinates come from the "daddr=lat,lng" directions link and
 * stay 0/0 when missing or out of range.
 *
 * @param html Full HTML of a church listing page.
 * @returns The parsed metadata, or null if the page doesn't look like a valid
 *          church listing (no og:title, an empty title, or the generic
 *          "Discover Mass" title).
 */
function parseChurch(html: string): ParsedChurch | null {
  const nameMatch = html.match(/<meta property="og:title" content="([^"]+)"/);
  if (!nameMatch) return null;
  const name = decodeHtmlEntities(nameMatch[1]).trim();
  if (!name || name === 'Discover Mass') return null;

  let address: string | null = null;
  let city: string | null = null;
  let state: string | null = null;
  let zip: string | null = null;

  // Try structured <h2 id="theaddress"> element first (most reliable)
  const addrElemMatch = html.match(/id="theaddress"[^>]*>([^<]+)<\/h2>/);
  if (addrElemMatch) {
    ({ address, city, state, zip } = parseAddress(decodeHtmlEntities(addrElemMatch[1]).trim()));
  } else {
    // Fallback: scan for street address pattern in text
    const addrMatch = html.match(/>\s*(\d+\s[^<\n,]{5,}),\s*([^<,\n]+),\s*([A-Z]{2})\s+(\d{5}(?:-\d{4})?)\s*</);
    if (addrMatch) {
      const raw = `${addrMatch[1].trim()}, ${addrMatch[2].trim()}, ${addrMatch[3]} ${addrMatch[4]}`;
      ({ address, city, state, zip } = parseAddress(decodeHtmlEntities(raw)));
    }
  }

  const phoneMatch = html.match(/<span class='side-phone attribute'>([^<]+)<\/span>/);
  const phone = phoneMatch ? decodeHtmlEntities(phoneMatch[1]).trim() : null;

  // Attribute values escape "&" as "&amp;", so the href needs decoding as well.
  const websiteMatch = html.match(/<span class='side-website attribute'><a href='([^']+)'/);
  const website = websiteMatch ? decodeHtmlEntities(websiteMatch[1]).trim() : null;

  let lat = 0;
  let lng = 0;
  const coordMatch = html.match(/daddr=([-\d.]+),([-\d.]+)/);
  if (coordMatch) {
    const rawLat = parseFloat(coordMatch[1]);
    const rawLng = parseFloat(coordMatch[2]);
    // Only accept values that are real latitude/longitude numbers.
    if (isFinite(rawLat) && isFinite(rawLng) && Math.abs(rawLat) <= 90 && Math.abs(rawLng) <= 180) {
      lat = rawLat;
      lng = rawLng;
    }
  }

  return { name, address, city, state, zip, phone, website, lat, lng };
}
/**
 * Extract the mass schedule from the "Mass Times" <ul> block of a listing page.
 *
 * Only the first 100,000 characters of the page are searched. Each <li> whose
 * label is a full day name yields one entry per serviceTime span that carries
 * a time; the language defaults to 'English' and notes come from the comment
 * span when present.
 *
 * @param html Full HTML of a church listing page.
 * @returns One ParsedMass per (day, time) entry; empty when the block is absent.
 */
function parseMassTimes(html: string): ParsedMass[] {
  const bounded = html.slice(0, 100_000);
  const found = /<ul>\s*<li>\s*<h5>Mass Times<\/h5>[\s\S]*?<\/ul>/.exec(bounded);
  if (found === null) return [];

  const masses: ParsedMass[] = [];
  // Element 0 of each split is the text before the first delimiter, so it is skipped.
  for (const item of found[0].split(/<li[^>]*>/).slice(1)) {
    const label = /<span class="label">([^<]+)<\/span>/.exec(item);
    if (label === null) continue;

    const dayOfWeek = FULL_DAY_NAMES[label[1].trim()];
    if (dayOfWeek === undefined) continue;

    for (const segment of item.split("<span class='serviceTime'>").slice(1)) {
      const timeSpan = /<span class='time'>([^<]+)<\/span>/.exec(segment);
      if (timeSpan === null) continue;

      const langSpan = /<span class='language'>\(([^)]+)\)<\/span>/.exec(segment);
      const commentSpan = /<span class='comment'>([^<]+)<\/span>/.exec(segment);

      masses.push({
        dayOfWeek,
        time: convertTo24h(timeSpan[1].trim()),
        language: langSpan === null ? 'English' : langSpan[1].trim(),
        notes: commentSpan === null ? undefined : commentSpan[1].trim(),
      });
    }
  }
  return masses;
}
/**
 * Extract confession and adoration slots from the "Other Services" <ul> block.
 *
 * Only the first 100,000 characters of the page are searched. Within the
 * Confessions and Adoration <li> elements, each serviceTime span of the form
 * "Day: <time>" is expanded to one slot per day covered by the day prefix.
 *
 * @param html Full HTML of a church listing page.
 * @returns Both slot lists; each is empty when its section is missing.
 */
function parseOtherServices(html: string): { confessions: ParsedConf[]; adorations: ParsedAdoration[] } {
  const bounded = html.slice(0, 100_000);
  const found = /<ul>\s*<li>\s*<h5>Other Services<\/h5>[\s\S]*?<\/ul>/.exec(bounded);
  if (found === null) return { confessions: [], adorations: [] };
  const servicesBlock: string = found[0];

  type ServiceSlot = { dayOfWeek: number; startTime: string; endTime: string; notes?: string };

  // Turn one <li> into slots: one per serviceTime span and per day that span covers.
  const collectSlots = (liHtml: string): ServiceSlot[] => {
    const slots: ServiceSlot[] = [];
    // Element 0 is the text before the first serviceTime span, so it is skipped.
    for (const segment of liHtml.split("<span class='serviceTime'>").slice(1)) {
      const head = /^([A-Za-z]+):\s*<span class='time'>([^<]+)<\/span>/.exec(segment);
      if (head === null) continue;

      const days = expandDayAbbrev(head[1].trim());
      if (days.length === 0) continue;

      const [startTime, endTime] = parseTimeRange(head[2]);
      const commentSpan = /<span class='comment'>([^<]+)<\/span>/.exec(segment);
      const notes = commentSpan === null ? undefined : commentSpan[1].trim();

      for (const dayOfWeek of days) {
        slots.push({ dayOfWeek, startTime, endTime, notes });
      }
    }
    return slots;
  };

  // Locate a section <li> by pattern and parse it; a missing section gives no slots.
  const slotsFor = (section: RegExp): ServiceSlot[] => {
    const li = section.exec(servicesBlock);
    return li === null ? [] : collectSlots(li[0]);
  };

  const confessions: ParsedConf[] = slotsFor(/<li class="Confessions">[\s\S]*?<\/li>/);
  const adorations: ParsedAdoration[] = slotsFor(/<li class="Adoration">[\s\S]*?<\/li>/);
  return { confessions, adorations };
}
// ─── HTTP Helpers ─────────────────────────────────────────────────────────────
/**
 * GET a URL with the importer's User-Agent and return the response body as text.
 *
 * The request, including reading the body, is aborted after 30 seconds so a
 * stalled connection cannot hang a long-running import.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body.
 * @throws Error on a non-2xx status; fetch's own error on network failure or timeout.
 */
async function fetchHtml(url: string): Promise<string> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 30_000);
  try {
    const res = await fetch(url, {
      headers: { 'User-Agent': USER_AGENT },
      signal: controller.signal,
    });
    if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`);
    // Await here so the timeout also covers a slow body download.
    return await res.text();
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param ms Delay passed to setTimeout.
 * @returns A promise that resolves with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
// ─── Sitemap Enumeration ──────────────────────────────────────────────────────
/**
 * Collect every church page URL listed in the site's post sitemaps.
 *
 * Fetches wp-sitemap-posts-item-1.xml through -SITEMAP_COUNT.xml in order and
 * keeps each <loc> that points under /church/. REQUEST_DELAY_MS is waited
 * after every sitemap request so these requests respect the same crawl delay
 * as the church page requests that follow.
 *
 * @returns Church URLs in sitemap order.
 * @throws Whatever fetchHtml throws when a sitemap cannot be downloaded.
 */
async function getAllChurchUrls(): Promise<string[]> {
  const urls: string[] = [];
  for (let i = 1; i <= SITEMAP_COUNT; i++) {
    const sitemapUrl = `${SITE_BASE}/wp-sitemap-posts-item-${i}.xml`;
    console.log(`Fetching sitemap ${i}/${SITEMAP_COUNT}...`);
    const xml = await fetchHtml(sitemapUrl);

    const countBefore = urls.length;
    for (const match of xml.matchAll(/<loc>(https:\/\/discovermass\.com\/church\/[^<]+)<\/loc>/g)) {
      urls.push(match[1]);
    }
    // An empty sitemap usually means the site layout changed; surface it instead of importing less silently.
    if (urls.length === countBefore) {
      console.warn(`  [warn] Sitemap ${i} contained no church URLs`);
    }

    // Also wait after the last sitemap so the caller's first page request is spaced out too.
    await sleep(REQUEST_DELAY_MS);
  }
  console.log(`Total church URLs: ${urls.length}`);
  return urls;
}
// ─── DB Helpers ───────────────────────────────────────────────────────────────
/**
 * Load every church row with country 'US', restricted to the columns listed
 * in the select below.
 *
 * @returns The selected rows, typed as ExistingChurch.
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing US churches from DB...');
  const rows = await prisma.church.findMany({
    where: { country: 'US' },
    select: {
      // Identity and location
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // Per-source external identifiers
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Descriptive fields
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });
  console.log(`Loaded ${rows.length} existing US churches`);
  return rows as ExistingChurch[];
}
// ─── Church Processing ────────────────────────────────────────────────────────
/**
 * True when `err` is a database unique-constraint violation.
 * Prisma reports these with error code P2002; the message check is kept as a
 * fallback for errors that arrive without a code.
 */
function isUniqueConstraintError(err: unknown): boolean {
  if (!(err instanceof Error)) return false;
  const code = 'code' in err ? (err as Error & { code?: unknown }).code : undefined;
  return code === 'P2002' || err.message.includes('Unique constraint');
}

/**
 * Fetch, parse and persist a single church listing page.
 *
 * - Unparseable pages are counted as skipped.
 * - With --dry-run the parsed counts are logged and nothing is written.
 * - When findDuplicateChurch finds an existing row, that row is tagged with
 *   the discovermass slug, its missing phone/website/coordinates are filled
 *   in, and each schedule type with parsed entries is replaced.
 * - Otherwise a new church and its schedules are inserted.
 *
 * Both write paths run inside one transaction, so a failure while writing
 * schedules never leaves a church tagged with its slug but without schedules
 * (main skips slugs that are already stored on later runs).
 *
 * Errors are caught, counted in `stats.errors` and logged; this function does
 * not reject.
 *
 * @param url              Church page URL from the sitemap.
 * @param existingChurches In-memory list used for duplicate matching; newly created churches are appended.
 * @param args             Parsed CLI options (only `dryRun` is read here).
 * @param stats            Running counters, mutated in place.
 */
async function processChurch(
  url: string,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
): Promise<void> {
  const slug = url.replace('https://discovermass.com/church/', '').replace(/\/$/, '');
  stats.total++;
  try {
    const html = await fetchHtml(url);
    const parsed = parseChurch(html);
    if (!parsed) {
      console.log(`  [skip] Could not parse: ${slug}`);
      stats.skipped++;
      return;
    }
    const masses = parseMassTimes(html);
    const { confessions, adorations } = parseOtherServices(html);

    if (args.dryRun) {
      console.log(`  [dry-run] ${parsed.name} — ${masses.length} masses, ${confessions.length} confessions, ${adorations.length} adorations`);
      return;
    }

    const candidate = { name: parsed.name, lat: parsed.lat, lng: parsed.lng, discovermassId: slug };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    try {
      if (duplicate) {
        // Only fill in fields the existing row is missing; never overwrite known data.
        const updateData: Record<string, unknown> = { discovermassId: slug, lastScrapedAt: new Date() };
        if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
        if (!duplicate.website && parsed.website) {
          updateData.website = parsed.website;
          updateData.hasWebsite = true;
        }
        if (parsed.lat !== 0 && duplicate.latitude === 0) {
          updateData.latitude = parsed.lat;
          updateData.longitude = parsed.lng;
        }

        await prisma.$transaction(async (tx) => {
          await tx.church.update({ where: { id: duplicate.id }, data: updateData });
          // A schedule type is replaced only when the page actually provided entries for it.
          if (masses.length > 0) {
            await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.massSchedule.createMany({
              data: masses.map(m => ({ churchId: duplicate.id, dayOfWeek: m.dayOfWeek, time: m.time, language: m.language, notes: m.notes ?? null })),
            });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.confessionSchedule.createMany({
              data: confessions.map(c => ({ churchId: duplicate.id, dayOfWeek: c.dayOfWeek, startTime: c.startTime, endTime: c.endTime, notes: c.notes ?? null })),
            });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.adorationSchedule.createMany({
              data: adorations.map(a => ({ churchId: duplicate.id, dayOfWeek: a.dayOfWeek, startTime: a.startTime, endTime: a.endTime, notes: a.notes ?? null })),
            });
          }
        });
        duplicate.discovermassId = slug;
        stats.updated++;
      } else {
        // Church and schedules are inserted atomically.
        const church = await prisma.$transaction(async (tx) => {
          const created = await tx.church.create({
            data: {
              name: parsed.name,
              address: parsed.address,
              city: parsed.city,
              state: parsed.state,
              zip: parsed.zip,
              country: 'US',
              phone: parsed.phone,
              website: parsed.website,
              hasWebsite: !!parsed.website,
              latitude: parsed.lat,
              longitude: parsed.lng,
              discovermassId: slug,
              source: 'discovermass',
              lastScrapedAt: new Date(),
            },
          });
          if (masses.length > 0) {
            await tx.massSchedule.createMany({
              data: masses.map(m => ({ churchId: created.id, dayOfWeek: m.dayOfWeek, time: m.time, language: m.language, notes: m.notes ?? null })),
            });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.createMany({
              data: confessions.map(c => ({ churchId: created.id, dayOfWeek: c.dayOfWeek, startTime: c.startTime, endTime: c.endTime, notes: c.notes ?? null })),
            });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.createMany({
              data: adorations.map(a => ({ churchId: created.id, dayOfWeek: a.dayOfWeek, startTime: a.startTime, endTime: a.endTime, notes: a.notes ?? null })),
            });
          }
          return created;
        });

        // Register the new church for duplicate matching only once it is committed.
        existingChurches.push({
          id: church.id, name: parsed.name, latitude: parsed.lat, longitude: parsed.lng,
          osmId: null, baiduId: null, masstimesId: null, orarimesseId: null,
          massSchedulesPhId: null, philmassId: null, horariosMisasId: null,
          mszeInfoId: null, weekdayMassesId: null, messesInfoId: null,
          bohosluzbyId: null, miserendId: null, kerknetId: null,
          gottesdienstzeitenId: null, discovermassId: slug,
          source: 'discovermass', website: parsed.website, phone: parsed.phone,
          address: parsed.address, country: 'US',
        });
        stats.created++;
      }
    } catch (err) {
      // A unique-constraint conflict is counted as a skip rather than an error.
      if (isUniqueConstraintError(err)) {
        stats.skipped++;
        return;
      }
      throw err;
    }

    stats.massSchedulesCreated += masses.length;
    stats.confessionSchedulesCreated += confessions.length;
    stats.adorationSchedulesCreated += adorations.length;
    console.log(
      `  [${duplicate ? 'update' : 'create'}] ${parsed.name} — ` +
      `${masses.length}M ${confessions.length}C ${adorations.length}A — ` +
      `${stats.total} total (${stats.created} new, ${stats.updated} upd, ${stats.errors} err)`
    );
  } catch (err) {
    stats.errors++;
    console.error(`  [error] ${slug}: ${err instanceof Error ? err.message : err}`);
  }
}
// ─── CLI Parsing ──────────────────────────────────────────────────────────────
/**
 * Parse process.argv into CLIArgs.
 *
 * Recognised flags: --all, --dry-run, --resume-from N, --limit N, --job-id UUID.
 * Unrecognised arguments are reported with a warning and ignored.
 *
 * @returns The parsed options; `all` and `dryRun` default to false.
 * @throws Error when a flag that needs a value has none, or when
 *         --resume-from / --limit is not a non-negative integer. Previously
 *         such input became NaN and the run silently ignored the flag.
 */
function parseCLIArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Read the value that follows the flag at position `flagIdx`.
  const valueAfter = (flagIdx: number): string => {
    const flag = argv[flagIdx];
    const value = argv[flagIdx + 1];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  // Same, but the value must be a plain non-negative integer.
  const countAfter = (flagIdx: number): number => {
    const value = valueAfter(flagIdx);
    if (!/^\d+$/.test(value)) {
      throw new Error(`${argv[flagIdx]} expects a non-negative integer, got "${value}"`);
    }
    return parseInt(value, 10);
  };

  for (let i = 0; i < argv.length; i++) {
    switch (argv[i]) {
      case '--all': result.all = true; break;
      case '--dry-run': result.dryRun = true; break;
      case '--resume-from': result.resumeFrom = countAfter(i); i++; break;
      case '--limit': result.limit = countAfter(i); i++; break;
      case '--job-id': result.jobId = valueAfter(i); i++; break;
      default:
        console.warn(`Ignoring unrecognized argument: ${argv[i]}`);
    }
  }
  return result;
}
// ─── Main ─────────────────────────────────────────────────────────────────────
/**
 * Entry point: enumerate church URLs, skip those already imported, and process
 * the rest one at a time with REQUEST_DELAY_MS between page requests.
 *
 * When --job-id is given, the matching backgroundJob row is marked 'running'
 * at the start and 'completed' or 'failed' at the end. The job is 'failed'
 * when the run aborted with an exception or when more than 10% of the
 * processed churches errored.
 *
 * The summary is always printed and the database connections are always closed.
 */
async function main() {
  const args = parseCLIArgs();
  if (!args.all) {
    console.error('Usage: npx tsx scripts/import-discovermass.ts --all [--dry-run] [--resume-from N] [--limit N] [--job-id UUID]');
    process.exit(1);
  }

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (err) {
      // Best effort: the job row might not exist yet. Keep going, but say so.
      console.warn(`Could not mark job ${args.jobId} as running: ${err instanceof Error ? err.message : err}`);
    }
  }

  const stats: ImportStats = {
    total: 0, created: 0, updated: 0, skipped: 0, errors: 0,
    massSchedulesCreated: 0, confessionSchedulesCreated: 0, adorationSchedulesCreated: 0,
  };
  // Set when the run dies with an exception, so the job is not reported as completed.
  let aborted = false;

  try {
    const urls = await getAllChurchUrls();
    const existingChurches = await loadExistingChurches();

    // Skip already-imported churches — check discovermassId set in DB
    const importedSlugs = new Set<string>();
    for (const church of existingChurches) {
      if (church.discovermassId) importedSlugs.add(church.discovermassId);
    }

    // Apply --resume-from first, then filter to unimported, then apply --limit.
    // Non-integer or negative values are ignored instead of producing odd slices.
    const startIdx = args.resumeFrom !== undefined && Number.isInteger(args.resumeFrom) && args.resumeFrom > 0
      ? args.resumeFrom
      : 0;
    const candidateUrls = urls.slice(startIdx).filter(url => {
      const slug = url.replace('https://discovermass.com/church/', '').replace(/\/$/, '');
      return !importedSlugs.has(slug);
    });
    // As before, a limit of 0 (or none) means "no limit".
    const limit = args.limit !== undefined && Number.isInteger(args.limit) && args.limit > 0
      ? args.limit
      : undefined;
    const churchUrls = limit !== undefined ? candidateUrls.slice(0, limit) : candidateUrls;

    console.log(`\nSitemap total: ${urls.length} | Already imported: ${importedSlugs.size} | This run: ${churchUrls.length}${limit !== undefined ? ` (limit ${limit})` : ''}\n`);

    for (let i = 0; i < churchUrls.length; i++) {
      const url = churchUrls[i];
      console.log(`[${i + 1}/${churchUrls.length}] ${url}`);
      await processChurch(url, existingChurches, args, stats);
      // No need to wait after the final request.
      if (i < churchUrls.length - 1) {
        await sleep(REQUEST_DELAY_MS);
      }
    }
  } catch (err) {
    aborted = true;
    throw err;
  } finally {
    console.log('\n─── Import Complete ───────────────────────────────────────');
    console.log(`Total processed: ${stats.total}`);
    console.log(`Created: ${stats.created}`);
    console.log(`Updated: ${stats.updated}`);
    console.log(`Skipped: ${stats.skipped}`);
    console.log(`Errors: ${stats.errors}`);
    console.log(`Mass schedules: ${stats.massSchedulesCreated}`);
    console.log(`Confession sched: ${stats.confessionSchedulesCreated}`);
    console.log(`Adoration sched: ${stats.adorationSchedulesCreated}`);

    if (args.jobId) {
      const status = aborted || stats.errors > stats.total * 0.1 ? 'failed' : 'completed';
      try {
        await prisma.backgroundJob.update({
          where: { id: args.jobId },
          data: {
            status,
            completedAt: new Date(),
            processed: stats.total,
            succeeded: stats.created + stats.updated,
            failed: stats.errors,
            itemsFound: stats.massSchedulesCreated,
          },
        });
      } catch (err) {
        // Reporting is best effort; it must not mask the outcome of the import itself.
        console.warn(`Could not record final status for job ${args.jobId}: ${err instanceof Error ? err.message : err}`);
      }
    }
    await prisma.$disconnect();
    await pool.end();
  }
}
// Run the importer; any error that escapes main is logged and turns into exit code 1.
main().catch((err) => {
console.error('Fatal error:', err);
process.exit(1);
});