#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from discovermass.com (USA)
 *
 * discovermass.com is a US Catholic church directory with 20,284 churches.
 * Data includes name, address, phone, website, coordinates, mass times,
 * confessions, and adoration schedules.
 *
 * robots.txt specifies Crawl-delay: 10 — this importer waits 10 seconds
 * between church page requests.
 *
 * Usage:
 *   npx tsx scripts/import-discovermass.ts --all
 *   npx tsx scripts/import-discovermass.ts --all --dry-run
 *   npx tsx scripts/import-discovermass.ts --all --resume-from 5000
 *   npx tsx scripts/import-discovermass.ts --all --limit 100
 *   npx tsx scripts/import-discovermass.ts --all --job-id {uuid}
 */
|
|
|
|
|
|
|
|
|
|
import dotenv from 'dotenv';
|
|
|
|
|
import path from 'path';
|
|
|
|
|
|
|
|
|
|
// Load .env.local before .env: dotenv does not overwrite variables that are
// already set, so values from .env.local take precedence over .env.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
|
|
|
|
|
|
|
|
|
import { Pool } from 'pg';
|
|
|
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
|
|
|
|
import { PrismaClient } from '@prisma/client';
|
|
|
|
|
|
|
|
|
|
// Connection string from the environment, falling back to a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

/**
 * Mask the password of a connection URL so the URL is safe to log.
 *
 * Everything between the first ':' after the user name and the last '@' of the
 * authority section is replaced, so passwords containing ':' or '@' are fully
 * hidden. URLs without a password are returned unchanged.
 */
function redactDbPassword(url: string): string {
  return url.replace(/^([a-z][a-z0-9+.-]*:\/\/[^:/?#@]*):[^/?#]*@/i, '$1:***@');
}

console.log(`Connecting to database: ${redactDbPassword(dbUrl)}`);

const pool = new Pool({
  connectionString: dbUrl,
  // NOTE(review): certificate verification is disabled for URLs containing "neon" — confirm this is intended.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
|
|
|
|
|
|
|
|
|
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
|
|
|
|
import type { ExistingChurch } from '../src/lib/church-matcher';
|
|
|
|
|
|
|
|
|
|
// ─── Constants ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/** Origin of the site being imported; sitemap URLs are built from it. */
const SITE_BASE = 'https://discovermass.com';

/** Number of `wp-sitemap-posts-item-N.xml` files to enumerate (N = 1..SITEMAP_COUNT). */
const SITEMAP_COUNT = 11;

/** User-Agent header sent with every request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Pause between page requests, in milliseconds. */
const REQUEST_DELAY_MS = 10_000; // Crawl-delay: 10 from robots.txt
|
|
|
|
|
|
|
|
|
|
// ─── Types ───────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/** Church metadata extracted from a single listing page. */
interface ParsedChurch {
  /** Church name taken from the page's og:title meta tag. */
  name: string;
  /** Street address, or the whole raw address when it could not be split; null when none was found. */
  address: string | null;
  city: string | null;
  /** Two-letter state code. */
  state: string | null;
  /** ZIP or ZIP+4 code. */
  zip: string | null;
  phone: string | null;
  website: string | null;
  /** Latitude in degrees; 0 when the page carried no valid coordinates. */
  lat: number;
  /** Longitude in degrees; 0 when the page carried no valid coordinates. */
  lng: number;
}
|
|
|
|
|
|
|
|
|
|
/** One mass time parsed from the "Mass Times" list. */
interface ParsedMass {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  time: string; // HH:MM 24-hour
  /** Language shown next to the time; 'English' when the page lists none. */
  language: string;
  /** Free-text comment attached to the time, if any. */
  notes?: string;
}
|
|
|
|
|
|
|
|
|
|
/** One confession slot parsed from the "Other Services" list. */
interface ParsedConf {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  startTime: string; // HH:MM 24-hour
  endTime: string; // HH:MM 24-hour; equals startTime when the page gives a single time
  /** Free-text comment attached to the slot, if any. */
  notes?: string;
}
|
|
|
|
|
|
|
|
|
|
/** One adoration slot parsed from the "Other Services" list. */
interface ParsedAdoration {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  startTime: string; // HH:MM 24-hour
  endTime: string; // HH:MM 24-hour; equals startTime when the page gives a single time
  /** Free-text comment attached to the slot, if any. */
  notes?: string;
}
|
|
|
|
|
|
|
|
|
|
/** Running counters for one import run; mutated in place while churches are processed. */
interface ImportStats {
  /** Church URLs attempted so far. */
  total: number;
  /** Churches inserted as new records. */
  created: number;
  /** Existing churches matched and updated. */
  updated: number;
  /** Pages that could not be parsed, plus writes rejected by a unique constraint. */
  skipped: number;
  /** Churches whose processing threw an error. */
  errors: number;
  massSchedulesCreated: number;
  confessionSchedulesCreated: number;
  adorationSchedulesCreated: number;
}
|
|
|
|
|
|
|
|
|
|
/** Command-line options accepted by the importer. */
interface CLIArgs {
  /** `--all`: required to start an import. */
  all: boolean;
  /** `--dry-run`: fetch and parse pages but write nothing to the database. */
  dryRun: boolean;
  /** `--resume-from N`: index into the sitemap URL list at which to start. */
  resumeFrom?: number;
  /** `--limit N`: maximum number of not-yet-imported churches to process in this run. */
  limit?: number;
  /** `--job-id UUID`: background job record to update with status and counts. */
  jobId?: string;
}
|
|
|
|
|
|
|
|
|
|
// ─── Day Mappings ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
// Full day names used in mass schedule <li> labels.
// Built on a null-prototype object so that labels such as "constructor" or
// "toString" look up as undefined instead of resolving to Object.prototype members.
const FULL_DAY_NAMES: Record<string, number> = Object.assign(
  Object.create(null) as Record<string, number>,
  {
    Sunday: 0, Monday: 1, Tuesday: 2, Wednesday: 3,
    Thursday: 4, Friday: 5, Saturday: 6,
  },
);
|
|
|
|
|
|
|
|
|
|
// Abbreviated day prefixes used in confession/adoration serviceTime text.
// Each prefix maps to the dayOfWeek values (0=Sun..6=Sat) it stands for.
// Built on a null-prototype object so that prefixes such as "constructor"
// look up as undefined instead of resolving to Object.prototype members.
const ABBREV_DAY_NAMES: Record<string, number[]> = Object.assign(
  Object.create(null) as Record<string, number[]>,
  {
    Sun: [0], Mon: [1], Tue: [2], Wed: [3],
    Thr: [4], Thu: [4], Fri: [5], Sat: [6],
    Weekdays: [1, 2, 3, 4, 5],
    Daily: [0, 1, 2, 3, 4, 5, 6],
  },
);
|
|
|
|
|
|
|
|
|
|
// ─── Time Utilities ───────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/**
 * Convert a 12-hour clock time to "HH:MM" 24-hour format.
 *
 * Accepts "5:00pm", "11:00am", "12:00pm", "12:00am", and also tolerates
 * surrounding whitespace, upper case, a space before the meridiem
 * ("5:00 PM"), dotted meridiems ("7:30 p.m.") and omitted minutes ("5pm").
 *
 * @param timeStr Time text as it appears on the page.
 * @returns The "HH:MM" form, or the trimmed, lower-cased input when the text
 *   is not a recognisable time (no meridiem, hour above 23, minutes above 59).
 */
function convertTo24h(timeStr: string): string {
  const cleaned = timeStr.trim().toLowerCase();
  const m = cleaned.match(/^(\d{1,2})(?::(\d{2}))?\s*([ap])\.?\s*m\.?$/);
  if (!m) return cleaned;

  const hour = parseInt(m[1], 10);
  const mins = m[2] ?? '00';
  if (hour > 23 || parseInt(mins, 10) > 59) return cleaned;

  let hours = hour;
  // Hours 13-23 are already on the 24-hour clock; the meridiem is ignored for them.
  if (hour <= 12) {
    const isPm = m[3] === 'p';
    if (isPm && hour !== 12) hours += 12;
    if (!isPm && hour === 12) hours = 0; // 12:xx am is just after midnight
  }
  return `${String(hours).padStart(2, '0')}:${mins}`;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Parse "8:30am-9:00am" → ["08:30", "09:00"].
 *
 * En/em dashes and their HTML entities are treated like a hyphen. The
 * separator is the first hyphen after the first ':' so a hyphen cannot be
 * picked up before the start time's minutes. A single time (no separator, or
 * nothing after it) yields the same value for start and end.
 *
 * @param rangeStr Time or time range text as it appears on the page.
 * @returns `[start, end]`, each converted by `convertTo24h`.
 */
function parseTimeRange(rangeStr: string): [string, string] {
  const normalized = rangeStr
    .replace(/&(?:ndash|mdash|#8211|#8212);/gi, '-')
    .replace(/[\u2012-\u2015]/g, '-');

  const hyphenIdx = normalized.indexOf('-', normalized.indexOf(':') + 1);
  if (hyphenIdx === -1) {
    const single = convertTo24h(normalized.trim());
    return [single, single];
  }

  const start = convertTo24h(normalized.slice(0, hyphenIdx).trim());
  const endText = normalized.slice(hyphenIdx + 1).trim();
  // A dangling separator ("5:00pm-") carries no end time; reuse the start.
  const end = endText === '' ? start : convertTo24h(endText);
  return [start, end];
}
|
|
|
|
|
|
|
|
|
|
/**
 * Expand abbreviated day prefix to array of dayOfWeek integers.
 *
 * Only keys defined directly on ABBREV_DAY_NAMES are honoured, so a prefix
 * such as "constructor" yields an empty array rather than an inherited
 * Object.prototype member.
 *
 * @param prefix Day prefix exactly as it appears before the ':' (e.g. "Sat", "Weekdays").
 * @returns A fresh array of dayOfWeek values, empty when the prefix is unknown.
 */
function expandDayAbbrev(prefix: string): number[] {
  if (!Object.prototype.hasOwnProperty.call(ABBREV_DAY_NAMES, prefix)) return [];
  // Copy so callers can never mutate the shared lookup table.
  return [...ABBREV_DAY_NAMES[prefix]];
}
|
|
|
|
|
|
|
|
|
|
// ─── Address Parsing ──────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/**
 * Parse "14085 Peyton Drive, Chino Hills, CA 91709" into components.
 *
 * The last comma-separated part must be "ST 12345" or "ST 12345-6789" and the
 * part before it is taken as the city; everything earlier is the street
 * address (which may itself contain commas, e.g. a suite). Whitespace around
 * commas is optional.
 *
 * @param raw Full address text.
 * @returns The split components. When the text does not follow the expected
 *   shape, `address` holds the trimmed input (null when empty) and the other
 *   fields are null.
 */
function parseAddress(raw: string): { address: string | null; city: string | null; state: string | null; zip: string | null } {
  const unparsed = { address: raw.trim() || null, city: null, state: null, zip: null };

  const parts = raw
    .split(',')
    .map((part) => part.trim())
    .filter((part) => part.length > 0);
  if (parts.length < 3) return unparsed;

  const stateZipMatch = parts[parts.length - 1].match(/^([A-Za-z]{2})\s+(\d{5}(?:-\d{4})?)$/);
  if (!stateZipMatch) return unparsed;

  return {
    address: parts.slice(0, parts.length - 2).join(', '),
    city: parts[parts.length - 2],
    state: stateZipMatch[1].toUpperCase(),
    zip: stateZipMatch[2],
  };
}
|
|
|
|
|
|
|
|
|
|
// ─── HTML Parsing ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/**
 * Decode the HTML character references that can appear in attribute values and
 * text nodes: numeric references (decimal and hex) and a small set of named
 * ones. Unknown references are left untouched. `&nbsp;` becomes a regular space.
 */
function decodeHtmlEntities(text: string): string {
  const named: Record<string, string> = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ' };
  // Single pass, so "&amp;lt;" correctly becomes "&lt;" rather than "<".
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
    if (body[0] === '#') {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const code = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return Number.isFinite(code) && code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity;
    }
    const key = body.toLowerCase();
    return Object.prototype.hasOwnProperty.call(named, key) ? named[key] : entity;
  });
}

/**
 * Parse church metadata from page HTML.
 *
 * The name comes from the og:title meta tag, the address from the
 * `id="theaddress"` heading (or, failing that, the first street-address-like
 * text node), phone and website from the sidebar spans, and coordinates from
 * a `daddr=lat,lng` link parameter. Extracted text is entity-decoded.
 *
 * @param html Full page HTML.
 * @returns The parsed church, or null if the page doesn't look like a valid
 *   church listing (no og:title, or the site's generic title). `lat`/`lng` are
 *   0 when no in-range coordinates were found.
 */
function parseChurch(html: string): ParsedChurch | null {
  const nameMatch = html.match(/<meta property="og:title" content="([^"]+)"/);
  if (!nameMatch) return null;
  const name = decodeHtmlEntities(nameMatch[1]).trim();
  if (!name || name === 'Discover Mass') return null;

  let address: string | null = null;
  let city: string | null = null;
  let state: string | null = null;
  let zip: string | null = null;

  // Try structured <h2 id="theaddress"> element first (most reliable)
  const addrElemMatch = html.match(/id="theaddress"[^>]*>([^<]+)<\/h2>/);
  let rawAddress: string | null = null;
  if (addrElemMatch) {
    rawAddress = decodeHtmlEntities(addrElemMatch[1]).trim();
  } else {
    // Fallback: scan for street address pattern in text
    const addrMatch = html.match(/>\s*(\d+\s[^<\n,]{5,}),\s*([^<,\n]+),\s*([A-Z]{2})\s+(\d{5}(?:-\d{4})?)\s*</);
    if (addrMatch) {
      rawAddress = decodeHtmlEntities(
        `${addrMatch[1].trim()}, ${addrMatch[2].trim()}, ${addrMatch[3]} ${addrMatch[4]}`,
      );
    }
  }
  if (rawAddress !== null) {
    ({ address, city, state, zip } = parseAddress(rawAddress));
  }

  const phoneMatch = html.match(/<span class='side-phone attribute'>([^<]+)<\/span>/);
  const phone = phoneMatch ? decodeHtmlEntities(phoneMatch[1]).trim() : null;

  const websiteMatch = html.match(/<span class='side-website attribute'><a href='([^']+)'/);
  const website = websiteMatch ? decodeHtmlEntities(websiteMatch[1]).trim() : null;

  // Coordinates default to 0 and are only accepted when both are finite and in range.
  let lat = 0;
  let lng = 0;
  const coordMatch = html.match(/daddr=([-\d.]+),([-\d.]+)/);
  if (coordMatch) {
    const rawLat = parseFloat(coordMatch[1]);
    const rawLng = parseFloat(coordMatch[2]);
    if (isFinite(rawLat) && isFinite(rawLng) && Math.abs(rawLat) <= 90 && Math.abs(rawLng) <= 180) {
      lat = rawLat;
      lng = rawLng;
    }
  }

  return { name, address, city, state, zip, phone, website, lat, lng };
}
|
|
|
|
|
|
|
|
|
|
/**
 * Parse mass schedule from the "Mass Times" <ul> block.
 *
 * Each <li> carrying a `<span class="label">` with a full day name contributes
 * one entry per `<span class='serviceTime'>` inside it. Entries whose time
 * text cannot be read as a clock time (e.g. "Noon") are skipped rather than
 * stored as free text.
 *
 * @param html Full page HTML.
 * @returns Parsed masses in page order; empty when the block is not found.
 */
function parseMassTimes(html: string): ParsedMass[] {
  // Only the first 100,000 characters of the page are searched.
  const safeHtml = html.length > 100_000 ? html.slice(0, 100_000) : html;
  const massUlMatch = safeHtml.match(/<ul>\s*<li>\s*<h5>Mass Times<\/h5>[\s\S]*?<\/ul>/);
  if (!massUlMatch) return [];

  const results: ParsedMass[] = [];
  // The chunk before the first <li> is the <ul> opening tag, so drop it.
  const listItems = massUlMatch[0].split(/<li[^>]*>/).slice(1);

  for (const li of listItems) {
    const labelMatch = li.match(/<span class="label">([^<]+)<\/span>/);
    if (!labelMatch) continue;
    const dayLabel = labelMatch[1].trim();
    // Own-property check: a label such as "constructor" must not resolve to an inherited member.
    if (!Object.prototype.hasOwnProperty.call(FULL_DAY_NAMES, dayLabel)) continue;
    const dayOfWeek = FULL_DAY_NAMES[dayLabel];

    const serviceTimes = li.split("<span class='serviceTime'>").slice(1);
    for (const st of serviceTimes) {
      const timeMatch = st.match(/<span class='time'>([^<]+)<\/span>/);
      if (!timeMatch) continue;
      const time = convertTo24h(timeMatch[1].trim());
      // convertTo24h hands back unrecognised text as-is; keep only clock-shaped values.
      if (!/^\d{1,2}:\d{2}$/.test(time)) continue;

      const langMatch = st.match(/<span class='language'>\(([^)]+)\)<\/span>/);
      const language = langMatch ? langMatch[1].trim() : 'English';
      const commentMatch = st.match(/<span class='comment'>([^<]+)<\/span>/);
      const notes = commentMatch ? commentMatch[1].trim() : undefined;
      results.push({ dayOfWeek, time, language, notes });
    }
  }
  return results;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Parse the `<span class='serviceTime'>` entries of one "Other Services" <li>.
 *
 * Each entry is expected to start with a day prefix and a colon ("Sat:",
 * "Weekdays:") followed by a time or time range. Entries with an unknown
 * prefix, or whose times cannot be read as clock times, are skipped. A prefix
 * covering several days yields one item per day.
 */
function parseServiceItems(liHtml: string): ParsedConf[] {
  const items: ParsedConf[] = [];
  const clockShaped = /^\d{1,2}:\d{2}$/;

  for (const st of liHtml.split("<span class='serviceTime'>").slice(1)) {
    const dayTimeMatch = st.match(/^([A-Za-z]+):\s*<span class='time'>([^<]+)<\/span>/);
    if (!dayTimeMatch) continue;
    const days = expandDayAbbrev(dayTimeMatch[1].trim());
    if (days.length === 0) continue;

    const [startTime, endTime] = parseTimeRange(dayTimeMatch[2]);
    // Unrecognised time text comes back unconverted; do not store it as a time.
    if (!clockShaped.test(startTime) || !clockShaped.test(endTime)) continue;

    const commentMatch = st.match(/<span class='comment'>([^<]+)<\/span>/);
    const notes = commentMatch ? commentMatch[1].trim() : undefined;
    for (const dayOfWeek of days) {
      items.push({ dayOfWeek, startTime, endTime, notes });
    }
  }
  return items;
}

/**
 * Parse confessions and adoration from the "Other Services" <ul> block.
 *
 * @param html Full page HTML.
 * @returns Confession and adoration slots taken from the first
 *   `<li class="Confessions">` and `<li class="Adoration">` items; both arrays
 *   are empty when the block is not found.
 */
function parseOtherServices(html: string): { confessions: ParsedConf[]; adorations: ParsedAdoration[] } {
  // Only the first 100,000 characters of the page are searched.
  const safeHtml = html.length > 100_000 ? html.slice(0, 100_000) : html;
  const otherUlMatch = safeHtml.match(/<ul>\s*<li>\s*<h5>Other Services<\/h5>[\s\S]*?<\/ul>/);
  if (!otherUlMatch) return { confessions: [], adorations: [] };
  const otherUl = otherUlMatch[0];

  const confMatch = otherUl.match(/<li class="Confessions">[\s\S]*?<\/li>/);
  const confessions: ParsedConf[] = confMatch ? parseServiceItems(confMatch[0]) : [];

  const adorMatch = otherUl.match(/<li class="Adoration">[\s\S]*?<\/li>/);
  const adorations: ParsedAdoration[] = adorMatch ? parseServiceItems(adorMatch[0]) : [];

  return { confessions, adorations };
}
|
|
|
|
|
|
|
|
|
|
// ─── HTTP Helpers ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/**
 * Fetch a URL and return its body as text.
 *
 * The request identifies itself with USER_AGENT and is aborted after
 * `timeoutMs`, so one unresponsive connection cannot stall the whole crawl.
 *
 * @param url Absolute URL to fetch.
 * @param timeoutMs Maximum time allowed for the request, in milliseconds.
 * @returns The response body.
 * @throws Error on a non-2xx status, a network failure, or a timeout.
 */
async function fetchHtml(url: string, timeoutMs = 30_000): Promise<string> {
  const res = await fetch(url, {
    headers: { 'User-Agent': USER_AGENT },
    signal: AbortSignal.timeout(timeoutMs),
  });
  if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`);
  return res.text();
}
|
|
|
|
|
|
|
|
|
|
/**
 * Resolve after the given delay.
 *
 * @param ms Delay in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
|
|
|
|
|
|
|
|
// ─── Sitemap Enumeration ──────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/**
 * Collect every church page URL listed in the site's post sitemaps.
 *
 * Sitemaps 1..SITEMAP_COUNT are fetched one at a time. REQUEST_DELAY_MS is
 * waited after each fetch, including the last, so the sitemap requests, and
 * whatever the caller requests next, are spaced like the page requests.
 *
 * @returns Church URLs in sitemap order.
 * @throws Error if any sitemap cannot be fetched.
 */
async function getAllChurchUrls(): Promise<string[]> {
  const urls: string[] = [];
  for (let i = 1; i <= SITEMAP_COUNT; i++) {
    const sitemapUrl = `${SITE_BASE}/wp-sitemap-posts-item-${i}.xml`;
    console.log(`Fetching sitemap ${i}/${SITEMAP_COUNT}...`);
    const xml = await fetchHtml(sitemapUrl);
    for (const match of xml.matchAll(/<loc>(https:\/\/discovermass\.com\/church\/[^<]+)<\/loc>/g)) {
      urls.push(match[1]);
    }
    await sleep(REQUEST_DELAY_MS);
  }
  console.log(`Total church URLs: ${urls.length}`);
  return urls;
}
|
|
|
|
|
|
|
|
|
|
// ─── DB Helpers ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/**
 * Load every church with country 'US' from the database, selecting the
 * identity, coordinate, contact and per-source id fields listed below, for
 * use as the duplicate-matching pool.
 *
 * @returns The selected rows, cast to ExistingChurch.
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing US churches from DB...');
  const churches = await prisma.church.findMany({
    where: { country: 'US' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External ids assigned by the individual import sources.
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });
  console.log(`Loaded ${churches.length} existing US churches`);
  return churches as ExistingChurch[];
}
|
|
|
|
|
|
|
|
|
|
// ─── Church Processing ────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/**
 * Report whether `err` is a database unique-constraint violation.
 * Checks Prisma's P2002 error code as well as the message text.
 */
function isUniqueConstraintError(err: unknown): boolean {
  if (!(err instanceof Error)) return false;
  const code = (err as { code?: unknown }).code;
  return code === 'P2002' || err.message.includes('Unique constraint');
}

/**
 * Fetch one church page, parse it, and create or update the matching record.
 *
 * - Unparseable pages are counted as skipped.
 * - In dry-run mode the parsed counts are logged and nothing is written.
 * - When `findDuplicateChurch` finds a match, the existing church is tagged
 *   with the discovermass slug, missing phone/website/coordinates are filled
 *   in, and each schedule type that was parsed replaces the stored one.
 * - Otherwise a new church and its schedules are inserted.
 *
 * Both write paths run in a single transaction, so a church is never stored
 * (and marked as imported) without its schedules. Unique-constraint conflicts
 * are counted as skipped; any other error is counted in `stats.errors` and
 * logged, never rethrown.
 *
 * @param url Church page URL from the sitemap.
 * @param existingChurches In-memory matching pool; newly created churches are appended.
 * @param args CLI options (only `dryRun` is read here).
 * @param stats Counters updated in place.
 */
async function processChurch(
  url: string,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
): Promise<void> {
  const slug = url.replace('https://discovermass.com/church/', '').replace(/\/$/, '');
  stats.total++;

  try {
    const html = await fetchHtml(url);
    const parsed = parseChurch(html);
    if (!parsed) {
      console.log(` [skip] Could not parse: ${slug}`);
      stats.skipped++;
      return;
    }

    const masses = parseMassTimes(html);
    const { confessions, adorations } = parseOtherServices(html);

    if (args.dryRun) {
      console.log(` [dry-run] ${parsed.name} — ${masses.length} masses, ${confessions.length} confessions, ${adorations.length} adorations`);
      return;
    }

    // Row builders shared by the update and create paths.
    const massRows = <Id>(churchId: Id) =>
      masses.map((m) => ({ churchId, dayOfWeek: m.dayOfWeek, time: m.time, language: m.language, notes: m.notes ?? null }));
    const confessionRows = <Id>(churchId: Id) =>
      confessions.map((c) => ({ churchId, dayOfWeek: c.dayOfWeek, startTime: c.startTime, endTime: c.endTime, notes: c.notes ?? null }));
    const adorationRows = <Id>(churchId: Id) =>
      adorations.map((a) => ({ churchId, dayOfWeek: a.dayOfWeek, startTime: a.startTime, endTime: a.endTime, notes: a.notes ?? null }));

    const candidate = { name: parsed.name, lat: parsed.lat, lng: parsed.lng, discovermassId: slug };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (duplicate) {
      // Only fill in fields the existing record lacks; never overwrite present data.
      const updateData: Record<string, unknown> = { discovermassId: slug, lastScrapedAt: new Date() };
      if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
      if (!duplicate.website && parsed.website) {
        updateData.website = parsed.website;
        updateData.hasWebsite = true;
      }
      // 0 is the "no coordinates" sentinel on both sides.
      if (parsed.lat !== 0 && duplicate.latitude === 0) {
        updateData.latitude = parsed.lat;
        updateData.longitude = parsed.lng;
      }

      try {
        await prisma.$transaction(async (tx) => {
          await tx.church.update({ where: { id: duplicate.id }, data: updateData });
          // A schedule type is replaced only when this page supplied entries for it.
          if (masses.length > 0) {
            await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.massSchedule.createMany({ data: massRows(duplicate.id) });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.confessionSchedule.createMany({ data: confessionRows(duplicate.id) });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.adorationSchedule.createMany({ data: adorationRows(duplicate.id) });
          }
        });
        duplicate.discovermassId = slug;
        stats.updated++;
      } catch (err) {
        if (isUniqueConstraintError(err)) {
          console.log(` [skip] Unique constraint conflict: ${slug}`);
          stats.skipped++;
          return;
        }
        throw err;
      }
    } else {
      try {
        const church = await prisma.$transaction(async (tx) => {
          const created = await tx.church.create({
            data: {
              name: parsed.name,
              address: parsed.address,
              city: parsed.city,
              state: parsed.state,
              zip: parsed.zip,
              country: 'US',
              phone: parsed.phone,
              website: parsed.website,
              hasWebsite: !!parsed.website,
              latitude: parsed.lat,
              longitude: parsed.lng,
              discovermassId: slug,
              source: 'discovermass',
              lastScrapedAt: new Date(),
            },
          });
          if (masses.length > 0) {
            await tx.massSchedule.createMany({ data: massRows(created.id) });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.createMany({ data: confessionRows(created.id) });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.createMany({ data: adorationRows(created.id) });
          }
          return created;
        });

        // Register the new church only once it is committed, so later pages can match against it.
        existingChurches.push({
          id: church.id, name: parsed.name, latitude: parsed.lat, longitude: parsed.lng,
          osmId: null, baiduId: null, masstimesId: null, orarimesseId: null,
          massSchedulesPhId: null, philmassId: null, horariosMisasId: null,
          mszeInfoId: null, weekdayMassesId: null, messesInfoId: null,
          bohosluzbyId: null, miserendId: null, kerknetId: null,
          gottesdienstzeitenId: null, discovermassId: slug,
          source: 'discovermass', website: parsed.website, phone: parsed.phone,
          address: parsed.address, country: 'US',
        });
        stats.created++;
      } catch (err) {
        if (isUniqueConstraintError(err)) {
          console.log(` [skip] Unique constraint conflict: ${slug}`);
          stats.skipped++;
          return;
        }
        throw err;
      }
    }

    stats.massSchedulesCreated += masses.length;
    stats.confessionSchedulesCreated += confessions.length;
    stats.adorationSchedulesCreated += adorations.length;

    console.log(
      ` [${duplicate ? 'update' : 'create'}] ${parsed.name} — ` +
      `${masses.length}M ${confessions.length}C ${adorations.length}A — ` +
      `${stats.total} total (${stats.created} new, ${stats.updated} upd, ${stats.errors} err)`
    );
  } catch (err) {
    stats.errors++;
    console.error(` [error] ${slug}: ${err instanceof Error ? err.message : err}`);
  }
}
|
|
|
|
|
|
|
|
|
|
// ─── CLI Parsing ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/**
 * Parse the process's command-line arguments into CLIArgs.
 *
 * Recognised flags: `--all`, `--dry-run`, `--resume-from N`, `--limit N`,
 * `--job-id UUID`. Unknown arguments are reported with a warning and ignored.
 *
 * @returns The parsed options; `all` and `dryRun` default to false.
 * @throws Error when a flag that needs a value has none, when
 *   `--resume-from` is not a non-negative integer, or when `--limit` is not a
 *   positive integer. A mistyped number would otherwise silently turn into an
 *   unrestricted crawl.
 */
function parseCLIArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Return the value following `flag`, refusing a missing value or another flag.
  const readValue = (flag: string, index: number): string => {
    const value = argv[index];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  // Return the value following `flag` as an integer >= `min`.
  const readInteger = (flag: string, index: number, min: number): number => {
    const raw = readValue(flag, index);
    const parsed = /^\d+$/.test(raw) ? parseInt(raw, 10) : NaN;
    if (!Number.isSafeInteger(parsed) || parsed < min) {
      throw new Error(`Invalid value for ${flag}: "${raw}" (expected an integer >= ${min})`);
    }
    return parsed;
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    switch (flag) {
      case '--all': result.all = true; break;
      case '--dry-run': result.dryRun = true; break;
      case '--resume-from': result.resumeFrom = readInteger(flag, ++i, 0); break;
      case '--limit': result.limit = readInteger(flag, ++i, 1); break;
      case '--job-id': result.jobId = readValue(flag, ++i); break;
      default: console.warn(`Ignoring unknown argument: ${flag}`);
    }
  }
  return result;
}
|
|
|
|
|
|
|
|
|
|
// ─── Main ─────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/**
 * Entry point: enumerate church URLs, import those not yet imported, and
 * report the outcome.
 *
 * Steps:
 *  1. Parse CLI options; exit with usage text unless `--all` was given.
 *  2. If `--job-id` was given, mark that background job as running.
 *  3. Load sitemap URLs and existing US churches, drop URLs whose slug is
 *     already stored as a discovermassId, then apply `--resume-from` and
 *     `--limit`.
 *  4. Process each remaining URL, waiting REQUEST_DELAY_MS between requests.
 *  5. Always print the summary, update the background job and close the
 *     database connections, even when the run aborts.
 *
 * The job is marked `failed` when the run aborts with an error or when more
 * than 10% of processed churches failed; otherwise `completed`.
 *
 * @throws Re-throws any error raised while enumerating or loading, after cleanup.
 */
async function main(): Promise<void> {
  const args = parseCLIArgs();

  if (!args.all) {
    console.error('Usage: npx tsx scripts/import-discovermass.ts --all [--dry-run] [--resume-from N] [--limit N] [--job-id UUID]');
    process.exit(1);
  }

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch { /* Job might not exist yet */ }
  }

  const stats: ImportStats = {
    total: 0, created: 0, updated: 0, skipped: 0, errors: 0,
    massSchedulesCreated: 0, confessionSchedulesCreated: 0, adorationSchedulesCreated: 0,
  };
  // Set when the run aborts, so the job is not reported as completed.
  let aborted = false;

  try {
    const urls = await getAllChurchUrls();
    const existingChurches = await loadExistingChurches();

    // Skip already-imported churches — check discovermassId set in DB
    const importedSlugs = new Set<string>();
    for (const existing of existingChurches) {
      if (existing.discovermassId) importedSlugs.add(existing.discovermassId);
    }

    // Apply --resume-from first, then filter to unimported, then apply --limit
    const startIdx = args.resumeFrom ?? 0;
    const candidateUrls = urls.slice(startIdx).filter((url) => {
      const slug = url.replace('https://discovermass.com/church/', '').replace(/\/$/, '');
      return !importedSlugs.has(slug);
    });
    const churchUrls = args.limit ? candidateUrls.slice(0, args.limit) : candidateUrls;

    console.log(`\nSitemap total: ${urls.length} | Already imported: ${importedSlugs.size} | This run: ${churchUrls.length}${args.limit ? ` (limit ${args.limit})` : ''}\n`);

    for (let i = 0; i < churchUrls.length; i++) {
      const url = churchUrls[i];
      console.log(`[${i + 1}/${churchUrls.length}] ${url}`);
      await processChurch(url, existingChurches, args, stats);
      // No need to wait after the final request.
      if (i < churchUrls.length - 1) {
        await sleep(REQUEST_DELAY_MS);
      }
    }
  } catch (err) {
    aborted = true;
    throw err;
  } finally {
    console.log(
      aborted
        ? '\n─── Import Aborted ────────────────────────────────────────'
        : '\n─── Import Complete ───────────────────────────────────────'
    );
    console.log(`Total processed: ${stats.total}`);
    console.log(`Created: ${stats.created}`);
    console.log(`Updated: ${stats.updated}`);
    console.log(`Skipped: ${stats.skipped}`);
    console.log(`Errors: ${stats.errors}`);
    console.log(`Mass schedules: ${stats.massSchedulesCreated}`);
    console.log(`Confession sched: ${stats.confessionSchedulesCreated}`);
    console.log(`Adoration sched: ${stats.adorationSchedulesCreated}`);

    if (args.jobId) {
      const status = aborted || stats.errors > stats.total * 0.1 ? 'failed' : 'completed';
      try {
        await prisma.backgroundJob.update({
          where: { id: args.jobId },
          data: {
            status,
            completedAt: new Date(),
            processed: stats.total,
            succeeded: stats.created + stats.updated,
            failed: stats.errors,
            itemsFound: stats.massSchedulesCreated,
          },
        });
      } catch (err) {
        // Best effort: the import result stands even if the job row cannot be updated.
        console.warn(`Could not update background job ${args.jobId}: ${err instanceof Error ? err.message : err}`);
      }
    }

    await prisma.$disconnect();
    await pool.end();
  }
}
|
|
|
|
|
|
|
|
|
|
// Run the importer; any error that escapes main() is logged and the process exits with status 1.
main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});
|