feat: add import-discovermass.ts — USA church importer with 10s crawl delay
Imports 20,284 US Catholic churches from discovermass.com including mass, confession, and adoration schedules. Respects robots.txt Crawl-delay: 10. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
616
scripts/import-discovermass.ts
Normal file
616
scripts/import-discovermass.ts
Normal file
@@ -0,0 +1,616 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import Catholic churches and mass schedules from discovermass.com (USA)
|
||||
*
|
||||
* discovermass.com is a US Catholic church directory with 20,284 churches.
|
||||
* Data includes name, address, phone, website, coordinates, mass times,
|
||||
* confessions, and adoration schedules.
|
||||
*
|
||||
* robots.txt specifies Crawl-delay: 10 — this importer follows that rule.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/import-discovermass.ts --all
|
||||
* npx tsx scripts/import-discovermass.ts --all --dry-run
|
||||
* npx tsx scripts/import-discovermass.ts --all --resume-from 5000
|
||||
* npx tsx scripts/import-discovermass.ts --all --job-id {uuid}
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
// Load environment files from the working directory: .env.local first, then .env.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Connection string: DATABASE_URL when set, otherwise a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Hide the password part of the URL before echoing it to the console.
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);

// URLs containing "neon" get SSL without certificate verification; every other
// target leaves the ssl option unset.
const sslOptions = dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined;
const pool = new Pool({ connectionString: dbUrl, ssl: sslOptions });

// Prisma runs on top of the pg pool through the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
|
||||
// ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Origin of the crawled site; sitemap URLs are built from it. */
const SITE_BASE = 'https://discovermass.com';

/** How many numbered sitemap files (wp-sitemap-posts-item-1 … N) are read. */
const SITEMAP_COUNT = 11;

/** User-Agent header sent with every HTTP request made by this script. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Pause between requests in milliseconds — Crawl-delay: 10 from robots.txt. */
const REQUEST_DELAY_MS = 10 * 1_000;
|
||||
|
||||
// ─── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Church metadata extracted from one listing page. */
interface ParsedChurch {
  name: string;
  /** Address parts are null when they could not be extracted. */
  address: string | null;
  city: string | null;
  state: string | null;
  zip: string | null;
  phone: string | null;
  website: string | null;
  /** Latitude/longitude; parseChurch leaves both at 0 when the page has none. */
  lat: number;
  lng: number;
}

/** One scheduled mass. */
interface ParsedMass {
  /** 0=Sun, 1=Mon, ..., 6=Sat */
  dayOfWeek: number;
  /** HH:MM 24-hour */
  time: string;
  language: string;
  notes?: string;
}

/** A recurring weekly time window; shared shape of confession and adoration entries. */
interface ScheduleWindow {
  /** 0=Sun, 1=Mon, ..., 6=Sat */
  dayOfWeek: number;
  /** HH:MM 24-hour */
  startTime: string;
  /** HH:MM 24-hour */
  endTime: string;
  notes?: string;
}

/** One confession time window. */
type ParsedConf = ScheduleWindow;

/** One adoration time window. */
type ParsedAdoration = ScheduleWindow;

/** Running counters for one import run. */
interface ImportStats {
  total: number;
  created: number;
  updated: number;
  skipped: number;
  errors: number;
  massSchedulesCreated: number;
  confessionSchedulesCreated: number;
  adorationSchedulesCreated: number;
}

/** Options recognised on the command line. */
interface CLIArgs {
  all: boolean;
  dryRun: boolean;
  /** Index into the sitemap URL list at which processing starts. */
  resumeFrom?: number;
  /** Id of the backgroundJob row that tracks this run. */
  jobId?: string;
  testParse?: boolean;
}
|
||||
|
||||
// ─── Day Mappings ─────────────────────────────────────────────────────────────
|
||||
|
||||
// Full weekday names, as they appear in mass schedule <li> labels, mapped to
// their dayOfWeek number (0 = Sunday … 6 = Saturday).
const FULL_DAY_NAMES: Record<string, number> = Object.fromEntries(
  ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'].map(
    (dayName, dayIndex): [string, number] => [dayName, dayIndex],
  ),
);

// Day prefixes used in confession/adoration serviceTime text. Each prefix
// expands to one or more dayOfWeek numbers; "Thr" and "Thu" both mean Thursday.
const ABBREV_DAY_NAMES: Record<string, number[]> = {
  Sun: [0],
  Mon: [1],
  Tue: [2],
  Wed: [3],
  Thr: [4],
  Thu: [4],
  Fri: [5],
  Sat: [6],
  Weekdays: [1, 2, 3, 4, 5],
  Daily: [0, 1, 2, 3, 4, 5, 6],
};
|
||||
|
||||
// ─── Time Utilities ───────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Convert a 12-hour clock time to "HH:MM" 24-hour format.
 *
 * Accepted spellings (case-insensitive, surrounding whitespace ignored):
 * "5:00pm", "5:00 PM", "5pm", "8:30 a.m.". "12:00pm" becomes "12:00" and
 * "12:00am" becomes "00:00".
 *
 * @param timeStr Raw time text.
 * @returns The "HH:MM" form, or — when the text is not a recognisable and
 *          valid clock time — the trimmed, lower-cased input.
 */
function convertTo24h(timeStr: string): string {
  const cleaned = timeStr.trim().toLowerCase();
  // Minutes are optional ("7pm"); the meridiem may be spaced and/or dotted.
  const m = cleaned.match(/^(\d{1,2})(?::(\d{2}))?\s*([ap])\.?m\.?$/);
  if (!m) return cleaned;

  const hour12 = parseInt(m[1] ?? '', 10);
  const minuteText = m[2] ?? '00';
  // Refuse impossible readings such as "13:00pm" or "9:75am" instead of
  // producing values like "25:00".
  if (hour12 > 12 || parseInt(minuteText, 10) > 59) return cleaned;

  // 12 wraps to 0, so 12am → 00 and 12pm → 12 after the pm offset is added.
  const hours = (hour12 % 12) + (m[3] === 'p' ? 12 : 0);
  return `${String(hours).padStart(2, '0')}:${minuteText}`;
}

/**
 * Parse a time range such as "8:30am-9:00am" into ["08:30", "09:00"].
 *
 * The two times may be separated by a hyphen, en dash or em dash, with or
 * without surrounding spaces, and the start time does not need a colon
 * ("8am-9:00am"). Text without a separator is treated as a single time and
 * returned as both start and end.
 *
 * @param rangeStr Raw range text.
 * @returns [start, end], each converted with convertTo24h.
 */
function parseTimeRange(rangeStr: string): [string, string] {
  const text = rangeStr.trim();
  // Lazy first group: split at the FIRST separator that has text before it.
  const parts = text.match(/^(.+?)\s*[-\u2013\u2014]\s*(.+)$/);
  if (!parts) {
    const only = convertTo24h(text);
    return [only, only];
  }
  return [convertTo24h(parts[1] ?? ''), convertTo24h(parts[2] ?? '')];
}
|
||||
|
||||
/**
 * Expand a day prefix to the dayOfWeek numbers it stands for.
 *
 * The prefix is matched against ABBREV_DAY_NAMES first ("Sat", "Weekdays",
 * "Daily", …) and then against FULL_DAY_NAMES ("Saturday"). Matching ignores
 * letter case and surrounding whitespace.
 *
 * @param prefix Day label taken from scraped text.
 * @returns A fresh array of dayOfWeek numbers (0 = Sunday); empty when the
 *          prefix is not recognised.
 */
function expandDayAbbrev(prefix: string): number[] {
  const token = prefix.trim();
  if (token === '') return [];

  // Both tables use "Capitalised" keys, so normalise to that form.
  const key = token.charAt(0).toUpperCase() + token.slice(1).toLowerCase();

  // Own-property checks keep inherited names such as "constructor" from
  // being mistaken for table entries.
  const owns = (table: object, name: string): boolean =>
    Object.prototype.hasOwnProperty.call(table, name);

  if (owns(ABBREV_DAY_NAMES, key)) {
    return [...(ABBREV_DAY_NAMES[key] ?? [])];
  }
  if (owns(FULL_DAY_NAMES, key)) {
    const day = FULL_DAY_NAMES[key];
    return day === undefined ? [] : [day];
  }
  return [];
}
|
||||
|
||||
// ─── Address Parsing ──────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Split a one-line US address such as
 * "14085 Peyton Drive, Chino Hills, CA 91709" into its components.
 *
 * The last comma-separated part must be "<2-letter state> <ZIP or ZIP+4>" and
 * the part before it is the city; everything earlier is the street address
 * (which may itself contain commas, e.g. a suite number). Whitespace around
 * commas is optional and the state code is upper-cased.
 *
 * @param raw Address text.
 * @returns The components. When the text does not have at least three parts
 *          or does not end in a state + ZIP, `address` is the unmodified
 *          input and the other fields are null.
 */
function parseAddress(raw: string): { address: string | null; city: string | null; state: string | null; zip: string | null } {
  const unparsed = { address: raw, city: null, state: null, zip: null };

  // Tolerate "a,b", "a, b" and "a ,  b" alike; drop empty parts left by ",,".
  const parts = raw
    .split(',')
    .map(part => part.trim())
    .filter(part => part !== '');
  if (parts.length < 3) return unparsed;

  const stateZipMatch = (parts[parts.length - 1] ?? '').match(/^([A-Za-z]{2})\s+(\d{5}(?:-\d{4})?)$/);
  if (!stateZipMatch) return unparsed;

  return {
    address: parts.slice(0, parts.length - 2).join(', '),
    city: parts[parts.length - 2] ?? null,
    state: (stateZipMatch[1] ?? '').toUpperCase(),
    zip: stateZipMatch[2] ?? null,
  };
}
|
||||
|
||||
// ─── HTML Parsing ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Replace HTML character references in scraped text with the characters they
 * stand for: numeric forms (`&#8217;`, `&#x2019;`) and the named forms
 * amp, lt, gt, quot, apos and nbsp (nbsp becomes a plain space). Anything
 * else is left as written. The text is decoded in one pass, so `&amp;lt;`
 * yields `&lt;`, not `<`.
 */
function decodeHtmlEntities(text: string): string {
  const named = new Map<string, string>([
    ['amp', '&'], ['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"], ['nbsp', ' '],
  ]);
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const code = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      // Only valid Unicode code points can be converted.
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    }
    return named.get(body.toLowerCase()) ?? whole;
  });
}

/**
 * Parse church metadata from page HTML.
 *
 * - name: the og:title meta tag (required).
 * - address: the element with id="theaddress" when present, otherwise the
 *   first "number street, city, ST zip" run found between tags.
 * - phone / website: the side-phone and side-website spans.
 * - coordinates: the "daddr=lat,lng" query parameter; both stay 0 when it is
 *   missing or out of range.
 *
 * Text fields have HTML character references decoded.
 *
 * @param html Full page HTML.
 * @returns The parsed church, or null when there is no og:title or the title
 *          is empty or just "Discover Mass".
 */
function parseChurch(html: string): ParsedChurch | null {
  const nameMatch = html.match(/<meta property="og:title" content="([^"]+)"/);
  if (!nameMatch) return null;
  const name = decodeHtmlEntities(nameMatch[1]).trim();
  if (!name || name === 'Discover Mass') return null;

  // Find one raw address line, then split it into components in one place.
  let rawAddress: string | null = null;
  // Try structured <h2 id="theaddress"> element first (most reliable)
  const addrElemMatch = html.match(/id="theaddress"[^>]*>([^<]+)<\/h2>/);
  if (addrElemMatch) {
    rawAddress = decodeHtmlEntities(addrElemMatch[1]).trim();
  } else {
    // Fallback: scan for street address pattern in text
    const addrMatch = html.match(/>\s*(\d+\s[^<\n,]{5,}),\s*([^<,\n]+),\s*([A-Z]{2})\s+(\d{5}(?:-\d{4})?)\s*</);
    if (addrMatch) {
      rawAddress = decodeHtmlEntities(
        `${addrMatch[1].trim()}, ${addrMatch[2].trim()}, ${addrMatch[3]} ${addrMatch[4]}`,
      );
    }
  }
  const { address, city, state, zip } =
    rawAddress !== null
      ? parseAddress(rawAddress)
      : { address: null, city: null, state: null, zip: null };

  const phoneMatch = html.match(/<span class='side-phone attribute'>([^<]+)<\/span>/);
  const phone = phoneMatch ? decodeHtmlEntities(phoneMatch[1]).trim() : null;

  // href values carry "&" as "&amp;" / "&#038;", so decode them as well.
  const websiteMatch = html.match(/<span class='side-website attribute'><a href='([^']+)'/);
  const website = websiteMatch ? decodeHtmlEntities(websiteMatch[1]).trim() : null;

  let lat = 0;
  let lng = 0;
  const coordMatch = html.match(/daddr=([-\d.]+),([-\d.]+)/);
  if (coordMatch) {
    const rawLat = parseFloat(coordMatch[1]);
    const rawLng = parseFloat(coordMatch[2]);
    // Accept the pair only when both numbers are finite and within range.
    if (isFinite(rawLat) && isFinite(rawLng) && Math.abs(rawLat) <= 90 && Math.abs(rawLng) <= 180) {
      lat = rawLat;
      lng = rawLng;
    }
  }

  return { name, address, city, state, zip, phone, website, lat, lng };
}
|
||||
|
||||
/**
 * Parse the mass schedule from the "Mass Times" <ul> block.
 *
 * Each <li> whose label is a full weekday name contributes one entry per
 * serviceTime span. The language defaults to "English" when no language span
 * is present. Entries whose time text cannot be converted to "HH:MM" are
 * skipped, so every returned `time` honours the ParsedMass contract.
 *
 * @param html Full page HTML; only the first 100,000 characters are searched.
 * @returns The parsed masses, in page order; empty when the block is missing.
 */
function parseMassTimes(html: string): ParsedMass[] {
  const safeHtml = html.length > 100_000 ? html.slice(0, 100_000) : html;
  const massUlMatch = safeHtml.match(/<ul>\s*<li>\s*<h5>Mass Times<\/h5>[\s\S]*?<\/ul>/);
  if (!massUlMatch) return [];

  const results: ParsedMass[] = [];
  // The text before the first <li> is not a list item, hence slice(1).
  for (const li of massUlMatch[0].split(/<li[^>]*>/).slice(1)) {
    const labelMatch = li.match(/<span class="label">([^<]+)<\/span>/);
    if (!labelMatch) continue;
    const dayOfWeek = FULL_DAY_NAMES[labelMatch[1].trim()];
    // typeof check also rejects inherited members such as "constructor".
    if (typeof dayOfWeek !== 'number') continue;

    for (const st of li.split("<span class='serviceTime'>").slice(1)) {
      const timeMatch = st.match(/<span class='time'>([^<]+)<\/span>/);
      if (!timeMatch) continue;
      const time = convertTo24h(timeMatch[1].trim());
      // convertTo24h echoes text it cannot convert; do not store that as a time.
      if (!/^\d{2}:\d{2}$/.test(time)) continue;

      const langMatch = st.match(/<span class='language'>\(([^)]+)\)<\/span>/);
      const language = langMatch ? langMatch[1].trim() : 'English';
      const commentMatch = st.match(/<span class='comment'>([^<]+)<\/span>/);
      const notes = commentMatch ? commentMatch[1].trim() : undefined;
      results.push({ dayOfWeek, time, language, notes });
    }
  }
  return results;
}
|
||||
|
||||
/**
 * Extract confession and adoration time windows from the "Other Services"
 * <ul> block.
 *
 * Only the first 100,000 characters of the page are searched. Within the
 * block, the first <li class="Confessions"> and the first
 * <li class="Adoration"> are read; each serviceTime span of the form
 * "<Day>: <time range>" yields one entry per day the prefix expands to.
 *
 * @param html Full page HTML.
 * @returns Both lists; each is empty when its section is absent.
 */
function parseOtherServices(html: string): { confessions: ParsedConf[]; adorations: ParsedAdoration[] } {
  type ServiceItem = { dayOfWeek: number; startTime: string; endTime: string; notes?: string };

  // Turns the HTML of one <li> into its list of time windows.
  const collectItems = (liHtml: string): ServiceItem[] => {
    const found: ServiceItem[] = [];
    // The text before the first serviceTime span carries no entry.
    for (const chunk of liHtml.split("<span class='serviceTime'>").slice(1)) {
      const head = chunk.match(/^([A-Za-z]+):\s*<span class='time'>([^<]+)<\/span>/);
      if (!head) continue;
      const days = expandDayAbbrev(head[1].trim());
      if (days.length === 0) continue;
      const [startTime, endTime] = parseTimeRange(head[2]);
      const comment = chunk.match(/<span class='comment'>([^<]+)<\/span>/);
      const notes = comment ? comment[1].trim() : undefined;
      found.push(...days.map(dayOfWeek => ({ dayOfWeek, startTime, endTime, notes })));
    }
    return found;
  };

  const searchable = html.length > 100_000 ? html.slice(0, 100_000) : html;
  const block = searchable.match(/<ul>\s*<li>\s*<h5>Other Services<\/h5>[\s\S]*?<\/ul>/);
  if (!block) return { confessions: [], adorations: [] };

  const confessionLi = block[0].match(/<li class="Confessions">[\s\S]*?<\/li>/);
  const confessions: ParsedConf[] = confessionLi ? collectItems(confessionLi[0]) : [];

  const adorationLi = block[0].match(/<li class="Adoration">[\s\S]*?<\/li>/);
  const adorations: ParsedAdoration[] = adorationLi ? collectItems(adorationLi[0]) : [];

  return { confessions, adorations };
}
|
||||
|
||||
// ─── HTTP Helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Download a URL and return the response body as text.
 *
 * The request is aborted when headers and body have not fully arrived within
 * `timeoutMs`, so a stalled connection cannot hang the crawl.
 *
 * @param url Absolute URL to fetch.
 * @param timeoutMs Time allowed for the whole request, in milliseconds.
 * @returns The response body.
 * @throws Error on a non-2xx status ("HTTP <status> for <url>"), on timeout,
 *         or on any network failure reported by fetch.
 */
async function fetchHtml(url: string, timeoutMs = 30_000): Promise<string> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const res = await fetch(url, {
      headers: { 'User-Agent': USER_AGENT },
      signal: controller.signal,
    });
    if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`);
    // Await here so the timeout also covers reading the body.
    return await res.text();
  } catch (err) {
    // Give aborts a message that names the cause and the URL.
    if (controller.signal.aborted) {
      throw new Error(`Timed out after ${timeoutMs}ms for ${url}`);
    }
    throw err;
  } finally {
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Return a promise that resolves after the given delay.
 *
 * @param ms Delay in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
// ─── Sitemap Enumeration ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Collect every church page URL listed in the site's numbered sitemaps
 * (wp-sitemap-posts-item-1.xml … wp-sitemap-posts-item-<SITEMAP_COUNT>.xml).
 *
 * Sitemap downloads count as requests to the site, so the crawl delay is
 * observed after each one — including the last, because the caller starts
 * fetching church pages as soon as this function returns.
 *
 * @returns Church URLs in sitemap order.
 * @throws Whatever fetchHtml throws when a sitemap cannot be downloaded.
 */
async function getAllChurchUrls(): Promise<string[]> {
  const urls: string[] = [];
  for (let i = 1; i <= SITEMAP_COUNT; i++) {
    const sitemapUrl = `${SITE_BASE}/wp-sitemap-posts-item-${i}.xml`;
    console.log(`Fetching sitemap ${i}/${SITEMAP_COUNT}...`);
    const xml = await fetchHtml(sitemapUrl);
    for (const match of xml.matchAll(/<loc>(https:\/\/discovermass\.com\/church\/[^<]+)<\/loc>/g)) {
      urls.push(match[1]);
    }
    // Respect robots.txt Crawl-delay before the next request to the site.
    await sleep(REQUEST_DELAY_MS);
  }
  console.log(`Total church URLs: ${urls.length}`);
  return urls;
}
|
||||
|
||||
// ─── DB Helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Read all churches with country 'US' from the database, selecting the
 * fields that are handed to the duplicate matcher (id, name, coordinates,
 * the per-source id columns, and contact/address fields).
 *
 * @returns The rows, typed as ExistingChurch.
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing US churches from DB...');

  const rows = await prisma.church.findMany({
    where: { country: 'US' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // One id column per upstream data source.
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });

  console.log(`Loaded ${rows.length} existing US churches`);
  return rows as ExistingChurch[];
}
|
||||
|
||||
// ─── Church Processing ────────────────────────────────────────────────────────
|
||||
|
||||
/** Build the createMany rows for the mass schedules of one church. */
function toMassRows<TId>(churchId: TId, masses: ParsedMass[]) {
  return masses.map(m => ({
    churchId,
    dayOfWeek: m.dayOfWeek,
    time: m.time,
    language: m.language,
    notes: m.notes ?? null,
  }));
}

/** Build the createMany rows for confession or adoration windows of one church. */
function toWindowRows<TId>(churchId: TId, windows: Array<ParsedConf | ParsedAdoration>) {
  return windows.map(w => ({
    churchId,
    dayOfWeek: w.dayOfWeek,
    startTime: w.startTime,
    endTime: w.endTime,
    notes: w.notes ?? null,
  }));
}

/**
 * Fetch one church page, parse it, and write the result to the database.
 *
 * - Unparseable pages are counted as skipped.
 * - With --dry-run the parse result is only logged.
 * - When findDuplicateChurch matches an existing church, that row gets the
 *   discovermass slug, any missing phone/website/coordinates, and — for each
 *   schedule kind that was found on the page — its schedules replaced.
 * - Otherwise a new church with its schedules is created.
 *
 * Each database write (update or create) runs in a single transaction, so a
 * church is never left with only part of its schedules written. A unique
 * constraint violation counts as skipped; any other failure is logged and
 * counted in `stats.errors` — this function does not throw.
 *
 * @param url Church page URL from the sitemap.
 * @param existingChurches In-memory list used for duplicate detection; new
 *        churches are appended and matched ones get their slug set.
 * @param args Parsed command-line options (only `dryRun` is read here).
 * @param stats Counters, updated in place.
 */
async function processChurch(
  url: string,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
): Promise<void> {
  const slug = url.replace('https://discovermass.com/church/', '').replace(/\/$/, '');
  stats.total++;

  try {
    const html = await fetchHtml(url);
    const parsed = parseChurch(html);
    if (!parsed) {
      console.log(`  [skip] Could not parse: ${slug}`);
      stats.skipped++;
      return;
    }

    const masses = parseMassTimes(html);
    const { confessions, adorations } = parseOtherServices(html);

    if (args.dryRun) {
      console.log(`  [dry-run] ${parsed.name} — ${masses.length} masses, ${confessions.length} confessions, ${adorations.length} adorations`);
      return;
    }

    const candidate = { name: parsed.name, lat: parsed.lat, lng: parsed.lng, discovermassId: slug };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    try {
      if (duplicate) {
        // Fill in only what the existing row is missing; never overwrite.
        const updateData: Record<string, unknown> = { discovermassId: slug, lastScrapedAt: new Date() };
        if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
        if (!duplicate.website && parsed.website) {
          updateData.website = parsed.website;
          updateData.hasWebsite = true;
        }
        // 0 is the "no coordinates" placeholder on both sides.
        if (parsed.lat !== 0 && duplicate.latitude === 0) {
          updateData.latitude = parsed.lat;
          updateData.longitude = parsed.lng;
        }

        await prisma.$transaction(async (tx) => {
          await tx.church.update({ where: { id: duplicate.id }, data: updateData });
          // A schedule kind is replaced only when the page supplied entries
          // for it; otherwise the existing rows are kept.
          if (masses.length > 0) {
            await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.massSchedule.createMany({ data: toMassRows(duplicate.id, masses) });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.confessionSchedule.createMany({ data: toWindowRows(duplicate.id, confessions) });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.deleteMany({ where: { churchId: duplicate.id } });
            await tx.adorationSchedule.createMany({ data: toWindowRows(duplicate.id, adorations) });
          }
        });

        duplicate.discovermassId = slug;
        stats.updated++;
      } else {
        // Church and schedules are created atomically, mirroring the update path.
        const church = await prisma.$transaction(async (tx) => {
          const created = await tx.church.create({
            data: {
              name: parsed.name,
              address: parsed.address,
              city: parsed.city,
              state: parsed.state,
              zip: parsed.zip,
              country: 'US',
              phone: parsed.phone,
              website: parsed.website,
              hasWebsite: !!parsed.website,
              latitude: parsed.lat,
              longitude: parsed.lng,
              discovermassId: slug,
              source: 'discovermass',
              lastScrapedAt: new Date(),
            },
          });
          if (masses.length > 0) {
            await tx.massSchedule.createMany({ data: toMassRows(created.id, masses) });
          }
          if (confessions.length > 0) {
            await tx.confessionSchedule.createMany({ data: toWindowRows(created.id, confessions) });
          }
          if (adorations.length > 0) {
            await tx.adorationSchedule.createMany({ data: toWindowRows(created.id, adorations) });
          }
          return created;
        });

        // Make the new church visible to duplicate detection for later URLs.
        existingChurches.push({
          id: church.id, name: parsed.name, latitude: parsed.lat, longitude: parsed.lng,
          osmId: null, baiduId: null, masstimesId: null, orarimesseId: null,
          massSchedulesPhId: null, philmassId: null, horariosMisasId: null,
          mszeInfoId: null, weekdayMassesId: null, messesInfoId: null,
          bohosluzbyId: null, miserendId: null, kerknetId: null,
          gottesdienstzeitenId: null, discovermassId: slug,
          source: 'discovermass', website: parsed.website, phone: parsed.phone,
          address: parsed.address, country: 'US',
        });
        stats.created++;
      }
    } catch (err) {
      // A unique-constraint hit is treated as "already imported", not as an error.
      if (err instanceof Error && err.message.includes('Unique constraint')) {
        stats.skipped++;
        return;
      }
      throw err;
    }

    stats.massSchedulesCreated += masses.length;
    stats.confessionSchedulesCreated += confessions.length;
    stats.adorationSchedulesCreated += adorations.length;

    console.log(
      `  [${duplicate ? 'update' : 'create'}] ${parsed.name} — ` +
      `${masses.length}M ${confessions.length}C ${adorations.length}A — ` +
      `${stats.total} total (${stats.created} new, ${stats.updated} upd, ${stats.errors} err)`
    );
  } catch (err) {
    stats.errors++;
    console.error(`  [error] ${slug}: ${err instanceof Error ? err.message : err}`);
  }
}
|
||||
|
||||
// ─── CLI Parsing ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse the script's command-line options from `process.argv`.
 *
 * Recognised flags: --all, --dry-run, --test-parse, --resume-from <N> and
 * --job-id <id>. Unknown arguments are reported with a warning and ignored.
 *
 * @returns The parsed options; `all` and `dryRun` default to false.
 * @throws Error when --resume-from or --job-id has no value, or when the
 *         --resume-from value is not a non-negative integer.
 */
function parseCLIArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false };

  // Returns the value that follows a flag, refusing to swallow another flag.
  const takeValue = (flag: string, index: number): string => {
    const value = argv[index];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    switch (flag) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = takeValue(flag, ++i);
        // Digits only: rejects negatives, fractions and trailing junk, all of
        // which would otherwise produce a wrong or NaN start index.
        if (!/^\d+$/.test(raw)) {
          throw new Error(`--resume-from expects a non-negative integer, got "${raw}"`);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = takeValue(flag, ++i);
        break;
      case '--test-parse':
        result.testParse = true;
        break;
      default:
        console.warn(`Ignoring unknown argument: ${flag}`);
    }
  }
  return result;
}
|
||||
|
||||
// ─── Test Parse ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fetch one fixed sample church page, run all three parsers on it, print the
 * results as indented JSON, then close the pool and exit with status 0.
 */
async function runTestParse() {
  const pretty = (value: unknown): string => JSON.stringify(value, null, 2);

  const testUrl = 'https://discovermass.com/church/st-paul-the-apostle-chino-hills/';
  console.log(`Fetching test page: ${testUrl}`);
  const html = await fetchHtml(testUrl);

  const church = parseChurch(html);
  const masses = parseMassTimes(html);
  const otherServices = parseOtherServices(html);
  const confessions = otherServices.confessions;
  const adorations = otherServices.adorations;

  console.log('Church:', pretty(church));
  console.log(`Masses (${masses.length}):`, pretty(masses));
  console.log(`Confessions (${confessions.length}):`, pretty(confessions));
  console.log(`Adorations (${adorations.length}):`, pretty(adorations));

  await pool.end();
  process.exit(0);
}
|
||||
|
||||
// ─── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point.
 *
 * 1. --test-parse: parse one sample page and exit.
 * 2. Without --all: print usage and exit with status 1.
 * 3. Otherwise: list all church URLs from the sitemaps, load existing US
 *    churches, and process each URL from the resume index onward, waiting
 *    REQUEST_DELAY_MS between churches.
 *
 * A summary is always printed and the database connections are always
 * closed. When --job-id is given, the backgroundJob row is marked "running"
 * at the start and "completed"/"failed" at the end; it is "failed" when the
 * run aborted with an exception or more than 10% of churches errored.
 * Failures to update the job row are logged but do not stop the import.
 */
async function main() {
  const args = parseCLIArgs();

  if (args.testParse) {
    await runTestParse();
    return;
  }

  if (!args.all) {
    console.error('Usage: npx tsx scripts/import-discovermass.ts --all [--dry-run] [--resume-from N] [--job-id UUID]');
    process.exit(1);
  }

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (err) {
      // Best effort: the job row might not exist yet.
      console.warn(`Could not mark job ${args.jobId} as running: ${err instanceof Error ? err.message : err}`);
    }
  }

  const stats: ImportStats = {
    total: 0, created: 0, updated: 0, skipped: 0, errors: 0,
    massSchedulesCreated: 0, confessionSchedulesCreated: 0, adorationSchedulesCreated: 0,
  };

  // Set when the run is cut short by an exception, so the summary and the
  // job status do not claim success.
  let aborted = false;

  try {
    const urls = await getAllChurchUrls();
    const existingChurches = await loadExistingChurches();

    // Ignore a NaN, fractional or negative resume index rather than letting
    // slice() reinterpret it.
    const requestedStart = args.resumeFrom ?? 0;
    const startIdx = Number.isInteger(requestedStart) && requestedStart > 0 ? requestedStart : 0;
    const churchUrls = urls.slice(startIdx);
    console.log(`\nProcessing ${churchUrls.length} churches (starting from index ${startIdx})...\n`);

    for (let i = 0; i < churchUrls.length; i++) {
      const url = churchUrls[i];
      const overallIdx = startIdx + i;
      console.log(`[${overallIdx + 1}/${urls.length}] ${url}`);
      await processChurch(url, existingChurches, args, stats);
      // No need to wait after the final church.
      if (i < churchUrls.length - 1) {
        await sleep(REQUEST_DELAY_MS);
      }
    }
  } catch (err) {
    aborted = true;
    throw err;
  } finally {
    console.log(
      aborted
        ? '\n─── Import Aborted ────────────────────────────────────────'
        : '\n─── Import Complete ───────────────────────────────────────'
    );
    console.log(`Total processed:  ${stats.total}`);
    console.log(`Created:          ${stats.created}`);
    console.log(`Updated:          ${stats.updated}`);
    console.log(`Skipped:          ${stats.skipped}`);
    console.log(`Errors:           ${stats.errors}`);
    console.log(`Mass schedules:   ${stats.massSchedulesCreated}`);
    console.log(`Confession sched: ${stats.confessionSchedulesCreated}`);
    console.log(`Adoration sched:  ${stats.adorationSchedulesCreated}`);

    if (args.jobId) {
      const status = aborted || stats.errors > stats.total * 0.1 ? 'failed' : 'completed';
      try {
        await prisma.backgroundJob.update({
          where: { id: args.jobId },
          data: {
            status,
            completedAt: new Date(),
            processed: stats.total,
            succeeded: stats.created + stats.updated,
            failed: stats.errors,
            itemsFound: stats.massSchedulesCreated,
          },
        });
      } catch (err) {
        console.warn(`Could not record final status for job ${args.jobId}: ${err instanceof Error ? err.message : err}`);
      }
    }

    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Run the importer. Any rejection that escapes main() is logged and turned
// into exit status 1.
function reportFatal(fatal: unknown): void {
  console.error('Fatal error:', fatal);
  process.exit(1);
}

main().catch(reportFatal);
|
||||
Reference in New Issue
Block a user