Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
172 lines
5.5 KiB
TypeScript
172 lines
5.5 KiB
TypeScript
import 'dotenv/config';
|
|
import { prisma } from '../src/lib/db';
|
|
import { MassTimesScraper, ChurchData } from '../src/lib/masstimes-scraper';
|
|
|
|
/**
 * Two-letter codes of every region the scraper visits, in processing order:
 * the 50 US states plus the District of Columbia.
 *
 * Typed as read-only so the run order cannot be mutated by accident.
 */
const TARGET_STATES: readonly string[] = [
  'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL',
  'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
  'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH',
  'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
  'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI',
  'WY',
];
|
|
|
|
/**
 * Removes repeated Mass schedule entries, keeping the first occurrence of each
 * (dayOfWeek, time, language) combination in its original relative order.
 *
 * @param schedules - Entries to filter; the array itself is not modified.
 * @returns A new array holding the first entry seen for each combination.
 */
function deduplicateMassSchedules<T extends { dayOfWeek: number; time: string; language: string }>(
  schedules: readonly T[],
): T[] {
  const firstByKey = new Map<string, T>();

  for (const schedule of schedules) {
    // Times contain ':' themselves (e.g. "10:30"), so a ':'-joined key could
    // merge two different entries. A JSON-encoded tuple keeps the three parts
    // unambiguous.
    const key = JSON.stringify([schedule.dayOfWeek, schedule.time, schedule.language]);
    if (!firstByKey.has(key)) {
      firstByKey.set(key, schedule);
    }
  }

  return Array.from(firstByKey.values());
}
|
|
|
|
/**
 * Persists one scraped church together with its schedules.
 *
 * The church row is upserted by `masstimesId`. Its mass, confession and
 * adoration schedules are then deleted and re-created from `data`, all inside
 * a single Prisma transaction.
 *
 * @param data - Scraped church record to store.
 * @param seenIds - masstimes IDs already saved during this run. The ID is
 *   added only after the transaction has succeeded.
 * @returns `true` when the church was written; `false` when it was skipped as
 *   a duplicate or when the write failed (the error is logged, not rethrown).
 */
async function saveChurch(data: ChurchData, seenIds: Set<string>): Promise<boolean> {
  const { masstimesId } = data;

  if (seenIds.has(masstimesId)) {
    console.log(` Skipping duplicate: ${data.name}`);
    return false;
  }

  try {
    await prisma.$transaction(async (tx) => {
      // Fields written both when inserting a new church and when refreshing an
      // existing one.
      const refreshedFields = {
        name: data.name,
        address: data.address,
        city: data.city,
        state: data.state,
        zip: data.zip,
        latitude: data.latitude,
        longitude: data.longitude,
        phone: data.phone,
        website: data.website,
        email: data.email,
        pastorName: data.pastorName,
        diocese: data.diocese,
        directions: data.directions,
        wheelchairAccess: data.wheelchairAccess,
        lastScrapedAt: new Date(),
      };

      // NOTE(review): `country` and `scrapeStrategy` are only set on create and
      // are never refreshed on update — confirm this is intentional.
      const { id: churchId } = await tx.church.upsert({
        where: { masstimesId },
        create: {
          masstimesId,
          ...refreshedFields,
          country: data.country,
          scrapeStrategy: 'masstimes',
        },
        update: refreshedFields,
      });

      // Replace, rather than merge, every schedule attached to this church.
      const ownedByChurch = { where: { churchId } };
      await tx.massSchedule.deleteMany(ownedByChurch);
      await tx.confessionSchedule.deleteMany(ownedByChurch);
      await tx.adorationSchedule.deleteMany(ownedByChurch);

      if (data.massSchedules.length > 0) {
        // Only mass schedules are deduplicated before insertion.
        const massRows = deduplicateMassSchedules(data.massSchedules).map(
          ({ dayOfWeek, time, massType, language, notes }) => ({
            churchId,
            dayOfWeek,
            time,
            massType,
            language,
            notes,
          }),
        );
        await tx.massSchedule.createMany({ data: massRows });
      }

      if (data.confessionSchedules.length > 0) {
        const confessionRows = data.confessionSchedules.map(
          ({ dayOfWeek, startTime, endTime, notes }) => ({
            churchId,
            dayOfWeek,
            startTime,
            endTime,
            notes,
          }),
        );
        await tx.confessionSchedule.createMany({ data: confessionRows });
      }

      if (data.adorationSchedules.length > 0) {
        const adorationRows = data.adorationSchedules.map(
          ({ dayOfWeek, startTime, endTime, isPerpetual, notes }) => ({
            churchId,
            dayOfWeek,
            startTime,
            endTime,
            isPerpetual,
            notes,
          }),
        );
        await tx.adorationSchedule.createMany({ data: adorationRows });
      }
    });

    // Mark as seen only once the transaction has committed.
    seenIds.add(masstimesId);
    console.log(` Saved: ${data.name}`);
    return true;
  } catch (error) {
    console.error(` Error saving ${data.name}:`, error);
    return false;
  }
}
|
|
|
|
/**
 * Entry point: scrapes every state in TARGET_STATES with MassTimesScraper,
 * stores each returned church through saveChurch, and prints a summary.
 *
 * States are processed one at a time with a five-minute pause between
 * consecutive states (none after the last one). Churches already saved earlier
 * in the run are reported as skipped duplicates rather than as errors.
 *
 * The scraper and the Prisma connection are both closed even when scraping
 * throws; the error itself still propagates to the caller.
 */
async function main(): Promise<void> {
  // Pause between consecutive states.
  const restBetweenStatesMs = 5 * 60 * 1000;
  const banner = '='.repeat(70);

  const seenIds = new Set<string>();

  console.log('\n' + banner);
  console.log('MASSTIMES.ORG CHURCH SCRAPER (JSON API)');
  console.log(banner);
  console.log(`\nTarget states: ${TARGET_STATES.length}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + '-'.repeat(70));

  const scraper = new MassTimesScraper();
  const stats = { total: 0, saved: 0, skipped: 0, errors: 0 };

  try {
    await scraper.init();
    console.log('Browser initialized\n');

    for (const [index, state] of TARGET_STATES.entries()) {
      console.log(`\n[${'='.repeat(20)}] SCRAPING ${state} [${'='.repeat(20)}]\n`);
      console.log(`State ${index + 1}/${TARGET_STATES.length}: ${state}`);

      const churches = await scraper.scrapeState(state);
      stats.total += churches.length;

      console.log(`\n Saving ${churches.length} churches from ${state} to database...`);
      for (const church of churches) {
        // saveChurch returns false both for duplicates and for failed writes,
        // so record beforehand which of the two a false result would mean.
        const alreadySaved = seenIds.has(church.masstimesId);
        const saved = await saveChurch(church, seenIds);

        if (saved) {
          stats.saved++;
        } else if (alreadySaved) {
          stats.skipped++;
        } else {
          stats.errors++;
        }
      }

      // Resting is only useful when another state follows.
      const hasNextState = index < TARGET_STATES.length - 1;
      if (hasNextState) {
        console.log(`\n Resting 5 minutes before next state...\n`);
        await new Promise<void>((resolve) => setTimeout(resolve, restBetweenStatesMs));
      }
    }
  } finally {
    // Disconnect from the database even if closing the scraper fails.
    try {
      await scraper.close();
    } finally {
      await prisma.$disconnect();
    }
  }

  console.log('\n' + banner);
  console.log('SUMMARY');
  console.log(banner);
  console.log(`Total scraped: ${stats.total}`);
  console.log(`Saved: ${stats.saved}`);
  console.log(`Skipped (duplicates): ${stats.skipped}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(banner + '\n');
}
|
|
|
|
// Run the scraper. On failure, log the error and mark the process as failed so
// that schedulers and CI see a non-zero exit status instead of a silent success.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|