chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
171
scripts/scrape-masstimes.ts
Normal file
171
scripts/scrape-masstimes.ts
Normal file
@@ -0,0 +1,171 @@
|
||||
import 'dotenv/config';
|
||||
import { prisma } from '../src/lib/db';
|
||||
import { MassTimesScraper, ChurchData } from '../src/lib/masstimes-scraper';
|
||||
|
||||
/**
 * Two-letter codes scraped by this script, in processing order:
 * the 50 US states plus the District of Columbia (51 entries).
 */
const TARGET_STATES = [
  'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS',
  'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC',
  'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
];
|
||||
|
||||
/**
 * Removes repeated mass schedule entries.
 *
 * Two entries are considered the same when their `dayOfWeek`, `time` and
 * `language` all match; any other properties are ignored for comparison.
 * The first occurrence of each combination is kept and input order is
 * preserved. The input array is not modified.
 *
 * @param schedules Entries to filter; may be empty.
 * @returns A new array holding the first entry seen for each combination.
 */
function deduplicateMassSchedules<T extends { dayOfWeek: number; time: string; language: string }>(
  schedules: readonly T[],
): T[] {
  const firstByKey = new Map<string, T>();

  for (const schedule of schedules) {
    // Encode the tuple as JSON instead of joining with ':'. Times contain ':'
    // themselves, so a plain join lets different tuples produce the same key
    // (e.g. time '10' + language '30:en' vs. time '10:30' + language 'en').
    const key = JSON.stringify([schedule.dayOfWeek, schedule.time, schedule.language]);
    if (!firstByKey.has(key)) {
      firstByKey.set(key, schedule);
    }
  }

  return Array.from(firstByKey.values());
}
|
||||
|
||||
/**
 * Writes one scraped church and its schedules to the database.
 *
 * Churches whose `masstimesId` is already in `seenIds` are skipped. Otherwise
 * the church row is upserted by `masstimesId`, all of its existing mass,
 * confession and adoration schedule rows are deleted, and the scraped ones are
 * inserted in their place. All of this happens inside one `prisma.$transaction`
 * callback. Mass schedules go through `deduplicateMassSchedules` first.
 *
 * `masstimesId`, `country` and `scrapeStrategy` are only written when the row
 * is created; an update leaves them as they are.
 *
 * @param data Scraped church record.
 * @param seenIds Ids saved so far in this run; the id is added on success.
 * @returns `true` if the church was saved; `false` if it was skipped as a
 *   duplicate or if saving threw (the error is logged, not rethrown).
 */
async function saveChurch(data: ChurchData, seenIds: Set<string>): Promise<boolean> {
  const alreadySaved = seenIds.has(data.masstimesId);
  if (alreadySaved) {
    console.log(` Skipping duplicate: ${data.name}`);
    return false;
  }

  try {
    await prisma.$transaction(async (tx) => {
      // Columns written both when inserting and when refreshing a church.
      const refreshedFields = {
        name: data.name,
        address: data.address,
        city: data.city,
        state: data.state,
        zip: data.zip,
        latitude: data.latitude,
        longitude: data.longitude,
        phone: data.phone,
        website: data.website,
        email: data.email,
        pastorName: data.pastorName,
        diocese: data.diocese,
        directions: data.directions,
        wheelchairAccess: data.wheelchairAccess,
        lastScrapedAt: new Date(),
      };

      const { id: churchId } = await tx.church.upsert({
        where: { masstimesId: data.masstimesId },
        create: {
          ...refreshedFields,
          masstimesId: data.masstimesId,
          country: data.country,
          scrapeStrategy: 'masstimes',
        },
        update: refreshedFields,
      });

      // Clear out the previous schedules before inserting the scraped ones.
      await tx.massSchedule.deleteMany({ where: { churchId } });
      await tx.confessionSchedule.deleteMany({ where: { churchId } });
      await tx.adorationSchedule.deleteMany({ where: { churchId } });

      if (data.massSchedules.length > 0) {
        const massRows = deduplicateMassSchedules(data.massSchedules).map((mass) => ({
          churchId,
          dayOfWeek: mass.dayOfWeek,
          time: mass.time,
          massType: mass.massType,
          language: mass.language,
          notes: mass.notes,
        }));
        await tx.massSchedule.createMany({ data: massRows });
      }

      if (data.confessionSchedules.length > 0) {
        const confessionRows = data.confessionSchedules.map((confession) => ({
          churchId,
          dayOfWeek: confession.dayOfWeek,
          startTime: confession.startTime,
          endTime: confession.endTime,
          notes: confession.notes,
        }));
        await tx.confessionSchedule.createMany({ data: confessionRows });
      }

      if (data.adorationSchedules.length > 0) {
        const adorationRows = data.adorationSchedules.map((adoration) => ({
          churchId,
          dayOfWeek: adoration.dayOfWeek,
          startTime: adoration.startTime,
          endTime: adoration.endTime,
          isPerpetual: adoration.isPerpetual,
          notes: adoration.notes,
        }));
        await tx.adorationSchedule.createMany({ data: adorationRows });
      }
    });
  } catch (error) {
    console.error(` Error saving ${data.name}:`, error);
    return false;
  }

  // Only reached when the transaction callback completed without throwing.
  seenIds.add(data.masstimesId);
  console.log(` Saved: ${data.name}`);
  return true;
}
|
||||
|
||||
/**
 * Entry point: scrapes every state in `TARGET_STATES` one after another and
 * saves each returned church with `saveChurch`, then prints a summary.
 *
 * Between states the script pauses for five minutes; there is no pause after
 * the final state. Churches skipped because their id was already saved in this
 * run are reported as duplicates rather than as errors.
 *
 * The scraper is closed and the Prisma client disconnected whether or not the
 * run succeeds. An error thrown by the scraper itself is not caught here and
 * propagates to the caller after cleanup.
 */
async function main(): Promise<void> {
  const REST_BETWEEN_STATES_MS = 5 * 60 * 1000;
  const seenIds = new Set<string>();

  console.log('\n' + '='.repeat(70));
  console.log('MASSTIMES.ORG CHURCH SCRAPER (JSON API)');
  console.log('='.repeat(70));
  console.log(`\nTarget states: ${TARGET_STATES.length}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + '-'.repeat(70));

  const scraper = new MassTimesScraper();
  const stats = { total: 0, saved: 0, duplicates: 0, errors: 0 };

  try {
    await scraper.init();
    console.log('Browser initialized\n');

    for (const [index, state] of TARGET_STATES.entries()) {
      console.log(`\n[${'='.repeat(20)}] SCRAPING ${state} [${'='.repeat(20)}]\n`);
      console.log(`State ${index + 1}/${TARGET_STATES.length}: ${state}`);

      const churches = await scraper.scrapeState(state);
      stats.total += churches.length;

      console.log(`\n Saving ${churches.length} churches from ${state} to database...`);
      for (const church of churches) {
        // saveChurch returns false both for duplicates and for failures, so
        // check for a duplicate up front to keep the error count honest.
        const isDuplicate = seenIds.has(church.masstimesId);
        const saved = await saveChurch(church, seenIds);
        if (saved) stats.saved++;
        else if (isDuplicate) stats.duplicates++;
        else stats.errors++;
      }

      // Pause only when another state follows; waiting after the last one
      // would just delay the summary and shutdown.
      const isLastState = index === TARGET_STATES.length - 1;
      if (!isLastState) {
        console.log(`\n Resting 5 minutes before next state...\n`);
        await new Promise((resolve) => setTimeout(resolve, REST_BETWEEN_STATES_MS));
      }
    }
  } finally {
    // Disconnect from the database even if closing the scraper throws.
    try {
      await scraper.close();
    } finally {
      await prisma.$disconnect();
    }
  }

  console.log('\n' + '='.repeat(70));
  console.log('SUMMARY');
  console.log('='.repeat(70));
  console.log(`Total scraped: ${stats.total}`);
  console.log(`Saved: ${stats.saved}`);
  console.log(`Duplicates: ${stats.duplicates}`);
  console.log(`Errors: ${stats.errors}`);
  console.log('='.repeat(70) + '\n');
}
|
||||
|
||||
// Run the scraper. If it rejects, log the error and set a non-zero exit code
// so that shells, cron jobs and CI can tell the run failed; merely logging the
// error would let the process finish with status 0.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user