// File: ScraperControl/scripts/scrape-masstimes.ts (TypeScript, 172 lines, 5.5 KiB)

import 'dotenv/config';
import { prisma } from '../src/lib/db';
import { MassTimesScraper, ChurchData } from '../src/lib/masstimes-scraper';
/**
 * Two-letter codes of the regions to scrape: the 50 US states plus 'DC'.
 * States are processed in exactly this order.
 */
const TARGET_STATES: string[] = [
  'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID',
  'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO',
  'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA',
  'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
];
/**
 * Removes repeated mass schedule entries.
 *
 * Two entries count as the same when their `dayOfWeek`, `time` and `language`
 * all match; any other properties are ignored for the comparison. The first
 * occurrence of each combination is kept and the input order is preserved.
 *
 * @param schedules Entries to filter; the array itself is not modified.
 * @returns A new array containing one entry per distinct combination.
 */
function deduplicateMassSchedules<T extends { dayOfWeek: number; time: string; language: string }>(schedules: T[]): T[] {
  const seenKeys = new Set<string>();

  return schedules.filter(({ dayOfWeek, time, language }) => {
    const key = `${dayOfWeek}:${time}:${language}`;
    if (seenKeys.has(key)) {
      return false;
    }
    seenKeys.add(key);
    return true;
  });
}
/**
 * Writes one scraped church and its schedules to the database.
 *
 * The church row is upserted by `masstimesId`; its mass, confession and
 * adoration schedule rows are then deleted and re-created from `data`, all
 * inside a single Prisma transaction. Mass schedules are de-duplicated before
 * insertion; confession and adoration schedules are inserted as given.
 *
 * @param data    Scraped church record.
 * @param seenIds IDs already saved during this run. A church whose ID is in
 *                the set is skipped; the ID is added only after a successful
 *                save.
 * @returns `true` when the transaction succeeded; `false` when the church was
 *          skipped as a duplicate or when saving threw (the error is logged,
 *          not rethrown).
 */
async function saveChurch(data: ChurchData, seenIds: Set<string>): Promise<boolean> {
  const { masstimesId } = data;

  if (seenIds.has(masstimesId)) {
    console.log(` Skipping duplicate: ${data.name}`);
    return false;
  }

  try {
    await prisma.$transaction(async (tx) => {
      // Fields written on both insert and update. `country` and
      // `scrapeStrategy` are only set when the row is first created.
      const sharedFields = {
        name: data.name,
        address: data.address,
        city: data.city,
        state: data.state,
        zip: data.zip,
        latitude: data.latitude,
        longitude: data.longitude,
        phone: data.phone,
        website: data.website,
        email: data.email,
        pastorName: data.pastorName,
        diocese: data.diocese,
        directions: data.directions,
        wheelchairAccess: data.wheelchairAccess,
        lastScrapedAt: new Date(),
      };

      const { id: churchId } = await tx.church.upsert({
        where: { masstimesId },
        create: {
          masstimesId,
          ...sharedFields,
          country: data.country,
          scrapeStrategy: 'masstimes',
        },
        update: sharedFields,
      });

      // Clear all existing schedule rows so the tables mirror this scrape.
      const ownedByChurch = { where: { churchId } };
      await tx.massSchedule.deleteMany(ownedByChurch);
      await tx.confessionSchedule.deleteMany(ownedByChurch);
      await tx.adorationSchedule.deleteMany(ownedByChurch);

      if (data.massSchedules.length !== 0) {
        const massRows = deduplicateMassSchedules(data.massSchedules).map(
          ({ dayOfWeek, time, massType, language, notes }) => ({
            churchId,
            dayOfWeek,
            time,
            massType,
            language,
            notes,
          }),
        );
        await tx.massSchedule.createMany({ data: massRows });
      }

      if (data.confessionSchedules.length !== 0) {
        const confessionRows = data.confessionSchedules.map(
          ({ dayOfWeek, startTime, endTime, notes }) => ({
            churchId,
            dayOfWeek,
            startTime,
            endTime,
            notes,
          }),
        );
        await tx.confessionSchedule.createMany({ data: confessionRows });
      }

      if (data.adorationSchedules.length !== 0) {
        const adorationRows = data.adorationSchedules.map(
          ({ dayOfWeek, startTime, endTime, isPerpetual, notes }) => ({
            churchId,
            dayOfWeek,
            startTime,
            endTime,
            isPerpetual,
            notes,
          }),
        );
        await tx.adorationSchedule.createMany({ data: adorationRows });
      }
    });
  } catch (error) {
    console.error(` Error saving ${data.name}:`, error);
    return false;
  }

  seenIds.add(masstimesId);
  console.log(` Saved: ${data.name}`);
  return true;
}
/**
 * Scrapes every state in TARGET_STATES, one after another, and saves each
 * returned church with saveChurch.
 *
 * Between consecutive states the loop pauses for five minutes; there is no
 * pause after the last state. Churches already saved earlier in the same run
 * are tallied as duplicates rather than errors. The scraper is closed and the
 * Prisma client disconnected even when initialisation or scraping throws, and
 * the database is disconnected even if closing the scraper itself fails.
 *
 * An exception from `scraper.init()` or `scraper.scrapeState()` is not caught
 * here: it propagates to the caller after cleanup, and the summary is then
 * not printed.
 */
async function main(): Promise<void> {
  // Pause between consecutive states (5 minutes).
  const REST_BETWEEN_STATES_MS = 5 * 60 * 1000;

  const seenIds = new Set<string>();
  console.log('\n' + '='.repeat(70));
  console.log('MASSTIMES.ORG CHURCH SCRAPER (JSON API)');
  console.log('='.repeat(70));
  console.log(`\nTarget states: ${TARGET_STATES.length}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + '-'.repeat(70));

  const scraper = new MassTimesScraper();
  const stats = { total: 0, saved: 0, duplicates: 0, errors: 0 };

  try {
    await scraper.init();
    console.log('Browser initialized\n');

    for (const [i, state] of TARGET_STATES.entries()) {
      console.log(`\n[${'='.repeat(20)}] SCRAPING ${state} [${'='.repeat(20)}]\n`);
      console.log(`State ${i + 1}/${TARGET_STATES.length}: ${state}`);

      const churches = await scraper.scrapeState(state);
      stats.total += churches.length;
      console.log(`\n Saving ${churches.length} churches from ${state} to database...`);

      for (const church of churches) {
        // saveChurch returns false both for a skipped duplicate and for a
        // failed save; check the set first so the two cases can be told apart.
        const alreadySaved = seenIds.has(church.masstimesId);
        const saved = await saveChurch(church, seenIds);
        if (saved) stats.saved++;
        else if (alreadySaved) stats.duplicates++;
        else stats.errors++;
      }

      // Only rest when another state follows; sleeping after the final state
      // would just delay the summary and process exit.
      if (i < TARGET_STATES.length - 1) {
        console.log(`\n Resting 5 minutes before next state...\n`);
        await new Promise((resolve) => setTimeout(resolve, REST_BETWEEN_STATES_MS));
      }
    }
  } finally {
    // Nested so a failure while closing the scraper cannot skip the
    // database disconnect.
    try {
      await scraper.close();
    } finally {
      await prisma.$disconnect();
    }
  }

  console.log('\n' + '='.repeat(70));
  console.log('SUMMARY');
  console.log('='.repeat(70));
  console.log(`Total scraped: ${stats.total}`);
  console.log(`Saved: ${stats.saved}`);
  console.log(`Duplicates: ${stats.duplicates}`);
  console.log(`Errors: ${stats.errors}`);
  console.log('='.repeat(70) + '\n');
}
// Run the scraper. A failure that escapes main() is logged and the process is
// marked as failed, so shells and schedulers do not see exit status 0.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});