165 lines
6.4 KiB
TypeScript
165 lines
6.4 KiB
TypeScript
|
|
import { config } from 'dotenv';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
|
||
|
|
// Load environment files: .env.local first, then .env.
for (const path of ['.env.local', '.env']) {
  config({ path });
}

// The connection string is mandatory; fail at load time when it is absent or empty.
const { DATABASE_URL: connectionString } = process.env;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma is wired to Postgres through the pg driver adapter on top of a single pool.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
/**
 * Prints a console report describing the state of mass schedule scraping.
 *
 * Sections printed: church counts grouped by `source`, overall scraping
 * coverage, the ten most recently scraped churches, and the number of
 * churches that have a website or mass-schedule URL but no `lastScrapedAt`.
 *
 * If any query fails, the error is logged and `process.exitCode` is set to 1.
 * The Prisma client and the pg pool are closed in a `finally` block on both
 * the success and the failure path.
 *
 * @returns {Promise<void>}
 */
async function checkScraperStatus() {
  const divider = '═══════════════════════════════════════════════════════════════';

  // Prints a section title framed by two divider lines.
  const printHeader = (title = '') => {
    console.log(divider);
    console.log(title);
    console.log(divider);
  };

  try {
    console.log('Checking mass schedule scraper status...\n');

    // Cut-off for the "recently scraped" count: same time of day, 7 days ago.
    const weekAgo = new Date();
    weekAgo.setDate(weekAgo.getDate() - 7);

    // The queries are independent of each other, so run them concurrently.
    const [
      totalChurches,
      churchesWithWebsites,
      churchesScraped,
      totalMassSchedules,
      churchesWithSchedules,
      recentlyScraped,
      bySource,
      recentSample,
      readyToScrape,
    ] = await Promise.all([
      // Overall church stats
      prisma.church.count(),
      prisma.church.count({
        where: {
          OR: [
            { website: { not: null } },
            { massScheduleUrl: { not: null } },
          ],
        },
      }),
      prisma.church.count({
        where: { lastScrapedAt: { not: null } },
      }),
      // Mass schedule stats
      prisma.massSchedule.count(),
      prisma.church.count({
        where: {
          massSchedules: {
            some: {},
          },
        },
      }),
      // Recently scraped (last 7 days)
      prisma.church.count({
        where: {
          lastScrapedAt: { gte: weekAgo },
        },
      }),
      // Church counts per data source
      prisma.church.groupBy({
        by: ['source'],
        _count: {
          id: true,
        },
      }),
      // Sample of the most recently scraped churches
      prisma.church.findMany({
        where: {
          lastScrapedAt: { not: null },
        },
        select: {
          name: true,
          city: true,
          state: true,
          country: true,
          lastScrapedAt: true,
          website: true,
          source: true,
          _count: {
            select: {
              massSchedules: true,
            },
          },
        },
        orderBy: { lastScrapedAt: 'desc' },
        take: 10,
      }),
      // Churches ready to scrape (have website, not scraped)
      prisma.church.count({
        where: {
          OR: [
            { website: { not: null } },
            { massScheduleUrl: { not: null } },
          ],
          lastScrapedAt: null,
        },
      }),
    ]);

    // Share of all churches with one decimal; an empty table yields "0.0"
    // instead of the "NaN" that a division by zero would produce.
    const percentOfChurches = (count = 0) =>
      totalChurches > 0 ? ((count / totalChurches) * 100).toFixed(1) : '0.0';

    printHeader('CHURCH DATA SOURCES');
    for (const { source, _count } of bySource) {
      // NOTE(review): guards against a null `source` group in case the column
      // is nullable in the schema — confirm against the Prisma model.
      const label = String(source ?? 'unknown').padEnd(12);
      const count = String(_count.id).padStart(7);
      console.log(`${label} | ${count} churches (${percentOfChurches(_count.id)}%)`);
    }
    console.log('');

    printHeader('MASS SCHEDULE SCRAPING STATUS');
    console.log(`Total churches: ${totalChurches.toLocaleString()}`);
    console.log(`Churches with websites: ${churchesWithWebsites.toLocaleString()} (${percentOfChurches(churchesWithWebsites)}%)`);
    console.log(`Churches ever scraped: ${churchesScraped.toLocaleString()} (${percentOfChurches(churchesScraped)}%)`);
    console.log(`Churches with mass schedules: ${churchesWithSchedules.toLocaleString()} (${percentOfChurches(churchesWithSchedules)}%)`);
    console.log(`Total mass schedules: ${totalMassSchedules.toLocaleString()}`);
    console.log('');
    console.log(`Scraped in last 7 days: ${recentlyScraped.toLocaleString()}`);
    console.log('');

    // Average schedules per church (only meaningful when at least one church has schedules)
    if (churchesWithSchedules > 0) {
      const avgSchedules = totalMassSchedules / churchesWithSchedules;
      console.log(`Average schedules per church: ${avgSchedules.toFixed(1)} masses/week`);
      console.log('');
    }

    printHeader('RECENTLY SCRAPED CHURCHES (Last 10)');
    if (recentSample.length === 0) {
      console.log('No churches have been scraped yet.');
    } else {
      recentSample.forEach((church, index) => {
        // Skip missing location parts so the list has no dangling commas.
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        console.log(`${index + 1}. ${church.name} (${location})`);
        console.log(` Source: ${church.source}`);
        console.log(` Website: ${church.website || 'None'}`);
        console.log(` Last scraped: ${church.lastScrapedAt?.toLocaleString() || 'Never'}`);
        console.log(` Mass schedules: ${church._count.massSchedules}`);
        console.log('');
      });
    }

    printHeader('SCRAPING POTENTIAL');
    console.log(`Churches ready to scrape: ${readyToScrape.toLocaleString()}`);
    console.log(` (have website, never scraped)`);
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // Use exitCode instead of process.exit(1): exit() terminates the process
    // immediately, which would skip the connection cleanup in `finally`.
    process.exitCode = 1;
  } finally {
    // Close the pool even if disconnecting the Prisma client throws.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
|
||
|
|
|
||
|
|
// Script entry point: run the report as soon as the file is executed.
// The promise is not left floating: a rejection is logged and reported
// through the exit code.
checkScraperStatus().catch((error) => {
  console.error('Fatal error:', error);
  process.exitCode = 1;
});
|