#!/usr/bin/env tsx
/**
 * Bulk church website scraper
 * Scrapes mass schedules from church websites and updates the database.
 *
 * Usage:
 *   npx tsx scripts/scrape-churches.ts --limit 100
 *   npx tsx scripts/scrape-churches.ts --limit 50 --max-failures 3
 *   npx tsx scripts/scrape-churches.ts --all                      # Process ALL eligible churches
 *   npx tsx scripts/scrape-churches.ts --all --language english
 *   npx tsx scripts/scrape-churches.ts --all --max-failures 3
 *   npx tsx scripts/scrape-churches.ts --ids id1,id2,id3
 *   npx tsx scripts/scrape-churches.ts --all --job-id <id>        # Resume/track existing job
 */
import dotenv from 'dotenv';
import path from 'path';

// Kept ahead of the remaining imports, exactly as in the original statement order
// (presumably so modules that read process.env while loading see the .env values — confirm).
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { scrapeAllChurches, scrapeChurch, countEligibleChurches } from '../src/lib/scraper-service';
import type { ScrapeJobResult } from '../src/lib/scraper-service';

// Fresh DB connection for scripts
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const jobPrisma = new PrismaClient({ adapter });

/** Separator line used in all console banners. */
const BANNER = '============================================================';
/** Number of churches requested per batch in --all mode. */
const BATCH_SIZE = 100;
/** Default for --limit in single-batch mode. */
const DEFAULT_LIMIT = 100;
/** Default for --max-failures. */
const DEFAULT_MAX_FAILURES = 5;
/** Maximum number of failed churches listed in the final summary. */
const MAX_FAILURES_SHOWN = 50;

/** Settings persisted on the job row; restricted to flat JSON-compatible values. */
type JobConfig = Record<string, string | number | boolean | null>;

/** Set by the signal handlers or by a stop request; the --all loop exits after the current batch. */
let shuttingDown = false;

/**
 * Render a duration in seconds as a short human-readable string
 * ("42s", "3m 5s", "2h 7m").
 */
function formatDuration(seconds: number): string {
  if (seconds < 60) return `${seconds.toFixed(0)}s`;
  if (seconds < 3600) return `${Math.floor(seconds / 60)}m ${Math.floor(seconds % 60)}s`;
  const h = Math.floor(seconds / 3600);
  const m = Math.floor((seconds % 3600) / 60);
  return `${h}h ${m}m`;
}

/** Items per hour, or 0 when no time has elapsed (avoids Infinity/NaN in the output). */
function perHour(count: number, elapsedSeconds: number): number {
  return elapsedSeconds > 0 ? count / (elapsedSeconds / 3600) : 0;
}

// --- Argument parsing ---

/**
 * Return the value that follows `flag` on the command line.
 *
 * @returns the value, or `undefined` when the flag is not present.
 * @throws Error when the flag is present but no value follows it.
 */
function readFlagValue(args: readonly string[], flag: string): string | undefined {
  const index = args.indexOf(flag);
  if (index === -1) return undefined;
  const value = args[index + 1];
  if (value === undefined || value.startsWith('--')) {
    throw new Error(`Missing value for ${flag}`);
  }
  return value;
}

/**
 * Read an integer-valued flag.
 *
 * @param fallback value used when the flag is absent.
 * @param min smallest accepted value.
 * @throws Error when the value is not a plain decimal integer or is below `min`.
 */
function readIntFlag(args: readonly string[], flag: string, fallback: number, min = 0): number {
  const raw = readFlagValue(args, flag);
  if (raw === undefined) return fallback;
  const value = Number.parseInt(raw, 10);
  if (!/^\d+$/.test(raw) || value < min) {
    throw new Error(`Invalid value for ${flag}: "${raw}" (expected an integer >= ${min})`);
  }
  return value;
}

// --- Job Tracking ---

/**
 * If `--job-id <id>` was passed, mark that existing job as running and return its id.
 *
 * @returns the resumed job id, or `null` when `--job-id` was not given.
 * @throws Error when `--job-id` has no value; the Prisma update may also reject
 *         (for example if no row matches the id).
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobId = readFlagValue(args, '--job-id');
  if (jobId === undefined) return null;
  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}

/**
 * Insert a new `scraper` background job in the `running` state.
 *
 * @param language language filter for this run; stored as `'generic'` when null/empty.
 * @param config run settings stored on the job row.
 * @returns the id of the created job.
 */
async function createNewJob(language: string | null, config: JobConfig): Promise<string> {
  const job = await jobPrisma.backgroundJob.create({
    data: {
      type: 'scraper',
      language: language || 'generic',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return job.id;
}

/** Write the current progress counters to the job row. */
async function updateJobProgress(
  jobId: string,
  processed: number,
  succeeded: number,
  failed: number,
  itemsFound: number,
  totalItems: number,
): Promise<void> {
  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { processed, succeeded, failed, itemsFound, totalItems },
  });
}

/** True when the job row currently has status `'stopping'`. */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const job = await jobPrisma.backgroundJob.findUnique({ where: { id: jobId } });
  return job?.status === 'stopping';
}

/**
 * Finalize the job: `'failed'` (with the message) when `error` is given,
 * otherwise `'completed'`. Always stamps `completedAt`.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      status: error ? 'failed' : 'completed',
      error,
      completedAt: new Date(),
    },
  });
}

/**
 * Best-effort recording of a failure on the job row. A failure to update the row
 * is logged rather than thrown so the caller can rethrow the original error.
 */
async function markJobFailed(jobId: string, error: unknown): Promise<void> {
  const message = error instanceof Error ? error.message : 'Unknown error';
  try {
    await completeJob(jobId, message);
  } catch (updateError) {
    console.error('Could not record job failure:', updateError);
  }
}

/** Resume the job named by `--job-id`, or create a new one, and print its id. */
async function startJob(args: string[], language: string | null, config: JobConfig): Promise<string> {
  const jobId = (await createOrResumeJob(args)) ?? (await createNewJob(language, config));
  console.log(`Job ID: ${jobId}\n`);
  return jobId;
}

// --- Output helpers ---

/** Print the start-of-run banner: title, mode-specific detail lines, start timestamp. */
function printHeader(title: string, details: readonly string[]): void {
  console.log(BANNER);
  console.log(title);
  console.log(BANNER);
  for (const line of details) console.log(line);
  console.log(`Started: ${new Date().toISOString()}`);
  console.log(`${BANNER}\n`);
}

/**
 * Print the end-of-run summary: totals, elapsed time, average rate, and up to
 * MAX_FAILURES_SHOWN failed churches with their error.
 *
 * @param startTime epoch milliseconds at which the run started.
 */
function printSummary(results: ScrapeJobResult[], startTime: number): void {
  const elapsed = (Date.now() - startTime) / 1000;
  const succeeded = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalSchedules = results.reduce((sum, r) => sum + r.schedulesFound, 0);
  const rate = perHour(results.length, elapsed);

  console.log(`\n${BANNER}`);
  console.log('Scraping Summary');
  console.log(BANNER);
  console.log(`Churches processed: ${results.length.toLocaleString()}`);
  console.log(`Succeeded: ${succeeded.length.toLocaleString()}`);
  console.log(`Failed: ${failed.length.toLocaleString()}`);
  console.log(`Total schedules found: ${totalSchedules.toLocaleString()}`);
  console.log(`Elapsed time: ${formatDuration(elapsed)}`);
  console.log(`Average rate: ${rate.toFixed(0)}/hr`);
  console.log(`Finished: ${new Date().toISOString()}`);
  console.log(BANNER);

  if (failed.length > 0) {
    console.log(`\nFailed churches (${failed.length}):`);
    // Cap the list to avoid overwhelming output
    for (const f of failed.slice(0, MAX_FAILURES_SHOWN)) {
      console.log(` - ${f.churchName}: ${f.error}`);
    }
    if (failed.length > MAX_FAILURES_SHOWN) {
      console.log(` ... and ${failed.length - MAX_FAILURES_SHOWN} more`);
    }
  }
}

// --- Run modes ---

/** --ids mode: scrape the listed churches (all started concurrently) and print a summary. */
async function runTargeted(ids: readonly string[], maxFailures: number): Promise<void> {
  printHeader('Church Website Scraper — Targeted Mode', [
    `Targeting ${ids.length} specific churches`,
    `Max failures: ${maxFailures}`,
  ]);
  const startTime = Date.now();
  const results = await Promise.all(ids.map((id) => scrapeChurch(id)));
  printSummary(results, startTime);
}

/** Register SIGINT/SIGTERM handlers that request a stop after the current batch. */
function installShutdownHandlers(): void {
  process.on('SIGINT', () => {
    // A second Ctrl+C while already shutting down exits immediately.
    if (shuttingDown) {
      console.log('\nForce quit.');
      process.exit(1);
    }
    console.log('\nShutting down gracefully (finishing current batch)...');
    shuttingDown = true;
  });
  process.on('SIGTERM', () => {
    console.log('\nSIGTERM received, shutting down after current batch...');
    shuttingDown = true;
  });
}

/**
 * --all mode: request batches of BATCH_SIZE until the scraper returns an empty
 * batch, a signal arrives, or the job row is switched to `'stopping'`.
 * Progress is written to the job row after every batch.
 */
async function runAll(args: string[], maxFailures: number, language: string | null): Promise<void> {
  const totalEligible = await countEligibleChurches(maxFailures);
  printHeader('Church Website Scraper — Full Run', [
    `Language: ${language ?? 'all'}`,
    `Eligible churches: ${totalEligible.toLocaleString()}`,
    `Batch size: ${BATCH_SIZE}`,
    `Max failures: ${maxFailures}`,
  ]);

  if (totalEligible === 0) {
    console.log('No eligible churches to scrape. All done!');
    return;
  }

  const jobId = await startJob(args, language, { allMode: true, maxFailures, language });
  installShutdownHandlers();

  const allResults: ScrapeJobResult[] = [];
  const globalStart = Date.now();
  let batchNum = 0;
  let totalSucceeded = 0;
  let totalSchedulesFound = 0;

  try {
    while (!shuttingDown) {
      batchNum++;
      const batchStart = Date.now();
      const batchResults = await scrapeAllChurches({
        limit: BATCH_SIZE,
        maxFailures,
        language: language ?? undefined,
      });

      if (batchResults.length === 0) {
        console.log('\nNo more eligible churches. All done!');
        break;
      }
      allResults.push(...batchResults);

      // Batch summary
      const batchElapsed = (Date.now() - batchStart) / 1000;
      const batchSuccess = batchResults.filter((r) => r.success).length;
      const batchSchedules = batchResults.reduce((sum, r) => sum + r.schedulesFound, 0);
      totalSucceeded += batchSuccess;
      totalSchedulesFound += batchSchedules;

      // Overall progress
      const totalElapsed = (Date.now() - globalStart) / 1000;
      const rate = perHour(allResults.length, totalElapsed);
      const remaining = totalEligible - allResults.length;
      const etaSeconds = remaining > 0 && rate > 0 ? (remaining / rate) * 3600 : 0;
      const percent = ((allResults.length / totalEligible) * 100).toFixed(1);

      console.log(`\n--- Batch ${batchNum} (${batchResults.length} churches) ---`);
      console.log(` Success: ${batchSuccess}/${batchResults.length} | Schedules: ${batchSchedules} | Time: ${formatDuration(batchElapsed)}`);
      console.log(` Progress: ${allResults.length.toLocaleString()}/${totalEligible.toLocaleString()} (${percent}%)`);
      console.log(` Rate: ${rate.toFixed(0)}/hr | ETA: ~${formatDuration(etaSeconds)}`);

      // Update job progress
      await updateJobProgress(
        jobId,
        allResults.length,
        totalSucceeded,
        allResults.length - totalSucceeded,
        totalSchedulesFound,
        totalEligible,
      );

      // Poll for a stop request after every batch. Gating this on the cumulative
      // count being a multiple of 10 would never poll once a short batch is returned.
      if (!shuttingDown && (await checkJobStopping(jobId))) {
        console.log('\nJob stop requested via admin dashboard.');
        shuttingDown = true;
      }

      if (shuttingDown) {
        console.log('\nGraceful shutdown: batch completed.');
        break;
      }
    }
    await completeJob(jobId);
  } catch (error) {
    await markJobFailed(jobId, error);
    throw error;
  }

  printSummary(allResults, globalStart);
}

/** Default mode: scrape a single batch of at most `limit` churches under a tracked job. */
async function runSingleBatch(
  args: string[],
  limit: number,
  maxFailures: number,
  language: string | null,
): Promise<void> {
  printHeader('Church Website Scraper', [
    `Language: ${language ?? 'all'}`,
    `Limit: ${limit}`,
    `Max failures: ${maxFailures}`,
  ]);

  // Job tracking for single batch mode too
  const jobId = await startJob(args, language, { limit, maxFailures, language });
  const startTime = Date.now();

  try {
    const results = await scrapeAllChurches({ limit, maxFailures, language: language ?? undefined });
    const succeeded = results.filter((r) => r.success).length;
    const totalSchedules = results.reduce((sum, r) => sum + r.schedulesFound, 0);
    await updateJobProgress(jobId, results.length, succeeded, results.length - succeeded, totalSchedules, limit);
    await completeJob(jobId);
    printSummary(results, startTime);
  } catch (error) {
    await markJobFailed(jobId, error);
    throw error;
  }
}

/**
 * Entry point: parse the command line and dispatch to one run mode.
 * Precedence is --ids, then --all, then the default single batch (--limit).
 *
 * @throws Error on malformed flags; errors from the scraper or the database propagate.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const maxFailures = readIntFlag(args, '--max-failures', DEFAULT_MAX_FAILURES, 0);
  // An empty --language value is treated the same as no language filter.
  const language = readFlagValue(args, '--language') || null;

  // --ids mode: scrape specific churches
  const idsArg = readFlagValue(args, '--ids');
  if (idsArg !== undefined) {
    const ids = idsArg
      .split(',')
      .map((id) => id.trim())
      .filter((id) => id.length > 0);
    if (ids.length === 0) {
      throw new Error('--ids requires at least one non-empty id');
    }
    await runTargeted(ids, maxFailures);
    return;
  }

  // --all mode: batch loop through ALL eligible churches
  if (args.includes('--all')) {
    await runAll(args, maxFailures, language);
    return;
  }

  // Default mode: single batch with --limit
  const limit = readIntFlag(args, '--limit', DEFAULT_LIMIT, 1);
  await runSingleBatch(args, limit, maxFailures, language);
}

let runFailed = false;

main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    // Do not exit here: exiting inside catch would skip the cleanup below.
    runFailed = true;
  })
  .finally(async () => {
    try {
      await jobPrisma.$disconnect();
      await pool.end();
    } catch (cleanupError) {
      console.error('Error while closing database connections:', cleanupError);
    }
    // Exit explicitly on failure so a leftover handle cannot keep the process alive.
    if (runFailed) process.exit(1);
  });