Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
300 lines
11 KiB
TypeScript
300 lines
11 KiB
TypeScript
#!/usr/bin/env tsx
|
|
/**
|
|
* Bulk church website scraper
|
|
* Scrapes mass schedules from church websites and updates the database.
|
|
*
|
|
* Usage:
|
|
* npx tsx scripts/scrape-churches.ts --limit 100
|
|
* npx tsx scripts/scrape-churches.ts --limit 50 --max-failures 3
|
|
* npx tsx scripts/scrape-churches.ts --all # Process ALL eligible churches
|
|
* npx tsx scripts/scrape-churches.ts --all --language english
|
|
* npx tsx scripts/scrape-churches.ts --all --max-failures 3
|
|
* npx tsx scripts/scrape-churches.ts --ids id1,id2,id3
|
|
* npx tsx scripts/scrape-churches.ts --all --job-id <uuid> # Resume/track existing job
|
|
*/
|
|
|
|
import dotenv from 'dotenv';
|
|
import path from 'path';
|
|
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
|
|
|
import { Pool } from 'pg';
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
|
import { PrismaClient } from '@prisma/client';
|
|
import { scrapeAllChurches, scrapeChurch, countEligibleChurches } from '../src/lib/scraper-service';
|
|
import type { ScrapeJobResult } from '../src/lib/scraper-service';
|
|
|
|
// Database handles created here for this script: a pg connection pool built
// from DATABASE_URL, wrapped by the Prisma pg adapter, and the Prisma client
// used for the background-job bookkeeping queries below.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const jobPrisma = new PrismaClient({ adapter });

// Flipped to true by the SIGINT/SIGTERM handlers and by a stop request read
// from the job record; the batch loop in main() checks it between batches.
let shuttingDown = false;
|
|
|
|
/**
 * Formats a duration as a short human-readable string.
 *
 * - under a minute:  `"42s"`
 * - under an hour:   `"3m 7s"`
 * - an hour or more: `"2h 15m"` (seconds are dropped)
 *
 * Fractional seconds are truncated in every branch, so a value such as 59.6
 * is reported as `"59s"` and never as the out-of-range `"60s"`. Non-finite or
 * negative inputs are treated as zero instead of producing output such as
 * `"NaNh NaNm"`.
 *
 * @param seconds Duration in seconds; may be fractional.
 * @returns The formatted duration.
 */
function formatDuration(seconds: number): string {
  const whole = Number.isFinite(seconds) && seconds > 0 ? Math.floor(seconds) : 0;

  if (whole < 60) return `${whole}s`;
  if (whole < 3600) return `${Math.floor(whole / 60)}m ${whole % 60}s`;

  const h = Math.floor(whole / 3600);
  const m = Math.floor((whole % 3600) / 60);
  return `${h}h ${m}m`;
}
|
|
|
|
// --- Job Tracking ---
|
|
|
|
/**
 * Resumes tracking of an existing background job when `--job-id <id>` is
 * present on the command line.
 *
 * The referenced job row is updated to status `'running'` with `startedAt`
 * set to the current time.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The job ID taken from `--job-id`, or `null` when the flag is absent.
 * @throws Error when `--job-id` is given without a value (last argument, an
 *   empty string, or directly followed by another `--flag`). Errors from the
 *   Prisma update itself are propagated unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) {
    return null;
  }

  // Validate before touching the database: an undefined or flag-like value
  // would otherwise be sent to Prisma as the record ID.
  const jobId = args[flagIndex + 1];
  if (jobId === undefined || jobId === '' || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job ID value');
  }

  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
|
|
|
/**
 * Inserts a new background-job row of type `'scraper'` in the `'running'`
 * state, stamped with the current time as `startedAt`.
 *
 * @param language Language filter for the run; a falsy value is stored as `'generic'`.
 * @param config Run configuration stored on the job row as-is.
 * @returns The ID of the created job.
 */
async function createNewJob(language: string | null, config: Record<string, unknown>): Promise<string> {
  const jobLanguage = language ? language : 'generic';

  const { id } = await jobPrisma.backgroundJob.create({
    data: {
      type: 'scraper',
      language: jobLanguage,
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });

  return id;
}
|
|
|
|
/**
 * Writes the current progress counters onto a background-job row.
 *
 * @param jobId ID of the job row to update.
 * @param processed Number of items processed so far.
 * @param succeeded Number of processed items that succeeded.
 * @param failed Number of processed items that failed.
 * @param itemsFound Number of items found so far (callers pass the schedule count).
 * @param totalItems Total number of items the job is expected to process.
 */
async function updateJobProgress(jobId: string, processed: number, succeeded: number, failed: number, itemsFound: number, totalItems: number): Promise<void> {
  const counters = { processed, succeeded, failed, itemsFound, totalItems };
  await jobPrisma.backgroundJob.update({ where: { id: jobId }, data: counters });
}
|
|
|
|
/**
 * Reports whether the given job row currently has status `'stopping'`.
 *
 * @param jobId ID of the job row to look up.
 * @returns `true` only when the row exists and its status is `'stopping'`.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const record = await jobPrisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!record) {
    return false;
  }
  return record.status === 'stopping';
}
|
|
|
|
/**
 * Marks a background job as finished and stamps `completedAt` with the
 * current time.
 *
 * The job is recorded as `'failed'` whenever an `error` argument is supplied,
 * including an empty string (e.g. the message of `new Error()`); only an
 * omitted `error` produces `'completed'`.
 *
 * @param jobId ID of the job row to update.
 * @param error Failure message; omit for a successful run.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  // Test for presence, not truthiness: an empty message is still a failure.
  const hasFailed = error !== undefined;

  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      status: hasFailed ? 'failed' : 'completed',
      error,
      completedAt: new Date(),
    },
  });
}
|
|
|
|
/**
 * CLI entry point. Parses the command line and runs one of three modes:
 *
 * - `--ids a,b,c`  scrape exactly the listed churches (no job tracking);
 * - `--all`        loop in batches until no eligible churches remain;
 * - default        one batch of `--limit` churches (default 100).
 *
 * `--ids` takes precedence over `--all`. `--max-failures` (default 5) and
 * `--language` apply to the batch modes. Malformed flag values are rejected
 * with an Error instead of being passed on as `NaN`/`undefined`.
 */
async function main() {
  const args = process.argv.slice(2);

  const maxFailures = readIntFlag(args, '--max-failures', 5, 0);
  const idsValue = readFlagValue(args, '--ids');
  const language = readFlagValue(args, '--language');

  if (idsValue !== null) {
    await runTargetedMode(idsValue, maxFailures);
    return;
  }

  if (args.includes('--all')) {
    await runFullMode(args, maxFailures, language);
    return;
  }

  await runSingleBatchMode(args, maxFailures, language);
}

/** Returns the argument following `flag`, `null` if the flag is absent; throws if its value is missing. */
function readFlagValue(args: readonly string[], flag: string): string | null {
  const index = args.indexOf(flag);
  if (index === -1) {
    return null;
  }
  const value = args[index + 1];
  if (value === undefined || value.startsWith('--')) {
    throw new Error(`Missing value for ${flag}`);
  }
  return value;
}

/** Reads a base-10 integer flag, returning `fallback` when absent; throws on non-numeric or below-`min` values. */
function readIntFlag(args: readonly string[], flag: string, fallback: number, min: number): number {
  const raw = readFlagValue(args, flag);
  if (raw === null) {
    return fallback;
  }
  const value = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
  if (!Number.isSafeInteger(value) || value < min) {
    throw new Error(`Invalid value for ${flag}: "${raw}" (expected an integer >= ${min})`);
  }
  return value;
}

/** Prints the framed start-of-run banner: title, then one line per detail. */
function printBanner(title: string, details: readonly string[]): void {
  console.log('============================================================');
  console.log(title);
  console.log('============================================================');
  for (const line of details) {
    console.log(line);
  }
  console.log('============================================================\n');
}

/** Resumes the job named by `--job-id` or creates a new one, then logs its ID. */
async function resolveJobId(args: string[], language: string | null, config: Record<string, unknown>): Promise<string> {
  const resumed = await createOrResumeJob(args);
  const jobId = resumed || (await createNewJob(language, config));
  console.log(`Job ID: ${jobId}\n`);
  return jobId;
}

/** Records a job failure without letting a bookkeeping error hide the original one. */
async function markJobFailed(jobId: string, error: unknown): Promise<void> {
  const message = error instanceof Error && error.message ? error.message : 'Unknown error';
  try {
    await completeJob(jobId, message);
  } catch (updateError) {
    console.error('Failed to record job failure:', updateError);
  }
}

/** `--ids` mode: scrapes the listed churches and prints a summary. */
async function runTargetedMode(idsValue: string, maxFailures: number): Promise<void> {
  // Drop blanks produced by stray commas ("a,,b" or "a,") so they are not scraped as ''.
  const ids = idsValue
    .split(',')
    .map((id) => id.trim())
    .filter((id) => id.length > 0);
  if (ids.length === 0) {
    throw new Error('--ids requires at least one church ID');
  }

  printBanner('Church Website Scraper — Targeted Mode', [
    `Targeting ${ids.length} specific churches`,
    `Max failures: ${maxFailures}`,
    `Started: ${new Date().toISOString()}`,
  ]);

  const startTime = Date.now();
  // All listed churches are scraped concurrently; one rejection fails the whole run.
  const results = await Promise.all(ids.map((id) => scrapeChurch(id)));
  printSummary(results, startTime);
}

/** `--all` mode: scrapes batch after batch until nothing is eligible or a stop is requested. */
async function runFullMode(args: string[], maxFailures: number, language: string | null): Promise<void> {
  const BATCH_SIZE = 100;
  const totalEligible = await countEligibleChurches(maxFailures);

  printBanner('Church Website Scraper — Full Run', [
    `Language: ${language || 'all'}`,
    `Eligible churches: ${totalEligible.toLocaleString()}`,
    `Batch size: ${BATCH_SIZE}`,
    `Max failures: ${maxFailures}`,
    `Started: ${new Date().toISOString()}`,
  ]);

  if (totalEligible === 0) {
    console.log('No eligible churches to scrape. All done!');
    return;
  }

  const jobId = await resolveJobId(args, language, { allMode: true, maxFailures, language });

  // Graceful shutdown: the first SIGINT/SIGTERM lets the current batch finish,
  // a second SIGINT exits immediately.
  process.on('SIGINT', () => {
    if (shuttingDown) {
      console.log('\nForce quit.');
      process.exit(1);
    }
    console.log('\nShutting down gracefully (finishing current batch)...');
    shuttingDown = true;
  });
  process.on('SIGTERM', () => {
    console.log('\nSIGTERM received, shutting down after current batch...');
    shuttingDown = true;
  });

  const allResults: ScrapeJobResult[] = [];
  const globalStart = Date.now();
  let batchNum = 0;
  let totalSucceeded = 0;
  let totalSchedulesFound = 0;

  try {
    while (!shuttingDown) {
      batchNum++;
      const batchStart = Date.now();

      const batchResults = await scrapeAllChurches({ limit: BATCH_SIZE, maxFailures, language: language || undefined });

      if (batchResults.length === 0) {
        console.log('\nNo more eligible churches. All done!');
        break;
      }

      allResults.push(...batchResults);

      // Batch summary
      const batchElapsed = (Date.now() - batchStart) / 1000;
      const batchSuccess = batchResults.filter((r) => r.success).length;
      const batchSchedules = batchResults.reduce((sum, r) => sum + r.schedulesFound, 0);
      totalSucceeded += batchSuccess;
      totalSchedulesFound += batchSchedules;

      // Overall progress
      const processed = allResults.length;
      const totalElapsed = (Date.now() - globalStart) / 1000;
      const rate = processed / (totalElapsed / 3600);
      const remaining = totalEligible - processed;
      const etaSeconds = remaining > 0 && rate > 0 ? (remaining / rate) * 3600 : 0;

      console.log(`\n--- Batch ${batchNum} (${batchResults.length} churches) ---`);
      console.log(`  Success: ${batchSuccess}/${batchResults.length} | Schedules: ${batchSchedules} | Time: ${formatDuration(batchElapsed)}`);
      console.log(`  Progress: ${processed.toLocaleString()}/${totalEligible.toLocaleString()} (${((processed / totalEligible) * 100).toFixed(1)}%)`);
      console.log(`  Rate: ${rate.toFixed(0)}/hr | ETA: ~${formatDuration(etaSeconds)}`);

      await updateJobProgress(jobId, processed, totalSucceeded, processed - totalSucceeded, totalSchedulesFound, totalEligible);

      // Poll for a stop request after every batch. Gating this on
      // `processed % 10 === 0` would silently disable it for the rest of the
      // run as soon as one batch returns a count that is not a multiple of 10.
      if (!shuttingDown && (await checkJobStopping(jobId))) {
        console.log('\nJob stop requested via admin dashboard.');
        shuttingDown = true;
      }

      if (shuttingDown) {
        console.log('\nGraceful shutdown: batch completed.');
        break;
      }
    }

    await completeJob(jobId);
  } catch (error) {
    await markJobFailed(jobId, error);
    throw error;
  }

  printSummary(allResults, globalStart);
}

/** Default mode: scrapes a single batch of `--limit` churches with job tracking. */
async function runSingleBatchMode(args: string[], maxFailures: number, language: string | null): Promise<void> {
  const limit = readIntFlag(args, '--limit', 100, 1);

  printBanner('Church Website Scraper', [
    `Language: ${language || 'all'}`,
    `Limit: ${limit}`,
    `Max failures: ${maxFailures}`,
    `Started: ${new Date().toISOString()}`,
  ]);

  const jobId = await resolveJobId(args, language, { limit, maxFailures, language });

  const startTime = Date.now();
  try {
    const results = await scrapeAllChurches({ limit, maxFailures, language: language || undefined });
    const succeeded = results.filter((r) => r.success).length;
    const failed = results.length - succeeded;
    const totalSchedules = results.reduce((sum, r) => sum + r.schedulesFound, 0);
    await updateJobProgress(jobId, results.length, succeeded, failed, totalSchedules, limit);
    await completeJob(jobId);
    printSummary(results, startTime);
  } catch (error) {
    await markJobFailed(jobId, error);
    throw error;
  }
}
|
|
|
|
/**
 * Prints the end-of-run report: totals, elapsed time, hourly rate, and the
 * failed churches with their error messages (first 50 only).
 *
 * @param results Results of every church processed in this run.
 * @param startTime Run start as a `Date.now()` timestamp in milliseconds.
 */
function printSummary(results: ScrapeJobResult[], startTime: number) {
  // Cap on individually listed failures, to avoid overwhelming the output.
  const MAX_FAILURES_SHOWN = 50;

  const elapsed = (Date.now() - startTime) / 1000;
  const succeeded = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalSchedules = results.reduce((sum, r) => sum + r.schedulesFound, 0);
  // Guard the division: a run that finishes within the same millisecond would
  // otherwise print "NaN/hr" or "Infinity/hr".
  const rate = elapsed > 0 ? results.length / (elapsed / 3600) : 0;

  console.log('\n============================================================');
  console.log('Scraping Summary');
  console.log('============================================================');
  console.log(`Churches processed: ${results.length.toLocaleString()}`);
  console.log(`Succeeded: ${succeeded.length.toLocaleString()}`);
  console.log(`Failed: ${failed.length.toLocaleString()}`);
  console.log(`Total schedules found: ${totalSchedules.toLocaleString()}`);
  console.log(`Elapsed time: ${formatDuration(elapsed)}`);
  console.log(`Average rate: ${rate.toFixed(0)}/hr`);
  console.log(`Finished: ${new Date().toISOString()}`);
  console.log('============================================================');

  if (failed.length > 0) {
    console.log(`\nFailed churches (${failed.length}):`);
    for (const f of failed.slice(0, MAX_FAILURES_SHOWN)) {
      console.log(`  - ${f.churchName}: ${f.error}`);
    }
    if (failed.length > MAX_FAILURES_SHOWN) {
      console.log(`  ... and ${failed.length - MAX_FAILURES_SHOWN} more`);
    }
  }
}
|
|
|
|
// Script entry point: run main(), always close the database handles, and exit
// with status 1 if anything failed.
//
// process.exit() is deliberately deferred until after cleanup. Calling it from
// a .catch() handler terminates the process on the spot, so a chained
// .finally() that disconnects Prisma and ends the pool would never run on the
// failure path.
void (async () => {
  let failed = false;

  try {
    await main();
  } catch (error) {
    console.error('Fatal error:', error);
    failed = true;
  }

  try {
    await jobPrisma.$disconnect();
    await pool.end();
  } catch (cleanupError) {
    console.error('Failed to close database connections:', cleanupError);
    failed = true;
  }

  if (failed) {
    process.exit(1);
  }
})();
|