chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
299
scripts/scrape-churches.ts
Normal file
299
scripts/scrape-churches.ts
Normal file
@@ -0,0 +1,299 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Bulk church website scraper
|
||||
* Scrapes mass schedules from church websites and updates the database.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/scrape-churches.ts --limit 100
|
||||
* npx tsx scripts/scrape-churches.ts --limit 50 --max-failures 3
|
||||
* npx tsx scripts/scrape-churches.ts --all # Process ALL eligible churches
|
||||
* npx tsx scripts/scrape-churches.ts --all --language english
|
||||
* npx tsx scripts/scrape-churches.ts --all --max-failures 3
|
||||
* npx tsx scripts/scrape-churches.ts --ids id1,id2,id3
|
||||
* npx tsx scripts/scrape-churches.ts --all --job-id <uuid> # Resume/track existing job
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { scrapeAllChurches, scrapeChurch, countEligibleChurches } from '../src/lib/scraper-service';
|
||||
import type { ScrapeJobResult } from '../src/lib/scraper-service';
|
||||
|
||||
// Database handles owned by this script: a pg connection pool built from
// DATABASE_URL, wrapped in the Prisma pg adapter, backing a dedicated
// Prisma client that the job-tracking helpers below use.
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
});
const adapter = new PrismaPg(pool);
const jobPrisma = new PrismaClient({ adapter });

// Set to true when a shutdown signal or a dashboard stop request arrives;
// the --all batch loop checks it between batches.
let shuttingDown = false;
|
||||
|
||||
/**
 * Formats a duration in seconds as a short human-readable string.
 *
 * - under a minute: `"42s"`
 * - under an hour:  `"3m 7s"`
 * - otherwise:      `"2h 15m"` (seconds are dropped)
 *
 * The value is rounded to whole seconds *before* the unit is chosen, so an
 * input such as 59.6 is rendered as `"1m 0s"` rather than `"60s"`.
 * Non-finite or negative inputs are rendered as `"0s"`.
 *
 * @param seconds duration in seconds (may be fractional)
 * @returns the formatted duration
 */
function formatDuration(seconds: number): string {
  const total = Number.isFinite(seconds) && seconds > 0 ? Math.round(seconds) : 0;

  if (total < 60) return `${total}s`;
  if (total < 3600) return `${Math.floor(total / 60)}m ${total % 60}s`;

  const hours = Math.floor(total / 3600);
  const minutes = Math.floor((total % 3600) / 60);
  return `${hours}h ${minutes}m`;
}
|
||||
|
||||
// --- Job Tracking ---
|
||||
|
||||
/**
 * Resumes an existing background job when `--job-id <id>` is present in `args`.
 *
 * The job row is switched to status `running` and its `startedAt` is reset to
 * the current time. Despite the name, this function never creates a job: when
 * the flag is absent it returns `null` and the caller is expected to create one.
 *
 * @param args command-line arguments (without the node/script prefix)
 * @returns the resumed job id, or `null` when `--job-id` was not passed
 * @throws Error when `--job-id` is not followed by a value
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) {
    return null;
  }

  const jobId = args[jobIdIndex + 1];
  // Guard against `--job-id` being the last argument or being followed by
  // another flag; otherwise `undefined` / a flag name would be used as the id.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Inserts a new background job of type `scraper` that is already marked as
 * running, with `startedAt` set to the current time.
 *
 * @param language language filter of the run; stored as `'generic'` when falsy
 * @param config   run configuration, stored on the job row unchanged
 * @returns the id of the created job
 */
async function createNewJob(language: string | null, config: Record<string, unknown>): Promise<string> {
  const jobLanguage = language ? language : 'generic';

  const { id } = await jobPrisma.backgroundJob.create({
    data: {
      type: 'scraper',
      language: jobLanguage,
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });

  return id;
}
|
||||
|
||||
/**
 * Writes the current progress counters onto a background job row.
 *
 * @param jobId      id of the job to update
 * @param processed  value stored in the job's `processed` column
 * @param succeeded  value stored in the job's `succeeded` column
 * @param failed     value stored in the job's `failed` column
 * @param itemsFound value stored in the job's `itemsFound` column
 * @param totalItems value stored in the job's `totalItems` column
 */
async function updateJobProgress(jobId: string, processed: number, succeeded: number, failed: number, itemsFound: number, totalItems: number): Promise<void> {
  const counters = { processed, succeeded, failed, itemsFound, totalItems };
  await jobPrisma.backgroundJob.update({ where: { id: jobId }, data: counters });
}
|
||||
|
||||
/**
 * Reports whether the given job's status is currently `stopping`.
 *
 * @param jobId id of the job to look up
 * @returns `true` only when the job exists and its status is `'stopping'`
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const record = await jobPrisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!record) {
    return false;
  }
  return record.status === 'stopping';
}
|
||||
|
||||
/**
 * Finalizes a background job: stamps `completedAt` with the current time and
 * sets the status to `failed` when an error message is supplied, otherwise
 * `completed`.
 *
 * @param jobId id of the job to finalize
 * @param error optional failure message; a missing or empty string counts as success
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const finalStatus = error ? 'failed' : 'completed';

  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: finalStatus, error, completedAt: new Date() },
  });
}
|
||||
|
||||
/**
 * CLI entry point. Parses the command-line flags and dispatches to one of
 * three modes, in this order of precedence:
 *
 * 1. `--ids a,b,c` — scrape exactly the listed churches (no job tracking)
 * 2. `--all`       — loop over batches until no eligible church is left
 * 3. default       — a single batch of `--limit` churches (default 100)
 *
 * `--max-failures` (default 5) and `--language` apply to modes 2 and 3;
 * `--job-id` resumes an existing job row instead of creating a new one.
 *
 * @throws Error when a flag is missing its value or a numeric flag is invalid;
 *         scraping/database errors propagate after the job is marked failed
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const maxFailures = readIntFlag(args, '--max-failures', 5);
  const language = readFlagValue(args, '--language');
  const idsArg = readFlagValue(args, '--ids');

  if (idsArg !== null) {
    await runTargetedMode(idsArg, maxFailures);
    return;
  }

  if (args.includes('--all')) {
    await runFullMode(args, language, maxFailures);
    return;
  }

  await runSingleBatchMode(args, language, maxFailures);
}

/** Returns the token following `flag`, or `null` when the flag is absent; throws when the value is missing. */
function readFlagValue(args: string[], flag: string): string | null {
  const index = args.indexOf(flag);
  if (index === -1) {
    return null;
  }
  const value = args[index + 1];
  if (value === undefined || value.startsWith('--')) {
    throw new Error(`Missing value for ${flag}`);
  }
  return value;
}

/** Reads `flag` as a non-negative base-10 integer, returning `fallback` when the flag is absent. */
function readIntFlag(args: string[], flag: string, fallback: number): number {
  const raw = readFlagValue(args, flag);
  if (raw === null) {
    return fallback;
  }
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed) || parsed < 0) {
    throw new Error(`${flag} expects a non-negative integer, got "${raw}"`);
  }
  return parsed;
}

/** Prints the framed run header: title between rules, the detail lines, then a closing rule and blank line. */
function printHeader(title: string, details: string[]): void {
  const rule = '============================================================';
  console.log(rule);
  console.log(title);
  console.log(rule);
  for (const line of details) {
    console.log(line);
  }
  console.log(`${rule}\n`);
}

/** Marks the job as failed without letting a failing status update mask the original error. */
async function markJobFailed(jobId: string, error: unknown): Promise<void> {
  const message = error instanceof Error ? error.message : 'Unknown error';
  try {
    await completeJob(jobId, message);
  } catch (updateError) {
    console.error('Failed to record job failure:', updateError);
  }
}

/** `--ids` mode: scrapes the comma-separated church ids concurrently and prints a summary. */
async function runTargetedMode(idsArg: string, maxFailures: number): Promise<void> {
  // Drop empty entries produced by stray commas/whitespace so they are not scraped as ids.
  const ids = idsArg
    .split(',')
    .map((id) => id.trim())
    .filter((id) => id.length > 0);
  if (ids.length === 0) {
    throw new Error('--ids requires at least one church id');
  }

  printHeader('Church Website Scraper — Targeted Mode', [
    `Targeting ${ids.length} specific churches`,
    `Max failures: ${maxFailures}`,
    `Started: ${new Date().toISOString()}`,
  ]);

  const startTime = Date.now();
  const results = await Promise.all(ids.map((id) => scrapeChurch(id)));
  printSummary(results, startTime);
}

/** `--all` mode: processes eligible churches in batches of 100 with job tracking and graceful shutdown. */
async function runFullMode(args: string[], language: string | null, maxFailures: number): Promise<void> {
  const BATCH_SIZE = 100;
  // NOTE(review): the eligible count is requested without the language filter,
  // so progress/ETA may be off when --language is used — confirm against
  // countEligibleChurches.
  const totalEligible = await countEligibleChurches(maxFailures);

  printHeader('Church Website Scraper — Full Run', [
    `Language: ${language || 'all'}`,
    `Eligible churches: ${totalEligible.toLocaleString()}`,
    `Batch size: ${BATCH_SIZE}`,
    `Max failures: ${maxFailures}`,
    `Started: ${new Date().toISOString()}`,
  ]);

  if (totalEligible === 0) {
    console.log('No eligible churches to scrape. All done!');
    return;
  }

  // Job tracking: resume the job named by --job-id, otherwise create a new one.
  const jobId =
    (await createOrResumeJob(args)) ||
    (await createNewJob(language, { allMode: true, maxFailures, language }));
  console.log(`Job ID: ${jobId}\n`);

  // Graceful shutdown: the first SIGINT lets the current batch finish, a second one force-quits.
  process.on('SIGINT', () => {
    if (shuttingDown) {
      console.log('\nForce quit.');
      process.exit(1);
    }
    console.log('\nShutting down gracefully (finishing current batch)...');
    shuttingDown = true;
  });
  process.on('SIGTERM', () => {
    console.log('\nSIGTERM received, shutting down after current batch...');
    shuttingDown = true;
  });

  const allResults: ScrapeJobResult[] = [];
  const globalStart = Date.now();
  let batchNum = 0;
  let totalSucceeded = 0;
  let totalSchedulesFound = 0;

  try {
    while (!shuttingDown) {
      batchNum++;
      const batchStart = Date.now();

      const batchResults = await scrapeAllChurches({ limit: BATCH_SIZE, maxFailures, language: language || undefined });

      if (batchResults.length === 0) {
        console.log('\nNo more eligible churches. All done!');
        break;
      }

      allResults.push(...batchResults);

      // Batch summary
      const batchElapsed = (Date.now() - batchStart) / 1000;
      const batchSuccess = batchResults.filter((r) => r.success).length;
      const batchSchedules = batchResults.reduce((sum, r) => sum + r.schedulesFound, 0);
      totalSucceeded += batchSuccess;
      totalSchedulesFound += batchSchedules;

      // Overall progress
      const totalElapsed = (Date.now() - globalStart) / 1000;
      const rate = allResults.length / (totalElapsed / 3600);
      const remaining = totalEligible - allResults.length;
      const etaSeconds = remaining > 0 && rate > 0 ? (remaining / rate) * 3600 : 0;

      console.log(`\n--- Batch ${batchNum} (${batchResults.length} churches) ---`);
      console.log(` Success: ${batchSuccess}/${batchResults.length} | Schedules: ${batchSchedules} | Time: ${formatDuration(batchElapsed)}`);
      console.log(` Progress: ${allResults.length.toLocaleString()}/${totalEligible.toLocaleString()} (${((allResults.length / totalEligible) * 100).toFixed(1)}%)`);
      console.log(` Rate: ${rate.toFixed(0)}/hr | ETA: ~${formatDuration(etaSeconds)}`);

      // Update job progress from running counters (no re-scan of all results).
      await updateJobProgress(
        jobId,
        allResults.length,
        totalSucceeded,
        allResults.length - totalSucceeded,
        totalSchedulesFound,
        totalEligible,
      );

      // Poll for a dashboard stop request after every batch. Gating this on
      // "processed count is a multiple of 10" would silently disable the check
      // for the rest of the run as soon as one batch returned an odd size.
      if (!shuttingDown && (await checkJobStopping(jobId))) {
        console.log('\nJob stop requested via admin dashboard.');
        shuttingDown = true;
      }

      if (shuttingDown) {
        console.log('\nGraceful shutdown: batch completed.');
        break;
      }
    }

    // NOTE(review): a run ended early by a signal or stop request is still
    // recorded as 'completed'; completeJob only distinguishes failed/completed.
    await completeJob(jobId);
  } catch (error) {
    await markJobFailed(jobId, error);
    throw error;
  }

  printSummary(allResults, globalStart);
}

/** Default mode: scrapes a single batch of `--limit` churches (default 100) with job tracking. */
async function runSingleBatchMode(args: string[], language: string | null, maxFailures: number): Promise<void> {
  const limit = readIntFlag(args, '--limit', 100);

  printHeader('Church Website Scraper', [
    `Language: ${language || 'all'}`,
    `Limit: ${limit}`,
    `Max failures: ${maxFailures}`,
    `Started: ${new Date().toISOString()}`,
  ]);

  // Job tracking for single batch mode too
  const jobId =
    (await createOrResumeJob(args)) ||
    (await createNewJob(language, { limit, maxFailures, language }));
  console.log(`Job ID: ${jobId}\n`);

  const startTime = Date.now();
  let results: ScrapeJobResult[];
  try {
    results = await scrapeAllChurches({ limit, maxFailures, language: language || undefined });
    const succeeded = results.filter((r) => r.success).length;
    const totalSchedules = results.reduce((sum, r) => sum + r.schedulesFound, 0);
    await updateJobProgress(jobId, results.length, succeeded, results.length - succeeded, totalSchedules, limit);
    await completeJob(jobId);
  } catch (error) {
    await markJobFailed(jobId, error);
    throw error;
  }

  printSummary(results, startTime);
}
|
||||
|
||||
/**
 * Prints the end-of-run report: totals, elapsed time, hourly rate and a list
 * of failed churches (at most the first 50, followed by a count of the rest).
 *
 * @param results   every scrape result collected during the run
 * @param startTime epoch milliseconds at which the run started
 */
function printSummary(results: ScrapeJobResult[], startTime: number) {
  const elapsedSeconds = (Date.now() - startTime) / 1000;

  // Single pass: tally successes and schedules, collect failures in order.
  const failures: ScrapeJobResult[] = [];
  let successCount = 0;
  let scheduleTotal = 0;
  for (const result of results) {
    scheduleTotal += result.schedulesFound;
    if (result.success) {
      successCount++;
    } else {
      failures.push(result);
    }
  }

  const perHour = results.length / (elapsedSeconds / 3600);

  console.log('\n============================================================');
  console.log('Scraping Summary');
  console.log('============================================================');
  console.log(`Churches processed:    ${results.length.toLocaleString()}`);
  console.log(`Succeeded:             ${successCount.toLocaleString()}`);
  console.log(`Failed:                ${failures.length.toLocaleString()}`);
  console.log(`Total schedules found: ${scheduleTotal.toLocaleString()}`);
  console.log(`Elapsed time:          ${formatDuration(elapsedSeconds)}`);
  console.log(`Average rate:          ${perHour.toFixed(0)}/hr`);
  console.log(`Finished:              ${new Date().toISOString()}`);
  console.log('============================================================');

  if (failures.length === 0) {
    return;
  }

  console.log(`\nFailed churches (${failures.length}):`);

  // Cap the listing so a large failed run does not flood the terminal.
  const maxListed = 50;
  failures.slice(0, maxListed).forEach((failure) => {
    console.log(` - ${failure.churchName}: ${failure.error}`);
  });

  const hidden = failures.length - maxListed;
  if (hidden > 0) {
    console.log(` ... and ${hidden} more`);
  }
}
|
||||
|
||||
// Run the CLI. On failure the error is logged and the exit code is set to 1,
// but the process is NOT terminated inside `.catch`: `process.exit()` ends the
// process synchronously, which would skip the `.finally` cleanup below and
// leave the Prisma client and pg pool unclosed.
main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    try {
      await jobPrisma.$disconnect();
      await pool.end();
    } catch (cleanupError) {
      console.error('Error while closing database connections:', cleanupError);
      process.exitCode = 1;
    }
    // After a failure, exit explicitly once cleanup has run so that a
    // lingering handle cannot keep a failed run alive.
    if (process.exitCode === 1) {
      process.exit(1);
    }
  });
|
||||
Reference in New Issue
Block a user