chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Albert
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions

299
scripts/scrape-churches.ts Normal file
View File

@@ -0,0 +1,299 @@
#!/usr/bin/env tsx
/**
* Bulk church website scraper
* Scrapes mass schedules from church websites and updates the database.
*
* Usage:
* npx tsx scripts/scrape-churches.ts --limit 100
* npx tsx scripts/scrape-churches.ts --limit 50 --max-failures 3
* npx tsx scripts/scrape-churches.ts --all # Process ALL eligible churches
* npx tsx scripts/scrape-churches.ts --all --language english
* npx tsx scripts/scrape-churches.ts --all --max-failures 3
* npx tsx scripts/scrape-churches.ts --ids id1,id2,id3
* npx tsx scripts/scrape-churches.ts --all --job-id <uuid> # Resume/track existing job
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { scrapeAllChurches, scrapeChurch, countEligibleChurches } from '../src/lib/scraper-service';
import type { ScrapeJobResult } from '../src/lib/scraper-service';
// Fresh DB connection for scripts.
// A dedicated pg Pool is built from DATABASE_URL (the .env file is loaded by
// the dotenv.config call above) and wrapped in the Prisma pg driver adapter.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
// Prisma client used by the job-tracking helpers below to read and write
// backgroundJob rows. It is disconnected at the very end of the script.
const jobPrisma = new PrismaClient({ adapter });
// Cooperative shutdown flag: main() sets it from its SIGINT/SIGTERM handlers
// and when a stop is requested on the job row; the --all batch loop checks it
// between batches.
let shuttingDown = false;
/**
 * Renders a duration given in seconds as a short human-readable string.
 *
 * - under a minute: `"42s"`
 * - under an hour:  `"3m 7s"`
 * - otherwise:      `"2h 15m"` (seconds are dropped)
 *
 * Every unit is truncated, never rounded, so the result cannot overflow its
 * own unit (59.6 seconds is `"59s"`, not `"60s"`).
 *
 * @param seconds Duration in seconds; may be fractional.
 * @returns The formatted duration.
 */
function formatDuration(seconds: number): string {
  // Floor here as well: rounding would turn 59.5..59.99 into "60s".
  if (seconds < 60) return `${Math.floor(seconds)}s`;
  if (seconds < 3600) return `${Math.floor(seconds / 60)}m ${Math.floor(seconds % 60)}s`;
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  return `${hours}h ${minutes}m`;
}
// --- Job Tracking ---
/**
 * Attaches this run to an existing background job when `--job-id <id>` is
 * present in `args`.
 *
 * The referenced job row is updated to status `'running'` and its `startedAt`
 * is set to the current time. Despite the name, this function never creates a
 * job; callers fall back to `createNewJob` when it returns `null`.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The job id taken from the command line, or `null` when `--job-id`
 *          was not passed.
 * @throws Error when `--job-id` is present but is not followed by a value
 *         (it is the last argument, or the next token is another `--flag`).
 *         Errors from the database update propagate unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) return null;

  const jobId = args[flagIndex + 1];
  // Fail fast with a readable message instead of handing `undefined` (or the
  // next flag's name) to the database as a job id.
  if (jobId === undefined || jobId === '' || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Inserts a new `scraper` background job that is already in the `running`
 * state, with `startedAt` set to the current time.
 *
 * @param language Language filter for the run; a falsy value is stored as
 *                 `'generic'`.
 * @param config   Run options stored on the job row as-is.
 * @returns The id of the created job.
 */
async function createNewJob(language: string | null, config: Record<string, unknown>): Promise<string> {
  const { id } = await jobPrisma.backgroundJob.create({
    data: {
      type: 'scraper',
      status: 'running',
      language: language || 'generic',
      config,
      startedAt: new Date(),
    },
  });
  return id;
}
/**
 * Writes the current progress counters onto a background job row.
 *
 * @param jobId      Id of the job to update.
 * @param processed  Number of items handled so far.
 * @param succeeded  Number of items that succeeded.
 * @param failed     Number of items that failed.
 * @param itemsFound Running total stored in the job's `itemsFound` column.
 * @param totalItems Total stored in the job's `totalItems` column.
 */
async function updateJobProgress(
  jobId: string,
  processed: number,
  succeeded: number,
  failed: number,
  itemsFound: number,
  totalItems: number,
): Promise<void> {
  const counters = { processed, succeeded, failed, itemsFound, totalItems };
  await jobPrisma.backgroundJob.update({ where: { id: jobId }, data: counters });
}
/**
 * Reports whether a stop has been requested for the given job.
 *
 * @param jobId Id of the job to look up.
 * @returns `true` only when the job row exists and its status is
 *          `'stopping'`; `false` otherwise, including when no row is found.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const record = await jobPrisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!record) {
    return false;
  }
  return record.status === 'stopping';
}
/**
 * Finalizes a background job by stamping `completedAt` with the current time
 * and recording the outcome.
 *
 * @param jobId Id of the job to finalize.
 * @param error Optional failure message. A non-empty message marks the job
 *              `'failed'`; otherwise the job is marked `'completed'`. The
 *              value is written to the row's `error` field either way.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const status = error ? 'failed' : 'completed';
  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status, error, completedAt: new Date() },
  });
}
/** Horizontal rule that frames the console banners printed by `printBanner`. */
const BANNER_RULE = '============================================================';

/**
 * Returns the token that follows `flag` in `args`, or `null` when the flag is
 * absent.
 *
 * @throws Error when the flag is present but has no value (it is the last
 *         argument, or the next token is another `--flag`).
 */
function readFlagValue(args: readonly string[], flag: string): string | null {
  const index = args.indexOf(flag);
  if (index === -1) return null;
  const value = args[index + 1];
  if (value === undefined || value.startsWith('--')) {
    throw new Error(`${flag} requires a value`);
  }
  return value;
}

/**
 * Reads a non-negative integer option, returning `fallback` when the flag is
 * absent.
 *
 * @throws Error when the value is missing, not a number, or negative.
 */
function readCountFlag(args: readonly string[], flag: string, fallback: number): number {
  const raw = readFlagValue(args, flag);
  if (raw === null) return fallback;
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed) || parsed < 0) {
    throw new Error(`${flag} expects a non-negative integer, got "${raw}"`);
  }
  return parsed;
}

/** Prints the framed start-of-run banner: title, detail lines, start timestamp. */
function printBanner(title: string, details: readonly string[]): void {
  console.log(BANNER_RULE);
  console.log(title);
  console.log(BANNER_RULE);
  for (const line of details) {
    console.log(line);
  }
  console.log(`Started: ${new Date().toISOString()}`);
  console.log(`${BANNER_RULE}\n`);
}

/**
 * Records a failure on the job row without letting a failing bookkeeping
 * write replace the original error that the caller is about to rethrow.
 */
async function markJobFailed(jobId: string, error: unknown): Promise<void> {
  const message = error instanceof Error ? error.message : 'Unknown error';
  try {
    await completeJob(jobId, message);
  } catch (trackingError) {
    console.error('Failed to record job failure:', trackingError);
  }
}

/**
 * CLI entry point. Parses `process.argv` and runs one of three modes:
 *
 * - `--ids a,b,c`: scrape exactly the listed churches (all started
 *   concurrently); no job row is tracked in this mode.
 * - `--all`: loop over eligible churches in batches of 100 until none
 *   are returned, a SIGINT/SIGTERM arrives, or the job row is
 *   switched to `'stopping'`. Progress is written to the job
 *   row after every batch.
 * - default: a single batch of `--limit` churches (default 100).
 *
 * Shared options: `--max-failures <n>` (default 5), `--language <name>`,
 * `--job-id <id>` (attach to an existing job row instead of creating one).
 *
 * NOTE(review): a run that ends early through SIGINT/SIGTERM or a stop request
 * is still recorded via `completeJob(jobId)` with no error, i.e. as completed.
 * Confirm that this is what the dashboard expects.
 *
 * @throws Error on malformed options (missing or non-numeric values); errors
 *         from the scraper or the database are rethrown after the job row has
 *         been marked failed.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const allMode = args.includes('--all');
  const maxFailures = readCountFlag(args, '--max-failures', 5);
  const language = readFlagValue(args, '--language');
  const idsArg = readFlagValue(args, '--ids');

  // --ids mode: scrape specific churches
  if (idsArg !== null) {
    // Drop blanks produced by stray commas ("a,,b" or a trailing ",").
    const ids = idsArg
      .split(',')
      .map((id) => id.trim())
      .filter((id) => id.length > 0);
    if (ids.length === 0) {
      throw new Error('--ids requires at least one church id');
    }

    printBanner('Church Website Scraper — Targeted Mode', [
      `Targeting ${ids.length} specific churches`,
      `Max failures: ${maxFailures}`,
    ]);

    const startTime = Date.now();
    const results = await Promise.all(ids.map((id) => scrapeChurch(id)));
    printSummary(results, startTime);
    return;
  }

  // --all mode: batch loop through ALL eligible churches
  if (allMode) {
    const BATCH_SIZE = 100;
    const totalEligible = await countEligibleChurches(maxFailures);

    printBanner('Church Website Scraper — Full Run', [
      `Language: ${language || 'all'}`,
      `Eligible churches: ${totalEligible.toLocaleString()}`,
      `Batch size: ${BATCH_SIZE}`,
      `Max failures: ${maxFailures}`,
    ]);

    if (totalEligible === 0) {
      console.log('No eligible churches to scrape. All done!');
      return;
    }

    // Job tracking: attach to the job named by --job-id, otherwise create one.
    const jobId =
      (await createOrResumeJob(args)) ??
      (await createNewJob(language, { allMode: true, maxFailures, language }));
    console.log(`Job ID: ${jobId}\n`);

    // Graceful shutdown handlers: the first signal lets the current batch
    // finish; a second SIGINT exits immediately.
    process.on('SIGINT', () => {
      if (shuttingDown) {
        console.log('\nForce quit.');
        process.exit(1);
      }
      console.log('\nShutting down gracefully (finishing current batch)...');
      shuttingDown = true;
    });
    process.on('SIGTERM', () => {
      console.log('\nSIGTERM received, shutting down after current batch...');
      shuttingDown = true;
    });

    const allResults: ScrapeJobResult[] = [];
    const globalStart = Date.now();
    let batchNum = 0;
    let totalSucceeded = 0;
    let totalSchedulesFound = 0;

    try {
      while (!shuttingDown) {
        batchNum++;
        const batchStart = Date.now();
        const batchResults = await scrapeAllChurches({
          limit: BATCH_SIZE,
          maxFailures,
          language: language || undefined,
        });
        if (batchResults.length === 0) {
          console.log('\nNo more eligible churches. All done!');
          break;
        }
        allResults.push(...batchResults);

        // Batch summary
        const batchElapsed = (Date.now() - batchStart) / 1000;
        const batchSuccess = batchResults.filter((r) => r.success).length;
        const batchSchedules = batchResults.reduce((sum, r) => sum + r.schedulesFound, 0);
        totalSucceeded += batchSuccess;
        totalSchedulesFound += batchSchedules;

        // Overall progress
        const totalElapsed = (Date.now() - globalStart) / 1000;
        const rate = totalElapsed > 0 ? allResults.length / (totalElapsed / 3600) : 0;
        const remaining = totalEligible - allResults.length;
        const etaSeconds = remaining > 0 && rate > 0 ? (remaining / rate) * 3600 : 0;

        console.log(`\n--- Batch ${batchNum} (${batchResults.length} churches) ---`);
        console.log(` Success: ${batchSuccess}/${batchResults.length} | Schedules: ${batchSchedules} | Time: ${formatDuration(batchElapsed)}`);
        console.log(` Progress: ${allResults.length.toLocaleString()}/${totalEligible.toLocaleString()} (${((allResults.length / totalEligible) * 100).toFixed(1)}%)`);
        console.log(` Rate: ${rate.toFixed(0)}/hr | ETA: ~${formatDuration(etaSeconds)}`);

        // Update job progress
        await updateJobProgress(
          jobId,
          allResults.length,
          totalSucceeded,
          allResults.length - totalSucceeded,
          totalSchedulesFound,
          totalEligible,
        );

        // Poll for a stop request after every batch. Gating this on the
        // running total being a multiple of 10 would silently skip the check
        // for the rest of the run once any batch has an odd size.
        if (!shuttingDown && (await checkJobStopping(jobId))) {
          console.log('\nJob stop requested via admin dashboard.');
          shuttingDown = true;
        }

        if (shuttingDown) {
          console.log('\nGraceful shutdown: batch completed.');
          break;
        }
      }
      await completeJob(jobId);
    } catch (error) {
      await markJobFailed(jobId, error);
      throw error;
    }

    printSummary(allResults, globalStart);
    return;
  }

  // Default mode: single batch with --limit
  const limit = readCountFlag(args, '--limit', 100);

  printBanner('Church Website Scraper', [
    `Language: ${language || 'all'}`,
    `Limit: ${limit}`,
    `Max failures: ${maxFailures}`,
  ]);

  // Job tracking for single batch mode too
  const jobId =
    (await createOrResumeJob(args)) ??
    (await createNewJob(language, { limit, maxFailures, language }));
  console.log(`Job ID: ${jobId}\n`);

  const startTime = Date.now();
  try {
    const results = await scrapeAllChurches({ limit, maxFailures, language: language || undefined });
    const succeeded = results.filter((r) => r.success).length;
    const failed = results.length - succeeded;
    const totalSchedules = results.reduce((sum, r) => sum + r.schedulesFound, 0);
    await updateJobProgress(jobId, results.length, succeeded, failed, totalSchedules, limit);
    await completeJob(jobId);
    printSummary(results, startTime);
  } catch (error) {
    await markJobFailed(jobId, error);
    throw error;
  }
}
/**
 * Prints the end-of-run report: totals, elapsed time, average hourly rate and
 * up to the first 50 failed churches with their error messages.
 *
 * @param results   Every scrape result collected during the run.
 * @param startTime Epoch milliseconds (`Date.now()`) at which the run began.
 */
function printSummary(results: ScrapeJobResult[], startTime: number): void {
  // Cap the failure listing so a bad run does not flood the terminal.
  const maxFailuresShown = 50;

  const elapsed = (Date.now() - startTime) / 1000;
  const failed = results.filter((r) => !r.success);
  const succeededCount = results.length - failed.length;
  const totalSchedules = results.reduce((sum, r) => sum + r.schedulesFound, 0);
  // With no measurable elapsed time the division would yield NaN or Infinity;
  // report 0/hr instead.
  const rate = elapsed > 0 ? results.length / (elapsed / 3600) : 0;

  console.log('\n============================================================');
  console.log('Scraping Summary');
  console.log('============================================================');
  console.log(`Churches processed: ${results.length.toLocaleString()}`);
  console.log(`Succeeded: ${succeededCount.toLocaleString()}`);
  console.log(`Failed: ${failed.length.toLocaleString()}`);
  console.log(`Total schedules found: ${totalSchedules.toLocaleString()}`);
  console.log(`Elapsed time: ${formatDuration(elapsed)}`);
  console.log(`Average rate: ${rate.toFixed(0)}/hr`);
  console.log(`Finished: ${new Date().toISOString()}`);
  console.log('============================================================');

  if (failed.length === 0) return;

  console.log(`\nFailed churches (${failed.length}):`);
  for (const failure of failed.slice(0, maxFailuresShown)) {
    // A failure without a message would otherwise print as "undefined".
    console.log(` - ${failure.churchName}: ${failure.error ?? 'Unknown error'}`);
  }
  if (failed.length > maxFailuresShown) {
    console.log(` ... and ${failed.length - maxFailuresShown} more`);
  }
}
// Script entry point: run main(), report any fatal error, then always release
// the database resources before the process ends.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    // Only record the failure here. Calling process.exit() at this point
    // would end the process before the cleanup in .finally() could run.
    process.exitCode = 1;
  })
  .finally(async () => {
    try {
      await jobPrisma.$disconnect();
      await pool.end();
    } catch (cleanupError) {
      console.error('Cleanup error:', cleanupError);
      process.exitCode = 1;
    }
    // After a failure, exit explicitly (using the exit code recorded above) so
    // that work still in flight cannot keep the process alive.
    if (process.exitCode) {
      process.exit();
    }
  });