chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Albert
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions

View File

@@ -0,0 +1,372 @@
#!/usr/bin/env tsx
/**
* Scrape diocese directories to discover parish URLs and mass schedules
*
* Usage:
* npx tsx scripts/scrape-diocese-directory.ts --diocese <id> # Single diocese
* npx tsx scripts/scrape-diocese-directory.ts --country DE # All dioceses in country
* npx tsx scripts/scrape-diocese-directory.ts --all # All active dioceses
* npx tsx scripts/scrape-diocese-directory.ts --all --dry-run # Preview only
* npx tsx scripts/scrape-diocese-directory.ts --job-id <uuid> # Resume tracked job
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
// Shared PostgreSQL connection pool, configured from the DATABASE_URL environment variable
// (dotenv has already loaded .env from the current working directory at this point).
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
// Prisma driver adapter wrapping the pg pool.
const adapter = new PrismaPg(pool);
// Prisma client used for every database call in this script; it runs on the adapter.
const prisma = new PrismaClient({ adapter });
/** Prints an informational line to stdout, prefixed with the current ISO-8601 timestamp in brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log(`[${stamp}] ${msg}`);
}
/** Prints an error line to stderr as "[<ISO timestamp>] ERROR: <msg>". */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error(`[${stamp}] ERROR: ${msg}`);
}
/**
 * Great-circle distance between two coordinates, in kilometres, using the haversine formula
 * with an Earth radius of 6371 km.
 *
 * @param lat1 - Latitude of the first point, in degrees.
 * @param lon1 - Longitude of the first point, in degrees.
 * @param lat2 - Latitude of the second point, in degrees.
 * @param lon2 - Longitude of the second point, in degrees.
 * @returns Distance in km; NaN only if an input is NaN.
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const R = 6371;
  const dLat = (lat2 - lat1) * Math.PI / 180;
  const dLon = (lon2 - lon1) * Math.PI / 180;
  const a = Math.sin(dLat / 2) ** 2 +
    Math.cos(lat1 * Math.PI / 180) * Math.cos(lat2 * Math.PI / 180) *
    Math.sin(dLon / 2) ** 2;
  // Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal points,
  // and Math.asin returns NaN outside [-1, 1]; clamp so the result stays finite.
  const clamped = Math.min(1, Math.max(0, a));
  return R * 2 * Math.asin(Math.sqrt(clamped));
}
/**
 * Canonicalizes a name for fuzzy comparison: lower-cases it, strips combining
 * diacritics (after NFD decomposition), drops every character that is not a-z, 0-9
 * or whitespace, collapses whitespace runs to a single space and trims the ends.
 */
function normalizeForMatch(str: string): string {
  const lowered = str.toLowerCase();
  // NFD splits accented letters into base letter + combining mark; the marks are then removed.
  const withoutAccents = lowered.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  const alnumOnly = withoutAccents.replace(/[^a-z0-9\s]/g, '');
  const singleSpaced = alnumOnly.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/** An existing church selected as the best name match for a scraped parish (see findMatchingChurch). */
interface MatchCandidate {
// Identifier of the matched church record (copied from church.id).
id: string;
// Name of the matched church as stored in the database.
name: string;
// Coordinates copied from the matched church record.
latitude: number;
longitude: number;
// findMatchingChurch always sets this to 0; no distance is computed there.
distance: number;
// Fraction (0..1) of the scraped name's normalized words of 3+ characters that also occur in the church name.
nameScore: number;
}
/**
 * Finds the existing church whose name best overlaps with a scraped parish name.
 *
 * Up to 50 churches are loaded for the given country (additionally narrowed by a
 * case-insensitive city substring when a city is supplied). Each is scored by the share
 * of the parish's normalized words (3+ characters) that also appear in the church's
 * normalized name. Scores below 0.4 are rejected; among the rest the first candidate
 * with the highest score wins.
 *
 * @param name - Parish name as scraped from the directory.
 * @param address - Not used by the current matching logic.
 * @param city - Optional city used to narrow the candidate query.
 * @param country - Country value the candidate churches must have.
 * @returns The best-scoring candidate, or null when the name has no usable words
 *   or no candidate reaches the threshold.
 */
async function findMatchingChurch(
  name: string,
  address: string | undefined,
  city: string | undefined,
  country: string,
): Promise<MatchCandidate | null> {
  const wantedWords = normalizeForMatch(name).split(' ').filter(word => word.length >= 3);
  if (wantedWords.length === 0) return null;

  // Candidate pool: same country, optionally same city.
  const churches = await prisma.church.findMany({
    where: {
      country,
      ...(city ? { city: { contains: city, mode: 'insensitive' } } : {}),
    },
    select: { id: true, name: true, latitude: true, longitude: true, website: true },
    take: 50,
  });

  let winner: MatchCandidate | null = null;
  for (const church of churches) {
    const presentWords = new Set(
      normalizeForMatch(church.name).split(' ').filter(word => word.length >= 3),
    );
    const hits = wantedWords.filter(word => presentWords.has(word)).length;
    // wantedWords is non-empty here (guarded above), so the division is safe.
    const nameScore = hits / wantedWords.length;

    // Require at least 40% word overlap.
    if (nameScore < 0.4) continue;
    // Keep the earlier candidate on ties.
    if (winner !== null && nameScore <= winner.nameScore) continue;

    winner = {
      id: church.id,
      name: church.name,
      latitude: church.latitude,
      longitude: church.longitude,
      distance: 0,
      nameScore,
    };
  }
  return winner;
}
// --- Job Tracking ---
/**
 * Resumes a tracked background job when `--job-id <id>` is present in the CLI arguments.
 *
 * The referenced job row is marked `running` with a fresh `startedAt`. No job is created
 * here; when the flag is absent the function returns null and the caller decides what to do.
 *
 * @param args - CLI arguments to search for `--job-id`.
 * @returns The resumed job's id, or null when `--job-id` was not passed.
 * @throws Error when `--job-id` is passed without a value (it is the last argument or is
 *   followed by another `--flag`). Errors thrown by the Prisma update propagate unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  // Fail with a usage error instead of sending `undefined` (or another flag) to the database.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Scrapes one diocese's parish directory and reconciles the result with existing churches.
 *
 * The diocese is skipped (with a log line) when it does not exist, has no `directoryUrl`,
 * or has no `scrapeConfig.selectors`. Otherwise every discovered parish is matched against
 * the database via findMatchingChurch. For a match (outside dry-run) the church is linked to
 * the diocese, its website is updated when the parish has a URL, and its mass schedules are
 * replaced when the scrape produced any. Unmatched parishes are only logged and counted.
 *
 * Scrape errors are caught: they are logged, counted in `stats.errors`, and recorded on the
 * diocese row (outside dry-run); they are not rethrown. The scraper is always closed.
 *
 * @param dioceseId - Id of the diocese to scrape.
 * @param dryRun - When true, nothing is written to the database.
 * @param stats - Running counters, mutated in place. `created` counts unmatched parishes;
 *   no church rows are actually created by this function.
 */
async function scrapeDiocese(
  dioceseId: string,
  dryRun: boolean,
  stats: { processed: number; matched: number; created: number; schedules: number; errors: number }
): Promise<void> {
  const diocese = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!diocese) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }
  if (!diocese.directoryUrl) {
    log(` Skipping ${diocese.name}: no directory URL`);
    return;
  }
  const config = diocese.scrapeConfig as DioceseScrapeConfig | null;
  if (!config?.selectors) {
    log(` Skipping ${diocese.name}: no scrape config`);
    return;
  }

  log(`Scraping diocese: ${diocese.name} (${diocese.country})`);
  log(` Directory URL: ${diocese.directoryUrl}`);

  const scraper = new DioceseDirectoryScraper();
  try {
    let parishes;
    if (config.scheduleInDirectory) {
      // The directory page itself carries the schedules.
      parishes = await scraper.scrapeDirectoryWithSchedules(
        diocese.directoryUrl,
        config,
        diocese.language
      );
    } else {
      // Directory only lists parishes; give each an empty schedule so both branches share a shape.
      const discovered = await scraper.scrapeDirectory(diocese.directoryUrl, config);
      parishes = discovered.map(p => ({
        ...p,
        scheduleText: '',
        schedules: [] as Array<{ dayOfWeek: number; time: string; massType?: string; language?: string; notes?: string }>,
      }));
    }
    log(` Discovered ${parishes.length} parishes`);

    for (const parish of parishes) {
      stats.processed++;

      // Try to match to existing church
      const match = await findMatchingChurch(
        parish.name,
        parish.address,
        parish.city,
        diocese.country,
      );

      if (!match) {
        log(` No match: "${parish.name}" (${parish.city || 'no city'})`);
        stats.created++;
        // Unmatched parishes are only logged for manual review: creating churches from
        // directory data without coordinates is risky.
        continue;
      }

      stats.matched++;
      log(` Match: "${parish.name}" -> "${match.name}" (score: ${match.nameScore.toFixed(2)})`);
      if (dryRun) continue;

      // Link the church to the diocese. Only touch the website fields when the directory
      // actually supplied a URL, so a church is never flagged hasWebsite without one.
      const websiteFields = parish.url ? { website: parish.url, hasWebsite: true } : {};
      await prisma.church.update({
        where: { id: match.id },
        data: {
          ...websiteFields,
          dioceseId: diocese.id,
        },
      });

      // Save schedules if available
      if ('schedules' in parish && parish.schedules.length > 0) {
        // Replace atomically: if the insert fails, the delete is rolled back and the
        // church keeps its previous schedules.
        // NOTE(review): the 'English' fallback ignores diocese.language — confirm this is intended.
        await prisma.$transaction([
          prisma.massSchedule.deleteMany({ where: { churchId: match.id } }),
          prisma.massSchedule.createMany({
            data: parish.schedules.map(s => ({
              churchId: match.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              massType: s.massType,
              language: s.language ?? 'English',
              notes: s.notes,
            })),
          }),
        ]);
        stats.schedules += parish.schedules.length;
      }
    }

    // Update diocese tracking
    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastSuccessAt: new Date(),
          churchCount: parishes.length,
          failureCount: 0,
        },
      });
    }
  } catch (err: unknown) {
    stats.errors++;
    const message = err instanceof Error ? err.message : String(err);
    logError(` Failed to scrape ${diocese.name}: ${message}`);
    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastFailureAt: new Date(),
          failureCount: { increment: 1 },
        },
      });
    }
  } finally {
    await scraper.close();
  }
}
/**
 * CLI entry point: parses arguments, selects the dioceses to scrape, runs them one by one
 * while tracking progress on a background-job row, then prints a summary and closes the
 * database connections.
 *
 * Target selection: `--diocese <id>` scrapes that diocese only; otherwise all active
 * dioceses with a directory URL are scraped, filtered by `--country <code>` when given.
 * `--all` is recorded in the job config but does not change the selection.
 *
 * Job tracking: `--job-id <id>` resumes an existing job; otherwise a new job row is created
 * unless `--dry-run` is set. After each diocese the job counters are updated and the loop
 * stops early if the job's status has become `stopping`.
 *
 * @throws Error when `--diocese` or `--country` is given without a value; rethrows any
 *   error raised while selecting or scraping dioceses after marking the job `failed`.
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const all = args.includes('--all');

  // Returns the value following `flag`, or undefined when the flag is absent. A flag that
  // is present without a value is a usage error: silently ignoring it would widen the
  // run to every active diocese.
  const readFlagValue = (flag: string): string | undefined => {
    const idx = args.indexOf(flag);
    if (idx === -1) return undefined;
    const value = args[idx + 1];
    if (!value || value.startsWith('--')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };
  const dioceseId = readFlagValue('--diocese');
  const country = readFlagValue('--country');

  log('============================================================');
  log('Diocese Directory Scraper');
  log('============================================================');
  log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
  log(`Target: ${dioceseId ? `Diocese ${dioceseId}` : country ? `Country ${country}` : 'All active'}`);
  log('============================================================');

  // Job tracking
  let jobId = await createOrResumeJob(args);
  if (!jobId && !dryRun) {
    const job = await prisma.backgroundJob.create({
      data: {
        type: 'diocese-directory',
        status: 'running',
        startedAt: new Date(),
        config: { dioceseId, country, all, dryRun },
      },
    });
    jobId = job.id;
    log(`Job ID: ${jobId}`);
  }

  const stats = { processed: 0, matched: 0, created: 0, schedules: 0, errors: 0 };
  try {
    let dioceses;
    if (dioceseId) {
      dioceses = [{ id: dioceseId }];
    } else {
      dioceses = await prisma.diocese.findMany({
        where: {
          active: true,
          directoryUrl: { not: null },
          ...(country ? { country } : {}),
        },
        select: { id: true, name: true },
        orderBy: { name: 'asc' },
      });
    }
    log(`Found ${dioceses.length} dioceses to scrape`);

    for (const d of dioceses) {
      await scrapeDiocese(d.id, dryRun, stats);

      // Publish progress, then check whether a stop was requested for this job.
      if (jobId) {
        await prisma.backgroundJob.update({
          where: { id: jobId },
          data: { processed: stats.processed, succeeded: stats.matched, itemsFound: stats.matched },
        });
        const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
        if (job?.status === 'stopping') {
          log('Job stop requested.');
          break;
        }
      }
    }
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    logError(`Fatal error: ${message}`);
    if (jobId) {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: { status: 'failed', error: message, completedAt: new Date() },
      });
    }
    throw error;
  }

  // Complete job
  // NOTE(review): a run that stopped early on a 'stopping' request is also marked
  // 'completed' here — confirm that is the intended final status.
  if (jobId) {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        status: 'completed',
        completedAt: new Date(),
        processed: stats.processed,
        succeeded: stats.matched,
        itemsFound: stats.matched,
      },
    });
  }

  log('');
  log('============================================================');
  log('Diocese Directory Scraper Summary');
  log('============================================================');
  log(`Parishes discovered: ${stats.processed}`);
  log(`Matched to DB: ${stats.matched}`);
  log(`Unmatched (new): ${stats.created}`);
  log(`Schedules saved: ${stats.schedules}`);
  log(`Errors: ${stats.errors}`);
  log('============================================================');

  await prisma.$disconnect();
  await pool.end();
}
// Run the CLI; any rejection from main() is logged and turned into exit code 1.
void main().then(undefined, (failure) => {
  logError(`Fatal error: ${failure.message}`);
  process.exit(1);
});