chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
372
scripts/scrape-diocese-directory.ts
Normal file
372
scripts/scrape-diocese-directory.ts
Normal file
@@ -0,0 +1,372 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Scrape diocese directories to discover parish URLs and mass schedules
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --diocese <id> # Single diocese
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --country DE # All dioceses in country
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --all # All active dioceses
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --all --dry-run # Preview only
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --job-id <uuid> # Resume tracked job
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
|
||||
|
||||
// Shared database handles used by every function in this script: a `pg`
// connection pool, wrapped by Prisma's pg driver adapter, handed to the
// Prisma client. The connection string comes from the DATABASE_URL
// environment variable.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Writes an informational line to stdout, prefixed with the current time
 * as an ISO-8601 timestamp in square brackets.
 *
 * @param msg Text to print after the timestamp.
 */
function log(msg: string): void {
  const timestamp = new Date().toISOString();
  console.log('[' + timestamp + '] ' + msg);
}
|
||||
|
||||
/**
 * Writes an error line to stderr, prefixed with the current time as an
 * ISO-8601 timestamp in square brackets followed by `ERROR:`.
 *
 * @param msg Text to print after the `ERROR:` marker.
 */
function logError(msg: string): void {
  const timestamp = new Date().toISOString();
  console.error('[' + timestamp + '] ERROR: ' + msg);
}
|
||||
|
||||
/**
 * Great-circle distance between two points in kilometres, computed with the
 * haversine formula on a sphere of radius 6371 km.
 *
 * @param lat1 Latitude of the first point, in degrees.
 * @param lon1 Longitude of the first point, in degrees.
 * @param lat2 Latitude of the second point, in degrees.
 * @param lon2 Longitude of the second point, in degrees.
 * @returns Distance in kilometres (never NaN for finite inputs).
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const EARTH_RADIUS_KM = 6371;
  const toRad = (deg: number): number => deg * Math.PI / 180;

  const dLat = toRad(lat2 - lat1);
  const dLon = toRad(lon2 - lon1);
  const a = Math.sin(dLat / 2) ** 2 +
    Math.cos(toRad(lat1)) * Math.cos(toRad(lat2)) *
    Math.sin(dLon / 2) ** 2;

  // Floating-point rounding can push `a` marginally outside [0, 1] for
  // (near-)antipodal points; asin(sqrt(a)) would then be NaN, so clamp first.
  const clamped = Math.min(1, Math.max(0, a));
  return EARTH_RADIUS_KM * 2 * Math.asin(Math.sqrt(clamped));
}
|
||||
|
||||
/**
 * Canonicalises a name for fuzzy comparison: lower-cases it, strips Latin
 * diacritics, removes punctuation and symbols, and collapses whitespace.
 *
 * Letters and digits of every script are kept (not only ASCII a-z / 0-9), so
 * names written in non-Latin scripts no longer normalise to an empty string.
 *
 * @param str Raw name.
 * @returns Normalised, single-spaced, trimmed string.
 */
function normalizeForMatch(str: string): string {
  return str
    .toLowerCase()
    // Decompose accented characters, then drop the combining diacritical
    // marks (U+0300-U+036F) so that "é" compares equal to "e".
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    // Keep letters, remaining combining marks (vowel signs etc. in scripts
    // that need them) and digits; drop punctuation and symbols.
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '')
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
|
||||
/** The existing church picked by `findMatchingChurch` as the best name match. */
interface MatchCandidate {
  /** Id of the matched church row. */
  id: string;
  /** Name of the matched church as stored in the database. */
  name: string;

  /** Coordinates copied from the matched church row. */
  latitude: number;
  longitude: number;

  /** Fraction (0..1) of the searched name's significant words found in `name`. */
  nameScore: number;
  /** `findMatchingChurch` currently always sets this to 0. */
  distance: number;
}
|
||||
|
||||
/**
 * Looks for an existing church whose name overlaps with a scraped parish name.
 *
 * Names are compared word by word after `normalizeForMatch`; only words of at
 * least three characters count. The score is the fraction of the parish's
 * words that also appear in the church's name. Candidates scoring below 0.4
 * are ignored, and on equal scores the first candidate returned by the
 * database wins.
 *
 * @param name    Parish name from the directory.
 * @param address Parish address; accepted for interface compatibility but not
 *                used by the matching logic.
 * @param city    Optional city; when truthy, candidates are restricted to
 *                churches whose city contains it (case-insensitive).
 * @param country Country that candidates must belong to.
 * @returns The best-scoring candidate, or `null` when the name has no usable
 *          words or nothing reaches the threshold.
 */
async function findMatchingChurch(
  name: string,
  address: string | undefined,
  city: string | undefined,
  country: string,
): Promise<MatchCandidate | null> {
  const MIN_WORD_LENGTH = 3;
  const MIN_NAME_SCORE = 0.4;

  const nameWords = normalizeForMatch(name)
    .split(' ')
    .filter(w => w.length >= MIN_WORD_LENGTH);

  // Nothing meaningful to compare; skip the database round-trip entirely.
  if (nameWords.length === 0) return null;

  // NOTE(review): no ordering is requested, so when more than 50 rows satisfy
  // the filter the database decides which 50 are considered.
  const candidates = await prisma.church.findMany({
    where: {
      country,
      ...(city ? { city: { contains: city, mode: 'insensitive' } } : {}),
    },
    select: { id: true, name: true, latitude: true, longitude: true },
    take: 50,
  });

  let bestMatch: MatchCandidate | null = null;

  for (const church of candidates) {
    const churchWords = new Set(
      normalizeForMatch(church.name)
        .split(' ')
        .filter(w => w.length >= MIN_WORD_LENGTH),
    );

    const matchingWords = nameWords.filter(w => churchWords.has(w)).length;
    const nameScore = matchingWords / nameWords.length;

    // Require at least 40% word overlap.
    if (nameScore < MIN_NAME_SCORE) continue;

    // Strict ">" keeps the earliest candidate when scores tie.
    if (!bestMatch || nameScore > bestMatch.nameScore) {
      bestMatch = {
        id: church.id,
        name: church.name,
        latitude: church.latitude,
        longitude: church.longitude,
        distance: 0,
        nameScore,
      };
    }
  }

  return bestMatch;
}
|
||||
|
||||
// --- Job Tracking ---
|
||||
|
||||
/**
 * Resumes a tracked background job when `--job-id <id>` is present in `args`.
 *
 * When the flag is present, the job row is marked `running` with a fresh
 * `startedAt`, and its id is returned. Despite the name, this function never
 * creates a job; the caller does that when `null` is returned.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The resumed job id, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` is present but not followed by a value.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) return null;

  const jobId = args[flagIndex + 1];
  // A missing value, or another flag in its place, would otherwise reach the
  // database as an invalid id.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Replaces all stored mass schedules of one church with the scraped ones.
 *
 * The delete and the insert run in a single transaction so that a failed
 * insert cannot leave the church with no schedules at all.
 */
async function replaceMassSchedules(
  churchId: string,
  schedules: Array<{ dayOfWeek: number; time: string; massType?: string; language?: string; notes?: string }>,
): Promise<void> {
  await prisma.$transaction([
    prisma.massSchedule.deleteMany({ where: { churchId } }),
    prisma.massSchedule.createMany({
      data: schedules.map(s => ({
        churchId,
        dayOfWeek: s.dayOfWeek,
        time: s.time,
        massType: s.massType,
        // NOTE(review): falls back to 'English' regardless of the diocese's
        // own language; confirm this default is intended.
        language: s.language ?? 'English',
        notes: s.notes,
      })),
    }),
  ]);
}

/**
 * Scrapes one diocese's parish directory and reconciles it with the database.
 *
 * Steps:
 *  1. Load the diocese; bail out (with a log line) if it does not exist, has
 *     no `directoryUrl`, or has no `scrapeConfig.selectors`.
 *  2. Scrape the directory, with schedules when `scheduleInDirectory` is set,
 *     otherwise parishes only (each given an empty schedule list).
 *  3. For every parish, try to match an existing church by name. On a match
 *     (and unless `dryRun`), link the church to the diocese, store the parish
 *     URL, and replace its mass schedules when any were scraped. Unmatched
 *     parishes are only logged and counted; no church is created.
 *  4. Record success or failure on the diocese row (unless `dryRun`).
 *
 * Scrape errors are caught, counted in `stats.errors` and logged; they do not
 * propagate. The scraper is always closed.
 *
 * @param dioceseId Id of the diocese to scrape.
 * @param dryRun    When true, nothing is written to the database.
 * @param stats     Running counters, mutated in place. `created` counts
 *                  unmatched parishes.
 */
async function scrapeDiocese(
  dioceseId: string,
  dryRun: boolean,
  stats: { processed: number; matched: number; created: number; schedules: number; errors: number }
): Promise<void> {
  const diocese = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!diocese) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }

  if (!diocese.directoryUrl) {
    log(`  Skipping ${diocese.name}: no directory URL`);
    return;
  }

  const config = diocese.scrapeConfig as DioceseScrapeConfig | null;
  if (!config?.selectors) {
    log(`  Skipping ${diocese.name}: no scrape config`);
    return;
  }

  log(`Scraping diocese: ${diocese.name} (${diocese.country})`);
  log(`  Directory URL: ${diocese.directoryUrl}`);

  const scraper = new DioceseDirectoryScraper();

  try {
    let parishes;

    if (config.scheduleInDirectory) {
      parishes = await scraper.scrapeDirectoryWithSchedules(
        diocese.directoryUrl,
        config,
        diocese.language
      );
    } else {
      // Directory lists parishes only: give each one an empty schedule so the
      // loop below can treat both shapes uniformly.
      const discovered = await scraper.scrapeDirectory(diocese.directoryUrl, config);
      parishes = discovered.map(p => ({
        ...p,
        scheduleText: '',
        schedules: [] as Array<{ dayOfWeek: number; time: string; massType?: string; language?: string; notes?: string }>,
      }));
    }

    log(`  Discovered ${parishes.length} parishes`);

    for (const parish of parishes) {
      stats.processed++;

      // Try to match to existing church
      const match = await findMatchingChurch(
        parish.name,
        parish.address,
        parish.city,
        diocese.country,
      );

      if (!match) {
        log(`  No match: "${parish.name}" (${parish.city || 'no city'})`);
        stats.created++;

        // In non-dry-run, we could create new churches, but for safety
        // we only log unmatched parishes for manual review
        // (Creating churches from directory data without coordinates is risky)
        continue;
      }

      stats.matched++;
      log(`  Match: "${parish.name}" -> "${match.name}" (score: ${match.nameScore.toFixed(2)})`);

      if (dryRun) continue;

      // Link the church to this diocese. The website fields are only touched
      // when the directory actually supplied a URL, so `hasWebsite` is never
      // set to true without one.
      await prisma.church.update({
        where: { id: match.id },
        data: {
          ...(parish.url ? { website: parish.url, hasWebsite: true } : {}),
          dioceseId: diocese.id,
        },
      });

      // Save schedules if available
      if ('schedules' in parish && parish.schedules.length > 0) {
        await replaceMassSchedules(match.id, parish.schedules);
        stats.schedules += parish.schedules.length;
      }
    }

    // Update diocese tracking
    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastSuccessAt: new Date(),
          churchCount: parishes.length,
          failureCount: 0,
        },
      });
    }
  } catch (err: unknown) {
    stats.errors++;
    const message = err instanceof Error ? err.message : String(err);
    logError(`  Failed to scrape ${diocese.name}: ${message}`);

    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastFailureAt: new Date(),
          failureCount: { increment: 1 },
        },
      });
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
/**
 * Returns the value following `flag` in `args`, or `undefined` when the flag
 * is absent. Throws when the flag is present without a value.
 */
function readOptionValue(args: string[], flag: string): string | undefined {
  const idx = args.indexOf(flag);
  if (idx === -1) return undefined;

  const value = args[idx + 1];
  if (!value || value.startsWith('--')) {
    throw new Error(`${flag} requires a value`);
  }
  return value;
}

/**
 * CLI entry point.
 *
 * Parses `--diocese <id>`, `--country <code>`, `--all`, `--dry-run` and
 * `--job-id <uuid>`, selects the dioceses to process, scrapes them one by one,
 * keeps the tracked background job up to date, and prints a summary.
 *
 * A background job row is created unless one is resumed via `--job-id` or the
 * run is a dry run. After each diocese the job's progress is written back, and
 * the loop stops early when the job's status has become `stopping`.
 *
 * The Prisma client and the pg pool are closed on both the success and the
 * failure path.
 *
 * @throws Error when `--diocese` / `--country` / `--job-id` lack a value, or
 *         when selecting or processing dioceses fails (the job is marked
 *         `failed` first).
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const all = args.includes('--all');

  // Reject "--diocese" / "--country" with a missing value up front; otherwise
  // the run would silently widen to every active diocese.
  const dioceseId = readOptionValue(args, '--diocese');
  const country = readOptionValue(args, '--country');

  log('============================================================');
  log('Diocese Directory Scraper');
  log('============================================================');
  log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
  log(`Target: ${dioceseId ? `Diocese ${dioceseId}` : country ? `Country ${country}` : 'All active'}`);
  log('============================================================');

  try {
    // Job tracking
    let jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      const job = await prisma.backgroundJob.create({
        data: {
          type: 'diocese-directory',
          status: 'running',
          startedAt: new Date(),
          config: { dioceseId, country, all, dryRun },
        },
      });
      jobId = job.id;
      log(`Job ID: ${jobId}`);
    }

    const stats = { processed: 0, matched: 0, created: 0, schedules: 0, errors: 0 };

    try {
      const dioceses: Array<{ id: string }> = dioceseId
        ? [{ id: dioceseId }]
        : await prisma.diocese.findMany({
            where: {
              active: true,
              directoryUrl: { not: null },
              ...(country ? { country } : {}),
            },
            select: { id: true, name: true },
            orderBy: { name: 'asc' },
          });

      log(`Found ${dioceses.length} dioceses to scrape`);

      for (const d of dioceses) {
        await scrapeDiocese(d.id, dryRun, stats);

        if (jobId) {
          // `update` returns the updated row, so the stop check needs no
          // second query.
          const job = await prisma.backgroundJob.update({
            where: { id: jobId },
            data: { processed: stats.processed, succeeded: stats.matched, itemsFound: stats.matched },
          });
          if (job.status === 'stopping') {
            log('Job stop requested.');
            break;
          }
        }
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      logError(`Fatal error: ${message}`);
      if (jobId) {
        await prisma.backgroundJob.update({
          where: { id: jobId },
          data: { status: 'failed', error: message, completedAt: new Date() },
        });
      }
      throw error;
    }

    // Complete job
    // NOTE(review): a run that stopped early because of a stop request is also
    // recorded as 'completed' here; confirm that is the intended final status.
    if (jobId) {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: 'completed',
          completedAt: new Date(),
          processed: stats.processed,
          succeeded: stats.matched,
          itemsFound: stats.matched,
        },
      });
    }

    log('');
    log('============================================================');
    log('Diocese Directory Scraper Summary');
    log('============================================================');
    log(`Parishes discovered: ${stats.processed}`);
    log(`Matched to DB: ${stats.matched}`);
    log(`Unmatched (new): ${stats.created}`);
    log(`Schedules saved: ${stats.schedules}`);
    log(`Errors: ${stats.errors}`);
    log('============================================================');
  } finally {
    // Release database resources on the failure path too.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Script entry: run main(); if it rejects, log the reason and exit with
// status code 1.
main().catch((error) => {
  const reason = error.message;
  logError('Fatal error: ' + reason);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user