chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Albert
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions

View File

@@ -0,0 +1,328 @@
#!/usr/bin/env tsx
/**
* Enrich churches with website URLs from Wikidata
*
* Queries Wikidata SPARQL endpoint for Catholic churches that have official websites,
* then matches them to existing churches in the database via proximity + name matching.
*
* Usage:
* npx tsx scripts/enrich-with-wikidata.ts --dry-run
* npx tsx scripts/enrich-with-wikidata.ts --execute
* npx tsx scripts/enrich-with-wikidata.ts --execute --country DE
* npx tsx scripts/enrich-with-wikidata.ts --job-id <uuid>
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import axios from 'axios';
// Postgres connection pool; the connection string is taken from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
// Prisma client that talks to the database through the pg pool via the PrismaPg adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// Wikidata SPARQL query endpoint used by queryWikidata().
const WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql';
const MATCH_RADIUS_KM = 1.0; // Max distance for matching
const BATCH_SIZE = 500; // SPARQL results per query
/** Writes `msg` to stdout, prefixed with the current time as an ISO-8601 string in brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
/** Writes `msg` to stderr, prefixed with the current time as an ISO-8601 string in brackets. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ' + msg);
}
/**
 * Great-circle distance between two points, computed with the haversine formula.
 *
 * @param lat1 Latitude of the first point, in degrees.
 * @param lon1 Longitude of the first point, in degrees.
 * @param lat2 Latitude of the second point, in degrees.
 * @param lon2 Longitude of the second point, in degrees.
 * @returns Distance in kilometres on a sphere of radius 6371 km.
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const R = 6371;
  const dLat = (lat2 - lat1) * Math.PI / 180;
  const dLon = (lon2 - lon1) * Math.PI / 180;
  const a = Math.sin(dLat / 2) ** 2 +
    Math.cos(lat1 * Math.PI / 180) * Math.cos(lat2 * Math.PI / 180) *
    Math.sin(dLon / 2) ** 2;
  // Rounding can push `a` a hair above 1 for (near-)antipodal points, and
  // Math.asin returns NaN outside [-1, 1]; clamp so the result stays finite.
  return R * 2 * Math.asin(Math.min(1, Math.sqrt(a)));
}
/**
 * Reduces a name to a comparable form: lower-cased, combining accents removed,
 * everything except a-z, 0-9 and whitespace dropped, whitespace runs collapsed
 * to a single space, and the ends trimmed.
 */
function normalizeForMatch(str: string): string {
  const lowered = str.toLowerCase();
  // Decompose accented letters, then drop the combining marks.
  const withoutAccents = lowered.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  const alphanumericOnly = withoutAccents.replace(/[^a-z0-9\s]/g, '');
  const singleSpaced = alphanumericOnly.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/** One row of the Wikidata SPARQL result, as mapped by queryWikidata(). */
interface WikidataChurch {
// Value of the `churchLabel` binding; empty string when the binding is missing.
label: string;
// Value of the `website` binding; empty string when the binding is missing.
website: string;
// `lat` binding parsed as a float; 0 when the binding is missing.
lat: number;
// `lon` binding parsed as a float; 0 when the binding is missing.
lon: number;
// `church` entity URI with the `http://www.wikidata.org/entity/` prefix removed; empty string when missing.
wikidataId: string;
}
/**
 * Fetches one page of Catholic churches that have a website and coordinates
 * from the Wikidata SPARQL endpoint.
 *
 * @param country Optional ISO 3166-1 alpha-2 country code (case-insensitive).
 *   Codes in the built-in table are filtered by their Wikidata item directly;
 *   any other well-formed code is resolved through the P297 (ISO alpha-2)
 *   property. When omitted, no country filter is applied.
 * @param offset Row offset of the page; each page holds up to BATCH_SIZE rows.
 * @returns The mapped rows of this page. Missing bindings become '' (strings)
 *   or 0 (coordinates). An empty array means there are no more rows.
 * @throws Error if `country` is given but is not a two-letter code. Previously
 *   such a value silently disabled the filter and queried every country.
 * @throws Whatever axios rejects with on network/HTTP failure or timeout.
 */
async function queryWikidata(country?: string, offset = 0): Promise<WikidataChurch[]> {
  // Map ISO alpha-2 to Wikidata country item
  const countryMap: Record<string, string> = {
    DE: 'Q183', FR: 'Q142', ES: 'Q29', IT: 'Q38', PL: 'Q36',
    PT: 'Q45', BR: 'Q155', NL: 'Q55', CZ: 'Q213', HU: 'Q28',
    AT: 'Q40', BE: 'Q31', CH: 'Q39', IE: 'Q27', GB: 'Q145',
    US: 'Q30', CA: 'Q16', MX: 'Q96', AR: 'Q414', CO: 'Q739',
    HR: 'Q224', SK: 'Q214', SI: 'Q215',
  };

  let countryFilter = '';
  if (country) {
    const code = country.trim().toUpperCase();
    // The code is interpolated into the query text, so only accept exactly two
    // letters; this also rejects stray CLI flags passed as the country value.
    if (!/^[A-Z]{2}$/.test(code)) {
      throw new Error(`Unsupported country code "${country}": expected an ISO 3166-1 alpha-2 code such as "DE"`);
    }
    const qid = countryMap[code];
    countryFilter = qid
      ? `?church wdt:P17 wd:${qid} .`
      // Not in the table: let Wikidata resolve the country item by its ISO code.
      : `?church wdt:P17 ?countryItem . ?countryItem wdt:P297 "${code}" .`;
  }

  // SPARQL query for Catholic churches with websites
  const sparql = `
    SELECT ?church ?churchLabel ?website ?lat ?lon WHERE {
      ?church wdt:P31/wdt:P279* wd:Q16970 .
      ?church wdt:P140 wd:Q9592 .
      ?church wdt:P856 ?website .
      ?church p:P625 ?coordStatement .
      ?coordStatement ps:P625 ?coord .
      BIND(geof:latitude(?coord) AS ?lat)
      BIND(geof:longitude(?coord) AS ?lon)
      ${countryFilter}
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,de,fr,es,it,pt,pl,nl,cs,hu" . }
    }
    ORDER BY ?church
    LIMIT ${BATCH_SIZE}
    OFFSET ${offset}
  `;

  const response = await axios.get(WIKIDATA_SPARQL_URL, {
    params: { query: sparql, format: 'json' },
    headers: {
      'User-Agent': 'NearestMass/1.0 (https://nearestmass.com; contact: privacy@nearestmass.com)',
      'Accept': 'application/sparql-results+json',
    },
    timeout: 60000,
  });

  // Each result row maps a variable name to an object carrying its string value.
  type SparqlRow = Record<string, { value?: string } | undefined>;
  const bindings: SparqlRow[] = response.data?.results?.bindings || [];

  return bindings.map((row) => ({
    label: row.churchLabel?.value || '',
    website: row.website?.value || '',
    lat: parseFloat(row.lat?.value || '0'),
    lon: parseFloat(row.lon?.value || '0'),
    wikidataId: row.church?.value?.replace('http://www.wikidata.org/entity/', '') || '',
  }));
}
/** The best database candidate chosen by findMatch() for one Wikidata church. */
interface MatchResult {
// `id` of the matched church row.
churchId: string;
// `name` of the matched church row.
churchName: string;
// Haversine distance in km between the Wikidata coordinates and the church row's coordinates.
distance: number;
// Fraction (0..1) of the Wikidata label's normalized words of length >= 3 that also occur in the church name; 0 when the label has no such words.
nameScore: number;
}
/**
 * Finds the database church (one that has no website yet) that best
 * corresponds to a Wikidata church.
 *
 * Candidates are prefiltered with a bounding box around the Wikidata
 * coordinates, then kept only if they lie within MATCH_RADIUS_KM and either
 * share at least half of the Wikidata label's words (length >= 3 after
 * normalization) or are within 100 m. The highest name score wins; ties go to
 * the closer candidate.
 *
 * @param wdChurch Wikidata row to match.
 * @returns The best candidate, or null when nothing qualifies.
 */
async function findMatch(wdChurch: WikidataChurch): Promise<MatchResult | null> {
  // Size the bounding box from MATCH_RADIUS_KM instead of a fixed 0.01 degrees:
  // a degree of longitude shrinks with cos(latitude), so a fixed longitude
  // window excluded candidates inside the radius away from the equator
  // (0.01 degrees is only about 0.7 km at 50 degrees north).
  const kmPerDegree = (Math.PI / 180) * 6371; // same Earth radius as haversineKm
  const latDelta = MATCH_RADIUS_KM / kmPerDegree;
  // Use the latitude at the far edge of the box so the window is slightly
  // conservative; clamp to keep the value finite next to the poles.
  const edgeLatRad = (Math.min(Math.abs(wdChurch.lat) + latDelta, 90) * Math.PI) / 180;
  const lonDelta = Math.min(180, latDelta / Math.max(Math.cos(edgeLatRad), 1e-6));

  // Find nearby churches without a website
  const nearby = await prisma.church.findMany({
    where: {
      website: null,
      latitude: { gte: wdChurch.lat - latDelta, lte: wdChurch.lat + latDelta },
      longitude: { gte: wdChurch.lon - lonDelta, lte: wdChurch.lon + lonDelta },
    },
    select: { id: true, name: true, latitude: true, longitude: true },
    take: 20,
  });
  if (nearby.length === 0) return null;

  // Score each candidate
  const wdWords = normalizeForMatch(wdChurch.label).split(' ').filter(w => w.length >= 3);
  let bestMatch: MatchResult | null = null;

  for (const church of nearby) {
    const dist = haversineKm(wdChurch.lat, wdChurch.lon, church.latitude, church.longitude);
    if (dist > MATCH_RADIUS_KM) continue;

    const churchWords = new Set(
      normalizeForMatch(church.name).split(' ').filter(w => w.length >= 3),
    );
    // Count matching words
    const matchingWords = wdWords.filter(w => churchWords.has(w)).length;
    const nameScore = wdWords.length > 0 ? matchingWords / wdWords.length : 0;

    // Require at least 50% word overlap or distance < 100m
    if (nameScore < 0.5 && dist > 0.1) continue;

    if (!bestMatch || nameScore > bestMatch.nameScore ||
        (nameScore === bestMatch.nameScore && dist < bestMatch.distance)) {
      bestMatch = {
        churchId: church.id,
        churchName: church.name,
        distance: dist,
        nameScore,
      };
    }
  }
  return bestMatch;
}
// --- Job Tracking ---
/**
 * Resumes an existing background job when `--job-id <id>` is present in `args`.
 *
 * The referenced job row is set to status 'running' with a fresh `startedAt`.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The job id taken from `args`, or null when `--job-id` is absent.
 * @throws Error when `--job-id` is the last argument or is followed by another
 *   flag, instead of handing an undefined or bogus id to the database.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a value (e.g. --job-id <uuid>)');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Script entry point: pages through Wikidata churches, matches each against
 * the database, and (with `--execute`) stores the website on the matched row.
 *
 * Flags read from process.argv:
 *   --execute         write updates; without it the run is a dry run
 *   --country <code>  restrict the Wikidata query to one country
 *   --job-id <id>     attach to an existing background job row
 *
 * In execute mode without `--job-id`, a new 'wikidata-enrichment' job row is
 * created. Progress is written to the job after every page, and the loop ends
 * early when the job's status has been set to 'stopping'.
 *
 * Database connections are released on both the success and the failure path.
 *
 * @throws Error when `--country` is given without a value; rethrows any error
 *   raised while processing after marking the job as failed.
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = !args.includes('--execute');
  const countryIdx = args.indexOf('--country');
  const country = countryIdx !== -1 ? args[countryIdx + 1] : undefined;

  let jobId: string | null = null;
  let totalFetched = 0;
  let matched = 0;
  let updated = 0;
  let noMatch = 0;
  let skipped = 0;

  try {
    // A bare `--country` used to fall through to "all countries", which in
    // execute mode silently widened the run far beyond what was asked for.
    if (countryIdx !== -1 && (!country || country.startsWith('--'))) {
      throw new Error('--country requires a value (e.g. --country DE)');
    }

    log('============================================================');
    log('Wikidata Church Website Enrichment');
    log('============================================================');
    log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
    log(`Country: ${country || 'All'}`);
    log('============================================================');

    // Job tracking
    jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      const job = await prisma.backgroundJob.create({
        data: {
          type: 'wikidata-enrichment',
          status: 'running',
          startedAt: new Date(),
          config: { country, dryRun },
        },
      });
      jobId = job.id;
      log(`Job ID: ${jobId}`);
    }

    try {
      let offset = 0;
      while (true) {
        log(`Querying Wikidata (offset ${offset})...`);
        const results = await queryWikidata(country, offset);
        if (results.length === 0) {
          log('No more results from Wikidata.');
          break;
        }
        totalFetched += results.length;
        log(`Fetched ${results.length} churches from Wikidata (total: ${totalFetched})`);

        for (const wdChurch of results) {
          // Rows without a website or with missing/zero coordinates cannot be matched.
          if (!wdChurch.website || !wdChurch.lat || !wdChurch.lon) {
            skipped++;
            continue;
          }
          const match = await findMatch(wdChurch);
          if (!match) {
            noMatch++;
            continue;
          }
          matched++;
          log(` Match: "${wdChurch.label}" (${wdChurch.wikidataId}) -> "${match.churchName}" (dist: ${match.distance.toFixed(3)}km, score: ${match.nameScore.toFixed(2)})`);
          if (!dryRun) {
            await prisma.church.update({
              where: { id: match.churchId },
              data: {
                website: wdChurch.website,
                hasWebsite: true,
              },
            });
            updated++;
          }
        }

        // Rate limit SPARQL queries
        await new Promise(r => setTimeout(r, 2000));
        offset += BATCH_SIZE;

        // Update job progress
        if (jobId) {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: {
              processed: totalFetched,
              succeeded: updated,
              itemsFound: matched,
            },
          });
          // Check for stop
          const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
          if (job?.status === 'stopping') {
            log('Job stop requested.');
            break;
          }
        }
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      logError(`Error: ${message}`);
      if (jobId) {
        try {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: { status: 'failed', error: message, completedAt: new Date() },
          });
        } catch (updateError: unknown) {
          // Do not let a bookkeeping failure hide the error that stopped the run.
          const updateMessage = updateError instanceof Error ? updateError.message : String(updateError);
          logError(`Could not mark job ${jobId} as failed: ${updateMessage}`);
        }
      }
      throw error;
    }

    // Complete job
    // NOTE(review): a run ended by a stop request is also recorded as
    // 'completed' here - confirm whether a distinct terminal status is expected.
    if (jobId) {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: { status: 'completed', completedAt: new Date(), processed: totalFetched, succeeded: updated, itemsFound: matched },
      });
    }

    log('');
    log('============================================================');
    log('Wikidata Enrichment Summary');
    log('============================================================');
    log(`Wikidata churches fetched: ${totalFetched}`);
    log(`Matched to DB churches: ${matched}`);
    log(`Websites updated: ${updated}`);
    log(`No match found: ${noMatch}`);
    // Replaces the former "Already had website" line, whose counter was never
    // incremented and therefore always reported 0.
    log(`Skipped (missing website/coordinates): ${skipped}`);
    log('============================================================');
  } finally {
    // Release database connections whether the run succeeded or failed.
    await prisma.$disconnect();
    await pool.end();
  }
}
// Run the script. Any rejection that escapes main() is logged as fatal and the
// process exits with status 1.
main().then(undefined, (reason) => {
  logError(`Fatal error: ${reason.message}`);
  process.exit(1);
});