#!/usr/bin/env tsx
/**
 * Enrich churches with website URLs from Wikidata
 *
 * Queries Wikidata SPARQL endpoint for Catholic churches that have official websites,
 * then matches them to existing churches in the database via proximity + name matching.
 *
 * Usage:
 *   npx tsx scripts/enrich-with-wikidata.ts --dry-run
 *   npx tsx scripts/enrich-with-wikidata.ts --execute
 *   npx tsx scripts/enrich-with-wikidata.ts --execute --country DE
 *   npx tsx scripts/enrich-with-wikidata.ts --job-id JOB_ID
 *
 * Without --execute the script only logs the matches it would apply.
 */

import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import axios from 'axios';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

const WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql';
const WIKIDATA_ENTITY_PREFIX = 'http://www.wikidata.org/entity/';
const EARTH_RADIUS_KM = 6371;
const MATCH_RADIUS_KM = 1.0; // Max distance for matching
const BATCH_SIZE = 500; // SPARQL results per query
const SPARQL_DELAY_MS = 2000; // Pause between SPARQL pages

/** Maps ISO alpha-2 country codes to Wikidata country items. */
const COUNTRY_QIDS: Record<string, string> = {
  DE: 'Q183', FR: 'Q142', ES: 'Q29', IT: 'Q38', PL: 'Q36',
  PT: 'Q45', BR: 'Q155', NL: 'Q55', CZ: 'Q213', HU: 'Q28',
  AT: 'Q40', BE: 'Q31', CH: 'Q39', IE: 'Q27', GB: 'Q145',
  US: 'Q30', CA: 'Q16', MX: 'Q96', AR: 'Q414', CO: 'Q739',
  HR: 'Q224', SK: 'Q214', SI: 'Q215',
};

/** Writes a timestamped message to stdout. */
function log(msg: string): void {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}

/** Writes a timestamped message to stderr. */
function logError(msg: string): void {
  console.error(`[${new Date().toISOString()}] ${msg}`);
}

/** Extracts a printable message from a value caught in a `catch` clause. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Haversine (great-circle) distance in km between two points given in degrees.
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const toRad = (deg: number): number => (deg * Math.PI) / 180;
  const dLat = toRad(lat2 - lat1);
  const dLon = toRad(lon2 - lon1);
  const a =
    Math.sin(dLat / 2) ** 2 +
    Math.cos(toRad(lat1)) * Math.cos(toRad(lat2)) * Math.sin(dLon / 2) ** 2;
  // Clamp: rounding can push sqrt(a) a hair above 1, which would make asin return NaN.
  return EARTH_RADIUS_KM * 2 * Math.asin(Math.min(1, Math.sqrt(a)));
}

/**
 * Normalizes a name for comparison: lower-case, accents stripped, everything
 * except a-z, 0-9 and whitespace removed, whitespace runs collapsed.
 */
function normalizeForMatch(str: string): string {
  return str
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '') // strip accents
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Returns the value following `flag` in `args`, or undefined when the flag is absent.
 *
 * @throws Error when the flag is present but no value follows it.
 */
function getFlagValue(args: string[], flag: string): string | undefined {
  const idx = args.indexOf(flag);
  if (idx === -1) return undefined;
  const value = args[idx + 1];
  if (value === undefined || value.startsWith('--')) {
    throw new Error(`Missing value for ${flag}`);
  }
  return value;
}

/**
 * Looks up the Wikidata item for a country code (case-insensitive).
 *
 * @throws Error for codes that are not in COUNTRY_QIDS. Silently dropping the
 *   filter would turn a single-country run into a worldwide one.
 */
function resolveCountryQid(country: string): string {
  const qid = COUNTRY_QIDS[country.toUpperCase()];
  if (!qid) {
    throw new Error(
      `Unsupported country code "${country}". Supported: ${Object.keys(COUNTRY_QIDS).join(', ')}`,
    );
  }
  return qid;
}

interface WikidataChurch {
  label: string;
  website: string;
  /** Latitude in degrees; NaN when the result row carried no parsable value. */
  lat: number;
  /** Longitude in degrees; NaN when the result row carried no parsable value. */
  lon: number;
  wikidataId: string;
}

interface SparqlValue {
  value?: string;
}

interface SparqlBinding {
  church?: SparqlValue;
  churchLabel?: SparqlValue;
  website?: SparqlValue;
  lat?: SparqlValue;
  lon?: SparqlValue;
}

interface SparqlResponse {
  results?: { bindings?: SparqlBinding[] };
}

/**
 * Fetches one page (BATCH_SIZE rows) of Catholic churches with websites from
 * the Wikidata SPARQL endpoint.
 *
 * @param country Optional ISO alpha-2 code restricting results to one country.
 * @param offset Row offset of the page to fetch.
 * @returns The parsed rows; an empty array when the endpoint returns no bindings.
 * @throws Error when `country` is not a supported code, or when the HTTP request fails.
 */
async function queryWikidata(country?: string, offset = 0): Promise<WikidataChurch[]> {
  const countryFilter = country ? `?church wdt:P17 wd:${resolveCountryQid(country)} .` : '';

  // SPARQL query for Catholic churches with websites
  const sparql = `
    SELECT ?church ?churchLabel ?website ?lat ?lon WHERE {
      ?church wdt:P31/wdt:P279* wd:Q16970 .
      ?church wdt:P140 wd:Q9592 .
      ?church wdt:P856 ?website .
      ?church p:P625 ?coordStatement .
      ?coordStatement ps:P625 ?coord .
      BIND(geof:latitude(?coord) AS ?lat)
      BIND(geof:longitude(?coord) AS ?lon)
      ${countryFilter}
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,de,fr,es,it,pt,pl,nl,cs,hu" . }
    }
    ORDER BY ?church
    LIMIT ${BATCH_SIZE}
    OFFSET ${offset}
  `;

  const response = await axios.get<SparqlResponse>(WIKIDATA_SPARQL_URL, {
    params: { query: sparql, format: 'json' },
    headers: {
      'User-Agent': 'NearestMass/1.0 (https://nearestmass.com; contact: privacy@nearestmass.com)',
      'Accept': 'application/sparql-results+json',
    },
    timeout: 60000,
  });

  const bindings = response.data?.results?.bindings ?? [];
  return bindings.map((b) => ({
    label: b.churchLabel?.value || '',
    website: b.website?.value || '',
    // Missing values parse to NaN, so a legitimate coordinate of 0 stays distinguishable.
    lat: Number.parseFloat(b.lat?.value ?? ''),
    lon: Number.parseFloat(b.lon?.value ?? ''),
    wikidataId: b.church?.value?.replace(WIKIDATA_ENTITY_PREFIX, '') || '',
  }));
}

interface MatchResult {
  churchId: string;
  churchName: string;
  /** Distance to the Wikidata coordinates in km. */
  distance: number;
  /** Fraction (0..1) of the Wikidata label's words found in the church name. */
  nameScore: number;
}

/**
 * Finds the best database church (currently without a website) for a Wikidata church.
 *
 * Candidates must lie within MATCH_RADIUS_KM and either share at least 50% of
 * the Wikidata label's words (words of 3+ characters) or be closer than 100 m.
 * The highest name score wins; ties are broken by distance.
 *
 * @returns The best candidate, or null when nothing qualifies.
 */
async function findMatch(wdChurch: WikidataChurch): Promise<MatchResult | null> {
  // Bounding box that encloses the match radius. A degree of longitude shrinks
  // with cos(latitude), so the longitude half-width has to grow accordingly.
  const latDelta = (MATCH_RADIUS_KM / EARTH_RADIUS_KM) * (180 / Math.PI);
  const cosLat = Math.max(Math.cos((wdChurch.lat * Math.PI) / 180), 0.01);
  const lonDelta = latDelta / cosLat;

  // Find nearby churches without a website
  // NOTE(review): `take: 20` has no ordering, so in very dense areas the closest
  // candidate could be cut off — confirm whether that matters for this data set.
  const nearby = await prisma.church.findMany({
    where: {
      website: null,
      latitude: { gte: wdChurch.lat - latDelta, lte: wdChurch.lat + latDelta },
      longitude: { gte: wdChurch.lon - lonDelta, lte: wdChurch.lon + lonDelta },
    },
    select: { id: true, name: true, latitude: true, longitude: true },
    take: 20,
  });

  if (nearby.length === 0) return null;

  // Score each candidate
  const wdWords = normalizeForMatch(wdChurch.label)
    .split(' ')
    .filter((w) => w.length >= 3);

  let bestMatch: MatchResult | null = null;

  for (const church of nearby) {
    const dist = haversineKm(wdChurch.lat, wdChurch.lon, church.latitude, church.longitude);
    if (dist > MATCH_RADIUS_KM) continue;

    const churchWords = new Set(
      normalizeForMatch(church.name)
        .split(' ')
        .filter((w) => w.length >= 3),
    );

    // Count matching words
    const matchingWords = wdWords.filter((w) => churchWords.has(w)).length;
    const nameScore = wdWords.length > 0 ? matchingWords / wdWords.length : 0;

    // Require at least 50% word overlap or distance < 100m
    if (nameScore < 0.5 && dist > 0.1) continue;

    if (
      !bestMatch ||
      nameScore > bestMatch.nameScore ||
      (nameScore === bestMatch.nameScore && dist < bestMatch.distance)
    ) {
      bestMatch = {
        churchId: church.id,
        churchName: church.name,
        distance: dist,
        nameScore,
      };
    }
  }

  return bestMatch;
}

// --- Job Tracking ---

/**
 * When `--job-id` is given, marks that background job as running and returns
 * its id; otherwise returns null.
 *
 * @throws Error when `--job-id` is present without a value.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobId = getFlagValue(args, '--job-id');
  if (jobId === undefined) return null;

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}

interface RunStats {
  totalFetched: number;
  matched: number;
  updated: number;
  noMatch: number;
  skipped: number;
}

/** Pages through Wikidata, matches each row to a DB church and applies updates. */
async function runEnrichment(args: string[]): Promise<void> {
  const dryRun = !args.includes('--execute');
  const country = getFlagValue(args, '--country')?.toUpperCase();
  // Fail fast on an unsupported code, before any job row is created.
  if (country) resolveCountryQid(country);

  log('============================================================');
  log('Wikidata Church Website Enrichment');
  log('============================================================');
  log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
  log(`Country: ${country || 'All'}`);
  log('============================================================');

  // Job tracking
  let jobId = await createOrResumeJob(args);
  if (!jobId && !dryRun) {
    const job = await prisma.backgroundJob.create({
      data: {
        type: 'wikidata-enrichment',
        status: 'running',
        startedAt: new Date(),
        config: { country, dryRun },
      },
    });
    jobId = job.id;
    log(`Job ID: ${jobId}`);
  }

  const stats: RunStats = { totalFetched: 0, matched: 0, updated: 0, noMatch: 0, skipped: 0 };
  // An entity with several websites or coordinates produces several result rows.
  // Once it has matched, later rows are skipped so that its alternate website is
  // not handed to a different church that merely stands close by.
  const matchedWikidataIds = new Set<string>();
  let offset = 0;

  try {
    while (true) {
      log(`Querying Wikidata (offset ${offset})...`);
      const results = await queryWikidata(country, offset);

      if (results.length === 0) {
        log('No more results from Wikidata.');
        break;
      }

      stats.totalFetched += results.length;
      log(`Fetched ${results.length} churches from Wikidata (total: ${stats.totalFetched})`);

      for (const wdChurch of results) {
        const hasCoords = Number.isFinite(wdChurch.lat) && Number.isFinite(wdChurch.lon);
        if (!wdChurch.website || !hasCoords) {
          stats.skipped++;
          continue;
        }
        if (wdChurch.wikidataId && matchedWikidataIds.has(wdChurch.wikidataId)) {
          stats.skipped++;
          continue;
        }

        const match = await findMatch(wdChurch);
        if (!match) {
          stats.noMatch++;
          continue;
        }

        stats.matched++;
        if (wdChurch.wikidataId) matchedWikidataIds.add(wdChurch.wikidataId);
        log(`  Match: "${wdChurch.label}" (${wdChurch.wikidataId}) -> "${match.churchName}" (dist: ${match.distance.toFixed(3)}km, score: ${match.nameScore.toFixed(2)})`);

        if (!dryRun) {
          await prisma.church.update({
            where: { id: match.churchId },
            data: {
              website: wdChurch.website,
              hasWebsite: true,
            },
          });
          stats.updated++;
        }
      }

      // Update job progress
      if (jobId) {
        await prisma.backgroundJob.update({
          where: { id: jobId },
          data: {
            processed: stats.totalFetched,
            succeeded: stats.updated,
            itemsFound: stats.matched,
          },
        });

        // Check for stop
        const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
        if (job?.status === 'stopping') {
          log('Job stop requested.');
          break;
        }
      }

      // A short page is the last one; skip the extra query and delay.
      if (results.length < BATCH_SIZE) {
        log('No more results from Wikidata.');
        break;
      }

      // Rate limit SPARQL queries
      await new Promise((resolve) => setTimeout(resolve, SPARQL_DELAY_MS));
      offset += BATCH_SIZE;
    }
  } catch (error: unknown) {
    const message = errorMessage(error);
    logError(`Error: ${message}`);
    if (jobId) {
      try {
        await prisma.backgroundJob.update({
          where: { id: jobId },
          data: { status: 'failed', error: message, completedAt: new Date() },
        });
      } catch (updateError: unknown) {
        // Do not let a bookkeeping failure hide the error that aborted the run.
        logError(`Could not mark job ${jobId} as failed: ${errorMessage(updateError)}`);
      }
    }
    throw error;
  }

  // Complete job
  // NOTE(review): a run ended by a stop request is also recorded as 'completed',
  // as before — confirm whether a separate status is expected for that case.
  if (jobId) {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        status: 'completed',
        completedAt: new Date(),
        processed: stats.totalFetched,
        succeeded: stats.updated,
        itemsFound: stats.matched,
      },
    });
  }

  log('');
  log('============================================================');
  log('Wikidata Enrichment Summary');
  log('============================================================');
  log(`Wikidata churches fetched: ${stats.totalFetched}`);
  log(`Matched to DB churches: ${stats.matched}`);
  log(`Websites updated: ${stats.updated}`);
  log(`No match found: ${stats.noMatch}`);
  log(`Skipped (missing data or duplicate row): ${stats.skipped}`);
  log('============================================================');
}

/** Entry point: runs the enrichment and always releases the database connections. */
async function main(): Promise<void> {
  try {
    await runEnrichment(process.argv.slice(2));
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((error: unknown) => {
  logError(`Fatal error: ${errorMessage(error)}`);
  process.exit(1);
});