329 lines
9.8 KiB
TypeScript
329 lines
9.8 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
|
||
|
|
* Enrich churches with website URLs from Wikidata
|
||
|
|
*
|
||
|
|
* Queries Wikidata SPARQL endpoint for Catholic churches that have official websites,
|
||
|
|
* then matches them to existing churches in the database via proximity + name matching.
|
||
|
|
*
|
||
|
|
* Usage:
|
||
|
|
* npx tsx scripts/enrich-with-wikidata.ts --dry-run
|
||
|
|
* npx tsx scripts/enrich-with-wikidata.ts --execute
|
||
|
|
* npx tsx scripts/enrich-with-wikidata.ts --execute --country DE
|
||
|
|
* npx tsx scripts/enrich-with-wikidata.ts --job-id <uuid>
|
||
|
|
*/
|
||
|
|
|
||
|
|
import dotenv from 'dotenv';
|
||
|
|
import path from 'path';
|
||
|
|
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
import axios from 'axios';
|
||
|
|
|
||
|
|
// Database wiring: a node-postgres pool configured from DATABASE_URL, wrapped
// in the Prisma pg driver adapter and handed to the Prisma client.
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({
  adapter,
});
|
||
|
|
|
||
|
|
/** Endpoint that receives the SPARQL queries. */
const WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql';

/** Max distance (in km) between a Wikidata church and a DB church for matching. */
const MATCH_RADIUS_KM = 1;

/** SPARQL results requested per query (used as LIMIT and as the paging step). */
const BATCH_SIZE = 500;
|
||
|
|
|
||
|
|
/**
 * Prints a message to stdout, prefixed with the current time as an
 * ISO-8601 timestamp in square brackets.
 *
 * @param msg - Text to print after the timestamp.
 */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
|
||
|
|
|
||
|
|
/**
 * Prints a message to stderr, prefixed with the current time as an
 * ISO-8601 timestamp in square brackets.
 *
 * @param msg - Text to print after the timestamp.
 */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ' + msg);
}
|
||
|
|
|
||
|
|
/**
 * Great-circle distance between two points using the haversine formula.
 *
 * @param lat1 - Latitude of the first point, in degrees.
 * @param lon1 - Longitude of the first point, in degrees.
 * @param lat2 - Latitude of the second point, in degrees.
 * @param lon2 - Longitude of the second point, in degrees.
 * @returns Distance in kilometres, based on a sphere of radius 6371 km.
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const EARTH_RADIUS_KM = 6371;
  const toRadians = (degrees: number): number => degrees * Math.PI / 180;

  const halfDeltaLat = toRadians(lat2 - lat1) / 2;
  const halfDeltaLon = toRadians(lon2 - lon1) / 2;

  // Haversine term: sin²(Δφ/2) + cos φ1 · cos φ2 · sin²(Δλ/2)
  const latTerm = Math.sin(halfDeltaLat) ** 2;
  const lonTerm = Math.cos(toRadians(lat1)) * Math.cos(toRadians(lat2)) * Math.sin(halfDeltaLon) ** 2;
  const h = latTerm + lonTerm;

  return EARTH_RADIUS_KM * 2 * Math.asin(Math.sqrt(h));
}
|
||
|
|
|
||
|
|
/**
 * Reduces a church name to a canonical form for fuzzy comparison.
 *
 * Steps, in order: lower-case, strip combining accents, delete every
 * character that is not a-z, 0-9 or whitespace, collapse whitespace runs to a
 * single space, and trim the ends.
 *
 * @param str - Raw name.
 * @returns The normalized name; may be empty.
 */
function normalizeForMatch(str: string): string {
  const lowered = str.toLowerCase();
  // Decompose accented letters, then drop the combining marks (strip accents).
  const unaccented = lowered.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  const alphanumeric = unaccented.replace(/[^a-z0-9\s]/g, '');
  const singleSpaced = alphanumeric.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
|
||
|
|
|
||
|
|
/** One row of the Wikidata SPARQL result, flattened to plain values. */
interface WikidataChurch {
  /** Entity id: the entity URI with the `http://www.wikidata.org/entity/` prefix removed. */
  wikidataId: string;
  /** Label returned by the label service; empty string when absent. */
  label: string;
  /** Website value from the query; empty string when absent. */
  website: string;
  /** Latitude parsed from the query result; 0 when absent. */
  lat: number;
  /** Longitude parsed from the query result; 0 when absent. */
  lon: number;
}
|
||
|
|
|
||
|
|
/**
 * Fetches one page of Catholic churches with websites and coordinates from
 * the Wikidata SPARQL endpoint.
 *
 * @param country - Optional ISO alpha-2 country code (case-insensitive). Only
 *   the codes listed in the map below are supported.
 * @param offset - Number of result rows to skip; pages are BATCH_SIZE rows.
 * @returns The rows of this page. Missing label/website/id values become ''
 *   and missing coordinates become 0, so callers can filter on falsy values.
 * @throws Error if `country` is given but not in the supported map. Previously
 *   such a value silently removed the country filter and queried every country.
 * @throws RangeError if `offset` is not a non-negative integer.
 */
async function queryWikidata(country?: string, offset = 0): Promise<WikidataChurch[]> {
  if (!Number.isInteger(offset) || offset < 0) {
    throw new RangeError(`offset must be a non-negative integer, got ${offset}`);
  }

  // Map ISO alpha-2 to Wikidata country item
  const countryMap: Record<string, string> = {
    DE: 'Q183', FR: 'Q142', ES: 'Q29', IT: 'Q38', PL: 'Q36',
    PT: 'Q45', BR: 'Q155', NL: 'Q55', CZ: 'Q213', HU: 'Q28',
    AT: 'Q40', BE: 'Q31', CH: 'Q39', IE: 'Q27', GB: 'Q145',
    US: 'Q30', CA: 'Q16', MX: 'Q96', AR: 'Q414', CO: 'Q739',
    HR: 'Q224', SK: 'Q214', SI: 'Q215',
  };

  let countryFilter = '';
  if (country) {
    const code = country.trim().toUpperCase();
    const qid = countryMap[code];
    if (!qid) {
      throw new Error(
        `Unsupported country code "${country}". Supported codes: ${Object.keys(countryMap).join(', ')}`,
      );
    }
    // Only the QID from the fixed map is interpolated, never the raw argument.
    countryFilter = `?church wdt:P17 wd:${qid} .`;
  }

  // SPARQL query for Catholic churches with websites
  const sparql = `
    SELECT ?church ?churchLabel ?website ?lat ?lon WHERE {
      ?church wdt:P31/wdt:P279* wd:Q16970 .
      ?church wdt:P140 wd:Q9592 .
      ?church wdt:P856 ?website .
      ?church p:P625 ?coordStatement .
      ?coordStatement ps:P625 ?coord .
      BIND(geof:latitude(?coord) AS ?lat)
      BIND(geof:longitude(?coord) AS ?lon)
      ${countryFilter}
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,de,fr,es,it,pt,pl,nl,cs,hu" . }
    }
    ORDER BY ?church
    LIMIT ${BATCH_SIZE}
    OFFSET ${offset}
  `;

  const response = await axios.get(WIKIDATA_SPARQL_URL, {
    params: { query: sparql, format: 'json' },
    headers: {
      'User-Agent': 'NearestMass/1.0 (https://nearestmass.com; contact: privacy@nearestmass.com)',
      'Accept': 'application/sparql-results+json',
    },
    timeout: 60000,
  });

  // Shape of one SPARQL JSON result row, limited to the variables selected above.
  type SparqlBinding = Partial<
    Record<'church' | 'churchLabel' | 'website' | 'lat' | 'lon', { value?: string }>
  >;

  const rawBindings: unknown = response.data?.results?.bindings;
  const bindings: SparqlBinding[] = Array.isArray(rawBindings) ? rawBindings : [];

  return bindings.map((b) => ({
    label: b.churchLabel?.value || '',
    website: b.website?.value || '',
    lat: parseFloat(b.lat?.value || '0'),
    lon: parseFloat(b.lon?.value || '0'),
    wikidataId: b.church?.value?.replace('http://www.wikidata.org/entity/', '') || '',
  }));
}
|
||
|
|
|
||
|
|
/** The best database candidate found for a Wikidata church. */
interface MatchResult {
  /** `id` of the matched church row. */
  churchId: string;
  /** `name` of the matched church row. */
  churchName: string;
  /** Fraction (0 to 1) of the Wikidata label's words found in the church name. */
  nameScore: number;
  /** Haversine distance between the two records, in kilometres. */
  distance: number;
}
|
||
|
|
|
||
|
|
/**
 * Finds the database church (currently without a website) that best matches
 * a Wikidata church by proximity and name overlap.
 *
 * Candidates come from a bounding-box query and are then filtered by exact
 * haversine distance (<= MATCH_RADIUS_KM). A candidate is accepted when at
 * least 50% of the Wikidata label's words (3+ characters, normalized) appear
 * in its name, or when it lies within 100 m regardless of name. The highest
 * name score wins; ties go to the closer candidate.
 *
 * @param wdChurch - Wikidata record to match.
 * @returns The best match, or null when no candidate qualifies.
 */
async function findMatch(wdChurch: WikidataChurch): Promise<MatchResult | null> {
  // Size the pre-filter box from MATCH_RADIUS_KM. A fixed 0.01° of longitude
  // is only about 0.68 km at 52°N, which excluded east/west candidates that
  // are inside the match radius. The earth radius matches haversineKm.
  const kmPerDegree = 6371 * Math.PI / 180;
  const latDelta = MATCH_RADIUS_KM / kmPerDegree;
  // Use the poleward edge of the box so the longitude span covers the radius
  // everywhere in the box; the clamp avoids dividing by ~0 near the poles.
  const polewardLatRad = (Math.abs(wdChurch.lat) + latDelta) * Math.PI / 180;
  const lonDelta = latDelta / Math.max(Math.cos(polewardLatRad), 0.01);

  // Find nearby churches without a website. At most 20 candidates are
  // fetched, with no explicit ordering.
  const nearby = await prisma.church.findMany({
    where: {
      website: null,
      latitude: { gte: wdChurch.lat - latDelta, lte: wdChurch.lat + latDelta },
      longitude: { gte: wdChurch.lon - lonDelta, lte: wdChurch.lon + lonDelta },
    },
    select: { id: true, name: true, latitude: true, longitude: true },
    take: 20,
  });

  if (nearby.length === 0) return null;

  // Score each candidate
  const wdNameNorm = normalizeForMatch(wdChurch.label);
  const wdWords = wdNameNorm.split(' ').filter(w => w.length >= 3);

  let bestMatch: MatchResult | null = null;

  for (const church of nearby) {
    const dist = haversineKm(wdChurch.lat, wdChurch.lon, church.latitude, church.longitude);
    if (dist > MATCH_RADIUS_KM) continue;

    const churchNameNorm = normalizeForMatch(church.name);
    const churchWords = new Set(churchNameNorm.split(' ').filter(w => w.length >= 3));

    // Count matching words
    let matchingWords = 0;
    for (const w of wdWords) {
      if (churchWords.has(w)) matchingWords++;
    }

    const nameScore = wdWords.length > 0 ? matchingWords / wdWords.length : 0;

    // Require at least 50% word overlap or distance < 100m
    if (nameScore < 0.5 && dist > 0.1) continue;

    const isBetter =
      !bestMatch ||
      nameScore > bestMatch.nameScore ||
      (nameScore === bestMatch.nameScore && dist < bestMatch.distance);

    if (isBetter) {
      bestMatch = {
        churchId: church.id,
        churchName: church.name,
        distance: dist,
        nameScore,
      };
    }
  }

  return bestMatch;
}
|
||
|
|
|
||
|
|
// --- Job Tracking ---
|
||
|
|
|
||
|
|
/**
 * Resumes an existing background job when `--job-id <id>` is on the command
 * line, marking it as running with a fresh start time.
 *
 * @param args - Command-line arguments (without the node/script entries).
 * @returns The job id that was resumed, or null when `--job-id` is absent.
 * @throws Error if `--job-id` is present without a value. Previously
 *   `undefined` was passed to Prisma as the unique key.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value (e.g. --job-id <uuid>)');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||
|
|
|
||
|
|
/**
 * Script entry point: pages through Wikidata results, matches each church to
 * a database church, and (with `--execute`) writes the website to it.
 *
 * Flags read from process.argv:
 *   --execute          write updates; without it the run is a dry run
 *   --country <code>   restrict the Wikidata query to one country
 *   --job-id <id>      resume an existing background job
 *
 * Progress is written to the background job after every page. The loop ends
 * when Wikidata returns an empty page or the job status becomes 'stopping'.
 * Database connections are closed on both success and failure.
 *
 * @throws Rethrows any error from the enrichment loop after recording the
 *   failure on the job (when there is one).
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = !args.includes('--execute');
  const countryIdx = args.indexOf('--country');
  const country = countryIdx !== -1 ? args[countryIdx + 1] : undefined;

  try {
    // A bare `--country` used to fall through as "all countries".
    if (countryIdx !== -1 && (!country || country.startsWith('--'))) {
      throw new Error('--country requires a country code (e.g. --country DE)');
    }

    log('============================================================');
    log('Wikidata Church Website Enrichment');
    log('============================================================');
    log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
    log(`Country: ${country || 'All'}`);
    log('============================================================');

    // Job tracking
    let jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      const job = await prisma.backgroundJob.create({
        data: {
          type: 'wikidata-enrichment',
          status: 'running',
          startedAt: new Date(),
          config: { country, dryRun },
        },
      });
      jobId = job.id;
      log(`Job ID: ${jobId}`);
    }

    let totalFetched = 0;
    let matched = 0;
    let updated = 0;
    let noMatch = 0;
    let offset = 0;

    try {
      while (true) {
        log(`Querying Wikidata (offset ${offset})...`);
        const results = await queryWikidata(country, offset);

        if (results.length === 0) {
          log('No more results from Wikidata.');
          break;
        }

        totalFetched += results.length;
        log(`Fetched ${results.length} churches from Wikidata (total: ${totalFetched})`);

        for (const wdChurch of results) {
          // queryWikidata maps missing values to '' / 0, so falsy means "absent".
          if (!wdChurch.website || !wdChurch.lat || !wdChurch.lon) continue;

          const match = await findMatch(wdChurch);

          if (!match) {
            noMatch++;
            continue;
          }

          matched++;
          log(`  Match: "${wdChurch.label}" (${wdChurch.wikidataId}) -> "${match.churchName}" (dist: ${match.distance.toFixed(3)}km, score: ${match.nameScore.toFixed(2)})`);

          if (!dryRun) {
            await prisma.church.update({
              where: { id: match.churchId },
              data: {
                website: wdChurch.website,
                hasWebsite: true,
              },
            });
            updated++;
          }
        }

        // Rate limit SPARQL queries
        await new Promise(r => setTimeout(r, 2000));
        offset += BATCH_SIZE;

        // Update job progress
        if (jobId) {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: {
              processed: totalFetched,
              succeeded: updated,
              itemsFound: matched,
            },
          });

          // Check for stop
          const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
          if (job?.status === 'stopping') {
            log('Job stop requested.');
            break;
          }
        }
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      logError(`Error: ${message}`);
      if (jobId) {
        try {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: { status: 'failed', error: message, completedAt: new Date() },
          });
        } catch (updateError: unknown) {
          // Do not let a bookkeeping failure hide the original error.
          const updateMessage = updateError instanceof Error ? updateError.message : String(updateError);
          logError(`Could not mark job ${jobId} as failed: ${updateMessage}`);
        }
      }
      throw error;
    }

    // Complete job
    // NOTE(review): a run ended by a stop request is also recorded as
    // 'completed' here; confirm whether a distinct terminal status is expected.
    if (jobId) {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: { status: 'completed', completedAt: new Date(), processed: totalFetched, succeeded: updated, itemsFound: matched },
      });
    }

    // The former "Already had website" line was dropped: its counter was never
    // incremented (findMatch only returns churches without a website), so it
    // always reported 0.
    log('');
    log('============================================================');
    log('Wikidata Enrichment Summary');
    log('============================================================');
    log(`Wikidata churches fetched: ${totalFetched}`);
    log(`Matched to DB churches: ${matched}`);
    log(`Websites updated: ${updated}`);
    log(`No match found: ${noMatch}`);
    log('============================================================');
  } finally {
    // Release DB connections on the failure path too.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||
|
|
|
||
|
|
// Run the script; any rejection from main() is logged and turned into exit
// code 1. Non-Error rejections are stringified rather than printed as
// "undefined".
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${message}`);
  process.exit(1);
});
|