#!/usr/bin/env tsx
/**
 * Second-pass matching: analyze stored ChromaDB search results to find websites
 * that the FreeSearch first pass missed.
 *
 * Usage:
 *   npx tsx scripts/match-search-results.ts --dry-run
 *   npx tsx scripts/match-search-results.ts --country IT --limit 100
 *   npx tsx scripts/match-search-results.ts --threshold 0.3
 *   npx tsx scripts/match-search-results.ts --job-id <existing-job-id>
 *
 * Algorithm:
 *   1. Get churches without websites that have been FreeSearch'd
 *   2. Query ChromaDB search_results collection for semantically similar results
 *   3. Cross-church matching: URLs from nearby churches may match
 *   4. URL frequency analysis: URLs appearing for multiple churches in same area
 *   5. Verify best candidates against page content
 *   6. Update church.website if verified
 */

import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { Collection } from 'chromadb';
import axios from 'axios';
import { getCollection, COLLECTION_NAMES } from '../src/chromadb/collections';
import { embedSingle } from '../src/chromadb/embeddings';

// Fresh DB connection
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// --- Tunables ---

/** Number of churches processed when `--limit` is not given. */
const DEFAULT_LIMIT = 500;
/** Maximum ChromaDB distance accepted when `--threshold` is not given. */
const DEFAULT_THRESHOLD = 0.4;
/** Number of nearest stored search results requested from ChromaDB per church. */
const CHROMA_RESULTS_PER_QUERY = 20;
/** Maximum number of candidate URLs fetched and verified per church. */
const MAX_CANDIDATES_PER_CHURCH = 5;
/** The job row is refreshed (and checked for a stop request) every N churches. */
const JOB_PROGRESS_INTERVAL = 10;
/** A progress line is logged every N churches. */
const LOG_PROGRESS_INTERVAL = 50;

// --- Types ---

/** Flat, JSON-serializable job configuration stored on the background job row. */
type JobConfig = Record<string, string | number | boolean | null | undefined>;

interface ChurchRecord {
  id: string;
  name: string;
  address: string | null;
  city: string | null;
  state: string | null;
  country: string;
  latitude: number;
  longitude: number;
}

interface MatchStats {
  processed: number;
  matched: number;
  noResults: number;
  verifyFailed: number;
  errors: number;
  startTime: number;
}

/** One stored search result returned by the ChromaDB query, with its metadata flattened. */
interface ChromaResult {
  id: string;
  url: string;
  title: string;
  score: number;
  distance: number;
  churchId: string;
  churchName: string;
  churchCity: string;
  /** Query that produced the result; carried so a metadata rewrite does not erase it. */
  searchQuery?: string;
  verified?: boolean;
}

// --- Helpers ---

let shuttingDown = false;

function log(msg: string): void {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}

function logError(msg: string): void {
  console.error(`[${new Date().toISOString()}] ${msg}`);
}

/** Extracts a printable message from a value caught in a `catch` clause. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Returns the argument following `flag`, or `undefined` when the flag is absent.
 *
 * @throws Error when the flag is present but has no value (end of argv, or the
 *   next token is another `--flag`).
 */
function readFlagValue(args: readonly string[], flag: string): string | undefined {
  const index = args.indexOf(flag);
  if (index === -1) return undefined;

  const value = args[index + 1];
  if (value === undefined || value.startsWith('--')) {
    throw new Error(`${flag} requires a value`);
  }
  return value;
}

/** Closes the Prisma client and the pg pool; failures are logged, never thrown. */
async function closeConnections(): Promise<void> {
  try {
    await prisma.$disconnect();
    await pool.end();
  } catch (error: unknown) {
    logError(`Failed to close database connections: ${errorMessage(error)}`);
  }
}

// --- Job Tracking ---

/**
 * Resumes an existing background job when `--job-id <id>` is on the command line.
 *
 * @returns the resumed job id, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` has no value; Prisma errors propagate when the
 *   update fails.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobId = readFlagValue(args, '--job-id');
  if (jobId === undefined) return null;

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}

/** Creates a new `match-search-results` job row in `running` state and returns its id. */
async function createNewJob(config: JobConfig): Promise<string> {
  const job = await prisma.backgroundJob.create({
    data: {
      type: 'match-search-results',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return job.id;
}

/** Writes the current counters (`processed`, `succeeded`, `totalItems`) to the job row. */
async function updateJobProgress(
  jobId: string,
  processed: number,
  found: number,
  total: number
): Promise<void> {
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { processed, succeeded: found, totalItems: total },
  });
}

/** Returns true when the job row's status has been set to `stopping`. */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  return job?.status === 'stopping';
}

/** Marks the job `failed` (when `error` is given) or `completed`, and stamps `completedAt`. */
async function completeJob(jobId: string, error?: string): Promise<void> {
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      status: error ? 'failed' : 'completed',
      error,
      completedAt: new Date(),
    },
  });
}

// --- Text Matching ---

/**
 * Lowercases, removes every character other than a-z, 0-9 and whitespace,
 * collapses whitespace runs and trims.
 *
 * NOTE(review): non-ASCII letters are deleted rather than transliterated
 * ("Częstochowa" -> "czstochowa"), and hyphens are removed without inserting a
 * space ("Saint-Pierre" -> "saintpierre"). Left unchanged because verifyUrl is
 * described as sharing its logic with enrich-with-freesearch.ts — confirm
 * before changing either side.
 */
function normalizeForMatch(str: string): string {
  return str
    .toLowerCase()
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

const CATHOLIC_KEYWORDS = [
  'parish', 'church', 'catholic', 'parroquia', 'paroisse', 'pfarrei',
  'parafia', 'paroquia', 'parrocchia', 'farnost', 'plebania', 'parochie',
  'župnija', 'farnosť', 'iglesia', 'église', 'kirche', 'kościół',
  'chiesa', 'kostel', 'templom', 'kerk',
];

const MASS_SCHEDULE_KEYWORDS = [
  'mass schedule', 'mass times', 'worship schedule', 'worship times',
  'service times', 'sunday mass', 'weekday mass',
  'horario de misas', 'horarios de misa', 'horaires des messes',
  'gottesdienst', 'gottesdienstzeiten', 'messzeiten',
  'msze święte', 'godziny mszy', 'msze św',
  'orari delle messe', 'orario messe', 'horário das missas',
];

const TOURISM_KEYWORDS = [
  'tourism', 'turismo', 'tourisme', 'turisme', 'touristik', 'turistico',
  'attractions', 'things to do', 'sightseeing', 'sehenswürdigkeiten',
  'what to see', 'places to visit', 'travel guide', 'reiseführer',
  'patrimoine', 'heritage trail', 'cultural heritage',
  'punto de interés', 'point of interest', 'points of interest',
];

/**
 * Words ignored when picking the distinguishing words of a church name.
 * Built once at module load instead of on every getSignificantWords call.
 * Entries containing non-ASCII letters can never equal a normalizeForMatch
 * token (that function deletes such letters); they are kept for reference.
 */
const STOP_WORDS: ReadonlySet<string> = new Set([
  'the', 'of', 'and', 'in', 'at', 'for', 'our', 'lady', 'st', 'saint', 'saints',
  'san', 'sant', 'santa', 'santo', 'sacred', 'christ', 'jesus', 'mary', 'maria',
  'king', 'lord', 'heart', 'cross', 'lady', 'queen', 'angel', 'angels', 'good', 'star',
  'nome', 'pere', 'madre', 'notre', 'dame', 'bien', 'onze', 'lieve', 'vrouw', 'heer',
  'rosa', 'paul', 'anne', 'jean', 'joan', 'luke', 'marc', 'rita', 'jose', 'leon',
  'pius', 'roch', 'yves', 'ines', 'vita', 'fara', 'bona',
  'cristo', 'fatima', 'lourdes', 'perpetuo', 'socorro', 'calvario', 'rosario',
  'pilar', 'carmen', 'dolores', 'remedios', 'nieves', 'grotte', 'mission',
  'sagrada', 'sagrado', 'familia', 'guadalupe',
  'assumption', 'immaculate', 'perpetual', 'divine',
  'knights', 'columbus', 'house', 'home', 'hall', 'center', 'centre', 'centro',
  'deacon', 'priest', 'bishop', 'father', 'sister', 'brother',
  'school', 'academy', 'college', 'seminary', 'rectory', 'retreat',
  'church', 'parish', 'catholic', 'roman', 'holy', 'chapel', 'cathedral',
  'basilica', 'shrine', 'convent', 'monastery',
  'chapelle', 'eglise', 'église', 'paroisse', 'couvent', 'grotte',
  'iglesia', 'parroquia', 'capilla', 'ermita', 'convento', 'basílica',
  'kirche', 'kapelle', 'pfarrei', 'kloster',
  'chiesa', 'parrocchia', 'cappella', 'oratorio',
  'igreja', 'capela', 'paroquia',
  'kościół', 'kaplica', 'parafia', 'droga',
  'kostel', 'kaple', 'farnost',
  'templom', 'kápolna',
  'de', 'la', 'le', 'les', 'du', 'des', 'el', 'los', 'las',
  'di', 'del', 'della', 'delle', 'degli', 'do', 'da', 'dos', 'das',
  'und', 'der', 'die', 'das', 'von', 'nad', 'pod', 'przy',
]);

/** Returns the normalized words of `name` that are at least 3 characters long and not stop words. */
function getSignificantWords(name: string): string[] {
  return normalizeForMatch(name)
    .split(' ')
    .filter(word => word.length >= 3 && !STOP_WORDS.has(word));
}

/**
 * Reduces an HTML document to lowercase text: drops <script> and <style>
 * elements with their contents, replaces remaining tags and named entities
 * with spaces, and collapses whitespace.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/&[a-z]+;/gi, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

// --- URL Verification (same logic as enrich-with-freesearch.ts) ---

/**
 * Fetches `url` and decides whether the page plausibly belongs to `church`.
 *
 * Signals taken from the page text: significant name words, normalized city,
 * address words, Catholic keywords, mass-schedule keywords and tourism
 * keywords. Signals taken from the URL: whether the hostname contains a name
 * word, and whether the path has more than two segments.
 *
 * @returns true when the signals satisfy one of the acceptance rules below;
 *   false otherwise, including on any fetch error or non-text response.
 */
async function verifyUrl(url: string, church: ChurchRecord): Promise<boolean> {
  try {
    const response = await axios.get(url, {
      timeout: 10000,
      maxRedirects: 3,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; NearestMass/1.0; +https://nearestmass.com)',
        'Accept': 'text/html',
      },
      maxContentLength: 200000,
      responseType: 'text',
    });

    if (typeof response.data !== 'string') return false;

    const text = stripHtml(response.data);
    const containsAny = (keywords: readonly string[]): boolean =>
      keywords.some(keyword => text.includes(keyword));

    const nameWords = getSignificantWords(church.name);
    const nameMatches = nameWords.filter(word => text.includes(word)).length;

    const cityNorm = church.city ? normalizeForMatch(church.city) : '';
    const cityMatch = cityNorm.length > 2 && text.includes(cityNorm);

    // An address matches when at least two of its non-numeric words of 4+
    // characters appear in the page.
    let addressMatch = false;
    if (church.address) {
      const addressWords = normalizeForMatch(church.address)
        .split(' ')
        .filter(word => word.length >= 4 && !/^\d+$/.test(word));
      addressMatch = addressWords.filter(word => text.includes(word)).length >= 2;
    }

    const hasCatholicKeyword = containsAny(CATHOLIC_KEYWORDS);
    const hasMassSchedule = containsAny(MASS_SCHEDULE_KEYWORDS);
    const isTourismPage = containsAny(TOURISM_KEYWORDS);

    let domainMatchesName = false;
    let isDeepUrl = false;
    try {
      const parsed = new URL(url);
      const hostname = parsed.hostname.toLowerCase();
      domainMatchesName = nameWords.some(word => word.length >= 4 && hostname.includes(word));
      isDeepUrl = parsed.pathname.split('/').filter(Boolean).length > 2;
    } catch {
      // Unparseable URL: both URL-derived signals stay false.
    }

    // Rejections: tourism pages and deep links need a mass schedule (or, for
    // deep links, a matching domain) to be considered at all.
    if (isTourismPage && !hasMassSchedule) return false;
    if (isDeepUrl && !domainMatchesName && !hasMassSchedule) return false;

    // Acceptance rules.
    if (hasMassSchedule && nameMatches >= 1) return true;
    if (domainMatchesName && nameMatches >= 1 && hasCatholicKeyword) return true;

    const hasCity = !!(church.city && church.city.trim());
    if (hasCity) {
      if (nameMatches >= 2) return true;
      if (nameMatches >= 1 && cityMatch) return true;
      if (nameMatches >= 1 && addressMatch) return true;
    } else {
      // Without a city to corroborate, require an address hit or more name words.
      if (nameMatches >= 1 && addressMatch) return true;
      if (nameMatches >= 3) return true;
    }

    return false;
  } catch {
    // Network errors, timeouts, oversized bodies, etc. all count as "not verified".
    return false;
  }
}

// --- ChromaDB Querying ---

/** Returns `value` when it is a string, otherwise an empty string. */
function metadataString(value: unknown): string {
  return typeof value === 'string' ? value : '';
}

/**
 * Embeds the church's name/address/city/country and queries the collection for
 * the `nResults` nearest stored search results with the same `churchCountry`.
 *
 * @returns results that have a URL and whose distance is at most `threshold`.
 *   A missing distance is treated as 1.
 */
async function findCandidatesForChurch(
  church: ChurchRecord,
  collection: Collection,
  threshold: number,
  nResults: number
): Promise<ChromaResult[]> {
  // Build identity text for semantic search
  const identityText =
    `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
  const queryEmbedding = await embedSingle(identityText);

  const results = await collection.query({
    queryEmbeddings: [queryEmbedding],
    nResults,
    where: { churchCountry: church.country },
  });

  const ids = results.ids[0];
  if (!ids) return [];

  const metadatas = results.metadatas?.[0] ?? [];
  const distances = results.distances?.[0] ?? [];

  return ids
    .map((id, i): ChromaResult => {
      // A row without metadata yields an empty URL and is filtered out below.
      const metadata = (metadatas[i] ?? {}) as Record<string, unknown>;
      return {
        id,
        url: metadataString(metadata.resultUrl),
        title: metadataString(metadata.resultTitle),
        score: typeof metadata.score === 'number' ? metadata.score : 0,
        distance: distances[i] ?? 1,
        churchId: metadataString(metadata.churchId),
        churchName: metadataString(metadata.churchName),
        churchCity: metadataString(metadata.churchCity),
        searchQuery: metadataString(metadata.searchQuery),
        verified: Boolean(metadata.verified),
      };
    })
    .filter(result => result.distance <= threshold && result.url !== '');
}

/** Keeps the lowest-distance result per URL and returns them sorted by ascending distance. */
function deduplicateByUrl(results: ChromaResult[]): ChromaResult[] {
  const bestByUrl = new Map<string, ChromaResult>();
  for (const result of results) {
    const existing = bestByUrl.get(result.url);
    if (!existing || result.distance < existing.distance) {
      bestByUrl.set(result.url, result);
    }
  }
  return [...bestByUrl.values()].sort((a, b) => a.distance - b.distance);
}

// --- Main Processing ---

/**
 * Builds the ordered list of URLs to verify for `church`:
 *   1. URLs already marked verified on another church's result,
 *   2. URLs stored for two or more distinct churches,
 *   3. the church's own results, closest first,
 *   4. results of other churches in the same (normalized) city, closest first.
 * Each URL appears once, at its highest-priority position.
 */
function rankCandidateUrls(church: ChurchRecord, candidates: ChromaResult[]): string[] {
  const ownResults = candidates.filter(r => r.churchId === church.id);
  const crossResults = candidates.filter(r => r.churchId !== church.id);

  // Count distinct churches per URL, so the same URL stored under several
  // queries of one church does not count as "multiple churches".
  const churchesByUrl = new Map<string, Set<string>>();
  for (const r of candidates) {
    const owners = churchesByUrl.get(r.url) ?? new Set<string>();
    owners.add(r.churchId);
    churchesByUrl.set(r.url, owners);
  }

  // A Set keeps insertion order, which gives de-duplicated priority ordering.
  const ordered = new Set<string>();

  for (const r of crossResults) {
    if (r.verified) ordered.add(r.url);
  }

  for (const [url, owners] of churchesByUrl) {
    if (owners.size >= 2) ordered.add(url);
  }

  for (const r of deduplicateByUrl(ownResults)) ordered.add(r.url);

  const churchCityNorm = church.city ? normalizeForMatch(church.city) : '';
  const sameCityCross = crossResults.filter(
    r => church.city && r.churchCity && normalizeForMatch(r.churchCity) === churchCityNorm
  );
  for (const r of deduplicateByUrl(sameCityCross)) ordered.add(r.url);

  return [...ordered];
}

/**
 * Flags the stored search result for `verifiedUrl` as verified in ChromaDB.
 * Prefers the church's own stored result and falls back to any result with
 * that URL. Best-effort: a failure is logged and does not fail the church.
 */
async function markResultVerified(
  church: ChurchRecord,
  collection: Collection,
  candidates: ChromaResult[],
  verifiedUrl: string
): Promise<void> {
  const matchingResult =
    candidates.find(r => r.url === verifiedUrl && r.churchId === church.id) ??
    candidates.find(r => r.url === verifiedUrl);
  if (!matchingResult) return;

  try {
    // update replaces metadata, so include all fields
    await collection.update({
      ids: [matchingResult.id],
      metadatas: [{
        churchId: matchingResult.churchId,
        churchName: matchingResult.churchName,
        churchCity: matchingResult.churchCity,
        churchCountry: church.country,
        searchQuery: matchingResult.searchQuery ?? '',
        resultUrl: verifiedUrl,
        resultTitle: matchingResult.title || '',
        score: matchingResult.score || 0,
        verified: true,
      }],
    });
  } catch (error: unknown) {
    logError(`   could not mark ${matchingResult.id} as verified in ChromaDB: ${errorMessage(error)}`);
  }
}

/**
 * Processes one church: finds candidate URLs in ChromaDB, verifies up to
 * MAX_CANDIDATES_PER_CHURCH of them in priority order, and on the first
 * success stores the website (skipped when `dryRun`). Updates `stats` in
 * place. Errors are counted and logged here, so this function does not throw.
 */
async function processChurch(
  church: ChurchRecord,
  collection: Collection,
  stats: MatchStats,
  threshold: number,
  dryRun: boolean
): Promise<void> {
  const label = `${church.name} (${church.city || 'unknown'}, ${church.country})`;

  try {
    // 1. Semantic search for similar results in ChromaDB
    const candidates = await findCandidatesForChurch(
      church,
      collection,
      threshold,
      CHROMA_RESULTS_PER_QUERY
    );

    if (candidates.length === 0) {
      log(` - ${label} => no ChromaDB results within threshold`);
      stats.noResults++;
      return;
    }

    // 2-4. Rank candidate URLs and limit to the top few
    const topUrls = rankCandidateUrls(church, candidates).slice(0, MAX_CANDIDATES_PER_CHURCH);

    log(` ? ${label} => ${candidates.length} results, trying ${topUrls.length} candidates`);

    // 5. Verify each candidate, stopping at the first success
    let verifiedUrl: string | null = null;
    for (const url of topUrls) {
      if (await verifyUrl(url, church)) {
        verifiedUrl = url;
        break;
      }
      stats.verifyFailed++;
    }

    if (verifiedUrl === null) {
      log(` ~ ${label} => ${topUrls.length} candidates failed verification`);
      stats.noResults++;
      return;
    }

    log(` + ${label} => ${verifiedUrl}`);
    stats.matched++;

    // 6. Persist the match
    if (!dryRun) {
      await prisma.church.update({
        where: { id: church.id },
        data: {
          website: verifiedUrl,
          hasWebsite: true,
        },
      });
      await markResultVerified(church, collection, candidates, verifiedUrl);
    }
  } catch (error: unknown) {
    stats.errors++;
    logError(` ! ${label} => error: ${errorMessage(error)}`);
  }
}

// --- Main ---

/**
 * CLI entry point. Flags: `--country <code>`, `--limit <n>` (default 500),
 * `--threshold <distance>` (default 0.4), `--dry-run`, `--job-id <id>`.
 *
 * @throws Error on invalid flag values, and rethrows any failure of the
 *   processing run after marking the job as failed.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const countryCode = readFlagValue(args, '--country');

  const limitArg = readFlagValue(args, '--limit');
  const limit = limitArg === undefined ? DEFAULT_LIMIT : Number.parseInt(limitArg, 10);
  if (!Number.isInteger(limit) || limit < 0) {
    throw new Error(`Invalid --limit value: ${limitArg}`);
  }

  const thresholdArg = readFlagValue(args, '--threshold');
  const threshold =
    thresholdArg === undefined ? DEFAULT_THRESHOLD : Number.parseFloat(thresholdArg);
  if (!Number.isFinite(threshold) || threshold < 0) {
    throw new Error(`Invalid --threshold value: ${thresholdArg}`);
  }

  // Graceful shutdown: finish the current church, then stop.
  process.on('SIGTERM', () => { log('Received SIGTERM'); shuttingDown = true; });
  process.on('SIGINT', () => { log('Received SIGINT'); shuttingDown = true; });

  log('============================================================');
  log('Second-Pass Search Result Matching');
  log('============================================================');
  log(`Country: ${countryCode || 'All'}`);
  log(`Limit: ${limit}`);
  log(`Threshold: ${threshold}`);
  log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  log('============================================================');

  // Connect to ChromaDB
  let collection: Collection;
  try {
    collection = await getCollection(COLLECTION_NAMES.SEARCH_RESULTS);
    log('ChromaDB search_results collection connected');
  } catch (e: unknown) {
    logError(`ChromaDB unavailable: ${errorMessage(e)}`);
    logError('This script requires ChromaDB. Make sure it is running.');
    await closeConnections();
    process.exit(1);
  }

  // Check collection has data
  const count = await collection.count();
  log(`ChromaDB search_results: ${count} entries`);
  if (count === 0) {
    log('No search results stored yet. Run enrich-with-freesearch.ts first.');
    await closeConnections();
    process.exit(0);
  }

  // Job tracking: resume the job named by --job-id, otherwise create one.
  const jobId =
    (await createOrResumeJob(args)) ??
    (await createNewJob({ countryCode, limit, threshold, dryRun }));
  log(`Job ID: ${jobId}`);

  const stats: MatchStats = {
    processed: 0,
    matched: 0,
    noResults: 0,
    verifyFailed: 0,
    errors: 0,
    startTime: Date.now(),
  };

  try {
    // Get churches without websites that have been FreeSearch'd
    const whereClause: Record<string, unknown> = {
      source: 'osm',
      website: null,
      freeSearchedAt: { not: null },
    };
    if (countryCode) {
      whereClause.country = countryCode;
    }

    const churches = await prisma.church.findMany({
      // NOTE(review): untyped filter kept as in the existing code; presumably the
      // generated Prisma types do not cover every field used here — confirm.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      where: whereClause as any,
      select: {
        id: true,
        name: true,
        address: true,
        city: true,
        state: true,
        country: true,
        latitude: true,
        longitude: true,
      },
      take: limit,
      orderBy: { updatedAt: 'asc' },
    });

    log(`Found ${churches.length} churches without websites (already FreeSearch'd)`);

    for (const church of churches) {
      if (shuttingDown) break;

      stats.processed++;
      await processChurch(church, collection, stats, threshold, dryRun);

      // Job tracking every JOB_PROGRESS_INTERVAL items
      if (stats.processed % JOB_PROGRESS_INTERVAL === 0) {
        await updateJobProgress(jobId, stats.processed, stats.matched, churches.length);
        if (await checkJobStopping(jobId)) {
          log('Job stop requested via admin dashboard.');
          shuttingDown = true;
          break;
        }
      }

      // Progress logging every LOG_PROGRESS_INTERVAL items
      if (stats.processed % LOG_PROGRESS_INTERVAL === 0) {
        const elapsedSeconds = (Date.now() - stats.startTime) / 1000;
        const rate = Math.round((stats.processed / elapsedSeconds) * 3600);
        log(`Progress: ${stats.processed}/${churches.length} processed, ${stats.matched} matched, ${stats.noResults} no match, ${stats.errors} errors (~${rate}/hour)`);
      }
    }

    // Complete job
    await updateJobProgress(jobId, stats.processed, stats.matched, churches.length);
    await completeJob(jobId);

    // Print summary
    const elapsed = ((Date.now() - stats.startTime) / 1000).toFixed(1);
    const matchRate = stats.processed > 0
      ? ((stats.matched / stats.processed) * 100).toFixed(1)
      : '0.0';

    log('');
    log('============================================================');
    log('Second-Pass Matching Summary');
    log('============================================================');
    log(`Churches processed: ${stats.processed}`);
    log(`Websites matched: ${stats.matched}`);
    log(`No match found: ${stats.noResults}`);
    log(`Verify rejected: ${stats.verifyFailed}`);
    log(`Errors: ${stats.errors}`);
    log(`Match rate: ${matchRate}%`);
    log(`Elapsed: ${elapsed}s`);
    log('============================================================');
  } catch (error: unknown) {
    // Record the failure on the job row so it is not left in 'running'.
    try {
      await completeJob(jobId, errorMessage(error));
    } catch (jobError: unknown) {
      logError(`Could not mark job ${jobId} as failed: ${errorMessage(jobError)}`);
    }
    throw error;
  } finally {
    await closeConnections();
  }
}

main().catch((error: unknown) => {
  logError(`Fatal error: ${errorMessage(error)}`);
  process.exit(1);
});