#!/usr/bin/env tsx
/**
 * Enrich OSM churches with website URLs via FreeSearch (self-hosted SearXNG)
 *
 * Usage:
 *   npx tsx scripts/enrich-with-freesearch.ts --limit 10 --dry-run
 *   npx tsx scripts/enrich-with-freesearch.ts --country US --limit 50
 *   npx tsx scripts/enrich-with-freesearch.ts --continuous
 *   npx tsx scripts/enrich-with-freesearch.ts --re-search --country CR --limit 50
 *
 * --re-search: Re-search previously-searched churches that have no website.
 *   Uses multi-query strategy (several query variations per church) to get
 *   broader FreeSearch coverage. Prioritizes churches that got city data
 *   after their initial search (via reverse geocoding).
 *
 * FreeSearch is free and unlimited (~1,500-2,400 churches/hour).
 * Full initial pass of ~193K churches in ~4-6 days.
 */

import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { Collection } from 'chromadb';
import axios from 'axios';
import crypto from 'crypto';
import { getCollection, COLLECTION_NAMES } from '../src/chromadb/collections';
import { embed } from '../src/chromadb/embeddings';

// Fresh DB connection (not cached singleton)
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

const FREESEARCH_URL = process.env.FREESEARCH_URL || 'http://192.168.0.145:3111';

// --- Job Tracking ---

/**
 * Resume the background job named by `--job-id <id>`, marking it as running.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The resumed job id, or null when `--job-id` was not passed.
 * @throws Error when `--job-id` is present but has no value after it.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  // Without this guard an undefined id would be handed to Prisma as the `where` key.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}

/**
 * Create a new `freesearch-enrichment` background job in the running state.
 *
 * @param config Run configuration stored on the job row.
 * @returns The id of the created job.
 */
// `any` values: config is stored in a JSON column; NOTE(review): tighten if the schema allows.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async function createNewJob(config: Record<string, any>): Promise<string> {
  const job = await prisma.backgroundJob.create({
    data: {
      type: 'freesearch-enrichment',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return job.id;
}

/** Copy the current counters from `stats` onto the job row. */
async function updateJobProgress(jobId: string, stats: EnrichmentStats, totalItems: number): Promise<void> {
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      processed: stats.processed,
      succeeded: stats.enriched,
      failed: stats.errors,
      itemsFound: stats.enriched,
      totalItems,
    },
  });
}

/** True when the job row's status has been set to 'stopping'. */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  return job?.status === 'stopping';
}

/** Mark the job finished: 'failed' (with the message) when `error` is given, else 'completed'. */
async function completeJob(jobId: string, error?: string): Promise<void> {
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      status: error ? 'failed' : 'completed',
      error,
      completedAt: new Date(),
    },
  });
}

const BATCH_SIZE = 20;
const INTER_BATCH_DELAY_MS = 5000;
const PROGRESS_INTERVAL = 100;

// --- Country priority (same as Google Places enrichment) ---
const COUNTRY_PRIORITY = [
  'FR', 'DE', 'ES', 'PL', 'BR', 'PT', 'PH', 'CZ', 'MX', 'HU',
  'US', 'CR', 'BE', 'AR', 'CH', 'CO', 'CA', 'SK', 'EC', 'SI',
  'NL', 'PE', 'GB', 'ID', 'CL', 'IE', 'BO', 'VN', 'VE', 'UG',
  'LU', 'HN', 'CD', 'SV', 'KE', 'PA', 'AU', 'CU', 'GT', 'CN',
];

// Country-specific church keywords for search queries
const COUNTRY_KEYWORDS: Record<string, string> = {
  FR: 'paroisse', DE: 'pfarrei', ES: 'parroquia', MX: 'parroquia',
  PL: 'parafia', BR: 'paroquia', PT: 'paroquia', IT: 'parrocchia',
  CZ: 'farnost', HU: 'plebania', AR: 'parroquia', CO: 'parroquia',
  EC: 'parroquia', PE: 'parroquia', CL: 'parroquia', VE: 'parroquia',
  CR: 'parroquia', SV: 'parroquia', GT: 'parroquia', CU: 'parroquia',
  PA: 'parroquia', BO: 'parroquia', HN: 'parroquia', BE: 'paroisse',
  LU: 'paroisse', CH: 'pfarrei', NL: 'parochie', SK: 'farnosť',
  SI: 'župnija',
};

// Country-specific "parish" keywords for multi-query strategy
const PARISH_KEYWORDS: Record<string, string> = {
  US: 'parish', GB: 'parish', IE: 'parish', AU: 'parish', CA: 'parish',
  ES: 'parroquia', MX: 'parroquia', AR: 'parroquia', CO: 'parroquia',
  FR: 'paroisse', BE: 'paroisse', DE: 'pfarrei', AT: 'pfarrei', CH: 'pfarrei',
  IT: 'parrocchia', PT: 'paroquia', BR: 'paroquia', PL: 'parafia',
  CR: 'parroquia', CL: 'parroquia', PE: 'parroquia', HN: 'parroquia',
  SV: 'parroquia', GT: 'parroquia', NI: 'parroquia', EC: 'parroquia',
  VE: 'parroquia', BO: 'parroquia', PA: 'parroquia', CU: 'parroquia',
  CZ: 'farnost', HU: 'plebania', SK: 'farnosť', SI: 'župnija',
  NL: 'parochie', LU: 'paroisse',
};

// --- Blocked domains ---
const BLOCKED_DOMAINS = new Set<string>([
  // Social media
  'facebook.com', 'fb.com', 'twitter.com', 'x.com', 'instagram.com',
  'linkedin.com', 'youtube.com', 'tiktok.com', 'pinterest.com',
  // Directories & listings
  'yelp.com', 'yellowpages.com', 'yp.com', 'bbb.org', 'manta.com',
  'superpages.com', 'whitepages.com', 'foursquare.com',
  // Church directories (not the parish's own site)
  'masstimes.org', 'catholicdirectory.com', 'findmass.com',
  'parishesonline.com', 'gcatholic.org', 'catholicmasstime.org',
  'discovermass.com', 'faithstreet.com', 'miserend.hu',
  // Schools & landmarks with church-like names (common false positives)
  'archbishopchapelle.org', 'sainte-chapelle.fr',
  // Map & geo services
  'google.com', 'mapquest.com', 'apple.com', 'bing.com',
  'openstreetmap.org', 'mapcarta.com',
  // Travel & reviews (multiple TLDs)
  'tripadvisor.com', 'tripadvisor.co.nz', 'tripadvisor.co.uk',
  'tripadvisor.es', 'tripadvisor.fr', 'tripadvisor.de', 'tripadvisor.it',
  'tripadvisor.com.br', 'tripadvisor.com.mx', 'tripadvisor.pl',
  // Reference & encyclopedias
  'wikipedia.org', 'wikidata.org', 'worldhistory.org', 'britannica.com',
  // Dictionaries & translation
  'spanishdict.com', 'wiktionary.org', 'cambridge.org',
  'merriam-webster.com', 'wordreference.com', 'dict.cc', 'linguee.com',
  'deepl.com', 'translate.google.com', 'collinsdictionary.com',
  'reverso.net', 'thefreedictionary.com', 'dictionary.com',
  // Non-Catholic religious sites
  'jw.org',
  // News & media (not parish websites)
  'pbs.org', 'bbc.com', 'cnn.com', 'nytimes.com',
  // Wikimedia projects
  'wikimedia.org',
  // Free hosting platforms (generic, not parish-specific)
  'wixsite.com', 'weebly.com', 'blogspot.com', 'wordpress.com',
  // Travel, ticket & tourism sites
  'trip.com', 'seatgeek.com', 'airial.travel', 'booking.com',
  'expedia.com', 'hotels.com', 'viator.com', 'getyourguide.com',
  'wanderlog.com', 'spain-places.com', 'burgundy-tourism.com',
  'philippineairlines.com',
  // Q&A, forums, news
  'stackexchange.com', 'stackoverflow.com', 'reddit.com', 'quora.com',
  'thisisanfield.com',
  // Software / SaaS / false matches on church terms
  'myconvento.com', 'socha.net', 'drogaraia.com.br',
  // Heritage & cultural directories
  'zabytek.pl', 'monumentos.gov.pt',
  // Location / directory aggregators
  'wheree.com', 'seety.co',
  // Banks (false matches on "sant" → "santander", etc.)
  'santanderbank.com', 'santander.com', 'bancosantander.es',
  // General reference / clearly unrelated
  'brainly.in', 'zhihu.com', 'countyoffice.org', 'weforum.org',
  'themoviedb.org', 'imdb.com', 'amazon.com', 'ebay.com',
  'thereformation.com', 'rarest.org', 'gutenberg.org',
  'consumersadvocate.org', 'endlessmile.com', 'sacredrootsnc.com',
  // Government sites (city pages, not parish sites)
  'madrid.es',
  // Regional tourism portals
  'romantischer-rhein.de', 'riberana.es', 'lascuarre.es',
  // Action/advocacy organizations (not parish sites)
  'franciscanaction.org',
  // Church directories (additional)
  'iglesiaslocales.com',
  // Catholic media & organizations (not individual parishes)
  'usccb.org', 'vaticannews.va', 'ewtn.com', 'catholic.org',
  'newadvent.org', 'catholicnewsagency.com', 'ncronline.org',
  'americamagazine.org', 'catholic-hierarchy.org', 'aleteia.org',
  // Dictionaries (additional)
  'lerobert.com', 'larousse.fr',
  // News (regional)
  'murciatoday.com',
  // National/international organization sites (not local parishes/chapters)
  'kofc.org', 'sagradafamilia.org',
  // Bible/religious Q&A sites
  'gotquestions.org', 'biblegateway.com', 'biblehub.com',
  // Universities/schools (not parish sites)
  'rivier.edu',
  // Religious directories (additional)
  'latinmassdir.org', 'myhollyland.org',
]);

// Country-specific TLDs for scoring bonus.
// Values are regex fragments (note the escaped dots), not plain strings.
const COUNTRY_TLDS: Record<string, string> = {
  DE: 'de', FR: 'fr', ES: 'es', IT: 'it', PT: 'pt', PL: 'pl',
  NL: 'nl', BE: 'be', AT: 'at', CH: 'ch', CZ: 'cz', HU: 'hu',
  HR: 'hr', SK: 'sk', SI: 'si', IE: 'ie', GB: 'uk',
  BR: 'com\\.br', MX: 'com\\.mx', AR: 'com\\.ar', CO: 'com\\.co',
};

// Catholic keywords used in scoring
const CATHOLIC_KEYWORDS = [
  'parish', 'church', 'catholic', 'parroquia', 'paroisse', 'pfarrei',
  'parafia', 'paroquia', 'parrocchia', 'farnost', 'plebania', 'parochie',
  'župnija', 'farnosť', 'iglesia', 'église', 'kirche', 'kościół',
  'chiesa', 'kostel', 'templom', 'kerk',
];

// --- Types ---

interface ChurchRecord {
  id: string;
  name: string;
  address: string | null;
  city: string | null;
  state: string | null;
  country: string;
  latitude: number;
  longitude: number;
  diocese: string | null;
}

interface SearchResult {
  title: string;
  url: string;
  description?: string;
  source?: string;
}

interface FreeSearchResponse {
  results: SearchResult[];
  provider: string;
  query: string;
  elapsed_ms: number;
}

interface EnrichmentStats {
  processed: number;
  enriched: number;
  notFound: number;
  errors: number;
  verified: number;
  verifyFailed: number;
  cycles: number;
  startTime: number;
}

// --- Circuit Breaker ---

/**
 * Opens after `threshold` consecutive recorded failures. While open,
 * `checkAndWait()` sleeps for the current backoff, probes the FreeSearch
 * health endpoint and either closes the breaker or doubles the backoff
 * (capped at `maxBackoffMs`).
 */
class CircuitBreaker {
  private failures = 0;
  private isOpen = false;
  private backoffMs = 10000;
  private readonly maxBackoffMs = 300000; // 5 minutes
  private readonly threshold = 5;

  /**
   * @returns true when the breaker is closed (or was just closed by a
   *          successful health check); false when it is still open.
   */
  async checkAndWait(): Promise<boolean> {
    if (!this.isOpen) return true;

    log(`Circuit breaker open. Waiting ${Math.round(this.backoffMs / 1000)}s before retry...`);
    await sleep(this.backoffMs);

    // Health check
    try {
      const resp = await axios.get(`${FREESEARCH_URL}/api/health`, { timeout: 5000 });
      if (resp.status === 200) {
        this.reset();
        log('Circuit breaker closed: FreeSearch is back');
        return true;
      }
    } catch {
      // Still down
    }

    this.backoffMs = Math.min(this.backoffMs * 2, this.maxBackoffMs);
    return false;
  }

  /** Count one failure; opens the breaker once the threshold is reached. */
  recordFailure(): void {
    this.failures++;
    if (this.failures >= this.threshold && !this.isOpen) {
      this.isOpen = true;
      this.backoffMs = 10000;
      log(`Circuit breaker OPEN after ${this.failures} consecutive failures`);
    }
  }

  /** Clear the failure count, close the breaker and restore the initial backoff. */
  reset(): void {
    if (this.failures > 0 || this.isOpen) {
      this.failures = 0;
      this.isOpen = false;
      this.backoffMs = 10000;
    }
  }

  get opened(): boolean {
    return this.isOpen;
  }
}

// --- Helpers ---

let shuttingDown = false;

function log(msg: string): void {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}

function logError(msg: string): void {
  console.error(`[${new Date().toISOString()}] ${msg}`);
}

/**
 * Resolve after `ms` milliseconds, or earlier once `shuttingDown` is set
 * (polled every second). Both timers are cleared whichever fires first, so
 * no pending timer is left behind to keep the event loop alive.
 */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => {
    const finish = (): void => {
      clearTimeout(timer);
      clearInterval(shutdownPoll);
      resolve();
    };
    const timer = setTimeout(finish, ms);
    // Allow early exit on shutdown
    const shutdownPoll = setInterval(() => {
      if (shuttingDown) finish();
    }, 1000);
  });
}

// Second-level labels that, together with a two-letter country TLD, form the
// public suffix (e.g. "com.br", "co.uk", "org.uk").
const SECOND_LEVEL_SUFFIX_LABELS = new Set(['co', 'com', 'org', 'net', 'gov', 'gob', 'edu', 'ac', 'or', 'ne']);

/**
 * Registrable domain of a URL: "www.parish.org" -> "parish.org",
 * "www.paroquia.com.br" -> "paroquia.com.br". Returns '' for unparseable URLs.
 *
 * Heuristic only (no public-suffix list): three labels are kept when the
 * hostname ends in `<generic label>.<two-letter TLD>`.
 */
function getDomain(url: string): string {
  try {
    const hostname = new URL(url).hostname.toLowerCase();
    const labels = hostname.split('.');
    if (labels.length < 2) return hostname;

    const tld = labels[labels.length - 1] ?? '';
    const secondLevel = labels[labels.length - 2] ?? '';
    const hasTwoPartSuffix =
      labels.length >= 3 && tld.length === 2 && SECOND_LEVEL_SUFFIX_LABELS.has(secondLevel);
    return labels.slice(hasTwoPartSuffix ? -3 : -2).join('.');
  } catch {
    return '';
  }
}

// Domain name keywords that indicate non-parish sites
const BLOCKED_DOMAIN_KEYWORDS = [
  'tripadvisor', 'archinform', 'seatgeek',
  // Tourism/travel
  'turismo', 'tourism', 'tourisme', 'touristik', 'turistico', 'turistik',
  'reisefuhrer', 'wanderlog', 'viator', 'getyourguide',
];

/**
 * True when a search result URL should never be considered: its host is (a
 * subdomain of) a blocked domain, its hostname contains a blocked keyword,
 * it points at a PDF/Word document, or it cannot be parsed.
 */
function isBlockedUrl(url: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return true; // Block unparseable URLs
  }

  const hostname = parsed.hostname.toLowerCase();

  // A host is blocked when it equals a blocked domain or ends with "." + domain,
  // i.e. when any label-aligned suffix of the hostname is in the set.
  const labels = hostname.split('.');
  for (let i = 0; i < labels.length; i++) {
    if (BLOCKED_DOMAINS.has(labels.slice(i).join('.'))) return true;
  }

  // Check domain keywords (catches all TLD variants)
  if (BLOCKED_DOMAIN_KEYWORDS.some(keyword => hostname.includes(keyword))) return true;

  // Block PDF/document URLs (not parish homepages)
  const pathname = parsed.pathname.toLowerCase();
  return pathname.endsWith('.pdf') || pathname.endsWith('.doc') || pathname.endsWith('.docx');
}

/** Short (12 hex chars) MD5-based identifier for a URL. */
function hashUrl(url: string): string {
  return crypto.createHash('md5').update(url).digest('hex').slice(0, 12);
}

/**
 * Lowercase, drop everything except a-z, 0-9 and whitespace, collapse
 * whitespace runs and trim.
 *
 * NOTE(review): accented letters and hyphens are deleted, not folded
 * ("église" -> "glise", "Saint-Pierre" -> "saintpierre"). Folding diacritics
 * would be more accurate but changes what callers match against page text,
 * so it is left as-is here.
 */
function normalizeForMatch(str: string): string {
  return str.toLowerCase()
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

const RAW_STOP_WORDS = [
  // English articles/prepositions
  'the', 'of', 'and', 'in', 'at', 'for', 'our', 'lady',
  // Religious titles & very common church name words
  'st', 'saint', 'saints', 'san', 'sant', 'santa', 'santo', 'sacred',
  'christ', 'jesus', 'mary', 'maria', 'king', 'lord', 'heart', 'cross',
  'lady', 'queen', 'angel', 'angels', 'good', 'star', 'nome', 'nome',
  'pere', 'madre', 'notre', 'dame', 'bien', 'onze', 'lieve', 'vrouw', 'heer',
  // Very common short saint/religious names (match too many unrelated domains)
  'rosa', 'paul', 'anne', 'jean', 'joan', 'luke', 'marc', 'rita', 'jose',
  'leon', 'pius', 'roch', 'yves', 'ines', 'vita', 'fara', 'bona',
  // Common religious words that match wrong churches/organizations
  'cristo', 'fatima', 'lourdes', 'perpetuo', 'socorro', 'calvario',
  'rosario', 'pilar', 'carmen', 'dolores', 'remedios', 'nieves', 'grotte',
  'mission', 'sagrada', 'sagrado', 'familia', 'guadalupe', 'assumption',
  'immaculate', 'perpetual', 'divine',
  // Organization names (match national sites, not local chapters)
  'knights', 'columbus',
  // Structural/role words (not distinctive church names)
  'house', 'home', 'hall', 'center', 'centre', 'centro', 'deacon', 'priest',
  'bishop', 'father', 'sister', 'brother', 'school', 'academy', 'college',
  'seminary', 'rectory', 'retreat',
  // Church-generic words (EN)
  'church', 'parish', 'catholic', 'roman', 'holy', 'chapel', 'cathedral',
  'basilica', 'shrine', 'convent', 'monastery',
  // Church-generic words (FR)
  'chapelle', 'eglise', 'église', 'paroisse', 'couvent', 'grotte',
  // Church-generic words (ES)
  'iglesia', 'parroquia', 'capilla', 'ermita', 'convento', 'basílica',
  // Church-generic words (DE)
  'kirche', 'kapelle', 'pfarrei', 'kloster',
  // Church-generic words (IT)
  'chiesa', 'parrocchia', 'cappella', 'oratorio',
  // Church-generic words (PT)
  'igreja', 'capela', 'paroquia',
  // Church-generic words (PL)
  'kościół', 'kaplica', 'parafia', 'droga',
  // Church-generic words (CZ/SK/HU)
  'kostel', 'kaple', 'farnost', 'templom', 'kápolna',
  // Articles/prepositions (Romance)
  'de', 'la', 'le', 'les', 'du', 'des', 'el', 'los', 'las', 'di', 'del',
  'della', 'delle', 'degli', 'do', 'da', 'dos', 'das',
  // Articles/prepositions (Germanic)
  'und', 'der', 'die', 'das', 'von',
  // Articles/prepositions (Slavic)
  'nad', 'pod', 'przy',
];

// Built once. Each entry is stored both raw and in normalizeForMatch() form:
// names are normalized before lookup, so an accented entry such as "église"
// only ever appears as "glise" and would otherwise never be filtered out.
const STOP_WORDS = new Set<string>(RAW_STOP_WORDS.flatMap(w => [w, normalizeForMatch(w)]));

/**
 * Distinctive words of a church name: normalized words of 3+ characters that
 * are not stop words. When every word is filtered out, falls back to the two
 * longest words of 4+ characters regardless of the stop list.
 */
function getSignificantWords(name: string): string[] {
  const words = normalizeForMatch(name).split(' ');
  const significant = words.filter(w => w.length >= 3 && !STOP_WORDS.has(w));
  if (significant.length > 0) return significant;

  // If all words were filtered, return top 2 longest words (>= 4 chars) regardless of stop list
  return words
    .filter(w => w.length >= 4)
    .sort((a, b) => b.length - a.length)
    .slice(0, 2);
}

// --- URL Scoring ---

/**
 * Heuristic relevance score of a search result for a church. Higher is
 * better; the result is an additive mix of name/city matches in the domain,
 * path, title and description, keyword and TLD bonuses, and penalties for
 * diocesan domains and deep paths.
 *
 * @param result Search hit to score.
 * @param church Church the hit is a candidate website for.
 * @returns The score (may be negative).
 */
function scoreUrl(result: SearchResult, church: ChurchRecord): number {
  let score = 0;
  const url = result.url.toLowerCase();
  const title = (result.title || '').toLowerCase();
  const description = (result.description || '').toLowerCase();
  const domain = getDomain(result.url);
  const domainWithoutTld = domain.split('.')[0] || '';
  const nameWords = getSignificantWords(church.name);

  // Parse the path once; an unparseable URL simply gets no path-based scoring.
  let urlPath = '';
  let pathDepth = 0;
  try {
    const pathname = new URL(result.url).pathname;
    urlPath = pathname.toLowerCase();
    pathDepth = pathname.split('/').filter(Boolean).length;
  } catch {
    /* ignore */
  }

  let hasNameMatch = false;

  // Church name words in domain (strongest signal)
  // Require >= 4 chars to avoid short-word false matches (e.g. "rosa" in "rosaparks")
  if (nameWords.some(word => word.length >= 4 && domainWithoutTld.includes(word))) {
    score += 10;
    hasNameMatch = true;
  }

  // Church name words in URL path (e.g. /san-bartolome/ or /stmichael)
  if (nameWords.some(word => word.length >= 4 && urlPath.includes(word))) {
    score += 2;
    hasNameMatch = true;
  }

  // Church name words in title / description: every matching word counts
  for (const word of nameWords) {
    if (title.includes(word)) {
      score += 5;
      hasNameMatch = true;
    }
    if (description.includes(word)) {
      score += 2;
      hasNameMatch = true;
    }
  }

  if (hasNameMatch) {
    // Catholic keywords ONLY count if there's also a name match
    // (prevents "iglesia" in dictionary, "chiesa" in soccer news, etc.)
    if (CATHOLIC_KEYWORDS.some(kw => title.includes(kw) || description.includes(kw))) {
      score += 3;
    }

    // TLD bonus (only with name match)
    if (url.match(/\.(org|church)(\/|$)/)) {
      score += 3;
    }

    // Country-specific TLD bonus (e.g. .pl for Polish churches, .com.br for Brazilian)
    const countryTld = COUNTRY_TLDS[church.country];
    if (countryTld && url.match(new RegExp(`\\.${countryTld}(\\/|$)`))) {
      score += 4;
    }
  }

  // City name in title/domain
  if (church.city) {
    const cityNorm = normalizeForMatch(church.city);
    if (cityNorm.length > 2) {
      if (title.includes(cityNorm)) score += 3;
      if (domainWithoutTld.includes(cityNorm.replace(/\s/g, ''))) score += 5;
    }
  }

  // Penalty: diocese/archdiocese in domain (not the parish site)
  if (['diocese', 'archdiocese', 'bistum', 'diecezja'].some(term => domainWithoutTld.includes(term))) {
    score -= 5;
  }

  // Penalty: deep URL path
  if (pathDepth > 3) {
    score -= 5;
  }

  return score;
}

/**
 * Returns top candidate URLs sorted by score (min score 1).
 * Relaxed threshold — verifyUrl() is the real quality gate.
 * Multiple candidates are returned so verification can try the next one if first fails.
*/
function pickCandidateUrls(results: SearchResult[], church: ChurchRecord): string[] {
  // Blocked results are dropped first; the rest are scored once each.
  // Return top 5 candidates with score >= 1 (verifyUrl is the real quality gate)
  return results
    .filter(r => !isBlockedUrl(r.url))
    .map(r => ({ url: r.url, score: scoreUrl(r, church) }))
    .filter(s => s.score >= 1)
    .sort((a, b) => b.score - a.score)
    .slice(0, 5)
    .map(s => s.url);
}

// --- URL Verification ---

/**
 * Reduce an HTML document to lowercase plain text for keyword matching:
 * `<script>` and `<style>` elements are removed together with their content,
 * remaining tags and character entities (named or numeric) become spaces,
 * and whitespace runs are collapsed.
 */
function stripHtml(html: string): string {
  return html
    // The opening tag must be matched explicitly; otherwise everything from
    // the first ">" up to the closing tag (i.e. real page text) is deleted.
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/&#?[a-z0-9]+;/gi, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

// Tourism/cultural keywords — pages about churches as attractions, NOT the church's own site
const TOURISM_KEYWORDS = [
  'tourism', 'turismo', 'tourisme', 'turisme', 'touristik', 'turistico',
  'attractions', 'things to do', 'sightseeing', 'sehenswürdigkeiten',
  'what to see', 'places to visit', 'travel guide', 'reiseführer',
  'patrimoine', 'heritage trail', 'cultural heritage',
  'punto de interés', 'point of interest', 'points of interest',
];

// Mass schedule keywords — pages with these are almost certainly the church's own site
const MASS_SCHEDULE_KEYWORDS = [
  'mass schedule', 'mass times', 'worship schedule', 'worship times',
  'service times', 'sunday mass', 'weekday mass',
  'horario de misas', 'horarios de misa', 'horaires des messes',
  'gottesdienst', 'gottesdienstzeiten', 'messzeiten',
  'msze święte', 'godziny mszy', 'msze św',
  'orari delle messe', 'orario messe', 'horário das missas',
];

/**
 * Fetch the URL and verify the page content mentions the church.
 * Returns true if the page appears to belong to this church.
*/ async function verifyUrl(url: string, church: ChurchRecord): Promise { try { const response = await axios.get(url, { timeout: 10000, maxRedirects: 3, headers: { 'User-Agent': 'Mozilla/5.0 (compatible; NearestMass/1.0; +https://nearestmass.com)', 'Accept': 'text/html', }, // Only read first 200KB to avoid downloading huge pages maxContentLength: 200000, responseType: 'text', }); if (typeof response.data !== 'string') return false; const text = stripHtml(response.data); const nameWords = getSignificantWords(church.name); // Count how many significant name words appear in the page let nameMatches = 0; for (const word of nameWords) { if (text.includes(word)) { nameMatches++; } } // Check for city name on page let cityMatch = false; if (church.city) { const cityNorm = normalizeForMatch(church.city); if (cityNorm.length > 2 && text.includes(cityNorm)) { cityMatch = true; } } // Check for address on page (street name is a strong location signal) let addressMatch = false; if (church.address) { const addrNorm = normalizeForMatch(church.address); // Extract significant address words (street names, not numbers) const addrWords = addrNorm.split(' ').filter(w => w.length >= 4 && !/^\d+$/.test(w)); // If 2+ address words found on page, it's likely the right location let addrWordMatches = 0; for (const w of addrWords) { if (text.includes(w)) addrWordMatches++; } if (addrWordMatches >= 2) addressMatch = true; } // Check for Catholic/church keywords on page let hasCatholicKeyword = false; for (const kw of CATHOLIC_KEYWORDS) { if (text.includes(kw)) { hasCatholicKeyword = true; break; } } // Check if page has mass schedule info (strong positive signal) let hasMassSchedule = false; for (const kw of MASS_SCHEDULE_KEYWORDS) { if (text.includes(kw)) { hasMassSchedule = true; break; } } // Check if page is a tourism/cultural page (negative signal) let isTourismPage = false; for (const kw of TOURISM_KEYWORDS) { if (text.includes(kw)) { isTourismPage = true; break; } } // Check if 
domain contains church name words (strong signal) // Require word length >= 5 to avoid false matches (e.g. "rosa" in "rosaparks") let domainMatchesName = false; try { const hostname = new URL(url).hostname.toLowerCase(); for (const word of nameWords) { if (word.length >= 5 && hostname.includes(word)) { domainMatchesName = true; break; } } } catch { /* ignore */ } // Reject tourism pages unless they also have mass schedules (= the church itself) if (isTourismPage && !hasMassSchedule) return false; // Reject deep URLs on non-matching domains (articles ABOUT churches, not church sites) // Church sites are usually at root or shallow paths let isDeepUrl = false; try { const pathSegments = new URL(url).pathname.split('/').filter(Boolean); isDeepUrl = pathSegments.length > 2; } catch { /* ignore */ } if (isDeepUrl && !domainMatchesName && !hasMassSchedule) return false; const hasCity = !!(church.city && church.city.trim()); // Strong signal: mass schedule + at least 1 name word = almost certainly the right church if (hasMassSchedule && nameMatches >= 1) return true; // Domain matches name + name word on page + Catholic keyword = likely the church's site if (domainMatchesName && nameMatches >= 1 && hasCatholicKeyword) return true; // For churches WITH city data, require name + city (city provides disambiguation) if (hasCity) { if (nameMatches >= 2) return true; if (nameMatches >= 1 && cityMatch) return true; if (nameMatches >= 1 && addressMatch) return true; } // For churches WITHOUT city data, require stronger evidence: // - Domain match already handled above // - Mass schedule already handled above // - Address match can substitute for city // - Need 3+ name words as fallback (very distinctive name) if (!hasCity) { if (nameMatches >= 1 && addressMatch) return true; if (nameMatches >= 3) return true; } return false; } catch { // If we can't fetch the page (timeout, SSL error, etc.), reject it return false; } } // --- Search Query Construction --- function 
buildSearchQuery(church: ChurchRecord): string { const parts: string[] = []; // Church name parts.push(`"${church.name}"`); // City if (church.city) parts.push(church.city); // State (for US/CA/AU/BR) if (church.state && ['US', 'CA', 'AU', 'BR'].includes(church.country)) { parts.push(church.state); } // Diocese (helps disambiguate churches with common names) if (church.diocese) parts.push(church.diocese); // Country-specific keyword const keyword = COUNTRY_KEYWORDS[church.country]; if (keyword) parts.push(keyword); parts.push('official website'); return parts.join(' '); } /** * Build multiple query variations for re-search mode. * Returns 2-3 different queries to get broader FreeSearch coverage. */ function buildSearchQueries(church: ChurchRecord): string[] { const queries: string[] = []; // Query 1: Standard query (quoted name + city + keyword) queries.push(buildSearchQuery(church)); // Query 2: Unquoted name + city (catches accent/spelling variations) const nameWords = church.name.split(/\s+/).filter(w => w.length >= 3); const simpleName = nameWords.join(' '); const q2Parts = [simpleName]; if (church.city) q2Parts.push(church.city); const keyword = COUNTRY_KEYWORDS[church.country]; if (keyword) q2Parts.push(keyword); queries.push(q2Parts.join(' ')); // Query 3: Distinctive words only + city + local parish keyword const sigWords = getSignificantWords(church.name); if (sigWords.length > 0 && church.city) { const q3Parts = [...sigWords, church.city]; const parishKeyword = PARISH_KEYWORDS[church.country]; if (parishKeyword) q3Parts.push(parishKeyword); queries.push(q3Parts.join(' ')); } // Query 4: Country-specific TLD search (for non-English countries) const countryTld = COUNTRY_TLDS[church.country]; if (countryTld && church.country !== 'US' && church.country !== 'GB') { const tldClean = countryTld.replace(/\\\./g, '.'); const q4Parts = [simpleName]; if (church.city) q4Parts.push(church.city); q4Parts.push(`site:.${tldClean}`); queries.push(q4Parts.join(' ')); } // 
Deduplicate queries return [...new Set(queries)]; } // --- FreeSearch API --- async function searchSingle(query: string): Promise { const response = await axios.get(`${FREESEARCH_URL}/api/search`, { params: { q: query }, timeout: 30000, }); return response.data; } async function healthCheck(): Promise { try { const resp = await axios.get(`${FREESEARCH_URL}/api/health`, { timeout: 5000 }); return resp.status === 200; } catch { return false; } } async function waitForFreeSearch(): Promise { let backoffMs = 30_000; const maxBackoffMs = 300_000; // 5 minutes let attempt = 0; while (!shuttingDown) { attempt++; const healthy = await healthCheck(); if (healthy) { if (attempt > 1) log('FreeSearch is back. Continuing...'); return; } const waitSec = Math.round(backoffMs / 1000); logError(`FreeSearch not reachable at ${FREESEARCH_URL} (attempt ${attempt}). Retrying in ${waitSec}s...`); await sleep(backoffMs); backoffMs = Math.min(backoffMs * 2, maxBackoffMs); } } // --- Database Queries --- // Cutoff timestamp for re-search: only re-search churches searched BEFORE script start const reSearchCutoff = new Date(); async function getNextBatch( batchSize: number, countryCode?: string, reSearch?: boolean ): Promise { const selectFields = { id: true, name: true, address: true, city: true, state: true, country: true, latitude: true, longitude: true, diocese: true, }; // Re-search mode: find previously-searched churches that have no website // Only pick churches searched BEFORE this script started (avoids infinite loop // since processBatch updates freeSearchedAt to now, which is after the cutoff) if (reSearch) { return prisma.church.findMany({ where: { source: 'osm', website: null, freeSearchedAt: { not: null, lt: reSearchCutoff }, city: { not: null }, NOT: { city: '' }, ...(countryCode ? 
{ country: countryCode } : {}), }, select: selectFields, take: batchSize, // Prioritize reverse-geocoded churches (got city after first search) orderBy: [{ reverseGeocodedAt: { sort: 'desc', nulls: 'last' } }, { createdAt: 'asc' }], }); } if (countryCode) { return prisma.church.findMany({ where: { source: 'osm', website: null, freeSearchedAt: null, country: countryCode, city: { not: null }, NOT: { city: '' }, }, select: selectFields, take: batchSize, orderBy: { createdAt: 'asc' }, }); } // Round-robin: take a few from each priority country // Prioritize churches WITH city data (better search results) const perCountry = Math.max(1, Math.ceil(batchSize / COUNTRY_PRIORITY.length)); const churches: ChurchRecord[] = []; for (const country of COUNTRY_PRIORITY) { if (churches.length >= batchSize) break; // First try churches with city data (higher quality searches) const batch = await prisma.church.findMany({ where: { source: 'osm', website: null, freeSearchedAt: null, country, city: { not: null }, NOT: { city: '' }, }, select: selectFields, take: perCountry, orderBy: { createdAt: 'asc' }, }); churches.push(...batch); } return churches.slice(0, batchSize); } // --- Main Processing --- async function processBatch( churches: ChurchRecord[], stats: EnrichmentStats, dryRun: boolean, jobId?: string | null, chromaCollection?: Collection | null, reSearch?: boolean ): Promise { for (const church of churches) { if (shuttingDown) break; stats.processed++; const label = `${church.name} (${church.city || 'unknown'}, ${church.country})`; // Multi-query for re-search mode, single query for first-pass const queries = reSearch ? 
buildSearchQueries(church) : [buildSearchQuery(church)]; try { // Search with all queries, merge results const allResults: SearchResult[] = []; const seenUrls = new Set(); for (const query of queries) { try { const response = await searchSingle(query); for (const r of (response.results || [])) { if (!seenUrls.has(r.url)) { seenUrls.add(r.url); allResults.push(r); } } } catch (error: any) { logError(` Query failed: ${query} — ${error.message}`); } // Brief pause between queries to not overwhelm FreeSearch if (queries.length > 1) await sleep(500); } if (reSearch && queries.length > 1) { log(` ? ${label} => ${queries.length} queries, ${allResults.length} unique results`); } const results = allResults; const candidateUrls = pickCandidateUrls(results, church); // Store all non-blocked search results in ChromaDB for later analysis if (chromaCollection && results.length > 0 && !dryRun) { try { const nonBlocked = results.filter(r => !isBlockedUrl(r.url)); if (nonBlocked.length > 0) { const docs = nonBlocked.map(r => `${r.title || ''} ${r.description || ''} ${r.url}`.trim() ); const embeddings = await embed(docs); await chromaCollection.upsert({ ids: nonBlocked.map(r => `search-${church.id}-${hashUrl(r.url)}`), embeddings, documents: docs, metadatas: nonBlocked.map(r => ({ churchId: church.id, churchName: church.name, churchCity: church.city || '', churchCountry: church.country, searchQuery: queries[0], resultUrl: r.url, resultTitle: r.title || '', score: scoreUrl(r, church), })), }); } } catch (e: any) { logError(`ChromaDB store failed: ${e.message}`); } } if (results.length === 0) { log(` - ${label} => no results`); stats.notFound++; if (!dryRun) { await prisma.church.update({ where: { id: church.id }, data: { freeSearchedAt: new Date() }, }); } continue; } if (candidateUrls.length === 0) { log(` - ${label} => no candidates above threshold`); stats.notFound++; if (!dryRun) { await prisma.church.update({ where: { id: church.id }, data: { freeSearchedAt: new Date() }, }); } 
continue; } // Try each candidate URL with verification let verifiedUrl: string | null = null; for (const url of candidateUrls) { const ok = await verifyUrl(url, church); if (ok) { verifiedUrl = url; stats.verified++; break; } else { stats.verifyFailed++; } } if (verifiedUrl) { log(` + ${label} => ${verifiedUrl}`); stats.enriched++; if (!dryRun) { await prisma.church.update({ where: { id: church.id }, data: { website: verifiedUrl, hasWebsite: true, freeSearchedAt: new Date(), }, }); // Mark the verified result in ChromaDB (update replaces metadata, so include all fields) if (chromaCollection) { try { await chromaCollection.update({ ids: [`search-${church.id}-${hashUrl(verifiedUrl)}`], metadatas: [{ churchId: church.id, churchName: church.name, churchCity: church.city || '', churchCountry: church.country, searchQuery: queries[0], resultUrl: verifiedUrl, resultTitle: '', score: 0, verified: true, }], }); } catch { /* ignore — entry may not exist if ChromaDB was down during store */ } } } } else { log(` ~ ${label} => ${candidateUrls.length} candidates failed verification`); stats.notFound++; if (!dryRun) { await prisma.church.update({ where: { id: church.id }, data: { freeSearchedAt: new Date() }, }); } } } catch (error: any) { stats.errors++; logError(` ! ${label} => error: ${error.message}`); throw error; } // Job tracking: update progress every 10 items if (jobId && stats.processed % 10 === 0) { await updateJobProgress(jobId, stats, churches.length); const stopping = await checkJobStopping(jobId); if (stopping) { log('Job stop requested via admin dashboard.'); shuttingDown = true; break; } } // Progress logging if (stats.processed % PROGRESS_INTERVAL === 0) { const elapsed = (Date.now() - stats.startTime) / 1000; const rate = Math.round((stats.processed / elapsed) * 3600); const hitRate = stats.processed > 0 ? 
((stats.enriched / stats.processed) * 100).toFixed(1) : '0.0';
      log(`Progress: ${stats.processed} processed, ${stats.enriched} enriched, ${stats.notFound} not found, ${stats.errors} errors`);
      log(`Hit rate: ${hitRate}%, Verified: ${stats.verified}, Verify failed: ${stats.verifyFailed}, Rate: ~${rate} churches/hour`);
    }
  }
}

/**
 * Run one bounded enrichment pass.
 *
 * Repeatedly fetches a batch via `getNextBatch` and hands it to `processBatch`
 * until one of the following happens: shutdown is requested, `limit` churches
 * have been fetched, `getNextBatch` returns an empty batch, or five batches in
 * a row throw.
 *
 * @param stats - Shared counters, mutated by `processBatch`.
 * @param countryCode - Optional country filter forwarded to `getNextBatch`.
 * @param limit - Maximum number of churches to fetch; falsy (undefined/0) means no limit.
 * @param dryRun - Forwarded to `processBatch`; when true it skips DB/ChromaDB writes.
 * @param jobId - Background job to report progress against, if any.
 * @param chromaCollection - Optional ChromaDB collection for storing search results.
 * @param reSearch - Forwarded to `getNextBatch`/`processBatch` to enable re-search mode.
 */
async function runSinglePass(
  stats: EnrichmentStats,
  countryCode?: string,
  limit?: number,
  dryRun: boolean = false,
  jobId?: string | null,
  chromaCollection?: Collection | null,
  reSearch?: boolean
): Promise<void> {
  const MAX_CONSECUTIVE_BATCH_ERRORS = 5;
  let totalProcessed = 0;
  let consecutiveErrors = 0;

  // NOTE(review): in dry-run mode processBatch does not stamp freeSearchedAt, so
  // getNextBatch presumably keeps returning the same churches — confirm, and
  // prefer combining --dry-run with --limit.
  while (!shuttingDown) {
    if (limit && totalProcessed >= limit) break;

    // Never fetch more than what is left of the limit.
    const batchLimit = limit ? Math.min(BATCH_SIZE, limit - totalProcessed) : BATCH_SIZE;
    const churches = await getNextBatch(batchLimit, countryCode, reSearch);
    if (churches.length === 0) break;

    try {
      await processBatch(churches, stats, dryRun, jobId, chromaCollection, reSearch);
      consecutiveErrors = 0;
    } catch (error: unknown) {
      // processBatch has already counted and logged the failing church.
      const message = error instanceof Error ? error.message : String(error);
      consecutiveErrors++;
      if (consecutiveErrors >= MAX_CONSECUTIVE_BATCH_ERRORS) {
        logError(`${MAX_CONSECUTIVE_BATCH_ERRORS} consecutive batch errors. Stopping.`);
        break;
      }
      logError(`Batch error (${consecutiveErrors}/${MAX_CONSECUTIVE_BATCH_ERRORS}): ${message}`);
    }

    // The whole fetched batch counts against the limit, even if it failed part-way.
    totalProcessed += churches.length;

    // A short batch means the queue is (nearly) drained; skip the delay.
    if (!shuttingDown && churches.length === batchLimit) {
      await sleep(INTER_BATCH_DELAY_MS);
    }
  }
}

/**
 * Run enrichment in an endless loop until shutdown is requested.
 *
 * Each "cycle" drains `getNextBatch` batch by batch. When a cycle finds no
 * work at all, the loop idles for an hour (polling the shutdown flag every
 * 10 seconds); otherwise it pauses 10 seconds and starts the next cycle.
 * Batch failures are reported to a `CircuitBreaker`, successes reset it.
 *
 * @param stats - Shared counters, mutated by `processBatch`; `cycles` is incremented here.
 * @param jobId - Background job to report progress against, if any.
 * @param chromaCollection - Optional ChromaDB collection for storing search results.
 * @param reSearch - Forwarded to `getNextBatch`/`processBatch` to enable re-search mode.
 * @param dryRun - Forwarded to `processBatch`; previously hard-coded to `false`,
 *   which made `--continuous --dry-run` write to the database.
 * @param countryCode - Optional country filter forwarded to `getNextBatch`;
 *   previously ignored in continuous mode.
 */
async function runContinuous(
  stats: EnrichmentStats,
  jobId?: string | null,
  chromaCollection?: Collection | null,
  reSearch?: boolean,
  dryRun: boolean = false,
  countryCode?: string
): Promise<void> {
  log('Running in continuous mode (24/7). Press Ctrl+C to stop.');
  const circuitBreaker = new CircuitBreaker();

  while (!shuttingDown) {
    stats.cycles++;
    log(`--- Cycle ${stats.cycles} ---`);
    let batchesInCycle = 0;

    while (!shuttingDown) {
      // Circuit breaker check
      // NOTE(review): checkAndWait() presumably waits and reports whether the
      // service recovered; a falsy result re-enters the loop — confirm.
      if (circuitBreaker.opened) {
        const ok = await circuitBreaker.checkAndWait();
        if (!ok) continue;
      }

      const churches = await getNextBatch(BATCH_SIZE, countryCode, reSearch);
      if (churches.length === 0) break;

      try {
        await processBatch(churches, stats, dryRun, jobId, chromaCollection, reSearch);
        circuitBreaker.reset();
        batchesInCycle++;
      } catch (error: unknown) {
        // stats.errors is not incremented here: processBatch already counts the
        // failing church before rethrowing, so counting again would double it.
        const message = error instanceof Error ? error.message : String(error);
        circuitBreaker.recordFailure();
        logError(`Batch error: ${message}`);
      }

      if (!shuttingDown) {
        await sleep(INTER_BATCH_DELAY_MS);
      }
    }

    if (shuttingDown) break;

    if (batchesInCycle === 0) {
      log('No unsearched churches found. Waiting 1 hour...');
      // Sleep 1 hour, checking shutdown every 10s
      for (let i = 0; i < 360 && !shuttingDown; i++) {
        await sleep(10000);
      }
    } else {
      log(`Cycle ${stats.cycles} complete. ${batchesInCycle} batches processed. Brief pause...`);
      await sleep(10000);
    }
  }
}

// --- Main ---

/**
 * CLI entry point: parses flags, waits for FreeSearch, sets up job tracking,
 * runs the selected mode and prints a summary.
 *
 * Flags read from `process.argv`: `--country <code>`, `--limit <n>`,
 * `--dry-run`, `--continuous`, `--re-search` (and `--job-id`, consumed by
 * `createOrResumeJob`).
 *
 * @throws Error when `--limit` is present but not a non-negative integer, or
 *   when the run itself fails (after the job has been marked as failed).
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const dryRun = args.includes('--dry-run');
  const continuous = args.includes('--continuous');
  const reSearch = args.includes('--re-search');
  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

  // Reject an unparsable limit instead of letting NaN silently mean "no limit".
  let limit: number | undefined;
  if (limitIndex !== -1) {
    const rawLimit = args[limitIndex + 1];
    limit = Number.parseInt(rawLimit ?? '', 10);
    if (!Number.isInteger(limit) || limit < 0) {
      throw new Error(`Invalid --limit value: ${rawLimit}`);
    }
  }

  // Graceful shutdown
  for (const signal of ['SIGTERM', 'SIGINT'] as const) {
    process.on(signal, () => {
      log(`Received ${signal}, finishing current batch...`);
      shuttingDown = true;
    });
  }

  log('============================================================');
  log('FreeSearch Church Website Enrichment');
  log('============================================================');
  log(`FreeSearch URL: ${FREESEARCH_URL}`);
  log(`Mode: ${continuous ? 'Continuous (24/7)' : 'Single pass'}${reSearch ? ' (RE-SEARCH)' : ''}`);
  log(`Re-search: ${reSearch ? 'Yes (multi-query, previously searched churches)' : 'No'}`);
  log(`Country: ${countryCode || 'All (round-robin priority)'}`);
  log(`Limit: ${limit || 'No limit'}`);
  log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  log(`Batch size: ${BATCH_SIZE}`);
  log('============================================================');
  if (continuous && limit) {
    log('Note: --limit is ignored in continuous mode.');
  }

  // Wait for FreeSearch to be reachable (indefinite retry with backoff)
  log('Waiting for FreeSearch to be reachable...');
  await waitForFreeSearch();
  if (shuttingDown) return;
  log('FreeSearch health check: OK');

  // ChromaDB connection (optional — results stored if available)
  let chromaCollection: Collection | null = null;
  try {
    chromaCollection = await getCollection(COLLECTION_NAMES.SEARCH_RESULTS);
    log('ChromaDB search_results collection connected');
  } catch {
    log('ChromaDB unavailable — results will not be stored');
  }

  try {
    // Job tracking — clean up any running jobs left by a previous container restart
    await prisma.backgroundJob.updateMany({
      where: { type: 'freesearch-enrichment', status: 'running' },
      data: { status: 'failed', error: 'Container restarted', completedAt: new Date() },
    });

    const jobId =
      (await createOrResumeJob(args)) ??
      (await createNewJob({ countryCode, limit, continuous, dryRun, reSearch }));
    log(`Job ID: ${jobId}`);

    const stats: EnrichmentStats = {
      processed: 0,
      enriched: 0,
      notFound: 0,
      errors: 0,
      verified: 0,
      verifyFailed: 0,
      cycles: 0,
      startTime: Date.now(),
    };

    try {
      if (continuous) {
        await runContinuous(stats, jobId, chromaCollection, reSearch, dryRun, countryCode);
      } else {
        await runSinglePass(stats, countryCode, limit, dryRun, jobId, chromaCollection, reSearch);
      }

      // Complete job
      await updateJobProgress(jobId, stats, 0);
      await completeJob(jobId);
    } catch (error: unknown) {
      // Best effort: record the failure so the job does not stay "running".
      const message = error instanceof Error ? error.message : String(error);
      try {
        await completeJob(jobId, message);
      } catch (jobError: unknown) {
        const jobMessage = jobError instanceof Error ? jobError.message : String(jobError);
        logError(`Failed to mark job ${jobId} as failed: ${jobMessage}`);
      }
      throw error;
    }

    // Print summary
    const elapsed = ((Date.now() - stats.startTime) / 1000).toFixed(1);
    const hitRate = stats.processed > 0
      ? ((stats.enriched / stats.processed) * 100).toFixed(1)
      : '0.0';

    log('');
    log('============================================================');
    log('Enrichment Summary');
    log('============================================================');
    log(`Churches processed: ${stats.processed}`);
    log(`Websites found: ${stats.enriched}`);
    log(`No website found: ${stats.notFound}`);
    log(`Errors: ${stats.errors}`);
    log(`URLs verified: ${stats.verified}`);
    log(`Verify rejected: ${stats.verifyFailed}`);
    log(`Hit rate: ${hitRate}%`);
    log(`Elapsed: ${elapsed}s`);
    if (stats.cycles > 0) {
      log(`Cycles completed: ${stats.cycles}`);
    }
    log('============================================================');
  } finally {
    // Release DB connections on both the success and the failure path.
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${message}`);
  process.exit(1);
});