From 857eaedbcf09247f6c527cc8382f77dffd573d32 Mon Sep 17 00:00:00 2001 From: albertfj114 Date: Sat, 28 Mar 2026 08:46:03 -0400 Subject: [PATCH] fix: wait for FreeSearch on startup instead of exiting; clean stale jobs Co-Authored-By: Claude Sonnet 4.6 --- scripts/enrich-with-freesearch.ts | 1371 +++++++++++++++++++++++++++++ 1 file changed, 1371 insertions(+) create mode 100644 scripts/enrich-with-freesearch.ts diff --git a/scripts/enrich-with-freesearch.ts b/scripts/enrich-with-freesearch.ts new file mode 100644 index 0000000..77af145 --- /dev/null +++ b/scripts/enrich-with-freesearch.ts @@ -0,0 +1,1371 @@ +#!/usr/bin/env tsx +/** + * Enrich OSM churches with website URLs via FreeSearch (self-hosted SearXNG) + * + * Usage: + * npx tsx scripts/enrich-with-freesearch.ts --limit 10 --dry-run + * npx tsx scripts/enrich-with-freesearch.ts --country US --limit 50 + * npx tsx scripts/enrich-with-freesearch.ts --continuous + * npx tsx scripts/enrich-with-freesearch.ts --re-search --country CR --limit 50 + * + * --re-search: Re-search previously-searched churches that have no website. + * Uses multi-query strategy (2-3 query variations per church) to get + * broader FreeSearch coverage. Prioritizes churches that got city data + * after their initial search (via reverse geocoding). + * + * FreeSearch is free and unlimited (~1,500-2,400 churches/hour). + * Full initial pass of ~193K churches in ~4-6 days. 
/* --- Environment, database client and FreeSearch endpoint --- */

import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { Collection } from 'chromadb';
import axios from 'axios';
import crypto from 'crypto';
import { getCollection, COLLECTION_NAMES } from '../src/chromadb/collections';
import { embed } from '../src/chromadb/embeddings';

// Fresh DB connection (not cached singleton)
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// `||` (not `??`) on purpose: an empty FREESEARCH_URL also falls back to the default.
const FREESEARCH_URL = process.env.FREESEARCH_URL || 'http://192.168.0.145:3111';

// --- Job Tracking ---

/**
 * Resume an existing background job when `--job-id <id>` appears in `args`.
 *
 * Marks the job row as `running` (resetting `startedAt`) and returns its id.
 *
 * @param args - Command-line arguments to scan.
 * @returns The resumed job id, or `null` when `--job-id` is absent.
 * @throws Error when `--job-id` is the last argument or is followed by another flag.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  // Fail with a clear message instead of handing `undefined` to Prisma as the row id.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}

/**
 * Create a new `freesearch-enrichment` background job in the `running` state.
 *
 * NOTE(review): the type arguments of `Record` were missing in the reviewed
 * text; `Record<string, any>` is assumed — confirm against the `config` column type.
 *
 * @param config - Run configuration stored on the job row.
 * @returns The id of the created job.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- value type assumed, see note above
async function createNewJob(config: Record<string, any>): Promise<string> {
  const job = await prisma.backgroundJob.create({
    data: {
      type: 'freesearch-enrichment',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return job.id;
}

/**
 * Copy the running counters onto the job row.
 * `stats.enriched` is written to both `succeeded` and `itemsFound`.
 */
async function updateJobProgress(jobId: string, stats: EnrichmentStats, totalItems: number): Promise<void> {
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      processed: stats.processed,
      succeeded: stats.enriched,
      failed: stats.errors,
      itemsFound: stats.enriched,
      totalItems,
    },
  });
}

/** Returns true when the job row exists and its status is `stopping`. */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  return job?.status === 'stopping';
}

/**
 * Close a job: status `failed` when `error` is given, `completed` otherwise.
 * Stores the error text and the completion timestamp.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      status: error ? 'failed' : 'completed',
      error,
      completedAt: new Date(),
    },
  });
}

const BATCH_SIZE = 20;
const INTER_BATCH_DELAY_MS = 5000;
const PROGRESS_INTERVAL = 100;

// --- Country priority (same as Google Places enrichment) ---

const COUNTRY_PRIORITY = [
  'FR', 'DE', 'ES', 'PL', 'BR', 'PT', 'PH', 'CZ', 'MX', 'HU',
  'US', 'CR', 'BE', 'AR', 'CH', 'CO', 'CA', 'SK', 'EC', 'SI',
  'NL', 'PE', 'GB', 'ID', 'CL', 'IE', 'BO', 'VN', 'VE', 'UG',
  'LU', 'HN', 'CD', 'SV', 'KE', 'PA', 'AU', 'CU', 'GT', 'CN',
];

// Country-specific church keywords for search queries
const COUNTRY_KEYWORDS: Record<string, string> = {
  FR: 'paroisse',
  DE: 'pfarrei',
  ES: 'parroquia',
  MX: 'parroquia',
  PL: 'parafia',
  BR: 'paroquia',
  PT: 'paroquia',
  IT: 'parrocchia',
  CZ: 'farnost',
  HU: 'plebania',
  AR: 'parroquia',
  CO: 'parroquia',
  EC: 'parroquia',
  PE: 'parroquia',
  CL: 'parroquia',
  VE: 'parroquia',
  CR: 'parroquia',
  SV: 'parroquia',
  GT: 'parroquia',
  CU: 'parroquia',
  PA: 'parroquia',
  BO: 'parroquia',
  HN: 'parroquia',
  BE: 'paroisse',
  LU: 'paroisse',
  CH: 'pfarrei',
  NL: 'parochie',
  SK: 'farnosť',
  SI: 'župnija',
};

// Country-specific "parish" keywords for multi-query strategy
const PARISH_KEYWORDS: Record<string, string> = {
  US: 'parish', GB: 'parish', IE: 'parish', AU: 'parish', CA: 'parish',
  ES: 'parroquia', MX: 'parroquia', AR: 'parroquia', CO: 'parroquia',
  FR: 'paroisse', BE: 'paroisse',
  DE: 'pfarrei', AT: 'pfarrei', CH: 'pfarrei',
  IT: 'parrocchia',
  PT: 'paroquia', BR: 'paroquia',
  PL: 'parafia',
  CR: 'parroquia', CL: 'parroquia', PE: 'parroquia',
  HN: 'parroquia', SV: 'parroquia', GT: 'parroquia', NI: 'parroquia',
  EC: 'parroquia', VE: 'parroquia', BO: 'parroquia', PA: 'parroquia', CU: 'parroquia',
  CZ: 'farnost', HU: 'plebania', SK: 'farnosť', SI: 'župnija', NL: 'parochie',
  LU: 'paroisse',
};

// --- Blocked domains ---

const BLOCKED_DOMAINS = new Set([
  // Social media
  'facebook.com', 'fb.com', 'twitter.com', 'x.com', 'instagram.com',
  'linkedin.com', 'youtube.com', 'tiktok.com', 'pinterest.com',
  // Directories & listings
  'yelp.com', 'yellowpages.com', 'yp.com', 'bbb.org', 'manta.com',
  'superpages.com', 'whitepages.com', 'foursquare.com',
  // Church directories (not the parish's own site)
  'masstimes.org', 'catholicdirectory.com', 'findmass.com', 'parishesonline.com',
  'gcatholic.org', 'catholicmasstime.org', 'discovermass.com', 'faithstreet.com',
  'miserend.hu',
  // Schools & landmarks with church-like names (common false positives)
  'archbishopchapelle.org', 'sainte-chapelle.fr',
  // Map & geo services
  'google.com', 'mapquest.com', 'apple.com', 'bing.com',
  'openstreetmap.org', 'mapcarta.com',
  // Travel & reviews (multiple TLDs)
  'tripadvisor.com', 'tripadvisor.co.nz', 'tripadvisor.co.uk',
  'tripadvisor.es', 'tripadvisor.fr', 'tripadvisor.de', 'tripadvisor.it',
  'tripadvisor.com.br', 'tripadvisor.com.mx', 'tripadvisor.pl',
  // Reference & encyclopedias
  'wikipedia.org', 'wikidata.org', 'worldhistory.org', 'britannica.com',
  // Dictionaries & translation
  'spanishdict.com', 'wiktionary.org', 'cambridge.org', 'merriam-webster.com',
  'wordreference.com', 'dict.cc', 'linguee.com', 'deepl.com', 'translate.google.com',
  'collinsdictionary.com', 'reverso.net', 'thefreedictionary.com', 'dictionary.com',
  // Non-Catholic religious sites
  'jw.org',
  // News & media (not parish websites)
  'pbs.org', 'bbc.com', 'cnn.com', 'nytimes.com',
  // Wikimedia projects
  'wikimedia.org',
  // Free hosting platforms (generic, not parish-specific)
  'wixsite.com', 'weebly.com', 'blogspot.com', 'wordpress.com',
  // Travel, ticket & tourism sites
  'trip.com', 'seatgeek.com', 'airial.travel', 'booking.com',
  'expedia.com', 'hotels.com', 'viator.com', 'getyourguide.com',
  'wanderlog.com', 'spain-places.com', 'burgundy-tourism.com',
  'philippineairlines.com',
  // Q&A, forums, news
  'stackexchange.com', 'stackoverflow.com', 'reddit.com', 'quora.com',
  'thisisanfield.com',
  // Software / SaaS / false matches on church terms
  'myconvento.com', 'socha.net', 'drogaraia.com.br',
  // Heritage & cultural directories
  'zabytek.pl', 'monumentos.gov.pt',
  // Location / directory aggregators
  'wheree.com', 'seety.co',
  // Banks (false matches on "sant" → "santander", etc.)
  'santanderbank.com', 'santander.com', 'bancosantander.es',
  // General reference / clearly unrelated
  'brainly.in', 'zhihu.com', 'countyoffice.org', 'weforum.org',
  'themoviedb.org', 'imdb.com', 'amazon.com', 'ebay.com',
  'thereformation.com', 'rarest.org', 'gutenberg.org',
  'consumersadvocate.org', 'endlessmile.com', 'sacredrootsnc.com',
  // Government sites (city pages, not parish sites)
  'madrid.es',
  // Regional tourism portals
  'romantischer-rhein.de', 'riberana.es', 'lascuarre.es',
  // Action/advocacy organizations (not parish sites)
  'franciscanaction.org',
  // Church directories (additional)
  'iglesiaslocales.com',
  // Catholic media & organizations (not individual parishes)
  'usccb.org', 'vaticannews.va', 'ewtn.com', 'catholic.org', 'newadvent.org',
  'catholicnewsagency.com', 'ncronline.org', 'americamagazine.org',
  'catholic-hierarchy.org', 'aleteia.org',
  // Dictionaries (additional)
  'lerobert.com', 'larousse.fr',
  // News (regional)
  'murciatoday.com',
  // National/international organization sites (not local parishes/chapters)
  'kofc.org', 'sagradafamilia.org',
  // Bible/religious Q&A sites
  'gotquestions.org', 'biblegateway.com', 'biblehub.com',
  // Universities/schools (not parish sites)
  'rivier.edu',
  // Religious directories (additional)
  'latinmassdir.org', 'myhollyland.org',
]);

// Country-specific TLDs for scoring bonus.
// Values are regex fragments (dots pre-escaped) because scoreUrl interpolates them into a RegExp.
const COUNTRY_TLDS: Record<string, string> = {
  DE: 'de', FR: 'fr', ES: 'es', IT: 'it', PT: 'pt', PL: 'pl',
  NL: 'nl', BE: 'be', AT: 'at', CH: 'ch', CZ: 'cz', HU: 'hu',
  HR: 'hr', SK: 'sk', SI: 'si', IE: 'ie', GB: 'uk',
  BR: 'com\\.br', MX: 'com\\.mx', AR: 'com\\.ar', CO: 'com\\.co',
};

// Catholic keywords used in scoring
const CATHOLIC_KEYWORDS = [
  'parish', 'church', 'catholic', 'parroquia', 'paroisse', 'pfarrei',
  'parafia', 'paroquia', 'parrocchia', 'farnost', 'plebania', 'parochie',
  'župnija', 'farnosť', 'iglesia', 'église', 'kirche', 'kościół',
  'chiesa', 'kostel', 'templom', 'kerk',
];

// --- Types ---

/** Subset of church columns selected for enrichment. */
interface ChurchRecord {
  id: string;
  name: string;
  address: string | null;
  city: string | null;
  state: string | null;
  country: string;
  latitude: number;
  longitude: number;
  diocese: string | null;
}

/** One hit returned by the FreeSearch API. */
interface SearchResult {
  title: string;
  url: string;
  description?: string;
  source?: string;
}

/** Response envelope of the FreeSearch `/api/search` endpoint. */
interface FreeSearchResponse {
  results: SearchResult[];
  provider: string;
  query: string;
  elapsed_ms: number;
}

/** Running counters for one enrichment run. */
interface EnrichmentStats {
  processed: number;
  enriched: number;
  notFound: number;
  errors: number;
  verified: number;
  verifyFailed: number;
  cycles: number;
  startTime: number;
}

// --- Circuit Breaker ---

/**
 * Pauses work while FreeSearch is failing.
 *
 * The breaker opens once `recordFailure()` has been called `threshold` times
 * since the last `reset()`. While open, `checkAndWait()` sleeps for the current
 * backoff, probes `/api/health`, and either closes the breaker or doubles the
 * backoff (capped at `maxBackoffMs`).
 */
class CircuitBreaker {
  private failures = 0;
  private isOpen = false;
  private backoffMs = 10000;
  private readonly maxBackoffMs = 300000; // 5 minutes
  private readonly threshold = 5;

  /**
   * @returns `true` when the caller may proceed (breaker closed, or the health
   * probe succeeded); `false` when FreeSearch is still unavailable.
   */
  async checkAndWait(): Promise<boolean> {
    if (!this.isOpen) return true;

    log(`Circuit breaker open. Waiting ${Math.round(this.backoffMs / 1000)}s before retry...`);
    await sleep(this.backoffMs);

    // Health check
    try {
      const resp = await axios.get(`${FREESEARCH_URL}/api/health`, { timeout: 5000 });
      if (resp.status === 200) {
        this.reset();
        log('Circuit breaker closed: FreeSearch is back');
        return true;
      }
    } catch {
      // Still down
    }

    this.backoffMs = Math.min(this.backoffMs * 2, this.maxBackoffMs);
    return false;
  }

  /** Count one failure; opens the breaker when the threshold is reached. */
  recordFailure(): void {
    this.failures++;
    if (this.failures >= this.threshold && !this.isOpen) {
      this.isOpen = true;
      this.backoffMs = 10000;
      log(`Circuit breaker OPEN after ${this.failures} consecutive failures`);
    }
  }

  /** Clear the failure count, close the breaker and restore the initial backoff. */
  reset(): void {
    if (this.failures > 0 || this.isOpen) {
      this.failures = 0;
      this.isOpen = false;
      this.backoffMs = 10000;
    }
  }

  get opened(): boolean { return this.isOpen; }
}

// --- Helpers ---

let shuttingDown = false;

/** Write a timestamped line to stdout. */
function log(msg: string): void {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}

/** Write a timestamped line to stderr. */
function logError(msg: string): void {
  console.error(`[${new Date().toISOString()}] ${msg}`);
}

/**
 * Resolve after `ms` milliseconds, or earlier once `shuttingDown` is observed
 * (polled once per second).
 *
 * Both timers are cleared whichever way the sleep ends, so an interrupted
 * sleep leaves no pending timer behind to keep the process alive.
 */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => {
    const finish = (): void => {
      clearTimeout(timer);
      clearInterval(poll);
      resolve();
    };
    const timer = setTimeout(finish, ms);
    // Allow early exit on shutdown
    const poll = setInterval(() => {
      if (shuttingDown) finish();
    }, 1000);
  });
}

// Second-level labels that form part of a public suffix under a country-code TLD
// (e.g. "com.br", "co.uk", "gov.pt").
const COMPOUND_SUFFIX_LABELS = new Set(['com', 'co', 'org', 'net', 'gov', 'edu', 'ac']);

/**
 * Reduce a URL to its root domain, e.g. "www.parish.org" -> "parish.org".
 *
 * Hosts under a compound country suffix keep three labels
 * ("www.paroquia.com.br" -> "paroquia.com.br") so the result still names the
 * site rather than the suffix.
 *
 * @returns The lowercased root domain, or `''` when `url` cannot be parsed.
 */
function getDomain(url: string): string {
  try {
    const hostname = new URL(url).hostname.toLowerCase();
    const labels = hostname.split('.');
    if (labels.length < 2) return hostname;

    const tld = labels[labels.length - 1] ?? '';
    const secondLevel = labels[labels.length - 2] ?? '';
    const hasCompoundSuffix =
      labels.length >= 3 && tld.length === 2 && COMPOUND_SUFFIX_LABELS.has(secondLevel);

    return labels.slice(hasCompoundSuffix ? -3 : -2).join('.');
  } catch {
    return '';
  }
}

// Domain name keywords that indicate non-parish sites
const BLOCKED_DOMAIN_KEYWORDS = [
  'tripadvisor', 'archinform', 'seatgeek',
  // Tourism/travel
  'turismo', 'tourism', 'tourisme', 'touristik', 'turistico', 'turistik',
  'reisefuhrer', 'wanderlog', 'viator', 'getyourguide',
];

/**
 * Decide whether a search result URL must be ignored.
 *
 * Blocked when the host is (a subdomain of) an entry in BLOCKED_DOMAINS, when
 * the host contains a BLOCKED_DOMAIN_KEYWORDS entry, when the path ends in
 * .pdf/.doc/.docx, or when the URL cannot be parsed.
 */
function isBlockedUrl(url: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return true; // Block unparseable URLs
  }

  const hostname = parsed.hostname.toLowerCase();
  // Check if hostname ends with any blocked domain
  for (const blocked of BLOCKED_DOMAINS) {
    if (hostname === blocked || hostname.endsWith('.' + blocked)) {
      return true;
    }
  }
  // Check domain keywords (catches all TLD variants)
  if (BLOCKED_DOMAIN_KEYWORDS.some(keyword => hostname.includes(keyword))) {
    return true;
  }
  // Block PDF/document URLs (not parish homepages)
  const pathname = parsed.pathname.toLowerCase();
  return pathname.endsWith('.pdf') || pathname.endsWith('.doc') || pathname.endsWith('.docx');
}

/** First 12 hex characters of the MD5 digest of `url` (an id fragment, not a security hash). */
function hashUrl(url: string): string {
  return crypto.createHash('md5').update(url).digest('hex').slice(0, 12);
}

/**
 * Lowercase `str`, delete every character other than a-z, 0-9 and whitespace,
 * collapse whitespace runs to single spaces and trim.
 * Accented letters and punctuation are removed, not transliterated.
 */
function normalizeForMatch(str: string): string {
  return str.toLowerCase()
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

// Words too generic to identify one particular church.
// Entries are passed through normalizeForMatch() because candidate words are
// looked up in that form; an accented entry such as "église" would otherwise
// never match its normalized counterpart.
const NAME_STOP_WORDS: ReadonlySet<string> = new Set([
  // English articles/prepositions
  'the', 'of', 'and', 'in', 'at', 'for', 'our', 'lady',
  // Religious titles & very common church name words
  'st', 'saint', 'saints', 'san', 'sant', 'santa', 'santo', 'sacred',
  'christ', 'jesus', 'mary', 'maria', 'king', 'lord', 'heart',
  'cross', 'queen', 'angel', 'angels', 'good', 'star',
  'nome', 'pere', 'madre', 'notre', 'dame', 'bien',
  'onze', 'lieve', 'vrouw', 'heer',
  // Very common short saint/religious names (match too many unrelated domains)
  'rosa', 'paul', 'anne', 'jean', 'joan', 'luke', 'marc',
  'rita', 'jose', 'leon', 'pius', 'roch', 'yves', 'ines',
  'vita', 'fara', 'bona',
  // Common religious words that match wrong churches/organizations
  'cristo', 'fatima', 'lourdes', 'perpetuo', 'socorro', 'calvario',
  'rosario', 'pilar', 'carmen', 'dolores', 'remedios', 'nieves',
  'grotte', 'mission', 'sagrada', 'sagrado', 'familia',
  'guadalupe', 'assumption', 'immaculate', 'perpetual', 'divine',
  // Organization names (match national sites, not local chapters)
  'knights', 'columbus',
  // Structural/role words (not distinctive church names)
  'house', 'home', 'hall', 'center', 'centre', 'centro',
  'deacon', 'priest', 'bishop', 'father', 'sister', 'brother',
  'school', 'academy', 'college', 'seminary', 'rectory', 'retreat',
  // Church-generic words (EN)
  'church', 'parish', 'catholic', 'roman', 'holy', 'chapel',
  'cathedral', 'basilica', 'shrine', 'convent', 'monastery',
  // Church-generic words (FR)
  'chapelle', 'eglise', 'église', 'paroisse', 'couvent',
  // Church-generic words (ES)
  'iglesia', 'parroquia', 'capilla', 'ermita', 'convento', 'basílica',
  // Church-generic words (DE)
  'kirche', 'kapelle', 'pfarrei', 'kloster',
  // Church-generic words (IT)
  'chiesa', 'parrocchia', 'cappella', 'oratorio',
  // Church-generic words (PT)
  'igreja', 'capela', 'paroquia',
  // Church-generic words (PL)
  'kościół', 'kaplica', 'parafia', 'droga',
  // Church-generic words (CZ/SK/HU)
  'kostel', 'kaple', 'farnost', 'templom', 'kápolna',
  // Articles/prepositions (Romance)
  'de', 'la', 'le', 'les', 'du', 'des', 'el', 'los', 'las',
  'di', 'del', 'della', 'delle', 'degli',
  'do', 'da', 'dos', 'das',
  // Articles/prepositions (Germanic)
  'und', 'der', 'die', 'von',
  // Articles/prepositions (Slavic)
  'nad', 'pod', 'przy',
].map(normalizeForMatch));

/**
 * Extract the distinctive words of a church name for matching.
 *
 * The name is normalized with normalizeForMatch(); words shorter than three
 * characters or present in the stop list are dropped.
 *
 * @returns The remaining words in name order. When nothing survives, the two
 * longest words of at least four characters, regardless of the stop list.
 */
function getSignificantWords(name: string): string[] {
  const words = normalizeForMatch(name).split(' ');
  const significant = words.filter(w => w.length >= 3 && !NAME_STOP_WORDS.has(w));

  // If all words were filtered, return top 2 longest words (>= 4 chars) regardless of stop list
  if (significant.length === 0) {
    return words
      .filter(w => w.length >= 4)
      .sort((a, b) => b.length - a.length)
      .slice(0, 2);
  }

  return significant;
}

// --- URL
// Scoring ---

/**
 * Heuristically score how likely a search result is the church's own website.
 *
 * Positive signals: distinctive name words in the domain (+10), URL path (+2),
 * title (+5 per word) and description (+2 per word); a Catholic keyword in the
 * title/description (+3), an .org/.church host (+3) and the country's own TLD
 * (+4) — these three only when a name word matched; the city in the title (+3)
 * or domain (+5).
 * Penalties: diocese-like domain (-5), more than three path segments (-5).
 *
 * @param result - Search hit to score.
 * @param church - Church the hit is being matched against.
 * @returns The summed score; may be negative.
 */
function scoreUrl(result: SearchResult, church: ChurchRecord): number {
  let score = 0;
  const title = (result.title || '').toLowerCase();
  const description = (result.description || '').toLowerCase();

  // Parse the URL once. An unparseable URL just skips the URL-derived signals.
  let hostname = '';
  let urlPath = '';
  try {
    const parsed = new URL(result.url);
    hostname = parsed.hostname.toLowerCase();
    urlPath = parsed.pathname.toLowerCase();
  } catch { /* ignore */ }

  const domain = getDomain(result.url);
  let domainWithoutTld = domain.split('.')[0] || '';
  // When the root domain is only a compound suffix ("com.br", "co.uk"), its first
  // label is the suffix, not the site name: take the host label just before it.
  if (['com', 'co', 'org', 'net', 'gov', 'edu', 'ac'].includes(domainWithoutTld)) {
    const hostLabels = hostname.split('.');
    const siteLabel = hostLabels[hostLabels.length - domain.split('.').length - 1];
    if (siteLabel) domainWithoutTld = siteLabel;
  }

  const nameWords = getSignificantWords(church.name);
  let hasNameMatch = false;

  // Church name words in domain (strongest signal)
  // Require >= 4 chars to avoid short-word false matches (e.g. "rosa" in "rosaparks")
  if (nameWords.some(word => word.length >= 4 && domainWithoutTld.includes(word))) {
    score += 10;
    hasNameMatch = true;
  }

  // Church name words in URL path (e.g. /san-bartolome/ or /stmichael)
  if (nameWords.some(word => word.length >= 4 && urlPath.includes(word))) {
    score += 2;
    hasNameMatch = true;
  }

  // Church name words in title / description (every matching word counts)
  for (const word of nameWords) {
    if (title.includes(word)) {
      score += 5;
      hasNameMatch = true;
    }
    if (description.includes(word)) {
      score += 2;
      hasNameMatch = true;
    }
  }

  // Catholic keywords ONLY count if there's also a name match
  // (prevents "iglesia" in dictionary, "chiesa" in soccer news, etc.)
  if (hasNameMatch && CATHOLIC_KEYWORDS.some(kw => title.includes(kw) || description.includes(kw))) {
    score += 3;
  }

  // TLD bonus (only with name match). Tested on the hostname so that a port or
  // query string after the host, or ".org" inside the path, cannot skew it.
  if (hasNameMatch && /\.(org|church)$/.test(hostname)) {
    score += 3;
  }

  // Country-specific TLD bonus (e.g. .pl for Polish churches, .com.br for Brazilian)
  const countryTld = COUNTRY_TLDS[church.country];
  if (countryTld && hasNameMatch && new RegExp(`\\.${countryTld}$`).test(hostname)) {
    score += 4;
  }

  // City name in title/domain
  if (church.city) {
    const cityNorm = normalizeForMatch(church.city);
    if (cityNorm.length > 2) {
      if (title.includes(cityNorm)) score += 3;
      if (domainWithoutTld.includes(cityNorm.replace(/\s/g, ''))) score += 5;
    }
  }

  // Penalty: diocese/archdiocese in domain (not the parish site)
  if (['diocese', 'archdiocese', 'bistum', 'diecezja'].some(term => domainWithoutTld.includes(term))) {
    score -= 5;
  }

  // Penalty: deep URL path
  if (urlPath.split('/').filter(Boolean).length > 3) {
    score -= 5;
  }

  return score;
}
/**
 * Rank search results and return the best candidate URLs for verification.
 *
 * Blocked URLs are dropped, the rest are scored with scoreUrl() and sorted by
 * descending score. The threshold is deliberately relaxed — verifyUrl() is the
 * real quality gate — and several candidates are returned so verification can
 * fall through to the next one.
 *
 * @param results - Raw search hits.
 * @param church - Church the hits are matched against.
 * @param maxCandidates - Maximum number of URLs returned (default 5).
 * @param minScore - Minimum score a hit needs to be kept (default 1).
 * @returns Candidate URLs, best first; empty when nothing qualifies.
 */
function pickCandidateUrls(
  results: SearchResult[],
  church: ChurchRecord,
  maxCandidates = 5,
  minScore = 1,
): string[] {
  return results
    .filter(r => !isBlockedUrl(r.url))
    .map(r => ({ url: r.url, score: scoreUrl(r, church) }))
    .sort((a, b) => b.score - a.score)
    .filter(s => s.score >= minScore)
    .slice(0, maxCandidates)
    .map(s => s.url);
}

// --- URL Verification ---

/**
 * Reduce an HTML document to lowercase plain text for keyword matching.
 *
 * `<script>` and `<style>` elements are removed together with their content,
 * remaining tags and named entities become spaces, and whitespace runs are
 * collapsed to single spaces.
 */
function stripHtml(html: string): string {
  return html
    // Anchor on the opening tag: without it the pattern would start at the first
    // ">" in the document and delete all real content preceding the script.
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ')
    .replace(/<[^>]+>/g, ' ')
    .replace(/&[a-z]+;/gi, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

// Tourism/cultural keywords — pages about churches as attractions, NOT the church's own site
const TOURISM_KEYWORDS = [
  'tourism', 'turismo', 'tourisme', 'turisme', 'touristik', 'turistico',
  'attractions', 'things to do', 'sightseeing', 'sehenswürdigkeiten',
  'what to see', 'places to visit', 'travel guide', 'reiseführer',
  'patrimoine', 'heritage trail', 'cultural heritage',
  'punto de interés', 'point of interest', 'points of interest',
];

// Mass schedule keywords — pages with these are almost certainly the church's own site
const MASS_SCHEDULE_KEYWORDS = [
  'mass schedule', 'mass times', 'worship schedule', 'worship times',
  'service times', 'sunday mass', 'weekday mass',
  'horario de misas', 'horarios de misa', 'horaires des messes',
  'gottesdienst', 'gottesdienstzeiten', 'messzeiten',
  'msze święte', 'godziny mszy', 'msze św',
  'orari delle messe', 'orario messe',
  'horário das missas',
];
/**
 * Fetch the URL and verify the page content mentions the church.
 *
 * Signals gathered from the page: distinctive name words, city, street-address
 * words, Catholic keywords, mass-schedule keywords and tourism keywords, plus
 * whether the hostname contains a name word and how deep the URL path is.
 *
 * @param url - Candidate website URL.
 * @param church - Church the page must belong to.
 * @returns `true` if the page appears to belong to this church; `false`
 * otherwise, including when the page cannot be fetched.
 */
async function verifyUrl(url: string, church: ChurchRecord): Promise<boolean> {
  try {
    const response = await axios.get(url, {
      timeout: 10000,
      maxRedirects: 3,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; NearestMass/1.0; +https://nearestmass.com)',
        'Accept': 'text/html',
      },
      // Size cap: axios rejects (it does not truncate) a response body larger
      // than this, so pages over ~200KB fail verification via the catch below.
      maxContentLength: 200000,
      responseType: 'text',
    });

    if (typeof response.data !== 'string') return false;

    const text = stripHtml(response.data);
    // Name, city and address needles come out of normalizeForMatch(), which drops
    // accents and punctuation; the page is also checked in that same form so that
    // e.g. an accented city name on the page can still be found.
    const normalizedText = normalizeForMatch(text);
    const pageMentions = (needle: string): boolean =>
      text.includes(needle) || normalizedText.includes(needle);
    const pageHasAny = (keywords: readonly string[]): boolean =>
      keywords.some(kw => text.includes(kw));

    const nameWords = getSignificantWords(church.name);

    // Count how many significant name words appear in the page
    const nameMatches = nameWords.filter(pageMentions).length;

    // Check for city name on page
    let cityMatch = false;
    if (church.city) {
      const cityNorm = normalizeForMatch(church.city);
      cityMatch = cityNorm.length > 2 && pageMentions(cityNorm);
    }

    // Check for address on page (street name is a strong location signal)
    let addressMatch = false;
    if (church.address) {
      // Significant address words only (street names, not numbers);
      // two or more of them on the page suggest the right location.
      const addrWords = normalizeForMatch(church.address)
        .split(' ')
        .filter(w => w.length >= 4 && !/^\d+$/.test(w));
      addressMatch = addrWords.filter(pageMentions).length >= 2;
    }

    const hasCatholicKeyword = pageHasAny(CATHOLIC_KEYWORDS);
    // Mass schedule info is a strong positive signal
    const hasMassSchedule = pageHasAny(MASS_SCHEDULE_KEYWORDS);
    // Tourism/cultural wording is a negative signal
    const isTourismPage = pageHasAny(TOURISM_KEYWORDS);

    // Hostname containing a name word is a strong signal; require >= 5 chars to
    // avoid false matches (e.g. "rosa" in "rosaparks"). Path depth: church sites
    // are usually at root or shallow paths.
    let domainMatchesName = false;
    let isDeepUrl = false;
    try {
      const parsed = new URL(url);
      const hostname = parsed.hostname.toLowerCase();
      domainMatchesName = nameWords.some(word => word.length >= 5 && hostname.includes(word));
      isDeepUrl = parsed.pathname.split('/').filter(Boolean).length > 2;
    } catch { /* ignore */ }

    // Reject tourism pages unless they also have mass schedules (= the church itself)
    if (isTourismPage && !hasMassSchedule) return false;

    // Reject deep URLs on non-matching domains (articles ABOUT churches, not church sites)
    if (isDeepUrl && !domainMatchesName && !hasMassSchedule) return false;

    // Strong signal: mass schedule + at least 1 name word = almost certainly the right church
    if (hasMassSchedule && nameMatches >= 1) return true;

    // Domain matches name + name word on page + Catholic keyword = likely the church's site
    if (domainMatchesName && nameMatches >= 1 && hasCatholicKeyword) return true;

    // An address match plus one name word is accepted with or without city data
    if (nameMatches >= 1 && addressMatch) return true;

    const hasCity = !!(church.city && church.city.trim());
    if (hasCity) {
      // With city data: two name words, or one name word plus the city
      return nameMatches >= 2 || (nameMatches >= 1 && cityMatch);
    }

    // Without city data: need 3+ name words as fallback (very distinctive name)
    return nameMatches >= 3;
  } catch {
    // If we can't fetch the page (timeout, SSL error, etc.), reject it
    return false;
  }
}

// --- Search Query Construction ---

/**
 * Build the first-pass search query for a church: the quoted name, then city,
 * state (US/CA/AU/BR only), diocese and the country keyword when available,
 * followed by "official website".
 */
function buildSearchQuery(church: ChurchRecord): string {
  // Drop embedded double quotes so they cannot terminate the quoted phrase early
  const parts: string[] = [`"${church.name.replace(/"/g, '')}"`];

  if (church.city) parts.push(church.city);

  // State (for US/CA/AU/BR)
  if (church.state && ['US', 'CA', 'AU', 'BR'].includes(church.country)) {
    parts.push(church.state);
  }

  // Diocese (helps disambiguate churches with common names)
  if (church.diocese) parts.push(church.diocese);

  // Country-specific keyword
  const keyword = COUNTRY_KEYWORDS[church.country];
  if (keyword) parts.push(keyword);

  parts.push('official website');

  return parts.join(' ');
}

/**
 * Build multiple query variations for re-search mode.
 * Returns 2-3 different queries to get broader FreeSearch coverage.
 */
+ */ +function buildSearchQueries(church: ChurchRecord): string[] { + const queries: string[] = []; + + // Query 1: Standard query (quoted name + city + keyword) + queries.push(buildSearchQuery(church)); + + // Query 2: Unquoted name + city (catches accent/spelling variations) + const nameWords = church.name.split(/\s+/).filter(w => w.length >= 3); + const simpleName = nameWords.join(' '); + const q2Parts = [simpleName]; + if (church.city) q2Parts.push(church.city); + const keyword = COUNTRY_KEYWORDS[church.country]; + if (keyword) q2Parts.push(keyword); + queries.push(q2Parts.join(' ')); + + // Query 3: Distinctive words only + city + local parish keyword + const sigWords = getSignificantWords(church.name); + if (sigWords.length > 0 && church.city) { + const q3Parts = [...sigWords, church.city]; + const parishKeyword = PARISH_KEYWORDS[church.country]; + if (parishKeyword) q3Parts.push(parishKeyword); + queries.push(q3Parts.join(' ')); + } + + // Query 4: Country-specific TLD search (for non-English countries) + const countryTld = COUNTRY_TLDS[church.country]; + if (countryTld && church.country !== 'US' && church.country !== 'GB') { + const tldClean = countryTld.replace(/\\\./g, '.'); + const q4Parts = [simpleName]; + if (church.city) q4Parts.push(church.city); + q4Parts.push(`site:.${tldClean}`); + queries.push(q4Parts.join(' ')); + } + + // Deduplicate queries + return [...new Set(queries)]; +} + +// --- FreeSearch API --- + +async function searchSingle(query: string): Promise { + const response = await axios.get(`${FREESEARCH_URL}/api/search`, { + params: { q: query }, + timeout: 30000, + }); + return response.data; +} + +async function healthCheck(): Promise { + try { + const resp = await axios.get(`${FREESEARCH_URL}/api/health`, { timeout: 5000 }); + return resp.status === 200; + } catch { + return false; + } +} + +async function waitForFreeSearch(): Promise { + let backoffMs = 30_000; + const maxBackoffMs = 300_000; // 5 minutes + let attempt = 0; + + while 
(!shuttingDown) { + attempt++; + const healthy = await healthCheck(); + if (healthy) { + if (attempt > 1) log('FreeSearch is back. Continuing...'); + return; + } + const waitSec = Math.round(backoffMs / 1000); + logError(`FreeSearch not reachable at ${FREESEARCH_URL} (attempt ${attempt}). Retrying in ${waitSec}s...`); + await sleep(backoffMs); + backoffMs = Math.min(backoffMs * 2, maxBackoffMs); + } +} + +// --- Database Queries --- + +// Cutoff timestamp for re-search: only re-search churches searched BEFORE script start +const reSearchCutoff = new Date(); + +async function getNextBatch( + batchSize: number, + countryCode?: string, + reSearch?: boolean +): Promise { + const selectFields = { + id: true, name: true, address: true, city: true, state: true, + country: true, latitude: true, longitude: true, diocese: true, + }; + + // Re-search mode: find previously-searched churches that have no website + // Only pick churches searched BEFORE this script started (avoids infinite loop + // since processBatch updates freeSearchedAt to now, which is after the cutoff) + if (reSearch) { + return prisma.church.findMany({ + where: { + source: 'osm', + website: null, + freeSearchedAt: { not: null, lt: reSearchCutoff }, + city: { not: null }, + NOT: { city: '' }, + ...(countryCode ? 
{ country: countryCode } : {}),
      },
      select: selectFields,
      take: batchSize,
      // Prioritize reverse-geocoded churches (got city after first search)
      orderBy: [{ reverseGeocodedAt: { sort: 'desc', nulls: 'last' } }, { createdAt: 'asc' }],
    });
  }

  if (countryCode) {
    return prisma.church.findMany({
      where: {
        source: 'osm',
        website: null,
        freeSearchedAt: null,
        country: countryCode,
        city: { not: null },
        NOT: { city: '' },
      },
      select: selectFields,
      take: batchSize,
      orderBy: { createdAt: 'asc' },
    });
  }

  // Round-robin: take a few from each priority country
  // Prioritize churches WITH city data (better search results)
  const perCountry = Math.max(1, Math.ceil(batchSize / COUNTRY_PRIORITY.length));
  const churches: ChurchRecord[] = [];

  for (const country of COUNTRY_PRIORITY) {
    if (churches.length >= batchSize) break;

    // First try churches with city data (higher quality searches)
    const batch = await prisma.church.findMany({
      where: {
        source: 'osm',
        website: null,
        freeSearchedAt: null,
        country,
        city: { not: null },
        NOT: { city: '' },
      },
      select: selectFields,
      take: perCountry,
      orderBy: { createdAt: 'asc' },
    });

    churches.push(...batch);
  }

  return churches.slice(0, batchSize);
}

// --- Main Processing ---

/** Turns any thrown value into a printable message (thrown values are not always Error instances). */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Searches FreeSearch for each church in `churches`, verifies the candidate
 * URLs and persists the outcome.
 *
 * Per church:
 *  - run one query (first pass) or several query variations (`reSearch`) and
 *    merge the results, de-duplicated by URL;
 *  - store the non-blocked results in ChromaDB when a collection is supplied
 *    (best-effort: a ChromaDB failure is logged and does not fail the church);
 *  - verify candidate URLs in order and save the first one that passes;
 *  - otherwise stamp `freeSearchedAt` so the church is not picked up again by
 *    the first-pass queries.
 *
 * Nothing is written to Postgres or ChromaDB when `dryRun` is true.
 *
 * @param churches          Batch to process.
 * @param stats             Running counters; mutated in place.
 * @param dryRun            When true, only log what would happen.
 * @param jobId             Background job to report progress to (every 10 churches).
 * @param chromaCollection  Optional collection for storing raw search results.
 * @param reSearch          Use the multi-query strategy.
 * @throws Rethrows any error hit while handling a church (after counting it in
 *         `stats.errors`), which aborts the rest of the batch. This includes the
 *         case where every search query for a church failed: nothing was learned
 *         about that church, so it must not be stamped as searched.
 */
async function processBatch(
  churches: ChurchRecord[],
  stats: EnrichmentStats,
  dryRun: boolean,
  jobId?: string | null,
  chromaCollection?: Collection | null,
  reSearch?: boolean
): Promise<void> {
  // Counts a miss and stamps the church as searched (skipped in dry-run mode).
  const markSearchedWithoutWebsite = async (church: ChurchRecord): Promise<void> => {
    stats.notFound++;
    if (dryRun) return;
    await prisma.church.update({
      where: { id: church.id },
      data: { freeSearchedAt: new Date() },
    });
  };

  for (const church of churches) {
    if (shuttingDown) break;
    stats.processed++;

    const label = `${church.name} (${church.city || 'unknown'}, ${church.country})`;

    // Multi-query for re-search mode, single query for first-pass
    const queries = reSearch ? buildSearchQueries(church) : [buildSearchQuery(church)];

    try {
      // Search with all queries, merge results
      const allResults: SearchResult[] = [];
      const seenUrls = new Set<string>();
      let failedQueries = 0;

      for (const query of queries) {
        try {
          const response = await searchSingle(query);
          for (const r of response.results || []) {
            if (!seenUrls.has(r.url)) {
              seenUrls.add(r.url);
              allResults.push(r);
            }
          }
        } catch (error: unknown) {
          failedQueries++;
          logError(` Query failed: ${query} — ${describeError(error)}`);
        }
        // Brief pause between queries to not overwhelm FreeSearch
        if (queries.length > 1) await sleep(500);
      }

      // If every query errored we have no information about this church.
      // Treating that as "no results" would stamp freeSearchedAt and silently
      // drop the church from future first-pass runs (e.g. during a FreeSearch
      // outage), so surface it as an error instead.
      if (queries.length > 0 && failedQueries === queries.length) {
        throw new Error(`all ${queries.length} search queries failed`);
      }

      if (reSearch && queries.length > 1) {
        log(` ? ${label} => ${queries.length} queries, ${allResults.length} unique results`);
      }

      const results = allResults;
      const candidateUrls = pickCandidateUrls(results, church);

      // Store all non-blocked search results in ChromaDB for later analysis
      if (chromaCollection && results.length > 0 && !dryRun) {
        try {
          const nonBlocked = results.filter((r) => !isBlockedUrl(r.url));
          if (nonBlocked.length > 0) {
            const docs = nonBlocked.map((r) =>
              `${r.title || ''} ${r.description || ''} ${r.url}`.trim()
            );
            const embeddings = await embed(docs);
            await chromaCollection.upsert({
              ids: nonBlocked.map((r) => `search-${church.id}-${hashUrl(r.url)}`),
              embeddings,
              documents: docs,
              metadatas: nonBlocked.map((r) => ({
                churchId: church.id,
                churchName: church.name,
                churchCity: church.city || '',
                churchCountry: church.country,
                searchQuery: queries[0],
                resultUrl: r.url,
                resultTitle: r.title || '',
                score: scoreUrl(r, church),
              })),
            });
          }
        } catch (e: unknown) {
          logError(`ChromaDB store failed: ${describeError(e)}`);
        }
      }

      if (results.length === 0) {
        log(` - ${label} => no results`);
        await markSearchedWithoutWebsite(church);
        continue;
      }

      if (candidateUrls.length === 0) {
        log(` - ${label} => no candidates above threshold`);
        await markSearchedWithoutWebsite(church);
        continue;
      }

      // Try each candidate URL with verification
      let verifiedUrl: string | null = null;
      for (const url of candidateUrls) {
        const ok = await verifyUrl(url, church);
        if (ok) {
          verifiedUrl = url;
          stats.verified++;
          break;
        } else {
          stats.verifyFailed++;
        }
      }

      if (verifiedUrl) {
        log(` + ${label} => ${verifiedUrl}`);
        stats.enriched++;
        if (!dryRun) {
          await prisma.church.update({
            where: { id: church.id },
            data: {
              website: verifiedUrl,
              hasWebsite: true,
              freeSearchedAt: new Date(),
            },
          });
          // Mark the verified result in ChromaDB (update replaces metadata, so include all fields)
          if (chromaCollection) {
            // Reuse the stored title/score rather than blanking them out.
            // Falls back to empty values if the candidate URL does not match
            // a raw result URL exactly.
            const verifiedResult = results.find((r) => r.url === verifiedUrl);
            try {
              await chromaCollection.update({
                ids: [`search-${church.id}-${hashUrl(verifiedUrl)}`],
                metadatas: [{
                  churchId: church.id,
                  churchName: church.name,
                  churchCity: church.city || '',
                  churchCountry: church.country,
                  searchQuery: queries[0],
                  resultUrl: verifiedUrl,
                  resultTitle: verifiedResult?.title || '',
                  score: verifiedResult ? scoreUrl(verifiedResult, church) : 0,
                  verified: true,
                }],
              });
            } catch { /* ignore — entry may not exist if ChromaDB was down during store */ }
          }
        }
      } else {
        log(` ~ ${label} => ${candidateUrls.length} candidates failed verification`);
        await markSearchedWithoutWebsite(church);
      }
    } catch (error: unknown) {
      // Counted here once; callers must not count the rethrown error again.
      stats.errors++;
      logError(` ! ${label} => error: ${describeError(error)}`);
      throw error;
    }

    // Job tracking: update progress every 10 items
    if (jobId && stats.processed % 10 === 0) {
      await updateJobProgress(jobId, stats, churches.length);
      const stopping = await checkJobStopping(jobId);
      if (stopping) {
        log('Job stop requested via admin dashboard.');
        shuttingDown = true;
        break;
      }
    }

    // Progress logging
    if (stats.processed % PROGRESS_INTERVAL === 0) {
      const elapsed = (Date.now() - stats.startTime) / 1000;
      const rate = Math.round((stats.processed / elapsed) * 3600);
      const hitRate = stats.processed > 0
        ? ((stats.enriched / stats.processed) * 100).toFixed(1)
        : '0.0';
      log(`Progress: ${stats.processed} processed, ${stats.enriched} enriched, ${stats.notFound} not found, ${stats.errors} errors`);
      log(`Hit rate: ${hitRate}%, Verified: ${stats.verified}, Verify failed: ${stats.verifyFailed}, Rate: ~${rate} churches/hour`);
    }
  }
}

/**
 * Processes batches until the backlog is empty, `limit` churches have been
 * handled, shutdown is requested, or five batches in a row fail.
 *
 * @param stats             Running counters; mutated in place.
 * @param countryCode       Restrict the run to one country.
 * @param limit             Maximum number of churches to handle (falsy = no limit).
 * @param dryRun            When true, nothing is persisted and only one batch is run.
 * @param jobId             Background job to report progress to.
 * @param chromaCollection  Optional collection for storing raw search results.
 * @param reSearch          Use the re-search selection and multi-query strategy.
 */
async function runSinglePass(
  stats: EnrichmentStats,
  countryCode?: string,
  limit?: number,
  dryRun: boolean = false,
  jobId?: string | null,
  chromaCollection?: Collection | null,
  reSearch?: boolean
): Promise<void> {
  let totalProcessed = 0;
  let consecutiveErrors = 0;

  while (!shuttingDown) {
    if (limit && totalProcessed >= limit) break;

    const batchLimit = limit
      ? Math.min(BATCH_SIZE, limit - totalProcessed)
      : BATCH_SIZE;

    const churches = await getNextBatch(batchLimit, countryCode, reSearch);
    if (churches.length === 0) break;

    const processedBefore = stats.processed;
    try {
      await processBatch(churches, stats, dryRun, jobId, chromaCollection, reSearch);
      consecutiveErrors = 0;
    } catch (error: unknown) {
      consecutiveErrors++;
      if (consecutiveErrors >= 5) {
        logError(`5 consecutive batch errors. Stopping.`);
        break;
      }
      logError(`Batch error (${consecutiveErrors}/5): ${describeError(error)}`);
    }
    // Count what was actually attempted: a batch that aborted on an error or a
    // shutdown request has not consumed its whole share of the limit.
    totalProcessed += stats.processed - processedBefore;

    if (dryRun) {
      // A dry run persists nothing, so the next fetch would return the very
      // same churches again (forever, when no limit is set).
      log('Dry run: stopping after one batch (nothing is persisted, so the next batch would be identical).');
      break;
    }

    if (!shuttingDown && churches.length === batchLimit) {
      await sleep(INTER_BATCH_DELAY_MS);
    }
  }
}

/**
 * Runs enrichment cycles until shutdown is requested. A cycle drains the
 * backlog batch by batch; an empty cycle is followed by a one-hour wait
 * (polling the shutdown flag every 10s), a productive one by a 10s pause.
 * Batch failures feed a circuit breaker that is consulted before each batch.
 *
 * Always runs with `dryRun = false`, i.e. results are written to the database.
 *
 * @param stats             Running counters; mutated in place.
 * @param jobId             Background job to report progress to.
 * @param chromaCollection  Optional collection for storing raw search results.
 * @param reSearch          Use the re-search selection and multi-query strategy.
 */
async function runContinuous(
  stats: EnrichmentStats,
  jobId?: string | null,
  chromaCollection?: Collection | null,
  reSearch?: boolean
): Promise<void> {
  log('Running in continuous mode (24/7). Press Ctrl+C to stop.');
  const circuitBreaker = new CircuitBreaker();

  while (!shuttingDown) {
    stats.cycles++;
    log(`--- Cycle ${stats.cycles} ---`);
    let batchesInCycle = 0;

    while (!shuttingDown) {
      // Circuit breaker check
      if (circuitBreaker.opened) {
        const ok = await circuitBreaker.checkAndWait();
        if (!ok) continue;
      }

      const churches = await getNextBatch(BATCH_SIZE, undefined, reSearch);
      if (churches.length === 0) break;

      try {
        await processBatch(churches, stats, false, jobId, chromaCollection, reSearch);
        circuitBreaker.reset();
        batchesInCycle++;
      } catch (error: unknown) {
        // stats.errors was already incremented by processBatch before it
        // rethrew; incrementing again here would double-count the failure.
        circuitBreaker.recordFailure();
        logError(`Batch error: ${describeError(error)}`);
      }

      if (!shuttingDown) {
        await sleep(INTER_BATCH_DELAY_MS);
      }
    }

    if (shuttingDown) break;

    if (batchesInCycle === 0) {
      log('No unsearched churches found. Waiting 1 hour...');
      // Sleep 1 hour, checking shutdown every 10s
      for (let i = 0; i < 360 && !shuttingDown; i++) {
        await sleep(10000);
      }
    } else {
      log(`Cycle ${stats.cycles} complete. ${batchesInCycle} batches processed. Brief pause...`);
      await sleep(10000);
    }
  }
}

// --- Main ---

/**
 * CLI entry point: parses flags, waits for FreeSearch, sets up job tracking,
 * runs the single-pass or continuous loop and prints a summary.
 *
 * Flags read here: `--country <code>`, `--limit <n>` (0 = no limit),
 * `--dry-run`, `--continuous`, `--re-search`, `--job-id <id>`.
 *
 * @throws Error when `--limit` is not a non-negative integer, when `--dry-run`
 *         is combined with `--continuous`, or when the run itself fails (the
 *         job is marked with the error first). Database connections are closed
 *         on every path that opened work.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const resumeIndex = args.indexOf('--job-id');
  const dryRun = args.includes('--dry-run');
  const continuous = args.includes('--continuous');
  const reSearch = args.includes('--re-search');

  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;
  const resumeJobId = resumeIndex !== -1 ? args[resumeIndex + 1] : undefined;

  // A malformed --limit must not silently turn into "no limit".
  let limit: number | undefined;
  if (limitIndex !== -1) {
    const rawLimit = args[limitIndex + 1];
    const parsedLimit = Number.parseInt(rawLimit ?? '', 10);
    if (Number.isNaN(parsedLimit) || parsedLimit < 0) {
      throw new Error(`--limit requires a non-negative integer, got "${rawLimit}"`);
    }
    // 0 keeps its previous meaning of "no limit".
    limit = parsedLimit === 0 ? undefined : parsedLimit;
  }

  // Continuous mode always writes to the database, so refusing is safer than
  // announcing a dry run and then persisting results.
  if (continuous && dryRun) {
    throw new Error('--dry-run is not supported together with --continuous');
  }

  // Graceful shutdown
  process.on('SIGTERM', () => {
    log('Received SIGTERM, finishing current batch...');
    shuttingDown = true;
  });
  process.on('SIGINT', () => {
    log('Received SIGINT, finishing current batch...');
    shuttingDown = true;
  });

  log('============================================================');
  log('FreeSearch Church Website Enrichment');
  log('============================================================');
  log(`FreeSearch URL: ${FREESEARCH_URL}`);
  log(`Mode: ${continuous ? 'Continuous (24/7)' : 'Single pass'}${reSearch ? ' (RE-SEARCH)' : ''}`);
  log(`Re-search: ${reSearch ? 'Yes (multi-query, previously searched churches)' : 'No'}`);
  log(`Country: ${countryCode || 'All (round-robin priority)'}`);
  log(`Limit: ${limit || 'No limit'}`);
  log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  log(`Batch size: ${BATCH_SIZE}`);
  log('============================================================');

  try {
    // Wait for FreeSearch to be reachable (indefinite retry with backoff)
    log('Waiting for FreeSearch to be reachable...');
    await waitForFreeSearch();
    if (shuttingDown) return;
    log('FreeSearch health check: OK');

    // ChromaDB connection (optional — results stored if available)
    let chromaCollection: Collection | null = null;
    try {
      chromaCollection = await getCollection(COLLECTION_NAMES.SEARCH_RESULTS);
      log('ChromaDB search_results collection connected');
    } catch {
      log('ChromaDB unavailable — results will not be stored');
    }

    // Job tracking — clean up any running jobs left by a previous container restart.
    // The job being resumed is excluded so it is not stamped with a stale
    // error/completedAt right before being set back to running.
    await prisma.backgroundJob.updateMany({
      where: {
        type: 'freesearch-enrichment',
        status: 'running',
        ...(resumeJobId ? { id: { not: resumeJobId } } : {}),
      },
      data: { status: 'failed', error: 'Container restarted', completedAt: new Date() },
    });

    let jobId = await createOrResumeJob(args);
    if (!jobId) {
      jobId = await createNewJob({ countryCode, limit, continuous, dryRun, reSearch });
    }
    log(`Job ID: ${jobId}`);

    const stats: EnrichmentStats = {
      processed: 0,
      enriched: 0,
      notFound: 0,
      errors: 0,
      verified: 0,
      verifyFailed: 0,
      cycles: 0,
      startTime: Date.now(),
    };

    try {
      if (continuous) {
        await runContinuous(stats, jobId, chromaCollection, reSearch);
      } else {
        await runSinglePass(stats, countryCode, limit, dryRun, jobId, chromaCollection, reSearch);
      }
    } catch (error: unknown) {
      // Do not leave the job in 'running' when the run dies; recording the
      // failure is best-effort so it never hides the original error.
      try {
        await updateJobProgress(jobId, stats, 0);
        await completeJob(jobId, describeError(error));
      } catch (jobError: unknown) {
        logError(`Failed to record job failure: ${describeError(jobError)}`);
      }
      throw error;
    }

    // Complete job
    await updateJobProgress(jobId, stats, 0);
    await completeJob(jobId);

    // Print summary
    const elapsed = ((Date.now() - stats.startTime) / 1000).toFixed(1);
    const hitRate = stats.processed > 0
      ? ((stats.enriched / stats.processed) * 100).toFixed(1)
      : '0.0';

    log('');
    log('============================================================');
    log('Enrichment Summary');
    log('============================================================');
    log(`Churches processed: ${stats.processed}`);
    log(`Websites found: ${stats.enriched}`);
    log(`No website found: ${stats.notFound}`);
    log(`Errors: ${stats.errors}`);
    log(`URLs verified: ${stats.verified}`);
    log(`Verify rejected: ${stats.verifyFailed}`);
    log(`Hit rate: ${hitRate}%`);
    log(`Elapsed: ${elapsed}s`);
    if (stats.cycles > 0) {
      log(`Cycles completed: ${stats.cycles}`);
    }
    log('============================================================');
  } finally {
    // Runs on success, early shutdown and failure alike.
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((error: unknown) => {
  logError(`Fatal error: ${describeError(error)}`);
  process.exit(1);
});