1372 lines
43 KiB
TypeScript
1372 lines
43 KiB
TypeScript
#!/usr/bin/env tsx
|
|
/**
 * Enrich OSM churches with website URLs via FreeSearch (self-hosted SearXNG).
 *
 * Usage:
 *   npx tsx scripts/enrich-with-freesearch.ts --limit 10 --dry-run
 *   npx tsx scripts/enrich-with-freesearch.ts --country US --limit 50
 *   npx tsx scripts/enrich-with-freesearch.ts --continuous
 *   npx tsx scripts/enrich-with-freesearch.ts --re-search --country CR --limit 50
 *
 * --re-search: Re-search previously searched churches that still have no website.
 *   Uses a multi-query strategy (2-4 query variations per church) to get
 *   broader FreeSearch coverage. Prioritizes churches that received city data
 *   after their initial search (via reverse geocoding).
 *
 * FreeSearch is free and unlimited (~1,500-2,400 churches/hour).
 * Full initial pass of ~193K churches in ~4-6 days.
 */
|
|
|
|
import dotenv from 'dotenv';
|
|
import path from 'path';
|
|
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
|
|
|
import { Pool } from 'pg';
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
|
import { PrismaClient } from '@prisma/client';
|
|
import { Collection } from 'chromadb';
|
|
import axios from 'axios';
|
|
import crypto from 'crypto';
|
|
import { getCollection, COLLECTION_NAMES } from '../src/chromadb/collections';
|
|
import { embed } from '../src/chromadb/embeddings';
|
|
|
|
// Dedicated database access for this script: a new pg pool wrapped by the
// Prisma pg adapter, instead of a shared/cached client instance.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// Base URL of the FreeSearch service. The FREESEARCH_URL environment variable
// wins when it is set to a non-empty value.
const FREESEARCH_URL = process.env.FREESEARCH_URL || 'http://192.168.0.145:3111';
|
|
|
|
// --- Job Tracking ---
|
|
/**
 * Resume an existing background job when `--job-id <id>` appears in `args`.
 *
 * The referenced job row is marked `running` and its `startedAt` is set to now.
 *
 * @param args - Command-line argument list to inspect.
 * @returns The resumed job id, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` is the last argument or is followed by another
 *   flag, so a missing value fails loudly before any database call is made.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) return null;

  const jobId = args[flagIndex + 1];
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
|
|
|
/**
 * Insert a new `freesearch-enrichment` background job that starts out as running.
 *
 * @param config - Run configuration stored on the job row as given.
 * @returns The id of the newly created job.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'freesearch-enrichment',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return id;
}
|
|
|
|
/**
 * Copy the current run counters onto the background job row.
 *
 * `stats.enriched` is written to both `succeeded` and `itemsFound`;
 * `stats.errors` is written to `failed`.
 *
 * @param jobId - Id of the job row to update.
 * @param stats - Counters accumulated by the current run.
 * @param totalItems - Value stored in the job's `totalItems` column.
 */
async function updateJobProgress(jobId: string, stats: EnrichmentStats, totalItems: number): Promise<void> {
  const { processed, enriched, errors } = stats;
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      processed,
      succeeded: enriched,
      failed: errors,
      itemsFound: enriched,
      totalItems,
    },
  });
}
|
|
|
|
/**
 * Report whether the job has been flagged with status `stopping`.
 *
 * @param jobId - Id of the job row to look up.
 * @returns `true` only when the job exists and its status is `'stopping'`.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!job) return false;
  return job.status === 'stopping';
}
|
|
|
|
/**
 * Close out a background job.
 *
 * @param jobId - Id of the job row to finalize.
 * @param error - Optional failure message; a non-empty value marks the job
 *   `failed`, otherwise it is marked `completed`.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const status = error ? 'failed' : 'completed';
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status, error, completedAt: new Date() },
  });
}
|
|
// Run tuning knobs. NOTE(review): the code that consumes these constants is
// outside this view; the descriptions below follow the constant names only.

/** Presumably the number of churches fetched per batch. */
const BATCH_SIZE = 20;

/** Presumably the pause between batches, in milliseconds (5 seconds). */
const INTER_BATCH_DELAY_MS = 5_000;

/** Presumably how many processed churches pass between progress reports. */
const PROGRESS_INTERVAL = 100;
|
|
|
|
// --- Country priority (same as Google Places enrichment) ---
|
|
|
|
/**
 * ISO country codes in processing-priority order. When no country filter is
 * given, batches are assembled by walking this list from the front.
 */
const COUNTRY_PRIORITY = [
  'FR', 'DE', 'ES', 'PL', 'BR', 'PT', 'PH', 'CZ',
  'MX', 'HU', 'US', 'CR', 'BE', 'AR', 'CH', 'CO',
  'CA', 'SK', 'EC', 'SI', 'NL', 'PE', 'GB', 'ID',
  'CL', 'IE', 'BO', 'VN', 'VE', 'UG', 'LU', 'HN',
  'CD', 'SV', 'KE', 'PA', 'AU', 'CU', 'GT', 'CN',
];
|
|
|
|
/**
 * Local-language "parish" term appended to search queries, keyed by country
 * code. Countries without an entry get no extra keyword.
 */
const COUNTRY_KEYWORDS: Record<string, string> = {
  FR: 'paroisse',
  DE: 'pfarrei',
  ES: 'parroquia', MX: 'parroquia',
  PL: 'parafia',
  BR: 'paroquia', PT: 'paroquia',
  IT: 'parrocchia',
  CZ: 'farnost',
  HU: 'plebania',
  // Spanish-speaking Latin America
  AR: 'parroquia', CO: 'parroquia', EC: 'parroquia', PE: 'parroquia',
  CL: 'parroquia', VE: 'parroquia', CR: 'parroquia', SV: 'parroquia',
  GT: 'parroquia', CU: 'parroquia', PA: 'parroquia', BO: 'parroquia',
  HN: 'parroquia',
  BE: 'paroisse', LU: 'paroisse',
  CH: 'pfarrei',
  NL: 'parochie',
  SK: 'farnosť',
  SI: 'župnija',
};
|
|
|
|
/**
 * "Parish" term per country code, used by the distinctive-words query of the
 * multi-query (re-search) strategy. Unlike COUNTRY_KEYWORDS this table also
 * covers English-speaking countries.
 */
const PARISH_KEYWORDS: Record<string, string> = {
  US: 'parish',
  GB: 'parish',
  IE: 'parish',
  AU: 'parish',
  CA: 'parish',
  ES: 'parroquia',
  MX: 'parroquia',
  AR: 'parroquia',
  CO: 'parroquia',
  FR: 'paroisse',
  BE: 'paroisse',
  DE: 'pfarrei',
  AT: 'pfarrei',
  CH: 'pfarrei',
  IT: 'parrocchia',
  PT: 'paroquia',
  BR: 'paroquia',
  PL: 'parafia',
  CR: 'parroquia',
  CL: 'parroquia',
  PE: 'parroquia',
  HN: 'parroquia',
  SV: 'parroquia',
  GT: 'parroquia',
  NI: 'parroquia',
  EC: 'parroquia',
  VE: 'parroquia',
  BO: 'parroquia',
  PA: 'parroquia',
  CU: 'parroquia',
  CZ: 'farnost',
  HU: 'plebania',
  SK: 'farnosť',
  SI: 'župnija',
  NL: 'parochie',
  LU: 'paroisse',
};
|
|
|
|
// --- Blocked domains ---
|
|
|
|
/**
 * Domains that are never accepted as a church's own website. A hostname is
 * rejected when it equals one of these entries or is a subdomain of one.
 */
const BLOCKED_DOMAINS = new Set([
  // --- Social networks ---
  'facebook.com', 'fb.com', 'twitter.com', 'x.com', 'instagram.com',
  'linkedin.com', 'youtube.com', 'tiktok.com', 'pinterest.com',

  // --- Business directories and listings ---
  'yelp.com', 'yellowpages.com', 'yp.com', 'bbb.org', 'manta.com',
  'superpages.com', 'whitepages.com', 'foursquare.com',

  // --- Church directories (third-party listings, not the parish itself) ---
  'masstimes.org', 'catholicdirectory.com', 'findmass.com', 'parishesonline.com',
  'gcatholic.org', 'catholicmasstime.org', 'discovermass.com', 'faithstreet.com',
  'miserend.hu',

  // --- Schools and landmarks whose names look like churches ---
  'archbishopchapelle.org', 'sainte-chapelle.fr',

  // --- Maps and geo services ---
  'google.com', 'mapquest.com', 'apple.com', 'bing.com',
  'openstreetmap.org', 'mapcarta.com',

  // --- Travel reviews (one entry per TLD) ---
  'tripadvisor.com', 'tripadvisor.co.nz', 'tripadvisor.co.uk',
  'tripadvisor.es', 'tripadvisor.fr', 'tripadvisor.de', 'tripadvisor.it',
  'tripadvisor.com.br', 'tripadvisor.com.mx', 'tripadvisor.pl',

  // --- Encyclopedias and reference works ---
  'wikipedia.org', 'wikidata.org', 'worldhistory.org', 'britannica.com',

  // --- Dictionaries and translators ---
  'spanishdict.com', 'wiktionary.org', 'cambridge.org', 'merriam-webster.com',
  'wordreference.com', 'dict.cc', 'linguee.com', 'deepl.com', 'translate.google.com',
  'collinsdictionary.com', 'reverso.net', 'thefreedictionary.com', 'dictionary.com',

  // --- Non-Catholic religious sites ---
  'jw.org',

  // --- News and media ---
  'pbs.org', 'bbc.com', 'cnn.com', 'nytimes.com',

  // --- Wikimedia projects ---
  'wikimedia.org',

  // --- Free hosting platforms (generic, not parish-specific) ---
  'wixsite.com', 'weebly.com', 'blogspot.com', 'wordpress.com',

  // --- Travel, ticketing and tourism ---
  'trip.com', 'seatgeek.com', 'airial.travel', 'booking.com',
  'expedia.com', 'hotels.com', 'viator.com', 'getyourguide.com',
  'wanderlog.com', 'spain-places.com', 'burgundy-tourism.com',
  'philippineairlines.com',

  // --- Q&A sites, forums, fan news ---
  'stackexchange.com', 'stackoverflow.com', 'reddit.com', 'quora.com',
  'thisisanfield.com',

  // --- Software / SaaS / accidental matches on church terms ---
  'myconvento.com', 'socha.net', 'drogaraia.com.br',

  // --- Heritage and cultural registers ---
  'zabytek.pl', 'monumentos.gov.pt',

  // --- Location / directory aggregators ---
  'wheree.com', 'seety.co',

  // --- Banks (accidental matches such as "sant" -> "santander") ---
  'santanderbank.com', 'santander.com', 'bancosantander.es',

  // --- General reference / clearly unrelated ---
  'brainly.in', 'zhihu.com', 'countyoffice.org', 'weforum.org',
  'themoviedb.org', 'imdb.com', 'amazon.com', 'ebay.com',
  'thereformation.com', 'rarest.org', 'gutenberg.org',
  'consumersadvocate.org', 'endlessmile.com', 'sacredrootsnc.com',

  // --- Government sites (city pages, not parish sites) ---
  'madrid.es',

  // --- Regional tourism portals ---
  'romantischer-rhein.de', 'riberana.es', 'lascuarre.es',

  // --- Action/advocacy organizations ---
  'franciscanaction.org',

  // --- Church directories (additional) ---
  'iglesiaslocales.com',

  // --- Catholic media and organizations (not individual parishes) ---
  'usccb.org', 'vaticannews.va', 'ewtn.com', 'catholic.org', 'newadvent.org',
  'catholicnewsagency.com', 'ncronline.org', 'americamagazine.org',
  'catholic-hierarchy.org', 'aleteia.org',

  // --- Dictionaries (additional) ---
  'lerobert.com', 'larousse.fr',

  // --- Regional news ---
  'murciatoday.com',

  // --- National/international organizations (not local parishes/chapters) ---
  'kofc.org', 'sagradafamilia.org',

  // --- Bible / religious Q&A ---
  'gotquestions.org', 'biblegateway.com', 'biblehub.com',

  // --- Universities and schools ---
  'rivier.edu',

  // --- Religious directories (additional) ---
  'latinmassdir.org', 'myhollyland.org',
]);
|
|
|
|
/**
 * Country code -> national domain suffix, used for the scoring bonus and the
 * `site:` query variation.
 *
 * Values are regular-expression fragments: multi-label suffixes carry an
 * escaped dot (`com\.br`), which consumers must unescape before using the
 * value as plain text.
 */
const COUNTRY_TLDS: Record<string, string> = {
  // Europe
  DE: 'de',
  FR: 'fr',
  ES: 'es',
  IT: 'it',
  PT: 'pt',
  PL: 'pl',
  NL: 'nl',
  BE: 'be',
  AT: 'at',
  CH: 'ch',
  CZ: 'cz',
  HU: 'hu',
  HR: 'hr',
  SK: 'sk',
  SI: 'si',
  IE: 'ie',
  GB: 'uk',
  // Latin America (commercial second-level suffixes)
  BR: 'com\\.br',
  MX: 'com\\.mx',
  AR: 'com\\.ar',
  CO: 'com\\.co',
};
|
|
|
|
/**
 * Lower-case "parish"/"church" terms in several languages. They are matched
 * as substrings of result titles, descriptions and page text.
 */
const CATHOLIC_KEYWORDS = [
  // "parish"-type words
  'parish', 'church', 'catholic', 'parroquia', 'paroisse', 'pfarrei',
  'parafia', 'paroquia', 'parrocchia', 'farnost', 'plebania', 'parochie',
  'župnija', 'farnosť',
  // "church"-type words
  'iglesia', 'église', 'kirche', 'kościół',
  'chiesa', 'kostel', 'templom', 'kerk',
];
|
|
|
|
// --- Types ---
|
|
|
|
/** The subset of church columns this script selects and works with. */
interface ChurchRecord {
  /** Primary key of the church row. */
  id: string;
  /** Church name; the source of quoted queries and significant-word matching. */
  name: string;
  /** Street address, when known; used as a location signal during verification. */
  address: string | null;
  /** City, when known; added to queries and used to disambiguate matches. */
  city: string | null;
  /** State/region; only added to queries for US, CA, AU and BR. */
  state: string | null;
  /** Country code, used as the key into the per-country lookup tables. */
  country: string;
  latitude: number;
  longitude: number;
  /** Diocese name, when known; added to queries to disambiguate common names. */
  diocese: string | null;
}
|
|
|
|
/** One hit returned by FreeSearch. */
interface SearchResult {
  /** Result title; scored against the church name and city. */
  title: string;
  /** Result URL; filtered, scored and later fetched for verification. */
  url: string;
  /** Snippet text, when the provider supplies one. */
  description?: string;
  /** NOTE(review): presumably the upstream engine that produced the hit — confirm. */
  source?: string;
}
|
|
|
|
/** Body returned by the FreeSearch `/api/search` endpoint. */
interface FreeSearchResponse {
  /** Hits for the query. */
  results: SearchResult[];
  /** NOTE(review): presumably the backend that served the query — confirm. */
  provider: string;
  /** NOTE(review): presumably the query echoed back by the service — confirm. */
  query: string;
  /** NOTE(review): presumably the server-side search time in milliseconds — confirm. */
  elapsed_ms: number;
}
|
|
|
|
/** Counters accumulated over one enrichment run. */
interface EnrichmentStats {
  /** Churches handled so far (incremented once per church). */
  processed: number;
  /** Reported to the job row as `succeeded` and `itemsFound`. */
  enriched: number;
  /** NOTE(review): presumably churches for which no acceptable URL was found. */
  notFound: number;
  /** Reported to the job row as `failed`. */
  errors: number;
  /** NOTE(review): presumably candidate URLs that passed verification. */
  verified: number;
  /** NOTE(review): presumably candidate URLs that failed verification. */
  verifyFailed: number;
  /** NOTE(review): presumably completed passes in continuous mode. */
  cycles: number;
  /** NOTE(review): presumably the run start as an epoch timestamp — confirm the unit. */
  startTime: number;
}
|
|
|
|
// --- Circuit Breaker ---
|
|
|
|
/**
 * Circuit breaker for FreeSearch calls.
 *
 * After `threshold` recorded failures without an intervening `reset()` the
 * breaker opens. While open, `checkAndWait()` sleeps for the current backoff,
 * probes the health endpoint, and either closes the breaker or doubles the
 * backoff (capped at five minutes).
 */
class CircuitBreaker {
  private failures = 0;
  private isOpen = false;
  private backoffMs = 10000;
  private readonly maxBackoffMs = 300000; // 5 minutes
  private readonly threshold = 5;

  /**
   * Gate a request on the breaker state.
   *
   * @returns `true` when the caller may proceed (breaker closed, or the health
   *   probe succeeded after the wait); `false` when FreeSearch is still down.
   */
  async checkAndWait(): Promise<boolean> {
    if (!this.isOpen) return true;

    log(`Circuit breaker open. Waiting ${Math.round(this.backoffMs / 1000)}s before retry...`);
    await sleep(this.backoffMs);

    // Probe the health endpoint; any request error counts as "still down".
    let healthy = false;
    try {
      const { status } = await axios.get(`${FREESEARCH_URL}/api/health`, { timeout: 5000 });
      healthy = status === 200;
    } catch {
      healthy = false;
    }

    if (healthy) {
      this.reset();
      log('Circuit breaker closed: FreeSearch is back');
      return true;
    }

    this.backoffMs = Math.min(this.backoffMs * 2, this.maxBackoffMs);
    return false;
  }

  /** Count a failure and open the breaker once the threshold is reached. */
  recordFailure() {
    this.failures += 1;
    const shouldTrip = !this.isOpen && this.failures >= this.threshold;
    if (!shouldTrip) return;

    this.isOpen = true;
    this.backoffMs = 10000;
    log(`Circuit breaker OPEN after ${this.failures} consecutive failures`);
  }

  /** Clear the failure count, close the breaker and restore the initial backoff. */
  reset() {
    if (this.failures === 0 && !this.isOpen) return;
    this.failures = 0;
    this.isOpen = false;
    this.backoffMs = 10000;
  }

  /** Whether the breaker is currently open. */
  get opened() {
    return this.isOpen;
  }
}
|
|
|
|
// --- Helpers ---
|
|
|
|
// Shutdown flag (its writer is outside this view). sleep() polls it to wake
// early, and the long-running loops check it to stop between iterations.
let shuttingDown = false;
|
|
|
|
/** Write `msg` to stdout, prefixed with the current ISO-8601 timestamp in brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log(`[${stamp}] ${msg}`);
}
|
|
|
|
/** Write `msg` to stderr, prefixed with the current ISO-8601 timestamp in brackets. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error(`[${stamp}] ${msg}`);
}
|
|
|
|
/**
 * Wait for `ms` milliseconds, waking early once shutdown has been requested.
 *
 * The shutdown flag is polled once per second. Both timers are cleared as soon
 * as the promise settles, so a finished or interrupted sleep never leaves a
 * pending timer behind that would keep the process alive.
 *
 * @param ms - Delay in milliseconds.
 * @returns A promise that resolves after the delay, or earlier on shutdown
 *   (immediately when shutdown was already requested).
 */
function sleep(ms: number): Promise<void> {
  if (shuttingDown) return Promise.resolve();

  return new Promise(resolve => {
    const poll = setInterval(() => {
      if (shuttingDown) finish();
    }, 1000);
    const timer = setTimeout(finish, ms);

    // Single exit path: tear down both timers, then settle.
    function finish(): void {
      clearTimeout(timer);
      clearInterval(poll);
      resolve();
    }
  });
}
|
|
|
|
// Second-level labels that, under a two-letter country TLD, belong to the
// public suffix rather than to the site name (e.g. "com.br", "co.uk", "org.uk").
const SECOND_LEVEL_SUFFIX_LABELS = new Set([
  'com', 'co', 'org', 'net', 'gov', 'gob', 'edu', 'ac', 'or', 'ne',
]);

/**
 * Extract the registrable ("root") domain from a URL.
 *
 * Examples: `www.parish.org` -> `parish.org`,
 * `www.paroquia.com.br` -> `paroquia.com.br`.
 *
 * Multi-part suffixes are recognised by a small heuristic (a two-letter TLD
 * preceded by a common second-level label), not the full Public Suffix List.
 * IP-address hosts and single-label hosts are returned unchanged.
 *
 * @param url - Absolute URL to inspect.
 * @returns The lower-cased root domain, or `''` when `url` cannot be parsed.
 */
function getDomain(url: string): string {
  let hostname: string;
  try {
    hostname = new URL(url).hostname.toLowerCase();
  } catch {
    return '';
  }

  // A fully-qualified host may carry a trailing dot ("example.com.").
  hostname = hostname.replace(/\.$/, '');

  // IP literals have no registrable domain; slicing their labels is meaningless.
  if (/^\d{1,3}(\.\d{1,3}){3}$/.test(hostname) || hostname.startsWith('[')) {
    return hostname;
  }

  const labels = hostname.split('.');
  if (labels.length <= 2) return hostname;

  const [secondLevel = '', tld = ''] = labels.slice(-2);
  const keep = tld.length === 2 && SECOND_LEVEL_SUFFIX_LABELS.has(secondLevel) ? 3 : 2;
  return labels.slice(-keep).join('.');
}
|
|
|
|
/**
 * Substrings that mark a hostname as a non-parish site regardless of its TLD
 * (a hostname containing any of them is blocked).
 */
const BLOCKED_DOMAIN_KEYWORDS = [
  // Specific brands
  'tripadvisor',
  'archinform',
  'seatgeek',
  // Tourism / travel
  'turismo',
  'tourism',
  'tourisme',
  'touristik',
  'turistico',
  'turistik',
  'reisefuhrer',
  'wanderlog',
  'viator',
  'getyourguide',
];
|
|
|
|
/**
 * Decide whether a search-result URL should be discarded before scoring.
 *
 * A URL is blocked when it cannot be parsed, when its hostname is (or is a
 * subdomain of) a blocked domain, when its hostname contains a blocked
 * keyword, or when its path points at a PDF/Word document.
 *
 * @param url - Candidate URL.
 * @returns `true` when the URL must not be considered.
 */
function isBlockedUrl(url: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return true; // Block unparseable URLs
  }

  const host = parsed.hostname.toLowerCase();

  // Exact domain or any subdomain of a blocked domain
  const onBlockedDomain = Array.from(BLOCKED_DOMAINS).some(
    domain => host === domain || host.endsWith('.' + domain),
  );
  if (onBlockedDomain) return true;

  // Keyword anywhere in the hostname (covers every TLD variant)
  if (BLOCKED_DOMAIN_KEYWORDS.some(keyword => host.includes(keyword))) return true;

  // Documents are never parish homepages
  const pathname = parsed.pathname.toLowerCase();
  return ['.pdf', '.doc', '.docx'].some(ext => pathname.endsWith(ext));
}
|
|
|
|
/**
 * Short fingerprint of a URL: the first 12 hex characters of its MD5 digest.
 * MD5 serves only as a compact identifier here, not as a security measure.
 */
function hashUrl(url: string): string {
  const digest = crypto.createHash('md5').update(url).digest('hex');
  return digest.substring(0, 12);
}
|
|
|
|
/**
 * Reduce a name, city or address to a lower-case ASCII form for matching.
 *
 * Steps:
 *  1. lower-case;
 *  2. fold diacritics to their base letter ("é" -> "e", "ś" -> "s") instead of
 *     deleting the letter, which used to turn "église" into "glise";
 *  3. map letters that have no Unicode decomposition (ß, ł, ø, đ, æ, œ);
 *  4. turn hyphens, dashes, slashes and underscores into spaces so compound
 *     names such as "Saint-Pierre" stay two words;
 *  5. drop every remaining character outside [a-z0-9 ] (apostrophes and dots
 *     are removed, so "Mary's" becomes "marys");
 *  6. collapse whitespace and trim.
 *
 * @param str - Raw text.
 * @returns The normalized text (possibly empty).
 */
function normalizeForMatch(str: string): string {
  const letterFolds: Record<string, string> = {
    'ß': 'ss', 'ł': 'l', 'ø': 'o', 'đ': 'd', 'æ': 'ae', 'œ': 'oe',
  };

  return str
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/[ßłøđæœ]/g, ch => letterFolds[ch] ?? ch)
    .replace(/[-\u2010-\u2015/_]/g, ' ')
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
|
|
|
|
// Words that carry no identifying information in a church name. Entries are
// written naturally (accents included); they are normalized before use.
const NAME_STOP_WORDS_RAW: readonly string[] = [
  // English articles/prepositions
  'the', 'of', 'and', 'in', 'at', 'for', 'our', 'lady',
  // Religious titles & very common church name words
  'st', 'saint', 'saints', 'san', 'sant', 'santa', 'santo', 'sacred',
  'christ', 'jesus', 'mary', 'maria', 'king', 'lord', 'heart',
  'cross', 'queen', 'angel', 'angels', 'good', 'star',
  'nome', 'pere', 'madre', 'notre', 'dame', 'bien',
  'onze', 'lieve', 'vrouw', 'heer',
  // Very common short saint/religious names (match too many unrelated domains)
  'rosa', 'paul', 'anne', 'jean', 'joan', 'luke', 'marc',
  'rita', 'jose', 'leon', 'pius', 'roch', 'yves', 'ines',
  'vita', 'fara', 'bona',
  // Common religious words that match wrong churches/organizations
  'cristo', 'fatima', 'lourdes', 'perpetuo', 'socorro', 'calvario',
  'rosario', 'pilar', 'carmen', 'dolores', 'remedios', 'nieves',
  'grotte', 'mission', 'sagrada', 'sagrado', 'familia',
  'guadalupe', 'assumption', 'immaculate', 'perpetual', 'divine',
  // Organization names (match national sites, not local chapters)
  'knights', 'columbus',
  // Structural/role words (not distinctive church names)
  'house', 'home', 'hall', 'center', 'centre', 'centro',
  'deacon', 'priest', 'bishop', 'father', 'sister', 'brother',
  'school', 'academy', 'college', 'seminary', 'rectory', 'retreat',
  // Church-generic words (EN)
  'church', 'parish', 'catholic', 'roman', 'holy', 'chapel',
  'cathedral', 'basilica', 'shrine', 'convent', 'monastery',
  // Church-generic words (FR)
  'chapelle', 'eglise', 'église', 'paroisse', 'couvent',
  // Church-generic words (ES)
  'iglesia', 'parroquia', 'capilla', 'ermita', 'convento', 'basílica',
  // Church-generic words (DE)
  'kirche', 'kapelle', 'pfarrei', 'kloster',
  // Church-generic words (IT)
  'chiesa', 'parrocchia', 'cappella', 'oratorio',
  // Church-generic words (PT)
  'igreja', 'capela', 'paroquia',
  // Church-generic words (PL)
  'kościół', 'kaplica', 'parafia', 'droga',
  // Church-generic words (CZ/SK/HU)
  'kostel', 'kaple', 'farnost', 'templom', 'kápolna',
  // Articles/prepositions (Romance)
  'de', 'la', 'le', 'les', 'du', 'des', 'el', 'los', 'las',
  'di', 'del', 'della', 'delle', 'degli',
  'do', 'da', 'dos', 'das',
  // Articles/prepositions (Germanic)
  'und', 'der', 'die', 'von',
  // Articles/prepositions (Slavic)
  'nad', 'pod', 'przy',
];

// Built on first use and then reused; see getSignificantWords.
let nameStopWords: Set<string> | undefined;

/**
 * Extract the distinctive words of a church name.
 *
 * The name is normalized with `normalizeForMatch`, split on spaces, and every
 * word shorter than three characters or present in the stop list is dropped.
 * The stop list goes through the same normalization as the name, so accented
 * entries ("église", "kościół") match however the normalizer spells them.
 *
 * When nothing survives the filter, the two longest words of at least four
 * characters are returned regardless of the stop list, so a name made
 * entirely of generic words still yields something to match on.
 *
 * @param name - Church name as stored.
 * @returns Normalized distinctive words, in name order (or longest-first for
 *   the fallback); may be empty.
 */
function getSignificantWords(name: string): string[] {
  if (!nameStopWords) {
    nameStopWords = new Set(NAME_STOP_WORDS_RAW.map(word => normalizeForMatch(word)));
  }
  const stopWords = nameStopWords;

  const words = normalizeForMatch(name).split(' ');
  const significant = words.filter(w => w.length >= 3 && !stopWords.has(w));
  if (significant.length > 0) return significant;

  // Everything was filtered: fall back to the two longest words (>= 4 chars).
  return words
    .filter(w => w.length >= 4)
    .sort((a, b) => b.length - a.length)
    .slice(0, 2);
}
|
|
|
|
// --- URL Scoring ---
|
|
|
|
/**
 * Score how likely a search result is the church's own website.
 *
 * Signals (points):
 *  - a name word (>= 4 chars) in the domain name: +10 (once)
 *  - a name word (>= 4 chars) in the URL path: +2 (once)
 *  - each name word in the title: +5; in the description: +2
 *  - with at least one name match: a Catholic keyword in title/description +3,
 *    a `.org`/`.church` host +3, a host under the church's country TLD +4
 *  - city in the title: +3; city (spaces removed) in the domain name: +5
 *  - diocese-level domain: -5; more than three path segments: -5
 *
 * Name and city words come out of `normalizeForMatch`, so title and
 * description are normalized the same way before comparison; otherwise
 * accented or punctuated text can never match. Catholic keywords contain
 * accents themselves and are matched against the plain lower-cased text.
 * TLD bonuses are decided on the parsed hostname, so ports, query strings and
 * path fragments cannot affect them.
 *
 * @param result - Search hit to score.
 * @param church - Church the hit is evaluated against.
 * @returns The score; higher is better and it may be negative.
 */
function scoreUrl(result: SearchResult, church: ChurchRecord): number {
  let score = 0;

  // Parse once. An unparseable URL simply gets no hostname/path signals.
  let hostname = '';
  let pathname = '';
  try {
    const parsed = new URL(result.url);
    hostname = parsed.hostname.toLowerCase();
    pathname = parsed.pathname.toLowerCase();
  } catch {
    // leave hostname/pathname empty
  }

  const rawTitle = (result.title || '').toLowerCase();
  const rawDescription = (result.description || '').toLowerCase();
  const title = normalizeForMatch(rawTitle);
  const description = normalizeForMatch(rawDescription);

  const domainWithoutTld = getDomain(result.url).split('.')[0] || '';
  const nameWords = getSignificantWords(church.name);
  // Require >= 4 chars for URL matches to avoid short-word false positives
  // (e.g. "rosa" in "rosaparks").
  const longNameWords = nameWords.filter(w => w.length >= 4);
  let hasNameMatch = false;

  // Name word in the domain name (strongest signal)
  if (longNameWords.some(w => domainWithoutTld.includes(w))) {
    score += 10;
    hasNameMatch = true;
  }

  // Name word in the URL path (e.g. /san-bartolome/ or /stmichael)
  if (longNameWords.some(w => pathname.includes(w))) {
    score += 2;
    hasNameMatch = true;
  }

  // Name words in title and description (each word counts)
  for (const word of nameWords) {
    if (title.includes(word)) {
      score += 5;
      hasNameMatch = true;
    }
    if (description.includes(word)) {
      score += 2;
      hasNameMatch = true;
    }
  }

  // Generic bonuses only count alongside a name match (prevents "iglesia" in
  // a dictionary or "chiesa" in soccer news from scoring).
  if (hasNameMatch) {
    if (CATHOLIC_KEYWORDS.some(kw => rawTitle.includes(kw) || rawDescription.includes(kw))) {
      score += 3;
    }

    if (hostname.endsWith('.org') || hostname.endsWith('.church')) {
      score += 3;
    }

    // COUNTRY_TLDS stores regex fragments ("com\.br"); drop the escapes to get
    // the literal suffix.
    const countryTld = COUNTRY_TLDS[church.country];
    if (countryTld && hostname.endsWith(`.${countryTld.replace(/\\/g, '')}`)) {
      score += 4;
    }
  }

  // City name in title/domain
  if (church.city) {
    const cityNorm = normalizeForMatch(church.city);
    if (cityNorm.length > 2) {
      if (title.includes(cityNorm)) score += 3;
      if (domainWithoutTld.includes(cityNorm.replace(/\s/g, ''))) score += 5;
    }
  }

  // Penalty: diocese-level domain ("archdiocese" contains "diocese")
  if (['diocese', 'bistum', 'diecezja'].some(marker => domainWithoutTld.includes(marker))) {
    score -= 5;
  }

  // Penalty: deep URL path
  if (pathname.split('/').filter(Boolean).length > 3) {
    score -= 5;
  }

  return score;
}
|
|
|
|
/**
 * Rank search results and return the best candidate URLs for verification.
 *
 * Blocked URLs are dropped, the rest are scored with `scoreUrl`, and up to
 * five URLs with a score of at least 1 are returned, best first. The bar is
 * deliberately low because `verifyUrl()` is the real quality gate; several
 * candidates are returned so verification can fall back to the next one.
 *
 * @param results - Raw search hits.
 * @param church - Church the hits are evaluated against.
 * @returns At most five URLs ordered by descending score.
 */
function pickCandidateUrls(results: SearchResult[], church: ChurchRecord): string[] {
  const ranked: Array<{ url: string; score: number }> = [];
  for (const hit of results) {
    if (isBlockedUrl(hit.url)) continue;
    ranked.push({ url: hit.url, score: scoreUrl(hit, church) });
  }

  ranked.sort((a, b) => b.score - a.score);

  const picked: string[] = [];
  for (const { url, score } of ranked) {
    // Sorted descending: the first score below 1 ends the usable prefix.
    if (score < 1) break;
    picked.push(url);
    if (picked.length === 5) break;
  }
  return picked;
}
|
|
|
|
// --- URL Verification ---
|
|
|
|
// Combining marks for accent entities such as &eacute; or &uuml;.
const HTML_ACCENT_MARKS: Record<string, string> = {
  acute: '\u0301',
  grave: '\u0300',
  circ: '\u0302',
  uml: '\u0308',
  tilde: '\u0303',
  cedil: '\u0327',
  ring: '\u030a',
  caron: '\u030c',
};

// Letter entities that are not "base letter + accent".
const HTML_LETTER_ENTITIES: Record<string, string> = {
  szlig: 'ß',
  oslash: 'ø',
  aelig: 'æ',
  oelig: 'œ',
};

/**
 * Decode the character references that matter for text matching: numeric
 * references (`&#233;`, `&#xE9;`) and accented-letter entities (`&eacute;`).
 * Every other named entity (`&nbsp;`, `&amp;`, ...) becomes a space.
 */
function decodeHtmlEntities(text: string): string {
  return text
    .replace(/&#(x[0-9a-f]+|\d+);/gi, (_match: string, code: string) => {
      const isHex = code.charAt(0).toLowerCase() === 'x';
      const codePoint = parseInt(isHex ? code.slice(1) : code, isHex ? 16 : 10);
      // Out-of-range references would make fromCodePoint throw.
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : ' ';
    })
    .replace(
      /&([a-z])(acute|grave|circ|uml|tilde|cedil|ring|caron);/gi,
      (_match: string, letter: string, mark: string) =>
        (letter + (HTML_ACCENT_MARKS[mark.toLowerCase()] ?? '')).normalize('NFC'),
    )
    .replace(
      /&(szlig|oslash|aelig|oelig);/gi,
      (_match: string, name: string) => HTML_LETTER_ENTITIES[name.toLowerCase()] ?? ' ',
    )
    .replace(/&[a-z]+;/gi, ' ');
}

/**
 * Convert an HTML document to lower-case plain text for keyword matching.
 *
 * Script and style blocks are removed with their content, remaining tags are
 * replaced by spaces, character references are decoded (so "Th&eacute;r&egrave;se"
 * stays one word instead of being split at each entity), and whitespace runs
 * collapse to a single space. The result is not trimmed.
 *
 * @param html - Raw HTML.
 * @returns Lower-cased text content.
 */
function stripHtml(html: string): string {
  const withoutCode = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const withoutTags = withoutCode.replace(/<[^>]+>/g, ' ');

  return decodeHtmlEntities(withoutTags)
    .replace(/\s+/g, ' ')
    .toLowerCase();
}
|
|
|
|
/**
 * Phrases that mark a page as tourism/cultural content about a church (the
 * church as an attraction) rather than the church's own site. Matched as
 * lower-case substrings of the page text.
 */
const TOURISM_KEYWORDS = [
  // The word "tourism" itself, in several languages
  'tourism', 'turismo', 'tourisme', 'turisme', 'touristik', 'turistico',
  // Visitor-guide phrasing
  'attractions', 'things to do', 'sightseeing', 'sehenswürdigkeiten',
  'what to see', 'places to visit', 'travel guide', 'reiseführer',
  // Heritage listings
  'patrimoine', 'heritage trail', 'cultural heritage',
  // Points of interest
  'punto de interés', 'point of interest', 'points of interest',
];
|
|
|
|
/**
 * Phrases announcing Mass/service times. A page containing one of them is
 * almost certainly the church's own site. Matched as lower-case substrings of
 * the page text.
 */
const MASS_SCHEDULE_KEYWORDS = [
  // English
  'mass schedule', 'mass times', 'worship schedule', 'worship times',
  'service times', 'sunday mass', 'weekday mass',
  // Spanish / French
  'horario de misas', 'horarios de misa', 'horaires des messes',
  // German
  'gottesdienst', 'gottesdienstzeiten', 'messzeiten',
  // Polish
  'msze święte', 'godziny mszy', 'msze św',
  // Italian
  'orari delle messe', 'orario messe',
  // Portuguese
  'horário das missas',
];
|
|
|
|
/**
 * Fetch a candidate URL and decide whether the page belongs to this church.
 *
 * Hard rejections:
 *  - the page cannot be fetched, or the body is not text;
 *  - a tourism page without Mass-schedule wording;
 *  - a deep URL (> 2 path segments) whose host does not contain a name word
 *    and which has no Mass-schedule wording.
 *
 * Acceptance (any one rule):
 *  - Mass-schedule wording and at least one name word on the page;
 *  - a name word in the hostname, a name word on the page and a Catholic keyword;
 *  - at least one name word together with an address match;
 *  - with city data: two name words, or one name word plus the city;
 *  - without city data: three name words.
 *
 * Name, city and address words come out of `normalizeForMatch`, so they are
 * compared against page text normalized the same way. Keyword lists contain
 * accents and are matched against the plain lower-cased text.
 *
 * Up to 2 MB is downloaded and only the first 200,000 characters are
 * analysed. (axios rejects a response that exceeds `maxContentLength`; it
 * does not truncate it, so a tight limit would reject larger pages outright.)
 *
 * @param url - Candidate URL to fetch.
 * @param church - Church the page is checked against.
 * @returns `true` when the page appears to be the church's own site.
 */
async function verifyUrl(url: string, church: ChurchRecord): Promise<boolean> {
  const maxDownloadBytes = 2_000_000;
  const maxAnalyzedChars = 200_000;

  try {
    const response = await axios.get(url, {
      timeout: 10000,
      maxRedirects: 3,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; NearestMass/1.0; +https://nearestmass.com)',
        'Accept': 'text/html',
      },
      maxContentLength: maxDownloadBytes,
      responseType: 'text',
    });

    if (typeof response.data !== 'string') return false;

    // pageText keeps accents for keyword lists; matchText mirrors the
    // normalization applied to name/city/address words.
    const pageText = stripHtml(response.data.slice(0, maxAnalyzedChars));
    const matchText = normalizeForMatch(pageText);
    const pageHasAny = (keywords: string[]): boolean => keywords.some(kw => pageText.includes(kw));

    // How many significant name words appear on the page
    const nameWords = getSignificantWords(church.name);
    const nameMatches = nameWords.filter(word => matchText.includes(word)).length;

    // City name on the page
    const cityNorm = church.city ? normalizeForMatch(church.city) : '';
    const cityMatch = cityNorm.length > 2 && matchText.includes(cityNorm);

    // Address on the page: two or more street words (>= 4 chars, not purely
    // numeric) are a strong location signal
    let addressMatch = false;
    if (church.address) {
      const addrWords = normalizeForMatch(church.address)
        .split(' ')
        .filter(w => w.length >= 4 && !/^\d+$/.test(w));
      addressMatch = addrWords.filter(w => matchText.includes(w)).length >= 2;
    }

    const hasCatholicKeyword = pageHasAny(CATHOLIC_KEYWORDS);
    const hasMassSchedule = pageHasAny(MASS_SCHEDULE_KEYWORDS);
    const isTourismPage = pageHasAny(TOURISM_KEYWORDS);

    // Hostname contains a name word (>= 5 chars to avoid matches such as
    // "rosa" in "rosaparks"); deep paths usually mean an article ABOUT a church.
    let domainMatchesName = false;
    let isDeepUrl = false;
    try {
      const parsed = new URL(url);
      const hostname = parsed.hostname.toLowerCase();
      domainMatchesName = nameWords.some(w => w.length >= 5 && hostname.includes(w));
      isDeepUrl = parsed.pathname.split('/').filter(Boolean).length > 2;
    } catch {
      // Unparseable URL: neither signal applies.
    }

    // --- Hard rejections ---
    if (isTourismPage && !hasMassSchedule) return false;
    if (isDeepUrl && !domainMatchesName && !hasMassSchedule) return false;

    // --- Acceptance rules ---
    const hasAnyNameWord = nameMatches >= 1;
    if (hasMassSchedule && hasAnyNameWord) return true;
    if (domainMatchesName && hasAnyNameWord && hasCatholicKeyword) return true;
    if (hasAnyNameWord && addressMatch) return true;

    const hasCity = !!(church.city && church.city.trim());
    if (hasCity) {
      // City data disambiguates, so name + city is enough.
      return nameMatches >= 2 || (hasAnyNameWord && cityMatch);
    }
    // Without city data only a very distinctive name (3+ words) is accepted.
    return nameMatches >= 3;
  } catch {
    // If the page cannot be fetched (timeout, SSL error, size limit), reject it.
    return false;
  }
}
|
|
|
|
// --- Search Query Construction ---
|
|
|
|
/**
 * Build the standard search query for a church.
 *
 * Terms, in order: the quoted name, the city, the state (US/CA/AU/BR only),
 * the diocese, the country's parish keyword, and "official website". Missing
 * or empty terms are left out.
 *
 * @param church - Church to search for.
 * @returns Space-separated query string.
 */
function buildSearchQuery(church: ChurchRecord): string {
  const usesState = ['US', 'CA', 'AU', 'BR'].includes(church.country);

  const terms = [
    `"${church.name}"`,
    church.city,
    usesState ? church.state : null,
    // Diocese helps disambiguate churches with common names
    church.diocese,
    COUNTRY_KEYWORDS[church.country],
    'official website',
  ];

  return terms.filter(Boolean).join(' ');
}
|
|
|
|
/**
 * Build the query variations used in re-search mode.
 *
 * Produces between two and four distinct queries:
 *  1. the standard query (quoted name + location + keyword);
 *  2. the unquoted name (words of 3+ characters) + city + country keyword,
 *     which catches accent/spelling variations;
 *  3. distinctive name words + city + parish keyword (only when the church
 *     has a city and at least one distinctive word);
 *  4. the unquoted name + city restricted to the country's TLD with `site:`
 *     (only for countries listed in COUNTRY_TLDS, excluding US and GB).
 *
 * @param church - Church to search for.
 * @returns Deduplicated queries in the order above.
 */
function buildSearchQueries(church: ChurchRecord): string[] {
  const { name, city, country } = church;
  const simpleName = name
    .split(/\s+/)
    .filter(w => w.length >= 3)
    .join(' ');

  // 1: standard query
  const queries = [buildSearchQuery(church)];

  // 2: unquoted name + city + keyword
  const unquoted = [simpleName];
  if (city) unquoted.push(city);
  const keyword = COUNTRY_KEYWORDS[country];
  if (keyword) unquoted.push(keyword);
  queries.push(unquoted.join(' '));

  // 3: distinctive words + city + local parish keyword
  const sigWords = getSignificantWords(name);
  if (sigWords.length > 0 && city) {
    const parishKeyword = PARISH_KEYWORDS[country];
    const distinctive = [...sigWords, city, ...(parishKeyword ? [parishKeyword] : [])];
    queries.push(distinctive.join(' '));
  }

  // 4: country-TLD restricted search (non-English countries)
  const countryTld = COUNTRY_TLDS[country];
  if (countryTld && country !== 'US' && country !== 'GB') {
    // COUNTRY_TLDS holds regex fragments; turn "\." back into a plain dot.
    const tldClean = countryTld.replace(/\\\./g, '.');
    const restricted = [simpleName, ...(city ? [city] : []), `site:.${tldClean}`];
    queries.push(restricted.join(' '));
  }

  return Array.from(new Set(queries));
}
|
|
|
|
// --- FreeSearch API ---
|
|
|
|
/**
 * Run one query against the FreeSearch `/api/search` endpoint.
 *
 * @param query - Search query string, sent as the `q` parameter.
 * @returns The response body as received (it is not validated here).
 *   Request errors and the 30 s timeout propagate to the caller.
 */
async function searchSingle(query: string): Promise<FreeSearchResponse> {
  const endpoint = `${FREESEARCH_URL}/api/search`;
  const { data } = await axios.get(endpoint, {
    params: { q: query },
    timeout: 30000,
  });
  return data;
}
|
|
|
|
/**
 * Probe the FreeSearch health endpoint.
 *
 * @returns `true` only when the endpoint answers with HTTP 200 within five
 *   seconds; any request error yields `false`.
 */
async function healthCheck(): Promise<boolean> {
  let status: number | undefined;
  try {
    ({ status } = await axios.get(`${FREESEARCH_URL}/api/health`, { timeout: 5000 }));
  } catch {
    return false;
  }
  return status === 200;
}
|
|
|
|
/**
 * Block until FreeSearch answers its health check or shutdown is requested.
 *
 * Retries with exponential backoff, starting at 30 s and doubling up to a
 * five-minute ceiling. Returns without a successful check when shutdown is
 * requested.
 */
async function waitForFreeSearch(): Promise<void> {
  const maxBackoffMs = 300_000; // 5 minutes
  let backoffMs = 30_000;

  for (let attempt = 1; !shuttingDown; attempt++) {
    if (await healthCheck()) {
      // Only announce recovery if at least one attempt failed first.
      if (attempt > 1) log('FreeSearch is back. Continuing...');
      return;
    }

    const waitSec = Math.round(backoffMs / 1000);
    logError(`FreeSearch not reachable at ${FREESEARCH_URL} (attempt ${attempt}). Retrying in ${waitSec}s...`);
    await sleep(backoffMs);
    backoffMs = Math.min(backoffMs * 2, maxBackoffMs);
  }
}
|
|
|
|
// --- Database Queries ---
|
|
|
|
// Cutoff timestamp for re-search: only re-search churches searched BEFORE script start
|
|
// Evaluated once when the module loads, so it marks the start of this script run.
const reSearchCutoff = new Date();
|
|
|
|
/**
 * Fetch the next batch of OSM churches that have a city but no website.
 *
 * Three modes:
 *  - `reSearch`: churches already searched before this run started (see
 *    `reSearchCutoff`), optionally limited to `countryCode`; reverse-geocoded
 *    churches come first, then oldest first. The cutoff keeps churches
 *    re-searched during this run from being picked again.
 *  - `countryCode` only: never-searched churches of that country, oldest first.
 *  - neither: walks COUNTRY_PRIORITY, taking a small slice of never-searched
 *    churches per country until the batch is full.
 *
 * @param batchSize - Maximum number of churches to return.
 * @param countryCode - Optional country filter.
 * @param reSearch - Select previously searched churches instead of new ones.
 * @returns Up to `batchSize` churches; empty when nothing is left.
 */
async function getNextBatch(
  batchSize: number,
  countryCode?: string,
  reSearch?: boolean
): Promise<ChurchRecord[]> {
  const selectFields = {
    id: true, name: true, address: true, city: true, state: true,
    country: true, latitude: true, longitude: true, diocese: true,
  };

  // Shared by every mode: OSM-sourced, no website yet, non-empty city.
  const needsWebsite = {
    source: 'osm',
    website: null,
    city: { not: null },
    NOT: { city: '' },
  };

  if (reSearch) {
    return prisma.church.findMany({
      where: {
        ...needsWebsite,
        freeSearchedAt: { not: null, lt: reSearchCutoff },
        ...(countryCode ? { country: countryCode } : {}),
      },
      select: selectFields,
      take: batchSize,
      // Prioritize reverse-geocoded churches (got city after first search)
      orderBy: [{ reverseGeocodedAt: { sort: 'desc', nulls: 'last' } }, { createdAt: 'asc' }],
    });
  }

  // First-pass lookup: never-searched churches of one country, oldest first.
  const fetchUnsearched = (country: string, take: number) =>
    prisma.church.findMany({
      where: { ...needsWebsite, freeSearchedAt: null, country },
      select: selectFields,
      take,
      orderBy: { createdAt: 'asc' },
    });

  if (countryCode) {
    return fetchUnsearched(countryCode, batchSize);
  }

  // No filter: take a few from each priority country, in priority order.
  const perCountry = Math.max(1, Math.ceil(batchSize / COUNTRY_PRIORITY.length));
  const churches: ChurchRecord[] = [];

  for (const country of COUNTRY_PRIORITY) {
    if (churches.length >= batchSize) break;
    churches.push(...(await fetchUnsearched(country, perCountry)));
  }

  return churches.slice(0, batchSize);
}
|
|
|
|
// --- Main Processing ---
|
|
|
|
/**
 * Run the search -> store -> verify -> persist pipeline for each church in a batch.
 *
 * For every church this:
 *  1. runs one query (first pass) or several query variations (`reSearch`)
 *     through `searchSingle`, merging the results and de-duplicating by URL;
 *  2. stores the non-blocked results in ChromaDB when a collection is given
 *     (best effort: failures are logged and otherwise ignored);
 *  3. verifies the URLs returned by `pickCandidateUrls` in order and saves the
 *     first one that passes as the church's website;
 *  4. stamps `freeSearchedAt` on the church.
 *
 * In dry-run mode searches and verification still run, but nothing is written
 * to the database or to ChromaDB.
 *
 * Every `stats.processed % 10 === 0` the job progress is saved and a stop
 * request is checked; this happens for every outcome, including "no results".
 *
 * @param churches         Churches to process, in order.
 * @param stats            Running counters, mutated in place.
 * @param dryRun           When true, skip all database and ChromaDB writes.
 * @param jobId            Background job to report progress to, if any.
 * @param chromaCollection Collection that receives raw search results, if any.
 * @param reSearch         When truthy, use the multi-query strategy.
 * @throws Re-throws any per-church error after counting it in `stats.errors`,
 *         which aborts the rest of the batch. A church for which EVERY query
 *         failed is treated as such an error and is NOT stamped as searched,
 *         so a search outage does not mark churches as "no website found".
 */
async function processBatch(
  churches: ChurchRecord[],
  stats: EnrichmentStats,
  dryRun: boolean,
  jobId?: string | null,
  chromaCollection?: Collection | null,
  reSearch?: boolean
): Promise<void> {
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  // Record that the church has been searched (skipped in dry-run).
  const markSearched = async (church: ChurchRecord): Promise<void> => {
    if (dryRun) return;
    await prisma.church.update({
      where: { id: church.id },
      data: { freeSearchedAt: new Date() },
    });
  };

  for (const church of churches) {
    if (shuttingDown) break;
    stats.processed++;

    const label = `${church.name} (${church.city || 'unknown'}, ${church.country})`;

    // Multi-query for re-search mode, single query for first-pass
    const queries = reSearch ? buildSearchQueries(church) : [buildSearchQuery(church)];

    try {
      // Search with all queries and merge the results. The map both
      // de-duplicates by URL and remembers which query first produced it.
      const results: SearchResult[] = [];
      const queryByUrl = new Map<string, string>();
      let failedQueries = 0;
      let lastQueryError: unknown;

      for (const [index, query] of queries.entries()) {
        try {
          const response = await searchSingle(query);
          for (const r of response.results || []) {
            if (!queryByUrl.has(r.url)) {
              queryByUrl.set(r.url, query);
              results.push(r);
            }
          }
        } catch (error: unknown) {
          failedQueries++;
          lastQueryError = error;
          logError(`  Query failed: ${query} — ${describeError(error)}`);
        }
        // Brief pause between queries to not overwhelm FreeSearch
        // (not needed after the last one).
        if (index < queries.length - 1) await sleep(500);
      }

      // A partial failure still leaves usable results. A total failure says
      // nothing about the church, so surface it as an error instead of
      // recording "no results" and stamping the church as searched.
      if (queries.length > 0 && failedQueries === queries.length) {
        throw new Error(
          `all ${queries.length} search queries failed (last error: ${describeError(lastQueryError)})`
        );
      }

      if (reSearch && queries.length > 1) {
        log(`  ? ${label} => ${queries.length} queries, ${results.length} unique results`);
      }

      const candidateUrls = pickCandidateUrls(results, church);
      const queryFor = (url: string): string => queryByUrl.get(url) ?? queries[0] ?? '';

      // Store all non-blocked search results in ChromaDB for later analysis
      if (chromaCollection && results.length > 0 && !dryRun) {
        try {
          const nonBlocked = results.filter(r => !isBlockedUrl(r.url));
          if (nonBlocked.length > 0) {
            const docs = nonBlocked.map(r =>
              `${r.title || ''} ${r.description || ''} ${r.url}`.trim()
            );
            const embeddings = await embed(docs);
            await chromaCollection.upsert({
              ids: nonBlocked.map(r => `search-${church.id}-${hashUrl(r.url)}`),
              embeddings,
              documents: docs,
              metadatas: nonBlocked.map(r => ({
                churchId: church.id,
                churchName: church.name,
                churchCity: church.city || '',
                churchCountry: church.country,
                searchQuery: queryFor(r.url),
                resultUrl: r.url,
                resultTitle: r.title || '',
                score: scoreUrl(r, church),
              })),
            });
          }
        } catch (e: unknown) {
          logError(`ChromaDB store failed: ${describeError(e)}`);
        }
      }

      // No branch below uses `continue`, so the job-tracking and progress
      // code at the bottom of the loop runs for every outcome.
      if (results.length === 0) {
        log(`  - ${label} => no results`);
        stats.notFound++;
        await markSearched(church);
      } else if (candidateUrls.length === 0) {
        log(`  - ${label} => no candidates above threshold`);
        stats.notFound++;
        await markSearched(church);
      } else {
        // Try each candidate URL with verification; the first success wins.
        let verifiedUrl: string | null = null;
        for (const url of candidateUrls) {
          const ok = await verifyUrl(url, church);
          if (ok) {
            verifiedUrl = url;
            stats.verified++;
            break;
          } else {
            stats.verifyFailed++;
          }
        }

        if (verifiedUrl) {
          log(`  + ${label} => ${verifiedUrl}`);
          stats.enriched++;
          if (!dryRun) {
            await prisma.church.update({
              where: { id: church.id },
              data: {
                website: verifiedUrl,
                hasWebsite: true,
                freeSearchedAt: new Date(),
              },
            });
            // Mark the verified result in ChromaDB. The update replaces the
            // metadata, so every field is re-sent, including the original
            // title and score when the URL matches a search result.
            if (chromaCollection) {
              const confirmedUrl = verifiedUrl;
              const verifiedResult = results.find(r => r.url === confirmedUrl);
              try {
                await chromaCollection.update({
                  ids: [`search-${church.id}-${hashUrl(confirmedUrl)}`],
                  metadatas: [{
                    churchId: church.id,
                    churchName: church.name,
                    churchCity: church.city || '',
                    churchCountry: church.country,
                    searchQuery: queryFor(confirmedUrl),
                    resultUrl: confirmedUrl,
                    resultTitle: verifiedResult?.title || '',
                    score: verifiedResult ? scoreUrl(verifiedResult, church) : 0,
                    verified: true,
                  }],
                });
              } catch { /* ignore — entry may not exist if ChromaDB was down during store */ }
            }
          }
        } else {
          log(`  ~ ${label} => ${candidateUrls.length} candidates failed verification`);
          stats.notFound++;
          await markSearched(church);
        }
      }
    } catch (error: unknown) {
      stats.errors++;
      logError(`  ! ${label} => error: ${describeError(error)}`);
      throw error;
    }

    // Job tracking: update progress every 10 items
    if (jobId && stats.processed % 10 === 0) {
      await updateJobProgress(jobId, stats, churches.length);
      const stopping = await checkJobStopping(jobId);
      if (stopping) {
        log('Job stop requested via admin dashboard.');
        shuttingDown = true;
        break;
      }
    }

    // Progress logging
    if (stats.processed % PROGRESS_INTERVAL === 0) {
      const elapsed = (Date.now() - stats.startTime) / 1000;
      const rate = Math.round((stats.processed / elapsed) * 3600);
      const hitRate = stats.processed > 0
        ? ((stats.enriched / stats.processed) * 100).toFixed(1)
        : '0.0';
      log(`Progress: ${stats.processed} processed, ${stats.enriched} enriched, ${stats.notFound} not found, ${stats.errors} errors`);
      log(`Hit rate: ${hitRate}%, Verified: ${stats.verified}, Verify failed: ${stats.verifyFailed}, Rate: ~${rate} churches/hour`);
    }
  }
}
|
|
|
|
/**
 * Process churches batch by batch until no more are returned, `limit` is
 * reached, shutdown is requested, or five batches in a row fail.
 *
 * Dry runs never stamp `freeSearchedAt`, so `getNextBatch` would hand back the
 * same rows on every iteration. To keep a dry run moving forward (and to make
 * it terminate when no `limit` is given), the ids already handled are
 * remembered and each fetch is widened by that many rows so the unseen ones
 * can be picked out. This makes later dry-run fetches larger; real runs are
 * unaffected.
 *
 * @param stats            Running counters, mutated in place by processBatch.
 * @param countryCode      Optional country filter passed to getNextBatch.
 * @param limit            Maximum number of churches to attempt; a falsy value
 *                         (undefined, 0, NaN) means no limit.
 * @param dryRun           When true, nothing is written by processBatch.
 * @param jobId            Background job to report progress to, if any.
 * @param chromaCollection Collection that receives raw search results, if any.
 * @param reSearch         When truthy, re-search previously searched churches.
 */
async function runSinglePass(
  stats: EnrichmentStats,
  countryCode?: string,
  limit?: number,
  dryRun: boolean = false,
  jobId?: string | null,
  chromaCollection?: Collection | null,
  reSearch?: boolean
): Promise<void> {
  let totalProcessed = 0;
  let consecutiveErrors = 0;
  // Ids already handled in this dry run (stays empty for real runs).
  const dryRunSeenIds = new Set<unknown>();

  while (!shuttingDown) {
    if (limit && totalProcessed >= limit) break;

    const batchLimit = limit
      ? Math.min(BATCH_SIZE, limit - totalProcessed)
      : BATCH_SIZE;

    let churches = await getNextBatch(
      dryRun ? dryRunSeenIds.size + batchLimit : batchLimit,
      countryCode,
      reSearch
    );
    if (dryRun) {
      churches = churches
        .filter(candidate => !dryRunSeenIds.has(candidate.id))
        .slice(0, batchLimit);
    }
    if (churches.length === 0) break;

    // processBatch increments stats.processed once per church it starts on,
    // so the delta is the number of churches actually attempted even when
    // the batch aborts part-way through.
    const processedBefore = stats.processed;

    try {
      await processBatch(churches, stats, dryRun, jobId, chromaCollection, reSearch);
      consecutiveErrors = 0;
    } catch (error: unknown) {
      consecutiveErrors++;
      if (consecutiveErrors >= 5) {
        logError(`5 consecutive batch errors. Stopping.`);
        break;
      }
      const message = error instanceof Error ? error.message : String(error);
      logError(`Batch error (${consecutiveErrors}/5): ${message}`);
    }

    const attempted = stats.processed - processedBefore;
    totalProcessed += attempted;
    if (dryRun) {
      for (const handled of churches.slice(0, attempted)) {
        dryRunSeenIds.add(handled.id);
      }
    }

    // A short batch means the queue is (nearly) drained, so no pause is needed.
    if (!shuttingDown && churches.length === batchLimit) {
      await sleep(INTER_BATCH_DELAY_MS);
    }
  }
}
|
|
|
|
/**
 * Run the enrichment loop indefinitely until shutdown is requested.
 *
 * Work is organised in cycles. A cycle pulls batches until `getNextBatch`
 * returns nothing; then the loop waits (one hour if the cycle processed no
 * batch at all, ten seconds otherwise) and starts the next cycle.
 *
 * This mode always queries without a country filter and always calls
 * `processBatch` with `dryRun = false`, i.e. it writes to the database.
 *
 * Batch failures are reported to the circuit breaker; while the breaker is
 * open, `checkAndWait()` is consulted before any further batch is fetched.
 *
 * @param stats            Running counters, mutated in place.
 * @param jobId            Background job to report progress to, if any.
 * @param chromaCollection Collection that receives raw search results, if any.
 * @param reSearch         When truthy, re-search previously searched churches.
 */
async function runContinuous(stats: EnrichmentStats, jobId?: string | null, chromaCollection?: Collection | null, reSearch?: boolean): Promise<void> {
  log('Running in continuous mode (24/7). Press Ctrl+C to stop.');
  const circuitBreaker = new CircuitBreaker();

  while (!shuttingDown) {
    stats.cycles++;
    log(`--- Cycle ${stats.cycles} ---`);
    let batchesInCycle = 0;

    while (!shuttingDown) {
      // Circuit breaker check
      if (circuitBreaker.opened) {
        const ok = await circuitBreaker.checkAndWait();
        if (!ok) continue;
      }

      const churches = await getNextBatch(BATCH_SIZE, undefined, reSearch);
      if (churches.length === 0) break;

      const errorsBefore = stats.errors;
      try {
        await processBatch(churches, stats, false, jobId, chromaCollection, reSearch);
        circuitBreaker.reset();
        batchesInCycle++;
      } catch (error: unknown) {
        // processBatch increments stats.errors itself before re-throwing a
        // per-church failure. Count here only when it has not, so one failure
        // is never reported twice.
        if (stats.errors === errorsBefore) stats.errors++;
        circuitBreaker.recordFailure();
        const message = error instanceof Error ? error.message : String(error);
        logError(`Batch error: ${message}`);
      }

      if (!shuttingDown) {
        await sleep(INTER_BATCH_DELAY_MS);
      }
    }

    if (shuttingDown) break;

    if (batchesInCycle === 0) {
      log('No unsearched churches found. Waiting 1 hour...');
      // Sleep 1 hour in 10s slices so a shutdown request is noticed promptly
      for (let i = 0; i < 360 && !shuttingDown; i++) {
        await sleep(10000);
      }
    } else {
      log(`Cycle ${stats.cycles} complete. ${batchesInCycle} batches processed. Brief pause...`);
      await sleep(10000);
    }
  }
}
|
|
|
|
// --- Main ---
|
|
|
|
/**
 * CLI entry point: parse flags, wait for FreeSearch, set up job tracking and
 * the optional ChromaDB collection, run the selected mode, then print a summary
 * and close the database connections.
 *
 * Flags: `--country <code>`, `--limit <n>`, `--dry-run`, `--continuous`,
 * `--re-search` (`--job-id` is read by createOrResumeJob).
 *
 * Argument handling:
 *  - `--country` and `--limit` must be followed by a value; `--limit` must be
 *    a non-negative integer (0 keeps its previous meaning of "no limit").
 *  - `--continuous` cannot be combined with `--dry-run`, because continuous
 *    mode always writes to the database.
 *  - `--country` and `--limit` are not used by continuous mode; a warning is
 *    logged when they are supplied together with `--continuous`.
 *
 * @throws Error on invalid arguments or when the run itself fails. In the
 *         latter case the job row is first marked as failed (best effort).
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const continuous = args.includes('--continuous');
  const reSearch = args.includes('--re-search');

  // Read the token following `flag`; reject a missing value or another flag.
  const readFlagValue = (flag: string): string | undefined => {
    const flagIndex = args.indexOf(flag);
    if (flagIndex === -1) return undefined;
    const value = args[flagIndex + 1];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  const countryCode = readFlagValue('--country');

  const limitArg = readFlagValue('--limit');
  let limit: number | undefined;
  if (limitArg !== undefined) {
    // Without this check a typo would parse to NaN and silently remove the limit.
    if (!/^\d+$/.test(limitArg)) {
      throw new Error(`--limit must be a non-negative integer, got "${limitArg}"`);
    }
    limit = Number.parseInt(limitArg, 10);
  }

  if (continuous && dryRun) {
    throw new Error('--dry-run cannot be combined with --continuous: continuous mode always writes to the database');
  }

  // Graceful shutdown
  process.on('SIGTERM', () => {
    log('Received SIGTERM, finishing current batch...');
    shuttingDown = true;
  });
  process.on('SIGINT', () => {
    log('Received SIGINT, finishing current batch...');
    shuttingDown = true;
  });

  log('============================================================');
  log('FreeSearch Church Website Enrichment');
  log('============================================================');
  log(`FreeSearch URL: ${FREESEARCH_URL}`);
  log(`Mode: ${continuous ? 'Continuous (24/7)' : 'Single pass'}${reSearch ? ' (RE-SEARCH)' : ''}`);
  log(`Re-search: ${reSearch ? 'Yes (multi-query, previously searched churches)' : 'No'}`);
  log(`Country: ${countryCode || 'All (round-robin priority)'}`);
  log(`Limit: ${limit || 'No limit'}`);
  log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  log(`Batch size: ${BATCH_SIZE}`);
  log('============================================================');

  if (continuous && (countryCode !== undefined || limit !== undefined)) {
    log('WARNING: --country and --limit are ignored in continuous mode');
  }

  // Wait for FreeSearch to be reachable (indefinite retry with backoff)
  log('Waiting for FreeSearch to be reachable...');
  await waitForFreeSearch();
  if (shuttingDown) return;
  log('FreeSearch health check: OK');

  // ChromaDB connection (optional — results stored if available)
  let chromaCollection: Collection | null = null;
  try {
    chromaCollection = await getCollection(COLLECTION_NAMES.SEARCH_RESULTS);
    log('ChromaDB search_results collection connected');
  } catch {
    log('ChromaDB unavailable — results will not be stored');
  }

  // Job tracking — clean up any running jobs left by a previous container restart
  await prisma.backgroundJob.updateMany({
    where: { type: 'freesearch-enrichment', status: 'running' },
    data: { status: 'failed', error: 'Container restarted', completedAt: new Date() },
  });

  let jobId = await createOrResumeJob(args);
  if (!jobId) {
    jobId = await createNewJob({ countryCode, limit, continuous, dryRun, reSearch });
  }
  log(`Job ID: ${jobId}`);

  const stats: EnrichmentStats = {
    processed: 0,
    enriched: 0,
    notFound: 0,
    errors: 0,
    verified: 0,
    verifyFailed: 0,
    cycles: 0,
    startTime: Date.now(),
  };

  try {
    if (continuous) {
      await runContinuous(stats, jobId, chromaCollection, reSearch);
    } else {
      await runSinglePass(stats, countryCode, limit, dryRun, jobId, chromaCollection, reSearch);
    }
  } catch (error: unknown) {
    // Do not leave the job row in 'running' after a crash. This is best
    // effort: the original error is what gets re-thrown.
    if (jobId) {
      const message = error instanceof Error ? error.message : String(error);
      try {
        await prisma.backgroundJob.update({
          where: { id: jobId },
          data: { status: 'failed', error: message, completedAt: new Date() },
        });
      } catch (updateError: unknown) {
        const updateMessage = updateError instanceof Error ? updateError.message : String(updateError);
        logError(`Could not mark job ${jobId} as failed: ${updateMessage}`);
      }
    }
    throw error;
  }

  // Complete job
  if (jobId) {
    await updateJobProgress(jobId, stats, 0);
    await completeJob(jobId);
  }

  // Print summary
  const elapsed = ((Date.now() - stats.startTime) / 1000).toFixed(1);
  const hitRate = stats.processed > 0
    ? ((stats.enriched / stats.processed) * 100).toFixed(1)
    : '0.0';

  log('');
  log('============================================================');
  log('Enrichment Summary');
  log('============================================================');
  log(`Churches processed: ${stats.processed}`);
  log(`Websites found:     ${stats.enriched}`);
  log(`No website found:   ${stats.notFound}`);
  log(`Errors:             ${stats.errors}`);
  log(`URLs verified:      ${stats.verified}`);
  log(`Verify rejected:    ${stats.verifyFailed}`);
  log(`Hit rate:           ${hitRate}%`);
  log(`Elapsed:            ${elapsed}s`);
  if (stats.cycles > 0) {
    log(`Cycles completed:   ${stats.cycles}`);
  }
  log('============================================================');

  await prisma.$disconnect();
  await pool.end();
}
|
|
|
|
// Last-resort handler: log the failure and exit non-zero. The rejection value
// is not guaranteed to be an Error, so it is narrowed before reading `.message`.
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${message}`);
  process.exit(1);
});
|