chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
623
scripts/match-search-results.ts
Normal file
623
scripts/match-search-results.ts
Normal file
@@ -0,0 +1,623 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Second-pass matching: analyze stored ChromaDB search results to find websites
|
||||
* that the FreeSearch first pass missed.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/match-search-results.ts --dry-run
|
||||
* npx tsx scripts/match-search-results.ts --country IT --limit 100
|
||||
* npx tsx scripts/match-search-results.ts --threshold 0.3
|
||||
*
|
||||
* Algorithm:
|
||||
* 1. Get churches without websites that have been FreeSearch'd
|
||||
* 2. Query ChromaDB search_results collection for semantically similar results
|
||||
* 3. Cross-church matching: URLs from nearby churches may match
|
||||
* 4. URL frequency analysis: URLs appearing for multiple churches in same area
|
||||
* 5. Verify best candidates against page content
|
||||
* 6. Update church.website if verified
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Collection } from 'chromadb';
|
||||
import axios from 'axios';
|
||||
import { getCollection, COLLECTION_NAMES } from '../src/chromadb/collections';
|
||||
import { embedSingle } from '../src/chromadb/embeddings';
|
||||
|
||||
// Script-local database wiring: a pg connection pool built from DATABASE_URL,
// wrapped in the Prisma pg driver adapter and handed to a new PrismaClient.
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({
  adapter,
});
|
||||
|
||||
// --- Job Tracking ---
|
||||
/**
 * Resumes an existing background job when `--job-id <id>` is present in the CLI arguments.
 *
 * The referenced job row is switched to `running` and its `startedAt` is reset to now.
 *
 * @param args Raw CLI arguments (without the node/script prefix).
 * @returns The resumed job id, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` is passed without a value (end of arguments or followed
 *   by another flag); Prisma errors from the update propagate unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) return null;

  const jobId: string | undefined = args[flagIndex + 1];
  // Fail with a clear message instead of handing `undefined` (or the next flag) to Prisma.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Inserts a `match-search-results` background job that is already marked as running.
 *
 * @param config Run parameters stored on the job row.
 * @returns The id of the created job.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'match-search-results',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return id;
}
|
||||
|
||||
/**
 * Writes the current progress counters onto the job row.
 *
 * @param jobId Job to update.
 * @param processed Number of churches handled so far.
 * @param found Number of websites matched so far (stored as `succeeded`).
 * @param total Number of churches in this run (stored as `totalItems`).
 */
async function updateJobProgress(jobId: string, processed: number, found: number, total: number): Promise<void> {
  const progress = {
    processed,
    succeeded: found,
    totalItems: total,
  };
  await prisma.backgroundJob.update({ where: { id: jobId }, data: progress });
}
|
||||
|
||||
/**
 * Reports whether the job row currently has status `stopping`.
 *
 * @param jobId Job to look up.
 * @returns `true` only when the job exists and its status is `stopping`.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const record = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!record) {
    return false;
  }
  return record.status === 'stopping';
}
|
||||
|
||||
/**
 * Closes out a job: `failed` when an error message is supplied, `completed` otherwise.
 *
 * @param jobId Job to finalise.
 * @param error Optional failure message stored on the job row.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const finalStatus = error ? 'failed' : 'completed';
  const finishedAt = new Date();

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: finalStatus, error, completedAt: finishedAt },
  });
}
|
||||
|
||||
// --- Types ---
|
||||
|
||||
/**
 * The church columns this script selects from the database (see the `select` in `main`).
 */
interface ChurchRecord {
  /** Primary key; used to update the church and to tell own results from cross-church ones. */
  id: string;
  /** Church name; its significant words drive page verification. */
  name: string;
  /** Street address, when known. */
  address: string | null;
  /** City, when known. */
  city: string | null;
  /** State/region, when known (selected but not read elsewhere in this script). */
  state: string | null;
  /** Country value; also used as the ChromaDB `churchCountry` filter. */
  country: string;
  /** Latitude (selected but not read elsewhere in this script). */
  latitude: number;
  /** Longitude (selected but not read elsewhere in this script). */
  longitude: number;
}
|
||||
|
||||
/**
 * Running counters for one execution of the script.
 */
interface MatchStats {
  /** Churches taken from the work list so far. */
  processed: number;
  /** Churches for which a candidate URL passed verification. */
  matched: number;
  /** Churches with no candidates within the threshold, or whose candidates all failed verification. */
  noResults: number;
  /** Individual candidate URLs rejected by verification (can exceed the number of churches). */
  verifyFailed: number;
  /** Churches whose processing threw. */
  errors: number;
  /** `Date.now()` captured when processing started; used for elapsed time and rate. */
  startTime: number;
}
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
// Cooperative stop flag: set by the SIGTERM/SIGINT handlers and by a job stop request
// in main(), which checks it before starting each church.
let shuttingDown = false;
|
||||
|
||||
/** Prints `msg` to stdout, prefixed with the current ISO-8601 timestamp in square brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
|
||||
|
||||
/** Prints `msg` to stderr, prefixed with the current ISO-8601 timestamp in square brackets. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ' + msg);
}
|
||||
|
||||
/**
 * Canonical form used when comparing names, cities and addresses.
 *
 * Lower-cases the input, removes everything that is not a letter, combining mark,
 * digit or whitespace, collapses whitespace runs to single spaces and trims.
 *
 * Letters are matched with Unicode property escapes rather than `[a-z]`, so accented
 * and non-Latin letters survive ("Kościół" -> "kościół"). An ASCII-only filter deletes
 * them ("koci"), which prevents the accented stop words in this file and accented page
 * text from ever matching. ASCII-only input produces the same output as an
 * `[a-z0-9]` filter would.
 *
 * @param str Raw text.
 * @returns The normalized text (possibly empty).
 */
function normalizeForMatch(str: string): string {
  return str
    .normalize('NFC') // compose "e" + combining accent so equal-looking text compares equal
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '')
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
|
||||
// Lower-case substrings whose presence in the stripped page text marks a page as
// parish/church related (checked in verifyUrl; the first hit wins).
const CATHOLIC_KEYWORDS = [
  'parish',
  'church',
  'catholic',
  'parroquia',
  'paroisse',
  'pfarrei',
  'parafia',
  'paroquia',
  'parrocchia',
  'farnost',
  'plebania',
  'parochie',
  'župnija',
  'farnosť',
  'iglesia',
  'église',
  'kirche',
  'kościół',
  'chiesa',
  'kostel',
  'templom',
  'kerk',
];
|
||||
|
||||
// Lower-case phrases that indicate a page lists mass/service times. verifyUrl treats a
// hit as strong evidence: it overrides the tourism and deep-URL rejections.
const MASS_SCHEDULE_KEYWORDS = [
  'mass schedule',
  'mass times',
  'worship schedule',
  'worship times',
  'service times',
  'sunday mass',
  'weekday mass',
  'horario de misas',
  'horarios de misa',
  'horaires des messes',
  'gottesdienst',
  'gottesdienstzeiten',
  'messzeiten',
  'msze święte',
  'godziny mszy',
  'msze św',
  'orari delle messe',
  'orario messe',
  'horário das missas',
];
|
||||
|
||||
// Lower-case phrases typical of tourism/sightseeing pages. verifyUrl rejects a page
// containing any of them unless it also contains a mass-schedule phrase.
const TOURISM_KEYWORDS = [
  'tourism',
  'turismo',
  'tourisme',
  'turisme',
  'touristik',
  'turistico',
  'attractions',
  'things to do',
  'sightseeing',
  'sehenswürdigkeiten',
  'what to see',
  'places to visit',
  'travel guide',
  'reiseführer',
  'patrimoine',
  'heritage trail',
  'cultural heritage',
  'punto de interés',
  'point of interest',
  'points of interest',
];
|
||||
|
||||
// Words too generic to identify one particular church: articles and prepositions,
// titles, common dedications/saint names and building types in several languages.
// Built once at module load instead of on every getSignificantWords call.
// Entries are compared against normalizeForMatch output, so an entry only takes
// effect in the form that function produces.
const NAME_STOP_WORDS: ReadonlySet<string> = new Set([
  'the', 'of', 'and', 'in', 'at', 'for', 'our', 'lady',
  'st', 'saint', 'saints', 'san', 'sant', 'santa', 'santo', 'sacred',
  'christ', 'jesus', 'mary', 'maria', 'king', 'lord', 'heart',
  'cross', 'queen', 'angel', 'angels', 'good', 'star',
  'nome', 'pere', 'madre', 'notre', 'dame', 'bien',
  'onze', 'lieve', 'vrouw', 'heer',
  'rosa', 'paul', 'anne', 'jean', 'joan', 'luke', 'marc',
  'rita', 'jose', 'leon', 'pius', 'roch', 'yves', 'ines',
  'vita', 'fara', 'bona',
  'cristo', 'fatima', 'lourdes', 'perpetuo', 'socorro', 'calvario',
  'rosario', 'pilar', 'carmen', 'dolores', 'remedios', 'nieves',
  'grotte', 'mission', 'sagrada', 'sagrado', 'familia',
  'guadalupe', 'assumption', 'immaculate', 'perpetual', 'divine',
  'knights', 'columbus',
  'house', 'home', 'hall', 'center', 'centre', 'centro',
  'deacon', 'priest', 'bishop', 'father', 'sister', 'brother',
  'school', 'academy', 'college', 'seminary', 'rectory', 'retreat',
  'church', 'parish', 'catholic', 'roman', 'holy', 'chapel',
  'cathedral', 'basilica', 'shrine', 'convent', 'monastery',
  'chapelle', 'eglise', 'église', 'paroisse', 'couvent',
  'iglesia', 'parroquia', 'capilla', 'ermita', 'convento', 'basílica',
  'kirche', 'kapelle', 'pfarrei', 'kloster',
  'chiesa', 'parrocchia', 'cappella', 'oratorio',
  'igreja', 'capela', 'paroquia',
  'kościół', 'kaplica', 'parafia', 'droga',
  'kostel', 'kaple', 'farnost', 'templom', 'kápolna',
  'de', 'la', 'le', 'les', 'du', 'des', 'el', 'los', 'las',
  'di', 'del', 'della', 'delle', 'degli',
  'do', 'da', 'dos', 'das',
  'und', 'der', 'die', 'von',
  'nad', 'pod', 'przy',
]);

/**
 * Extracts the words of a church name that are distinctive enough to search a page for.
 *
 * The name is normalized with `normalizeForMatch`, split on spaces, and every word that
 * is shorter than three characters or listed in the stop-word set is dropped.
 * Duplicate words are kept, in their original order.
 *
 * @param name Church name as stored.
 * @returns The remaining words (possibly empty).
 */
function getSignificantWords(name: string): string[] {
  return normalizeForMatch(name)
    .split(' ')
    .filter((word) => word.length >= 3 && !NAME_STOP_WORDS.has(word));
}
|
||||
|
||||
/**
 * Decodes the payload of a numeric character reference ("233" or "xE9").
 * Returns a single space for values that are not valid Unicode code points.
 */
function decodeNumericEntity(ref: string): string {
  const isHex = ref.startsWith('x') || ref.startsWith('X');
  const codePoint = isHex ? Number.parseInt(ref.slice(1), 16) : Number.parseInt(ref, 10);
  if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff) {
    return ' ';
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Reduces an HTML document to lower-case plain text for keyword matching.
 *
 * Steps, in order: drop `<script>` and `<style>` elements including their content,
 * replace every remaining tag with a space, decode numeric character references
 * (`&#233;`, `&#xE9;`) to the characters they denote, replace named entities
 * (`&amp;`, `&nbsp;`, `&frac12;`) with a space, collapse whitespace runs to one
 * space, and lower-case the result. The result is not trimmed.
 *
 * Numeric references are decoded because pages frequently encode accented letters that
 * way; leaving them as literal `&#...;` text breaks word matching.
 *
 * @param html Raw HTML.
 * @returns Lower-cased visible text.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    // Decoded after tag removal so a decoded "<" is never mistaken for markup.
    .replace(/&#(x[0-9a-f]+|\d+);/gi, (_match: string, ref: string) => decodeNumericEntity(ref))
    .replace(/&[a-z][a-z0-9]*;/gi, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}
|
||||
|
||||
// --- URL Verification (same logic as enrich-with-freesearch.ts) ---
|
||||
|
||||
/**
 * Fetches `url` and decides whether the page plausibly belongs to `church`.
 *
 * Evidence gathered from the page text and the URL:
 * - how many significant words of the church name occur on the page,
 * - whether the normalized city occurs on the page,
 * - whether at least two address words (4+ characters, not purely numeric) occur,
 * - whether any Catholic, mass-schedule or tourism keyword occurs,
 * - whether the hostname contains a significant name word of 4+ characters,
 * - whether the URL path has more than two segments ("deep" URL).
 *
 * Name, city and address needles come from `normalizeForMatch`, which strips punctuation,
 * so they are looked up both in the raw stripped text and in the same text passed through
 * `normalizeForMatch`. Without the second lookup a name such as "St. Mary's" (needle
 * "marys") could never match a page that spells it "mary's". Keyword lists are matched
 * against the raw stripped text only.
 *
 * NOTE(review): the section header above says this mirrors enrich-with-freesearch.ts; the
 * normalized-text lookup only ever adds matches, so port it there if parity matters.
 *
 * @param url Candidate website URL.
 * @param church Church the page is checked against.
 * @returns `true` when the acceptance rules below are met; `false` otherwise, including
 *   when the URL is not a valid http(s) URL or when fetching/analysing the page throws.
 */
async function verifyUrl(url: string, church: ChurchRecord): Promise<boolean> {
  // Parse once; a malformed or non-http(s) URL is rejected before any request is made.
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    return false;
  }
  if (target.protocol !== 'http:' && target.protocol !== 'https:') return false;

  try {
    const response = await axios.get(url, {
      timeout: 10000,
      maxRedirects: 3,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; NearestMass/1.0; +https://nearestmass.com)',
        'Accept': 'text/html',
      },
      maxContentLength: 200000,
      responseType: 'text',
    });

    if (typeof response.data !== 'string') return false;

    const text = stripHtml(response.data);
    const normalizedText = normalizeForMatch(text);
    const onPage = (needle: string): boolean =>
      text.includes(needle) || normalizedText.includes(needle);
    const hasAnyKeyword = (keywords: readonly string[]): boolean =>
      keywords.some((kw) => text.includes(kw));

    const nameWords = getSignificantWords(church.name);
    const nameMatches = nameWords.filter(onPage).length;

    const cityNorm = church.city ? normalizeForMatch(church.city) : '';
    const cityMatch = cityNorm.length > 2 && onPage(cityNorm);

    const addressWords = church.address
      ? normalizeForMatch(church.address)
          .split(' ')
          .filter((w) => w.length >= 4 && !/^\d+$/.test(w))
      : [];
    const addressMatch = addressWords.filter(onPage).length >= 2;

    const hasCatholicKeyword = hasAnyKeyword(CATHOLIC_KEYWORDS);
    const hasMassSchedule = hasAnyKeyword(MASS_SCHEDULE_KEYWORDS);
    const isTourismPage = hasAnyKeyword(TOURISM_KEYWORDS);

    const hostname = target.hostname.toLowerCase();
    const domainMatchesName = nameWords.some((word) => word.length >= 4 && hostname.includes(word));

    // Tourism pages mention churches without being their website.
    if (isTourismPage && !hasMassSchedule) return false;

    // Deep pages on an unrelated domain are usually directory or article entries.
    const isDeepUrl = target.pathname.split('/').filter(Boolean).length > 2;
    if (isDeepUrl && !domainMatchesName && !hasMassSchedule) return false;

    const hasCity = !!(church.city && church.city.trim());

    if (hasMassSchedule && nameMatches >= 1) return true;
    if (domainMatchesName && nameMatches >= 1 && hasCatholicKeyword) return true;

    if (hasCity) {
      if (nameMatches >= 2) return true;
      if (nameMatches >= 1 && cityMatch) return true;
      if (nameMatches >= 1 && addressMatch) return true;
    }

    if (!hasCity) {
      // Without a city to anchor on, demand an address hit or a stronger name match.
      if (nameMatches >= 1 && addressMatch) return true;
      if (nameMatches >= 3) return true;
    }

    return false;
  } catch {
    // Best effort: any failure while fetching or analysing the page counts as "not verified".
    return false;
  }
}
|
||||
|
||||
// --- ChromaDB Querying ---
|
||||
|
||||
/**
 * One stored search result returned by a ChromaDB query, flattened from the record's
 * metadata plus the query distance (see `findCandidatesForChurch`).
 */
interface ChromaResult {
  /** ChromaDB record id. */
  id: string;
  /** Result URL (metadata `resultUrl`). */
  url: string;
  /** Result title (metadata `resultTitle`). */
  title: string;
  /** Stored score (metadata `score`), 0 when absent. */
  score: number;
  /** Query distance; candidates are kept when it is at or below the threshold. */
  distance: number;
  /** Id of the church the result was originally stored for. */
  churchId: string;
  /** Name of that church. */
  churchName: string;
  /** City of that church. */
  churchCity: string;
  /** Whether the record is flagged as verified (metadata `verified`). */
  verified?: boolean;
}
|
||||
|
||||
/**
 * Runs a semantic query against the stored search results and returns those close
 * enough to the church's identity text.
 *
 * The identity text is "name address city country" (missing parts left empty); it is
 * embedded with `embedSingle` and used to query the collection, restricted to records
 * whose `churchCountry` equals the church's country.
 *
 * Records with missing (null) metadata, or metadata values of an unexpected type, are
 * mapped to empty strings / 0 / `false` instead of throwing; since an empty URL is
 * filtered out, such records are simply dropped.
 *
 * @param church Church to find candidates for.
 * @param collection ChromaDB collection holding stored search results.
 * @param threshold Maximum query distance (inclusive) for a result to be kept.
 * @param nResults Number of nearest records to request.
 * @returns Results with a non-empty URL and distance <= threshold, in query order.
 */
async function findCandidatesForChurch(
  church: ChurchRecord,
  collection: Collection,
  threshold: number,
  nResults: number
): Promise<ChromaResult[]> {
  // Build identity text for semantic search
  const identityText = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
  const queryEmbedding = await embedSingle(identityText);

  const results = await collection.query({
    queryEmbeddings: [queryEmbedding],
    nResults,
    where: { churchCountry: church.country },
  });

  // One query embedding was sent, so only the first row of each result array matters.
  const ids = results.ids?.[0];
  if (!ids) return [];

  const metadatas = results.metadatas?.[0] ?? [];
  const distances = results.distances?.[0] ?? [];

  const asText = (value: unknown): string => (typeof value === 'string' ? value : '');

  return ids
    .map((id, i): ChromaResult => {
      const metadata: Record<string, unknown> = metadatas[i] ?? {};
      const score = metadata.score;
      return {
        id,
        url: asText(metadata.resultUrl),
        title: asText(metadata.resultTitle),
        score: typeof score === 'number' && Number.isFinite(score) ? score : 0,
        // A missing distance defaults to 1, as far as this script is concerned "not close".
        distance: distances[i] ?? 1,
        churchId: asText(metadata.churchId),
        churchName: asText(metadata.churchName),
        churchCity: asText(metadata.churchCity),
        verified: metadata.verified === true,
      };
    })
    .filter((r) => r.distance <= threshold && r.url !== '');
}
|
||||
|
||||
/**
 * Collapses results that share a URL, keeping the one with the smallest distance,
 * and returns the survivors ordered by ascending distance.
 *
 * @param results Results that may contain repeated URLs.
 * @returns One result per URL, closest first.
 */
function deduplicateByUrl(results: ChromaResult[]): ChromaResult[] {
  const closestByUrl = results.reduce((best, candidate) => {
    const current = best.get(candidate.url);
    if (current === undefined || candidate.distance < current.distance) {
      best.set(candidate.url, candidate);
    }
    return best;
  }, new Map<string, ChromaResult>());

  const unique = Array.from(closestByUrl.values());
  unique.sort((left, right) => left.distance - right.distance);
  return unique;
}
|
||||
|
||||
// --- Main Processing ---
|
||||
|
||||
/**
 * Flags a stored search result as verified in ChromaDB without losing its other metadata.
 *
 * The record's current metadata is read first and spread into the update, so fields this
 * script does not carry on `ChromaResult` (for example `searchQuery`) keep their stored
 * values. The explicit fields act as fallbacks when the record has no stored metadata.
 */
async function markCandidateVerified(
  collection: Collection,
  candidate: ChromaResult,
  fallbackCountry: string
): Promise<void> {
  const stored = await collection.get({ ids: [candidate.id] });
  const existing = (stored.metadatas?.[0] ?? {}) as Record<string, string | number | boolean>;

  await collection.update({
    ids: [candidate.id],
    metadatas: [{
      churchId: candidate.churchId,
      churchName: candidate.churchName,
      churchCity: candidate.churchCity,
      churchCountry: fallbackCountry,
      searchQuery: '',
      resultTitle: candidate.title || '',
      score: candidate.score || 0,
      ...existing,
      resultUrl: candidate.url,
      verified: true,
    }],
  });
}

/**
 * Tries to find and verify a website for one church from stored search results.
 *
 * Steps:
 * 1. Query ChromaDB for candidates within `threshold` (20 nearest records).
 * 2. Build an ordered, de-duplicated URL list: URLs already verified for other churches,
 *    then URLs stored for at least two different churches, then this church's own results
 *    (closest first), then results of other churches in the same city (closest first).
 * 3. Verify the first five URLs in order; stop at the first that passes.
 * 4. Unless `dryRun`, store the URL on the church and flag the ChromaDB record as verified.
 *
 * `stats` is mutated in place. Errors are caught, counted in `stats.errors` and logged;
 * this function does not throw. A failure to flag the ChromaDB record is logged but does
 * not undo or fail the match.
 *
 * @param church Church to process.
 * @param collection ChromaDB collection holding stored search results.
 * @param stats Counters to update.
 * @param threshold Maximum query distance for candidates.
 * @param dryRun When true, nothing is written to the database or to ChromaDB.
 */
async function processChurch(
  church: ChurchRecord,
  collection: Collection,
  stats: MatchStats,
  threshold: number,
  dryRun: boolean
): Promise<void> {
  const label = `${church.name} (${church.city || 'unknown'}, ${church.country})`;

  try {
    // 1. Semantic search for similar results in ChromaDB
    const candidates = await findCandidatesForChurch(church, collection, threshold, 20);

    if (candidates.length === 0) {
      log(` - ${label} => no ChromaDB results within threshold`);
      stats.noResults++;
      return;
    }

    // 2. Separate results: own church vs cross-church
    const ownResults = candidates.filter((r) => r.churchId === church.id);
    const crossResults = candidates.filter((r) => r.churchId !== church.id);

    // 3. URL frequency: count the distinct churches each URL was stored for. Counting raw
    //    rows would treat one church's repeated hit (several queries) as "multiple churches".
    const churchesByUrl = new Map<string, Set<string>>();
    for (const r of candidates) {
      const owners = churchesByUrl.get(r.url) ?? new Set<string>();
      owners.add(r.churchId);
      churchesByUrl.set(r.url, owners);
    }
    const highFreqUrls = [...churchesByUrl.entries()]
      .filter(([, owners]) => owners.size >= 2)
      .map(([url]) => url);

    // 4. Build the candidate list in priority order; a Set keeps first-insertion order
    //    and ignores repeats.
    const urlsToTry = new Set<string>();

    // Verified URLs from other churches (highest priority)
    for (const r of crossResults) {
      if (r.verified) urlsToTry.add(r.url);
    }

    // URLs stored for multiple churches
    for (const url of highFreqUrls) urlsToTry.add(url);

    // Own church results by distance (closest semantic match first)
    for (const r of deduplicateByUrl(ownResults)) urlsToTry.add(r.url);

    // Cross-church results from the same city
    const ownCity = church.city ? normalizeForMatch(church.city) : '';
    const sameCityCross = crossResults.filter(
      (r) => church.city && r.churchCity && normalizeForMatch(r.churchCity) === ownCity
    );
    for (const r of deduplicateByUrl(sameCityCross)) urlsToTry.add(r.url);

    // Limit to top 5 candidates
    const topUrls = [...urlsToTry].slice(0, 5);

    log(` ? ${label} => ${candidates.length} results, trying ${topUrls.length} candidates`);

    // 5. Verify each candidate, stopping at the first success
    let verifiedUrl: string | null = null;
    for (const url of topUrls) {
      if (await verifyUrl(url, church)) {
        verifiedUrl = url;
        break;
      }
      stats.verifyFailed++;
    }

    if (verifiedUrl === null) {
      log(` ~ ${label} => ${topUrls.length} candidates failed verification`);
      stats.noResults++;
      return;
    }

    log(` + ${label} => ${verifiedUrl}`);
    stats.matched++;
    if (dryRun) return;

    await prisma.church.update({
      where: { id: church.id },
      data: {
        website: verifiedUrl,
        hasWebsite: true,
      },
    });

    // Flag the result in ChromaDB. Best effort: the church row is already updated, so a
    // failure here is reported but does not count as a processing error.
    const matchingResult = candidates.find((r) => r.url === verifiedUrl);
    if (matchingResult) {
      try {
        await markCandidateVerified(collection, matchingResult, church.country);
      } catch (updateError: unknown) {
        const reason = updateError instanceof Error ? updateError.message : String(updateError);
        logError(` ! ${label} => ChromaDB verified-flag update failed: ${reason}`);
      }
    }
  } catch (error: unknown) {
    stats.errors++;
    const reason = error instanceof Error ? error.message : String(error);
    logError(` ! ${label} => error: ${reason}`);
  }
}
|
||||
|
||||
// --- Main ---
|
||||
|
||||
/**
 * CLI entry point for the second-pass matching run.
 *
 * Flags: `--country <code>`, `--limit <n>` (default 500), `--threshold <distance>`
 * (default 0.4), `--dry-run`, `--job-id <id>` (resume an existing job row).
 *
 * Flow: validate flags, connect to the ChromaDB search-results collection (exit 1 when
 * unavailable, exit 0 when empty), create or resume the tracking job, load churches
 * without a website that have `freeSearchedAt` set, process them one by one, then
 * finalise the job and print a summary.
 *
 * Stops early on SIGTERM/SIGINT or when the job row's status becomes `stopping`
 * (polled every 10 churches); in both cases the job is still finalised as completed.
 *
 * @throws Error for invalid flag values. Any error raised while loading or processing
 *   churches marks the job as failed and is rethrown. Database connections are closed
 *   in both cases once the job has been created.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const thresholdIndex = args.indexOf('--threshold');
  const dryRun = args.includes('--dry-run');

  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;
  const limit = limitIndex !== -1 ? Number.parseInt(args[limitIndex + 1], 10) : 500;
  const threshold = thresholdIndex !== -1 ? Number.parseFloat(args[thresholdIndex + 1]) : 0.4;

  // Reject bad flag values up front: a NaN limit fails deep inside Prisma, a NaN threshold
  // silently filters out every candidate, and a missing country value would silently
  // widen the run to all countries.
  if (countryIndex !== -1 && (!countryCode || countryCode.startsWith('--'))) {
    throw new Error('--country requires a country code');
  }
  if (!Number.isInteger(limit) || limit < 0) {
    throw new Error('--limit must be a non-negative integer');
  }
  if (!Number.isFinite(threshold) || threshold < 0) {
    throw new Error('--threshold must be a non-negative number');
  }

  // Graceful shutdown
  process.on('SIGTERM', () => { log('Received SIGTERM'); shuttingDown = true; });
  process.on('SIGINT', () => { log('Received SIGINT'); shuttingDown = true; });

  log('============================================================');
  log('Second-Pass Search Result Matching');
  log('============================================================');
  log(`Country: ${countryCode || 'All'}`);
  log(`Limit: ${limit}`);
  log(`Threshold: ${threshold}`);
  log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  log('============================================================');

  // Connect to ChromaDB
  let collection: Collection;
  try {
    collection = await getCollection(COLLECTION_NAMES.SEARCH_RESULTS);
    log('ChromaDB search_results collection connected');
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    logError(`ChromaDB unavailable: ${reason}`);
    logError('This script requires ChromaDB. Make sure it is running.');
    process.exit(1);
  }

  // Check collection has data
  const count = await collection.count();
  log(`ChromaDB search_results: ${count} entries`);
  if (count === 0) {
    log('No search results stored yet. Run enrich-with-freesearch.ts first.');
    process.exit(0);
  }

  // Job tracking: resume the job named by --job-id, otherwise create a new one
  const jobId =
    (await createOrResumeJob(args)) ||
    (await createNewJob({ countryCode, limit, threshold, dryRun }));
  log(`Job ID: ${jobId}`);

  try {
    // Get churches without websites that have been FreeSearch'd
    const whereClause: Record<string, unknown> = {
      source: 'osm',
      website: null,
      freeSearchedAt: { not: null },
    };
    if (countryCode) {
      whereClause.country = countryCode;
    }

    const churches = await prisma.church.findMany({
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- filter is assembled dynamically
      where: whereClause as any,
      select: {
        id: true, name: true, address: true, city: true, state: true,
        country: true, latitude: true, longitude: true,
      },
      take: limit,
      orderBy: { updatedAt: 'asc' },
    });

    log(`Found ${churches.length} churches without websites (already FreeSearch'd)`);

    const stats: MatchStats = {
      processed: 0,
      matched: 0,
      noResults: 0,
      verifyFailed: 0,
      errors: 0,
      startTime: Date.now(),
    };

    for (const church of churches) {
      if (shuttingDown) break;
      stats.processed++;

      await processChurch(church, collection, stats, threshold, dryRun);

      // Job tracking every 10 items
      if (stats.processed % 10 === 0) {
        await updateJobProgress(jobId, stats.processed, stats.matched, churches.length);
        if (await checkJobStopping(jobId)) {
          log('Job stop requested via admin dashboard.');
          shuttingDown = true;
          break;
        }
      }

      // Progress logging every 50 items
      if (stats.processed % 50 === 0) {
        const elapsedSeconds = (Date.now() - stats.startTime) / 1000;
        const rate = Math.round((stats.processed / elapsedSeconds) * 3600);
        log(`Progress: ${stats.processed}/${churches.length} processed, ${stats.matched} matched, ${stats.noResults} no match, ${stats.errors} errors (~${rate}/hour)`);
      }
    }

    // Complete job
    await updateJobProgress(jobId, stats.processed, stats.matched, churches.length);
    await completeJob(jobId);

    // Print summary
    const elapsed = ((Date.now() - stats.startTime) / 1000).toFixed(1);
    const matchRate = stats.processed > 0
      ? ((stats.matched / stats.processed) * 100).toFixed(1)
      : '0.0';

    log('');
    log('============================================================');
    log('Second-Pass Matching Summary');
    log('============================================================');
    log(`Churches processed: ${stats.processed}`);
    log(`Websites matched: ${stats.matched}`);
    log(`No match found: ${stats.noResults}`);
    log(`Verify rejected: ${stats.verifyFailed}`);
    log(`Errors: ${stats.errors}`);
    log(`Match rate: ${matchRate}%`);
    log(`Elapsed: ${elapsed}s`);
    log('============================================================');
  } catch (error: unknown) {
    // Do not leave the job row stuck in "running" when the run dies.
    const reason = error instanceof Error ? error.message : String(error);
    try {
      await completeJob(jobId, reason);
    } catch (jobError: unknown) {
      const jobReason = jobError instanceof Error ? jobError.message : String(jobError);
      logError(`Could not mark job ${jobId} as failed: ${jobReason}`);
    }
    throw error;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Run the script; any error escaping main() is logged and turns into exit code 1.
main().catch((error: unknown) => {
  // Rejections are not guaranteed to be Error instances, so avoid logging "undefined".
  const reason = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${reason}`);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user