/**
 * Find duplicate churches using ChromaDB semantic similarity.
 *
 * The script only prints a report; the only Prisma call it makes is a
 * `findMany` read.
 *
 * Usage:
 *   npx tsx scripts/dedup-churches.ts                    # Dry run, show duplicates
 *   npx tsx scripts/dedup-churches.ts --threshold 0.15   # Custom similarity threshold
 *   npx tsx scripts/dedup-churches.ts --country US       # Only check US churches
 *   npx tsx scripts/dedup-churches.ts --limit 100        # Check first 100 churches
 *
 * Defaults: --threshold 0.15, --limit 500, all countries.
 * Invalid or missing flag values abort with exit code 1 before any query runs.
 */
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { findSimilarChurches } from '../src/chromadb/queries';

const DEFAULT_THRESHOLD = 0.15;
const DEFAULT_LIMIT = 500;

const args = process.argv.slice(2);

// Cosine distance threshold (lower = more similar).
const thresholdIndex = args.indexOf('--threshold');
const threshold =
  thresholdIndex === -1
    ? DEFAULT_THRESHOLD
    : Number.parseFloat(args[thresholdIndex + 1]);
// A NaN threshold would make every `distance <= threshold` check false and
// silently report zero duplicates, so reject it up front.
if (!Number.isFinite(threshold) || threshold < 0) {
  console.error('Invalid --threshold: expected a non-negative number (e.g. --threshold 0.15)');
  process.exit(1);
}

// Optional country filter; undefined means "check every country".
const countryIndex = args.indexOf('--country');
const country = countryIndex === -1 ? undefined : args[countryIndex + 1];
// Catch `--country` given last or immediately followed by another flag.
if (countryIndex !== -1 && (country === undefined || country.startsWith('--'))) {
  console.error('Invalid --country: expected a value (e.g. --country US)');
  process.exit(1);
}

// Maximum number of churches loaded from the database for checking.
const limitIndex = args.indexOf('--limit');
const limit =
  limitIndex === -1
    ? DEFAULT_LIMIT
    : Number.parseInt(args[limitIndex + 1], 10);
if (!Number.isInteger(limit) || limit <= 0) {
  console.error('Invalid --limit: expected a positive integer (e.g. --limit 100)');
  process.exit(1);
}

// Database handles are created only after the arguments have been validated.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

/**
 * Load up to `limit` churches (ordered by name, optionally filtered by
 * country), query `findSimilarChurches` for each one, and print every church
 * that has at least one other church within `threshold` distance.
 *
 * Churches already reported as a match of an earlier church are skipped so the
 * same pair is not listed twice from both sides.
 *
 * The Prisma client and the pg pool are always released, even when a query
 * fails; the error itself propagates to the caller.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    console.log(`Finding duplicate churches (threshold=${threshold}, country=${country || 'all'}, limit=${limit})`);
    console.log('---');

    const churches = await prisma.church.findMany({
      take: limit,
      where: country ? { country } : undefined,
      orderBy: { name: 'asc' },
      select: {
        id: true,
        name: true,
        address: true,
        city: true,
        country: true,
        source: true,
        latitude: true,
        longitude: true,
        _count: { select: { massSchedules: true } },
      },
    });

    console.log(`Checking ${churches.length} churches...\n`);

    // IDs that already appeared as a match of an earlier church.
    const seen = new Set();
    let duplicateCount = 0;

    // Sequential on purpose: one similarity query in flight at a time.
    for (const church of churches) {
      if (seen.has(church.id)) continue;

      const text = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
      const similar = await findSimilarChurches(text, {
        country: church.country,
        nResults: 5,
      });

      // Filter to matches within threshold, excluding self
      const matches = similar.filter(
        (s) => s.churchId !== church.id && s.distance <= threshold
      );
      if (matches.length === 0) continue;

      duplicateCount++;
      console.log(`\nPotential duplicate #${duplicateCount}:`);
      console.log(` Original: "${church.name}" (${church.city || 'no city'}, ${church.country})`);
      console.log(` ID: ${church.id}, Source: ${church.source}, Schedules: ${church._count.massSchedules}`);
      console.log(` Lat/Lng: ${church.latitude}, ${church.longitude}`);

      for (const match of matches) {
        console.log(` Match: "${match.document}" (distance: ${match.distance.toFixed(4)})`);
        console.log(` ID: ${match.churchId}`);
        seen.add(match.churchId);
      }
    }

    console.log(`\n---`);
    console.log(`Found ${duplicateCount} potential duplicate groups from ${churches.length} churches`);
    console.log(`Threshold: ${threshold} (lower = stricter matching)`);
  } finally {
    // Release connections on both the success and the failure path.
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});