Files
ScraperControl/scripts/dedup-churches.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

98 lines
3.2 KiB
TypeScript

/**
* Find duplicate churches using ChromaDB semantic similarity.
*
* Usage:
* npx tsx scripts/dedup-churches.ts # Report potential duplicates (read-only; nothing is modified)
* npx tsx scripts/dedup-churches.ts --threshold 0.15 # Custom cosine-distance threshold (lower = stricter)
* npx tsx scripts/dedup-churches.ts --country US # Only check US churches
* npx tsx scripts/dedup-churches.ts --limit 100 # Check first 100 churches
*/
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { findSimilarChurches } from '../src/chromadb/queries';
// Database wiring: a pg connection pool built from DATABASE_URL, handed to
// Prisma through the pg driver adapter.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** Options accepted on the command line. */
interface CliOptions {
  /** Maximum cosine distance for a match (lower = more similar). Defaults to 0.15. */
  threshold: number;
  /** Country filter; `undefined` means no filtering. */
  country: string | undefined;
  /** Maximum number of churches to load. Defaults to 500. */
  limit: number;
}

/**
 * Returns the argument that follows `flag` in `argv`.
 *
 * @returns the value, or `undefined` when the flag is not present at all.
 * @throws Error when the flag is present but has no value after it (end of
 *   the argument list, or the next token is another `--flag`).
 */
function readFlagValue(argv: readonly string[], flag: string): string | undefined {
  const flagIndex = argv.indexOf(flag);
  if (flagIndex === -1) return undefined;

  const value = argv[flagIndex + 1];
  if (value === undefined || value.startsWith('--')) {
    throw new Error(`Missing value for ${flag}`);
  }
  return value;
}

/**
 * Parses `--threshold`, `--country` and `--limit` from `argv`.
 *
 * @throws Error when a flag has no value or a numeric flag is not a number.
 *   Without this check a bad `--threshold` becomes NaN and silently matches
 *   nothing, because every `distance <= NaN` comparison is false.
 */
function parseCliOptions(argv: readonly string[]): CliOptions {
  const rawThreshold = readFlagValue(argv, '--threshold');
  const threshold = rawThreshold === undefined ? 0.15 : Number.parseFloat(rawThreshold);
  if (Number.isNaN(threshold)) {
    throw new Error(`Invalid --threshold value "${rawThreshold}" (expected a number)`);
  }

  const rawLimit = readFlagValue(argv, '--limit');
  // Explicit radix so the value is always read as decimal.
  const limit = rawLimit === undefined ? 500 : Number.parseInt(rawLimit, 10);
  if (Number.isNaN(limit)) {
    throw new Error(`Invalid --limit value "${rawLimit}" (expected an integer)`);
  }

  return { threshold, country: readFlagValue(argv, '--country'), limit };
}

/** Parses `argv`, printing the problem and exiting with status 1 on bad input. */
function parseCliOptionsOrExit(argv: readonly string[]): CliOptions {
  try {
    return parseCliOptions(argv);
  } catch (err: unknown) {
    console.error(err instanceof Error ? err.message : err);
    // process.exit never returns; `return` keeps every code path typed.
    return process.exit(1);
  }
}

const args = process.argv.slice(2);
const { threshold, country, limit } = parseCliOptionsOrExit(args);
/**
 * Reports potential duplicate churches; only reads and logs, never writes.
 *
 * Loads up to `limit` churches ordered by name (optionally restricted to
 * `country`), queries `findSimilarChurches` for each one, and logs every
 * result other than the church itself whose distance is at or below
 * `threshold`.
 *
 * The Prisma client and the pg pool are released in a `finally` block so the
 * connections are closed even when a query or similarity lookup throws.
 */
async function main(): Promise<void> {
  try {
    console.log(`Finding duplicate churches (threshold=${threshold}, country=${country || 'all'}, limit=${limit})`);
    console.log('---');

    const churches = await prisma.church.findMany({
      take: limit,
      where: country ? { country } : undefined,
      orderBy: { name: 'asc' },
      select: {
        id: true,
        name: true,
        address: true,
        city: true,
        country: true,
        source: true,
        latitude: true,
        longitude: true,
        _count: { select: { massSchedules: true } },
      },
    });
    console.log(`Checking ${churches.length} churches...\n`);

    // IDs already reported as a match; skipping them as originals avoids
    // listing the same pair again from the other side.
    const seen = new Set<string>();
    let duplicateCount = 0;

    for (const church of churches) {
      if (seen.has(church.id)) continue;

      // NOTE(review): presumably mirrors the document text stored in ChromaDB
      // for each church — confirm against the indexing code.
      const text = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
      const similar = await findSimilarChurches(text, {
        country: church.country,
        nResults: 5,
      });

      // Filter to matches within threshold, excluding self
      const matches = similar.filter(
        (s) => s.churchId !== church.id && s.distance <= threshold
      );
      if (matches.length === 0) continue;

      duplicateCount++;
      console.log(`\nPotential duplicate #${duplicateCount}:`);
      console.log(` Original: "${church.name}" (${church.city || 'no city'}, ${church.country})`);
      console.log(` ID: ${church.id}, Source: ${church.source}, Schedules: ${church._count.massSchedules}`);
      console.log(` Lat/Lng: ${church.latitude}, ${church.longitude}`);
      for (const match of matches) {
        console.log(` Match: "${match.document}" (distance: ${match.distance.toFixed(4)})`);
        console.log(` ID: ${match.churchId}`);
        seen.add(match.churchId);
      }
    }

    console.log(`\n---`);
    console.log(`Found ${duplicateCount} potential duplicate groups from ${churches.length} churches`);
    console.log(`Threshold: ${threshold} (lower = stricter matching)`);
  } finally {
    // Always release database connections, on success and on failure alike.
    await prisma.$disconnect();
    await pool.end();
  }
}
// Script entry point: run main and turn any rejection into a logged error
// plus a non-zero exit status.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});