98 lines
3.2 KiB
TypeScript
98 lines
3.2 KiB
TypeScript
|
|
/**
 * Find duplicate churches using ChromaDB semantic similarity.
 *
 * Usage:
 *   npx tsx scripts/dedup-churches.ts                    # Dry run, show duplicates
 *   npx tsx scripts/dedup-churches.ts --threshold 0.15   # Custom cosine-distance threshold (lower = stricter)
 *   npx tsx scripts/dedup-churches.ts --country US       # Only check US churches
 *   npx tsx scripts/dedup-churches.ts --limit 100        # Check first 100 churches
 */
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
import { findSimilarChurches } from '../src/chromadb/queries';
|
||
|
|
|
||
|
|
// Database wiring: a single pg connection pool, configured from the
// DATABASE_URL environment variable, is handed to Prisma through the pg
// driver adapter so every Prisma query in this script goes through that pool.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
const args = process.argv.slice(2);

/**
 * Parse the command-line flags understood by this script.
 *
 * Recognised flags (each takes exactly one value):
 *   --threshold <number>  Cosine distance threshold (lower = more similar). Default 0.15.
 *   --country <code>      Restrict the scan to one country. Default: no restriction.
 *   --limit <integer>     Maximum number of churches to load. Default 500.
 *
 * @param argv - Argument list to parse; defaults to the process arguments.
 * @returns The resolved `{ threshold, country, limit }` options.
 * @throws {Error} When a flag is present without a value, or when
 *   `--threshold` / `--limit` is not a valid non-negative number / integer.
 */
function parseCliArgs(argv = args) {
  const [rawThreshold, rawCountry, rawLimit] = ['--threshold', '--country', '--limit'].map((flag) => {
    const position = argv.indexOf(flag);
    if (position === -1) return undefined;

    const value = argv[position + 1];
    // A flag at the end of the list, or one directly followed by another
    // flag, has no value; fail loudly instead of consuming the wrong token.
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  });

  const threshold = rawThreshold === undefined ? 0.15 : Number.parseFloat(rawThreshold);
  // A NaN threshold would make every `distance <= threshold` comparison false
  // and the script would silently report no duplicates at all.
  if (!Number.isFinite(threshold) || threshold < 0) {
    throw new Error(`Invalid --threshold "${rawThreshold}": expected a non-negative number`);
  }

  const limit = rawLimit === undefined ? 500 : Number.parseInt(rawLimit, 10);
  if (!Number.isInteger(limit) || limit < 0) {
    throw new Error(`Invalid --limit "${rawLimit}": expected a non-negative integer`);
  }

  return { threshold, country: rawCountry, limit };
}

const { threshold, country, limit } = parseCliArgs();
|
||
|
|
|
||
|
|
/**
 * Report potential duplicate churches.
 *
 * Loads up to `limit` churches (restricted to `country` when one was given),
 * ordered by name. For each church it builds a text query from its name,
 * address, city and country, then asks `findSimilarChurches` for the 5 nearest
 * results in the same country. Results other than the church itself whose
 * distance is at or below `threshold` are printed as potential duplicates.
 *
 * This function only reads through Prisma and writes to stdout. The database
 * connections are closed whether the scan succeeds or fails.
 *
 * @returns A promise that resolves once the report has been printed and the
 *   connections are closed. It rejects with whatever error the database query
 *   or the similarity lookup raised.
 */
async function main() {
  console.log(`Finding duplicate churches (threshold=${threshold}, country=${country || 'all'}, limit=${limit})`);
  console.log('---');

  try {
    const churches = await prisma.church.findMany({
      take: limit,
      where: country ? { country } : undefined,
      orderBy: { name: 'asc' },
      select: {
        id: true,
        name: true,
        address: true,
        city: true,
        country: true,
        source: true,
        latitude: true,
        longitude: true,
        _count: { select: { massSchedules: true } },
      },
    });

    console.log(`Checking ${churches.length} churches...\n`);

    // IDs already printed as a match of an earlier church. Those churches are
    // not used as the "original" of a new group, so a pair is not reported
    // once from each side.
    const seen = new Set<string>();
    let duplicateCount = 0;

    // Lookups are awaited one at a time on purpose: `seen` is filled by
    // earlier iterations and decides which later churches are skipped.
    for (const church of churches) {
      if (seen.has(church.id)) continue;

      const text = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
      const similar = await findSimilarChurches(text, {
        country: church.country,
        nResults: 5,
      });

      // Filter to matches within threshold, excluding self
      const matches = similar.filter(
        (s) => s.churchId !== church.id && s.distance <= threshold
      );
      if (matches.length === 0) continue;

      duplicateCount++;
      console.log(`\nPotential duplicate #${duplicateCount}:`);
      console.log(`  Original: "${church.name}" (${church.city || 'no city'}, ${church.country})`);
      console.log(`    ID: ${church.id}, Source: ${church.source}, Schedules: ${church._count.massSchedules}`);
      console.log(`    Lat/Lng: ${church.latitude}, ${church.longitude}`);

      for (const match of matches) {
        console.log(`  Match: "${match.document}" (distance: ${match.distance.toFixed(4)})`);
        console.log(`    ID: ${match.churchId}`);
        seen.add(match.churchId);
      }
    }

    console.log(`\n---`);
    console.log(`Found ${duplicateCount} potential duplicate groups from ${churches.length} churches`);
    console.log(`Threshold: ${threshold} (lower = stricter matching)`);
  } finally {
    // Release the connections on failure as well as on success. The nested
    // finally ensures the pool is ended even if the Prisma disconnect throws.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
|
||
|
|
|
||
|
|
// Script entry point: run the scan; if it rejects, print the failure to
// stderr and terminate with exit status 1.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|