chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
97
scripts/dedup-churches.ts
Normal file
97
scripts/dedup-churches.ts
Normal file
@@ -0,0 +1,97 @@
|
||||
/**
|
||||
* Find duplicate churches using ChromaDB semantic similarity.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/dedup-churches.ts # Dry run, show duplicates
|
||||
* npx tsx scripts/dedup-churches.ts --threshold 0.15 # Custom cosine-distance threshold (default 0.15; lower = stricter)
|
||||
* npx tsx scripts/dedup-churches.ts --country US # Only check US churches
|
||||
* npx tsx scripts/dedup-churches.ts --limit 100 # Check first 100 churches
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { findSimilarChurches } from '../src/chromadb/queries';
|
||||
|
||||
// Database wiring: a single pg connection pool built from DATABASE_URL,
// handed to Prisma through the PrismaPg adapter.
const { DATABASE_URL: connectionString } = process.env;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Parse the command-line flags understood by this script.
 *
 * Recognised flags (each takes the following argument as its value):
 *   --threshold <number>  Cosine distance threshold (lower = more similar). Default 0.15.
 *   --country <value>     Only check churches whose `country` equals this value. Default: all.
 *   --limit <integer>     Maximum number of churches to load. Default 500.
 *
 * @param argv Arguments after the script path; defaults to `process.argv.slice(2)`.
 * @returns The resolved `{ threshold, country, limit }` options.
 * @throws {Error} When a flag is present but its value is missing or not usable,
 *   so a typo fails loudly instead of silently producing an empty report.
 */
function parseCliOptions(argv = process.argv.slice(2)) {
  const thresholdAt = argv.indexOf('--threshold');
  const countryAt = argv.indexOf('--country');
  const limitAt = argv.indexOf('--limit');

  // Cosine distance threshold (lower = more similar)
  const threshold = thresholdAt === -1 ? 0.15 : Number.parseFloat(argv[thresholdAt + 1]);
  if (!Number.isFinite(threshold)) {
    // A NaN threshold would make every `distance <= threshold` comparison false.
    throw new Error(`--threshold expects a number, got "${argv[thresholdAt + 1]}"`);
  }

  const country = countryAt === -1 ? undefined : argv[countryAt + 1];
  if (countryAt !== -1 && (country === undefined || country.startsWith('--'))) {
    // Either the value is missing or the next flag was swallowed as the value.
    throw new Error('--country expects a value, e.g. --country US');
  }

  const limit = limitAt === -1 ? 500 : Number.parseInt(argv[limitAt + 1], 10);
  if (!Number.isInteger(limit) || limit < 1) {
    throw new Error(`--limit expects a positive integer, got "${argv[limitAt + 1]}"`);
  }

  return { threshold, country, limit };
}

const args = process.argv.slice(2);
const { threshold, country, limit } = parseCliOptions(args);
|
||||
|
||||
/**
 * Scan churches from the database and print groups of likely duplicates.
 *
 * Loads up to `limit` churches (restricted to `country` when one was given),
 * ordered by name, and for each one asks `findSimilarChurches` for the 5
 * nearest entries to its "name address city country" text within the same
 * country. Results other than the church itself whose distance is at or below
 * `threshold` are printed as potential duplicates. This function only reads
 * from Prisma and writes its report to stdout.
 *
 * The Prisma client and the pg pool are released in a `finally` block so they
 * are closed even when a query throws part-way through the scan.
 */
async function main() {
  try {
    console.log(`Finding duplicate churches (threshold=${threshold}, country=${country || 'all'}, limit=${limit})`);
    console.log('---');

    const churches = await prisma.church.findMany({
      take: limit,
      where: country ? { country } : undefined,
      orderBy: { name: 'asc' },
      select: {
        id: true,
        name: true,
        address: true,
        city: true,
        country: true,
        source: true,
        latitude: true,
        longitude: true,
        _count: { select: { massSchedules: true } },
      },
    });

    console.log(`Checking ${churches.length} churches...\n`);

    // IDs already printed as a match of an earlier church; skipping them keeps
    // the same pair from being reported a second time from the other side.
    const seen = new Set<string>();
    let duplicateCount = 0;

    // Lookups run one at a time: each iteration depends on the `seen` set
    // filled by the previous ones.
    for (const church of churches) {
      if (seen.has(church.id)) continue;

      const text = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
      const similar = await findSimilarChurches(text, {
        country: church.country,
        nResults: 5,
      });

      // Filter to matches within threshold, excluding self
      const matches = similar.filter(
        (s) => s.churchId !== church.id && s.distance <= threshold
      );

      if (matches.length > 0) {
        duplicateCount++;
        console.log(`\nPotential duplicate #${duplicateCount}:`);
        console.log(` Original: "${church.name}" (${church.city || 'no city'}, ${church.country})`);
        console.log(` ID: ${church.id}, Source: ${church.source}, Schedules: ${church._count.massSchedules}`);
        console.log(` Lat/Lng: ${church.latitude}, ${church.longitude}`);

        for (const match of matches) {
          console.log(` Match: "${match.document}" (distance: ${match.distance.toFixed(4)})`);
          console.log(` ID: ${match.churchId}`);
          seen.add(match.churchId);
        }
      }
    }

    console.log(`\n---`);
    console.log(`Found ${duplicateCount} potential duplicate groups from ${churches.length} churches`);
    console.log(`Threshold: ${threshold} (lower = stricter matching)`);
  } finally {
    // Release database resources on both the success and the failure path.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Script entry point: run the scan; on any rejection, log the failure and
// exit with status 1.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user