chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
197
scripts/populate-chromadb.ts
Normal file
197
scripts/populate-chromadb.ts
Normal file
@@ -0,0 +1,197 @@
|
||||
/**
 * Bulk-populate ChromaDB collections from the database.
 *
 * Usage:
 *   npx tsx scripts/populate-chromadb.ts --collection church_identity
 *   npx tsx scripts/populate-chromadb.ts --collection page_classification
 *   npx tsx scripts/populate-chromadb.ts --all
 *   npx tsx scripts/populate-chromadb.ts --all --batch-size 50 --limit 1000
 */
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { getCollection, COLLECTION_NAMES, CollectionName } from '../src/chromadb/collections';
|
||||
import { embed } from '../src/chromadb/embeddings';
|
||||
|
||||
// Database access: a pg connection pool built from DATABASE_URL, wrapped in the
// Prisma pg adapter, which is then handed to the Prisma client used by this script.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
// Command-line tokens after the node binary and the script path.
const args = process.argv.slice(2);

/**
 * Returns the token that follows `flag` in `argv`.
 *
 * @param argv - Command-line tokens to search.
 * @param flag - Flag name including its leading dashes, e.g. `--collection`.
 * @returns The token after the first occurrence of `flag`, or `null` when the
 *   flag is absent or is the last token (so it has no value).
 */
function readFlagValue(argv: readonly string[], flag: string): string | null {
  const index = argv.indexOf(flag);
  if (index === -1) return null;
  return argv[index + 1] ?? null;
}

/**
 * Reads an integer-valued flag and validates it.
 *
 * @param argv - Command-line tokens to search.
 * @param flag - Flag name including its leading dashes, e.g. `--batch-size`.
 * @param fallback - Value returned when the flag is not present at all.
 * @param min - Smallest accepted value (inclusive).
 * @returns The parsed integer, or `fallback` when the flag is absent.
 * @throws Error when the flag is present but its value is missing, is not a
 *   plain decimal integer, or is smaller than `min`.
 */
function readIntFlag(
  argv: readonly string[],
  flag: string,
  fallback: number,
  min: number,
): number {
  if (!argv.includes(flag)) return fallback;

  const raw = readFlagValue(argv, flag);
  // Only plain decimal digits are accepted: a bare parseInt would turn "abc"
  // into NaN and "50abc" into 50, both of which hide a typo from the operator.
  const value = raw !== null && /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
  if (!Number.isSafeInteger(value) || value < min) {
    throw new Error(
      `Invalid value for ${flag}: ${raw === null ? '(missing)' : `'${raw}'`} (expected an integer >= ${min})`,
    );
  }
  return value;
}

// Name of the single collection to populate, or null when --collection was not given a value.
const collectionArg = readFlagValue(args, '--collection');
// True when every supported collection should be populated.
const populateAll = args.includes('--all');
// Rows fetched and embedded per round trip; must be at least 1.
const batchSize = readIntFlag(args, '--batch-size', 100, 1);
// Maximum number of rows to process per collection; 0 means no limit.
const limit = readIntFlag(args, '--limit', 0, 0);
|
||||
|
||||
/**
 * Bulk-indexes churches into the `church_identity` ChromaDB collection.
 *
 * Churches are read in ascending `id` order with cursor pagination, at most
 * `batchSize` rows per query, and no more than `limit` rows in total when
 * `limit` is positive. Each church is embedded from a text built out of its
 * name, address, city and country, then upserted under the id `church-<id>`
 * with its id, country, source and coordinates as metadata.
 */
async function populateChurchIdentity() {
  console.log('\n=== Populating church_identity ===');
  const identityCollection = await getCollection(COLLECTION_NAMES.CHURCH_IDENTITY);

  const totalCount = await prisma.church.count();
  let target = totalCount;
  if (limit > 0) {
    target = Math.min(limit, totalCount);
  }
  console.log(`Total churches: ${totalCount}, processing: ${target}`);

  let indexed = 0;
  // Id of the last church in the previous page; undefined until a page has been read.
  let lastId: string | undefined = undefined;

  while (indexed < target) {
    const remaining = target - indexed;
    // After the first page, resume just past the last row already handled.
    const resumeFrom = lastId ? { skip: 1, cursor: { id: lastId } } : {};

    const page = await prisma.church.findMany({
      take: Math.min(batchSize, remaining),
      ...resumeFrom,
      orderBy: { id: 'asc' },
      select: {
        id: true,
        name: true,
        address: true,
        city: true,
        country: true,
        source: true,
        latitude: true,
        longitude: true,
      },
    });

    if (page.length === 0) {
      break;
    }

    // One text document per church; missing address/city contribute an empty segment.
    const documents = page.map((church) => {
      const text = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`;
      return text.trim();
    });
    const embeddings = await embed(documents);

    const ids = page.map((church) => `church-${church.id}`);
    const metadatas = page.map((church) => ({
      churchId: church.id,
      country: church.country,
      source: church.source,
      lat: church.latitude,
      lng: church.longitude,
    }));

    await identityCollection.upsert({ ids, embeddings, documents, metadatas });

    indexed += page.length;
    lastId = page[page.length - 1].id;
    console.log(` Processed ${indexed}/${target}`);
  }

  console.log(` Done: ${indexed} churches indexed`);
}
|
||||
|
||||
/**
 * Indexes known mass-schedule pages into the `page_classification` ChromaDB
 * collection.
 *
 * Only churches with a non-null `lastScrapedAt` and at least one active mass
 * schedule are considered. They are read in ascending `id` order with cursor
 * pagination, at most `batchSize` rows per query and at most `limit` rows in
 * total when `limit` is positive. For every church that has stored raw HTML,
 * the first 2000 characters are embedded and upserted under the id
 * `page-<id>`, tagged with `isMassSchedulePage: true`, the page URL and the
 * site language. Churches without raw HTML are scanned but not indexed, and
 * the final summary reports both numbers separately.
 */
async function populatePageClassification() {
  console.log('\n=== Populating page_classification ===');
  const collection = await getCollection(COLLECTION_NAMES.PAGE_CLASSIFICATION);

  // Index churches that have been successfully scraped (have mass schedules).
  // The same filter feeds count() and findMany() so the progress total always
  // matches the rows that are actually paged over.
  const scrapedWithSchedules = {
    lastScrapedAt: { not: null },
    massSchedules: { some: { isActive: true } },
  };

  const totalCount = await prisma.church.count({ where: scrapedWithSchedules });
  const maxItems = limit > 0 ? Math.min(limit, totalCount) : totalCount;
  console.log(`Scraped churches with schedules: ${totalCount}, processing: ${maxItems}`);

  // Churches read from the database so far (drives pagination and the limit).
  let processed = 0;
  // Pages actually upserted into the collection (churches that had raw HTML).
  let indexed = 0;
  let cursor: string | undefined = undefined;

  while (processed < maxItems) {
    const currentBatch = Math.min(batchSize, maxItems - processed);
    const churches = await prisma.church.findMany({
      take: currentBatch,
      // After the first page, resume just past the last row already handled.
      ...(cursor ? { skip: 1, cursor: { id: cursor } } : {}),
      where: scrapedWithSchedules,
      orderBy: { id: 'asc' },
      select: {
        id: true,
        massScheduleUrl: true,
        website: true,
        websiteLanguage: true,
        scraperConfig: { select: { rawHtml: true } },
      },
    });

    if (churches.length === 0) break;

    // Keep only churches with stored raw HTML, pairing each with its HTML so
    // no second null check is needed when building the documents.
    const validChurches = churches.flatMap((church) => {
      const html = church.scraperConfig?.rawHtml;
      return html ? [{ church, html }] : [];
    });

    if (validChurches.length > 0) {
      // Use stored raw HTML (truncated) as the document
      const documents = validChurches.map(({ html }) => html.slice(0, 2000));

      const embeddings = await embed(documents);

      await collection.upsert({
        ids: validChurches.map(({ church }) => `page-${church.id}`),
        embeddings,
        documents,
        metadatas: validChurches.map(({ church }) => ({
          url: church.massScheduleUrl || church.website || '',
          isMassSchedulePage: true,
          language: church.websiteLanguage || 'unknown',
        })),
      });

      indexed += validChurches.length;
    }

    processed += churches.length;
    cursor = churches[churches.length - 1].id;
    console.log(` Processed ${processed}/${maxItems} (${validChurches.length} had raw HTML)`);
  }

  // Report what was written, not just what was scanned: churches without raw
  // HTML are skipped above and must not be counted as classified pages.
  console.log(` Done: ${indexed} pages classified (${processed} churches scanned)`);
}
|
||||
|
||||
/**
 * Script entry point.
 *
 * Prints usage when neither `--all` nor `--collection <name>` was supplied.
 * Otherwise runs the populate function for each requested collection in turn.
 * A collection name without a populate function is reported and makes the
 * process finish with exit code 1, as does any thrown error.
 *
 * The exit status is set through `process.exitCode` rather than
 * `process.exit()`: `process.exit()` terminates immediately, which would skip
 * the `finally` block that disconnects Prisma and closes the pg pool.
 */
async function main() {
  try {
    if (!populateAll && !collectionArg) {
      console.log('Usage:');
      console.log(' npx tsx scripts/populate-chromadb.ts --collection church_identity');
      console.log(' npx tsx scripts/populate-chromadb.ts --collection page_classification');
      console.log(' npx tsx scripts/populate-chromadb.ts --all');
      console.log(' npx tsx scripts/populate-chromadb.ts --all --batch-size 50 --limit 1000');
      // Plain return (exit code stays 0) so the cleanup in `finally` still runs.
      return;
    }

    // The assertion is unchecked; names that do not match a case below fall
    // through to `default`, which reports them.
    const collectionsToPopulate: CollectionName[] = populateAll
      ? [COLLECTION_NAMES.CHURCH_IDENTITY, COLLECTION_NAMES.PAGE_CLASSIFICATION]
      : [collectionArg as CollectionName];

    let allPopulated = true;

    for (const name of collectionsToPopulate) {
      switch (name) {
        case COLLECTION_NAMES.CHURCH_IDENTITY:
          await populateChurchIdentity();
          break;
        case COLLECTION_NAMES.PAGE_CLASSIFICATION:
          await populatePageClassification();
          break;
        default:
          console.log(`Collection '${name}' does not have a populate function yet.`);
          console.log('Available: church_identity, page_classification');
          // Nothing was populated for this name: signal failure to the caller.
          allPopulated = false;
          process.exitCode = 1;
      }
    }

    if (allPopulated) {
      console.log('\nPopulation complete!');
    }
  } catch (error) {
    console.error('Error:', error);
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

// Errors from the populate functions are handled inside main(); this handler
// only catches a failure of the cleanup in its `finally` block.
main().catch((error: unknown) => {
  console.error('Error:', error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user