chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions
--- a/scripts/populate-chromadb.ts
+++ b/scripts/populate-chromadb.ts
@@ -0,0 +1,197 @@
+/**
+ * Bulk-populate ChromaDB collections from the database.
+ *
+ * Usage:
+ *   npx tsx scripts/populate-chromadb.ts --collection church_identity
+ *   npx tsx scripts/populate-chromadb.ts --collection page_classification
+ *   npx tsx scripts/populate-chromadb.ts --all
+ *   npx tsx scripts/populate-chromadb.ts --all --batch-size 50 --limit 1000
+ */
+
+import { Pool } from 'pg';
+import { PrismaPg } from '@prisma/adapter-pg';
+import { PrismaClient } from '@prisma/client';
+import { getCollection, COLLECTION_NAMES, CollectionName } from '../src/chromadb/collections';
+import { embed } from '../src/chromadb/embeddings';
+
+// Database access: a pg connection pool wrapped by the Prisma pg adapter.
+const pool = new Pool({ connectionString: process.env.DATABASE_URL });
+const adapter = new PrismaPg(pool);
+const prisma = new PrismaClient({ adapter });
+
+// Command-line arguments with the node binary and script path stripped.
+const args = process.argv.slice(2);
+
+/**
+ * Returns the token that follows `flag` on the command line.
+ *
+ * @param flag - Option name including leading dashes, e.g. `--collection`.
+ * @returns The following token, or `null` when the flag is absent or is the
+ *   last argument.
+ */
+function readFlagValue(flag: string): string | null {
+  const index = args.indexOf(flag);
+  if (index === -1) return null;
+  return args[index + 1] ?? null;
+}
+
+/**
+ * Reads an integer-valued option, failing fast on bad input.
+ *
+ * A missing or non-numeric value previously became `NaN` and was passed on
+ * to the batching arithmetic and the database query; here it is reported
+ * and the script exits with status 1 instead.
+ *
+ * @param flag - Option name including leading dashes.
+ * @param fallback - Value used when the flag is not present at all.
+ * @param min - Smallest accepted value (inclusive).
+ * @returns The parsed integer, or `fallback` when the flag is absent.
+ */
+function readIntFlag(flag: string, fallback: number, min: number): number {
+  if (!args.includes(flag)) return fallback;
+  const raw = readFlagValue(flag);
+  const value = raw === null ? NaN : Number(raw);
+  if (!Number.isInteger(value) || value < min) {
+    console.error(
+      `Invalid value for ${flag}: ${raw ?? '(missing)'} (expected an integer >= ${min})`
+    );
+    process.exit(1);
+  }
+  return value;
+}
+
+// Name of the single collection to populate, if one was requested.
+const collectionArg = readFlagValue('--collection');
+// When set, every collection that has a populate function is processed.
+const populateAll = args.includes('--all');
+// Rows fetched, embedded and upserted per round trip; must be at least 1.
+const batchSize = readIntFlag('--batch-size', 100, 1);
+// Maximum rows to process per collection; 0 means "no limit".
+const limit = readIntFlag('--limit', 0, 0);
+
+/**
+ * Indexes churches into the `church_identity` collection.
+ *
+ * Walks the church table in ascending id order using cursor pagination,
+ * embeds a "name address city country" string for each row and upserts the
+ * result under ids of the form `church-<id>`. The module-level `limit` and
+ * `batchSize` values bound the total number of rows and the page size.
+ */
+async function populateChurchIdentity() {
+  console.log('\n=== Populating church_identity ===');
+  const collection = await getCollection(COLLECTION_NAMES.CHURCH_IDENTITY);
+
+  const totalCount = await prisma.church.count();
+  // A limit of 0 (or less) means "process everything".
+  const target = limit > 0 ? Math.min(limit, totalCount) : totalCount;
+  console.log(`Total churches: ${totalCount}, processing: ${target}`);
+
+  let done = 0;
+  let lastSeenId: string | undefined = undefined;
+
+  while (done < target) {
+    // After the first page, resume just past the last row already handled.
+    const paging = lastSeenId ? { skip: 1, cursor: { id: lastSeenId } } : {};
+    const rows = await prisma.church.findMany({
+      take: Math.min(batchSize, target - done),
+      ...paging,
+      orderBy: { id: 'asc' },
+      select: {
+        id: true,
+        name: true,
+        address: true,
+        city: true,
+        country: true,
+        source: true,
+        latitude: true,
+        longitude: true,
+      },
+    });
+
+    // The table ran out before the target was reached.
+    if (rows.length === 0) break;
+
+    const ids: string[] = [];
+    const documents: string[] = [];
+    for (const church of rows) {
+      ids.push(`church-${church.id}`);
+      documents.push(
+        `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim()
+      );
+    }
+    const metadatas = rows.map((church) => ({
+      churchId: church.id,
+      country: church.country,
+      source: church.source,
+      lat: church.latitude,
+      lng: church.longitude,
+    }));
+
+    const embeddings = await embed(documents);
+    await collection.upsert({ ids, embeddings, documents, metadatas });
+
+    const tail = rows[rows.length - 1];
+    done += rows.length;
+    lastSeenId = tail.id;
+    console.log(`  Processed ${done}/${target}`);
+  }
+
+  console.log(`  Done: ${done} churches indexed`);
+}
+
+/**
+ * Indexes scraped pages into the `page_classification` collection.
+ *
+ * Considers only churches that have a `lastScrapedAt` value and at least
+ * one active mass schedule. For each such church with stored raw HTML, the
+ * first 2000 characters of that HTML are embedded and upserted under the id
+ * `page-<churchId>`, labelled `isMassSchedulePage: true`. Churches without
+ * raw HTML are scanned but not indexed; the final summary reports both
+ * numbers so the indexed count is not overstated.
+ *
+ * The module-level `limit` and `batchSize` values bound the number of
+ * churches scanned and the page size.
+ */
+async function populatePageClassification() {
+  // Raw HTML is truncated to this many characters before embedding.
+  const MAX_DOCUMENT_CHARS = 2000;
+
+  console.log('\n=== Populating page_classification ===');
+  const collection = await getCollection(COLLECTION_NAMES.PAGE_CLASSIFICATION);
+
+  // Index churches that have been successfully scraped (have mass schedules)
+  const totalCount = await prisma.church.count({
+    where: {
+      lastScrapedAt: { not: null },
+      massSchedules: { some: { isActive: true } },
+    },
+  });
+  // A limit of 0 (or less) means "process everything".
+  const maxItems = limit > 0 ? Math.min(limit, totalCount) : totalCount;
+  console.log(`Scraped churches with schedules: ${totalCount}, processing: ${maxItems}`);
+
+  // `processed` counts churches scanned; `indexed` counts pages actually upserted.
+  let processed = 0;
+  let indexed = 0;
+  let cursor: string | undefined = undefined;
+
+  while (processed < maxItems) {
+    const currentBatch = Math.min(batchSize, maxItems - processed);
+    const churches = await prisma.church.findMany({
+      take: currentBatch,
+      // After the first page, resume just past the last row already handled.
+      ...(cursor ? { skip: 1, cursor: { id: cursor } } : {}),
+      where: {
+        lastScrapedAt: { not: null },
+        massSchedules: { some: { isActive: true } },
+      },
+      orderBy: { id: 'asc' },
+      select: {
+        id: true,
+        massScheduleUrl: true,
+        website: true,
+        websiteLanguage: true,
+        scraperConfig: { select: { rawHtml: true } },
+      },
+    });
+
+    if (churches.length === 0) break;
+
+    // Use stored raw HTML (truncated) as the document
+    const validChurches = churches.filter((c) => c.scraperConfig?.rawHtml);
+    if (validChurches.length > 0) {
+      const documents = validChurches.map(
+        (c) => (c.scraperConfig?.rawHtml || '').slice(0, MAX_DOCUMENT_CHARS)
+      );
+
+      const embeddings = await embed(documents);
+
+      await collection.upsert({
+        ids: validChurches.map((c) => `page-${c.id}`),
+        embeddings,
+        documents,
+        metadatas: validChurches.map((c) => ({
+          url: c.massScheduleUrl || c.website || '',
+          isMassSchedulePage: true,
+          language: c.websiteLanguage || 'unknown',
+        })),
+      });
+
+      indexed += validChurches.length;
+    }
+
+    processed += churches.length;
+    cursor = churches[churches.length - 1].id;
+    console.log(`  Processed ${processed}/${maxItems} (${validChurches.length} had raw HTML)`);
+  }
+
+  console.log(`  Done: ${indexed} pages classified (${processed} churches scanned)`);
+}
+
+/**
+ * Script entry point.
+ *
+ * Prints usage when neither `--all` nor `--collection <name>` was given;
+ * otherwise runs the populate function for each requested collection.
+ *
+ * Failures are signalled through `process.exitCode` rather than
+ * `process.exit()`: `process.exit()` terminates immediately, which would
+ * skip the `finally` block and leave the Prisma client and the pg pool
+ * without an orderly shutdown.
+ */
+async function main() {
+  try {
+    if (!populateAll && !collectionArg) {
+      console.log('Usage:');
+      console.log('  npx tsx scripts/populate-chromadb.ts --collection church_identity');
+      console.log('  npx tsx scripts/populate-chromadb.ts --collection page_classification');
+      console.log('  npx tsx scripts/populate-chromadb.ts --all');
+      console.log('  npx tsx scripts/populate-chromadb.ts --all --batch-size 50 --limit 1000');
+      // Returning (not exiting) lets the finally block release connections.
+      return;
+    }
+
+    const collectionsToPopulate: CollectionName[] = populateAll
+      ? [COLLECTION_NAMES.CHURCH_IDENTITY, COLLECTION_NAMES.PAGE_CLASSIFICATION]
+      : [collectionArg as CollectionName];
+
+    // Set when a requested collection has no populate function.
+    let unsupported = false;
+
+    for (const name of collectionsToPopulate) {
+      switch (name) {
+        case COLLECTION_NAMES.CHURCH_IDENTITY:
+          await populateChurchIdentity();
+          break;
+        case COLLECTION_NAMES.PAGE_CLASSIFICATION:
+          await populatePageClassification();
+          break;
+        default:
+          console.log(`Collection '${name}' does not have a populate function yet.`);
+          console.log('Available: church_identity, page_classification');
+          unsupported = true;
+      }
+    }
+
+    if (unsupported) {
+      // Nothing was populated for that name, so do not report success.
+      process.exitCode = 1;
+      return;
+    }
+
+    console.log('\nPopulation complete!');
+  } catch (error) {
+    console.error('Error:', error);
+    process.exitCode = 1;
+  } finally {
+    await prisma.$disconnect();
+    await pool.end();
+  }
+}
+
+// A rejection here can only come from the cleanup in `finally`; report it
+// instead of leaving an unhandled promise rejection.
+main().catch((error) => {
+  console.error('Error:', error);
+  process.exitCode = 1;
+});