Files
ScraperControl/scripts/debug/analyze-enrichment-priority.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

166 lines
7.1 KiB
TypeScript

import { config } from 'dotenv';
import { PrismaClient } from '@prisma/client';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
// Read the dotenv files in order: `.env.local` first, then `.env`.
for (const path of ['.env.local', '.env']) {
  config({ path });
}

// The script cannot do anything without a database, so refuse to start
// when the connection string is missing or empty.
const { DATABASE_URL: connectionString } = process.env;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma client that talks to Postgres through a `pg` pool via the driver adapter.
// `pool` is kept at module level so the script can close it when it finishes.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Per-country aggregate of website coverage, used to rank countries for enrichment.
 */
interface CountryStats {
  /** Country value taken from the church record, or `'Unknown'` when it is missing. */
  country: string;

  /** Number of churches counted for this country. */
  totalChurches: number;
  /** Churches that are flagged as having a website or carry a website value. */
  withWebsite: number;
  /** Churches with neither a website flag nor a website value. */
  withoutWebsite: number;
  /** Churches that still need enrichment (counted alongside `withoutWebsite`). */
  needEnrichment: number;

  /** `withWebsite / totalChurches`, expressed as a percentage (0-100). */
  websitePercent: number;
  /** Ranking score; higher means the country should be enriched sooner. */
  priority: number;
}
/**
 * Ranks countries by how much website enrichment their OSM-sourced churches
 * still need, and prints the report to stdout.
 *
 * The report contains four sections: the full ranking table, a detailed top-10,
 * a timeline estimate, and a ready-to-paste `COUNTRY_PRIORITY` array literal.
 *
 * Errors are logged and reported through a non-zero exit code rather than by
 * terminating the process, so the database connections are always released.
 *
 * @returns A promise that resolves once the report is printed and the
 *          Prisma client and pg pool have been closed.
 */
async function analyzeEnrichmentPriority(): Promise<void> {
  // Throughput assumed for the timeline estimate (printed as the "free tier" rate).
  const CHURCHES_PER_DAY = 390;
  const RULE = '═══════════════════════════════════════════════════════════════════════════';

  try {
    console.log('Analyzing enrichment priority by country...\n');

    // Fetch only the fields needed to decide whether a church has a website.
    const churches = await prisma.church.findMany({
      where: {
        source: 'osm',
      },
      select: {
        country: true,
        hasWebsite: true,
        website: true,
      },
    });

    // Group by country. A Map avoids the prototype-key pitfalls of a plain
    // object keyed by arbitrary strings and keeps insertion order.
    const byCountry = new Map<string, CountryStats>();
    for (const church of churches) {
      // `||` on purpose: an empty country string is treated like a missing one.
      const country = church.country || 'Unknown';

      let stat = byCountry.get(country);
      if (!stat) {
        stat = {
          country,
          totalChurches: 0,
          withWebsite: 0,
          withoutWebsite: 0,
          websitePercent: 0,
          needEnrichment: 0,
          priority: 0,
        };
        byCountry.set(country, stat);
      }

      stat.totalChurches++;
      if (church.hasWebsite || church.website) {
        stat.withWebsite++;
      } else {
        stat.withoutWebsite++;
        stat.needEnrichment++;
      }
    }

    // Calculate percentages and priority score.
    const stats = [...byCountry.values()];
    for (const stat of stats) {
      // totalChurches is at least 1 for every entry, so this never divides by zero.
      stat.websitePercent = (stat.withWebsite / stat.totalChurches) * 100;

      // Priority formula:
      // - 80% of the score is the raw number of churches needing enrichment
      // - 20% is that same number scaled by the share of missing coverage
      // This favors large countries with low coverage.
      const needWeight = stat.needEnrichment / 1000; // Normalize to thousands
      const coverageGap = 100 - stat.websitePercent; // How much coverage is missing
      stat.priority = needWeight * 0.8 + (coverageGap / 100) * needWeight * 0.2;
    }

    // Sort by priority (highest first).
    stats.sort((a, b) => b.priority - a.priority);

    // Display results.
    console.log(RULE);
    console.log('ENRICHMENT PRIORITY RANKING');
    console.log(RULE);
    console.log('');
    // The message states the formula exactly as computed above.
    console.log('Priority formula: (churches_needing_enrichment / 1000) * (0.8 + 0.2 * coverage_gap), coverage_gap in [0, 1]');
    console.log('This favors countries with many churches and low website coverage.');
    console.log('');
    console.log('Rank | Country | Total | Need Enrichment | Coverage | Priority Score');
    console.log('─────┼─────────┼───────┼────────────────┼──────────┼────────────────');
    stats.forEach((stat, index) => {
      const rank = String(index + 1).padStart(4);
      const country = stat.country.padEnd(7);
      const total = String(stat.totalChurches).padStart(5);
      const need = String(stat.needEnrichment).padStart(15);
      const coverage = `${stat.websitePercent.toFixed(1)}%`.padStart(8);
      const priority = stat.priority.toFixed(2).padStart(14);
      console.log(`${rank} | ${country} | ${total} | ${need} | ${coverage} | ${priority}`);
    });
    console.log('');
    console.log(RULE);
    console.log('');

    // Show top 10 with details.
    console.log('TOP 10 COUNTRIES TO PRIORITIZE:');
    console.log('');
    stats.slice(0, 10).forEach((stat, index) => {
      console.log(`${index + 1}. ${stat.country}`);
      console.log(` Total churches: ${stat.totalChurches.toLocaleString()}`);
      console.log(` Need enrichment: ${stat.needEnrichment.toLocaleString()} (${(100 - stat.websitePercent).toFixed(1)}% missing)`);
      console.log(` Current coverage: ${stat.websitePercent.toFixed(1)}%`);
      console.log(` Priority score: ${stat.priority.toFixed(2)}`);
      console.log('');
    });

    // Calculate enrichment timeline.
    const totalNeedEnrichment = stats.reduce((sum, s) => sum + s.needEnrichment, 0);
    const daysAtFullSpeed = Math.ceil(totalNeedEnrichment / CHURCHES_PER_DAY);
    const monthsAtFullSpeed = (daysAtFullSpeed / 30).toFixed(1);
    console.log(RULE);
    console.log('ENRICHMENT TIMELINE');
    console.log(RULE);
    console.log(`Total churches needing enrichment: ${totalNeedEnrichment.toLocaleString()}`);
    console.log(`At ${CHURCHES_PER_DAY} churches/day (free tier): ${daysAtFullSpeed} days (~${monthsAtFullSpeed} months)`);
    console.log('');

    // Output country priority order for the script.
    console.log(RULE);
    console.log('COUNTRY PRIORITY ORDER (for enrichment script)');
    console.log(RULE);
    console.log('');
    console.log('const COUNTRY_PRIORITY = [');
    // Filter once; the last-element check below needs the filtered length.
    const needingEnrichment = stats.filter((s) => s.needEnrichment > 0);
    needingEnrichment.forEach((stat, index) => {
      const comma = index < needingEnrichment.length - 1 ? ',' : '';
      console.log(` '${stat.country}'${comma} // ${stat.needEnrichment.toLocaleString()} churches`);
    });
    console.log('];');
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // Set the exit code instead of calling process.exit(): exit() would end the
    // process immediately and skip the cleanup in `finally`.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
// Script entry point. The returned promise is deliberately not awaited;
// `void` makes that explicit.
void analyzeEnrichmentPriority();