chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
193
scripts/debug/test-top5-countries.ts
Normal file
193
scripts/debug/test-top5-countries.ts
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Quick test of top 5 priority countries
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Database wiring: Prisma runs on top of a node-postgres pool via the
// PrismaPg adapter. DATABASE_URL is read from the environment, which the
// dotenv config() calls earlier in this file populate from .env.local / .env.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/** (code, name) pairs for the countries this script samples, in run order. */
const COUNTRY_TABLE: ReadonlyArray<readonly [string, string]> = [
  ['FR', 'France'],
  ['DE', 'Germany'],
  ['ES', 'Spain'],
  ['PL', 'Poland'],
  ['BR', 'Brazil'],
];

/**
 * Countries to test. `code` is used for the church query's `country` filter
 * and for `scraper.setCountry`; `name` is only used in console output.
 */
const COUNTRIES = COUNTRY_TABLE.map(([code, name]) => ({ code, name }));
|
||||
|
||||
/** Maximum number of churches sampled per country (passed as the query's `take`). */
const PER_COUNTRY = 10;
|
||||
|
||||
/** Aggregated scrape outcome for a single country. */
interface CountryResult {
  // --- identity ---
  /** Country code taken from the COUNTRIES entry. */
  country: string;
  /** Human-readable country name used in the report. */
  countryName: string;

  // --- counters ---
  /** Number of churches that were attempted. */
  tested: number;
  /** Scrapes that reported success and returned at least one schedule. */
  success: number;
  /** Scrapes that returned no usable schedules or threw. */
  failed: number;
  /** Sum of schedule counts across all successful scrapes. */
  totalSchedules: number;
  /**
   * Failed scrapes whose raw HTML still contained both a weekday name and a
   * time-like token, i.e. pages that probably hold a schedule the parser missed.
   */
  hasBothButFailed: number;

  // --- derived ---
  /** `success / tested` expressed as a percentage (0-100). */
  successRate: number;
  /** "<church name>: <n> schedules" for the first successful scrape, if any. */
  sampleSuccess?: string;
}
|
||||
|
||||
/**
 * Weekday names (English, French, German, Spanish, Polish, Portuguese) as
 * whole words. JavaScript's `\b` only knows ASCII word characters, so a name
 * that starts with a non-ASCII letter (e.g. "środa") can never match behind
 * `\b`; Unicode-aware lookarounds are used instead. The input is lower-cased
 * before matching, so no `i` flag is needed.
 */
const DAY_NAME_PATTERN =
  /(?<![\p{L}\p{N}_])(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|terça|terca|quarta|quinta|sexta)(?![\p{L}\p{N}_])/u;

/**
 * Time-like token such as "18h30", "10:00" or "9.30".
 * NOTE(review): deliberately loose — it also matches "2." or "5:" — so the
 * "parsing bug" count is an upper bound, not an exact figure.
 */
const TIME_PATTERN = /\d{1,2}[h:\.]\s*\d{0,2}/;

/** One line of the final results table, before padding. */
interface SummaryRow {
  label: string;
  tested: number | string;
  success: number | string;
  rate: string;
  schedules: number | string;
  bugs: string;
}

/** Extracts a printable message from whatever value was thrown. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Strips scripts, styles and tags from `rawHtml` and reports whether the
 * remaining text contains both a weekday name and a time-like token.
 */
function looksLikeUnparsedSchedule(rawHtml: string): boolean {
  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();

  return DAY_NAME_PATTERN.test(text) && TIME_PATTERN.test(text);
}

/** Renders the "Bugs" cell: a warning with the count, or a check mark for zero. */
function formatBugs(count: number): string {
  return count > 0 ? `⚠️ ${count}` : '✓';
}

/** Pads every cell to its column width so header, data and total rows line up. */
function formatRow(row: SummaryRow): string {
  return [
    row.label.padEnd(12),
    String(row.tested).padStart(6),
    String(row.success).padStart(7),
    row.rate.padStart(5),
    String(row.schedules).padStart(9),
    row.bugs,
  ].join(' | ');
}

/**
 * Scrapes up to PER_COUNTRY OSM-sourced churches with a website for one
 * country, printing per-church progress and a short per-country summary.
 *
 * @returns the aggregated result, or `undefined` when no church matched.
 */
async function testCountry(
  scraper: GenericScraper,
  country: { code: string; name: string },
): Promise<CountryResult | undefined> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`Testing ${country.name} (${country.code})`);
  console.log('='.repeat(60));

  const churches = await prisma.church.findMany({
    where: {
      country: country.code,
      website: { not: null },
      source: 'osm',
    },
    take: PER_COUNTRY,
    orderBy: { createdAt: 'asc' },
  });

  if (churches.length === 0) {
    console.log(`No churches with websites found for ${country.name}\n`);
    return undefined;
  }

  scraper.setCountry(country.code);

  let success = 0;
  let failed = 0;
  let hasBothButFailed = 0;
  let totalSchedules = 0;
  let sampleSuccess: string | undefined;

  for (const [index, church] of churches.entries()) {
    process.stdout.write(
      `[${index + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `,
    );

    try {
      // The query above filters on `website: { not: null }`, so the assertion holds.
      const result = await scraper.scrape(church.website!);

      if (result.success && result.schedules.length > 0) {
        success++;
        totalSchedules += result.schedules.length;
        process.stdout.write(`✅ ${result.schedules.length} schedules\n`);

        if (sampleSuccess === undefined) {
          sampleSuccess = `${church.name}: ${result.schedules.length} schedules`;
        }
        continue;
      }

      failed++;
      // A scrape can come back without an error message (e.g. it "succeeded"
      // but found nothing); avoid printing the literal "undefined".
      process.stdout.write(`❌ ${result.error ?? 'No schedules found'}\n`);

      // Days + times present in the page but nothing parsed: likely a parser bug.
      if (result.rawHtml && looksLikeUnparsedSchedule(result.rawHtml)) {
        hasBothButFailed++;
        process.stdout.write(` ⚠️ Has days + times but failed to parse\n`);
      }
    } catch (err: unknown) {
      failed++;
      process.stdout.write(`❌ ERROR: ${describeError(err)}\n`);
    }
  }

  const successRate = (success / churches.length) * 100;

  console.log(`\n${country.name} Summary: ${success}/${churches.length} (${successRate.toFixed(0)}%)`);
  console.log(` Total schedules extracted: ${totalSchedules}`);
  if (hasBothButFailed > 0) {
    console.log(` ⚠️ Parsing bugs: ${hasBothButFailed} (has content but failed to parse)`);
  }

  return {
    country: country.code,
    countryName: country.name,
    tested: churches.length,
    success,
    failed,
    successRate,
    hasBothButFailed,
    totalSchedules,
    sampleSuccess,
  };
}

/** Prints the cross-country results table and the closing totals. */
function printFinalSummary(results: readonly CountryResult[]): void {
  console.log('\n\n');
  console.log('═'.repeat(80));
  console.log(`FINAL RESULTS - TOP ${COUNTRIES.length} COUNTRIES`);
  console.log('═'.repeat(80));
  console.log('');
  console.log(
    formatRow({
      label: 'Country',
      tested: 'Tested',
      success: 'Success',
      rate: 'Rate',
      schedules: 'Schedules',
      bugs: 'Bugs',
    }),
  );
  console.log('─'.repeat(80));

  let totalTested = 0;
  let totalSuccess = 0;
  let totalSchedules = 0;
  let totalBugs = 0;

  for (const r of results) {
    totalTested += r.tested;
    totalSuccess += r.success;
    totalSchedules += r.totalSchedules;
    totalBugs += r.hasBothButFailed;

    console.log(
      formatRow({
        label: r.countryName,
        tested: r.tested,
        success: r.success,
        rate: `${r.successRate.toFixed(0)}%`,
        schedules: r.totalSchedules,
        bugs: formatBugs(r.hasBothButFailed),
      }),
    );
  }

  console.log('─'.repeat(80));
  const avgRate = totalTested > 0 ? (totalSuccess / totalTested) * 100 : 0;
  console.log(
    formatRow({
      label: 'OVERALL',
      tested: totalTested,
      success: totalSuccess,
      rate: `${avgRate.toFixed(0)}%`,
      schedules: totalSchedules,
      bugs: formatBugs(totalBugs),
    }),
  );
  console.log('');
  console.log('═'.repeat(80));
  console.log('');

  if (totalBugs > 0) {
    console.log(`⚠️ ${totalBugs} parsing bugs detected (has days + times but failed)`);
    console.log(' These need investigation and fixes.\n');
  } else {
    console.log('✅ No parsing bugs! All failures are legitimate (no content or wrong page).\n');
  }

  console.log(`Total churches tested: ${totalTested}`);
  console.log(`Total successful: ${totalSuccess} (${avgRate.toFixed(1)}%)`);
  console.log(`Total mass schedules extracted: ${totalSchedules}`);
  console.log('');
}

/**
 * Runs the generic scraper against a small sample of churches in each
 * configured country and prints per-country and overall success statistics.
 *
 * The scraper, the Prisma client and the pg pool are released in `finally`
 * blocks so a failing query or scrape cannot leave them open.
 */
async function testTop5(): Promise<void> {
  console.log(
    `Testing top ${COUNTRIES.length} priority countries (${PER_COUNTRY} churches each)...\n`,
  );

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    const results: CountryResult[] = [];
    try {
      for (const country of COUNTRIES) {
        const result = await testCountry(scraper, country);
        if (result) {
          results.push(result);
        }
      }
    } finally {
      await scraper.close();
    }

    printFinalSummary(results);
  } finally {
    // Always attempt both steps, even if the first one rejects.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
|
||||
|
||||
// Entry point: log any unhandled failure and make the process exit non-zero
// so callers (shell scripts, CI) can tell the run did not complete.
testTop5().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user