Files
ScraperControl/scripts/debug/test-top5-countries.ts

194 lines
6.5 KiB
TypeScript
Raw Normal View History

#!/usr/bin/env tsx
/**
* Quick test of top 5 priority countries
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Database wiring for this script: a pg connection pool, wrapped by the Prisma
// pg driver adapter, which in turn backs the Prisma client used for queries.
// The connection string is read from DATABASE_URL after dotenv has loaded.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** A country exercised by this debug run. */
interface PriorityCountry {
  /** Country code used to filter churches and to configure the scraper. */
  code: string;
  /** Human-readable name used in console output. */
  name: string;
}

/** Countries tested by this script, in the order they are processed. */
const COUNTRIES: PriorityCountry[] = [
  { code: 'FR', name: 'France' },
  { code: 'DE', name: 'Germany' },
  { code: 'ES', name: 'Spain' },
  { code: 'PL', name: 'Poland' },
  { code: 'BR', name: 'Brazil' },
];

/** Maximum number of churches sampled per country. */
const PER_COUNTRY = 10;
/**
 * Aggregated scrape outcome for one country, as collected by the test run
 * and printed in the final summary table.
 */
interface CountryResult {
  // --- identity ---
  /** Country code the churches were filtered by. */
  country: string;
  /** Display name of the country. */
  countryName: string;

  // --- counts ---
  /** Number of churches that were scraped for this country. */
  tested: number;
  /** Scrapes that reported success and returned at least one schedule. */
  success: number;
  /** Scrapes that returned no schedules or threw an error. */
  failed: number;
  /** `success / tested`, expressed as a percentage (0-100). */
  successRate: number;
  /**
   * Failed scrapes whose raw HTML nevertheless mentioned both day names and
   * times — i.e. content was present but parsing failed.
   */
  hasBothButFailed: number;
  /** Sum of schedules extracted across all successful scrapes. */
  totalSchedules: number;

  // --- sample ---
  /** Short description of the first successful scrape, if there was one. */
  sampleSuccess?: string;
}
/**
 * Weekday names in English, French, German, Spanish, Polish and Portuguese
 * (with and without diacritics). Unicode-aware lookarounds are used instead of
 * `\b`, because `\b` only treats ASCII as word characters and therefore can
 * never match a word that starts with a non-ASCII letter such as "środa".
 * Intended to be run against lower-cased text.
 */
const DAY_NAME_PATTERN =
  /(?<![\p{L}\p{N}_])(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|terça|terca|quarta|quinta|sexta)(?![\p{L}\p{N}_])/u;

/**
 * Clock-time-like tokens: "10h", "10h30", "18:30", "18.00".
 * Minutes are mandatory after ":" or "." so that ordinary numbers followed by
 * punctuation ("24. Dezember", "page 3:") are not mistaken for times.
 */
const TIME_PATTERN = /\b\d{1,2}(?:h(?:\s*\d{2})?|[:.]\s*\d{2})(?!\d)/;

/** Column subset of a result that is rendered in the summary table. */
type SummaryStats = Pick<
  CountryResult,
  'tested' | 'success' | 'successRate' | 'totalSchedules' | 'hasBothButFailed'
>;

/**
 * Heuristic check used to flag likely parser bugs: strips scripts, styles and
 * tags from the HTML and reports whether the remaining text mentions both a
 * weekday name and a clock time.
 */
function mentionsDaysAndTimes(rawHtml: string): boolean {
  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
  return DAY_NAME_PATTERN.test(text) && TIME_PATTERN.test(text);
}

/** Formats one row of the summary table using the shared column widths. */
function formatSummaryRow(label: string, stats: SummaryStats): string {
  const bugs = stats.hasBothButFailed > 0 ? `⚠️ ${stats.hasBothButFailed}` : '✓';
  return [
    label.padEnd(12),
    String(stats.tested).padStart(6),
    String(stats.success).padStart(7),
    `${stats.successRate.toFixed(0)}%`.padStart(5),
    String(stats.totalSchedules).padStart(9),
    bugs,
  ].join(' | ');
}

/** Prints the per-country table, the overall row and the closing totals. */
function printFinalSummary(results: CountryResult[]): void {
  const totalTested = results.reduce((sum, r) => sum + r.tested, 0);
  const totalSuccess = results.reduce((sum, r) => sum + r.success, 0);
  const totalSchedules = results.reduce((sum, r) => sum + r.totalSchedules, 0);
  const totalBugs = results.reduce((sum, r) => sum + r.hasBothButFailed, 0);
  const avgRate = totalTested > 0 ? (totalSuccess / totalTested) * 100 : 0;

  console.log('\n\n');
  console.log('═'.repeat(80));
  console.log('FINAL RESULTS - TOP 5 COUNTRIES');
  console.log('═'.repeat(80));
  console.log('');
  // Header cells are padded to the same widths as the data rows so columns line up.
  console.log(
    [
      'Country'.padEnd(12),
      'Tested'.padStart(6),
      'Success'.padStart(7),
      'Rate'.padStart(5),
      'Schedules'.padStart(9),
      'Bugs',
    ].join(' | '),
  );
  console.log('─'.repeat(80));
  for (const r of results) {
    console.log(formatSummaryRow(r.countryName, r));
  }
  console.log('─'.repeat(80));
  console.log(
    formatSummaryRow('OVERALL', {
      tested: totalTested,
      success: totalSuccess,
      successRate: avgRate,
      totalSchedules,
      hasBothButFailed: totalBugs,
    }),
  );
  console.log('');
  console.log('═'.repeat(80));
  console.log('');
  if (totalBugs > 0) {
    console.log(`⚠️ ${totalBugs} parsing bugs detected (has days + times but failed)`);
    console.log(' These need investigation and fixes.\n');
  } else {
    console.log('✅ No parsing bugs! All failures are legitimate (no content or wrong page).\n');
  }
  console.log(`Total churches tested: ${totalTested}`);
  console.log(`Total successful: ${totalSuccess} (${avgRate.toFixed(1)}%)`);
  console.log(`Total mass schedules extracted: ${totalSchedules}`);
  console.log('');
}

/**
 * Scrapes up to PER_COUNTRY OSM-sourced churches (oldest first) that have a
 * website for one country, logging progress per church.
 *
 * @returns The aggregated result, or `undefined` when no matching churches
 *   exist for the country (in which case it is left out of the summary).
 */
async function testCountry(
  scraper: GenericScraper,
  country: { code: string; name: string },
): Promise<CountryResult | undefined> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`Testing ${country.name} (${country.code})`);
  console.log('='.repeat(60));

  const churches = await prisma.church.findMany({
    where: {
      country: country.code,
      website: { not: null },
      source: 'osm',
    },
    take: PER_COUNTRY,
    orderBy: { createdAt: 'asc' },
  });
  if (churches.length === 0) {
    console.log(`No churches with websites found for ${country.name}\n`);
    return undefined;
  }

  scraper.setCountry(country.code);
  let success = 0;
  let failed = 0;
  let hasBothButFailed = 0;
  let totalSchedules = 0;
  let sampleSuccess: string | undefined;

  for (const [i, church] of churches.entries()) {
    process.stdout.write(`[${i + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `);
    try {
      // The query above filters on `website: { not: null }`, so the assertion is safe.
      const result = await scraper.scrape(church.website!);
      if (result.success && result.schedules.length > 0) {
        success++;
        totalSchedules += result.schedules.length;
        process.stdout.write(`${result.schedules.length} schedules\n`);
        if (!sampleSuccess) {
          sampleSuccess = `${church.name}: ${result.schedules.length} schedules`;
        }
      } else {
        failed++;
        // A "successful" scrape with zero schedules carries no error message;
        // print a meaningful reason instead of the string "undefined".
        process.stdout.write(`${result.error ?? 'No schedules found'}\n`);
        // Day names plus times in the page text suggest a parsing bug rather
        // than a page that simply has no schedule.
        if (result.rawHtml && mentionsDaysAndTimes(result.rawHtml)) {
          hasBothButFailed++;
          process.stdout.write(` ⚠️ Has days + times but failed to parse\n`);
        }
      }
    } catch (err: unknown) {
      failed++;
      const message = err instanceof Error ? err.message : String(err);
      process.stdout.write(`❌ ERROR: ${message}\n`);
    }
  }

  const successRate = (success / churches.length) * 100;
  console.log(`\n${country.name} Summary: ${success}/${churches.length} (${successRate.toFixed(0)}%)`);
  console.log(` Total schedules extracted: ${totalSchedules}`);
  if (hasBothButFailed > 0) {
    console.log(` ⚠️ Parsing bugs: ${hasBothButFailed} (has content but failed to parse)`);
  }

  return {
    country: country.code,
    countryName: country.name,
    tested: churches.length,
    success,
    failed,
    successRate,
    hasBothButFailed,
    totalSchedules,
    sampleSuccess,
  };
}

/**
 * Entry point: runs the scraper against a small sample of churches in each
 * configured country and prints per-country and overall success statistics.
 *
 * The scraper and the database connections are released in `finally` blocks
 * so that a failure part-way through (for example a failing query) does not
 * leave them open and keep the process alive.
 */
async function testTop5(): Promise<void> {
  console.log(`Testing top 5 priority countries (${PER_COUNTRY} churches each)...\n`);
  const results: CountryResult[] = [];
  try {
    const scraper = new GenericScraper();
    await scraper.init();
    try {
      for (const country of COUNTRIES) {
        const result = await testCountry(scraper, country);
        if (result) {
          results.push(result);
        }
      }
    } finally {
      await scraper.close();
    }
    printFinalSummary(results);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
// Run the script. On an unhandled failure, log it and mark the process as
// failed so shells and CI see a non-zero exit status instead of 0.
testTop5().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});