#!/usr/bin/env tsx
/**
 * Quick test of top 5 priority countries.
 *
 * For every entry in COUNTRIES this script loads up to PER_COUNTRY churches
 * (rows with `source: 'osm'` and a non-null website, oldest first), runs
 * GenericScraper against each website, and prints per-country and overall
 * success statistics. Failed scrapes whose page text still contains both a
 * day name and something time-like are flagged as likely parsing bugs.
 */

import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

const COUNTRIES = [
  { code: 'FR', name: 'France' },
  { code: 'DE', name: 'Germany' },
  { code: 'ES', name: 'Spain' },
  { code: 'PL', name: 'Poland' },
  { code: 'BR', name: 'Brazil' },
];

const PER_COUNTRY = 10;

/** Aggregated scrape statistics for one country. */
interface CountryResult {
  country: string;
  countryName: string;
  tested: number;
  success: number;
  failed: number;
  successRate: number;
  hasBothButFailed: number; // Has days + times but parsing failed
  totalSchedules: number;
  sampleSuccess?: string;
}

/**
 * Day names in English, French, German, Spanish, Polish and Portuguese.
 *
 * Unicode-aware lookarounds are used instead of `\b`: JavaScript's `\b` only
 * treats ASCII letters/digits/underscore as word characters, so a word that
 * starts with a non-ASCII letter (e.g. "środa") could never match after a space.
 */
const DAY_NAME_PATTERN =
  /(?<![\p{L}\p{N}_])(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|terça|terca|quarta|quinta|sexta)(?![\p{L}\p{N}_])/iu;

/** Loose time heuristic: 1-2 digits, then `h`, `:` or `.`, then up to 2 digits. */
const TIME_PATTERN = /\d{1,2}[h:.]\s*\d{0,2}/;

/**
 * Reduces an HTML document to lower-cased text with collapsed whitespace.
 *
 * Script and style elements are removed together with their contents; every
 * other tag is replaced by a space.
 *
 * @param html Raw HTML markup.
 * @returns Lower-cased text content.
 */
function htmlToText(html: string): string {
  return html
    .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * Reports whether text contains both a known day name and a time-like token.
 *
 * @param text Plain text, e.g. the output of {@link htmlToText}.
 */
function hasDaysAndTimes(text: string): boolean {
  return DAY_NAME_PATTERN.test(text) && TIME_PATTERN.test(text);
}

/** Formats one line of the final results table with fixed column widths. */
function formatRow(
  country: string,
  tested: string,
  success: string,
  rate: string,
  schedules: string,
  bugs: string,
): string {
  return [
    country.padEnd(12),
    tested.padStart(6),
    success.padStart(7),
    rate.padStart(5),
    schedules.padStart(9),
    bugs,
  ].join(' | ');
}

/** Renders the "Bugs" table cell: a warning with the count, or a check mark. */
function formatBugs(count: number): string {
  return count > 0 ? `⚠️ ${count}` : '✓';
}

/**
 * Scrapes up to PER_COUNTRY churches of one country and prints progress.
 *
 * @param scraper Scraper instance on which `init()` has already been awaited.
 * @param country Country code and display name.
 * @returns The country's statistics, or `undefined` when the query returned no churches.
 */
async function testCountry(
  scraper: GenericScraper,
  country: (typeof COUNTRIES)[number],
): Promise<CountryResult | undefined> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`Testing ${country.name} (${country.code})`);
  console.log('='.repeat(60));

  const churches = await prisma.church.findMany({
    where: {
      country: country.code,
      website: { not: null },
      source: 'osm',
    },
    take: PER_COUNTRY,
    orderBy: { createdAt: 'asc' },
  });

  if (churches.length === 0) {
    console.log(`No churches with websites found for ${country.name}\n`);
    return undefined;
  }

  scraper.setCountry(country.code);

  let success = 0;
  let failed = 0;
  let hasBothButFailed = 0;
  let totalSchedules = 0;
  let sampleSuccess: string | undefined;

  for (const [index, church] of churches.entries()) {
    process.stdout.write(
      `[${index + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `,
    );

    try {
      // The query above filters on `website: { not: null }`, so the assertion is safe.
      const result = await scraper.scrape(church.website!);

      if (result.success && result.schedules.length > 0) {
        success++;
        totalSchedules += result.schedules.length;
        process.stdout.write(`✅ ${result.schedules.length} schedules\n`);
        if (!sampleSuccess) {
          sampleSuccess = `${church.name}: ${result.schedules.length} schedules`;
        }
        continue;
      }

      failed++;
      // A scrape can report success yet yield zero schedules, in which case
      // there is no error message to show.
      process.stdout.write(`❌ ${result.error ?? 'No schedules found'}\n`);

      // Check if has both days and times (parsing bug indicator)
      if (result.rawHtml && hasDaysAndTimes(htmlToText(result.rawHtml))) {
        hasBothButFailed++;
        process.stdout.write(` ⚠️ Has days + times but failed to parse\n`);
      }
    } catch (err: unknown) {
      failed++;
      const message = err instanceof Error ? err.message : String(err);
      process.stdout.write(`❌ ERROR: ${message}\n`);
    }
  }

  // churches is non-empty here, so the division is well-defined.
  const successRate = (success / churches.length) * 100;

  console.log(
    `\n${country.name} Summary: ${success}/${churches.length} (${successRate.toFixed(0)}%)`,
  );
  console.log(` Total schedules extracted: ${totalSchedules}`);
  if (hasBothButFailed > 0) {
    console.log(` ⚠️ Parsing bugs: ${hasBothButFailed} (has content but failed to parse)`);
  }

  return {
    country: country.code,
    countryName: country.name,
    tested: churches.length,
    success,
    failed,
    successRate,
    hasBothButFailed,
    totalSchedules,
    sampleSuccess,
  };
}

/**
 * Prints the results table and the overall totals.
 *
 * @param results One entry per country that had at least one church tested.
 */
function printFinalSummary(results: readonly CountryResult[]): void {
  let totalTested = 0;
  let totalSuccess = 0;
  let totalSchedules = 0;
  let totalBugs = 0;
  for (const r of results) {
    totalTested += r.tested;
    totalSuccess += r.success;
    totalSchedules += r.totalSchedules;
    totalBugs += r.hasBothButFailed;
  }
  const avgRate = totalTested > 0 ? (totalSuccess / totalTested) * 100 : 0;

  console.log('\n\n');
  console.log('═'.repeat(80));
  console.log(`FINAL RESULTS - TOP ${COUNTRIES.length} COUNTRIES`);
  console.log('═'.repeat(80));
  console.log('');
  // Header, data rows and the OVERALL row share one formatter so columns line up.
  console.log(formatRow('Country', 'Tested', 'Success', 'Rate', 'Schedules', 'Bugs'));
  console.log('─'.repeat(80));

  for (const r of results) {
    console.log(
      formatRow(
        r.countryName,
        String(r.tested),
        String(r.success),
        `${r.successRate.toFixed(0)}%`,
        String(r.totalSchedules),
        formatBugs(r.hasBothButFailed),
      ),
    );
  }

  console.log('─'.repeat(80));
  console.log(
    formatRow(
      'OVERALL',
      String(totalTested),
      String(totalSuccess),
      `${avgRate.toFixed(0)}%`,
      String(totalSchedules),
      formatBugs(totalBugs),
    ),
  );
  console.log('');
  console.log('═'.repeat(80));
  console.log('');

  if (totalBugs > 0) {
    console.log(`⚠️ ${totalBugs} parsing bugs detected (has days + times but failed)`);
    console.log(' These need investigation and fixes.\n');
  } else {
    console.log('✅ No parsing bugs! All failures are legitimate (no content or wrong page).\n');
  }

  console.log(`Total churches tested: ${totalTested}`);
  console.log(`Total successful: ${totalSuccess} (${avgRate.toFixed(1)}%)`);
  console.log(`Total mass schedules extracted: ${totalSchedules}`);
  console.log('');
}

/**
 * Entry point: tests every country in COUNTRIES and prints the final report.
 *
 * The scraper is closed and the database connections are released even when a
 * query or scrape throws, so a failure does not leave them open.
 */
async function testTop5(): Promise<void> {
  console.log(
    `Testing top ${COUNTRIES.length} priority countries (${PER_COUNTRY} churches each)...\n`,
  );

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    const results: CountryResult[] = [];
    try {
      for (const country of COUNTRIES) {
        const result = await testCountry(scraper, country);
        if (result) {
          results.push(result);
        }
      }
    } finally {
      await scraper.close();
    }

    printFinalSummary(results);
  } finally {
    // Release the pool even if disconnecting the Prisma client fails.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}

testTop5().catch((err: unknown) => {
  console.error(err);
  // Signal failure to the calling shell / CI instead of exiting with 0.
  process.exitCode = 1;
});