194 lines
6.5 KiB
TypeScript
194 lines
6.5 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
|
||
|
|
* Quick test of top 5 priority countries
|
||
|
|
*/
|
||
|
|
|
||
|
|
import { config } from 'dotenv';
|
||
|
|
// Load environment files in priority order: `.env.local` is read first, then
// `.env`. (Assuming dotenv's default of not overriding variables that are
// already set, values from the earlier file take precedence.)
for (const path of ['.env.local', '.env']) {
  config({ path });
}
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
// Database access: a node-postgres pool built from DATABASE_URL, wrapped in the
// Prisma pg adapter, which is then handed to the Prisma client.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
/** Number of churches sampled per country. */
const PER_COUNTRY = 10;

/**
 * Countries to test, in run order. Each entry carries the country code used
 * for the database filter and scraper configuration, plus a display name.
 */
const COUNTRIES: { code: string; name: string }[] = Object.entries({
  FR: 'France',
  DE: 'Germany',
  ES: 'Spain',
  PL: 'Poland',
  BR: 'Brazil',
}).map(([code, name]) => ({ code, name }));
|
||
|
|
|
||
|
|
/** Aggregated outcome of the scrape test for a single country. */
interface CountryResult {
  /** Country code the churches were filtered by. */
  country: string;
  /** Human-readable country name used in the printed reports. */
  countryName: string;

  /** Number of churches that were attempted. */
  tested: number;
  /** Churches for which the scraper returned at least one schedule. */
  success: number;
  /** Churches for which scraping failed or threw. */
  failed: number;
  /** `success / tested`, expressed as a percentage (0-100). */
  successRate: number;

  /** Sum of schedules extracted across all successful churches. */
  totalSchedules: number;
  /** Failures whose page text contained both days and times (likely parser bugs). */
  hasBothButFailed: number;

  /** One-line description of the first successful church, if any. */
  sampleSuccess?: string;
}
|
||
|
|
|
||
|
|
/**
 * Day names in English, French, German, Spanish, Polish and Portuguese.
 *
 * Unicode-aware lookarounds are used instead of `\b`: without the `u` flag `\b`
 * only knows ASCII word characters, so a name starting with a non-ASCII letter
 * (e.g. "środa") would never match after whitespace.
 */
const DAY_NAME_PATTERN =
  /(?<![\p{L}\p{N}_])(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|terça|terca|quarta|quinta|sexta)(?![\p{L}\p{N}_])/u;

/**
 * Clock times such as "10:30", "10.30", "10h30", "10 h 30" or "10h".
 *
 * Minutes (or an `h` marker) are required so that a bare number followed by
 * punctuation ("1950.", "1.") is not mistaken for a time.
 */
const TIME_PATTERN =
  /(?<!\d)(?:[01]?\d|2[0-3])(?:[:.][0-5]\d(?!\d)|\s?h(?:\s?[0-5]\d(?!\d)|(?![\p{L}\p{N}])))/u;

/**
 * Returns true when the visible text of an HTML document mentions both a day
 * name and a clock time. Used as a hint that schedule content existed but the
 * scraper failed to parse it.
 *
 * @param rawHtml Raw HTML of the scraped page.
 */
function hasDaysAndTimes(rawHtml: string): boolean {
  // Strip scripts, styles and tags, then normalise whitespace and case.
  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();

  return DAY_NAME_PATTERN.test(text) && TIME_PATTERN.test(text);
}

/**
 * Formats one row of the final results table. Every row (per-country and
 * OVERALL) goes through this function so the columns always line up.
 *
 * @param label       First-column label (country name or "OVERALL").
 * @param tested      Number of churches tested.
 * @param success     Number of successful scrapes.
 * @param ratePercent Success rate as a percentage (0-100).
 * @param schedules   Total schedules extracted.
 * @param bugs        Number of suspected parsing bugs.
 */
function formatSummaryRow(
  label: string,
  tested: number,
  success: number,
  ratePercent: number,
  schedules: number,
  bugs: number,
): string {
  const cells = [
    label.padEnd(12),
    String(tested).padStart(6),
    String(success).padStart(7),
    `${ratePercent.toFixed(0)}%`.padStart(5),
    String(schedules).padStart(9),
    bugs > 0 ? `⚠️ ${bugs}` : '✓',
  ];
  return cells.join(' | ');
}

/**
 * Scrapes up to PER_COUNTRY OSM-sourced churches of one country and prints a
 * per-church line plus a short country summary.
 *
 * @param scraper Initialised scraper instance shared across countries.
 * @param country Country entry from COUNTRIES.
 * @returns The aggregated result, or `undefined` when the database holds no
 *          matching churches for the country.
 */
async function testCountry(
  scraper: GenericScraper,
  country: (typeof COUNTRIES)[number],
): Promise<CountryResult | undefined> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`Testing ${country.name} (${country.code})`);
  console.log('='.repeat(60));

  const churches = await prisma.church.findMany({
    where: {
      country: country.code,
      website: { not: null },
      source: 'osm',
    },
    take: PER_COUNTRY,
    orderBy: { createdAt: 'asc' },
  });

  if (churches.length === 0) {
    console.log(`No churches with websites found for ${country.name}\n`);
    return undefined;
  }

  scraper.setCountry(country.code);

  let success = 0;
  let failed = 0;
  let hasBothButFailed = 0;
  let totalSchedules = 0;
  let sampleSuccess: string | undefined;

  for (const [i, church] of churches.entries()) {
    process.stdout.write(`[${i + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `);

    try {
      // The query above filters on `website: { not: null }`, so the value is present.
      const result = await scraper.scrape(church.website!);

      if (result.success && result.schedules.length > 0) {
        success++;
        totalSchedules += result.schedules.length;
        process.stdout.write(`✅ ${result.schedules.length} schedules\n`);

        if (sampleSuccess === undefined) {
          sampleSuccess = `${church.name}: ${result.schedules.length} schedules`;
        }
      } else {
        failed++;
        // A result without an error message would otherwise print "undefined".
        process.stdout.write(`❌ ${result.error ?? 'No schedules found'}\n`);

        // Days + times present in the page text points at a parsing bug.
        if (result.rawHtml && hasDaysAndTimes(result.rawHtml)) {
          hasBothButFailed++;
          process.stdout.write(` ⚠️ Has days + times but failed to parse\n`);
        }
      }
    } catch (err: unknown) {
      failed++;
      const message = err instanceof Error ? err.message : String(err);
      process.stdout.write(`❌ ERROR: ${message}\n`);
    }
  }

  const successRate = (success / churches.length) * 100;

  console.log(`\n${country.name} Summary: ${success}/${churches.length} (${successRate.toFixed(0)}%)`);
  console.log(` Total schedules extracted: ${totalSchedules}`);
  if (hasBothButFailed > 0) {
    console.log(` ⚠️ Parsing bugs: ${hasBothButFailed} (has content but failed to parse)`);
  }

  return {
    country: country.code,
    countryName: country.name,
    tested: churches.length,
    success,
    failed,
    successRate,
    hasBothButFailed,
    totalSchedules,
    sampleSuccess,
  };
}

/**
 * Prints the cross-country results table followed by overall totals.
 *
 * @param results Per-country results, in the order they should be listed.
 */
function printFinalSummary(results: readonly CountryResult[]): void {
  let totalTested = 0;
  let totalSuccess = 0;
  let totalSchedules = 0;
  let totalBugs = 0;
  for (const r of results) {
    totalTested += r.tested;
    totalSuccess += r.success;
    totalSchedules += r.totalSchedules;
    totalBugs += r.hasBothButFailed;
  }
  const avgRate = totalTested > 0 ? (totalSuccess / totalTested) * 100 : 0;

  console.log('\n\n');
  console.log('═'.repeat(80));
  console.log(`FINAL RESULTS - TOP ${COUNTRIES.length} COUNTRIES`);
  console.log('═'.repeat(80));
  console.log('');
  // Header cells use the same widths as formatSummaryRow so the table aligns.
  console.log(
    [
      'Country'.padEnd(12),
      'Tested'.padStart(6),
      'Success'.padStart(7),
      'Rate'.padStart(5),
      'Schedules'.padStart(9),
      'Bugs',
    ].join(' | '),
  );
  console.log('─'.repeat(80));

  for (const r of results) {
    console.log(
      formatSummaryRow(r.countryName, r.tested, r.success, r.successRate, r.totalSchedules, r.hasBothButFailed),
    );
  }

  console.log('─'.repeat(80));
  console.log(formatSummaryRow('OVERALL', totalTested, totalSuccess, avgRate, totalSchedules, totalBugs));
  console.log('');
  console.log('═'.repeat(80));
  console.log('');

  if (totalBugs > 0) {
    console.log(`⚠️ ${totalBugs} parsing bugs detected (has days + times but failed)`);
    console.log(' These need investigation and fixes.\n');
  } else {
    console.log('✅ No parsing bugs! All failures are legitimate (no content or wrong page).\n');
  }

  console.log(`Total churches tested: ${totalTested}`);
  console.log(`Total successful: ${totalSuccess} (${avgRate.toFixed(1)}%)`);
  console.log(`Total mass schedules extracted: ${totalSchedules}`);
  console.log('');
}

/**
 * Runs the scraper against a sample of churches for every country in
 * COUNTRIES and prints per-country and overall success statistics.
 *
 * The scraper, the Prisma client and the pg pool are released in `finally`
 * blocks so a failure midway (e.g. a database error) does not leave them open.
 */
async function testTop5(): Promise<void> {
  console.log(`Testing top ${COUNTRIES.length} priority countries (${PER_COUNTRY} churches each)...\n`);

  try {
    const results: CountryResult[] = [];

    const scraper = new GenericScraper();
    await scraper.init();
    try {
      for (const country of COUNTRIES) {
        const result = await testCountry(scraper, country);
        if (result) {
          results.push(result);
        }
      }
    } finally {
      await scraper.close();
    }

    printFinalSummary(results);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||
|
|
|
||
|
|
// Entry point: log any unhandled failure and signal it through the exit code
// so shells and CI do not treat a crashed run as a success.
testTop5().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|