#!/usr/bin/env tsx
/**
 * Test more French churches and collect diagnostic data
 */

import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

/** Per-church record of the scrape outcome plus coarse signals read from the page text. */
interface DiagnosticInfo {
  url: string;
  churchName: string;
  /** The `success` flag exactly as reported by the scraper result. */
  success: boolean;
  schedulesFound: number;
  hasFrenchDays: boolean;
  hasTimePatterns: boolean;
  timePatternsSample: string[];
  textSample: string;
  error?: string;
}

/** The subset of {@link DiagnosticInfo} that is derived from the page HTML. */
type PageSignals = Pick<
  DiagnosticInfo,
  'hasFrenchDays' | 'hasTimePatterns' | 'timePatternsSample' | 'textSample'
>;

const FRENCH_DAYS = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];

// Used only with String.prototype.match, which resets lastIndex on global
// regexes, so sharing one instance across calls is safe.
const TIME_PATTERN = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h)?/g;

const MAX_TIME_SAMPLES = 10;
const TEXT_SAMPLE_LENGTH = 500;

/** Signals used when no HTML is available (scrape threw, or result had no rawHtml). */
function emptySignals(): PageSignals {
  return { hasFrenchDays: false, hasTimePatterns: false, timePatternsSample: [], textSample: '' };
}

/**
 * Reduces raw HTML to lower-cased, whitespace-normalised text.
 *
 * Whole `<script>` and `<style>` elements are removed first so their contents
 * are not mistaken for page text; every remaining tag becomes a single space.
 */
function htmlToSearchableText(rawHtml: string): string {
  return rawHtml
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * Derives the day/time signals and text sample from a page's raw HTML.
 *
 * @param rawHtml - HTML as returned by the scraper.
 * @returns Whether any French weekday name or time-like token occurs in the
 *   page text, up to ten distinct time-like tokens, and the first 500
 *   characters of the text.
 */
function extractPageSignals(rawHtml: string): PageSignals {
  const text = htmlToSearchableText(rawHtml);
  // Matches may carry trailing whitespace; trim so "10h " and "10h" dedupe.
  const times = (text.match(TIME_PATTERN) ?? []).map(t => t.trim());

  return {
    hasFrenchDays: FRENCH_DAYS.some(day => text.includes(day)),
    hasTimePatterns: times.length > 0,
    timePatternsSample: [...new Set(times)].slice(0, MAX_TIME_SAMPLES),
    textSample: text.substring(0, TEXT_SAMPLE_LENGTH),
  };
}

/** Extracts a printable message from a caught value of unknown type. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * A run counts as a failure unless the scraper reported success AND returned
 * at least one schedule. Shared by the per-church log and the final summary so
 * both use the same definition.
 */
function isFailure(d: DiagnosticInfo): boolean {
  return !(d.success && d.schedulesFound > 0);
}

/** Prints the one-line hint explaining which signals a failed page did or did not contain. */
function printFailureHint(d: DiagnosticInfo): void {
  if (d.hasFrenchDays && d.hasTimePatterns) {
    console.log(` 💡 Has BOTH days and times - parsing issue!`);
    console.log(` Sample times: ${d.timePatternsSample.slice(0, 5).join(', ')}`);
  } else if (d.hasFrenchDays) {
    console.log(` 💡 Has French days but no times`);
  } else if (d.hasTimePatterns) {
    console.log(` 💡 Has times but no French days`);
  } else {
    console.log(` 💡 No mass schedule content found`);
  }
}

/** Prints the success rate and the breakdown of failures by which signals were present. */
function printSummary(diagnostics: DiagnosticInfo[], successCount: number, total: number): void {
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
  console.log(`\nRESULTS: ${successCount}/${total} successful (${((successCount / total) * 100).toFixed(0)}%)`);
  console.log('');

  const failures = diagnostics.filter(isFailure);
  const hasBoth = failures.filter(d => d.hasFrenchDays && d.hasTimePatterns);
  const hasDaysNoTimes = failures.filter(d => d.hasFrenchDays && !d.hasTimePatterns);
  const hasTimesNoDays = failures.filter(d => !d.hasFrenchDays && d.hasTimePatterns);
  const hasNeither = failures.filter(d => !d.hasFrenchDays && !d.hasTimePatterns);

  console.log('FAILURE ANALYSIS:');
  console.log(` Has days + times but failed: ${hasBoth.length} (PARSING BUG)`);
  console.log(` Has days but no times: ${hasDaysNoTimes.length}`);
  console.log(` Has times but no days: ${hasTimesNoDays.length}`);
  console.log(` Has neither: ${hasNeither.length} (no mass schedule on page)`);
  console.log('');

  if (hasBoth.length > 0) {
    console.log('⚠️ PARSING BUGS TO FIX (has both days and times but failed):');
    for (const d of hasBoth) {
      console.log(` ${d.churchName}`);
      console.log(` URL: ${d.url}`);
      console.log(` Sample times found: ${d.timePatternsSample.slice(0, 5).join(', ')}`);
      console.log(` Text sample: ${d.textSample.substring(0, 150)}...`);
      console.log('');
    }
  }
}

/**
 * Scrapes up to 20 French churches (country 'FR', source 'osm', non-null
 * website, oldest first) with the generic scraper and prints per-church
 * results followed by a failure analysis.
 *
 * The scraper is closed once initialised, and the Prisma client and pg pool
 * are always released, even if a step outside the per-church error handling
 * throws.
 */
async function testFrenchBroader(): Promise<void> {
  try {
    console.log('Testing 20 French churches with diagnostics...\n');

    // Get more French churches
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 20,
      orderBy: { createdAt: 'asc' },
    });

    if (churches.length === 0) {
      console.log('No French churches found.');
      return;
    }

    console.log(`Found ${churches.length} French churches to test\n`);

    const scraper = new GenericScraper();
    // init() stays outside the try below so close() is only called on a
    // scraper that initialised successfully.
    await scraper.init();

    const diagnostics: DiagnosticInfo[] = [];
    let successCount = 0;

    try {
      scraper.setCountry('FR');

      for (const [i, church] of churches.entries()) {
        const website = church.website;
        if (website == null) {
          // The query filters out null websites; this guard only narrows the type.
          continue;
        }

        console.log(`[${i + 1}/${churches.length}] Testing: ${church.name} (${church.city || 'Unknown'})`);
        console.log(`URL: ${website}`);

        try {
          const result = await scraper.scrape(website);
          const signals = result.rawHtml ? extractPageSignals(result.rawHtml) : emptySignals();

          const diagnostic: DiagnosticInfo = {
            url: website,
            churchName: church.name,
            success: result.success,
            schedulesFound: result.schedules.length,
            ...signals,
            error: result.error,
          };
          diagnostics.push(diagnostic);

          if (isFailure(diagnostic)) {
            // result.error is absent when the scraper "succeeded" with zero schedules.
            console.log(`❌ FAILED - ${result.error ?? 'no schedules found'}`);
            printFailureHint(diagnostic);
          } else {
            successCount++;
            console.log(`✅ SUCCESS - ${result.schedules.length} schedules`);
          }
          console.log('');
        } catch (err: unknown) {
          const message = errorMessage(err);
          console.log(`❌ ERROR - ${message}\n`);
          diagnostics.push({
            url: website,
            churchName: church.name,
            success: false,
            schedulesFound: 0,
            ...emptySignals(),
            error: message,
          });
        }
      }
    } finally {
      await scraper.close();
    }

    // Analysis
    printSummary(diagnostics, successCount, churches.length);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

testFrenchBroader().catch((err: unknown) => {
  console.error(err);
  // Signal failure to the shell / CI instead of exiting with status 0.
  process.exitCode = 1;
});