#!/usr/bin/env tsx
/**
 * Debug the 5 parsing bugs identified in top 5 test.
 *
 * For each entry in BUG_CHURCHES the script looks the church up in the
 * database, scrapes its website with GenericScraper, and prints diagnostics
 * about the returned HTML: a text sample, which day names / time patterns /
 * mass keywords appear, and a few structural heuristics.
 */

import { config } from 'dotenv';
// Env files are loaded before the pool below reads DATABASE_URL.
config({ path: '.env.local' });
config({ path: '.env' });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

/** One church whose scrape result is being investigated. */
interface BugChurch {
  /** Label printed in the report header. */
  name: string;
  /** Country code used for the DB filter, the scraper, and the word lists. */
  country: string;
  /** Substring matched against the church name in the database. */
  searchTerm: string;
}

// The churches with parsing bugs
const BUG_CHURCHES: BugChurch[] = [
  { name: 'St. Marien', country: 'DE', searchTerm: 'St. Marien' },
  { name: 'Santuario de Manalagua', country: 'ES', searchTerm: 'Santuario de Manalagua' },
  { name: 'Kościół pw. Najświętszego Serca', country: 'PL', searchTerm: 'Najświętszego Serca Pana Jez' },
  { name: 'Paróquia de Nossa Senhora do Desterro', country: 'BR', searchTerm: 'Nossa Senhora do Desterro' },
  { name: 'Paróquia da Paz', country: 'BR', searchTerm: 'Paróquia da Paz' },
];

/** Lower-case day names per country, with and without diacritics. */
const DAY_PATTERNS: Record<string, string[]> = {
  DE: ['sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag'],
  ES: ['domingo', 'lunes', 'martes', 'miércoles', 'miercoles', 'jueves', 'viernes', 'sábado', 'sabado'],
  PL: ['niedziela', 'poniedziałek', 'poniedzialek', 'wtorek', 'środa', 'sroda', 'czwartek', 'piątek', 'piatek', 'sobota'],
  BR: ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'],
};

/** Lower-case mass-schedule keywords per country. */
const MASS_KEYWORDS: Record<string, string[]> = {
  DE: ['gottesdienst', 'messe', 'heilige messe', 'messzeiten'],
  ES: ['misa', 'horario', 'eucaristía', 'eucaristia'],
  PL: ['msza', 'msze', 'nabożeństwo', 'nabozenstwo'],
  BR: ['missa', 'horário', 'horario', 'eucaristia'],
};

/**
 * Reduce an HTML document to lower-cased plain text.
 *
 * Script and style elements are dropped together with their contents, the
 * remaining tags are replaced by spaces, and whitespace runs are collapsed.
 */
function htmlToText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/** Return the terms that occur in `text`, preserving the order of `terms`. */
function findPresent(text: string, terms: readonly string[]): string[] {
  return terms.filter((term) => text.includes(term));
}

/** Smallest index at which any of `terms` occurs in `text`, or -1 if none do. */
function earliestIndex(text: string, terms: readonly string[]): number {
  let earliest = -1;
  for (const term of terms) {
    const idx = text.indexOf(term);
    if (idx !== -1 && (earliest === -1 || idx < earliest)) {
      earliest = idx;
    }
  }
  return earliest;
}

/**
 * Print the diagnostics for one scraped page.
 *
 * @param rawHtml HTML returned by the scraper.
 * @param country Country code selecting the day-name and keyword lists;
 *                unknown codes fall back to empty lists.
 */
function reportHtmlDiagnostics(rawHtml: string, country: string): void {
  const text = htmlToText(rawHtml);

  console.log('\n--- Text Sample (first 1000 chars) ---');
  console.log(text.substring(0, 1000));

  // Check for day names
  console.log('\n--- Day Names Found ---');
  const foundDays = findPresent(text, DAY_PATTERNS[country] ?? []);
  console.log(`Found: ${foundDays.join(', ') || 'none'}`);

  // Check for time patterns
  console.log('\n--- Time Patterns Found ---');
  const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h|uhr)?/gi;
  const times = text.match(timeRegex);
  if (times) {
    const uniqueTimes = [...new Set(times)].slice(0, 20);
    console.log(`Found ${times.length} time patterns (showing first 20 unique):`);
    console.log(uniqueTimes.join(', '));
  } else {
    console.log('No time patterns found');
  }

  // Look for specific mass schedule keywords
  console.log('\n--- Mass Schedule Keywords ---');
  const foundKeywords = findPresent(text, MASS_KEYWORDS[country] ?? []);
  console.log(`Found: ${foundKeywords.join(', ') || 'none'}`);

  // Look for specific problematic patterns
  console.log('\n--- Looking for edge cases ---');

  // Check if times and days are separated (not in same section).
  // Only meaningful when both exist; compare the earliest day name in the
  // text (not the first one in list order) with the first time match.
  const firstDayIdx = earliestIndex(text, foundDays);
  const firstTimeIdx = text.search(timeRegex);
  const hasTimeBeforeDays =
    firstDayIdx !== -1 && firstTimeIdx !== -1 && firstTimeIdx < firstDayIdx;
  console.log(`Times come before days: ${hasTimeBeforeDays ? 'YES (potential issue)' : 'no'}`);

  // Check for table structures. Markup has been stripped from `text`, so
  // table tags/attributes must be looked for in the raw HTML; the pipe
  // heuristic still runs on the visible text.
  const hasTableMarkup = /<table[\s>]|\b(?:col|row)span\s*=/i.test(rawHtml);
  const hasTables = hasTableMarkup || (text.match(/\s+\|\s+/g)?.length ?? 0) > 5;
  console.log(`Likely table format: ${hasTables ? 'YES (may need special handling)' : 'no'}`);

  // Check for multiple languages on same page
  const languageMentions =
    text.match(/english|español|espanol|portuguese|português|portugues|deutsch|polski/gi)?.length ?? 0;
  const hasMultiLang = languageMentions > 1;
  console.log(`Multiple languages: ${hasMultiLang ? 'YES (may confuse parser)' : 'no'}`);
}

/**
 * Look up one church, scrape its website, and print the diagnostics.
 * Scrape errors are reported and swallowed so the remaining churches still run.
 */
async function debugChurch(scraper: GenericScraper, bug: BugChurch): Promise<void> {
  console.log('═'.repeat(80));
  console.log(`BUG: ${bug.name} (${bug.country})`);
  console.log('═'.repeat(80));

  const church = await prisma.church.findFirst({
    where: {
      country: bug.country,
      name: { contains: bug.searchTerm },
      website: { not: null },
    },
  });

  // The query already excludes null websites; the second half of this guard
  // narrows the type so no non-null assertion is needed below.
  if (church?.website == null) {
    console.log(`❌ Church not found in database\n`);
    return;
  }

  console.log(`Church: ${church.name}`);
  console.log(`URL: ${church.website}\n`);

  scraper.setCountry(bug.country);

  try {
    const result = await scraper.scrape(church.website);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}`);
    if (result.error) console.log(`Error: ${result.error}`);

    if (result.rawHtml) {
      reportHtmlDiagnostics(result.rawHtml, bug.country);
    }

    console.log('\n');
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    console.log(`❌ ERROR: ${message}\n`);
  }
}

/**
 * Entry point: run the diagnostics for every church in BUG_CHURCHES, one
 * after another, then release the scraper and database resources.
 *
 * Cleanup runs in `finally` blocks so the scraper, Prisma client and pg pool
 * are released even when a database query or scraper start-up throws.
 */
async function debugBugs(): Promise<void> {
  console.log('Debugging parsing bugs...\n');

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    try {
      for (const bug of BUG_CHURCHES) {
        await debugChurch(scraper, bug);
      }
    } finally {
      await scraper.close();
    }
  } finally {
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}

debugBugs().catch((err: unknown) => {
  console.error(err);
  // Signal failure to the caller instead of exiting with status 0.
  process.exitCode = 1;
});