#!/usr/bin/env tsx
/**
 * Investigate the 8 potential parsing bugs
 *
 * Runs the generic scraper against each site listed in `BUGS` and prints the
 * outcome. When a scrape reports failure but still carries raw HTML, a quick
 * keyword-based analysis of the page text is printed as well.
 */

import { GenericScraper } from '../../src/scrapers/strategies/generic';

const BUGS = [
  { name: 'Chapelle Saint-Jean-XXIII', country: 'FR', url: 'https://www.chemin-neuf.fr/' },
  { name: 'St. Marien', country: 'DE', url: 'https://www.willehad.de/start/' },
  { name: 'Iglesia de San Fernando', country: 'ES', url: 'https://www.parroquiasanfernandomaspalomas.net/de/' },
  { name: 'Monestir de Sant Esperit', country: 'ES', url: 'https://www.santoespiritu.org/' },
  { name: 'Santuario de Manalagua', country: 'ES', url: 'http://tierrasdeburgos.blogspot.com.es/2013/12/escultura-del-agua-santuario-de.html' },
  { name: 'Kościół pw. Najświętszego Serca', country: 'PL', url: 'http://parafialubojna.pl' },
  { name: 'Paróquia do Desterro', country: 'BR', url: 'https://paroquiaportodegalinhas.blogspot.com.br/' },
  { name: 'Catedral Diocesana', country: 'BR', url: 'http://diocesedejuazeiro.org.br/' },
];

/** Keywords whose presence suggests the page talks about service schedules. */
const SCHEDULE_KEYWORD_PATTERN = /(mass|messe|misa|missa|horário|horario|gottesdienst|eucarist)/i;

/** Keywords used, in priority order, to pick where the text sample starts. */
const SAMPLE_ANCHOR_KEYWORDS = ['mass', 'messe', 'misa', 'missa'];

/** Horizontal rule printed around each site header. */
const SEPARATOR = '='.repeat(80);

/**
 * Reduce an HTML document to lower-cased plain text.
 *
 * `<script>` and `<style>` elements are dropped together with their contents,
 * remaining tags are replaced by a space, and whitespace runs are collapsed.
 *
 * @param html Raw HTML markup.
 * @returns Lower-cased text with single-space separation.
 */
function htmlToText(html: string): string {
  return html
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * Locate the first anchor keyword present in `text`.
 *
 * `indexOf` returns -1 on a miss, which is truthy, so the keywords cannot be
 * chained with `||`; each result is compared against -1 explicitly.
 *
 * @param text Lower-cased page text.
 * @returns Index of the highest-priority keyword found, or 0 if none occurs.
 */
function findSampleAnchor(text: string): number {
  for (const keyword of SAMPLE_ANCHOR_KEYWORDS) {
    const index = text.indexOf(keyword);
    if (index !== -1) {
      return index;
    }
  }
  return 0;
}

/**
 * Print heuristic observations about a page that failed to scrape.
 *
 * @param rawHtml Raw HTML returned alongside the failed scrape result.
 */
function analyzeFailedPage(rawHtml: string): void {
  const text = htmlToText(rawHtml);

  // Check page type
  console.log('\nPage analysis:');
  if (text.includes('blogspot')) {
    console.log(' ⚠️ Blogspot page (likely blog post, not church website)');
  }
  if (text.includes('hotel') || text.includes('reservation') || text.includes('booking')) {
    console.log(' ⚠️ Contains hotel/booking keywords');
  }
  if (text.includes('restaurant') || text.includes('menu')) {
    console.log(' ⚠️ Contains restaurant keywords');
  }
  if (text.includes('404') || text.includes('not found') || text.includes('error')) {
    console.log(' ⚠️ Error/404 page');
  }

  // Check if it has schedule keywords
  const hasScheduleKeywords = SCHEDULE_KEYWORD_PATTERN.test(text);
  console.log(` Schedule keywords: ${hasScheduleKeywords ? '✓ Found' : '✗ Not found'}`);

  // Show sample text, starting up to 50 characters before the anchor keyword
  const sampleStart = Math.max(0, findSampleAnchor(text) - 50);
  const sample = text.substring(sampleStart, sampleStart + 200);
  console.log(`\n Sample text: "${sample}..."`);
}

/**
 * Scrape every entry in `BUGS` sequentially and log the result of each.
 *
 * A failure while scraping one site is logged and does not stop the run.
 * The scraper is closed once the loop ends, including when it ends by an
 * exception thrown outside the per-site error handler.
 */
async function investigate(): Promise<void> {
  console.log('Investigating 8 potential bugs...\n');

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (const [i, bug] of BUGS.entries()) {
      console.log(SEPARATOR);
      console.log(`${i + 1}. \n${bug.name} (${bug.country})`);
      console.log(` ${bug.url}`);
      console.log(SEPARATOR);

      scraper.setCountry(bug.country);

      try {
        const result = await scraper.scrape(bug.url);
        console.log(`Success: ${result.success}`);
        console.log(`Schedules: ${result.schedules.length}`);
        console.log(`Error: ${result.error || 'none'}`);

        if (!result.success && result.rawHtml) {
          analyzeFailedPage(result.rawHtml);
        }
        console.log('\n');
      } catch (err: unknown) {
        // Anything can be thrown; only Error instances are known to have a message.
        const message = err instanceof Error ? err.message : String(err);
        console.log(`ERROR: ${message}\n\n`);
      }
    }
  } finally {
    await scraper.close();
  }
}

investigate().catch(console.error);