Files
ScraperControl/scripts/debug/investigate-8-bugs.ts

85 lines
3.3 KiB
TypeScript
Raw Permalink Normal View History

#!/usr/bin/env tsx
/**
* Investigate the 8 potential parsing bugs
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/** A site to re-scrape: display name, two-letter country code, and page URL. */
interface BugCase {
  name: string;
  country: string;
  url: string;
}

/** The eight sites suspected of exposing parsing bugs in the scraper. */
const BUGS: BugCase[] = [
  {
    name: 'Chapelle Saint-Jean-XXIII',
    country: 'FR',
    url: 'https://www.chemin-neuf.fr/',
  },
  {
    name: 'St. Marien',
    country: 'DE',
    url: 'https://www.willehad.de/start/',
  },
  {
    name: 'Iglesia de San Fernando',
    country: 'ES',
    url: 'https://www.parroquiasanfernandomaspalomas.net/de/',
  },
  {
    name: 'Monestir de Sant Esperit',
    country: 'ES',
    url: 'https://www.santoespiritu.org/',
  },
  {
    name: 'Santuario de Manalagua',
    country: 'ES',
    url: 'http://tierrasdeburgos.blogspot.com.es/2013/12/escultura-del-agua-santuario-de.html',
  },
  {
    name: 'Kościół pw. Najświętszego Serca',
    country: 'PL',
    url: 'http://parafialubojna.pl',
  },
  {
    name: 'Paróquia do Desterro',
    country: 'BR',
    url: 'https://paroquiaportodegalinhas.blogspot.com.br/',
  },
  {
    name: 'Catedral Diocesana',
    country: 'BR',
    url: 'http://diocesedejuazeiro.org.br/',
  },
];
/** Keywords whose presence suggests the page mentions a service schedule. */
const SCHEDULE_KEYWORD_PATTERN = /(mass|messe|misa|missa|horário|horario|gottesdienst|eucarist)/i;

/** Keywords, in priority order, used to anchor the sample-text excerpt. */
const SAMPLE_ANCHOR_KEYWORDS: readonly string[] = ['mass', 'messe', 'misa', 'missa'];

/** Number of characters of page text shown in the sample excerpt. */
const SAMPLE_LENGTH = 200;

/** Strips script/style elements and tags, collapses whitespace, and lowercases. */
function htmlToSearchText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/** Returns the offset of the first keyword (in list order) found in `text`, or 0 if none occurs. */
function firstKeywordIndex(text: string, keywords: readonly string[]): number {
  for (const keyword of keywords) {
    const index = text.indexOf(keyword);
    // indexOf signals "not found" with -1, which is truthy, so it cannot be
    // chained with `||`; compare against -1 explicitly instead.
    if (index !== -1) {
      return index;
    }
  }
  return 0;
}

/**
 * Scrapes every entry in `BUGS` with a single `GenericScraper` and prints a
 * per-site report: success flag, schedule count and error. For failed scrapes
 * that still returned raw HTML, it also prints heuristic page-type warnings,
 * whether schedule keywords occur, and a short text excerpt.
 *
 * Errors thrown while scraping one site are logged and do not stop the loop.
 * The scraper is closed even if the loop exits with an exception.
 */
async function investigate(): Promise<void> {
  console.log('Investigating 8 potential bugs...\n');

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (const [i, bug] of BUGS.entries()) {
      console.log('='.repeat(80));
      console.log(`${i + 1}. ${bug.name} (${bug.country})`);
      console.log(` ${bug.url}`);
      console.log('='.repeat(80));

      scraper.setCountry(bug.country);

      try {
        const result = await scraper.scrape(bug.url);
        console.log(`Success: ${result.success}`);
        console.log(`Schedules: ${result.schedules.length}`);
        console.log(`Error: ${result.error || 'none'}`);

        if (!result.success && result.rawHtml) {
          const text = htmlToSearchText(result.rawHtml);

          // Check page type
          console.log('\nPage analysis:');
          if (text.includes('blogspot')) {
            console.log(' ⚠️ Blogspot page (likely blog post, not church website)');
          }
          if (text.includes('hotel') || text.includes('reservation') || text.includes('booking')) {
            console.log(' ⚠️ Contains hotel/booking keywords');
          }
          if (text.includes('restaurant') || text.includes('menu')) {
            console.log(' ⚠️ Contains restaurant keywords');
          }
          if (text.includes('404') || text.includes('not found') || text.includes('error')) {
            console.log(' ⚠️ Error/404 page');
          }

          // Check if it has schedule keywords
          const hasScheduleKeywords = SCHEDULE_KEYWORD_PATTERN.test(text);
          console.log(` Schedule keywords: ${hasScheduleKeywords ? '✓ Found' : '✗ Not found'}`);

          // Show sample text, starting up to 50 characters before the first anchor keyword
          const anchorIndex = firstKeywordIndex(text, SAMPLE_ANCHOR_KEYWORDS);
          const sampleStart = Math.max(0, anchorIndex - 50);
          const sample = text.substring(sampleStart, sampleStart + SAMPLE_LENGTH);
          console.log(`\n Sample text: "${sample}..."`);
        }

        console.log('\n');
      } catch (err: unknown) {
        // Anything can be thrown, so only read `.message` from real Error objects.
        const message = err instanceof Error ? err.message : String(err);
        console.log(`ERROR: ${message}\n\n`);
      }
    }
  } finally {
    // Release the scraper even when setCountry or the reporting code throws.
    await scraper.close();
  }
}
// Script entry point: start the investigation and log any rejection it produces.
investigate().catch((reason) => console.error(reason));