#!/usr/bin/env tsx
/**
 * Investigate the 8 potential parsing bugs
 */

import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Sites whose scrape results are suspected parsing bugs.
 *
 * Each entry holds the church name (used only in the printed report), the
 * two-letter country code handed to the scraper before scraping, and the URL
 * that gets scraped.
 */
const BUGS = [
  { name: 'Chapelle Saint-Jean-XXIII', country: 'FR', url: 'https://www.chemin-neuf.fr/' },
  { name: 'St. Marien', country: 'DE', url: 'https://www.willehad.de/start/' },
  { name: 'Iglesia de San Fernando', country: 'ES', url: 'https://www.parroquiasanfernandomaspalomas.net/de/' },
  { name: 'Monestir de Sant Esperit', country: 'ES', url: 'https://www.santoespiritu.org/' },
  { name: 'Santuario de Manalagua', country: 'ES', url: 'http://tierrasdeburgos.blogspot.com.es/2013/12/escultura-del-agua-santuario-de.html' },
  { name: 'Kościół pw. Najświętszego Serca', country: 'PL', url: 'http://parafialubojna.pl' },
  { name: 'Paróquia do Desterro', country: 'BR', url: 'https://paroquiaportodegalinhas.blogspot.com.br/' },
  { name: 'Catedral Diocesana', country: 'BR', url: 'http://diocesedejuazeiro.org.br/' },
];
/**
 * Scrape every entry in `BUGS` with one `GenericScraper` instance and print a
 * diagnostic report for each site to stdout.
 *
 * For each site the report shows the scrape outcome (success flag, number of
 * schedules, error). When the scrape was unsuccessful but raw HTML is
 * available, the HTML is reduced to lower-cased plain text and inspected for
 * hints about the failure: blog / hotel / restaurant / error-page keywords,
 * whether any schedule keyword is present, and a short text sample taken
 * around the first Mass-related keyword.
 *
 * An exception thrown while scraping one site is logged and the loop continues
 * with the next site. The scraper is closed even if the loop aborts.
 *
 * @returns A promise that resolves once all sites were processed and the
 *          scraper was closed.
 */
async function investigate(): Promise<void> {
  console.log('Investigating 8 potential bugs...\n');

  const scraper = new GenericScraper();
  await scraper.init();

  // Separator line printed above and below each site header.
  const rule = '='.repeat(80);

  try {
    for (const [i, bug] of BUGS.entries()) {
      console.log(rule);
      console.log(`${i + 1}. ${bug.name} (${bug.country})`);
      console.log(` ${bug.url}`);
      console.log(rule);

      scraper.setCountry(bug.country);

      try {
        const result = await scraper.scrape(bug.url);

        console.log(`Success: ${result.success}`);
        console.log(`Schedules: ${result.schedules.length}`);
        console.log(`Error: ${result.error || 'none'}`);

        if (!result.success && result.rawHtml) {
          // Strip scripts, styles and tags, collapse whitespace, and lower-case
          // so the keyword checks below can use plain substring matching.
          const text = result.rawHtml
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, ' ')
            .replace(/\s+/g, ' ')
            .toLowerCase();

          // Check page type
          console.log('\nPage analysis:');
          if (text.includes('blogspot')) {
            console.log(' ⚠️ Blogspot page (likely blog post, not church website)');
          }
          if (text.includes('hotel') || text.includes('reservation') || text.includes('booking')) {
            console.log(' ⚠️ Contains hotel/booking keywords');
          }
          if (text.includes('restaurant') || text.includes('menu')) {
            console.log(' ⚠️ Contains restaurant keywords');
          }
          if (text.includes('404') || text.includes('not found') || text.includes('error')) {
            console.log(' ⚠️ Error/404 page');
          }

          // Check if it has schedule keywords
          const hasScheduleKeywords = text.match(/(mass|messe|misa|missa|horário|horario|gottesdienst|eucarist)/i);
          console.log(` Schedule keywords: ${hasScheduleKeywords ? '✓ Found' : '✗ Not found'}`);

          // Show sample text around the first Mass keyword that is present.
          // indexOf returns -1 (truthy) on a miss, so the lookups cannot be
          // chained with `||`; take the first non-negative index instead and
          // fall back to the start of the text.
          const massIndex =
            ['mass', 'messe', 'misa', 'missa']
              .map((keyword) => text.indexOf(keyword))
              .find((index) => index >= 0) ?? 0;
          const sampleStart = Math.max(0, massIndex - 50);
          // 200 characters are shown, starting up to 50 before the keyword.
          const sample = text.substring(sampleStart, sampleStart + 200);
          console.log(`\n Sample text: "${sample}..."`);
        }

        console.log('\n');
      } catch (err: unknown) {
        // Anything can be thrown; only Error instances carry a message.
        const message = err instanceof Error ? err.message : String(err);
        console.log(`ERROR: ${message}\n\n`);
      }
    }
  } finally {
    // Always release the scraper, even when the loop exits via an exception.
    await scraper.close();
  }
}
// Script entry point: run the investigation and print any rejection to stderr.
investigate().catch((reason) => {
  console.error(reason);
});