chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
84
scripts/debug/investigate-8-bugs.ts
Normal file
84
scripts/debug/investigate-8-bugs.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
 * Investigate the 8 potential parsing bugs
 */
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/** One site to re-scrape: display name, country code, and the URL to fetch. */
type SuspectSite = {
  readonly name: string;
  readonly country: string;
  readonly url: string;
};

/**
 * The sites under investigation, in the order they are reported.
 * Entries are only read, so the list is declared readonly.
 */
const BUGS: readonly SuspectSite[] = [
  { name: 'Chapelle Saint-Jean-XXIII', country: 'FR', url: 'https://www.chemin-neuf.fr/' },
  { name: 'St. Marien', country: 'DE', url: 'https://www.willehad.de/start/' },
  { name: 'Iglesia de San Fernando', country: 'ES', url: 'https://www.parroquiasanfernandomaspalomas.net/de/' },
  { name: 'Monestir de Sant Esperit', country: 'ES', url: 'https://www.santoespiritu.org/' },
  { name: 'Santuario de Manalagua', country: 'ES', url: 'http://tierrasdeburgos.blogspot.com.es/2013/12/escultura-del-agua-santuario-de.html' },
  { name: 'Kościół pw. Najświętszego Serca', country: 'PL', url: 'http://parafialubojna.pl' },
  { name: 'Paróquia do Desterro', country: 'BR', url: 'https://paroquiaportodegalinhas.blogspot.com.br/' },
  { name: 'Catedral Diocesana', country: 'BR', url: 'http://diocesedejuazeiro.org.br/' },
];
|
||||
|
||||
/** Keywords, in priority order, used to choose where the printed text sample starts. */
const SAMPLE_ANCHOR_KEYWORDS: readonly string[] = ['mass', 'messe', 'misa', 'missa'];

/** Schedule-related terms whose presence is reported for a failed page. */
const SCHEDULE_KEYWORD_PATTERN = /(mass|messe|misa|missa|horário|horario|gottesdienst|eucarist)/i;

/** Strips script/style blocks and tags from HTML, collapses whitespace, and lowercases the result. */
function htmlToSearchableText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * Returns the position of the first keyword (in list order) that occurs in `text`,
 * or -1 when none of them occurs.
 *
 * `indexOf` signals "not found" with -1, which is truthy, so chaining lookups with
 * `||` never reaches the later keywords; each result is compared against -1 instead.
 */
function firstKeywordIndex(text: string, keywords: readonly string[]): number {
  for (const keyword of keywords) {
    const position = text.indexOf(keyword);
    if (position !== -1) {
      return position;
    }
  }
  return -1;
}

/** Prints the page-type warnings, schedule-keyword check, and a text sample for a failed scrape. */
function reportFailedPage(rawHtml: string): void {
  const text = htmlToSearchableText(rawHtml);

  // Check page type
  console.log('\nPage analysis:');
  if (text.includes('blogspot')) {
    console.log('  ⚠️  Blogspot page (likely blog post, not church website)');
  }
  if (text.includes('hotel') || text.includes('reservation') || text.includes('booking')) {
    console.log('  ⚠️  Contains hotel/booking keywords');
  }
  if (text.includes('restaurant') || text.includes('menu')) {
    console.log('  ⚠️  Contains restaurant keywords');
  }
  if (text.includes('404') || text.includes('not found') || text.includes('error')) {
    console.log('  ⚠️  Error/404 page');
  }

  // Check if it has schedule keywords
  const hasScheduleKeywords = SCHEDULE_KEYWORD_PATTERN.test(text);
  console.log(`  Schedule keywords: ${hasScheduleKeywords ? '✓ Found' : '✗ Not found'}`);

  // Show sample text: start 50 characters before the first anchor keyword,
  // or at the beginning of the page when no keyword is present.
  const anchorIndex = firstKeywordIndex(text, SAMPLE_ANCHOR_KEYWORDS);
  const sampleStart = Math.max(0, anchorIndex - 50);
  const sample = text.substring(sampleStart, sampleStart + 200);
  console.log(`\n  Sample text: "${sample}..."`);
}

/**
 * Scrapes every entry of `BUGS` with a single `GenericScraper` and prints a
 * report for each: success flag, schedule count, error, and (for failed scrapes
 * that returned HTML) a page analysis.
 *
 * A failure while scraping one site is logged and the loop continues with the
 * next site. The scraper is closed once it has been initialised, even if an
 * unexpected error escapes the loop.
 */
async function investigate(): Promise<void> {
  console.log('Investigating 8 potential bugs...\n');

  const scraper = new GenericScraper();
  await scraper.init();

  const separator = '='.repeat(80);

  try {
    for (const [index, bug] of BUGS.entries()) {
      console.log(separator);
      console.log(`${index + 1}. ${bug.name} (${bug.country})`);
      console.log(`   ${bug.url}`);
      console.log(separator);

      scraper.setCountry(bug.country);

      try {
        const result = await scraper.scrape(bug.url);

        console.log(`Success: ${result.success}`);
        console.log(`Schedules: ${result.schedules.length}`);
        console.log(`Error: ${result.error || 'none'}`);

        if (!result.success && result.rawHtml) {
          reportFailedPage(result.rawHtml);
        }

        console.log('\n');
      } catch (err: unknown) {
        // Anything can be thrown, so only read `.message` from real Error objects.
        const message = err instanceof Error ? err.message : String(err);
        console.log(`ERROR: ${message}\n\n`);
      }
    }
  } finally {
    // Release the scraper even when something outside the per-site try throws.
    await scraper.close();
  }
}
|
||||
|
||||
// Script entry point: start the investigation and print any rejection to stderr.
investigate().catch((reason: unknown) => {
  console.error(reason);
});
|
||||
Reference in New Issue
Block a user