151 lines
5.6 KiB
TypeScript
151 lines
5.6 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
 * Debug the 5 parsing bugs identified in the top-5 test.
 */
|
||
|
|
|
||
|
|
import { config } from 'dotenv';
|
||
|
|
// Load .env.local first, then .env. By default dotenv does not overwrite
// variables that are already set, so values from .env.local take precedence.
for (const path of ['.env.local', '.env']) {
  config({ path });
}
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
// Database access: a pg connection pool configured from DATABASE_URL, wrapped
// in the Prisma pg adapter, which is then handed to the Prisma client.
const { DATABASE_URL: connectionString } = process.env;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
/** Lookup details for one church whose page triggered a parsing bug. */
interface BugChurch {
  /** Label printed in the report header for this entry. */
  name: string;
  /** Country code used for the database filter and the scraper configuration. */
  country: string;
  /** Substring matched against the church name stored in the database. */
  searchTerm: string;
}

// The churches with parsing bugs, written as [name, country, searchTerm] rows.
const BUG_CHURCHES: BugChurch[] = (
  [
    ['St. Marien', 'DE', 'St. Marien'],
    ['Santuario de Manalagua', 'ES', 'Santuario de Manalagua'],
    ['Kościół pw. Najświętszego Serca', 'PL', 'Najświętszego Serca Pana Jez'],
    ['Paróquia de Nossa Senhora do Desterro', 'BR', 'Nossa Senhora do Desterro'],
    ['Paróquia da Paz', 'BR', 'Paróquia da Paz'],
  ] as const
).map(([name, country, searchTerm]) => ({ name, country, searchTerm }));
|
||
|
|
|
||
|
|
/** Lower-case weekday names (accented and unaccented spellings) per country code. */
const DAY_NAMES_BY_COUNTRY: Record<string, string[]> = {
  DE: ['sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag'],
  ES: ['domingo', 'lunes', 'martes', 'miércoles', 'miercoles', 'jueves', 'viernes', 'sábado', 'sabado'],
  PL: ['niedziela', 'poniedziałek', 'poniedzialek', 'wtorek', 'środa', 'sroda', 'czwartek', 'piątek', 'piatek', 'sobota'],
  BR: ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'],
};

/** Lower-case mass-schedule keywords per country code. */
const SCHEDULE_KEYWORDS_BY_COUNTRY: Record<string, string[]> = {
  DE: ['gottesdienst', 'messe', 'heilige messe', 'messzeiten'],
  ES: ['misa', 'horario', 'eucaristía', 'eucaristia'],
  PL: ['msza', 'msze', 'nabożeństwo', 'nabozenstwo'],
  BR: ['missa', 'horário', 'horario', 'eucaristia'],
};

/** Loose matcher for clock times such as "10:30", "18h", "9.00 uhr". */
const TIME_PATTERN = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h|uhr)?/gi;

/** Maps each language name looked for in the page text to a language code. */
const LANGUAGE_BY_NAME: Record<string, string> = {
  english: 'en',
  español: 'es',
  espanol: 'es',
  portuguese: 'pt',
  português: 'pt',
  portugues: 'pt',
  deutsch: 'de',
  polski: 'pl',
};

/**
 * Reduces an HTML document to lower-cased text: script and style elements are
 * removed, remaining tags become spaces, and whitespace runs are collapsed.
 */
function htmlToSearchText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/** Returns the entries of `terms` that occur in `text`, in their original order. */
function termsPresent(text: string, terms: readonly string[]): string[] {
  return terms.filter((term) => text.includes(term));
}

/**
 * Prints the diagnostics for one scraped page: a text sample, the day names,
 * time patterns and schedule keywords found, and a few edge-case heuristics.
 *
 * @param rawHtml - HTML returned by the scraper for the page.
 * @param country - Country code selecting the day-name and keyword lists.
 */
function reportPageAnalysis(rawHtml: string, country: string): void {
  const text = htmlToSearchText(rawHtml);

  console.log('\n--- Text Sample (first 1000 chars) ---');
  console.log(text.substring(0, 1000));

  // Check for day names
  console.log('\n--- Day Names Found ---');
  const foundDays = termsPresent(text, DAY_NAMES_BY_COUNTRY[country] ?? []);
  console.log(`Found: ${foundDays.join(', ') || 'none'}`);

  // Check for time patterns
  console.log('\n--- Time Patterns Found ---');
  const times = text.match(TIME_PATTERN);
  if (times) {
    // The pattern can capture trailing whitespace; trim before de-duplicating
    // so "10:00" and "10:00 " are not reported as two different times.
    const uniqueTimes = [...new Set(times.map((time) => time.trim()))].slice(0, 20);
    console.log(`Found ${times.length} time patterns (showing first 20 unique):`);
    console.log(uniqueTimes.join(', '));
  } else {
    console.log('No time patterns found');
  }

  // Look for specific mass schedule keywords
  console.log('\n--- Mass Schedule Keywords ---');
  const foundKeywords = termsPresent(text, SCHEDULE_KEYWORDS_BY_COUNTRY[country] ?? []);
  console.log(`Found: ${foundKeywords.join(', ') || 'none'}`);

  // Look for specific problematic patterns
  console.log('\n--- Looking for edge cases ---');

  // Times precede days only when both exist and the first time match sits
  // before the earliest day name anywhere in the text.
  const firstDayIndex =
    foundDays.length > 0 ? Math.min(...foundDays.map((day) => text.indexOf(day))) : -1;
  const firstTimeIndex = text.search(TIME_PATTERN);
  const hasTimeBeforeDays =
    firstDayIndex >= 0 && firstTimeIndex >= 0 && firstTimeIndex < firstDayIndex;
  console.log(`Times come before days: ${hasTimeBeforeDays ? 'YES (potential issue)' : 'no'}`);

  // Table markup only survives in the raw HTML (tags and attributes are
  // stripped from `text`), so inspect that; pipe-separated text also counts.
  const pipeSeparatorCount = text.match(/\s+\|\s+/g)?.length ?? 0;
  const hasTables = /<table\b|\bcolspan\b|\browspan\b/i.test(rawHtml) || pipeSeparatorCount > 5;
  console.log(`Likely table format: ${hasTables ? 'YES (may need special handling)' : 'no'}`);

  // Count distinct languages, so repeated mentions of one language name do
  // not register as a multilingual page.
  const languageNames =
    text.match(/english|español|espanol|portuguese|português|portugues|deutsch|polski/g) ?? [];
  const distinctLanguages = new Set(languageNames.map((name) => LANGUAGE_BY_NAME[name] ?? name));
  const hasMultiLang = distinctLanguages.size > 1;
  console.log(`Multiple languages: ${hasMultiLang ? 'YES (may confuse parser)' : 'no'}`);
}

/**
 * Looks up one problem church, scrapes its website and prints the report.
 * Scrape failures are logged and do not abort the run; database errors propagate.
 *
 * @param scraper - An initialised scraper instance.
 * @param bug - The entry describing which church to look up.
 */
async function debugChurch(
  scraper: GenericScraper,
  bug: { name: string; country: string; searchTerm: string },
): Promise<void> {
  console.log('═'.repeat(80));
  console.log(`BUG: ${bug.name} (${bug.country})`);
  console.log('═'.repeat(80));

  const church = await prisma.church.findFirst({
    where: {
      country: bug.country,
      name: { contains: bug.searchTerm },
      website: { not: null },
    },
  });

  // The query already excludes null websites; the extra check narrows the
  // type without a non-null assertion.
  const website = church?.website;
  if (!church || !website) {
    console.log(`❌ Church not found in database\n`);
    return;
  }

  console.log(`Church: ${church.name}`);
  console.log(`URL: ${website}\n`);

  scraper.setCountry(bug.country);

  try {
    const result = await scraper.scrape(website);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}`);
    if (result.error) console.log(`Error: ${result.error}`);

    if (result.rawHtml) {
      reportPageAnalysis(result.rawHtml, bug.country);
    }

    console.log('\n');
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    console.log(`❌ ERROR: ${message}\n`);
  }
}

/**
 * Entry point: scrapes every church in BUG_CHURCHES and prints diagnostics
 * for each. The scraper, Prisma client and connection pool are released even
 * when initialisation, a database query or a cleanup step throws.
 */
async function debugBugs(): Promise<void> {
  console.log('Debugging parsing bugs...\n');

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    try {
      for (const bug of BUG_CHURCHES) {
        await debugChurch(scraper, bug);
      }
    } finally {
      await scraper.close();
    }
  } finally {
    // Nested so the pool is ended even if disconnecting Prisma throws.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
|
||
|
|
|
||
|
|
// Run the script. On failure, log the error and mark the process as failed so
// shells and CI do not see a zero exit status.
debugBugs().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|