67 lines
2.2 KiB
TypeScript
67 lines
2.2 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
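/**
 * Manual check script for two sites suspected of exposing real scraper bugs:
 * a Spanish parish page and a Polish parish page. It scrapes each one with
 * GenericScraper and prints the outcome to the console.
 */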
|
||
|
|
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
/**
 * Scrapes two parish sites with GenericScraper and prints the results.
 *
 * For each site the scraper country is set, the URL is scraped, and the
 * success flag, schedule count and error are logged, followed by up to five
 * sample schedules. When the Polish site yields no schedules but raw HTML is
 * available, the HTML is reduced to lower-cased plain text and a
 * 300-character snippet starting at the first schedule keyword is printed.
 *
 * The scraper is closed even when a scrape call throws.
 *
 * @returns {Promise<void>} Resolves once the scraper has been closed.
 */
async function checkRealBugs() {
  // Indexed by each schedule's dayOfWeek value when printing samples.
  const DAY_NAMES = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];
  const SAMPLE_SIZE = 5;
  const SNIPPET_LENGTH = 300;
  // Checked in this order; the first keyword that occurs in the text wins.
  const POLISH_SCHEDULE_KEYWORDS = ['msze', 'msza', 'nabożeńst'];

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    console.log('=== 1. Iglesia de San Fernando (trying Spanish page) ===\n');

    scraper.setCountry('ES');
    // Site root; the /de/ path segment was removed from the URL.
    const spanishUrl = 'https://www.parroquiasanfernandomaspalomas.net/';
    const result1 = await scraper.scrape(spanishUrl);

    console.log(`URL: ${spanishUrl}`);
    console.log(`Success: ${result1.success}`);
    console.log(`Schedules: ${result1.schedules.length}`);
    console.log(`Error: ${result1.error || 'none'}\n`);

    if (result1.schedules.length > 0) {
      console.log('Sample schedules:');
      for (const s of result1.schedules.slice(0, SAMPLE_SIZE)) {
        console.log(` ${DAY_NAMES[s.dayOfWeek]} ${s.time} - ${s.language} ${s.massType}`);
      }
    }

    console.log('\n=== 2. Kościół (Poland) ===\n');

    scraper.setCountry('PL');
    const result2 = await scraper.scrape('http://parafialubojna.pl');

    console.log(`Success: ${result2.success}`);
    console.log(`Schedules: ${result2.schedules.length}`);
    console.log(`Error: ${result2.error || 'none'}\n`);

    if (result2.schedules.length > 0) {
      console.log('Sample schedules:');
      for (const s of result2.schedules.slice(0, SAMPLE_SIZE)) {
        console.log(` ${DAY_NAMES[s.dayOfWeek]} ${s.time} - ${s.language} ${s.massType}`);
      }
    } else if (result2.rawHtml) {
      // Drop scripts, styles and tags, then collapse whitespace, so the
      // keyword search runs over plain lower-cased text.
      const text = result2.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Look for Polish schedule keywords. indexOf() returns -1 (truthy) on
      // a miss and 0 (falsy) for a match at the start, so the results must
      // be compared against -1 explicitly rather than chained with `||`.
      const scheduleIndex =
        POLISH_SCHEDULE_KEYWORDS
          .map((keyword) => text.indexOf(keyword))
          .find((index) => index !== -1) ?? -1;

      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + SNIPPET_LENGTH);
        console.log('Found schedule section:');
        console.log(snippet);
      }
    }
  } finally {
    // Release the scraper even if one of the scrape calls failed.
    await scraper.close();
  }
}
|
||
|
|
|
||
|
|
// Run the check. On failure, log the error and set a non-zero exit code so
// that shells and CI do not treat a failed run as a success.
checkRealBugs().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|