#!/usr/bin/env tsx
/**
 * Debug why the German church site (alterpeter.de) produces duplicate schedules.
 */

import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
// Debug-only monkey patch: replace GenericScraper's `parseSchedules` with a
// wrapper that prints the schedule sections detected in the page text, then
// delegates to the original implementation and prints the extracted times
// grouped by day of week. The members are reached through bracket notation
// (presumably because they are not public — confirm against the class).
const originalParse = GenericScraper.prototype['parseSchedules'];

/**
 * Logging wrapper around the original `parseSchedules`.
 *
 * @param html Raw HTML handed to the scraper's parser.
 * @returns Exactly what the original `parseSchedules` returns for `html`.
 */
GenericScraper.prototype['parseSchedules'] = function(html: string) {
  // Index = day number as used by `section.day` and `sched.dayOfWeek` (0 = Sunday).
  const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];

  // Reduce the HTML to lowercase plain text: drop <script>/<style> bodies,
  // replace remaining tags with spaces, then collapse whitespace runs.
  const withoutScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, ' ');
  const plainText = withoutTags.replace(/\s+/g, ' ').toLowerCase();

  // Ask the scraper which sections it detects in that text and dump them.
  const sections = this['findScheduleSections'](plainText);

  console.log('\n=== Sections found ===\n');
  sections.forEach((section: any, index: number) => {
    const ordinal = index + 1;
    const preview = section.text.substring(0, 100);
    console.log(`Section ${ordinal}: ${dayNames[section.day]} (day ${section.day})`);
    console.log(` Text preview: "${preview}..."`);
  });
  console.log(`\nTotal sections: ${sections.length}\n`);

  // Run the untouched parser on the raw HTML; its result is returned as-is.
  const result = originalParse.call(this, html);

  console.log(`\n=== Extracted times per section ===\n`);

  // Bucket the parsed schedules by their day-of-week number.
  const schedsByDay: Record<number, typeof result> = {};
  for (const sched of result) {
    const bucket = schedsByDay[sched.dayOfWeek] ?? (schedsByDay[sched.dayOfWeek] = []);
    bucket.push(sched);
  }

  // Print one line per day that has at least one schedule, Sunday first.
  dayNames.forEach((dayName, day) => {
    const entries = schedsByDay[day];
    if (!entries) return;
    const times = entries.map(s => s.time).join(', ');
    console.log(`${dayName}: ${times}`);
  });

  return result;
};
|
||
|
|
|
||
|
|
/**
 * Scrapes https://www.alterpeter.de/ with a GenericScraper configured for
 * country 'DE' and prints whether the scrape succeeded and how many
 * schedules it returned.
 *
 * The scraper is closed in a `finally` block so that it is released even
 * when `scrape()` (or the logging that follows) throws.
 *
 * @returns A promise that settles once the scraper has been closed.
 */
async function testGerman(): Promise<void> {
  const url = 'https://www.alterpeter.de/';
  console.log(`Testing: ${url}`);

  const scraper = new GenericScraper();
  // init() stays outside the try: if it fails there is presumably nothing
  // to close yet — confirm against GenericScraper.init().
  await scraper.init();

  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape(url);

    console.log(`\n=== Final Result ===`);
    console.log(`Success: ${result.success}`);
    console.log(`Total schedules: ${result.schedules.length}`);
  } finally {
    // Always release the scraper, including on failure paths.
    await scraper.close();
  }
}

// Script entry point: run the check and print any rejection to stderr.
testGerman().catch(console.error);
|