59 lines
1.7 KiB
TypeScript
59 lines
1.7 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
 * Debug script: scrapes one specific French page and prints diagnostics
 * to help see why scraping failed.
 */
|
||
|
|
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
/**
 * Scrape one hard-coded French page and print diagnostics to stdout.
 *
 * Prints the scrape outcome (success flag, schedule count, error if any).
 * When the result carries `rawHtml`, it also prints a sample of the
 * tag-stripped, lower-cased page text, the French day names present in that
 * text, and up to 20 time-like substrings.
 *
 * The scraper is closed in a `finally` block, so it is released even when
 * `scrape()` or any of the diagnostics throws.
 *
 * @returns {Promise<void>} Settles once the scraper has been closed; rejects
 *   with whatever error `init()`, `scrape()` or `close()` raised.
 */
async function debugPage() {
  const url = 'https://www.chemin-neuf.fr/'; // Last failed church
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  // Acquire before entering `try`: if init() itself fails we do not call
  // close() on a scraper that never finished initialising.
  // NOTE(review): assumes a failed init() leaves nothing to release - confirm.
  await scraper.init();

  try {
    scraper.setCountry('FR');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}`);
    if (result.error) console.log(`Error: ${result.error}`);

    if (result.rawHtml) {
      // Reduce the HTML to plain lower-case text: drop <script>/<style>
      // bodies first so their contents are not mistaken for page copy, then
      // replace remaining tags with spaces and collapse whitespace runs.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      console.log('\n=== Page Text Sample (first 2000 chars) ===');
      console.log(text.substring(0, 2000));
      console.log('\n');

      // Check for French day names
      const frenchDays = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];
      console.log('=== French day names found ===');
      for (const day of frenchDays) {
        if (text.includes(day)) {
          console.log(`✓ Found: ${day}`);
        }
      }

      // Check for time patterns
      console.log('\n=== Time patterns (sample) ===');
      // `text` is already lower-cased, so the suffix alternatives are listed
      // in lower case and matched case-insensitively. Upper/mixed-case
      // alternatives such as "Uhr" could never match lower-cased text.
      const timeRegex = /\d{1,2}[h:.]\s*\d{0,2}\s*(?:am|pm|uhr|uur|h)?/gi;
      const times = text.match(timeRegex);
      if (times) {
        console.log(`Found ${times.length} time-like patterns:`);
        console.log(times.slice(0, 20).join(', '));
      } else {
        console.log('No time patterns found');
      }
    }
  } finally {
    // Always release the scraper, including when scraping or the
    // diagnostics above throw.
    await scraper.close();
  }
}
|
||
|
|
|
||
|
|
// Script entry point: run the debug session and report any rejection via console.error.
debugPage().catch((err) => console.error(err));
|