Files
ScraperControl/scripts/debug/find-donnerstag-sections.ts

42 lines
1.2 KiB
TypeScript
Raw Permalink Normal View History

#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Debug entry point: scrapes https://www.alterpeter.de/ with the generic
 * scraper (country set to 'DE'), reduces the returned raw HTML to lower-cased
 * plain text, and logs every occurrence of "donnerstag" (Thursday) together
 * with the 100 characters before it and the 200 characters starting at it.
 *
 * The scraper is closed even when scraping or processing throws, so a failed
 * run does not leave it open. Any such error is re-thrown to the caller.
 */
async function main(): Promise<void> {
  const targetUrl = 'https://www.alterpeter.de/';
  const searchTerm = 'donnerstag';
  const charsBefore = 100;
  const charsAfter = 200;

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape(targetUrl);

    if (!result.rawHtml) {
      // Without this message an empty result is indistinguishable from a hang.
      console.error(`No raw HTML returned for ${targetUrl}; nothing to search.`);
      return;
    }

    const text = htmlToSearchText(result.rawHtml);
    const positions = findAllIndexes(text, searchTerm);

    positions.forEach((position, i) => {
      const contextBefore = text.slice(Math.max(0, position - charsBefore), position);
      const contextAfter = text.slice(position, position + charsAfter);
      console.log(`=== Donnerstag occurrence ${i + 1} at position ${position} ===`);
      console.log(`BEFORE: "...${contextBefore}"`);
      console.log(`AFTER: "${contextAfter}..."`);
      console.log('');
    });
    console.log(`Total "donnerstag" occurrences: ${positions.length}`);
  } finally {
    // Runs on success and on failure, so the scraper is always released.
    await scraper.close();
  }
}

/**
 * Reduces HTML to searchable text: drops <script> and <style> blocks
 * (including their contents), replaces every remaining tag with a space,
 * collapses whitespace runs to a single space, and lower-cases the result.
 */
function htmlToSearchText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * Returns the start index of every occurrence of `needle` in `haystack`,
 * in ascending order. An empty needle yields no matches.
 */
function findAllIndexes(haystack: string, needle: string): number[] {
  const positions: number[] = [];
  if (needle.length === 0) {
    // indexOf('') matches at every index and would never return -1.
    return positions;
  }
  for (
    let at = haystack.indexOf(needle);
    at !== -1;
    at = haystack.indexOf(needle, at + 1)
  ) {
    positions.push(at);
  }
  return positions;
}
// Run the script. On failure, log the error and mark the process as failed so
// shells and CI see a non-zero exit status instead of a silent success.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});