Files
ScraperControl/scripts/debug/debug-thursday-context.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

45 lines
1.4 KiB
TypeScript

#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/** Number of characters of text printed immediately before each match. */
const CONTEXT_BEFORE_CHARS = 150;

/** Number of characters of text printed starting at each match. */
const CONTEXT_AFTER_CHARS = 250;

/**
 * Reduces an HTML document to lowercase, single-spaced plain text.
 *
 * `<script>` and `<style>` elements are removed together with their contents,
 * every remaining tag is replaced by a space, and runs of whitespace are
 * collapsed to one space. HTML entities are left as-is (not decoded).
 *
 * @param html Raw HTML markup.
 * @returns The flattened, lowercased text.
 */
function htmlToSearchText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * Debug entry point: scrapes https://www.alterpeter.de/ with the generic
 * scraper (country set to 'DE') and prints every occurrence of the
 * "montag ... bis ... donnerstag" pattern together with the text around it.
 *
 * The scraper is always closed once it has been initialised, even when the
 * scrape or the reporting throws.
 */
async function main(): Promise<void> {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (!result.rawHtml) {
      // Previously this case produced no output at all, which is
      // indistinguishable from a hung or crashed run.
      console.warn('No raw HTML returned by scraper; nothing to analyse.');
      return;
    }

    const text = htmlToSearchText(result.rawHtml);

    // Find "montag bis donnerstag" pattern.
    // `[^]*?` lazily matches any characters, so the three words do not have to
    // be adjacent; a single match may span intervening text.
    const pattern = /montag[^]*?bis[^]*?donnerstag/gi;
    const matches = [...text.matchAll(pattern)];

    console.log(`Found ${matches.length} instances of "montag bis donnerstag":\n`);

    for (const [i, match] of matches.entries()) {
      // `??` rather than `||`: only a missing index should fall back to 0.
      const matchIndex = match.index ?? 0;
      const contextBefore = text.substring(
        Math.max(0, matchIndex - CONTEXT_BEFORE_CHARS),
        matchIndex,
      );
      // substring() clamps an end index beyond the string length.
      const contextAfter = text.substring(matchIndex, matchIndex + CONTEXT_AFTER_CHARS);

      console.log(`=== Instance ${i + 1} ===`);
      console.log(`Position: ${matchIndex}`);
      console.log(`\nContext BEFORE (${CONTEXT_BEFORE_CHARS} chars):`);
      console.log(`"${contextBefore}"`);
      console.log(`\nContext AFTER (${CONTEXT_AFTER_CHARS} chars):`);
      console.log(`"${contextAfter}"`);
      console.log('');
    }
  } finally {
    // Release the scraper on both the success and the failure path.
    await scraper.close();
  }
}
// Run the script. On failure, log the error and mark the process as failed so
// that shells and CI see a non-zero exit status instead of a silent success.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});