Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
46 lines
1.3 KiB
TypeScript
46 lines
1.3 KiB
TypeScript
#!/usr/bin/env tsx
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
|
|
|
/**
 * Debug helper: scrapes https://www.alterpeter.de/ with GenericScraper and
 * prints every "00 uhr" occurrence found in the page text, together with
 * the 50 characters before it and up to 100 characters starting at it.
 *
 * Once the scraper has been initialised it is always closed, even when the
 * scrape or the text analysis throws.
 *
 * @returns {Promise<void>} Resolves after the scraper has been closed.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (!result.rawHtml) {
      // Without this message the script would exit with no output at all.
      console.warn('Scrape returned no rawHtml; nothing to search.');
      return;
    }

    // Reduce the HTML to lower-cased plain text: drop script/style bodies
    // first so their contents are not searched, then strip the remaining
    // tags and collapse runs of whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find all instances of "00 uhr" pattern
    const pattern = /\b00\s*uhr/g;
    let count = 0;
    let match;

    console.log('Looking for "00 uhr" patterns:\n');

    // The /g flag makes exec() resume from pattern.lastIndex on each call,
    // so this loop visits every match exactly once.
    while ((match = pattern.exec(text)) !== null) {
      count++;
      const matchIndex = match.index;
      const contextBefore = text.substring(Math.max(0, matchIndex - 50), matchIndex);
      const contextAfter = text.substring(matchIndex, Math.min(text.length, matchIndex + 100));

      console.log(`=== Occurrence ${count} at position ${matchIndex} ===`);
      console.log(`BEFORE: "...${contextBefore}"`);
      console.log(`MATCH + AFTER: "${contextAfter}..."`);
      console.log('');
    }

    console.log(`Total "00 uhr" occurrences: ${count}`);
  } finally {
    // Release the scraper on both the success and the failure path.
    await scraper.close();
  }
}
|
|
|
|
// Script entry point: run main() and report any rejection on stderr.
main().catch((error) => console.error(error));
|