39 lines
1.0 KiB
TypeScript
39 lines
1.0 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
/**
 * Diagnostic helper: scrapes http://parafialubojna.pl with a GenericScraper
 * configured for country 'PL' and prints every occurrence of the word
 * "niedziela" found in the page text, together with surrounding context.
 *
 * Nothing is printed when the scrape result carries no `rawHtml`.
 * The scraper is closed even if scraping or text processing throws; the
 * original error still propagates to the caller.
 *
 * @returns {Promise<void>} Resolves once the report is printed and the
 *   scraper has been closed.
 */
async function check() {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('PL');

    const result = await scraper.scrape('http://parafialubojna.pl');
    if (result.rawHtml) {
      // Reduce the HTML to lowercase plain text: drop <script> and <style>
      // elements with their contents, replace remaining tags with a space,
      // then collapse whitespace runs so offsets refer to readable text.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      const occurrences = [];
      let idx = 0;
      while ((idx = text.indexOf('niedziela', idx)) !== -1) {
        occurrences.push({
          position: idx,
          // Up to 30 characters before the hit and 70 from its start.
          context: text.substring(Math.max(0, idx - 30), idx + 70),
        });
        // Resume the search one character past the start of this hit.
        idx++;
      }

      console.log(`niedziela occurrences: ${occurrences.length}\n`);
      occurrences.forEach((m, i) => {
        console.log(`Occurrence ${i + 1} at position ${m.position}:`);
        console.log(` "${m.context}"`);
        console.log('');
      });
    }
  } finally {
    // Always close the scraper, including when scrape() rejects.
    await scraper.close();
  }
}
|
||
|
|
|
||
|
|
// Run the diagnostic; report any failure and exit non-zero instead of
// leaving the returned promise without a rejection handler.
check().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|