chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
152
scripts/test-scraper.ts
Normal file
152
scripts/test-scraper.ts
Normal file
@@ -0,0 +1,152 @@
|
||||
import { GenericScraper } from '../src/scrapers/strategies/generic';
|
||||
import { getScraper } from '../src/scrapers/registry';
|
||||
import type { BaseScraper, ScrapeResult } from '../src/scrapers/base-scraper';
|
||||
|
||||
/** Values this script accepts on the command line. */
interface CliOptions {
  /** First positional argument, or null when none was given. */
  url: string | null;
  /** Value following `--country`, or null when absent. */
  country: string | null;
  /** Value following `--lang` (e.g. `--lang english`), or null when absent. */
  lang: string | null;
}

/**
 * Parses the script's command-line arguments.
 *
 * A flag's value is the argument directly after its first occurrence. It is
 * treated as missing when there is no next argument, when it is empty, or when
 * it is itself another `--flag`. The URL is the first argument that is neither
 * a flag nor a value consumed by a flag, so flags may appear before the URL or
 * the URL may be omitted entirely.
 *
 * @param args Arguments after the script path (i.e. `process.argv.slice(2)`).
 * @returns The parsed URL, country and language, each null when not supplied.
 */
function parseCliArgs(args: readonly string[]): CliOptions {
  // Indices of arguments used as flag values; they must not be mistaken for the URL.
  const consumed = new Set<number>();

  const flagValue = (flag: string): string | null => {
    const at = args.indexOf(flag);
    if (at === -1) return null;
    const value = args[at + 1];
    if (value === undefined || value === '' || value.startsWith('--')) return null;
    consumed.add(at + 1);
    return value;
  };

  const country = flagValue('--country');
  const lang = flagValue('--lang');
  const url =
    args.find((arg, index) => !consumed.has(index) && arg !== '' && !arg.startsWith('--')) ?? null;

  return { url, country, lang };
}

const CLI_OPTIONS = parseCliArgs(process.argv.slice(2));

// URL to scrape; falls back to a default page when no positional argument is given.
const TEST_URL = CLI_OPTIONS.url ?? 'https://www.saintpatrickscathedral.org/masses';

// Value of the --country flag, or null when not supplied.
const COUNTRY_CODE = CLI_OPTIONS.country;

// Value of the --lang flag (e.g., --lang english), or null when not supplied.
const LANG = CLI_OPTIONS.lang;

// Display names indexed by day-of-week number (0 = Sunday ... 6 = Saturday).
const DAY_NAMES = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
|
||||
|
||||
/**
 * Reduces an HTML document to single-spaced plain text.
 *
 * Removes `<script>` and `<style>` elements together with their contents,
 * replaces every remaining tag with a space, converts en/em dashes to `-`,
 * collapses whitespace runs and trims the result.
 */
function htmlToPlainText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/[\u2013\u2014]/g, '-')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Prints the schedules grouped by day of week, in DAY_NAMES order.
 *
 * Each entry shows its time, its language (only when set and not 'English'),
 * its mass type in brackets and any notes. Days without schedules are skipped.
 */
function printSchedulesByDay(schedules: ScrapeResult['schedules']): void {
  const byDay = new Map<number, ScrapeResult['schedules']>();
  for (const schedule of schedules) {
    const bucket = byDay.get(schedule.dayOfWeek);
    if (bucket) {
      bucket.push(schedule);
    } else {
      byDay.set(schedule.dayOfWeek, [schedule]);
    }
  }

  DAY_NAMES.forEach((dayName, day) => {
    const daySchedules = byDay.get(day);
    if (!daySchedules || daySchedules.length === 0) return;

    console.log(`\n${dayName}:`);
    for (const s of daySchedules) {
      const parts = [
        ` ${s.time}`,
        s.language && s.language !== 'English' ? `(${s.language})` : '',
        s.massType ? `[${s.massType}]` : '',
        s.notes ? `- ${s.notes}` : '',
      ].filter(Boolean);
      console.log(parts.join(' '));
    }
  });
}

/**
 * Runs one scrape of TEST_URL and prints a human-readable report.
 *
 * Uses the scraper returned by `getScraper(LANG)` when LANG is set, otherwise a
 * `GenericScraper`. COUNTRY_CODE is applied only to a `GenericScraper`. The
 * report contains the scrape status, any church data, the parsed schedules
 * grouped by day, a preview of the page text and a summary.
 *
 * An error thrown while initialising or scraping is logged and marks the
 * process as failed via `process.exitCode`; the scraper is closed in every case.
 */
async function main(): Promise<void> {
  // Number of plain-text characters shown in the raw preview.
  const previewLimit = 1000;

  console.log('\n' + '='.repeat(70));
  console.log('NEARESTMASS SCRAPER TEST');
  console.log('='.repeat(70));
  console.log(`\nURL: ${TEST_URL}`);
  console.log(`Country: ${COUNTRY_CODE || '(auto-detect from <html lang>)'}`);
  console.log(`Scraper: ${LANG || 'generic'}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + '-'.repeat(70));

  let scraper: BaseScraper;

  if (LANG) {
    scraper = getScraper(LANG);
    console.log(`\n Using ${LANG} scraper`);
  } else {
    scraper = new GenericScraper();
  }

  try {
    console.log('\n[1/4] Initializing browser...');
    await scraper.init();
    console.log(' ✓ Browser ready');

    if (COUNTRY_CODE && scraper instanceof GenericScraper) {
      scraper.setCountry(COUNTRY_CODE);
      console.log(` Country set to: ${COUNTRY_CODE}`);
    }

    console.log('\n[2/4] Fetching page...');
    const startTime = Date.now();
    const result: ScrapeResult = await scraper.scrape(TEST_URL);
    const elapsed = Date.now() - startTime;
    console.log(` ✓ Page loaded in ${elapsed}ms`);

    console.log('\n[3/4] Parsing results...');
    console.log(` Status: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);
    console.log(` Schedules found: ${result.schedules.length}`);

    if (result.detectedLanguage) {
      console.log(` Detected language: ${result.detectedLanguage}`);
    }

    if (result.churchData) {
      console.log('\n Church Data:');
      if (result.churchData.phone) console.log(` Phone: ${result.churchData.phone}`);
      if (result.churchData.email) console.log(` Email: ${result.churchData.email}`);
      if (result.churchData.pastorName) console.log(` Pastor: ${result.churchData.pastorName}`);
      if (result.churchData.diocese) console.log(` Diocese: ${result.churchData.diocese}`);
    }

    if (result.error) {
      console.log(` Error: ${result.error}`);
    }

    if (result.schedules.length > 0) {
      console.log('\n' + '-'.repeat(70));
      console.log('PARSED MASS SCHEDULES');
      console.log('-'.repeat(70));
      printSchedulesByDay(result.schedules);
    }

    if (result.rawHtml) {
      console.log('\n' + '-'.repeat(70));
      console.log('RAW TEXT PREVIEW (first 1000 chars, stripped of HTML)');
      console.log('-'.repeat(70));

      const plainText = htmlToPlainText(result.rawHtml);
      console.log('\n' + plainText.substring(0, previewLimit));

      // Compare the stripped text, not the raw HTML, so the notice only
      // appears when the preview really was cut short.
      if (plainText.length > previewLimit) {
        console.log('\n... (truncated)');
      }
    }

    console.log('\n' + '='.repeat(70));
    console.log('SUMMARY');
    console.log('='.repeat(70));
    console.log(`URL: ${TEST_URL}`);
    console.log(`Scraper: ${LANG || 'generic'}`);
    console.log(`Country: ${COUNTRY_CODE || '(auto-detected)'}`);
    console.log(`Language: ${result.detectedLanguage || '(unknown)'}`);
    console.log(`Success: ${result.success ? 'Yes' : 'No'}`);
    console.log(`Schedules: ${result.schedules.length}`);
    console.log(`HTML Size: ${result.rawHtml ? Math.round(result.rawHtml.length / 1024) + ' KB' : 'N/A'}`);

    if (result.schedules.length > 0) {
      const days = [...new Set(result.schedules.map(s => s.dayOfWeek))];
      const languages = [...new Set(result.schedules.map(s => s.language || 'English'))];
      console.log(`Days: ${days.map(d => DAY_NAMES[d]).join(', ')}`);
      console.log(`Languages: ${languages.join(', ')}`);
    }

    console.log('='.repeat(70) + '\n');
  } catch (error) {
    console.error('\n[ERROR]', error);
    // Let shells and CI see the failure without cutting the cleanup below short.
    process.exitCode = 1;
  } finally {
    console.log('[4/4] Closing browser...');
    await scraper.close();
    console.log(' ✓ Done\n');
  }
}
|
||||
|
||||
// Script entry point. A rejection from main() is logged and reported through a
// non-zero exit code so the failure is visible to the calling shell.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user