chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Albert
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions

152
scripts/test-scraper.ts Normal file
View File

@@ -0,0 +1,152 @@
import { GenericScraper } from '../src/scrapers/strategies/generic';
import { getScraper } from '../src/scrapers/registry';
import type { BaseScraper, ScrapeResult } from '../src/scrapers/base-scraper';
/** Flags that consume the argument following them as their value. */
const VALUE_FLAGS = ['--country', '--lang'];

/**
 * Returns the value following `flag` in `args`, or null when the flag is
 * absent, is the last argument, or is immediately followed by another flag.
 */
function readFlagValue(args: readonly string[], flag: string): string | null {
  const index = args.indexOf(flag);
  if (index === -1) return null;
  const value = args[index + 1];
  return value !== undefined && !value.startsWith('--') ? value : null;
}

/**
 * Returns the first argument that is neither a `--flag` nor the value consumed
 * by one of VALUE_FLAGS, or null when there is none. This lets the URL appear
 * before or after the flags.
 */
function readPositionalUrl(args: readonly string[]): string | null {
  let skipValue = false;
  for (const arg of args) {
    const isFlag = arg.startsWith('--');
    if (skipValue && !isFlag) {
      // This argument is the value of the preceding --country/--lang flag.
      skipValue = false;
      continue;
    }
    skipValue = VALUE_FLAGS.includes(arg);
    if (!isFlag) return arg;
  }
  return null;
}

// User-supplied arguments only (drops the runtime binary and the script path).
const CLI_ARGS = process.argv.slice(2);

// `||` rather than `??` on purpose: an empty-string URL also falls back to the default.
const TEST_URL = readPositionalUrl(CLI_ARGS) || 'https://www.saintpatrickscathedral.org/masses';
// Parse --country flag from CLI args
const COUNTRY_CODE = readFlagValue(CLI_ARGS, '--country');
// Parse --lang flag from CLI args (e.g., --lang english)
const LANG = readFlagValue(CLI_ARGS, '--lang');
// Indexed by the numeric dayOfWeek of a schedule (0 = Sunday ... 6 = Saturday).
const DAY_NAMES = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
/**
 * Scrapes TEST_URL once and prints a human-readable report to stdout:
 * run header, scrape status, any church data, the parsed schedules grouped
 * by day of week, a plain-text preview of the fetched HTML, and a summary.
 *
 * Uses the scraper registered for LANG when `--lang` was given, otherwise a
 * GenericScraper (to which COUNTRY_CODE is applied when set).
 *
 * Errors thrown between init and the end of reporting are logged and mark the
 * process as failed via `process.exitCode`; the scraper is always closed.
 */
async function main(): Promise<void> {
  const heavyRule = '='.repeat(70);
  const lightRule = '-'.repeat(70);
  // Maximum number of stripped-text characters shown in the preview section.
  const previewLimit = 1000;

  console.log('\n' + heavyRule);
  console.log('NEARESTMASS SCRAPER TEST');
  console.log(heavyRule);
  console.log(`\nURL: ${TEST_URL}`);
  console.log(`Country: ${COUNTRY_CODE || '(auto-detect from <html lang>)'}`);
  console.log(`Scraper: ${LANG || 'generic'}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + lightRule);

  let scraper: BaseScraper;
  if (LANG) {
    scraper = getScraper(LANG);
    console.log(`\n Using ${LANG} scraper`);
  } else {
    scraper = new GenericScraper();
  }

  try {
    console.log('\n[1/4] Initializing browser...');
    await scraper.init();
    console.log(' ✓ Browser ready');

    // setCountry is only called on GenericScraper instances.
    if (COUNTRY_CODE && scraper instanceof GenericScraper) {
      scraper.setCountry(COUNTRY_CODE);
      console.log(` Country set to: ${COUNTRY_CODE}`);
    }

    console.log('\n[2/4] Fetching page...');
    const startTime = Date.now();
    const result: ScrapeResult = await scraper.scrape(TEST_URL);
    const elapsed = Date.now() - startTime;
    console.log(` ✓ Page loaded in ${elapsed}ms`);

    console.log('\n[3/4] Parsing results...');
    console.log(` Status: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);
    console.log(` Schedules found: ${result.schedules.length}`);
    if (result.detectedLanguage) {
      console.log(` Detected language: ${result.detectedLanguage}`);
    }
    if (result.churchData) {
      console.log('\n Church Data:');
      if (result.churchData.phone) console.log(` Phone: ${result.churchData.phone}`);
      if (result.churchData.email) console.log(` Email: ${result.churchData.email}`);
      if (result.churchData.pastorName) console.log(` Pastor: ${result.churchData.pastorName}`);
      if (result.churchData.diocese) console.log(` Diocese: ${result.churchData.diocese}`);
    }
    if (result.error) {
      console.log(` Error: ${result.error}`);
    }

    if (result.schedules.length > 0) {
      console.log('\n' + lightRule);
      console.log('PARSED MASS SCHEDULES');
      console.log(lightRule);

      // Bucket schedules by dayOfWeek so they can be printed Sunday-first.
      const byDay: Record<number, typeof result.schedules> = {};
      for (const schedule of result.schedules) {
        if (!byDay[schedule.dayOfWeek]) {
          byDay[schedule.dayOfWeek] = [];
        }
        byDay[schedule.dayOfWeek].push(schedule);
      }

      for (let day = 0; day < 7; day++) {
        const schedules = byDay[day];
        if (schedules && schedules.length > 0) {
          console.log(`\n${DAY_NAMES[day]}:`);
          for (const s of schedules) {
            // Optional parts collapse to '' and are filtered out before joining.
            const parts = [
              ` ${s.time}`,
              s.language && s.language !== 'English' ? `(${s.language})` : '',
              s.massType ? `[${s.massType}]` : '',
              s.notes ? `- ${s.notes}` : '',
            ].filter(Boolean);
            console.log(parts.join(' '));
          }
        }
      }
    }

    if (result.rawHtml) {
      console.log('\n' + lightRule);
      console.log('RAW TEXT PREVIEW (first 1000 chars, stripped of HTML)');
      console.log(lightRule);
      // Display-only tag stripping: drop script/style blocks, replace remaining
      // tags with spaces, turn en/em dashes into '-', and collapse whitespace.
      const textOnly = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/[\u2013\u2014]/g, '-')
        .replace(/\s+/g, ' ')
        .trim();
      console.log('\n' + textOnly.substring(0, previewLimit));
      // Judge truncation on the stripped text that is actually displayed,
      // not on the raw HTML size, so the notice only appears when text was cut.
      if (textOnly.length > previewLimit) {
        console.log('\n... (truncated)');
      }
    }

    console.log('\n' + heavyRule);
    console.log('SUMMARY');
    console.log(heavyRule);
    console.log(`URL: ${TEST_URL}`);
    console.log(`Scraper: ${LANG || 'generic'}`);
    console.log(`Country: ${COUNTRY_CODE || '(auto-detected)'}`);
    console.log(`Language: ${result.detectedLanguage || '(unknown)'}`);
    console.log(`Success: ${result.success ? 'Yes' : 'No'}`);
    console.log(`Schedules: ${result.schedules.length}`);
    console.log(`HTML Size: ${result.rawHtml ? Math.round(result.rawHtml.length / 1024) + ' KB' : 'N/A'}`);
    if (result.schedules.length > 0) {
      // Distinct values, in order of first appearance in the schedule list.
      const days = [...new Set(result.schedules.map(s => s.dayOfWeek))];
      const languages = [...new Set(result.schedules.map(s => s.language || 'English'))];
      console.log(`Days: ${days.map(d => DAY_NAMES[d]).join(', ')}`);
      console.log(`Languages: ${languages.join(', ')}`);
    }
    console.log(heavyRule + '\n');
  } catch (error: unknown) {
    console.error('\n[ERROR]', error);
    // Report the failure to the calling shell instead of exiting 0.
    process.exitCode = 1;
  } finally {
    console.log('[4/4] Closing browser...');
    await scraper.close();
    console.log(' ✓ Done\n');
  }
}
// Last-resort handler for rejections that escape main(), e.g. getScraper()
// throwing before the try block or scraper.close() failing in the finally
// block. Log the error and mark the run as failed so callers see a non-zero exit.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});