Files
ScraperControl/scripts/debug/test-website-scraper.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

174 lines
5.2 KiB
TypeScript

#!/usr/bin/env tsx
/**
* Test website scraper on churches with websites
* Analyzes which websites can be scraped successfully
*/
// Load .env
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import fs from 'fs';
// Database wiring: a node-postgres pool built from DATABASE_URL, handed to Prisma
// through the pg driver adapter. dotenv.config() above runs before these lines,
// so a DATABASE_URL loaded from .env is already present in process.env here.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Outcome of running the scraper against a single church website.
 * One entry is recorded per church, whether the scrape succeeded or not.
 */
interface TestResult {
  /** Database id of the church that was tested. */
  churchId: string;
  /** Church name as stored in the database. */
  name: string;
  /** URL that was actually passed to the scraper (after normalization). */
  website: string;
  /** Country value stored on the church record. */
  country: string;
  /** Whether the scraper reported success for this website. */
  success: boolean;
  /** Number of schedule entries the scraper returned (0 on error). */
  massesFound: number;
  /**
   * Schedule entries returned by the scraper; omitted when the scrape threw.
   * `dayOfWeek` is used as an index into Sun..Sat (0 = Sunday) when printed.
   */
  schedules?: Array<{
    dayOfWeek: number;
    time: string;
    massType?: string;
    language?: string;
  }>;
  /** Error text reported by the scraper or taken from a thrown exception. */
  error?: string;
}
/**
 * Ensures a stored website value is an absolute http(s) URL.
 *
 * - Surrounding whitespace is removed.
 * - Values that already carry an `http://` or `https://` scheme (matched
 *   case-insensitively) are returned as-is.
 * - Protocol-relative values (`//host/path`) get `https:` prepended.
 * - Anything else is treated as a bare host/path and prefixed with `https://`.
 *
 * @param url Raw website value, e.g. from the database.
 * @returns The URL with an explicit scheme.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();

  // Case-insensitive so "HTTP://example.com" is not turned into "https://HTTP://example.com".
  if (/^https?:\/\//i.test(trimmed)) {
    return trimmed;
  }

  // Protocol-relative form: only the scheme is missing, the slashes are already there.
  if (trimmed.startsWith('//')) {
    return `https:${trimmed}`;
  }

  return `https://${trimmed}`;
}
/** Short weekday labels indexed by a schedule's `dayOfWeek` (0 = Sunday). */
const WEEKDAY_LABELS = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

/** Formats `count / total` as a one-decimal percentage; yields "0.0" when `total` is 0. */
function formatShare(count: number, total: number): string {
  if (total === 0) {
    return '0.0';
  }
  return ((count / total) * 100).toFixed(1);
}

/** Extracts a printable message from whatever value was thrown. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Runs the generic scraper against churches that have a website and reports
 * how many of them could be scraped.
 *
 * Churches are taken newest-first (by `createdAt`). Each church yields exactly
 * one `TestResult`; a thrown scrape error is recorded as a failed result rather
 * than aborting the run. A summary is printed to stdout and the full result list
 * is written to `scraper-test-results.json` in the current working directory.
 *
 * @param limit Maximum number of churches to load (passed to the query as `take`).
 * @param country When given, only churches whose `country` equals this value are tested.
 * @returns The per-church results, in the order the churches were tested.
 */
async function testScrapers(limit: number = 50, country?: string): Promise<TestResult[]> {
  const results: TestResult[] = [];

  // Only churches with a website are candidates.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- the generated Prisma where-input type is not imported in this script
  const whereClause: any = {
    website: { not: null },
  };
  if (country) {
    whereClause.country = country;
  }

  const churches = await prisma.church.findMany({
    where: whereClause,
    take: limit,
    orderBy: { createdAt: 'desc' },
  });

  console.log(`Testing ${churches.length} churches with websites...\n`);

  // Initialize the scraper (launches Playwright browser)
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (const [index, church] of churches.entries()) {
      // The query above filters out null websites, so the assertion is safe here.
      const url = normalizeUrl(church.website!);

      console.log(`[${index + 1}/${churches.length}] Testing: ${church.name}`);
      console.log(` Website: ${url}`);

      try {
        const result = await scraper.scrape(url);

        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: result.success,
          massesFound: result.schedules.length,
          // Keep only the fields needed for the report.
          schedules: result.schedules.map((s) => ({
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            massType: s.massType,
            language: s.language,
          })),
          error: result.error,
        });

        if (result.success) {
          console.log(`${result.schedules.length} masses found`);
          for (const s of result.schedules) {
            console.log(` ${WEEKDAY_LABELS[s.dayOfWeek]} ${s.time} (${s.language || 'English'}${s.massType ? ', ' + s.massType : ''})`);
          }
        } else {
          console.log(` ✗ No masses found: ${result.error}`);
        }
      } catch (error: unknown) {
        // A single broken site must not stop the run; record it as a failure.
        const message = describeError(error);
        console.log(` ✗ Error: ${message}`);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: false,
          massesFound: 0,
          error: message,
        });
      }

      console.log('');
    }
  } finally {
    // Always close the browser
    await scraper.close();
  }

  // Summary
  const successful = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalMasses = results.reduce((sum, r) => sum + r.massesFound, 0);

  console.log('============================================================');
  console.log('Test Summary');
  console.log('============================================================');
  console.log(`Total churches tested: ${results.length}`);
  // formatShare guards against an empty run, which would otherwise print "NaN%".
  console.log(`Successful scrapes: ${successful.length} (${formatShare(successful.length, results.length)}%)`);
  console.log(`Failed scrapes: ${failed.length} (${formatShare(failed.length, results.length)}%)`);
  console.log(`Total masses found: ${totalMasses}`);
  console.log('============================================================');

  if (failed.length > 0) {
    console.log('\nFailed websites:');
    for (const f of failed) {
      console.log(` - ${f.name}: ${f.website} (${f.error})`);
    }
  }
  console.log('');

  // Export results (only the summarized fields collected above)
  fs.writeFileSync(
    'scraper-test-results.json',
    JSON.stringify(results, null, 2)
  );
  console.log('Results saved to scraper-test-results.json');

  return results;
}
/**
 * CLI entry point.
 *
 * Recognized arguments:
 * - `--limit <n>`: maximum number of churches to test (default 50).
 * - `--country <value>`: restrict the run to one country (default: all).
 *
 * Database resources are released even when the test run fails.
 *
 * @throws Error when `--limit` is present but its value is missing or not an integer.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const limitIndex = args.indexOf('--limit');
  const countryIndex = args.indexOf('--country');

  let limit = 50;
  if (limitIndex !== -1) {
    const rawLimit: string | undefined = args[limitIndex + 1];
    limit = Number.parseInt(rawLimit === undefined ? '' : rawLimit, 10);
    // Fail early with a clear message instead of handing NaN to the database query.
    if (Number.isNaN(limit)) {
      throw new Error(`Invalid --limit value: ${rawLimit === undefined ? '(missing)' : rawLimit}`);
    }
  }
  const country = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

  console.log('============================================================');
  console.log('Website Scraper Testing');
  console.log('============================================================');
  console.log(`Limit: ${limit}`);
  console.log(`Country: ${country || 'All'}`);
  console.log('============================================================\n');

  try {
    await testScrapers(limit, country);
  } finally {
    // Release the Prisma client and the pg pool regardless of the outcome so
    // open connections do not keep the process alive after a failure.
    await prisma.$disconnect();
    await pool.end();
  }
}
// Run the script. On failure, log the error and mark the process as failed so
// shells and CI see a non-zero exit status instead of a silent success.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});