#!/usr/bin/env tsx
/**
 * Test website scraper on churches with websites
 * Analyzes which websites can be scraped successfully
 *
 * Usage: <script> [--limit N] [--country NAME]
 */

// Load .env before any database client is constructed.
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import fs from 'fs';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

/** Number of churches tested when `--limit` is not given. */
const DEFAULT_LIMIT = 50;

/** Horizontal rule used by the console banners. */
const RULE = '============================================================';

/** Short day labels indexed by `dayOfWeek` (0 = Sun, as used by the log output). */
const DAY_NAMES = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'] as const;

/** Outcome of scraping a single church website. */
interface TestResult {
  churchId: string;
  name: string;
  website: string;
  country: string;
  success: boolean;
  massesFound: number;
  schedules?: { dayOfWeek: number; time: string; massType?: string; language?: string }[];
  error?: string;
}

/**
 * Ensures a URL carries an explicit scheme.
 *
 * Surrounding whitespace is removed and the scheme check is case-insensitive,
 * so values such as `HTTP://example.org` are not prefixed a second time.
 *
 * @param url Website value as stored for the church.
 * @returns The URL unchanged if it already starts with http(s)://, otherwise prefixed with `https://`.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();
  return /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`;
}

/** Extracts a printable message from an arbitrary thrown value. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Formats `part / total` as a one-decimal percentage, returning `0.0` when `total` is zero. */
function formatPercent(part: number, total: number): string {
  return total === 0 ? '0.0' : ((part / total) * 100).toFixed(1);
}

/**
 * Scrapes the websites of the most recently created churches and reports the outcome.
 *
 * Prints per-church progress and a summary to the console and writes the
 * collected results to `scraper-test-results.json` in the working directory.
 *
 * @param limit Maximum number of churches to test.
 * @param country Optional country filter; all countries are included when omitted.
 * @returns One result entry per tested church.
 */
async function testScrapers(limit: number = DEFAULT_LIMIT, country?: string): Promise<TestResult[]> {
  const results: TestResult[] = [];

  // Get churches with websites
  const churches = await prisma.church.findMany({
    where: {
      website: { not: null },
      ...(country ? { country } : {}),
    },
    take: limit,
    orderBy: { createdAt: 'desc' },
  });

  console.log(`Testing ${churches.length} churches with websites...\n`);

  // Initialize the scraper (launches Playwright browser)
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (const [index, church] of churches.entries()) {
      // The query already excludes null websites; this check only narrows the type.
      if (church.website === null) {
        continue;
      }
      const url = normalizeUrl(church.website);

      console.log(`[${index + 1}/${churches.length}] Testing: ${church.name}`);
      console.log(` Website: ${url}`);

      try {
        const result = await scraper.scrape(url);

        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: result.success,
          massesFound: result.schedules.length,
          schedules: result.schedules.map((s) => ({
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            massType: s.massType,
            language: s.language,
          })),
          error: result.error,
        });

        if (result.success) {
          console.log(` ✓ ${result.schedules.length} masses found`);
          for (const s of result.schedules) {
            // Fall back to the raw number if the scraper reports a day outside 0-6.
            const day = DAY_NAMES[s.dayOfWeek] ?? `Day ${s.dayOfWeek}`;
            console.log(` ${day} ${s.time} (${s.language || 'English'}${s.massType ? ', ' + s.massType : ''})`);
          }
        } else {
          console.log(` ✗ No masses found: ${result.error}`);
        }
      } catch (error: unknown) {
        // A failure on one site must not abort the whole run; record it and move on.
        const message = errorMessage(error);
        console.log(` ✗ Error: ${message}`);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: false,
          massesFound: 0,
          error: message,
        });
      }

      console.log('');
    }
  } finally {
    // Always close the browser
    await scraper.close();
  }

  // Summary
  const successful = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalMasses = results.reduce((sum, r) => sum + r.massesFound, 0);

  console.log(RULE);
  console.log('Test Summary');
  console.log(RULE);
  console.log(`Total churches tested: ${results.length}`);
  console.log(`Successful scrapes: ${successful.length} (${formatPercent(successful.length, results.length)}%)`);
  console.log(`Failed scrapes: ${failed.length} (${formatPercent(failed.length, results.length)}%)`);
  console.log(`Total masses found: ${totalMasses}`);
  console.log(RULE);

  if (failed.length > 0) {
    console.log('\nFailed websites:');
    for (const f of failed) {
      console.log(` - ${f.name}: ${f.website} (${f.error})`);
    }
  }

  console.log('');

  // Export results (without raw HTML to keep file manageable)
  fs.writeFileSync('scraper-test-results.json', JSON.stringify(results, null, 2));
  console.log('Results saved to scraper-test-results.json');

  return results;
}

/**
 * Parses the value following `--limit`.
 *
 * @param raw The argument after `--limit`, or `undefined` if the flag was last.
 * @returns The limit as a non-negative integer.
 * @throws Error when the value is missing or not a plain non-negative integer.
 */
function parseLimit(raw: string | undefined): number {
  if (raw === undefined || !/^\d+$/.test(raw)) {
    throw new Error(`--limit expects a non-negative integer, got: ${raw ?? '(nothing)'}`);
  }
  return Number.parseInt(raw, 10);
}

/**
 * CLI entry point: reads `--limit` and `--country`, runs the scraper test,
 * and releases the database connections whether or not the run succeeded.
 */
async function main(): Promise<void> {
  try {
    const args = process.argv.slice(2);
    const limitIndex = args.indexOf('--limit');
    const countryIndex = args.indexOf('--country');

    const limit = limitIndex !== -1 ? parseLimit(args[limitIndex + 1]) : DEFAULT_LIMIT;
    const country = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

    console.log(RULE);
    console.log('Website Scraper Testing');
    console.log(RULE);
    console.log(`Limit: ${limit}`);
    console.log(`Country: ${country || 'All'}`);
    console.log(`${RULE}\n`);

    await testScrapers(limit, country);
  } finally {
    // Release connections even on failure so the process can exit promptly.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}

main().catch((error: unknown) => {
  console.error(error);
  // Signal failure to the calling shell / CI instead of exiting with 0.
  process.exitCode = 1;
});