174 lines
5.2 KiB
TypeScript
174 lines
5.2 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
|
||
|
|
* Test website scraper on churches with websites
|
||
|
|
* Analyzes which websites can be scraped successfully
|
||
|
|
*/
|
||
|
|
|
||
|
|
// Load .env
|
||
|
|
import dotenv from 'dotenv';
|
||
|
|
import path from 'path';
|
||
|
|
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
import fs from 'fs';
|
||
|
|
|
||
|
|
// Database wiring: a node-postgres pool (connection string taken from the
// DATABASE_URL environment variable, which dotenv loaded above) is wrapped in
// the PrismaPg driver adapter and handed to the Prisma client.
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter: adapter });
|
||
|
|
|
||
|
|
/** Outcome of one scrape attempt against a single church website. */
interface TestResult {
  /** Database id of the church record that was tested. */
  churchId: string;
  /** Church name as stored in the database. */
  name: string;
  /** Website URL that was handed to the scraper (after normalization). */
  website: string;
  /** Country value copied from the church record. */
  country: string;
  /** Whether the scraper reported success; false when the scrape threw. */
  success: boolean;
  /** Number of schedule entries the scraper returned (0 when it threw). */
  massesFound: number;
  /** Trimmed-down copy of the schedule entries returned by the scraper. */
  schedules?: Array<{
    /** Weekday index; printed via a Sun..Sat table, so 0 is Sunday. */
    dayOfWeek: number;
    time: string;
    massType?: string;
    language?: string;
  }>;
  /** Error reported by the scraper, or the message of a thrown error. */
  error?: string;
}
|
||
|
|
|
||
|
|
/**
 * Ensures a website address carries an explicit http(s) scheme.
 *
 * - Leading/trailing whitespace is removed.
 * - An existing `http://` or `https://` prefix is detected case-insensitively
 *   and the value is returned unchanged (apart from trimming), so inputs such
 *   as `HTTP://example.com` are not double-prefixed.
 * - Protocol-relative values (`//example.com`) and bare hosts get `https://`.
 *
 * @param url Raw website value, e.g. as stored in the database.
 * @returns The address with an http(s) scheme.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();

  if (/^https?:\/\//i.test(trimmed)) {
    return trimmed;
  }

  // Drop a protocol-relative prefix so we do not produce "https:////host".
  const withoutSlashes = trimmed.replace(/^\/\//, '');
  return `https://${withoutSlashes}`;
}
|
||
|
|
|
||
|
|
/** Short weekday labels indexed by `dayOfWeek` (0 = Sunday). */
const DAY_LABELS = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

/** Formats `part / total` as a one-decimal percentage; yields "0.0" when `total` is 0. */
function formatPercent(part: number, total: number): string {
  return total === 0 ? '0.0' : ((part / total) * 100).toFixed(1);
}

/** Extracts a printable message from whatever value was thrown. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Runs the generic scraper against churches that have a website and reports
 * how many could be scraped.
 *
 * Progress and a summary are printed to the console, and the collected
 * results are written to `scraper-test-results.json` in the current working
 * directory.
 *
 * @param limit Maximum number of churches to load (most recently created first).
 * @param country Optional country filter applied to the church query.
 * @returns One result entry per church that was tested.
 */
async function testScrapers(limit: number = 50, country?: string): Promise<TestResult[]> {
  const results: TestResult[] = [];

  // Get churches with websites.
  // NOTE(review): kept loosely typed as in the original; a generated Prisma
  // where-input type would be preferable if it is available to this script.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const whereClause: any = {
    website: { not: null },
  };

  if (country) {
    whereClause.country = country;
  }

  const churches = await prisma.church.findMany({
    where: whereClause,
    take: limit,
    orderBy: { createdAt: 'desc' },
  });

  console.log(`Testing ${churches.length} churches with websites...\n`);

  // Initialize the scraper (launches Playwright browser)
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (const [index, church] of churches.entries()) {
      // The query above excludes null websites, hence the assertion.
      const url = normalizeUrl(church.website!);
      console.log(`[${index + 1}/${churches.length}] Testing: ${church.name}`);
      console.log(` Website: ${url}`);

      try {
        const result = await scraper.scrape(url);

        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: result.success,
          massesFound: result.schedules.length,
          schedules: result.schedules.map((s) => ({
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            massType: s.massType,
            language: s.language,
          })),
          error: result.error,
        });

        if (result.success) {
          console.log(` ✓ ${result.schedules.length} masses found`);
          for (const s of result.schedules) {
            // Fall back to the raw index instead of printing "undefined".
            const day = DAY_LABELS[s.dayOfWeek] ?? `Day ${s.dayOfWeek}`;
            console.log(` ${day} ${s.time} (${s.language || 'English'}${s.massType ? ', ' + s.massType : ''})`);
          }
        } else {
          console.log(` ✗ No masses found: ${result.error}`);
        }
      } catch (error: unknown) {
        // A single failing site must not abort the whole run; record it and move on.
        const message = describeError(error);
        console.log(` ✗ Error: ${message}`);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: false,
          massesFound: 0,
          error: message,
        });
      }

      console.log('');
    }
  } finally {
    // Always close the browser
    await scraper.close();
  }

  // Summary
  const successful = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalMasses = results.reduce((sum, r) => sum + r.massesFound, 0);

  console.log('============================================================');
  console.log('Test Summary');
  console.log('============================================================');
  console.log(`Total churches tested: ${results.length}`);
  // formatPercent guards against division by zero when nothing was tested.
  console.log(`Successful scrapes: ${successful.length} (${formatPercent(successful.length, results.length)}%)`);
  console.log(`Failed scrapes: ${failed.length} (${formatPercent(failed.length, results.length)}%)`);
  console.log(`Total masses found: ${totalMasses}`);
  console.log('============================================================');

  if (failed.length > 0) {
    console.log('\nFailed websites:');
    for (const f of failed) {
      console.log(` - ${f.name}: ${f.website} (${f.error})`);
    }
  }

  console.log('');

  // Export results (without raw HTML to keep file manageable)
  fs.writeFileSync(
    'scraper-test-results.json',
    JSON.stringify(results, null, 2)
  );
  console.log('Results saved to scraper-test-results.json');

  return results;
}
|
||
|
|
|
||
|
|
/**
 * CLI entry point.
 *
 * Recognized arguments:
 *   --limit <n>       maximum number of churches to test (default 50)
 *   --country <name>  restrict the run to one country
 *
 * A missing or non-positive/non-numeric `--limit` value falls back to the
 * default with a warning instead of passing `NaN` on to the database query.
 * The Prisma client and the pg pool are released even if the run fails.
 */
async function main(): Promise<void> {
  const defaultLimit = 50;
  const args = process.argv.slice(2);
  const limitIndex = args.indexOf('--limit');
  const countryIndex = args.indexOf('--country');

  let limit = defaultLimit;
  if (limitIndex !== -1) {
    const rawLimit = args[limitIndex + 1];
    const parsed = Number.parseInt(rawLimit ?? '', 10);
    if (Number.isInteger(parsed) && parsed > 0) {
      limit = parsed;
    } else {
      console.warn(`Invalid --limit value "${rawLimit}"; using default of ${defaultLimit}`);
    }
  }
  const country = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

  console.log('============================================================');
  console.log('Website Scraper Testing');
  console.log('============================================================');
  console.log(`Limit: ${limit}`);
  console.log(`Country: ${country || 'All'}`);
  console.log('============================================================\n');

  try {
    await testScrapers(limit, country);
  } finally {
    // Release database resources on success and on failure alike.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||
|
|
|
||
|
|
// Run the CLI. On failure, log the error and mark the process as failed so
// shells and CI see a non-zero exit status.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|