chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
173
scripts/debug/test-website-scraper.ts
Normal file
173
scripts/debug/test-website-scraper.ts
Normal file
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test website scraper on churches with websites
|
||||
* Analyzes which websites can be scraped successfully
|
||||
*/
|
||||
|
||||
// Load .env
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
// Load settings from the `.env` file located in the current working directory.
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import fs from 'fs';
|
||||
|
||||
// Database wiring: a node-postgres pool configured from DATABASE_URL, wrapped
// in the Prisma pg adapter, which is then handed to the Prisma client.
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({
  adapter,
});
|
||||
|
||||
/** A single scraped Mass time, reduced to the fields kept in the report. */
type ScheduleSummary = {
  /** Weekday number; printed via a Sun..Sat label list, so 0 maps to Sunday. */
  dayOfWeek: number;
  /** Time of the Mass as reported by the scraper. */
  time: string;
  /** Optional Mass type label, when the scraper provides one. */
  massType?: string;
  /** Optional language of the Mass, when the scraper provides one. */
  language?: string;
};

/** Outcome of running the scraper against one church website. */
interface TestResult {
  /** Identifier of the church record that was tested. */
  churchId: string;
  /** Name of the church. */
  name: string;
  /** URL that was actually scraped (after scheme normalisation). */
  website: string;
  /** Country stored on the church record. */
  country: string;
  /** Whether the scrape reported success. */
  success: boolean;
  /** Number of schedules returned by the scrape (0 when it threw). */
  massesFound: number;
  /** Schedules found; absent when the scrape threw. */
  schedules?: ScheduleSummary[];
  /** Error text reported by the scraper or taken from the thrown error. */
  error?: string;
}
|
||||
|
||||
/**
 * Ensures a website address carries an explicit HTTP(S) scheme.
 *
 * Surrounding whitespace is removed first. If the value already begins with
 * `http://` or `https://` in any letter case (URI schemes are
 * case-insensitive), it is returned unchanged apart from the trimming;
 * otherwise `https://` is prepended.
 *
 * @param url - Website address, possibly stored without a scheme.
 * @returns The trimmed address, guaranteed to start with an HTTP(S) scheme.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();
  // Case-insensitive so that e.g. "HTTP://example.com" is not double-prefixed.
  if (/^https?:\/\//i.test(trimmed)) {
    return trimmed;
  }
  return `https://${trimmed}`;
}
|
||||
|
||||
/** Short weekday labels indexed by a schedule's `dayOfWeek` (0 = Sunday). */
const DAY_LABELS = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

/** Formats `part / total` as a one-decimal percentage; yields "0.0" when `total` is 0. */
function formatShare(part: number, total: number): string {
  const percent = total === 0 ? 0 : (part / total) * 100;
  return percent.toFixed(1);
}

/** Turns whatever a `catch` clause received into a printable message. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Runs the generic website scraper against churches that have a website and
 * reports how many of them could be scraped.
 *
 * Churches are read through Prisma (most recently created first), each
 * website is scraped in turn, progress and a summary are printed to the
 * console, and the collected results are written to
 * `scraper-test-results.json` in the current working directory.
 *
 * @param limit - Maximum number of churches to fetch and test (default 50).
 * @param country - When given, only churches whose `country` equals this value are tested.
 * @returns One entry per tested church, in the order they were processed.
 */
async function testScrapers(limit: number = 50, country?: string): Promise<TestResult[]> {
  const results: TestResult[] = [];

  // Get churches with websites
  const where: { website: { not: null }; country?: string } = {
    website: { not: null },
  };
  if (country) {
    where.country = country;
  }

  const churches = await prisma.church.findMany({
    where,
    take: limit,
    orderBy: { createdAt: 'desc' },
  });

  console.log(`Testing ${churches.length} churches with websites...\n`);

  // Initialize the scraper (per the original note, this launches a Playwright browser)
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (const [index, church] of churches.entries()) {
      // The query above filters out null websites, which the compiler cannot see.
      const url = normalizeUrl(church.website!);
      console.log(`[${index + 1}/${churches.length}] Testing: ${church.name}`);
      console.log(` Website: ${url}`);

      try {
        const result = await scraper.scrape(url);

        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: result.success,
          massesFound: result.schedules.length,
          schedules: result.schedules.map((s) => ({
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            massType: s.massType,
            language: s.language,
          })),
          error: result.error,
        });

        if (result.success) {
          console.log(` ✓ ${result.schedules.length} masses found`);
          for (const s of result.schedules) {
            console.log(` ${DAY_LABELS[s.dayOfWeek]} ${s.time} (${s.language || 'English'}${s.massType ? ', ' + s.massType : ''})`);
          }
        } else {
          console.log(` ✗ No masses found: ${result.error}`);
        }
      } catch (error: unknown) {
        // A failure on one site is recorded and must not abort the whole run.
        const message = describeError(error);
        console.log(` ✗ Error: ${message}`);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: false,
          massesFound: 0,
          error: message,
        });
      }

      console.log('');
    }
  } finally {
    // Always close the browser
    await scraper.close();
  }

  // Summary
  const successful = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalMasses = results.reduce((sum, r) => sum + r.massesFound, 0);

  console.log('============================================================');
  console.log('Test Summary');
  console.log('============================================================');
  console.log(`Total churches tested: ${results.length}`);
  // formatShare guards the empty case, which previously printed "NaN%".
  console.log(`Successful scrapes: ${successful.length} (${formatShare(successful.length, results.length)}%)`);
  console.log(`Failed scrapes: ${failed.length} (${formatShare(failed.length, results.length)}%)`);
  console.log(`Total masses found: ${totalMasses}`);
  console.log('============================================================');

  if (failed.length > 0) {
    console.log('\nFailed websites:');
    for (const f of failed) {
      console.log(` - ${f.name}: ${f.website} (${f.error})`);
    }
  }

  console.log('');

  // Export results (without raw HTML to keep file manageable)
  fs.writeFileSync(
    'scraper-test-results.json',
    JSON.stringify(results, null, 2)
  );
  console.log('Results saved to scraper-test-results.json');

  return results;
}
|
||||
|
||||
/**
 * Command-line entry point: parses the flags, prints a banner, runs the
 * scraper test, and releases the database connections afterwards.
 *
 * Recognised flags:
 *   --limit <n>        maximum number of churches to test (default 50)
 *   --country <value>  only test churches whose country equals this value
 *
 * @throws Error when `--limit` is present but is not followed by an integer.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const limitIndex = args.indexOf('--limit');
  const countryIndex = args.indexOf('--country');

  // Explicit radix; a missing value parses to NaN and is rejected below.
  const limit = limitIndex !== -1 ? Number.parseInt(args[limitIndex + 1] ?? '', 10) : 50;
  const country = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

  try {
    if (Number.isNaN(limit)) {
      throw new Error('Invalid --limit value: expected an integer');
    }

    console.log('============================================================');
    console.log('Website Scraper Testing');
    console.log('============================================================');
    console.log(`Limit: ${limit}`);
    console.log(`Country: ${country || 'All'}`);
    console.log('============================================================\n');

    await testScrapers(limit, country);
  } finally {
    // Release the connections even when the run fails, so they are not left open.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Log any failure and report it to the caller through a non-zero exit status.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user