Files
ScraperControl/scripts/debug/identify-top5-bugs.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

103 lines
3.0 KiB
TypeScript

#!/usr/bin/env tsx
/**
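* Identify which churches are flagged as "parsing bugs" in the top 5 test:
* sites where the scrape did not succeed although the page text contains a
* weekday name and a time-like token.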
*/
import { config } from 'dotenv';
// Load the environment files in order: `.env.local` first, then `.env`.
// NOTE(review): dotenv does not override already-set variables by default, so
// values from `.env.local` presumably win over `.env` — confirm against the
// installed dotenv version.
for (const path of ['.env.local', '.env']) {
  config({ path });
}
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Database wiring: a pg connection pool built from DATABASE_URL, wrapped in the
// Prisma pg driver adapter, which is then handed to the Prisma client. The pool
// keeps its own binding because the script ends it explicitly when it is done.
const { DATABASE_URL: connectionString } = process.env;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// Countries sampled by the script, as (country code, display name) pairs.
const COUNTRY_PAIRS: ReadonlyArray<readonly [string, string]> = [
  ['FR', 'France'],
  ['DE', 'Germany'],
  ['ES', 'Spain'],
  ['PL', 'Poland'],
  ['BR', 'Brazil'],
];

/** Countries to sample; `code` is used for the DB filter and the scraper, `name` for the report. */
const COUNTRIES = COUNTRY_PAIRS.map(([code, name]) => ({ code, name }));
/** A church whose scrape did not succeed although its page text looks like it lists a schedule. */
interface ParsingBugCandidate {
  country: string;
  church: string;
  url: string;
  hasDays: boolean;
  hasTimes: boolean;
}

/**
 * Weekday names in English, French, German, Spanish, Polish and Portuguese
 * (with accent-free spellings for several of them).
 *
 * NOTE(review): `\b` is ASCII-based in JavaScript, so an alternative that
 * starts with a non-ASCII letter (`środa`) only matches when it directly
 * follows an ASCII word character. The pattern is kept unchanged so results
 * stay comparable with earlier runs — presumably the top 5 test uses the same
 * heuristic; confirm before tightening it.
 */
const DAY_NAME_PATTERN =
  /\b(sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|domingos|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|sábados|sabados|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|segundas|terça|terca|terças|tercas|quarta|quartas|quinta|quintas|sexta|sextas)\b/i;

/**
 * Time-like token such as `10:30`, `18h`, `9 am` or `19 uhr`.
 *
 * NOTE(review): everything after the leading one or two digits is optional, so
 * this matches any digit in the page text. Kept unchanged for the same
 * comparability reason as the day pattern; confirm before tightening it.
 */
const TIME_PATTERN = /\d{1,2}[h:\.]?\s*\d{0,2}\s*(am|pm|h|uhr)?/i;

/** Reduces raw HTML to lower-cased text: drops script/style elements, strips tags, collapses whitespace. */
function htmlToPlainText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * For each country in COUNTRIES, scrapes the websites of the 10 oldest
 * OSM-sourced churches that have a website, and reports every church whose
 * scrape did not succeed even though the returned HTML contains both a weekday
 * name and a time-like token.
 *
 * The scraper, the Prisma client and the pg pool are released even when a step
 * throws, so a failure cannot leave handles open. A failure on one site is
 * logged to stderr and does not abort the run.
 *
 * @returns Resolves once the report is printed and all resources are released.
 * @throws Rethrows errors from scraper start-up, the database query or cleanup.
 */
async function identifyBugs(): Promise<void> {
  console.log('Identifying "parsing bugs" from top 5 test...\n');
  const bugs: ParsingBugCandidate[] = [];

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    try {
      for (const country of COUNTRIES) {
        const churches = await prisma.church.findMany({
          where: {
            country: country.code,
            website: { not: null },
            source: 'osm',
          },
          take: 10,
          orderBy: { createdAt: 'asc' },
        });
        scraper.setCountry(country.code);

        for (const church of churches) {
          // The query already excludes null websites; this guard only replaces
          // the non-null assertions.
          const url = church.website;
          if (url == null) continue;

          try {
            const result = await scraper.scrape(url);
            // Only failed scrapes that still returned HTML are candidates.
            if (result.success || !result.rawHtml) continue;

            const text = htmlToPlainText(result.rawHtml);
            const hasDays = DAY_NAME_PATTERN.test(text);
            const hasTimes = TIME_PATTERN.test(text);
            if (hasDays && hasTimes) {
              bugs.push({
                country: country.name,
                church: church.name,
                url,
                hasDays,
                hasTimes,
              });
            }
          } catch (err: unknown) {
            // Best effort: one unreachable or broken site must not stop the
            // run, but the failure is made visible instead of being dropped.
            const reason = err instanceof Error ? err.message : String(err);
            console.error(`Skipping ${church.name} (${url}): ${reason}`);
          }
        }
      }
    } finally {
      await scraper.close();
    }

    console.log(`\n${'='.repeat(80)}`);
    console.log(`FOUND ${bugs.length} POTENTIAL PARSING BUGS\n`);
    bugs.forEach((bug, i) => {
      console.log(`${i + 1}. ${bug.church} (${bug.country})`);
      console.log(` URL: ${bug.url}`);
      console.log('');
    });
  } finally {
    // Always release database resources; the pool is ended even if the Prisma
    // disconnect itself fails.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
// Run the script. On failure, print the error and mark the process as failed so
// that shells and CI see a non-zero exit status instead of a silent success.
identifyBugs().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});