#!/usr/bin/env tsx
/**
 * Identify which churches are flagged as "parsing bugs" in top 5 test.
 *
 * For each configured country the script loads a small sample of OSM-sourced
 * churches that have a website, scrapes each site, and reports the ones where
 * the scraper did not succeed even though the page text mentions both a
 * weekday name and a clock time (i.e. a schedule is probably present but was
 * not parsed).
 */

import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

const COUNTRIES = [
  { code: 'FR', name: 'France' },
  { code: 'DE', name: 'Germany' },
  { code: 'ES', name: 'Spain' },
  { code: 'PL', name: 'Poland' },
  { code: 'BR', name: 'Brazil' },
];

/** One church whose page looks like it has a schedule the scraper missed. */
interface ParsingBugCandidate {
  country: string;
  church: string;
  url: string;
  hasDays: boolean;
  hasTimes: boolean;
}

// Weekday names in English, French, German, Spanish, Polish and Portuguese
// (with and without diacritics). The `u` flag plus explicit letter/digit
// lookarounds replaces `\b`, which is ASCII-only in JavaScript and therefore
// never matches in front of words such as "środa".
const DAY_NAME_PATTERN =
  /(?<![\p{L}\p{N}_])(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|domingos|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|sábados|sabados|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|segundas|terça|terca|terças|tercas|quarta|quartas|quinta|quintas|sexta|sextas)(?![\p{L}\p{N}_])/iu;

// A clock time needs either a separator followed by two-digit minutes
// ("10:30", "10.30", "10h30", "10 h 30") or a unit suffix ("10h", "10 am",
// "10 uhr"). A bare number is deliberately NOT treated as a time.
const TIME_PATTERN =
  /\b\d{1,2}(?:\s?[h:.]\s?\d{2}\b|\s?(?:h|am|pm|uhr)\b)/i;

/** Reduce raw HTML to lower-cased, whitespace-normalised visible text. */
function htmlToText(rawHtml: string): string {
  return rawHtml
    // Drop whole <script>/<style> elements including their contents.
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, '')
    // Replace every remaining tag with a space so adjacent words stay apart.
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/** True when the text contains a weekday name in one of the supported languages. */
function mentionsDayName(text: string): boolean {
  return DAY_NAME_PATTERN.test(text);
}

/** True when the text contains something shaped like a clock time. */
function mentionsTime(text: string): boolean {
  return TIME_PATTERN.test(text);
}

/**
 * Scrape a sample of churches per country and print those whose page text
 * contains both a weekday name and a time although the scrape was not
 * successful.
 *
 * The scraper, the Prisma client and the pg pool are always released, even
 * when a database query or the scraper itself throws.
 */
async function identifyBugs(): Promise<void> {
  console.log('Identifying "parsing bugs" from top 5 test...\n');

  const bugs: ParsingBugCandidate[] = [];

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    try {
      for (const country of COUNTRIES) {
        const churches = await prisma.church.findMany({
          where: {
            country: country.code,
            website: { not: null },
            source: 'osm',
          },
          take: 10,
          orderBy: { createdAt: 'asc' },
        });

        scraper.setCountry(country.code);

        for (const church of churches) {
          const url = church.website;
          // The query already filters out null websites; this guard narrows
          // the type without a non-null assertion.
          if (!url) continue;

          try {
            const result = await scraper.scrape(url);
            if (result.success || !result.rawHtml) continue;

            const text = htmlToText(result.rawHtml);

            // Check for day names and times
            const hasDays = mentionsDayName(text);
            const hasTimes = mentionsTime(text);

            if (hasDays && hasTimes) {
              bugs.push({
                country: country.name,
                church: church.name,
                url,
                hasDays,
                hasTimes,
              });
            }
          } catch (err: unknown) {
            // Best effort: one unreachable site must not abort the run, but
            // say so instead of skipping it silently.
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`Skipping ${url}: ${reason}`);
          }
        }
      }
    } finally {
      await scraper.close();
    }

    console.log(`\n${'='.repeat(80)}`);
    console.log(`FOUND ${bugs.length} POTENTIAL PARSING BUGS\n`);

    bugs.forEach((bug, i) => {
      console.log(`${i + 1}. ${bug.church} (${bug.country})`);
      console.log(` URL: ${bug.url}`);
      console.log('');
    });
  } finally {
    // Release database resources even on failure so the process can exit.
    await prisma.$disconnect();
    await pool.end();
  }
}

identifyBugs().catch((err: unknown) => {
  console.error(err);
  // Signal failure to the calling shell / CI instead of exiting with 0.
  process.exitCode = 1;
});