chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (the new source of truth) and restore the local-only files that aren't tracked in Gitea: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, and miserend importers, the ClaimRequest model, forward geocoding, and a heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
102
scripts/debug/identify-top5-bugs.ts
Normal file
102
scripts/debug/identify-top5-bugs.ts
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Identify which churches are flagged as "parsing bugs" in top 5 test
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Database access for this script: a pg connection pool built from
// DATABASE_URL, wrapped in the Prisma pg driver adapter, which is then handed
// to the Prisma client. Both `prisma` and `pool` are released by the script
// once the report has been printed.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Countries sampled by the script, in the order they are processed.
 * `code` is the value matched against the church `country` column and passed
 * to the scraper; `name` is only used in the printed report.
 */
const COUNTRIES = [
  { code: 'FR', name: 'France' },
  { code: 'DE', name: 'Germany' },
  { code: 'ES', name: 'Spain' },
  { code: 'PL', name: 'Poland' },
  { code: 'BR', name: 'Brazil' },
] as const;
|
||||
|
||||
/** A church whose page mentions days and times although the scraper reported failure. */
type SuspectedParsingBug = {
  country: string;
  church: string;
  url: string;
  hasDays: boolean;
  hasTimes: boolean;
};

/**
 * Weekday names (English, French, German, Spanish, Polish, Portuguese, with
 * and without diacritics).
 *
 * Unicode-aware lookarounds are used instead of `\b`: `\b` only knows ASCII
 * word characters, so a name that starts with a non-ASCII letter (e.g.
 * "środa") could never match after a space.
 */
const DAY_NAME_PATTERN =
  /(?<![\p{L}\p{N}_])(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|domingos|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|sábados|sabados|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|segundas|terça|terca|terças|tercas|quarta|quartas|quinta|quintas|sexta|sextas)(?![\p{L}\p{N}_])/iu;

/**
 * Clock times: an hour 0-23 followed either by a separator and two-digit
 * minutes ("10:30", "10.30", "9h30") or by a unit marker ("9h", "18 uhr",
 * "9am", "7 pm").
 *
 * A bare number is deliberately NOT a time; accepting one would make the
 * check true for virtually every page.
 */
const TIME_PATTERN =
  /(?<!\d)(?:[01]?\d|2[0-3])(?:\s?[:.h]\s?[0-5]\d(?!\d)|\s?(?:h|uhr|am|pm)(?![\p{L}\p{N}_]))/iu;

/** Strips scripts, styles and tags from HTML and returns lower-cased, whitespace-collapsed text. */
function htmlToSearchableText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/** Prints the numbered list of suspected parsing bugs. */
function printBugReport(bugs: readonly SuspectedParsingBug[]): void {
  console.log(`\n${'='.repeat(80)}`);
  console.log(`FOUND ${bugs.length} POTENTIAL PARSING BUGS\n`);

  bugs.forEach((bug, i) => {
    console.log(`${i + 1}. ${bug.church} (${bug.country})`);
    console.log(` URL: ${bug.url}`);
    console.log('');
  });
}

/**
 * Identifies churches that are likely "parsing bugs" in the top-5 test.
 *
 * For each entry in COUNTRIES the first 10 OSM-sourced churches with a website
 * (oldest first) are scraped. A church is reported when the scraper did not
 * succeed but returned raw HTML whose text contains both a weekday name and a
 * clock time, i.e. schedule data seems to be present but was not parsed.
 *
 * A failure while scraping one church is logged to stderr and that church is
 * skipped. The scraper, the Prisma client and the pg pool are always released,
 * even when the run aborts with an error.
 *
 * @returns A promise that resolves once the report is printed and all
 *   resources have been released.
 */
async function identifyBugs(): Promise<void> {
  console.log('Identifying "parsing bugs" from top 5 test...\n');

  const scraper = new GenericScraper();
  const bugs: SuspectedParsingBug[] = [];

  try {
    await scraper.init();

    try {
      for (const country of COUNTRIES) {
        const churches = await prisma.church.findMany({
          where: {
            country: country.code,
            website: { not: null },
            source: 'osm',
          },
          take: 10,
          orderBy: { createdAt: 'asc' },
        });

        scraper.setCountry(country.code);

        for (const church of churches) {
          // The query already excludes null websites; this guard narrows the
          // type without a non-null assertion.
          const url = church.website;
          if (!url) continue;

          try {
            const result = await scraper.scrape(url);

            // Only failed scrapes that still delivered HTML are candidates.
            if (result.success || !result.rawHtml) continue;

            const text = htmlToSearchableText(result.rawHtml);
            const hasDays = DAY_NAME_PATTERN.test(text);
            const hasTimes = TIME_PATTERN.test(text);

            if (hasDays && hasTimes) {
              bugs.push({
                country: country.name,
                church: church.name,
                url,
                hasDays,
                hasTimes,
              });
            }
          } catch (err: unknown) {
            // Best effort: one unreachable or broken site must not abort the
            // run, but it should be visible why the church was skipped.
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`Skipping ${church.name} (${url}): ${reason}`);
          }
        }
      }
    } finally {
      // Only reached after a successful init(), so there is something to close.
      await scraper.close();
    }

    printBugReport(bugs);
  } finally {
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
|
||||
|
||||
// Entry point: run the check and make a failure visible to the caller.
// Logging alone would leave the exit status at 0, so shells and CI would
// treat a crashed run as a success.
identifyBugs().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user