103 lines
3.0 KiB
TypeScript
103 lines
3.0 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
 * Identify which churches are flagged as "parsing bugs" in the top 5 test:
 * sites where the scraper reports failure although the returned HTML mentions
 * both a weekday name and a time.
 */
|
||
|
|
|
||
|
|
import { config } from 'dotenv';
|
||
|
|
// Load environment files in priority order. dotenv's default is to leave
// already-set variables untouched, so values from .env.local take precedence
// over those in .env.
for (const path of ['.env.local', '.env']) {
  config({ path });
}
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
// Database access: a node-postgres pool (connection string read from
// DATABASE_URL) wrapped in Prisma's pg driver adapter and handed to the client.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
/**
 * Countries included in the check, as `{ code, name }` pairs: `code` is the
 * two-letter value used to filter churches and to configure the scraper,
 * `name` is the label shown in the report. Order follows the map below.
 */
const COUNTRIES = Object.entries({
  FR: 'France',
  DE: 'Germany',
  ES: 'Spain',
  PL: 'Poland',
  BR: 'Brazil',
}).map(([code, name]) => ({ code, name }));
|
||
|
|
|
||
|
|
/** A church whose scrape failed even though its page appears to list a schedule. */
type ParsingBug = {
  country: string;
  church: string;
  url: string;
  hasDays: boolean;
  hasTimes: boolean;
};

/** Weekday names (including accent-less spellings) looked for in page text. */
const DAY_NAMES = [
  // English
  'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
  // French
  'dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi',
  // German
  'sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag',
  // Spanish (domingo/sábado are shared with Portuguese)
  'domingo', 'domingos', 'lunes', 'martes', 'miércoles', 'miercoles', 'jueves',
  'viernes', 'sábado', 'sabado', 'sábados', 'sabados',
  // Polish
  'niedziela', 'poniedziałek', 'poniedzialek', 'wtorek', 'środa', 'sroda',
  'czwartek', 'piątek', 'piatek', 'sobota',
  // Portuguese
  'segunda', 'segundas', 'terça', 'terca', 'terças', 'tercas', 'quarta',
  'quartas', 'quinta', 'quintas', 'sexta', 'sextas',
];

// `\b` only understands ASCII word characters, so a name starting with a
// non-ASCII letter (e.g. "środa") could never match after a space. Unicode-aware
// lookarounds give a whole-word match that works for every listed language.
const DAY_NAME_PATTERN = new RegExp(
  `(?<![\\p{L}\\p{N}_])(?:${DAY_NAMES.join('|')})(?![\\p{L}\\p{N}_])`,
  'iu',
);

// A time needs either minutes ("10:30", "18.00", "10h30", "10 h 30") or a unit
// ("10h", "10 uhr", "9am", "3 pm"). A bare number (year, phone number, price
// fragment) must not count, otherwise nearly every page "has times".
const TIME_OF_DAY_PATTERN =
  /(?<!\d)\d{1,2}(?:(?:[:.]|\s*h\s*)\d{2}(?!\d)|\s*(?:am|pm|h|uhr)(?![\p{L}\p{N}_]))/iu;

/**
 * Reduce an HTML document to lower-cased text with collapsed whitespace.
 * Script and style blocks are dropped; every removed element leaves a space
 * behind so that words on either side of it are not glued together.
 */
function htmlToSearchableText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/** True when the text contains one of the known weekday names as a whole word. */
function mentionsDayName(text: string): boolean {
  return DAY_NAME_PATTERN.test(text);
}

/** True when the text contains something shaped like a time of day. */
function mentionsTimeOfDay(text: string): boolean {
  return TIME_OF_DAY_PATTERN.test(text);
}

/**
 * For each configured country, scrape the first 10 OSM-sourced churches that
 * have a website (oldest first) and report those where the scraper failed
 * although the page text mentions both a weekday and a time of day, i.e. the
 * schedule is probably there and the parser missed it.
 *
 * Results are printed to stdout. A scrape that throws is reported as a warning
 * and skipped. The scraper and the database connections are released even
 * when the run aborts with an error.
 */
async function identifyBugs(): Promise<void> {
  console.log('Identifying "parsing bugs" from top 5 test...\n');

  const bugs: ParsingBug[] = [];

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    try {
      for (const country of COUNTRIES) {
        const churches = await prisma.church.findMany({
          where: {
            country: country.code,
            website: { not: null },
            source: 'osm',
          },
          take: 10,
          orderBy: { createdAt: 'asc' },
        });

        scraper.setCountry(country.code);

        for (const church of churches) {
          const url = church.website;
          // The query already filters out null websites; this guard narrows the type.
          if (!url) continue;

          try {
            const result = await scraper.scrape(url);

            // Only failed scrapes that still returned HTML are candidates.
            if (result.success || !result.rawHtml) continue;

            const text = htmlToSearchableText(result.rawHtml);
            const hasDays = mentionsDayName(text);
            const hasTimes = mentionsTimeOfDay(text);

            if (hasDays && hasTimes) {
              bugs.push({
                country: country.name,
                church: church.name,
                url,
                hasDays,
                hasTimes,
              });
            }
          } catch (err: unknown) {
            // Best effort: one unreachable or broken site must not stop the run,
            // but leave a trace so skipped sites are visible.
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`Skipping ${church.name} (${url}): ${reason}`);
          }
        }
      }
    } finally {
      await scraper.close();
    }

    console.log(`\n${'='.repeat(80)}`);
    console.log(`FOUND ${bugs.length} POTENTIAL PARSING BUGS\n`);

    bugs.forEach((bug, i) => {
      console.log(`${i + 1}. ${bug.church} (${bug.country})`);
      console.log(` URL: ${bug.url}`);
      console.log('');
    });
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||
|
|
|
||
|
|
// Run the script. On an unexpected failure, log it and signal the failure
// through a non-zero exit code so callers (CI, shell scripts) can detect it.
identifyBugs().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|