Files
ScraperControl/scripts/debug/test-french-broader.ts

181 lines
5.8 KiB
TypeScript
Raw Permalink Normal View History

#!/usr/bin/env tsx
/**
* Test more French churches and collect diagnostic data
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Postgres connection pool (node-postgres) configured from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
// Prisma driver adapter that runs Prisma queries over the pg pool above.
const adapter = new PrismaPg(pool);
// Prisma client used by this script; it is constructed with the pg adapter.
const prisma = new PrismaClient({ adapter });
/**
 * Per-church record collected by the diagnostic run: the scrape outcome plus
 * coarse signals about what the fetched page contains.
 */
interface DiagnosticInfo {
/** Website URL of the church that was passed to the scraper. */
url: string;
/** Name of the church as stored in the database. */
churchName: string;
/** The scraper's own success flag; false when the scrape threw. */
success: boolean;
/** Number of schedules returned by the scraper (0 when the scrape threw). */
schedulesFound: number;
/** True when the lowercased, tag-stripped page text contains a French weekday name. */
hasFrenchDays: boolean;
/** True when the page text contains at least one time-like pattern. */
hasTimePatterns: boolean;
/** Up to 10 distinct time-like strings found in the page text. */
timePatternsSample: string[];
/** First 500 characters of the tag-stripped, lowercased page text. */
textSample: string;
/** Error reported by the scraper, or the message of the exception that was thrown. */
error?: string;
}
/** French weekday names, lowercase, matched against lowercased page text. */
const FRENCH_DAYS = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'] as const;

/**
 * Clock times as commonly written on French pages: `10h`, `10h30`, `10h 30`,
 * `18:30`, `18.30`.
 *
 * Minutes are mandatory after `:` or `.` so that list numbering ("1."), years
 * at the end of a sentence ("2020.") and dotted phone numbers
 * ("01.23.45.67.89") are not reported as times. The lookarounds reject
 * candidates embedded in a longer digit/separator run, and a bare `h` that is
 * really the first letter of a word.
 */
const TIME_PATTERN =
  /(?<![\d.:])(?:[01]?\d|2[0-3])(?:h(?:\s?[0-5]\d)?(?![a-z\d])|[:.][0-5]\d(?![.:]?\d))/g;

/** The subset of DiagnosticInfo that is derived purely from the page HTML. */
type ContentSignals = Pick<
  DiagnosticInfo,
  'hasFrenchDays' | 'hasTimePatterns' | 'timePatternsSample' | 'textSample'
>;

/** Strips scripts, styles and tags from the HTML and reports day/time signals found in the text. */
function analyzePageContent(rawHtml: string | null | undefined): ContentSignals {
  if (!rawHtml) {
    return { hasFrenchDays: false, hasTimePatterns: false, timePatternsSample: [], textSample: '' };
  }

  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();

  // String.prototype.match resets lastIndex for global regexes, so sharing TIME_PATTERN is safe.
  const times = text.match(TIME_PATTERN) ?? [];

  return {
    hasFrenchDays: FRENCH_DAYS.some(day => text.includes(day)),
    hasTimePatterns: times.length > 0,
    timePatternsSample: [...new Set(times)].slice(0, 10),
    textSample: text.substring(0, 500),
  };
}

/** A scrape counts as failed unless the scraper reported success AND returned at least one schedule. */
function isFailedScrape(d: Pick<DiagnosticInfo, 'success' | 'schedulesFound'>): boolean {
  return !(d.success && d.schedulesFound > 0);
}

/** Prints the one-line hint explaining which content signals a failed page had. */
function logFailureHint(signals: ContentSignals): void {
  const { hasFrenchDays, hasTimePatterns, timePatternsSample } = signals;
  if (hasFrenchDays && hasTimePatterns) {
    console.log(` 💡 Has BOTH days and times - parsing issue!`);
    console.log(` Sample times: ${timePatternsSample.slice(0, 5).join(', ')}`);
  } else if (hasFrenchDays) {
    console.log(` 💡 Has French days but no times`);
  } else if (hasTimePatterns) {
    console.log(` 💡 Has times but no French days`);
  } else {
    console.log(` 💡 No mass schedule content found`);
  }
}

/** Prints the success rate and the breakdown of failed scrapes by content signal. */
function printAnalysis(diagnostics: readonly DiagnosticInfo[], successCount: number, total: number): void {
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
  console.log(`\nRESULTS: ${successCount}/${total} successful (${((successCount / total) * 100).toFixed(0)}%)`);
  console.log('');

  // Use the same failure criterion as the per-church counters so the buckets
  // below account for every failed church (including "success" with 0 schedules).
  const failed = diagnostics.filter(isFailedScrape);
  const hasBoth = failed.filter(d => d.hasFrenchDays && d.hasTimePatterns);
  const hasDaysNoTimes = failed.filter(d => d.hasFrenchDays && !d.hasTimePatterns);
  const hasTimesNoDays = failed.filter(d => !d.hasFrenchDays && d.hasTimePatterns);
  const hasNeither = failed.filter(d => !d.hasFrenchDays && !d.hasTimePatterns);

  console.log('FAILURE ANALYSIS:');
  console.log(` Has days + times but failed: ${hasBoth.length} (PARSING BUG)`);
  console.log(` Has days but no times: ${hasDaysNoTimes.length}`);
  console.log(` Has times but no days: ${hasTimesNoDays.length}`);
  console.log(` Has neither: ${hasNeither.length} (no mass schedule on page)`);
  console.log('');

  if (hasBoth.length > 0) {
    console.log('⚠️ PARSING BUGS TO FIX (has both days and times but failed):');
    for (const d of hasBoth) {
      console.log(` ${d.churchName}`);
      console.log(` URL: ${d.url}`);
      console.log(` Sample times found: ${d.timePatternsSample.slice(0, 5).join(', ')}`);
      console.log(` Text sample: ${d.textSample.substring(0, 150)}...`);
      console.log('');
    }
  }
}

/**
 * Scrapes up to 20 French OSM-sourced churches that have a website, records
 * per-page diagnostics (French weekday names / clock times present in the
 * text), and prints a summary of why scrapes failed.
 *
 * The scraper, the Prisma client and the pg pool are always released, even
 * when the query, the scrape loop or the report throws.
 */
async function testFrenchBroader(): Promise<void> {
  try {
    console.log('Testing 20 French churches with diagnostics...\n');

    // Get more French churches
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 20,
      orderBy: { createdAt: 'asc' },
    });

    if (churches.length === 0) {
      console.log('No French churches found.');
      return;
    }
    console.log(`Found ${churches.length} French churches to test\n`);

    const scraper = new GenericScraper();
    await scraper.init();
    scraper.setCountry('FR');

    let successCount = 0;
    const diagnostics: DiagnosticInfo[] = [];

    try {
      for (const [i, church] of churches.entries()) {
        // Non-null: the query above only selects rows whose website is not null.
        const url = church.website!;
        console.log(`[${i + 1}/${churches.length}] Testing: ${church.name} (${church.city || 'Unknown'})`);
        console.log(`URL: ${url}`);

        try {
          const result = await scraper.scrape(url);
          const signals = analyzePageContent(result.rawHtml);
          const diagnostic: DiagnosticInfo = {
            url,
            churchName: church.name,
            success: result.success,
            schedulesFound: result.schedules.length,
            ...signals,
            error: result.error,
          };
          diagnostics.push(diagnostic);

          if (isFailedScrape(diagnostic)) {
            // The scraper may report success with zero schedules and no error text.
            console.log(`❌ FAILED - ${result.error ?? 'no schedules found'}`);
            logFailureHint(signals);
          } else {
            successCount++;
            console.log(`✅ SUCCESS - ${result.schedules.length} schedules`);
          }
          console.log('');
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          console.log(`❌ ERROR - ${message}\n`);
          diagnostics.push({
            url,
            churchName: church.name,
            success: false,
            schedulesFound: 0,
            hasFrenchDays: false,
            hasTimePatterns: false,
            timePatternsSample: [],
            textSample: '',
            error: message,
          });
        }
      }
    } finally {
      await scraper.close();
    }

    printAnalysis(diagnostics, successCount, churches.length);
  } finally {
    // Release the pool even if disconnecting Prisma fails.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
// Run the diagnostic. On an unhandled failure, log it and mark the process as
// failed so shells and CI can detect it (logging alone leaves exit code 0).
testFrenchBroader().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});