181 lines
5.8 KiB
TypeScript
181 lines
5.8 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
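/**
 * Diagnostic script: runs the generic scraper against up to 20 French church
 * websites and reports, for each failure, whether the page contained French
 * weekday names and/or time-like patterns.
 */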
|
||
|
|
|
||
|
|
import { config } from 'dotenv';
|
||
|
|
config({ path: '.env.local' });
|
||
|
|
config({ path: '.env' });
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
|
||
|
|
// Database wiring: a single pg connection pool, wrapped by the Prisma pg
// driver adapter, backs the Prisma client used by this script.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
/**
 * Per-church record of one scrape attempt plus the coarse text signals
 * gathered from the fetched page, used to explain why extraction failed.
 */
interface DiagnosticInfo {
  /** Website URL that was handed to the scraper. */
  url: string;

  /** Name of the church as stored in the database. */
  churchName: string;

  /** `success` flag reported by the scraper; `false` when the scrape threw. */
  success: boolean;

  /** Number of schedules the scraper returned (0 when the scrape threw). */
  schedulesFound: number;

  /** Whether the page text contains at least one French weekday name. */
  hasFrenchDays: boolean;

  /** Whether the page text contains at least one time-like pattern. */
  hasTimePatterns: boolean;

  /** Up to 10 distinct time-like matches found in the page text. */
  timePatternsSample: string[];

  /** Leading slice (at most 500 characters) of the tag-stripped, lowercased page text. */
  textSample: string;

  /** Error reported by the scraper, or the message of the thrown error, if any. */
  error?: string;
}
|
||
|
|
|
||
|
|
/** French weekday names looked for in page text (lowercase: the text is lowercased before matching). */
const FRENCH_DAY_NAMES = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];

/**
 * Loose matcher for time-like tokens such as "10h30", "10h", "10:30" or "10.30".
 * NOTE(review): the pattern is very permissive (it also matches e.g. "3." in a
 * numbered list), so a positive result is only a hint, not proof of a schedule.
 */
const TIME_PATTERN = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h)?/g;

/** Coarse text signals extracted from a fetched page. */
interface PageSignals {
  hasFrenchDays: boolean;
  hasTimePatterns: boolean;
  timePatternsSample: string[];
  textSample: string;
}

/**
 * Reduces raw HTML to lowercase plain text and reports whether it mentions
 * French weekdays and time-like patterns.
 *
 * @param rawHtml HTML returned by the scraper; empty/missing HTML yields "nothing found".
 * @returns The signals, with at most 10 distinct (whitespace-trimmed) time matches
 *          and the first 500 characters of the extracted text.
 */
function extractPageSignals(rawHtml: string | null | undefined): PageSignals {
  if (!rawHtml) {
    return { hasFrenchDays: false, hasTimePatterns: false, timePatternsSample: [], textSample: '' };
  }

  // Drop script/style bodies first so their contents are not mistaken for page text.
  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();

  const matches = text.match(TIME_PATTERN) ?? [];
  // The pattern can swallow trailing whitespace; trim before de-duplicating so
  // "10h30" and "10h30 " are not reported as two different samples.
  const distinctTimes = [...new Set(matches.map((match) => match.trim()))];

  return {
    hasFrenchDays: FRENCH_DAY_NAMES.some((day) => text.includes(day)),
    hasTimePatterns: matches.length > 0,
    timePatternsSample: distinctTimes.slice(0, 10),
    textSample: text.substring(0, 500),
  };
}

/** A scrape only counts as successful when it both succeeded and produced schedules. */
function hasUsableSchedules(success: boolean, schedulesFound: number): boolean {
  return success && schedulesFound > 0;
}

/** Maps the two page signals to the hint line printed under a failed scrape. */
function failureHint(hasFrenchDays: boolean, hasTimePatterns: boolean): string {
  if (hasFrenchDays && hasTimePatterns) {
    return ' 💡 Has BOTH days and times - parsing issue!';
  }
  if (hasFrenchDays) {
    return ' 💡 Has French days but no times';
  }
  if (hasTimePatterns) {
    return ' 💡 Has times but no French days';
  }
  return ' 💡 No mass schedule content found';
}

/**
 * Scrapes up to 20 French churches (OSM-sourced, with a website, oldest first),
 * logs a per-church verdict, and prints a summary that classifies every failure
 * by which signals (weekday names / time patterns) were present on the page.
 *
 * The scraper, the Prisma client and the pg pool are always released, even when
 * the query, scraper start-up or reporting throws.
 */
async function testFrenchBroader(): Promise<void> {
  console.log('Testing 20 French churches with diagnostics...\n');

  // Only set once init() has succeeded, so cleanup never closes an unstarted scraper.
  let scraper: GenericScraper | undefined;

  try {
    // Get more French churches
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 20,
      orderBy: { createdAt: 'asc' },
    });

    if (churches.length === 0) {
      console.log('No French churches found.');
      return;
    }

    console.log(`Found ${churches.length} French churches to test\n`);

    const startedScraper = new GenericScraper();
    await startedScraper.init();
    scraper = startedScraper;
    startedScraper.setCountry('FR');

    let successCount = 0;
    const diagnostics: DiagnosticInfo[] = [];

    for (const [index, church] of churches.entries()) {
      console.log(`[${index + 1}/${churches.length}] Testing: ${church.name} (${church.city || 'Unknown'})`);
      console.log(`URL: ${church.website}`);

      const url = church.website;
      if (url == null) {
        // Should be unreachable given the query filter; guards the type instead of asserting.
        console.log('❌ SKIPPED - no website URL\n');
        continue;
      }

      try {
        const result = await startedScraper.scrape(url);
        const signals = extractPageSignals(result.rawHtml);

        diagnostics.push({
          url,
          churchName: church.name,
          success: result.success,
          schedulesFound: result.schedules.length,
          ...signals,
          error: result.error,
        });

        if (hasUsableSchedules(result.success, result.schedules.length)) {
          successCount++;
          console.log(`✅ SUCCESS - ${result.schedules.length} schedules`);
        } else {
          // A "successful" scrape with zero schedules carries no error string.
          console.log(`❌ FAILED - ${result.error ?? 'no schedules found'}`);
          console.log(failureHint(signals.hasFrenchDays, signals.hasTimePatterns));
          if (signals.hasFrenchDays && signals.hasTimePatterns) {
            console.log(` Sample times: ${signals.timePatternsSample.slice(0, 5).join(', ')}`);
          }
        }
        console.log('');
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        console.log(`❌ ERROR - ${message}\n`);
        diagnostics.push({
          url,
          churchName: church.name,
          success: false,
          schedulesFound: 0,
          hasFrenchDays: false,
          hasTimePatterns: false,
          timePatternsSample: [],
          textSample: '',
          error: message,
        });
      }
    }

    // Analysis
    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log('');

    // Use the same definition of failure as the per-church verdict above, so a
    // run that "succeeded" with zero schedules is still classified here.
    const failures = diagnostics.filter((d) => !hasUsableSchedules(d.success, d.schedulesFound));
    const hasBoth = failures.filter((d) => d.hasFrenchDays && d.hasTimePatterns);
    const hasDaysNoTimes = failures.filter((d) => d.hasFrenchDays && !d.hasTimePatterns);
    const hasTimesNoDays = failures.filter((d) => !d.hasFrenchDays && d.hasTimePatterns);
    const hasNeither = failures.filter((d) => !d.hasFrenchDays && !d.hasTimePatterns);

    console.log('FAILURE ANALYSIS:');
    console.log(` Has days + times but failed: ${hasBoth.length} (PARSING BUG)`);
    console.log(` Has days but no times: ${hasDaysNoTimes.length}`);
    console.log(` Has times but no days: ${hasTimesNoDays.length}`);
    console.log(` Has neither: ${hasNeither.length} (no mass schedule on page)`);
    console.log('');

    if (hasBoth.length > 0) {
      console.log('⚠️ PARSING BUGS TO FIX (has both days and times but failed):');
      for (const d of hasBoth) {
        console.log(` ${d.churchName}`);
        console.log(` URL: ${d.url}`);
        console.log(` Sample times found: ${d.timePatternsSample.slice(0, 5).join(', ')}`);
        console.log(` Text sample: ${d.textSample.substring(0, 150)}...`);
        console.log('');
      }
    }
  } finally {
    // Nested finally blocks: every resource is released even if an earlier
    // release throws, and the first error still propagates to the caller.
    try {
      await scraper?.close();
    } finally {
      try {
        await prisma.$disconnect();
      } finally {
        await pool.end();
      }
    }
  }
}
|
||
|
|
|
||
|
|
// Entry point: log any unexpected failure and mark the process as failed so
// shells and CI can detect it (the original logged the error but exited 0).
testFrenchBroader().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|