chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
180
scripts/debug/test-french-broader.ts
Normal file
180
scripts/debug/test-french-broader.ts
Normal file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test more French churches and collect diagnostic data
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
// Load environment files in priority order. dotenv does not overwrite
// variables that are already set (its `override` option defaults to false),
// so values from `.env.local` win over the same keys in `.env`.
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// node-postgres connection pool, pointed at the database named by
// DATABASE_URL (read from the environment populated by dotenv above).
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
});

// Prisma client that issues its queries through the pg driver adapter
// wrapping that pool.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Outcome of scraping one church website, plus a few heuristics computed
 * from the page text that help explain why a scrape produced no schedules.
 */
interface DiagnosticInfo {
  /** Website URL that was scraped. */
  url: string;

  /** Name of the church the URL belongs to. */
  churchName: string;

  /** `success` flag reported by the scraper (false when the scrape threw). */
  success: boolean;

  /** Number of schedules the scraper returned (0 when the scrape threw). */
  schedulesFound: number;

  /** True when the page text contains at least one French weekday name. */
  hasFrenchDays: boolean;

  /** True when the page text contains at least one time-like pattern. */
  hasTimePatterns: boolean;

  /** Up to 10 distinct time-like strings found in the page text. */
  timePatternsSample: string[];

  /** First 500 characters of the tag-stripped, lowercased page text. */
  textSample: string;

  /** Error reported by the scraper, or the message of a thrown error. */
  error?: string;
}
|
||||
|
||||
/** French weekday names looked for in the (lowercased) page text. */
const FRENCH_DAYS = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'] as const;

/**
 * Clock times as written on French pages: "10h", "10h30", "10 h 30",
 * "10:30", "10.30". With ":" or "." the two minute digits are mandatory, so
 * a bare number followed by punctuation ("page 3.", "12:") is not treated
 * as a time.
 */
const TIME_PATTERN = /\b\d{1,2}(?:\s?h(?:\s?\d{2})?|[:.]\d{2})\b/g;

/** The text-derived subset of {@link DiagnosticInfo}. */
type PageTextDiagnostics = Pick<
  DiagnosticInfo,
  'hasFrenchDays' | 'hasTimePatterns' | 'timePatternsSample' | 'textSample'
>;

/**
 * Strips scripts, styles and tags from raw HTML and derives the text
 * heuristics stored in a diagnostic record.
 *
 * @param rawHtml - HTML returned by the scraper; may be missing or empty.
 * @returns Heuristic flags and samples; all "empty" when there is no HTML.
 */
function analyzePageText(rawHtml: string | null | undefined): PageTextDiagnostics {
  if (!rawHtml) {
    return { hasFrenchDays: false, hasTimePatterns: false, timePatternsSample: [], textSample: '' };
  }

  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();

  // Drop inner spaces before de-duplicating so "10 h 30" and "10h30" count once.
  const times = (text.match(TIME_PATTERN) ?? []).map(t => t.replace(/\s+/g, ''));

  return {
    hasFrenchDays: FRENCH_DAYS.some(day => text.includes(day)),
    hasTimePatterns: times.length > 0,
    timePatternsSample: [...new Set(times)].slice(0, 10),
    textSample: text.substring(0, 500),
  };
}

/** A run is a failure unless the scraper succeeded AND returned at least one schedule. */
function isFailedScrape(d: DiagnosticInfo): boolean {
  return !(d.success && d.schedulesFound > 0);
}

/**
 * Scrapes up to 20 French, OSM-sourced churches that have a website, prints a
 * per-church result with a hint about why a scrape failed, then prints a
 * summary that buckets the failures by whether the page text contained
 * French weekday names and/or time patterns.
 *
 * The scraper, the Prisma client and the pg pool are released in a `finally`
 * block, so they are closed on the early return and on unexpected errors too.
 *
 * @returns A promise that resolves once the report is printed and all
 *   resources are released; rejects if the query, scraper start-up or
 *   database shutdown fails.
 */
async function testFrenchBroader(): Promise<void> {
  let scraper: GenericScraper | undefined;

  try {
    console.log('Testing 20 French churches with diagnostics...\n');

    // Get more French churches
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 20,
      orderBy: { createdAt: 'asc' },
    });

    if (churches.length === 0) {
      console.log('No French churches found.');
      return;
    }

    console.log(`Found ${churches.length} French churches to test\n`);

    scraper = new GenericScraper();
    await scraper.init();
    scraper.setCountry('FR');

    const diagnostics: DiagnosticInfo[] = [];

    for (const [index, church] of churches.entries()) {
      const url = church.website;
      if (url == null) {
        // The query already excludes null websites; this guard only narrows the type.
        continue;
      }

      console.log(`[${index + 1}/${churches.length}] Testing: ${church.name} (${church.city || 'Unknown'})`);
      console.log(`URL: ${url}`);

      try {
        const result = await scraper.scrape(url);
        const pageInfo = analyzePageText(result.rawHtml);

        const diagnostic: DiagnosticInfo = {
          url,
          churchName: church.name,
          success: result.success,
          schedulesFound: result.schedules.length,
          ...pageInfo,
          error: result.error,
        };
        diagnostics.push(diagnostic);

        if (!isFailedScrape(diagnostic)) {
          console.log(`✅ SUCCESS - ${result.schedules.length} schedules`);
        } else {
          // A "successful" scrape with zero schedules carries no error text.
          console.log(`❌ FAILED - ${result.error ?? 'no schedules extracted'}`);

          const { hasFrenchDays, hasTimePatterns, timePatternsSample } = pageInfo;
          if (hasFrenchDays && !hasTimePatterns) {
            console.log(` 💡 Has French days but no times`);
          } else if (!hasFrenchDays && hasTimePatterns) {
            console.log(` 💡 Has times but no French days`);
          } else if (hasFrenchDays && hasTimePatterns) {
            console.log(` 💡 Has BOTH days and times - parsing issue!`);
            console.log(` Sample times: ${timePatternsSample.slice(0, 5).join(', ')}`);
          } else {
            console.log(` 💡 No mass schedule content found`);
          }
        }
        console.log('');
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        console.log(`❌ ERROR - ${message}\n`);
        diagnostics.push({
          url,
          churchName: church.name,
          success: false,
          schedulesFound: 0,
          hasFrenchDays: false,
          hasTimePatterns: false,
          timePatternsSample: [],
          textSample: '',
          error: message,
        });
      }
    }

    // Analysis — uses the same failure criterion as the per-church output, so
    // "succeeded but found nothing" runs are bucketed too.
    const failures = diagnostics.filter(isFailedScrape);
    const successCount = diagnostics.length - failures.length;

    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log('');

    const hasBoth = failures.filter(d => d.hasFrenchDays && d.hasTimePatterns);
    const hasDaysNoTimes = failures.filter(d => d.hasFrenchDays && !d.hasTimePatterns);
    const hasTimesNoDays = failures.filter(d => !d.hasFrenchDays && d.hasTimePatterns);
    const hasNeither = failures.filter(d => !d.hasFrenchDays && !d.hasTimePatterns);

    console.log('FAILURE ANALYSIS:');
    console.log(` Has days + times but failed: ${hasBoth.length} (PARSING BUG)`);
    console.log(` Has days but no times: ${hasDaysNoTimes.length}`);
    console.log(` Has times but no days: ${hasTimesNoDays.length}`);
    console.log(` Has neither: ${hasNeither.length} (no mass schedule on page)`);
    console.log('');

    if (hasBoth.length > 0) {
      console.log('⚠️ PARSING BUGS TO FIX (has both days and times but failed):');
      for (const d of hasBoth) {
        console.log(` ${d.churchName}`);
        console.log(` URL: ${d.url}`);
        console.log(` Sample times found: ${d.timePatternsSample.slice(0, 5).join(', ')}`);
        console.log(` Text sample: ${d.textSample.substring(0, 150)}...`);
        console.log('');
      }
    }
  } finally {
    if (scraper) {
      try {
        await scraper.close();
      } catch (closeErr: unknown) {
        // Report but keep going: the database handles below must still be released.
        console.error('Failed to close scraper:', closeErr);
      }
    }
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
|
||||
|
||||
// Script entry point. Log any unexpected failure and set a non-zero exit
// code so shells and CI can tell the run did not complete.
testFrenchBroader().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user