chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Albert
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions

View File

@@ -0,0 +1,150 @@
#!/usr/bin/env tsx
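/**
 * Debug script for the 5 parsing bugs identified in the top-5 test.
 *
 * For each affected church it looks up the database record, scrapes the church
 * website with GenericScraper, and prints text-level diagnostics to the console.
 */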
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// The churches with parsing bugs, kept as compact [name, country, searchTerm] rows.
const BUG_CHURCH_ROWS: ReadonlyArray<readonly [string, string, string]> = [
  ['St. Marien', 'DE', 'St. Marien'],
  ['Santuario de Manalagua', 'ES', 'Santuario de Manalagua'],
  ['Kościół pw. Najświętszego Serca', 'PL', 'Najświętszego Serca Pana Jez'],
  ['Paróquia de Nossa Senhora do Desterro', 'BR', 'Nossa Senhora do Desterro'],
  ['Paróquia da Paz', 'BR', 'Paróquia da Paz'],
];

/**
 * One entry per church with a parsing bug: a display name, the country code used
 * for the database filter and the scraper, and the substring matched against the
 * stored church name.
 */
const BUG_CHURCHES = BUG_CHURCH_ROWS.map(([name, country, searchTerm]) => ({
  name,
  country,
  searchTerm,
}));
/** Weekday names per country code (accented and unaccented spellings), matched against lower-cased page text. */
const DAY_PATTERNS: Record<string, string[]> = {
  DE: ['sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag'],
  ES: ['domingo', 'lunes', 'martes', 'miércoles', 'miercoles', 'jueves', 'viernes', 'sábado', 'sabado'],
  PL: ['niedziela', 'poniedziałek', 'poniedzialek', 'wtorek', 'środa', 'sroda', 'czwartek', 'piątek', 'piatek', 'sobota'],
  BR: ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'],
};

/** Mass-schedule keywords per country code, matched against lower-cased page text. */
const MASS_KEYWORDS: Record<string, string[]> = {
  DE: ['gottesdienst', 'messe', 'heilige messe', 'messzeiten'],
  ES: ['misa', 'horario', 'eucaristía', 'eucaristia'],
  PL: ['msza', 'msze', 'nabożeństwo', 'nabozenstwo'],
  BR: ['missa', 'horário', 'horario', 'eucaristia'],
};

/**
 * Loose matcher for time-like tokens such as "10:30", "18h" or "9.00 uhr".
 * It is deliberately permissive and also hits numbered-list items like "1. ".
 */
const TIME_REGEX = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h|uhr)?/gi;

/** Language names whose repeated appearance hints at a multilingual page. */
const LANGUAGE_NAME_REGEX = /english|español|espanol|portuguese|português|portugues|deutsch|polski/gi;

/** Drops script/style elements and all tags, collapses whitespace, and lower-cases the result. */
function htmlToSearchText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/** Returns the terms that occur in `text` as substrings, preserving the order of `terms`. */
function findTerms(text: string, terms: readonly string[]): string[] {
  return terms.filter((term) => text.includes(term));
}

/** Returns the smallest index at which any of `terms` occurs in `text`, or -1 when none occurs. */
function earliestIndexOf(text: string, terms: readonly string[]): number {
  let earliest = -1;
  for (const term of terms) {
    const index = text.indexOf(term);
    if (index !== -1 && (earliest === -1 || index < earliest)) {
      earliest = index;
    }
  }
  return earliest;
}

/** Prints the text sample and the day/time/keyword/layout heuristics for one scraped page. */
function printPageDiagnostics(rawHtml: string, country: string): void {
  const text = htmlToSearchText(rawHtml);

  console.log('\n--- Text Sample (first 1000 chars) ---');
  console.log(text.substring(0, 1000));

  // Check for day names
  console.log('\n--- Day Names Found ---');
  const foundDays = findTerms(text, DAY_PATTERNS[country] ?? []);
  console.log(`Found: ${foundDays.join(', ') || 'none'}`);

  // Check for time patterns
  console.log('\n--- Time Patterns Found ---');
  const times = text.match(TIME_REGEX);
  if (times) {
    const uniqueTimes = [...new Set(times)].slice(0, 20);
    console.log(`Found ${times.length} time patterns (showing first 20 unique):`);
    console.log(uniqueTimes.join(', '));
  } else {
    console.log('No time patterns found');
  }

  // Look for specific mass schedule keywords
  console.log('\n--- Mass Schedule Keywords ---');
  const foundKeywords = findTerms(text, MASS_KEYWORDS[country] ?? []);
  console.log(`Found: ${foundKeywords.join(', ') || 'none'}`);

  // Look for specific problematic patterns
  console.log('\n--- Looking for edge cases ---');

  // Times count as "before days" only when both kinds of token exist and the first
  // time token precedes the earliest day name anywhere in the text. Comparing
  // against a missing token (index -1) must never report a hit.
  const firstDayIndex = earliestIndexOf(text, foundDays);
  const firstTime = times?.[0];
  const firstTimeIndex = firstTime === undefined ? -1 : text.indexOf(firstTime);
  const hasTimeBeforeDays =
    firstDayIndex !== -1 && firstTimeIndex !== -1 && firstTimeIndex < firstDayIndex;
  console.log(`Times come before days: ${hasTimeBeforeDays ? 'YES (potential issue)' : 'no'}`);

  // Table attributes live inside tags, which htmlToSearchText removes, so they have
  // to be looked up in the raw markup. Pipe-separated pseudo-tables are plain text.
  const markup = rawHtml.toLowerCase();
  const pipeSeparatorCount = text.match(/\s+\|\s+/g)?.length ?? 0;
  const hasTables =
    markup.includes('colspan') || markup.includes('rowspan') || pipeSeparatorCount > 5;
  console.log(`Likely table format: ${hasTables ? 'YES (may need special handling)' : 'no'}`);

  // Check for multiple languages on same page
  const languageMentions = text.match(LANGUAGE_NAME_REGEX)?.length ?? 0;
  const hasMultiLang = languageMentions > 1;
  console.log(`Multiple languages: ${hasMultiLang ? 'YES (may confuse parser)' : 'no'}`);
}

/**
 * Runs the debug session over every entry of BUG_CHURCHES.
 *
 * For each entry the first database church matching the country and name fragment
 * (and having a website) is scraped, and the scrape result plus page heuristics are
 * printed to the console. A scrape failure is reported and the loop moves on to the
 * next church; a database lookup failure aborts the run.
 *
 * The scraper is closed once it has been initialised, and the Prisma client and the
 * pg pool are released on every exit path, including failures.
 *
 * @returns A promise that settles once all churches are processed and resources are released.
 */
async function debugBugs(): Promise<void> {
  console.log('Debugging parsing bugs...\n');
  try {
    const scraper = new GenericScraper();
    await scraper.init();
    try {
      for (const bug of BUG_CHURCHES) {
        console.log('═'.repeat(80));
        console.log(`BUG: ${bug.name} (${bug.country})`);
        console.log('═'.repeat(80));

        const church = await prisma.church.findFirst({
          where: {
            country: bug.country,
            name: { contains: bug.searchTerm },
            website: { not: null },
          },
        });
        // The query already filters out null websites; the extra check lets the
        // compiler see that without a non-null assertion.
        if (!church || church.website == null) {
          console.log(`❌ Church not found in database\n`);
          continue;
        }
        const website = church.website;
        console.log(`Church: ${church.name}`);
        console.log(`URL: ${website}\n`);

        scraper.setCountry(bug.country);
        try {
          const result = await scraper.scrape(website);
          console.log(`Success: ${result.success}`);
          console.log(`Schedules found: ${result.schedules.length}`);
          if (result.error) console.log(`Error: ${result.error}`);
          if (result.rawHtml) {
            printPageDiagnostics(result.rawHtml, bug.country);
          }
          console.log('\n');
        } catch (err: unknown) {
          // Best effort per church: report the failure and continue with the next one.
          const message = err instanceof Error ? err.message : String(err);
          console.log(`❌ ERROR: ${message}\n`);
        }
      }
    } finally {
      await scraper.close();
    }
  } finally {
    // Release database handles even when the scraper or a lookup failed.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
// Entry point: run the debug session. A fatal rejection is logged and recorded in
// the exit code so that shell callers and CI can tell the run failed.
debugBugs().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});