chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Albert
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions

397
scripts/test-edge-cases.ts Normal file
View File

@@ -0,0 +1,397 @@
#!/usr/bin/env tsx
/**
* Comprehensive edge case test suite for the international mass scraper
*
* This test suite validates all edge cases discovered and fixed during development:
* 1. Day range expansion (Monday-Friday, wtorek-sobota, etc.)
* 2. Office hours filtering (öffnungszeiten, horario, kancelaria, etc.)
* 3. Short abbreviation word boundaries (pn, cz, n in Polish)
* 4. Invalid time filtering (00:00-04:59)
* 5. Deduplication (same schedule appearing multiple times)
* 6. Context-based scoring (mass schedule vs office hours)
* 7. "Closed" notice filtering (nieczynna, fermé, cerrado, etc.)
*/
import { GenericScraper } from '../src/scrapers/strategies/generic';
/**
 * Assertions applied to the schedules scraped for one scenario.
 * Every field is optional; an omitted field disables that check.
 */
interface EdgeCaseExpectations {
  /** Lower bound on the number of scraped schedules. */
  minSchedules?: number;
  /** Upper bound on the number of scraped schedules. */
  maxSchedules?: number;
  /** Weekdays that must appear in the result (0=Sun, 1=Mon, etc.). */
  shouldHaveDays?: number[];
  /** Invalid times that should have been filtered out of the result. */
  shouldNotHaveTimes?: string[];
  /** Valid times that should be present in the result. */
  shouldHaveTimes?: string[];
}

/**
 * One live-site scenario of the edge case suite: which page to scrape,
 * which parsing edge cases it exercises, and what the result must look like.
 */
interface EdgeCaseTest {
  /** Human-readable label shown in the console report. */
  name: string;
  /** Page handed to the scraper. */
  url: string;
  /** Country code passed to the scraper before scraping. */
  country: string;
  /** Language label shown next to the name in the report. */
  language: string;
  /** Free-text descriptions of the edge cases this site exercises (printed only). */
  edgeCases: string[];
  /** Checks applied to the scraped schedules. */
  expectations: EdgeCaseExpectations;
  /** Caveats printed alongside the result; they do not affect pass/fail. */
  knownIssues?: string[];
}
/**
 * Live-site scenarios executed by the suite, one per language/country.
 * Each entry names the parsing edge cases the page exercises and the
 * checks its scraped schedules must satisfy.
 */
const edgeCaseTests: EdgeCaseTest[] = [
  // Polish: day ranges, office hours, short day abbreviations inside words
  {
    name: 'Parafia Lubojna (PL)',
    url: 'http://parafialubojna.pl',
    country: 'PL',
    language: 'Polish',
    edgeCases: [
      'Day range: "wtorek - sobota" (Tuesday-Saturday)',
      'Office hours: "kancelaria czynna" with times',
      'Short abbreviations: "pn", "cz", "n" in words like "sierpniu", "uroczystości"',
      '"Closed" notice: "nieczynna: niedziela, poniedziałek"',
      'Space-separated times: "8 00", "9 30", "18 00"',
    ],
    expectations: {
      // min === max: exactly ten schedules are expected
      minSchedules: 10,
      maxSchedules: 10,
      // every day of the week
      shouldHaveDays: [0, 1, 2, 3, 4, 5, 6],
      shouldHaveTimes: ['08:00', '09:30', '11:00', '16:00', '18:00'],
      // times that belong to the parish office hours
      shouldNotHaveTimes: ['18:30', '19:00', '09:00'],
    },
  },

  // German: office hours, "Uhr" time format, duplicated schedules
  {
    name: 'St. Peter, Munich (DE)',
    url: 'https://www.alterpeter.de/',
    country: 'DE',
    language: 'German',
    edgeCases: [
      'Office hours: "öffnungszeiten im pfarrbüro: montag bis donnerstag 9.00 12.00"',
      'Day range: "montag bis donnerstag" (Monday to Thursday)',
      'Uhr time format: "10:00 uhr", "17.15 Uhr"',
      'Invalid time: "00 uhr" from fragmented "10:00 uhr"',
      'Duplicates: Same schedule in current week + general schedule',
      'Multi-church parish: Different churches with different times',
    ],
    expectations: {
      minSchedules: 10,
      maxSchedules: 20,
      // at minimum Sunday and Saturday
      shouldHaveDays: [0, 6],
      // office hours plus the invalid midnight fragment
      shouldNotHaveTimes: ['09:00', '12:00', '14:00', '16:00', '00:00'],
    },
  },

  // Italian: period used as the hour/minute separator
  {
    name: 'Duomo di Milano (IT)',
    url: 'https://www.duomomilano.it/',
    country: 'IT',
    language: 'Italian',
    edgeCases: [
      'Period separator: "18.30", "9.00"',
      'Day ranges: "da lunedì a venerdì"',
      'Office hours: "orari" or "ufficio"',
    ],
    expectations: {
      minSchedules: 10,
      maxSchedules: 25,
      // all days are likely to be present
      shouldHaveDays: [0, 1, 2, 3, 4, 5, 6],
    },
  },

  // Spanish: day ranges written with "a"
  {
    name: 'Sagrada Família, Barcelona (ES)',
    url: 'https://sagradafamilia.org/',
    country: 'ES',
    language: 'Spanish',
    edgeCases: [
      'Day ranges: "de lunes a viernes"',
      'Office hours: "horario de oficina"',
    ],
    expectations: {
      minSchedules: 5,
      maxSchedules: 15,
    },
    knownIssues: [
      'Tourist site, may have non-standard schedule format',
      'Some days showing only 1-2 masses',
    ],
  },

  // Czech: minimal schedule
  {
    name: 'Chrám sv. Víta, Prague (CZ)',
    url: 'https://www.katedralasvatehovita.cz/',
    country: 'CZ',
    language: 'Czech',
    edgeCases: [
      'Czech day names and time formats',
      'Limited schedule (cathedral, not parish)',
    ],
    expectations: {
      minSchedules: 1,
      maxSchedules: 10,
    },
  },

  // Hungarian: day ranges expressed through suffixes
  {
    name: 'Szent István Bazilika, Budapest (HU)',
    url: 'https://www.bazilika.biz/',
    country: 'HU',
    language: 'Hungarian',
    edgeCases: [
      'Hungarian day names',
      'Day range suffixes: "-tól", "-től"',
      'Limited weekday schedule',
    ],
    expectations: {
      minSchedules: 3,
      maxSchedules: 10,
      // weekdays only
      shouldHaveDays: [1, 2, 3, 4, 5],
    },
  },
];
/**
 * Outcome of running a single edge case scenario.
 */
interface TestResult {
  /** Name of the scenario this result belongs to. */
  name: string;
  /** True until any check fails. */
  passed: boolean;
  /** Number of schedules returned by the scraper (0 when the scrape did not succeed). */
  scheduleCount: number;
  /** One message per failed check, scrape failure, or caught exception. */
  issues: string[];
  /** One "✓ ..." message per check that succeeded. */
  edgeCasesValidated: string[];
}
/**
 * Scrapes one site and checks the scraped schedules against the scenario's
 * expectations.
 *
 * Checks run in a fixed order: schedule count bounds, required weekdays,
 * forbidden times, required times, duplicate day/time pairs, and times in the
 * 00:00-04:59 window. Each failed check clears `passed` and appends a message
 * to `issues`; each successful check (except the count bounds, which only
 * report failures) appends a "✓ ..." line to `edgeCasesValidated`.
 *
 * An unsuccessful scrape, or any error thrown while scraping or validating,
 * is recorded as an issue on the returned result instead of being rethrown.
 *
 * @param test - Scenario to run; `country` is passed to `scraper.setCountry`
 *   and `url` to `scraper.scrape`.
 * @param scraper - Scraper used to fetch and parse the page.
 * @returns The scenario outcome with schedule count and collected messages.
 */
async function runEdgeCaseTest(test: EdgeCaseTest, scraper: GenericScraper): Promise<TestResult> {
  const result: TestResult = {
    name: test.name,
    passed: true,
    scheduleCount: 0,
    issues: [],
    edgeCasesValidated: [],
  };

  // A single failed check fails the whole scenario.
  const fail = (issue: string): void => {
    result.passed = false;
    result.issues.push(issue);
  };
  const validated = (note: string): void => {
    result.edgeCasesValidated.push(note);
  };

  try {
    scraper.setCountry(test.country);
    const scrapeResult = await scraper.scrape(test.url);
    if (!scrapeResult.success) {
      fail(`Scrape failed: ${scrapeResult.error}`);
      return result;
    }

    const schedules = scrapeResult.schedules;
    result.scheduleCount = schedules.length;
    const expectations = test.expectations;

    // Schedule count bounds. Compare against undefined rather than relying on
    // truthiness so that an explicit bound of 0 is still enforced.
    if (expectations.minSchedules !== undefined && result.scheduleCount < expectations.minSchedules) {
      fail(`Too few schedules: ${result.scheduleCount} < ${expectations.minSchedules}`);
    }
    if (expectations.maxSchedules !== undefined && result.scheduleCount > expectations.maxSchedules) {
      fail(`Too many schedules: ${result.scheduleCount} > ${expectations.maxSchedules}`);
    }

    // Required weekdays.
    if (expectations.shouldHaveDays) {
      const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
      const foundDays = new Set(schedules.map(s => s.dayOfWeek));
      for (const day of expectations.shouldHaveDays) {
        if (foundDays.has(day)) {
          validated(`✓ Found ${dayNames[day]}`);
        } else {
          fail(`Missing expected day: ${dayNames[day]}`);
        }
      }
    }

    // Both time checks are exact string comparisons against the scraped times.
    const foundTimes = new Set(schedules.map(s => s.time));

    // Times that must have been filtered out.
    if (expectations.shouldNotHaveTimes) {
      for (const time of expectations.shouldNotHaveTimes) {
        if (foundTimes.has(time)) {
          fail(`Found invalid time that should be filtered: ${time}`);
        } else {
          validated(`✓ Filtered out ${time}`);
        }
      }
    }

    // Times that must be present.
    if (expectations.shouldHaveTimes) {
      for (const time of expectations.shouldHaveTimes) {
        if (foundTimes.has(time)) {
          validated(`✓ Found ${time}`);
        } else {
          fail(`Missing expected time: ${time}`);
        }
      }
    }

    // Duplicates: two schedules count as the same when day and time match.
    const seenKeys = new Set<string>();
    const duplicates: string[] = [];
    for (const schedule of schedules) {
      const key = `${schedule.dayOfWeek}-${schedule.time}`;
      if (seenKeys.has(key)) {
        duplicates.push(key);
      } else {
        seenKeys.add(key);
      }
    }
    if (duplicates.length > 0) {
      fail(`Found ${duplicates.length} duplicate schedules: ${duplicates.join(', ')}`);
    } else {
      validated('✓ No duplicates');
    }

    // Early-morning times (hour 0 through 4) are treated as parsing artefacts.
    // A non-numeric hour yields NaN, which fails both comparisons and is ignored.
    const earlySchedules = schedules.filter(s => {
      const hours = Number(s.time.split(':')[0]);
      return hours >= 0 && hours <= 4;
    });
    if (earlySchedules.length > 0) {
      fail(
        `Found ${earlySchedules.length} invalid early morning times: ${earlySchedules.map(t => t.time).join(', ')}`
      );
    } else {
      validated('✓ No invalid times (00:00-04:59)');
    }
  } catch (error) {
    fail(`Exception: ${error instanceof Error ? error.message : String(error)}`);
  }

  return result;
}
/**
 * Script entry point: runs every scenario in `edgeCaseTests` sequentially
 * against a single scraper instance, prints a per-scenario report, a summary
 * table and a static edge case coverage list, then exits the process with
 * status 1 if any scenario failed and 0 otherwise.
 */
async function main(): Promise<void> {
  const heavyRule = '='.repeat(80);
  const lightRule = '-'.repeat(80);

  console.log('🧪 EDGE CASE TEST SUITE FOR INTERNATIONAL MASS SCRAPER');
  console.log(heavyRule);
  console.log('');

  const scraper = new GenericScraper();
  await scraper.init();

  const results: TestResult[] = [];

  for (const test of edgeCaseTests) {
    console.log(`\n📍 Testing: ${test.name} (${test.language})`);
    console.log(` URL: ${test.url}`);
    console.log(` Edge cases to validate:`);
    test.edgeCases.forEach((edgeCase) => {
      console.log(`${edgeCase}`);
    });

    const outcome = await runEdgeCaseTest(test, scraper);
    results.push(outcome);

    const verdict = outcome.passed ? '✅ PASSED' : '❌ FAILED';
    console.log(`\n ${verdict} (${outcome.scheduleCount} schedules)`);

    if (outcome.edgeCasesValidated.length > 0) {
      console.log(`\n Edge cases validated:`);
      outcome.edgeCasesValidated.forEach((validation) => {
        console.log(` ${validation}`);
      });
    }

    if (outcome.issues.length > 0) {
      console.log(`\n ⚠️ Issues:`);
      outcome.issues.forEach((issue) => {
        console.log(`${issue}`);
      });
    }

    const knownIssues = test.knownIssues ?? [];
    if (knownIssues.length > 0) {
      console.log(`\n Known issues:`);
      knownIssues.forEach((issue) => {
        console.log(`${issue}`);
      });
    }

    // Brief pause before the next scenario is scraped
    await new Promise((resume) => setTimeout(resume, 2000));
  }

  await scraper.close();

  // Tallies are derived from the collected results
  const passCount = results.filter((entry) => entry.passed).length;
  const failCount = results.length - passCount;

  // Summary
  console.log('\n\n' + heavyRule);
  console.log('📊 TEST SUMMARY');
  console.log(heavyRule);
  console.log(`Total tests: ${results.length}`);
  console.log(`✅ Passed: ${passCount}`);
  console.log(`❌ Failed: ${failCount}`);
  console.log(`Success rate: ${((passCount / results.length) * 100).toFixed(1)}%`);

  // Detailed results table: one row per scenario
  console.log('\n' + lightRule);
  console.log('Test | Status | Schedules | Issues');
  console.log(lightRule);
  for (const { name, passed, scheduleCount, issues } of results) {
    const columns = [
      name.padEnd(33),
      passed ? '✅ PASS' : '❌ FAIL',
      scheduleCount.toString().padStart(9),
      issues.length.toString(),
    ];
    console.log(columns.join(' | '));
  }
  console.log(lightRule);

  // Edge case coverage summary: fixed text, printed regardless of the results above
  const coverageReport = [
    '\n📋 EDGE CASE COVERAGE:',
    '',
    '1. Day Range Expansion:',
    ' ✓ Polish: "wtorek - sobota"',
    ' ✓ German: "montag bis donnerstag"',
    ' ✓ Italian: "da lunedì a venerdì"',
    ' ✓ Spanish: "de lunes a viernes"',
    '',
    '2. Office Hours Filtering:',
    ' ✓ German: "öffnungszeiten im pfarrbüro"',
    ' ✓ Polish: "kancelaria czynna"',
    ' ✓ Spanish: "horario de oficina"',
    ' ✓ Italian: "orari" / "ufficio"',
    '',
    '3. Short Abbreviation Word Boundaries:',
    ' ✓ Polish: "pn", "cz", "n" (prevented false matches)',
    '',
    '4. Invalid Time Filtering:',
    ' ✓ Filtered: 00:00-04:59 (unrealistic mass times)',
    ' ✓ German "00 uhr" fragments filtered',
    '',
    '5. Deduplication:',
    ' ✓ Same day+time appearing multiple times on page',
    '',
    '6. "Closed" Notice Filtering:',
    ' ✓ Polish: "nieczynna: niedziela, poniedziałek"',
    ' ✓ Multi-language: fermé, cerrado, geschlossen, chiuso',
    '',
    '7. Time Format Support:',
    ' ✓ AM/PM: "8:30 AM", "8 PM"',
    ' ✓ 24-hour: "18:00", "8:30"',
    ' ✓ French/Portuguese: "18h30", "8h"',
    ' ✓ German: "17 Uhr", "17:00 Uhr"',
    ' ✓ Italian: "18.30"',
    ' ✓ Polish: "8 00", "18 00"',
  ];
  for (const line of coverageReport) {
    console.log(line);
  }

  process.exit(failCount > 0 ? 1 : 0);
}
// Start the suite. A rejection from main() means the run itself crashed,
// so log the error and exit with a failing status instead of only logging it.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});