Files
ScraperControl/scripts/test-edge-cases.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

398 lines
13 KiB
TypeScript
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env tsx
/**
 * Comprehensive edge case test suite for the international mass scraper.
 *
 * Each test scrapes a live parish website, so the suite needs network access and
 * its results depend on the current content of those sites.
 *
 * The suite covers the edge cases discovered and fixed during development:
 * 1. Day range expansion (Monday-Friday, wtorek-sobota, etc.)
 * 2. Office hours filtering (öffnungszeiten, horario, kancelaria, etc.)
 * 3. Short abbreviation word boundaries (pn, cz, n in Polish)
 * 4. Invalid time filtering (00:00-04:59)
 * 5. Deduplication (same schedule appearing multiple times)
 * 6. Context-based scoring (mass schedule vs office hours)
 * 7. "Closed" notice filtering (nieczynna, fermé, cerrado, etc.)
 */
import { GenericScraper } from '../src/scrapers/strategies/generic';
/**
 * One edge-case scenario: a site to scrape plus the expectations the scraped
 * schedules are checked against.
 */
interface EdgeCaseTest {
  /** Human-readable label used in the console report. */
  name: string;
  /** Page handed to the scraper. */
  url: string;
  /** Country code passed to the scraper before scraping. */
  country: string;
  /** Language name, shown in the report only. */
  language: string;
  /** Free-text descriptions of the edge cases this site exercises (report only). */
  edgeCases: string[];
  /** Assertions applied to the scraped schedules; every field is optional. */
  expectations: {
    /** Lower bound on the number of schedules. */
    minSchedules?: number;
    /** Upper bound on the number of schedules. */
    maxSchedules?: number;
    /** Days that must appear at least once: 0=Sun, 1=Mon, etc. */
    shouldHaveDays?: number[];
    /** Invalid times that should be filtered out. */
    shouldNotHaveTimes?: string[];
    /** Valid times that should be found. */
    shouldHaveTimes?: string[];
  };
  /** Known caveats, printed alongside the result. */
  knownIssues?: string[];
}
/**
 * Scenarios executed by the suite, one per live site. The `edgeCases` strings are
 * descriptive only; the `expectations` are what actually gets asserted.
 */
const edgeCaseTests: EdgeCaseTest[] = [
  // Polish: day ranges, office hours, short abbreviations
  {
    name: 'Parafia Lubojna (PL)',
    url: 'http://parafialubojna.pl',
    country: 'PL',
    language: 'Polish',
    edgeCases: [
      'Day range: "wtorek - sobota" (Tuesday-Saturday)',
      'Office hours: "kancelaria czynna" with times',
      'Short abbreviations: "pn", "cz", "n" in words like "sierpniu", "uroczystości"',
      '"Closed" notice: "nieczynna: niedziela, poniedziałek"',
      'Space-separated times: "8 00", "9 30", "18 00"',
    ],
    expectations: {
      // min === max: exactly ten schedules are expected
      minSchedules: 10,
      maxSchedules: 10,
      // every day of the week
      shouldHaveDays: [0, 1, 2, 3, 4, 5, 6],
      shouldHaveTimes: ['08:00', '09:30', '11:00', '16:00', '18:00'],
      // times that belong to the office hours
      shouldNotHaveTimes: ['18:30', '19:00', '09:00'],
    },
  },

  // German: office hours, "Uhr" format, duplicates
  {
    name: 'St. Peter, Munich (DE)',
    url: 'https://www.alterpeter.de/',
    country: 'DE',
    language: 'German',
    edgeCases: [
      'Office hours: "öffnungszeiten im pfarrbüro: montag bis donnerstag 9.00 12.00"',
      'Day range: "montag bis donnerstag" (Monday to Thursday)',
      'Uhr time format: "10:00 uhr", "17.15 Uhr"',
      'Invalid time: "00 uhr" from fragmented "10:00 uhr"',
      'Duplicates: Same schedule in current week + general schedule',
      'Multi-church parish: Different churches with different times',
    ],
    expectations: {
      minSchedules: 10,
      maxSchedules: 20,
      // Sunday and Saturday at the very least
      shouldHaveDays: [0, 6],
      // office-hours times plus the invalid midnight fragment
      shouldNotHaveTimes: ['09:00', '12:00', '14:00', '16:00', '00:00'],
    },
  },

  // Italian: period as the time separator
  {
    name: 'Duomo di Milano (IT)',
    url: 'https://www.duomomilano.it/',
    country: 'IT',
    language: 'Italian',
    edgeCases: [
      'Period separator: "18.30", "9.00"',
      'Day ranges: "da lunedì a venerdì"',
      'Office hours: "orari" or "ufficio"',
    ],
    expectations: {
      minSchedules: 10,
      maxSchedules: 25,
      // all days are likely to be present
      shouldHaveDays: [0, 1, 2, 3, 4, 5, 6],
    },
  },

  // Spanish: day ranges joined with "a"
  {
    name: 'Sagrada Família, Barcelona (ES)',
    url: 'https://sagradafamilia.org/',
    country: 'ES',
    language: 'Spanish',
    edgeCases: [
      'Day ranges: "de lunes a viernes"',
      'Office hours: "horario de oficina"',
    ],
    expectations: {
      minSchedules: 5,
      maxSchedules: 15,
    },
    knownIssues: [
      'Tourist site, may have non-standard schedule format',
      'Some days showing only 1-2 masses',
    ],
  },

  // Czech: minimal schedules
  {
    name: 'Chrám sv. Víta, Prague (CZ)',
    url: 'https://www.katedralasvatehovita.cz/',
    country: 'CZ',
    language: 'Czech',
    edgeCases: [
      'Czech day names and time formats',
      'Limited schedule (cathedral, not parish)',
    ],
    expectations: {
      minSchedules: 1,
      maxSchedules: 10,
    },
  },

  // Hungarian: suffix-based day ranges
  {
    name: 'Szent István Bazilika, Budapest (HU)',
    url: 'https://www.bazilika.biz/',
    country: 'HU',
    language: 'Hungarian',
    edgeCases: [
      'Hungarian day names',
      'Day range suffixes: "-tól", "-től"',
      'Limited weekday schedule',
    ],
    expectations: {
      minSchedules: 3,
      maxSchedules: 10,
      // weekdays
      shouldHaveDays: [1, 2, 3, 4, 5],
    },
  },
];
/** Outcome of running a single edge-case scenario. */
interface TestResult {
  /** Name of the scenario this result belongs to. */
  name: string;
  /** `false` as soon as any expectation fails or the scrape itself fails. */
  passed: boolean;
  /** Number of schedules the scraper returned (0 when the scrape failed). */
  scheduleCount: number;
  /** One message per failed check, in the order the checks ran. */
  issues: string[];
  /** One "✓ ..." message per check that succeeded. */
  edgeCasesValidated: string[];
}
/**
 * Runs one edge-case scenario and checks the scraped schedules against its
 * expectations.
 *
 * The checks run in a fixed order: schedule count bounds, required days,
 * forbidden times, required times, duplicate day+time pairs, and finally
 * early-morning times (hour 0-4). Every failed check sets `passed` to `false`
 * and appends a message to `issues`; successful day/time/duplicate/early-time
 * checks append a "✓ ..." entry to `edgeCasesValidated`.
 *
 * @param test - Scenario to run (site URL, country and expectations).
 * @param scraper - Scraper used for the run; its country is set from the scenario first.
 * @returns The collected result. This function never rejects: a failed scrape or a
 *   thrown error is reported through `issues` instead.
 */
async function runEdgeCaseTest(test: EdgeCaseTest, scraper: GenericScraper): Promise<TestResult> {
  const result: TestResult = {
    name: test.name,
    passed: true,
    scheduleCount: 0,
    issues: [],
    edgeCasesValidated: [],
  };

  // Records a failed check without aborting the remaining checks.
  const fail = (message: string): void => {
    result.passed = false;
    result.issues.push(message);
  };

  try {
    scraper.setCountry(test.country);
    const scrapeResult = await scraper.scrape(test.url);
    if (!scrapeResult.success) {
      fail(`Scrape failed: ${scrapeResult.error}`);
      return result;
    }

    const schedules = scrapeResult.schedules;
    result.scheduleCount = schedules.length;

    const { minSchedules, maxSchedules, shouldHaveDays, shouldNotHaveTimes, shouldHaveTimes } =
      test.expectations;

    // Validate schedule count. Compare against `undefined` rather than relying on
    // truthiness so that an explicit bound of 0 (e.g. `maxSchedules: 0`) is enforced.
    if (minSchedules !== undefined && result.scheduleCount < minSchedules) {
      fail(`Too few schedules: ${result.scheduleCount} < ${minSchedules}`);
    }
    if (maxSchedules !== undefined && result.scheduleCount > maxSchedules) {
      fail(`Too many schedules: ${result.scheduleCount} > ${maxSchedules}`);
    }

    // Validate days covered
    if (shouldHaveDays) {
      const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
      const foundDays = new Set(schedules.map(s => s.dayOfWeek));
      for (const day of shouldHaveDays) {
        if (foundDays.has(day)) {
          result.edgeCasesValidated.push(`✓ Found ${dayNames[day]}`);
        } else {
          fail(`Missing expected day: ${dayNames[day]}`);
        }
      }
    }

    // Both time checks look at the same set of scraped times, so build it once.
    const foundTimes = new Set(schedules.map(s => s.time));

    // Validate invalid times are NOT present
    if (shouldNotHaveTimes) {
      for (const time of shouldNotHaveTimes) {
        if (foundTimes.has(time)) {
          fail(`Found invalid time that should be filtered: ${time}`);
        } else {
          result.edgeCasesValidated.push(`✓ Filtered out ${time}`);
        }
      }
    }

    // Validate expected times ARE present
    if (shouldHaveTimes) {
      for (const time of shouldHaveTimes) {
        if (foundTimes.has(time)) {
          result.edgeCasesValidated.push(`✓ Found ${time}`);
        } else {
          fail(`Missing expected time: ${time}`);
        }
      }
    }

    // Check for duplicates (should be none after deduplication).
    // Two schedules are duplicates when they share the same day and time.
    const uniqueKeys = new Set<string>();
    const duplicates: string[] = [];
    for (const schedule of schedules) {
      const key = `${schedule.dayOfWeek}-${schedule.time}`;
      if (uniqueKeys.has(key)) {
        duplicates.push(key);
      } else {
        uniqueKeys.add(key);
      }
    }
    if (duplicates.length > 0) {
      fail(`Found ${duplicates.length} duplicate schedules: ${duplicates.join(', ')}`);
    } else {
      result.edgeCasesValidated.push('✓ No duplicates');
    }

    // Check for invalid early morning times (00:00-04:59).
    // Only the hour part (text before the first ':') matters here.
    const invalidTimes = schedules.filter(s => {
      const hours = Number(s.time.split(':')[0]);
      return hours >= 0 && hours <= 4;
    });
    if (invalidTimes.length > 0) {
      fail(
        `Found ${invalidTimes.length} invalid early morning times: ${invalidTimes.map(t => t.time).join(', ')}`
      );
    } else {
      result.edgeCasesValidated.push('✓ No invalid times (00:00-04:59)');
    }
  } catch (error) {
    // Any throw from the scraper becomes a test failure instead of aborting the suite.
    fail(`Exception: ${error instanceof Error ? error.message : String(error)}`);
  }

  return result;
}
/**
 * Entry point: runs every scenario in `edgeCaseTests` sequentially with a single
 * scraper instance, prints a per-test report, a summary table and the static
 * edge-case coverage list, then exits the process with code 1 if any test failed
 * and 0 otherwise.
 */
async function main(): Promise<void> {
  console.log('🧪 EDGE CASE TEST SUITE FOR INTERNATIONAL MASS SCRAPER');
  console.log('='.repeat(80));
  console.log('');

  const scraper = new GenericScraper();
  await scraper.init();

  const results: TestResult[] = [];
  let passCount = 0;
  let failCount = 0;

  try {
    for (const test of edgeCaseTests) {
      console.log(`\n📍 Testing: ${test.name} (${test.language})`);
      console.log(` URL: ${test.url}`);
      console.log(` Edge cases to validate:`);
      for (const edgeCase of test.edgeCases) {
        console.log(`${edgeCase}`);
      }

      const result = await runEdgeCaseTest(test, scraper);
      results.push(result);

      if (result.passed) {
        passCount++;
        console.log(`\n ✅ PASSED (${result.scheduleCount} schedules)`);
      } else {
        failCount++;
        console.log(`\n ❌ FAILED (${result.scheduleCount} schedules)`);
      }

      if (result.edgeCasesValidated.length > 0) {
        console.log(`\n Edge cases validated:`);
        for (const validation of result.edgeCasesValidated) {
          console.log(` ${validation}`);
        }
      }

      if (result.issues.length > 0) {
        console.log(`\n ⚠️ Issues:`);
        for (const issue of result.issues) {
          console.log(`${issue}`);
        }
      }

      if (test.knownIssues && test.knownIssues.length > 0) {
        console.log(`\n Known issues:`);
        for (const issue of test.knownIssues) {
          console.log(`${issue}`);
        }
      }

      // Brief delay between tests
      await new Promise(resolve => setTimeout(resolve, 2000));
    }
  } finally {
    // Release the scraper even if something in the loop throws, so an aborted
    // run does not leave it open.
    await scraper.close();
  }

  // Summary
  // Guard the division so an empty result list reports 0.0% instead of NaN%.
  const successRate = results.length === 0 ? 0 : (passCount / results.length) * 100;
  console.log('\n\n' + '='.repeat(80));
  console.log('📊 TEST SUMMARY');
  console.log('='.repeat(80));
  console.log(`Total tests: ${results.length}`);
  console.log(`✅ Passed: ${passCount}`);
  console.log(`❌ Failed: ${failCount}`);
  console.log(`Success rate: ${successRate.toFixed(1)}%`);

  // Detailed results table
  console.log('\n' + '-'.repeat(80));
  console.log('Test | Status | Schedules | Issues');
  console.log('-'.repeat(80));
  for (const result of results) {
    const status = result.passed ? '✅ PASS' : '❌ FAIL';
    const name = result.name.padEnd(33);
    const schedules = result.scheduleCount.toString().padStart(9);
    const issues = result.issues.length.toString();
    console.log(`${name} | ${status} | ${schedules} | ${issues}`);
  }
  console.log('-'.repeat(80));

  // Edge case coverage summary: static text, printed one line per entry.
  const coverageLines = [
    '\n📋 EDGE CASE COVERAGE:',
    '',
    '1. Day Range Expansion:',
    ' ✓ Polish: "wtorek - sobota"',
    ' ✓ German: "montag bis donnerstag"',
    ' ✓ Italian: "da lunedì a venerdì"',
    ' ✓ Spanish: "de lunes a viernes"',
    '',
    '2. Office Hours Filtering:',
    ' ✓ German: "öffnungszeiten im pfarrbüro"',
    ' ✓ Polish: "kancelaria czynna"',
    ' ✓ Spanish: "horario de oficina"',
    ' ✓ Italian: "orari" / "ufficio"',
    '',
    '3. Short Abbreviation Word Boundaries:',
    ' ✓ Polish: "pn", "cz", "n" (prevented false matches)',
    '',
    '4. Invalid Time Filtering:',
    ' ✓ Filtered: 00:00-04:59 (unrealistic mass times)',
    ' ✓ German "00 uhr" fragments filtered',
    '',
    '5. Deduplication:',
    ' ✓ Same day+time appearing multiple times on page',
    '',
    '6. "Closed" Notice Filtering:',
    ' ✓ Polish: "nieczynna: niedziela, poniedziałek"',
    ' ✓ Multi-language: fermé, cerrado, geschlossen, chiuso',
    '',
    '7. Time Format Support:',
    ' ✓ AM/PM: "8:30 AM", "8 PM"',
    ' ✓ 24-hour: "18:00", "8:30"',
    ' ✓ French/Portuguese: "18h30", "8h"',
    ' ✓ German: "17 Uhr", "17:00 Uhr"',
    ' ✓ Italian: "18.30"',
    ' ✓ Polish: "8 00", "18 00"',
  ];
  for (const line of coverageLines) {
    console.log(line);
  }

  // Non-zero exit code when any test failed.
  process.exit(failCount > 0 ? 1 : 0);
}
// Run the suite. If main() itself rejects (for example the scraper fails to
// initialise), log the error and exit non-zero so a crashed run is not
// mistaken for a passing one.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});