Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
398 lines
13 KiB
TypeScript
398 lines
13 KiB
TypeScript
#!/usr/bin/env tsx
|
||
/**
 * Comprehensive edge case test suite for the international mass scraper
 *
 * This test suite validates all edge cases discovered and fixed during development:
 * 1. Day range expansion (Monday-Friday, wtorek-sobota, etc.)
 * 2. Office hours filtering (öffnungszeiten, horario, kancelaria, etc.)
 * 3. Short abbreviation word boundaries (pn, cz, n in Polish)
 * 4. Invalid time filtering (00:00-04:59)
 * 5. Deduplication (same schedule appearing multiple times)
 * 6. Context-based scoring (mass schedule vs office hours)
 * 7. "Closed" notice filtering (nieczynna, fermé, cerrado, etc.)
 */

import { GenericScraper } from '../src/scrapers/strategies/generic';
|
||
|
||
/**
 * Declarative description of one scraper edge-case scenario: the site to
 * scrape plus the assertions to run against the schedules that come back.
 */
interface EdgeCaseTest {
  /** Human-readable label used in console output and the results table. */
  name: string;
  /** Page handed to the scraper's `scrape()` call. */
  url: string;
  /** Country code handed to the scraper's `setCountry()` before scraping. */
  country: string;
  /** Language label; only printed alongside the test name. */
  language: string;
  /** Free-text notes on the edge cases this site exercises; only printed. */
  edgeCases: string[];
  /** Assertions applied to the scraped schedules. Every field is optional. */
  expectations: {
    /** Inclusive lower bound on the number of scraped schedules. */
    minSchedules?: number;
    /** Inclusive upper bound on the number of scraped schedules. */
    maxSchedules?: number;
    /** Days that must each appear at least once. 0=Sun, 1=Mon, etc. */
    shouldHaveDays?: number[];
    /** Invalid times that should be filtered (compared as exact strings). */
    shouldNotHaveTimes?: string[];
    /** Valid times that should be found (compared as exact strings). */
    shouldHaveTimes?: string[];
  };
  /** Caveats printed after the result; they do not affect pass/fail. */
  knownIssues?: string[];
}

/**
 * Sites exercised by the suite, one entry per language/country.
 * `edgeCases` and `knownIssues` are informational (printed only);
 * `expectations` drives the pass/fail checks.
 */
const edgeCaseTests: EdgeCaseTest[] = [
  // POLISH - Day ranges, office hours, short abbreviations
  {
    name: 'Parafia Lubojna (PL)',
    url: 'http://parafialubojna.pl',
    country: 'PL',
    language: 'Polish',
    edgeCases: [
      'Day range: "wtorek - sobota" (Tuesday-Saturday)',
      'Office hours: "kancelaria czynna" with times',
      'Short abbreviations: "pn", "cz", "n" in words like "sierpniu", "uroczystości"',
      '"Closed" notice: "nieczynna: niedziela, poniedziałek"',
      'Space-separated times: "8 00", "9 30", "18 00"',
    ],
    expectations: {
      // min === max: exactly 10 schedules are expected for this site.
      minSchedules: 10,
      maxSchedules: 10,
      shouldHaveDays: [0, 1, 2, 3, 4, 5, 6], // All 7 days
      shouldHaveTimes: ['08:00', '09:30', '11:00', '16:00', '18:00'],
      shouldNotHaveTimes: ['18:30', '19:00', '09:00'], // Office hours times
    },
  },

  // GERMAN - Office hours, Uhr format, duplicates
  {
    name: 'St. Peter, Munich (DE)',
    url: 'https://www.alterpeter.de/',
    country: 'DE',
    language: 'German',
    edgeCases: [
      'Office hours: "öffnungszeiten im pfarrbüro: montag bis donnerstag 9.00 – 12.00"',
      'Day range: "montag bis donnerstag" (Monday to Thursday)',
      'Uhr time format: "10:00 uhr", "17.15 Uhr"',
      'Invalid time: "00 uhr" from fragmented "10:00 uhr"',
      'Duplicates: Same schedule in current week + general schedule',
      'Multi-church parish: Different churches with different times',
    ],
    expectations: {
      minSchedules: 10,
      maxSchedules: 20,
      shouldHaveDays: [0, 6], // At minimum Sunday and Saturday
      shouldNotHaveTimes: ['09:00', '12:00', '14:00', '16:00', '00:00'], // Office hours + invalid
    },
  },

  // ITALIAN - Period separator
  {
    name: 'Duomo di Milano (IT)',
    url: 'https://www.duomomilano.it/',
    country: 'IT',
    language: 'Italian',
    edgeCases: [
      'Period separator: "18.30", "9.00"',
      'Day ranges: "da lunedì a venerdì"',
      'Office hours: "orari" or "ufficio"',
    ],
    expectations: {
      minSchedules: 10,
      maxSchedules: 25,
      shouldHaveDays: [0, 1, 2, 3, 4, 5, 6], // All days likely
    },
  },

  // SPANISH - Day ranges with "a"
  {
    name: 'Sagrada Família, Barcelona (ES)',
    url: 'https://sagradafamilia.org/',
    country: 'ES',
    language: 'Spanish',
    edgeCases: [
      'Day ranges: "de lunes a viernes"',
      'Office hours: "horario de oficina"',
    ],
    expectations: {
      minSchedules: 5,
      maxSchedules: 15,
    },
    knownIssues: [
      'Tourist site, may have non-standard schedule format',
      'Some days showing only 1-2 masses',
    ],
  },

  // CZECH - Minimal schedules
  {
    name: 'Chrám sv. Víta, Prague (CZ)',
    url: 'https://www.katedralasvatehovita.cz/',
    country: 'CZ',
    language: 'Czech',
    edgeCases: [
      'Czech day names and time formats',
      'Limited schedule (cathedral, not parish)',
    ],
    expectations: {
      minSchedules: 1,
      maxSchedules: 10,
    },
  },

  // HUNGARIAN - Suffix-based day ranges
  {
    name: 'Szent István Bazilika, Budapest (HU)',
    url: 'https://www.bazilika.biz/',
    country: 'HU',
    language: 'Hungarian',
    edgeCases: [
      'Hungarian day names',
      'Day range suffixes: "-tól", "-től"',
      'Limited weekday schedule',
    ],
    expectations: {
      minSchedules: 3,
      maxSchedules: 10,
      shouldHaveDays: [1, 2, 3, 4, 5], // Weekdays
    },
  },
];

/** Outcome of running a single {@link EdgeCaseTest}. */
interface TestResult {
  /** Copied from the test definition's `name`. */
  name: string;
  /** True only if no check failed and no exception was raised. */
  passed: boolean;
  /** Number of schedules the scraper returned (0 if the scrape failed). */
  scheduleCount: number;
  /** One message per failed check, scrape failure, or caught exception. */
  issues: string[];
  /** One "✓ ..." message per check that succeeded. */
  edgeCasesValidated: string[];
}

/**
 * Scrapes one site and checks the returned schedules against the test's
 * expectations.
 *
 * Checks, in order: schedule-count bounds, required days, forbidden times,
 * required times, duplicate day+time pairs, and times whose hour is 0-4.
 * Every failed check appends a message to `issues` and clears `passed`;
 * every successful check appends a "✓ ..." line to `edgeCasesValidated`.
 *
 * @param test - Scenario definition (URL, country, expectations).
 * @param scraper - Scraper to drive; `setCountry()` is called on it before
 *   `scrape()`, so its country setting is changed as a side effect.
 * @returns The collected result. This function never rejects: a failed
 *   scrape or a thrown error is reported through `issues` instead.
 */
async function runEdgeCaseTest(test: EdgeCaseTest, scraper: GenericScraper): Promise<TestResult> {
  const result: TestResult = {
    name: test.name,
    passed: true,
    scheduleCount: 0,
    issues: [],
    edgeCasesValidated: [],
  };

  // Records a failed check.
  const fail = (issue: string): void => {
    result.passed = false;
    result.issues.push(issue);
  };

  try {
    scraper.setCountry(test.country);
    const scrapeResult = await scraper.scrape(test.url);

    if (!scrapeResult.success) {
      fail(`Scrape failed: ${scrapeResult.error}`);
      return result;
    }

    const schedules = scrapeResult.schedules;
    const { minSchedules, maxSchedules, shouldHaveDays, shouldNotHaveTimes, shouldHaveTimes } =
      test.expectations;

    result.scheduleCount = schedules.length;

    // Validate schedule count.
    // Compare against undefined rather than truthiness so that an explicit
    // bound of 0 (e.g. "expect no schedules at all") is still enforced.
    if (minSchedules !== undefined && result.scheduleCount < minSchedules) {
      fail(`Too few schedules: ${result.scheduleCount} < ${minSchedules}`);
    }

    if (maxSchedules !== undefined && result.scheduleCount > maxSchedules) {
      fail(`Too many schedules: ${result.scheduleCount} > ${maxSchedules}`);
    }

    // Validate days covered
    if (shouldHaveDays) {
      const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
      const foundDays = new Set(schedules.map((s) => s.dayOfWeek));
      for (const day of shouldHaveDays) {
        if (foundDays.has(day)) {
          result.edgeCasesValidated.push(`✓ Found ${dayNames[day]}`);
        } else {
          fail(`Missing expected day: ${dayNames[day]}`);
        }
      }
    }

    // Shared by both time checks below; membership is exact string equality.
    const foundTimes = new Set(schedules.map((s) => s.time));

    // Validate invalid times are NOT present
    if (shouldNotHaveTimes) {
      for (const time of shouldNotHaveTimes) {
        if (foundTimes.has(time)) {
          fail(`Found invalid time that should be filtered: ${time}`);
        } else {
          result.edgeCasesValidated.push(`✓ Filtered out ${time}`);
        }
      }
    }

    // Validate expected times ARE present
    if (shouldHaveTimes) {
      for (const time of shouldHaveTimes) {
        if (foundTimes.has(time)) {
          result.edgeCasesValidated.push(`✓ Found ${time}`);
        } else {
          fail(`Missing expected time: ${time}`);
        }
      }
    }

    // Check for duplicates (should be none after deduplication).
    // Two schedules count as duplicates when day and time both match.
    const seenKeys = new Set<string>();
    const duplicates: string[] = [];
    for (const schedule of schedules) {
      const key = `${schedule.dayOfWeek}-${schedule.time}`;
      if (seenKeys.has(key)) {
        duplicates.push(key);
      } else {
        seenKeys.add(key);
      }
    }

    if (duplicates.length > 0) {
      fail(`Found ${duplicates.length} duplicate schedules: ${duplicates.join(', ')}`);
    } else {
      result.edgeCasesValidated.push('✓ No duplicates');
    }

    // Check for invalid early morning times (00:00-04:59).
    // An unparseable hour becomes NaN, fails both comparisons, and is
    // therefore not reported here.
    const invalidTimes = schedules.filter((s) => {
      const hours = Number(s.time.split(':')[0]);
      return hours >= 0 && hours <= 4;
    });

    if (invalidTimes.length > 0) {
      fail(
        `Found ${invalidTimes.length} invalid early morning times: ${invalidTimes.map((t) => t.time).join(', ')}`
      );
    } else {
      result.edgeCasesValidated.push('✓ No invalid times (00:00-04:59)');
    }
  } catch (error) {
    fail(`Exception: ${error instanceof Error ? error.message : String(error)}`);
  }

  return result;
}

/**
 * Runs every entry of `edgeCaseTests` sequentially through one shared
 * scraper, prints per-test details, a summary table and a static coverage
 * list, then terminates the process with status 1 if any test failed and
 * 0 otherwise.
 */
async function main(): Promise<void> {
  console.log('🧪 EDGE CASE TEST SUITE FOR INTERNATIONAL MASS SCRAPER');
  console.log('='.repeat(80));
  console.log('');

  const scraper = new GenericScraper();
  await scraper.init();

  const results: TestResult[] = [];
  let passCount = 0;
  let failCount = 0;

  try {
    for (const test of edgeCaseTests) {
      console.log(`\n📍 Testing: ${test.name} (${test.language})`);
      console.log(` URL: ${test.url}`);
      console.log(` Edge cases to validate:`);
      for (const edgeCase of test.edgeCases) {
        console.log(` • ${edgeCase}`);
      }

      const result = await runEdgeCaseTest(test, scraper);
      results.push(result);

      if (result.passed) {
        passCount++;
        console.log(`\n ✅ PASSED (${result.scheduleCount} schedules)`);
      } else {
        failCount++;
        console.log(`\n ❌ FAILED (${result.scheduleCount} schedules)`);
      }

      if (result.edgeCasesValidated.length > 0) {
        console.log(`\n Edge cases validated:`);
        for (const validation of result.edgeCasesValidated) {
          console.log(` ${validation}`);
        }
      }

      if (result.issues.length > 0) {
        console.log(`\n ⚠️ Issues:`);
        for (const issue of result.issues) {
          console.log(` • ${issue}`);
        }
      }

      if (test.knownIssues && test.knownIssues.length > 0) {
        console.log(`\n ℹ️ Known issues:`);
        for (const issue of test.knownIssues) {
          console.log(` • ${issue}`);
        }
      }

      // Brief delay between tests (skipped after the last one, where it
      // would only postpone the summary).
      if (results.length < edgeCaseTests.length) {
        await new Promise((resolve) => setTimeout(resolve, 2000));
      }
    }
  } finally {
    // Release the scraper even if the loop throws unexpectedly.
    await scraper.close();
  }

  // Summary
  // Guard the percentage so an empty test list prints 0.0% instead of NaN%.
  const successRate = results.length > 0 ? (passCount / results.length) * 100 : 0;
  console.log('\n\n' + '='.repeat(80));
  console.log('📊 TEST SUMMARY');
  console.log('='.repeat(80));
  console.log(`Total tests: ${results.length}`);
  console.log(`✅ Passed: ${passCount}`);
  console.log(`❌ Failed: ${failCount}`);
  console.log(`Success rate: ${successRate.toFixed(1)}%`);

  // Detailed results table
  console.log('\n' + '-'.repeat(80));
  console.log('Test | Status | Schedules | Issues');
  console.log('-'.repeat(80));
  for (const result of results) {
    const status = result.passed ? '✅ PASS' : '❌ FAIL';
    const name = result.name.padEnd(33);
    const schedules = result.scheduleCount.toString().padStart(9);
    const issues = result.issues.length.toString();
    console.log(`${name} | ${status} | ${schedules} | ${issues}`);
  }
  console.log('-'.repeat(80));

  // Edge case coverage summary.
  // NOTE: this list is static text; it is printed regardless of which
  // tests actually passed above.
  const coverageLines = [
    '\n📋 EDGE CASE COVERAGE:',
    '',
    '1. Day Range Expansion:',
    ' ✓ Polish: "wtorek - sobota"',
    ' ✓ German: "montag bis donnerstag"',
    ' ✓ Italian: "da lunedì a venerdì"',
    ' ✓ Spanish: "de lunes a viernes"',
    '',
    '2. Office Hours Filtering:',
    ' ✓ German: "öffnungszeiten im pfarrbüro"',
    ' ✓ Polish: "kancelaria czynna"',
    ' ✓ Spanish: "horario de oficina"',
    ' ✓ Italian: "orari" / "ufficio"',
    '',
    '3. Short Abbreviation Word Boundaries:',
    ' ✓ Polish: "pn", "cz", "n" (prevented false matches)',
    '',
    '4. Invalid Time Filtering:',
    ' ✓ Filtered: 00:00-04:59 (unrealistic mass times)',
    ' ✓ German "00 uhr" fragments filtered',
    '',
    '5. Deduplication:',
    ' ✓ Same day+time appearing multiple times on page',
    '',
    '6. "Closed" Notice Filtering:',
    ' ✓ Polish: "nieczynna: niedziela, poniedziałek"',
    ' ✓ Multi-language: fermé, cerrado, geschlossen, chiuso',
    '',
    '7. Time Format Support:',
    ' ✓ AM/PM: "8:30 AM", "8 PM"',
    ' ✓ 24-hour: "18:00", "8:30"',
    ' ✓ French/Portuguese: "18h30", "8h"',
    ' ✓ German: "17 Uhr", "17:00 Uhr"',
    ' ✓ Italian: "18.30"',
    ' ✓ Polish: "8 00", "18 00"',
  ];
  for (const line of coverageLines) {
    console.log(line);
  }

  // Non-zero status lets callers (CI, shell scripts) detect failures.
  process.exit(failCount > 0 ? 1 : 0);
}

// Script entry point. If main() rejects (e.g. scraper initialisation
// fails), log the error and exit with a failing status so a crashed run is
// not mistaken for a passing one.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});