Files
ScraperControl/scripts/import-hk-parishes.test.ts
albertfj114 328d146201 feat: add HK parish parser functions (Tasks 2-6) with tests
Implements entry splitter, name extractor, field extractor, time normalizer,
schedule line parser, and weekday day-prefix parser. All 26 tests pass.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-03 16:06:26 -04:00

189 lines
7.7 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { test } from 'node:test';
import assert from 'node:assert/strict';
import {
splitEntries,
extractNames,
extractFields,
normalizeTime,
parseScheduleLine,
parseWeekdayLine,
} from './import-hk-parishes.js';
// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
// Fixture holds a page heading followed by two parish records, each carrying
// a "Path" / "Close" line pair; exactly two entries are expected back.
test('splitEntries splits on Path/Close boundary', () => {
  const fixture = `HONG KONG CHURCHES\n\nParish A\nChurch A\nPath\nClose\nAddress\n1 Main St\n\nParish B\nChurch B\nPath\nClose\nAddress\n2 Side St\n`;
  const chunks = splitEntries(fixture);

  assert.strictEqual(chunks.length, 2);

  // Entry i must mention the church that belongs to record i.
  const expectedChurches = ['Church A', 'Church B'];
  expectedChurches.forEach((church, position) => {
    assert.ok(chunks[position].includes(church));
  });
});
// Two-line header: first line is expected as the parish, second as the location.
test('extractNames returns locationName and parishName', () => {
  const header = `Holy Cross Parish\nHOLY CROSS CHURCH`;
  const { locationName, parishName } = extractNames(header);

  assert.strictEqual(locationName, 'HOLY CROSS CHURCH');
  assert.strictEqual(parishName, 'Holy Cross Parish');
});
// Header polluted with a "Share" line and a space-prefixed duplicate of the
// location name; neither artifact may leak into the extracted names.
test('extractNames strips Share and leading-space artifacts', () => {
  const header = `Share\n Carmelite Monastery\nSt. Anne's Parish\nCarmelite Monastery`;
  const { locationName, parishName } = extractNames(header);

  assert.strictEqual(locationName, 'Carmelite Monastery');
  assert.strictEqual(parishName, "St. Anne's Parish");
});
// With only one line available it is expected as the location name, and the
// parish name is expected to be null.
test('extractNames handles single name line', () => {
  const { locationName, parishName } = extractNames(`Cathedral Parish`);

  assert.strictEqual(locationName, 'Cathedral Parish');
  assert.strictEqual(parishName, null);
});
// ─── Task 3: Field extractor ──────────────────────────────────────────────────
// Full record body with labelled Address / Phone / Fax / Email / Website
// sections; the test checks the address, phone and email that come back.
test('extractFields parses address, phone, email', () => {
  // The area code is wrapped in FULL-WIDTH parentheses (U+FF08 / U+FF09).
  // They are written as escapes so the fixture cannot be silently altered by
  // tools that hide or strip "ambiguous" Unicode characters.
  // NOTE(review): the rendered source showed no parentheses here; they were
  // restored from the expected value below — confirm against the raw file.
  const open = '\uFF08';
  const close = '\uFF09';
  const body = `Address\n1 Holy Cross Path, Shau Kei Wan, Hong Kong\n\nPhone\n${open}852${close}2560-1823\n\nFax\n${open}852${close}2535-8246\n\nEmail\nholycrosshk@gmail.com\n\nWebsite\nClick Here\n\nMass Time\n`;
  const f = extractFields(body);

  assert.equal(f.address, '1 Holy Cross Path, Shau Kei Wan, Hong Kong');
  // Expected phone uses ASCII parentheses.
  assert.equal(f.phone, '(852)2560-1823');
  assert.equal(f.email, 'holycrosshk@gmail.com');
});
// Body with an Address section only: phone and email are expected to be null.
test('extractFields handles missing fields gracefully', () => {
  const addressOnly = `Address\nSalesian School, 16 Chai Wan Road, Hong Kong.\n\nMass Time\n`;
  const { address, phone, email } = extractFields(addressOnly);

  assert.strictEqual(address, 'Salesian School, 16 Chai Wan Road, Hong Kong.');
  assert.strictEqual(phone, null);
  assert.strictEqual(email, null);
});
// Phone value whose area code is wrapped in full-width parentheses, followed
// by a second number; the expected result uses ASCII parentheses.
test('extractFields strips full-width parens from phone', () => {
  // U+FF08 / U+FF09 are written as escapes: the fixture must actually contain
  // the full-width characters this test is named after, and literal glyphs
  // are easy to lose in editors and web viewers that flag them as ambiguous.
  // NOTE(review): restored from the test title and expected value — confirm
  // against the raw file.
  const open = '\uFF08';
  const close = '\uFF09';
  const body = `Phone\n${open}852${close}2819-5777, 2819-5845\n\n`;
  const f = extractFields(body);

  assert.equal(f.phone, '(852)2819-5777, 2819-5845');
});
// ─── Task 4: Time normalizer ──────────────────────────────────────────────────
// am/pm suffix both attached to and separated from the clock time.
test('normalizeTime handles am/pm with spaces', () => {
  const cases: Array<[string, string]> = [
    ['8:00am', '08:00'],
    ['11:30 am', '11:30'],
    ['6:00pm', '18:00'],
    ['6:30 pm', '18:30'],
  ];

  for (const [input, expected] of cases) {
    assert.strictEqual(normalizeTime(input), expected);
  }
});
// Dotted "a.m." / "p.m." suffixes.
test('normalizeTime handles a.m./p.m. format', () => {
  const cases: Array<[string, string]> = [
    ['7:00 a.m.', '07:00'],
    ['7:45 a.m.', '07:45'],
    ['6:00 p.m.', '18:00'],
  ];

  for (const [input, expected] of cases) {
    assert.strictEqual(normalizeTime(input), expected);
  }
});
// Twelve o'clock hour in the afternoon: "noon" keyword and 12:xx pm.
test('normalizeTime handles noon', () => {
  const cases: Array<[string, string]> = [
    ['12:00 noon', '12:00'],
    ['12:30 pm', '12:30'],
  ];

  for (const [input, expected] of cases) {
    assert.strictEqual(normalizeTime(input), expected);
  }
});
// 12:00am is expected to map to hour zero.
test('normalizeTime handles 12:00am as midnight', () => {
  const midnight = normalizeTime('12:00am');
  assert.strictEqual(midnight, '00:00');
});
// Inputs that carry no clock time are expected to yield null.
test('normalizeTime returns null for unrecognised input', () => {
  const unparseable = ['Monday', ''];

  for (const input of unparseable) {
    assert.strictEqual(normalizeTime(input), null);
  }
});
// ─── Task 5: Schedule line parser ────────────────────────────────────────────
// One time plus a parenthesised language; the whole result object is compared.
test('parseScheduleLine parses single time with language', () => {
  const expected = { dayOfWeek: 0, time: '09:30', language: 'English', notes: null };
  const parsed = parseScheduleLine('9:30am (English)', 0);

  assert.strictEqual(parsed.length, 1);
  assert.deepStrictEqual(parsed[0], expected);
});
// Two comma-separated times followed by a single language tag.
test('parseScheduleLine parses multiple comma-separated times', () => {
  const parsed = parseScheduleLine('8:00am,10:30 am (Cantonese)', 0);
  assert.strictEqual(parsed.length, 2);

  const [first, second] = parsed;
  assert.strictEqual(first.time, '08:00');
  assert.strictEqual(second.time, '10:30');
  assert.strictEqual(second.language, 'Cantonese');
});
// Language tag whose closing parenthesis is missing in the input line.
test('parseScheduleLine handles missing closing paren', () => {
  const results = parseScheduleLine('9:30 am (Cantonese', 0);

  // Check the count first so an empty result fails as an assertion rather
  // than as a TypeError on results[0].
  assert.equal(results.length, 1);
  assert.equal(results[0].language, 'Cantonese');
});
// Line with a bare time and no language tag; English is the expected default.
test('parseScheduleLine defaults language to English when not specified', () => {
  const results = parseScheduleLine('8:00am', 0);

  // Check the count first so an empty result fails as an assertion rather
  // than as a TypeError on results[0].
  assert.equal(results.length, 1);
  assert.equal(results[0].language, 'English');
});
// Free text between the first time and the comma is expected in `notes`.
test('parseScheduleLine stores embedded note text', () => {
  const line = '9:00 am Sunday School & Family Mass,11:30am (English)';
  const parsed = parseScheduleLine(line, 0);

  assert.strictEqual(parsed.length, 2);

  const firstSlot = parsed[0];
  assert.strictEqual(firstSlot.time, '09:00');
  assert.strictEqual(firstSlot.notes, 'Sunday School & Family Mass');
});
// Each time is prefixed with the word "Saturday"; both times must be parsed.
test('parseScheduleLine handles Saturday anticipated format variations', () => {
  const parsed = parseScheduleLine('Saturday 3:45 pm,Saturday 6:30 pm (Cantonese)', 6);
  assert.strictEqual(parsed.length, 2);

  const [earlier, later] = parsed;
  assert.strictEqual(earlier.time, '15:45');
  assert.strictEqual(later.time, '18:30');
});
// The day appears after the time as an "on Saturday" suffix.
test('parseScheduleLine handles "on Saturday" suffix format', () => {
  const parsed = parseScheduleLine('6:00pm on Saturday (Cantonese)', 6);
  assert.strictEqual(parsed.length, 1);

  const only = parsed[0];
  assert.strictEqual(only.time, '18:00');
  assert.strictEqual(only.language, 'Cantonese');
});
// A "5th Sunday of the month:" prefix is expected to be kept in `notes`.
test('parseScheduleLine handles conditional prefix as notes', () => {
  const parsed = parseScheduleLine('5th Sunday of the month: 7:15 am (Tagalog)', 0);
  assert.strictEqual(parsed.length, 1);

  const only = parsed[0];
  assert.strictEqual(only.time, '07:15');
  assert.strictEqual(only.language, 'Tagalog');
  assert.ok(only.notes?.includes('5th Sunday'));
});
// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────
// No day prefix: one result per day 1..5 is expected, in that order.
test('parseWeekdayLine no prefix = all weekdays Mon-Fri', () => {
  const slots = parseWeekdayLine('7:15 am (Cantonese)');

  assert.strictEqual(slots.length, 5);
  assert.ok(slots.every(({ time }) => time === '07:15'));
  assert.ok(slots.every(({ language }) => language === 'Cantonese'));

  const days = slots.map(({ dayOfWeek }) => dayOfWeek);
  assert.deepStrictEqual(days, [1, 2, 3, 4, 5]);
});
// Comma-separated abbreviated days before the time.
test('parseWeekdayLine abbreviation list', () => {
  const results = parseWeekdayLine('Mon., Tue., Thur. 8:00 a.m. (Cantonese)');
  assert.equal(results.length, 3);

  // Order is not part of the expectation, so sort before comparing. The
  // explicit comparator is needed because the default sort compares elements
  // as strings.
  const days = results.map(r => r.dayOfWeek).sort((a, b) => a - b);
  assert.deepEqual(days, [1, 2, 4]);
});
// Abbreviated "X to Y" range; six days (1..6) are expected.
test('parseWeekdayLine abbreviation range Mon. to Sat.', () => {
  const results = parseWeekdayLine('Mon. to Sat. 9:15 am (English)');
  assert.equal(results.length, 6);

  // Order is not part of the expectation, so sort before comparing. The
  // explicit comparator is needed because the default sort compares elements
  // as strings.
  const days = results.map(r => r.dayOfWeek).sort((a, b) => a - b);
  assert.deepEqual(days, [1, 2, 3, 4, 5, 6]);
});
// Full day names with a colon before the time; five noon slots are expected.
test('parseWeekdayLine full-word range Monday to Friday', () => {
  const slots = parseWeekdayLine('Monday to Friday: 12:00 noon (English)');

  assert.strictEqual(slots.length, 5);
  assert.ok(slots.every(({ time }) => time === '12:00'));
});
// Day list that mixes a comma and an ampersand as separators.
test('parseWeekdayLine ampersand separator', () => {
  const results = parseWeekdayLine('Tue., Thur. & Sat. 9:45 a.m. (Cantonese)');
  assert.equal(results.length, 3);

  // Order is not part of the expectation, so sort before comparing. The
  // explicit comparator is needed because the default sort compares elements
  // as strings.
  const days = results.map(r => r.dayOfWeek).sort((a, b) => a - b);
  assert.deepEqual(days, [2, 4, 6]);
});
// Three "range: time" groups on one line: Mon-Sat (6) + Mon-Fri (5) + Mon-Fri (5).
test('parseWeekdayLine multiple time groups on one line', () => {
  const line = 'Monday to Saturday: 7:45 am,Monday to Friday: 12:00 noon,Monday to Friday: 6:00 pm (English)';
  const slots = parseWeekdayLine(line);

  assert.strictEqual(slots.length, 16);
});