feat: add HK parish parser functions (Tasks 2-6) with tests

Implements entry splitter, name extractor, field extractor, time normalizer, schedule line parser, and weekday day-prefix parser. All 26 tests pass.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-03 16:06:26 -04:00
parent 9aea12f4b0
commit 328d146201
2 changed files with 447 additions and 0 deletions
--- a/scripts/import-hk-parishes.test.ts
+++ b/scripts/import-hk-parishes.test.ts
@@ -0,0 +1,188 @@
+import { test } from 'node:test';
+import assert from 'node:assert/strict';
+import {
+  splitEntries,
+  extractNames,
+  extractFields,
+  normalizeTime,
+  parseScheduleLine,
+  parseWeekdayLine,
+} from './import-hk-parishes.js';
+
+// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
+
+test('splitEntries splits on Path/Close boundary', () => {
+  // Fixture: a heading line, then two entries that each contain a Path/Close marker pair.
+  const fixture = `HONG KONG CHURCHES\n\nParish A\nChurch A\nPath\nClose\nAddress\n1 Main St\n\nParish B\nChurch B\nPath\nClose\nAddress\n2 Side St\n`;
+  const chunks = splitEntries(fixture);
+  assert.equal(chunks.length, 2);
+  ['Church A', 'Church B'].forEach((name, i) => assert.ok(chunks[i].includes(name)));
+});
+
+test('extractNames returns locationName and parishName', () => {
+  // Two-line preamble: the first line is expected as the parish, the second as the location.
+  const names = extractNames(`Holy Cross Parish\nHOLY CROSS CHURCH`);
+  assert.equal(names.locationName, 'HOLY CROSS CHURCH');
+  assert.equal(names.parishName, 'Holy Cross Parish');
+});
+
+test('extractNames strips Share and leading-space artifacts', () => {
+  // The "Share" line and the space-indented duplicate name must not end up in either field.
+  const names = extractNames(`Share\n Carmelite Monastery\nSt. Anne's Parish\nCarmelite Monastery`);
+  assert.equal(names.locationName, 'Carmelite Monastery');
+  assert.equal(names.parishName, "St. Anne's Parish");
+});
+
+test('extractNames handles single name line', () => {
+  // A lone line is expected as the location name; parishName is expected to be null.
+  const names = extractNames(`Cathedral Parish`);
+  assert.equal(names.locationName, 'Cathedral Parish');
+  assert.equal(names.parishName, null);
+});
+
+// ─── Task 3: Field extractor ──────────────────────────────────────────────────
+
+test('extractFields parses address, phone, email', () => {
+  // Entry body made of label lines (Address, Phone, Fax, Email, Website), each followed by its value.
+  const fields = extractFields(`Address\n1 Holy Cross Path, Shau Kei Wan, Hong Kong\n\nPhone\n（852）2560-1823\n\nFax\n（852）2535-8246\n\nEmail\nholycrosshk@gmail.com\n\nWebsite\nClick Here\n\nMass Time\n`);
+  assert.equal(fields.address, '1 Holy Cross Path, Shau Kei Wan, Hong Kong');
+  assert.equal(fields.phone, '(852)2560-1823');
+  assert.equal(fields.email, 'holycrosshk@gmail.com');
+});
+
+test('extractFields handles missing fields gracefully', () => {
+  // Only an Address section is present, so phone and email are expected to be null.
+  const fields = extractFields(`Address\nSalesian School, 16 Chai Wan Road, Hong Kong.\n\nMass Time\n`);
+  assert.equal(fields.address, 'Salesian School, 16 Chai Wan Road, Hong Kong.');
+  assert.equal(fields.phone, null);
+  assert.equal(fields.email, null);
+});
+
+test('extractFields strips full-width parens from phone', () => {
+  // Full-width （ ） in the input are expected back as ASCII ( ); the second number is kept as-is.
+  const { phone } = extractFields(`Phone\n（852）2819-5777, 2819-5845\n\n`);
+  assert.equal(phone, '(852)2819-5777, 2819-5845');
+});
+
+// ─── Task 4: Time normalizer ──────────────────────────────────────────────────
+
+test('normalizeTime handles am/pm with spaces', () => {
+  // [input, expected 24-hour string] pairs, with and without a space before am/pm.
+  const cases = [['8:00am', '08:00'], ['11:30 am', '11:30'], ['6:00pm', '18:00'], ['6:30 pm', '18:30']] as const;
+  // Pairs are checked in order, so the first mismatch is the one reported.
+  for (const [input, expected] of cases) assert.equal(normalizeTime(input), expected);
+});
+
+test('normalizeTime handles a.m./p.m. format', () => {
+  // Dotted a.m./p.m. markers are expected to give the same 24-hour output as plain am/pm.
+  const cases = [['7:00 a.m.', '07:00'], ['7:45 a.m.', '07:45'], ['6:00 p.m.', '18:00']] as const;
+  for (const [input, expected] of cases) assert.equal(normalizeTime(input), expected);
+});
+
+test('normalizeTime handles noon', () => {
+  // Hour 12 written with "noon" or "pm" is expected to stay 12.
+  for (const [input, expected] of ([['12:00 noon', '12:00'], ['12:30 pm', '12:30']] as const)) assert.equal(normalizeTime(input), expected);
+});
+
+// 12:00am is expected to come back as '00:00' rather than '12:00'.
+test('normalizeTime handles 12:00am as midnight', () =>
+  assert.equal(normalizeTime('12:00am'), '00:00'));
+
+test('normalizeTime returns null for unrecognised input', () => {
+  // Neither a bare day name nor the empty string contains a time; both are expected to give null.
+  for (const input of ['Monday', '']) assert.equal(normalizeTime(input), null);
+});
+
+// ─── Task 5: Schedule line parser ────────────────────────────────────────────
+
+test('parseScheduleLine parses single time with language', () => {
+  const slots = parseScheduleLine('9:30am (English)', 0); // the 0 passed here is expected back as dayOfWeek
+  assert.equal(slots.length, 1); // one time on the line, one slot
+  assert.deepEqual(slots[0], { dayOfWeek: 0, time: '09:30', language: 'English', notes: null });
+});
+
+test('parseScheduleLine parses multiple comma-separated times', () => {
+  // One slot per comma-separated time; the trailing "(Cantonese)" is checked on the last slot.
+  const slots = parseScheduleLine('8:00am,10:30 am (Cantonese)', 0);
+  assert.equal(slots.length, 2);
+  ['08:00', '10:30'].forEach((time, i) => assert.equal(slots[i].time, time));
+  assert.equal(slots[1].language, 'Cantonese');
+});
+
+test('parseScheduleLine handles missing closing paren', () => {
+  const [slot] = parseScheduleLine('9:30 am (Cantonese', 0); // the "(" is never closed in this input
+  assert.equal(slot.language, 'Cantonese');
+});
+
+test('parseScheduleLine defaults language to English when not specified', () => {
+  const [slot] = parseScheduleLine('8:00am', 0); // no parenthesised language on the line
+  assert.equal(slot.language, 'English');
+});
+
+test('parseScheduleLine stores embedded note text', () => {
+  const slots = parseScheduleLine('9:00 am Sunday School & Family Mass,11:30am (English)', 0);
+  assert.equal(slots.length, 2); // one slot per comma-separated time
+  assert.equal(slots[0].time, '09:00');
+  assert.equal(slots[0].notes, 'Sunday School & Family Mass'); // the text that follows the first time
+});
+
+test('parseScheduleLine handles Saturday anticipated format variations', () => {
+  // Each segment repeats the word "Saturday" before its time; only count and times are asserted.
+  const slots = parseScheduleLine('Saturday 3:45 pm,Saturday 6:30 pm (Cantonese)', 6);
+  assert.equal(slots.length, 2);
+  ['15:45', '18:30'].forEach((time, i) => assert.equal(slots[i].time, time));
+});
+
+test('parseScheduleLine handles "on Saturday" suffix format', () => {
+  const slots = parseScheduleLine('6:00pm on Saturday (Cantonese)', 6);
+  assert.equal(slots.length, 1); // the "on Saturday" words must not yield a second slot
+  assert.equal(slots[0].time, '18:00');
+  assert.equal(slots[0].language, 'Cantonese');
+});
+
+test('parseScheduleLine handles conditional prefix as notes', () => {
+  const slots = parseScheduleLine('5th Sunday of the month: 7:15 am (Tagalog)', 0);
+  assert.equal(slots.length, 1); // the text before the colon must not become its own slot
+  assert.equal(slots[0].time, '07:15');
+  assert.equal(slots[0].language, 'Tagalog');
+  assert.ok(slots[0].notes?.includes('5th Sunday')); // substring check only; null notes fails here
+});
+
+// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────
+
+test('parseWeekdayLine no prefix = all weekdays Mon-Fri', () => {
+  const slots = parseWeekdayLine('7:15 am (Cantonese)');
+  assert.equal(slots.length, 5);
+  assert.ok(slots.every(({ time }) => time === '07:15'));
+  assert.ok(slots.every(({ language }) => language === 'Cantonese'));
+  assert.deepEqual(slots.map(({ dayOfWeek }) => dayOfWeek), [1, 2, 3, 4, 5]); // order-sensitive: no sort applied
+});
+
+test('parseWeekdayLine abbreviation list', () => {
+  const results = parseWeekdayLine('Mon., Tue., Thur. 8:00 a.m. (Cantonese)'); // three named days, one time
+  assert.equal(results.length, 3);
+  assert.deepEqual(results.map(r => r.dayOfWeek).sort((a, b) => a - b), [1, 2, 4]); // numeric sort; bare sort() compares as strings
+});
+
+test('parseWeekdayLine abbreviation range Mon. to Sat.', () => {
+  const results = parseWeekdayLine('Mon. to Sat. 9:15 am (English)'); // range is expected to include both end days
+  assert.equal(results.length, 6);
+  assert.deepEqual(results.map(r => r.dayOfWeek).sort((a, b) => a - b), [1, 2, 3, 4, 5, 6]); // numeric sort; bare sort() compares as strings
+});
+
+test('parseWeekdayLine full-word range Monday to Friday', () => {
+  const slots = parseWeekdayLine('Monday to Friday: 12:00 noon (English)');
+  assert.equal(slots.length, 5); // five slots: the range counts both named end days
+  assert.ok(slots.every(({ time }) => time === '12:00'));
+});
+
+test('parseWeekdayLine ampersand separator', () => {
+  const results = parseWeekdayLine('Tue., Thur. & Sat. 9:45 a.m. (Cantonese)'); // "&" joins the last day to the list
+  assert.equal(results.length, 3);
+  assert.deepEqual(results.map(r => r.dayOfWeek).sort((a, b) => a - b), [2, 4, 6]); // numeric sort; bare sort() compares as strings
+});
+
+test('parseWeekdayLine multiple time groups on one line', () => {
+  // Three comma-separated groups: Mon-Sat (6 days) + Mon-Fri (5) + Mon-Fri (5) = 16 slots.
+  assert.equal(parseWeekdayLine('Monday to Saturday: 7:45 am,Monday to Friday: 12:00 noon,Monday to Friday: 6:00 pm (English)').length, 16);
+});