Files
ScraperControl/scripts/import-hk-parishes.test.ts
albertfj114 3ebbc3732f feat: add name normalizer and church matcher for HK import
normalizeName strips noise words (church/parish/chapel/etc), accents,
and punctuation for robust name comparison. findMatch uses word-overlap
Jaccard score (threshold 0.4) with address-prefix fallback for Chinese-
named churches where English name overlap may be low.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-03 16:23:58 -04:00

245 lines
11 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { test } from 'node:test';
import assert from 'node:assert/strict';
import {
splitEntries,
extractNames,
extractFields,
normalizeTime,
parseScheduleLine,
parseWeekdayLine,
parseEntry,
normalizeName,
findMatch,
} from './import-hk-parishes.js';
// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
test('splitEntries splits on Path/Close boundary', () => {
  // Fixture: a page heading followed by two entries, each containing a
  // "Path" / "Close" pair before its Address section.
  const page = `HONG KONG CHURCHES\n\nParish A\nChurch A\nPath\nClose\nAddress\n1 Main St\n\nParish B\nChurch B\nPath\nClose\nAddress\n2 Side St\n`;
  const chunks = splitEntries(page);

  assert.equal(chunks.length, 2);
  // Entries must come back in document order, each carrying its own church name.
  ['Church A', 'Church B'].forEach((churchName, index) => {
    assert.ok(chunks[index].includes(churchName));
  });
});
test('extractNames returns locationName and parishName', () => {
  // Two-line preamble: the first line is expected back as parishName,
  // the second as locationName.
  const { locationName, parishName } = extractNames(`Holy Cross Parish\nHOLY CROSS CHURCH`);

  assert.equal(locationName, 'HOLY CROSS CHURCH');
  assert.equal(parishName, 'Holy Cross Parish');
});
test('extractNames strips Share and leading-space artifacts', () => {
  // The preamble starts with a "Share" line and a space-prefixed duplicate of
  // the location name; neither may leak into the extracted names.
  const preamble = `Share\n Carmelite Monastery\nSt. Anne's Parish\nCarmelite Monastery`;
  const { locationName, parishName } = extractNames(preamble);

  assert.equal(locationName, 'Carmelite Monastery');
  assert.equal(parishName, "St. Anne's Parish");
});
test('extractNames handles single name line', () => {
  // With only one line available it is used as the location name and the
  // parish name is reported as null.
  const { locationName, parishName } = extractNames(`Cathedral Parish`);

  assert.equal(locationName, 'Cathedral Parish');
  assert.equal(parishName, null);
});
// ─── Task 3: Field extractor ──────────────────────────────────────────────────
test('extractFields parses address, phone, email', () => {
  // Entry body with every labelled section present (Address, Phone, Fax,
  // Email, Website) followed by the "Mass Time" heading.
  // NOTE(review): the Phone fixture has no parentheses around "852" while the
  // expected value does — confirm the fixture did not lose full-width
  // parentheses somewhere along the way.
  const section = `Address\n1 Holy Cross Path, Shau Kei Wan, Hong Kong\n\nPhone\n8522560-1823\n\nFax\n8522535-8246\n\nEmail\nholycrosshk@gmail.com\n\nWebsite\nClick Here\n\nMass Time\n`;
  const { address, phone, email } = extractFields(section);

  assert.equal(address, '1 Holy Cross Path, Shau Kei Wan, Hong Kong');
  assert.equal(phone, '(852)2560-1823');
  assert.equal(email, 'holycrosshk@gmail.com');
});
test('extractFields handles missing fields gracefully', () => {
  // Only an Address section is present; phone and email must be null
  // rather than throwing or picking up unrelated text.
  const section = `Address\nSalesian School, 16 Chai Wan Road, Hong Kong.\n\nMass Time\n`;
  const { address, phone, email } = extractFields(section);

  assert.equal(address, 'Salesian School, 16 Chai Wan Road, Hong Kong.');
  assert.equal(phone, null);
  assert.equal(email, null);
});
test('extractFields strips full-width parens from phone', () => {
  // The country code is wrapped in FULL-WIDTH parentheses (U+FF08 / U+FF09).
  // They are written as explicit code-point escapes so the characters stay
  // visible in review tools and cannot be silently dropped or confused with
  // ASCII "(" / ")" when the file is copied or re-encoded.
  const body = `Phone\n\u{FF08}852\u{FF09}2819-5777, 2819-5845\n\n`;
  const f = extractFields(body);

  // Expected output uses plain ASCII parentheses; the second number is kept as-is.
  assert.equal(f.phone, '(852)2819-5777, 2819-5845');
});
// ─── Task 4: Time normalizer ──────────────────────────────────────────────────
test('normalizeTime handles am/pm with spaces', () => {
  // [raw input, expected 24-hour HH:MM] — with and without a space before am/pm.
  const cases = [
    ['8:00am', '08:00'],
    ['11:30 am', '11:30'],
    ['6:00pm', '18:00'],
    ['6:30 pm', '18:30'],
  ];

  for (const [input, expected] of cases) {
    assert.equal(normalizeTime(input), expected);
  }
});
test('normalizeTime handles a.m./p.m. format', () => {
  // [raw input, expected 24-hour HH:MM] — dotted meridiem markers.
  const cases = [
    ['7:00 a.m.', '07:00'],
    ['7:45 a.m.', '07:45'],
    ['6:00 p.m.', '18:00'],
  ];

  for (const [input, expected] of cases) {
    assert.equal(normalizeTime(input), expected);
  }
});
test('normalizeTime handles noon', () => {
  // "noon" and a 12-something pm time must both stay in the 12:xx hour.
  const cases = [
    ['12:00 noon', '12:00'],
    ['12:30 pm', '12:30'],
  ];

  for (const [input, expected] of cases) {
    assert.equal(normalizeTime(input), expected);
  }
});
test('normalizeTime handles 12:00am as midnight', () => {
  // 12 am is the one case where the hour must wrap to 00.
  const midnight = normalizeTime('12:00am');
  assert.equal(midnight, '00:00');
});
test('normalizeTime returns null for unrecognised input', () => {
  // Neither a day name nor an empty string contains a time.
  for (const notATime of ['Monday', '']) {
    assert.equal(normalizeTime(notATime), null);
  }
});
// ─── Task 5: Schedule line parser ────────────────────────────────────────────
test('parseScheduleLine parses single time with language', () => {
  // The second argument (0) is expected back as dayOfWeek on the result.
  const parsed = parseScheduleLine('9:30am (English)', 0);

  assert.equal(parsed.length, 1);
  const expectedSlot = { dayOfWeek: 0, time: '09:30', language: 'English', notes: null };
  assert.deepEqual(parsed[0], expectedSlot);
});
test('parseScheduleLine parses multiple comma-separated times', () => {
  // Two times share one trailing "(Cantonese)" tag.
  const slots = parseScheduleLine('8:00am,10:30 am (Cantonese)', 0);
  assert.equal(slots.length, 2);

  const first = slots[0];
  const second = slots[1];
  assert.equal(first.time, '08:00');
  assert.equal(second.time, '10:30');
  assert.equal(second.language, 'Cantonese');
});
test('parseScheduleLine handles missing closing paren', () => {
  // The language tag is opened with "(" but never closed.
  const slots = parseScheduleLine('9:30 am (Cantonese', 0);
  const firstSlot = slots[0];
  assert.equal(firstSlot.language, 'Cantonese');
});
test('parseScheduleLine defaults language to English when not specified', () => {
  // No parenthesised language tag at all on this line.
  const slots = parseScheduleLine('8:00am', 0);
  const firstSlot = slots[0];
  assert.equal(firstSlot.language, 'English');
});
test('parseScheduleLine stores embedded note text', () => {
  // Free text between the first time and the comma must end up in `notes`
  // of that first slot, while both times are still recognised.
  const line = '9:00 am Sunday School & Family Mass,11:30am (English)';
  const slots = parseScheduleLine(line, 0);
  assert.equal(slots.length, 2);

  const withNote = slots[0];
  assert.equal(withNote.time, '09:00');
  assert.equal(withNote.notes, 'Sunday School & Family Mass');
});
test('parseScheduleLine handles Saturday anticipated format variations', () => {
  // Each time carries a leading "Saturday" word that must not block parsing.
  const slots = parseScheduleLine('Saturday 3:45 pm,Saturday 6:30 pm (Cantonese)', 6);
  assert.equal(slots.length, 2);

  const earlier = slots[0];
  const later = slots[1];
  assert.equal(earlier.time, '15:45');
  assert.equal(later.time, '18:30');
});
test('parseScheduleLine handles "on Saturday" suffix format', () => {
  // Here the day name trails the time instead of preceding it.
  const slots = parseScheduleLine('6:00pm on Saturday (Cantonese)', 6);
  assert.equal(slots.length, 1);

  const onlySlot = slots[0];
  assert.equal(onlySlot.time, '18:00');
  assert.equal(onlySlot.language, 'Cantonese');
});
test('parseScheduleLine handles conditional prefix as notes', () => {
  // A "<condition>: <time>" line: the condition text must be kept in `notes`.
  const slots = parseScheduleLine('5th Sunday of the month: 7:15 am (Tagalog)', 0);
  assert.equal(slots.length, 1);

  const onlySlot = slots[0];
  assert.equal(onlySlot.time, '07:15');
  assert.equal(onlySlot.language, 'Tagalog');
  // Only a substring is required, so the exact note wording stays flexible.
  assert.ok(onlySlot.notes?.includes('5th Sunday'));
});
// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────
test('parseWeekdayLine no prefix = all weekdays Mon-Fri', () => {
  // Without a day prefix the line is expanded to one slot per day 1..5,
  // in ascending order, all sharing the same time and language.
  const slots = parseWeekdayLine('7:15 am (Cantonese)');
  assert.equal(slots.length, 5);

  const sameTime = slots.every(slot => slot.time === '07:15');
  assert.ok(sameTime);
  const sameLanguage = slots.every(slot => slot.language === 'Cantonese');
  assert.ok(sameLanguage);

  const days = slots.map(slot => slot.dayOfWeek);
  assert.deepEqual(days, [1, 2, 3, 4, 5]);
});
test('parseWeekdayLine abbreviation list', () => {
  // Comma-separated dotted abbreviations select exactly those days.
  const results = parseWeekdayLine('Mon., Tue., Thur. 8:00 a.m. (Cantonese)');
  assert.equal(results.length, 3);

  // Result order is not part of the contract here, so compare sorted days.
  // An explicit numeric comparator is used because the default sort()
  // compares elements as strings.
  const days = results.map(r => r.dayOfWeek).sort((a, b) => a - b);
  assert.deepEqual(days, [1, 2, 4]);
});
test('parseWeekdayLine abbreviation range Mon. to Sat.', () => {
  // "<abbr> to <abbr>" is an inclusive range: one slot for every day 1..6.
  const results = parseWeekdayLine('Mon. to Sat. 9:15 am (English)');
  assert.equal(results.length, 6);

  // Result order is not part of the contract here, so compare sorted days.
  // An explicit numeric comparator is used because the default sort()
  // compares elements as strings.
  const days = results.map(r => r.dayOfWeek).sort((a, b) => a - b);
  assert.deepEqual(days, [1, 2, 3, 4, 5, 6]);
});
test('parseWeekdayLine full-word range Monday to Friday', () => {
  // Full day names with a trailing colon, and "noon" as the time.
  const slots = parseWeekdayLine('Monday to Friday: 12:00 noon (English)');
  assert.equal(slots.length, 5);

  const allAtNoon = slots.every(slot => slot.time === '12:00');
  assert.ok(allAtNoon);
});
test('parseWeekdayLine ampersand separator', () => {
  // The last day of the list is joined with "&" instead of a comma.
  const results = parseWeekdayLine('Tue., Thur. & Sat. 9:45 a.m. (Cantonese)');
  assert.equal(results.length, 3);

  // Result order is not part of the contract here, so compare sorted days.
  // An explicit numeric comparator is used because the default sort()
  // compares elements as strings.
  const days = results.map(r => r.dayOfWeek).sort((a, b) => a - b);
  assert.deepEqual(days, [2, 4, 6]);
});
test('parseWeekdayLine multiple time groups on one line', () => {
  // Three "<range>: <time>" groups separated by commas on a single line.
  // 16 = 6 (Monday to Saturday) + 5 + 5 (Monday to Friday, twice), assuming
  // every range expands to one slot per day.
  const line =
    'Monday to Saturday: 7:45 am,Monday to Friday: 12:00 noon,Monday to Friday: 6:00 pm (English)';
  const slots = parseWeekdayLine(line);
  assert.equal(slots.length, 16);
});
// ─── Task 7: Full entry parser ────────────────────────────────────────────────
test('parseEntry extracts names, fields, and schedules from a full entry', () => {
  // End-to-end fixture: name preamble, Path/Close marker, contact sections,
  // then Sunday, Anticipated, Weekday and Special mass sections.
  // NOTE(review): the Phone fixture has no parentheses around "852" while the
  // expected value does — confirm the fixture did not lose full-width
  // parentheses somewhere along the way.
  const raw = `Holy Cross Parish\nHOLY CROSS CHURCH\nPath\nClose\nAddress\n1 Holy Cross Path, Shau Kei Wan, Hong Kong\n\nPhone\n8522560-1823\n\nEmail\nholycrosshk@gmail.com\n\nWebsite\nClick Here\n\nMass Time\nSunday Masses\n8:00am,9:30am (Cantonese)\n1:00 pm (English)\n\nAnticipated Sunday Masses\nSaturday 3:45 pm,Saturday 6:30 pm (Cantonese)\n\nWeekday Masses\n7:15 am (Cantonese)\n\nSpecial Masses\nSomething irrelevant\n`;
  const entry = parseEntry(raw);

  // Names and contact fields.
  assert.equal(entry.locationName, 'HOLY CROSS CHURCH');
  assert.equal(entry.parishName, 'Holy Cross Parish');
  assert.equal(entry.address, '1 Holy Cross Path, Shau Kei Wan, Hong Kong');
  assert.equal(entry.phone, '(852)2560-1823');
  assert.equal(entry.email, 'holycrosshk@gmail.com');

  // Counts schedule rows whose dayOfWeek satisfies the given predicate.
  const countDays = (matches: (day: number) => boolean): number =>
    entry.schedules.filter(s => matches(s.dayOfWeek)).length;

  // Sunday (day 0): 2 Cantonese + 1 English = 3 rows.
  assert.equal(countDays(day => day === 0), 3);
  // Anticipated masses land on Saturday (day 6): 2 rows.
  assert.equal(countDays(day => day === 6), 2);
  // Weekday line without a day prefix: 5 rows (Mon-Fri, days 1..5).
  assert.equal(countDays(day => day >= 1 && day <= 5), 5);
});
// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────
test('normalizeName strips noise words and lowercases', () => {
  // [raw name, expected normalized form]. The expectations show that words
  // such as "church", "parish", "our lady of", "st." and "mass centre",
  // as well as the possessive "'s", are dropped.
  const cases = [
    ['HOLY CROSS CHURCH', 'holy cross'],
    ['Our Lady Of Mount Carmel Church', 'mount carmel'],
    ["St. Joseph's Parish", 'joseph'],
    ['Salesian Mass Centre', 'salesian'],
  ];

  for (const [rawName, normalized] of cases) {
    assert.equal(normalizeName(rawName), normalized);
  }
});
test('findMatch matches by name overlap', () => {
  // Existing records carry a district suffix in parentheses; the imported
  // names differ in case, punctuation and the trailing "Church".
  const candidates = [
    { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null },
    { id: '2', name: 'St Joseph (Central)', address: '37 Garden Road', phone: null, email: null },
  ];

  const holyCross = findMatch('HOLY CROSS CHURCH', '1 Holy Cross Path', candidates);
  assert.equal(holyCross?.id, '1');

  const stJoseph = findMatch("St. Joseph's Church", '37 Garden Road', candidates);
  assert.equal(stJoseph?.id, '2');
});
test('findMatch falls back to address prefix match', () => {
  // The stored name is primarily Chinese, and the stored address is a
  // prefix of the imported address.
  const candidates = [
    { id: '3', name: '聖母聖衣堂 (Our Lady of Mount Carmel Wanchai)', address: 'No.1, Star Street', phone: null, email: null },
  ];

  const hit = findMatch('Our Lady Of Mount Carmel Church', 'No.1, Star Street, Wan Chai', candidates);
  assert.equal(hit?.id, '3');
});
test('findMatch returns null for no match', () => {
  // Neither the name nor the address of the only candidate resembles the
  // query, so the result must be exactly null.
  const candidates = [
    { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null },
  ];

  const miss = findMatch('Salesian Mass Centre', 'Salesian School, 16 Chai Wan Road', candidates);
  assert.equal(miss, null);
});