diff --git a/scripts/import-hk-parishes.test.ts b/scripts/import-hk-parishes.test.ts index 40296c2..1969fdb 100644 --- a/scripts/import-hk-parishes.test.ts +++ b/scripts/import-hk-parishes.test.ts @@ -8,6 +8,8 @@ import { parseScheduleLine, parseWeekdayLine, parseEntry, + normalizeName, + findMatch, } from './import-hk-parishes.js'; // ─── Task 2: Entry splitter and name extractor ──────────────────────────────── @@ -208,3 +210,35 @@ test('parseEntry extracts names, fields, and schedules from a full entry', () => const weekday = entry.schedules.filter(s => s.dayOfWeek >= 1 && s.dayOfWeek <= 5); assert.equal(weekday.length, 5); }); + +// ─── Task 8: Name normalizer + matcher ─────────────────────────────────────── + +test('normalizeName strips noise words and lowercases', () => { + assert.equal(normalizeName('HOLY CROSS CHURCH'), 'holy cross'); + assert.equal(normalizeName('Our Lady Of Mount Carmel Church'), 'mount carmel'); + assert.equal(normalizeName("St. Joseph's Parish"), 'joseph'); + assert.equal(normalizeName('Salesian Mass Centre'), 'salesian'); +}); + +test('findMatch matches by name overlap', () => { + const existing = [ + { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null }, + { id: '2', name: 'St Joseph (Central)', address: '37 Garden Road', phone: null, email: null }, + ]; + assert.equal(findMatch('HOLY CROSS CHURCH', '1 Holy Cross Path', existing)?.id, '1'); + assert.equal(findMatch("St. 
Joseph's Church", '37 Garden Road', existing)?.id, '2'); +}); + +test('findMatch falls back to address prefix match', () => { + const existing = [ + { id: '3', name: '聖母聖衣堂 (Our Lady of Mount Carmel Wanchai)', address: 'No.1, Star Street', phone: null, email: null }, + ]; + assert.equal(findMatch('Our Lady Of Mount Carmel Church', 'No.1, Star Street, Wan Chai', existing)?.id, '3'); +}); + +test('findMatch returns null for no match', () => { + const existing = [ + { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null }, + ]; + assert.equal(findMatch('Salesian Mass Centre', 'Salesian School, 16 Chai Wan Road', existing), null); +}); diff --git a/scripts/import-hk-parishes.ts b/scripts/import-hk-parishes.ts index 6758e19..8bb35d0 100644 --- a/scripts/import-hk-parishes.ts +++ b/scripts/import-hk-parishes.ts @@ -369,3 +369,70 @@ export function parseEntry(raw: string): ParsedEntry { return { locationName, parishName, address, phone, email, schedules }; } + +// ─── Task 8: Name normalizer + matcher ─────────────────────────────────────── + +const NOISE_WORDS = new Set([ + 'church', 'parish', 'chapel', 'centre', 'center', 'mass', + 'saint', 'st', 'our', 'lady', 'of', 'the', 'a', 'an', +]); + +/** + * Normalise a church name for comparison: + * lowercase, strip accents, remove noise words, collapse whitespace. 
/**
 * Normalise a church name for comparison.
 *
 * Steps, in order: lowercase; decompose to NFD and drop combining marks so
 * accented Latin letters compare equal to their base letter; replace every
 * character outside `a-z`, `0-9` and whitespace with a space (this also
 * discards non-Latin text such as a Chinese name); split on whitespace; drop
 * one-character tokens (e.g. the `s` left over from `Joseph's`) and every
 * word listed in `NOISE_WORDS`; join the survivors with single spaces.
 *
 * @param name Raw church, parish or location name.
 * @returns The significant words separated by single spaces, or `''` when
 *   nothing significant remains.
 */
export function normalizeName(name: string): string {
  return name
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/[^a-z0-9\s]/g, ' ')
    .split(/\s+/)
    .filter(word => word.length >= 2 && !NOISE_WORDS.has(word))
    .join(' ');
}

/**
 * Score how many distinct words two normalised names share.
 *
 * @param a Space-separated words, as produced by `normalizeName`.
 * @param b Space-separated words, as produced by `normalizeName`.
 * @returns Shared distinct words divided by the size of the larger word set,
 *   in the range 0..1; 0 when either side has no words.
 */
function wordOverlap(a: string, b: string): number {
  const wordsA = new Set(a.split(' ').filter(Boolean));
  const wordsB = new Set(b.split(' ').filter(Boolean));
  if (wordsA.size === 0 || wordsB.size === 0) return 0;

  let shared = 0;
  for (const word of wordsA) {
    if (wordsB.has(word)) shared++;
  }
  return shared / Math.max(wordsA.size, wordsB.size);
}

/** Minimum `wordOverlap` score for a name match to be accepted. */
const MIN_NAME_SCORE = 0.4;

/** Number of leading address characters used as the address search key. */
const ADDRESS_PREFIX_LENGTH = 12;

/** Addresses shorter than this (after normalisation) are never used for matching. */
const MIN_ADDRESS_LENGTH = 5;

/** Lowercase an address, collapse whitespace runs to one space, and trim it. */
function normalizeAddress(address: string): string {
  return address.toLowerCase().replace(/\s+/g, ' ').trim();
}

/**
 * Build the search key for an incoming address: its first
 * `ADDRESS_PREFIX_LENGTH` normalised characters, or `null` when the address
 * is missing or too short to be a meaningful key.
 */
function addressSearchKey(address: string | null): string | null {
  if (address == null) return null;
  const normalized = normalizeAddress(address);
  // Measured after normalisation so padding cannot satisfy the length guard.
  if (normalized.length < MIN_ADDRESS_LENGTH) return null;
  return normalized.slice(0, ADDRESS_PREFIX_LENGTH);
}

/** True when the stored address, once normalised, contains the search key. */
function addressContainsKey(
  stored: string | null | undefined,
  key: string
): boolean {
  return stored != null && normalizeAddress(stored).includes(key);
}

/**
 * Find the best-matching existing church for a parsed entry.
 *
 * Matching runs in two stages:
 *  1. Name: every candidate is scored with `wordOverlap` on normalised names.
 *     The highest score wins; when several candidates tie, one whose address
 *     contains the entry's address key is preferred, otherwise the earliest
 *     in `existing` is kept. The winner is returned if its score is at least
 *     `MIN_NAME_SCORE`.
 *  2. Address fallback: otherwise the first candidate whose normalised
 *     address contains the entry's address key is returned.
 *
 * Addresses are compared case-insensitively with whitespace collapsed, so
 * padding or double spaces do not prevent a match.
 *
 * @param locationName Name of the parsed entry.
 * @param address Address of the parsed entry, or `null` when unknown.
 * @param existing Candidate churches to match against.
 * @returns The matched church, or `null` if neither stage finds one.
 */
export function findMatch(
  locationName: string,
  address: string | null,
  existing: ExistingChurch[]
): ExistingChurch | null {
  const targetName = normalizeName(locationName);
  const addressKey = addressSearchKey(address);

  let best: ExistingChurch | null = null;
  let bestScore = 0;
  let bestSharesAddress = false;

  for (const church of existing) {
    const score = wordOverlap(targetName, normalizeName(church.name));
    // Candidates with no shared word, or a strictly worse score, cannot win.
    if (score === 0 || score < bestScore) continue;

    const sharesAddress =
      addressKey !== null && addressContainsKey(church.address, addressKey);
    // On an exact tie only an address hit may displace the current best,
    // which keeps the earliest candidate when the address cannot decide.
    if (score > bestScore || (sharesAddress && !bestSharesAddress)) {
      best = church;
      bestScore = score;
      bestSharesAddress = sharesAddress;
    }
  }

  if (best !== null && bestScore >= MIN_NAME_SCORE) return best;

  if (addressKey !== null) {
    const byAddress = existing.find(church =>
      addressContainsKey(church.address, addressKey)
    );
    return byAddress ?? null;
  }

  return null;
}