feat: add name normalizer and church matcher for HK import
normalizeName strips noise words (church/parish/chapel/etc.), accents, and punctuation for robust name comparison. findMatch uses a word-overlap score (shared words divided by the size of the larger word set, threshold 0.4) with an address-prefix fallback for Chinese-named churches where English name overlap may be low. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,8 @@ import {
|
|||||||
parseScheduleLine,
|
parseScheduleLine,
|
||||||
parseWeekdayLine,
|
parseWeekdayLine,
|
||||||
parseEntry,
|
parseEntry,
|
||||||
|
normalizeName,
|
||||||
|
findMatch,
|
||||||
} from './import-hk-parishes.js';
|
} from './import-hk-parishes.js';
|
||||||
|
|
||||||
// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
|
// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
|
||||||
@@ -208,3 +210,35 @@ test('parseEntry extracts names, fields, and schedules from a full entry', () =>
|
|||||||
const weekday = entry.schedules.filter(s => s.dayOfWeek >= 1 && s.dayOfWeek <= 5);
|
const weekday = entry.schedules.filter(s => s.dayOfWeek >= 1 && s.dayOfWeek <= 5);
|
||||||
assert.equal(weekday.length, 5);
|
assert.equal(weekday.length, 5);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────
|
||||||
|
|
||||||
|
test('normalizeName strips noise words and lowercases', () => {
  // Each row is [raw name, expected normalised form].
  const cases: [string, string][] = [
    ['HOLY CROSS CHURCH', 'holy cross'],
    ['Our Lady Of Mount Carmel Church', 'mount carmel'],
    ["St. Joseph's Parish", 'joseph'],
    ['Salesian Mass Centre', 'salesian'],
  ];

  for (const [raw, expected] of cases) {
    assert.equal(normalizeName(raw), expected);
  }
});
|
||||||
|
|
||||||
|
test('findMatch matches by name overlap', () => {
  // Two candidates whose normalised names share words with the imported names below.
  const holyCross = { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null };
  const stJoseph = { id: '2', name: 'St Joseph (Central)', address: '37 Garden Road', phone: null, email: null };
  const existing = [holyCross, stJoseph];

  const holyCrossHit = findMatch('HOLY CROSS CHURCH', '1 Holy Cross Path', existing);
  assert.equal(holyCrossHit?.id, '1');

  const stJosephHit = findMatch("St. Joseph's Church", '37 Garden Road', existing);
  assert.equal(stJosephHit?.id, '2');
});
|
||||||
|
|
||||||
|
test('findMatch falls back to address prefix match', () => {
  // The existing name is Chinese-only, so normalizeName() reduces it to '' and
  // the name-overlap score is 0. A bilingual name such as
  // '聖母聖衣堂 (Our Lady of Mount Carmel Wanchai)' would normalise to
  // 'mount carmel wanchai' and match on name alone (2/3 >= 0.4), which would
  // leave the address fallback untested.
  const existing = [
    { id: '3', name: '聖母聖衣堂', address: 'No.1, Star Street', phone: null, email: null },
  ];

  // Only the first 12 characters of the imported address ('No.1, Star S') can produce this match.
  assert.equal(findMatch('Our Lady Of Mount Carmel Church', 'No.1, Star Street, Wan Chai', existing)?.id, '3');

  // Control: with no address to fall back on, the same name must not match.
  assert.equal(findMatch('Our Lady Of Mount Carmel Church', null, existing), null);
});
|
||||||
|
|
||||||
|
test('findMatch returns null for no match', () => {
  // Single candidate sharing neither name words nor address prefix with the lookup.
  const onlyChurch = { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null };

  const result = findMatch('Salesian Mass Centre', 'Salesian School, 16 Chai Wan Road', [onlyChurch]);

  assert.equal(result, null);
});
|
||||||
|
|||||||
@@ -369,3 +369,70 @@ export function parseEntry(raw: string): ParsedEntry {
|
|||||||
|
|
||||||
return { locationName, parishName, address, phone, email, schedules };
|
return { locationName, parishName, address, phone, email, schedules };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────
|
||||||
|
|
||||||
|
/** Words dropped from a name before comparison because they carry no identifying value. */
const NOISE_WORDS = new Set([
  'church', 'parish', 'chapel', 'centre', 'center', 'mass',
  'saint', 'st', 'our', 'lady', 'of', 'the', 'a', 'an',
]);

/**
 * Reduce a church name to a canonical, space-separated form for comparison.
 *
 * Steps, in order: lowercase; decompose to NFD and drop combining marks
 * (U+0300–U+036F) so accented letters become their base letter; replace every
 * character that is not `a-z`, `0-9` or whitespace with a space; split on
 * whitespace; discard words shorter than two characters and any word in
 * `NOISE_WORDS`; re-join with single spaces.
 *
 * @param name Raw church name.
 * @returns The normalised name, or `''` when nothing survives the filtering.
 */
export function normalizeName(name: string): string {
  const unaccented = name
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '');
  const alphanumeric = unaccented.replace(/[^a-z0-9\s]/g, ' ');

  const kept: string[] = [];
  for (const word of alphanumeric.split(/\s+/)) {
    // Single-character leftovers (e.g. the "s" of a possessive) are dropped too.
    if (word.length < 2 || NOISE_WORDS.has(word)) continue;
    kept.push(word);
  }

  return kept.join(' ').trim();
}
|
||||||
|
|
||||||
|
/**
 * Score how many distinct words two space-separated strings share.
 *
 * The score is the number of shared distinct words divided by the size of the
 * larger distinct-word set, so it ranges from 0 to 1. This is not the Jaccard
 * index: the divisor is the larger set, not the union.
 *
 * @param a First space-separated word list.
 * @param b Second space-separated word list.
 * @returns The overlap score, or 0 when either side has no words.
 */
function wordOverlap(a: string, b: string): number {
  // Empty fragments from repeated spaces are discarded before de-duplication.
  const distinctWords = (text: string): Set<string> =>
    new Set(text.split(' ').filter(Boolean));

  const left = distinctWords(a);
  const right = distinctWords(b);
  if (left.size === 0 || right.size === 0) {
    return 0;
  }

  const shared = [...left].filter(word => right.has(word)).length;
  const larger = Math.max(left.size, right.size);
  return shared / larger;
}
|
||||||
|
|
||||||
|
/**
 * Pick the existing church that best corresponds to a parsed entry.
 *
 * Name matching runs first: both names are passed through `normalizeName` and
 * scored with `wordOverlap`. The highest-scoring candidate wins (the earliest
 * one on ties) provided its score is at least 0.4.
 *
 * If no name clears the threshold and `address` has at least 5 characters, the
 * lowercased first 12 characters of `address` are searched for anywhere inside
 * each candidate's lowercased address; the first candidate containing them is
 * returned.
 *
 * @param locationName Name of the parsed entry.
 * @param address Address of the parsed entry, or `null` when unknown.
 * @param existing Candidate churches to match against.
 * @returns The matched church, or `null` when neither strategy finds one.
 */
export function findMatch(
  locationName: string,
  address: string | null,
  existing: ExistingChurch[]
): ExistingChurch | null {
  const wantedName = normalizeName(locationName);

  let topChurch: ExistingChurch | null = null;
  let topScore = 0;
  for (const candidate of existing) {
    const candidateScore = wordOverlap(wantedName, normalizeName(candidate.name));
    // Strict comparison keeps the earliest candidate when scores tie.
    if (candidateScore > topScore) {
      topChurch = candidate;
      topScore = candidateScore;
    }
  }
  if (topScore >= 0.4) {
    return topChurch;
  }

  // Very short or missing addresses are too weak a signal for the fallback.
  if (!address || address.length < 5) {
    return null;
  }

  // Fallback: look for the first 12 characters of the address in each candidate's address.
  const addressStart = address.slice(0, 12).toLowerCase();
  const byAddress = existing.find(candidate =>
    candidate.address?.toLowerCase().includes(addressStart)
  );
  return byAddress ?? null;
}
|
||||||
|
|||||||
Reference in New Issue
Block a user