feat: add name normalizer and church matcher for HK import

normalizeName strips noise words (church/parish/chapel/etc.), accents,
and punctuation for robust name comparison. findMatch scores candidates
by word overlap (shared words divided by the larger word-set size, not
strict Jaccard; threshold 0.4), with an address-prefix fallback for
Chinese-named churches where English name overlap may be low.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
albertfj114
2026-04-03 16:23:58 -04:00
parent eedb442e78
commit 3ebbc3732f
2 changed files with 101 additions and 0 deletions

View File

@@ -369,3 +369,70 @@ export function parseEntry(raw: string): ParsedEntry {
return { locationName, parishName, address, phone, email, schedules };
}
// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────
// Generic words that carry no identifying information in a church name and
// are therefore dropped before two names are compared.
const NOISE_WORDS = new Set([
  'church',
  'parish',
  'chapel',
  'centre',
  'center',
  'mass',
  'saint',
  'st',
  'our',
  'lady',
  'of',
  'the',
  'a',
  'an',
]);

/**
 * Reduce a church name to a canonical form suitable for comparison.
 *
 * The name is lowercased, combining accent marks are removed, and every
 * character other than `a-z`, `0-9` or whitespace is turned into a space
 * (so non-Latin text disappears entirely). The remaining tokens are kept
 * only when they are at least two characters long and not a noise word.
 *
 * @param name - Raw church name.
 * @returns The surviving tokens joined by single spaces; may be empty.
 */
export function normalizeName(name: string): string {
  // Decompose accented letters, then drop the combining marks.
  const unaccented = name
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '');
  const alphanumeric = unaccented.replace(/[^a-z0-9\s]/g, ' ');

  const kept: string[] = [];
  for (const token of alphanumeric.split(/\s+/)) {
    // Short tokens cover empty strings and stray letters such as the
    // "s" left behind by a possessive apostrophe.
    if (token.length < 2 || NOISE_WORDS.has(token)) continue;
    kept.push(token);
  }
  return kept.join(' ').trim();
}
/**
 * Score how many distinct words two space-separated strings share.
 *
 * @param a - First string, words separated by single spaces.
 * @param b - Second string, words separated by single spaces.
 * @returns The number of shared distinct words divided by the size of the
 *   larger distinct-word set; 0 when either side has no words.
 */
function wordOverlap(a: string, b: string): number {
  const toWordSet = (text: string): Set<string> =>
    new Set(text.split(' ').filter(token => token !== ''));

  const left = toWordSet(a);
  const right = toWordSet(b);
  if (left.size === 0 || right.size === 0) return 0;

  const shared = [...left].filter(word => right.has(word)).length;
  return shared / Math.max(left.size, right.size);
}
/**
 * Pick the existing church that best corresponds to a parsed entry.
 *
 * Names are compared first: both sides are normalised and scored by word
 * overlap, and the highest-scoring church wins if its score reaches 0.4
 * (the earliest church wins ties). Otherwise, when an address of at least
 * five characters is supplied, the first church whose lowercased address
 * contains the first 12 characters of that address is returned.
 *
 * @param locationName - Name of the parsed location.
 * @param address - Address of the parsed location, or null if unknown.
 * @param existing - Candidate churches to match against.
 * @returns The matched church, or null when neither strategy succeeds.
 */
export function findMatch(
  locationName: string,
  address: string | null,
  existing: ExistingChurch[]
): ExistingChurch | null {
  const minNameScore = 0.4;
  const minAddressLength = 5;
  const addressPrefixLength = 12;

  const wanted = normalizeName(locationName);

  // Strictly-greater comparison keeps the first church among equal scores.
  const top = existing.reduce<{ church: ExistingChurch | null; score: number }>(
    (leader, candidate) => {
      const score = wordOverlap(wanted, normalizeName(candidate.name));
      return score > leader.score ? { church: candidate, score } : leader;
    },
    { church: null, score: 0 }
  );
  if (top.score >= minNameScore) return top.church;

  // Fallback: look for the leading slice of the address inside each
  // existing church's address.
  if (!address || address.length < minAddressLength) return null;
  const needle = address.slice(0, addressPrefixLength).toLowerCase();
  const byAddress = existing.find(candidate =>
    candidate.address?.toLowerCase().includes(needle)
  );
  return byAddress ?? null;
}