feat: add name normalizer and church matcher for HK import

normalizeName strips noise words (church/parish/chapel/etc.), accents, and punctuation for robust name comparison. findMatch scores candidates by word overlap (shared words divided by the size of the larger word set; threshold 0.4), with an address-prefix fallback for Chinese-named churches whose English names may overlap too little.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-03 16:23:58 -04:00
parent eedb442e78
commit 3ebbc3732f
2 changed files with 101 additions and 0 deletions
--- a/scripts/import-hk-parishes.ts
+++ b/scripts/import-hk-parishes.ts
@@ -369,3 +369,70 @@ export function parseEntry(raw: string): ParsedEntry {

  return { locationName, parishName, address, phone, email, schedules };
 }
+
+// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────
+
+/**
+ * Generic words that carry no identifying information in a church name.
+ * normalizeName discards any word found in this set before names are compared.
+ */
+const NOISE_WORDS = new Set<string>([
+  // Building / institution words
+  'church', 'parish', 'chapel', 'centre', 'center', 'mass',
+  // Honorifics and titles
+  'saint', 'st', 'our', 'lady',
+  // Articles and prepositions
+  'of', 'the', 'a', 'an',
+]);
+
+/**
+ * Reduce a church name to a canonical form suitable for fuzzy comparison.
+ *
+ * The name is lowercased, diacritics are removed, every character other than
+ * a-z, 0-9 or whitespace is turned into a space, and the remaining words are
+ * kept only if they are at least two characters long and not in NOISE_WORDS.
+ *
+ * @param name - Raw church name.
+ * @returns The surviving words joined by single spaces. The result is empty
+ *   when no word survives, e.g. for a name with no Latin letters or digits.
+ */
+export function normalizeName(name: string): string {
+  // NFD splits an accented letter into base letter + combining mark
+  // (U+0300-U+036F); deleting the marks leaves the plain letter behind.
+  const unaccented = name.toLowerCase().normalize('NFD').replace(/[\u0300-\u036f]/g, '');
+  const lettersAndDigits = unaccented.replace(/[^a-z0-9\s]/g, ' ');
+
+  const kept: string[] = [];
+  for (const word of lettersAndDigits.split(/\s+/)) {
+    // The length check also drops the empty strings split() yields at the edges.
+    if (word.length < 2 || NOISE_WORDS.has(word)) continue;
+    kept.push(word);
+  }
+  return kept.join(' ').trim();
+}
+
+/**
+ * Score how many distinct words two space-separated strings have in common.
+ *
+ * The score is the number of distinct words present in both strings divided
+ * by the distinct-word count of the larger string, so it lies between 0 and 1.
+ * The denominator is the larger set, not the union, so this is not a Jaccard
+ * index.
+ *
+ * @param a - First string, words separated by spaces.
+ * @param b - Second string, words separated by spaces.
+ * @returns 0 when either string has no words, otherwise shared / max size.
+ */
+function wordOverlap(a: string, b: string): number {
+  // Empty tokens (from repeated or edge spaces) are not words.
+  const toWordSet = (text: string): Set<string> =>
+    new Set(text.split(' ').filter(token => token !== ''));
+
+  const left = toWordSet(a);
+  const right = toWordSet(b);
+  if (left.size === 0 || right.size === 0) return 0;
+
+  const shared = [...left].filter(word => right.has(word)).length;
+  return shared / Math.max(left.size, right.size);
+}
+
+/**
+ * Find the best-matching existing church for a parsed entry.
+ *
+ * Names are compared first: both sides go through normalizeName and are scored
+ * with wordOverlap. The highest-scoring church is returned if its score is at
+ * least 0.4; on a tie the earliest church in `existing` wins.
+ *
+ * If no name scores high enough (for instance when the name normalises to an
+ * empty string), the address is used instead: the first 12 characters of the
+ * trimmed address are searched for, case-insensitively, inside each existing
+ * church's address, and the first church containing them is returned.
+ * Addresses shorter than 5 characters after trimming are ignored.
+ *
+ * @param locationName - Name of the parsed location.
+ * @param address - Address of the parsed location, or null when unknown.
+ * @param existing - Candidate churches to match against.
+ * @returns The matched church, or null if neither strategy finds one.
+ */
+export function findMatch(
+  locationName: string,
+  address: string | null,
+  existing: ExistingChurch[]
+): ExistingChurch | null {
+  const NAME_SCORE_THRESHOLD = 0.4;
+  const ADDRESS_PREFIX_LENGTH = 12;
+  const MIN_ADDRESS_LENGTH = 5;
+
+  const normTarget = normalizeName(locationName);
+  let best: ExistingChurch | null = null;
+  let bestScore = 0;
+
+  // An empty target scores 0 against everything, so skip the scan entirely.
+  if (normTarget !== '') {
+    for (const church of existing) {
+      const score = wordOverlap(normTarget, normalizeName(church.name));
+      // Strict comparison keeps the earliest church when scores tie.
+      if (score > bestScore) {
+        bestScore = score;
+        best = church;
+      }
+    }
+  }
+
+  if (bestScore >= NAME_SCORE_THRESHOLD) return best;
+
+  // Fallback: address prefix match. Trim first so surrounding whitespace
+  // neither counts toward the minimum length nor ends up inside the prefix
+  // (a blank or padded address must not match on runs of spaces).
+  const trimmedAddress = address?.trim() ?? '';
+  if (trimmedAddress.length >= MIN_ADDRESS_LENGTH) {
+    const addrPrefix = trimmedAddress.slice(0, ADDRESS_PREFIX_LENGTH).toLowerCase();
+    for (const church of existing) {
+      if (church.address?.toLowerCase().includes(addrPrefix)) return church;
+    }
+  }
+
+  return null;
+}