From 8075072c241433ad08d6ac89fdf44e7b91008fd1 Mon Sep 17 00:00:00 2001
From: albertfj114
Date: Fri, 3 Apr 2026 16:25:24 -0400
Subject: [PATCH] fix: use true Jaccard similarity in wordOverlap (intersection/union)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces max(|A|,|B|) denominator with |A∪B| = |A|+|B|-intersection,
which is the correct Jaccard formula and avoids inflating similarity
when both name sets have significant unique words.

Co-Authored-By: Claude Sonnet 4.6
---
 scripts/import-hk-parishes.ts | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/import-hk-parishes.ts b/scripts/import-hk-parishes.ts
index 8bb35d0..4c7d2de 100644
--- a/scripts/import-hk-parishes.ts
+++ b/scripts/import-hk-parishes.ts
@@ -396,9 +396,10 @@ function wordOverlap(a: string, b: string): number {
   const setA = new Set(a.split(' ').filter(Boolean));
   const setB = new Set(b.split(' ').filter(Boolean));
   if (setA.size === 0 || setB.size === 0) return 0;
-  let common = 0;
-  for (const w of setA) if (setB.has(w)) common++;
-  return common / Math.max(setA.size, setB.size);
+  let intersection = 0;
+  for (const w of setA) if (setB.has(w)) intersection++;
+  const union = setA.size + setB.size - intersection;
+  return intersection / union;
 }
 
 /**