fix: use true Jaccard similarity in wordOverlap (intersection/union)
Replaces the max(|A|, |B|) denominator with |A ∪ B| = |A| + |B| − |A ∩ B|, which is the correct Jaccard formula and avoids inflating similarity when both name sets contain a significant number of unique words. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -396,9 +396,10 @@ function wordOverlap(a: string, b: string): number {
|
||||
const setA = new Set(a.split(' ').filter(Boolean));
|
||||
const setB = new Set(b.split(' ').filter(Boolean));
|
||||
if (setA.size === 0 || setB.size === 0) return 0;
|
||||
let common = 0;
|
||||
for (const w of setA) if (setB.has(w)) common++;
|
||||
return common / Math.max(setA.size, setB.size);
|
||||
let intersection = 0;
|
||||
for (const w of setA) if (setB.has(w)) intersection++;
|
||||
const union = setA.size + setB.size - intersection;
|
||||
return intersection / union;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user