85 lines
2.2 KiB
TypeScript
85 lines
2.2 KiB
TypeScript
/** Returns true when `value` is a non-null, non-array object (i.e. a JSON object). */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Extract JSON items from raw model output texts and merge them into a single list.
 *
 * Each text is handled independently, in two steps:
 *
 * 1. The span from the first `[` to the last `]` is parsed as JSON. If it is an
 *    array, every element that is a JSON object is collected (primitives, `null`
 *    and nested arrays are ignored).
 * 2. If step 1 collected nothing, the span from the first `{` to the last `}` is
 *    parsed as JSON and kept only when it has a truthy `name` property.
 *
 * Texts that cannot be parsed either way contribute no items; parse errors are
 * deliberately swallowed because the input is free-form model output.
 *
 * @param rawTexts Raw output strings, each possibly containing JSON surrounded by prose.
 * @returns All extracted objects, in input order.
 */
export function parseJsonItems(rawTexts: string[]): Record<string, unknown>[] {
  const allItems: Record<string, unknown>[] = [];

  for (const text of rawTexts) {
    // Try array first.
    try {
      const start = text.indexOf("[");
      const end = text.lastIndexOf("]");
      if (start !== -1 && end > start) {
        const parsed: unknown = JSON.parse(text.slice(start, end + 1));
        if (Array.isArray(parsed)) {
          const objects = parsed.filter(isJsonObject);
          // Only skip the single-object fallback when the array actually produced
          // items. Otherwise the brackets may just be an array-valued field inside
          // a single object (e.g. {"name": "x", "tags": ["a"]}), which would be
          // silently dropped if we stopped here.
          if (objects.length > 0) {
            for (const obj of objects) {
              allItems.push(obj);
            }
            continue;
          }
        }
      }
    } catch {
      // Not a parseable array: fall through to the single-object attempt.
    }

    // Try single object.
    try {
      const start = text.indexOf("{");
      const end = text.lastIndexOf("}");
      if (start !== -1 && end > start) {
        const parsed: unknown = JSON.parse(text.slice(start, end + 1));
        // A lone object is only accepted when it carries a truthy `name`.
        if (isJsonObject(parsed) && parsed.name) {
          allItems.push(parsed);
        }
      }
    } catch {
      // Unparseable: this text contributes nothing.
    }
  }

  return allItems;
}
|
|
|
|
/**
 * Split a name into the set of its lowercase, whitespace-delimited tokens.
 *
 * Empty tokens are discarded, so an empty or whitespace-only name yields an
 * empty set and leading/trailing whitespace does not add a phantom token.
 *
 * @param name Text to tokenize.
 * @returns The distinct lowercase tokens of `name`.
 */
function tokenSet(name: string): Set<string> {
  // split(/\s+/) emits "" for leading/trailing whitespace and for "" itself;
  // those are not real tokens and would distort overlap ratios.
  return new Set(
    name
      .toLowerCase()
      .split(/\s+/)
      .filter((token) => token.length > 0)
  );
}
|
|
|
|
/**
 * Remove near-duplicate items based on token overlap of a name-like field.
 *
 * Items are processed in order. An item is dropped when the value under `key`,
 * stringified and trimmed, is empty, or when its token set overlaps an
 * already-kept item's token set by more than `threshold`. Overlap is the number
 * of shared tokens divided by the size of the larger token set.
 *
 * @param items Items to deduplicate; the array is not modified.
 * @param key Property whose value is compared. Defaults to `"name"`.
 * @param threshold Overlap ratio above which two items count as duplicates.
 *   Defaults to `0.8`.
 * @returns The kept items, in their original relative order.
 */
export function deduplicateItems(
  items: Record<string, unknown>[],
  key: string = "name",
  threshold: number = 0.8
): Record<string, unknown>[] {
  const unique: Record<string, unknown>[] = [];
  // Token sets of the kept items, index-aligned with `unique`. Caching them
  // avoids re-tokenizing every kept item for each candidate, and guarantees both
  // sides of a comparison are built from the same trimmed name.
  const uniqueTokens: Set<string>[] = [];

  for (const item of items) {
    const name = String(item[key] || "").trim();
    if (!name) continue;

    const tokens = tokenSet(name);

    const isDup = uniqueTokens.some((existingTokens) => {
      // Guard against dividing by zero; an empty set can never match anything.
      if (tokens.size === 0 || existingTokens.size === 0) return false;

      let shared = 0;
      for (const token of tokens) {
        if (existingTokens.has(token)) shared++;
      }
      return shared / Math.max(tokens.size, existingTokens.size) > threshold;
    });

    if (!isDup) {
      unique.push(item);
      uniqueTokens.push(tokens);
    }
  }

  return unique;
}
|