Initial commit: Vision scanner for shelf/pantry product extraction
This commit is contained in:
39
src/chroma.ts
Normal file
39
src/chroma.ts
Normal file
@@ -0,0 +1,39 @@
|
||||
import { ChromaClient, Collection } from "chromadb";
|
||||
import { config } from "./config";
|
||||
|
||||
/** ChromaDB client, created on the first collection lookup. */
let client: ChromaClient | null = null;
/** Cached handle to the "product_images" collection. */
let collection: Collection | null = null;

/**
 * Return the shared "product_images" collection.
 *
 * On first use this builds a client pointed at `config.chromaHost` and
 * fetches (or creates) the collection with cosine distance; later calls
 * return the cached handle.
 */
async function getCollection(): Promise<Collection> {
  if (collection) {
    return collection;
  }

  client = new ChromaClient({ path: config.chromaHost });
  collection = await client.getOrCreateCollection({
    name: "product_images",
    metadata: { "hnsw:space": "cosine" },
  });
  return collection;
}
|
||||
|
||||
/**
 * The parts of a ChromaDB query response that this service reads.
 *
 * Each field is a nested array; callers in this codebase read `[0][0]`
 * for the best hit of their single query.
 */
export interface ChromaQueryResult {
  /** Record ids. */
  ids: Array<Array<string>>;
  /** Distances for the returned records, or null when not provided. */
  distances: Array<Array<number>> | null;
  /** Metadata for the returned records; single entries may be null. */
  metadatas: Array<Array<Record<string, unknown> | null>> | null;
}
|
||||
|
||||
/**
 * Query the product collection for the nearest stored embeddings.
 *
 * @param queryEmbeddings Embedding vectors to search with.
 * @param nResults Number of results requested (default 3).
 * @returns The client's response, viewed as a ChromaQueryResult.
 */
export async function queryProducts(
  queryEmbeddings: number[][],
  nResults: number = 3
): Promise<ChromaQueryResult> {
  const productCollection = await getCollection();
  const response = productCollection.query({ queryEmbeddings, nResults });
  // The client's own response type is cast to the narrower shape callers use.
  return response as unknown as ChromaQueryResult;
}
|
||||
|
||||
/**
 * Count the records in the product collection.
 *
 * Best-effort: any failure (connection, collection lookup, count) is logged
 * and reported as 0 rather than thrown, so callers never see a rejection.
 *
 * @returns The record count, or 0 when ChromaDB could not be queried.
 */
export async function getCount(): Promise<number> {
  try {
    const coll = await getCollection();
    return await coll.count();
  } catch (err) {
    // Previously swallowed silently; surface the reason so an outage is visible.
    console.warn("ChromaDB count failed:", err instanceof Error ? err.message : err);
    return 0;
  }
}
|
||||
14
src/config.ts
Normal file
14
src/config.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
/**
 * Read an integer setting from the environment.
 *
 * Falls back to `fallback` when the variable is unset, empty, not a number,
 * or below `min`, so a typo can never leak NaN (or a zero batch size) into
 * the rest of the service.
 */
function intFromEnv(name: string, fallback: number, min: number = 0): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") {
    return fallback;
  }

  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed) || parsed < min) {
    console.warn(`Ignoring invalid ${name}="${raw}", using ${fallback}`);
    return fallback;
  }
  return parsed;
}

/**
 * Service configuration, resolved once at module load from environment
 * variables with built-in defaults.
 */
export const config = {
  port: intFromEnv("PORT", 8002),
  visionAiUrl: process.env.VISION_AI_URL || "http://localhost:8000/v1/chat/completions",
  visionAiModel: process.env.VISION_AI_MODEL || "qwen2.5vl-it:3b",
  visionAiTimeout: intFromEnv("VISION_AI_TIMEOUT", 120000, 1),
  ollamaHost: process.env.OLLAMA_HOST || "http://192.168.0.15:11434",
  ollamaEmbedModel: process.env.OLLAMA_EMBED_MODEL || "nomic-embed-text",
  chromaHost: process.env.CHROMA_HOST || "http://192.168.0.15:8000",
  geminiApiKey: process.env.GEMINI_API_KEY || "",
  geminiModel: process.env.GEMINI_MODEL || "gemini-2.5-flash",
  // Must be >= 1: it is used as the batch step when tiles are scanned.
  maxConcurrentTiles: intFromEnv("MAX_CONCURRENT_TILES", 4, 1),
  uploadDir: process.env.UPLOAD_DIR || "uploads",
  maxFileSize: 10 * 1024 * 1024, // 10MB
};
|
||||
30
src/embeddings.ts
Normal file
30
src/embeddings.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
import { config } from "./config";
|
||||
|
||||
/**
 * Request a text embedding from the Ollama `/api/embed` endpoint.
 *
 * The request is aborted after 60 seconds.
 *
 * @param text Text to embed.
 * @returns The first embedding vector in the response.
 * @throws Error when the request fails or is aborted, the HTTP status is not
 *   OK, or the response does not contain a numeric embedding vector.
 */
export async function embed(text: string): Promise<number[]> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 60000);

  try {
    const response = await fetch(`${config.ollamaHost}/api/embed`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      signal: controller.signal,
      body: JSON.stringify({
        model: config.ollamaEmbedModel,
        input: text,
      }),
    });

    if (!response.ok) {
      throw new Error(`Ollama embed returned ${response.status}`);
    }

    const data = (await response.json()) as { embeddings?: unknown } | null;
    const first: unknown = Array.isArray(data?.embeddings) ? data.embeddings[0] : undefined;

    // Fail here instead of handing `undefined` to the vector query downstream.
    if (
      !Array.isArray(first) ||
      first.length === 0 ||
      !first.every((value) => typeof value === "number")
    ) {
      throw new Error("Ollama embed response did not contain an embedding vector");
    }

    return first as number[];
  } finally {
    clearTimeout(timeout);
  }
}
|
||||
192
src/enrich.ts
Normal file
192
src/enrich.ts
Normal file
@@ -0,0 +1,192 @@
|
||||
import sharp from "sharp";
|
||||
import { readBarcodes, type ReaderOptions } from "zxing-wasm/reader";
|
||||
import { callVision } from "./vision";
|
||||
|
||||
/**
 * Options passed to every `readBarcodes` call: the retail barcode formats
 * to look for, at most one symbol per image, and all retry flags enabled.
 */
const READER_OPTIONS: ReaderOptions = {
  formats: ["EAN-13", "EAN-8", "UPC-A", "UPC-E", "Code 128"],
  maxNumberOfSymbols: 1,
  tryHarder: true,
  tryRotate: true,
  tryDownscale: true,
  tryDenoise: true,
};
|
||||
|
||||
/**
 * Prompt sent to the vision model for single-product enrichment. It asks for
 * one JSON object with name, brand, nutrition, ingredients_raw and upc.
 */
const ENRICHMENT_PROMPT = `Analyze this food product image. Extract any of the following that are visible:

1. Product name and brand
2. Nutrition Facts (structured): calories, protein_g, carbs_g, fat_g, fiber_g, sugar_g, sodium_mg, serving_size, serving_unit, servings_per_container
3. Ingredients list (raw text as printed on label)
4. UPC/Barcode number

Return a single JSON object:
{"name": "...", "brand": "...", "nutrition": {...}, "ingredients_raw": "...", "upc": "..."}
Only include fields you can read from the image. Return ONLY the JSON, no other text.`;
|
||||
|
||||
/**
 * Product details extracted from one photo. Every field is optional and is
 * set only when the corresponding value was found.
 */
interface EnrichmentResult {
  /** Product name. */
  name?: string;
  /** Brand name. */
  brand?: string;
  /** Nutrition object as returned by the vision model (not validated further). */
  nutrition?: Record<string, unknown>;
  /** Ingredients text as returned by the vision model. */
  ingredients_raw?: string;
  /** Barcode number, from the barcode reader or the vision model. */
  upc?: string;
}
|
||||
|
||||
/**
 * Extract product info from a single close-up product photo.
 *
 * Two independent sources are combined:
 *  1. Barcode detection via zxing-wasm (no AI)
 *  2. Vision model extraction for name, brand, nutrition, ingredients
 *
 * Neither needs the other's output, so they run concurrently. When both
 * report a UPC, the barcode reader's value wins.
 *
 * @param imageBuffer Encoded image bytes.
 * @returns The merged extraction result; fields that were not found are absent.
 */
export async function extractProductInfo(
  imageBuffer: Buffer
): Promise<EnrichmentResult> {
  const [barcodeUpc, visionResult] = await Promise.all([
    detectBarcode(imageBuffer),
    extractViaVision(imageBuffer),
  ]);

  // Merge: barcode UPC wins over vision-detected UPC
  const result: EnrichmentResult = { ...visionResult };
  if (barcodeUpc) {
    result.upc = barcodeUpc;
  }

  return result;
}
|
||||
|
||||
// --- Barcode detection via zxing-wasm ---
|
||||
|
||||
/**
 * Try to read a barcode from the image, using progressively more targeted
 * renderings. Each rendering is produced only if the previous ones found
 * nothing, and the first decoded value is returned.
 *
 * Renderings, in order:
 *  1. Full image as PNG
 *  2. Grayscale + sharpen + normalize
 *  3. Bottom half, sharpened (images taller than 200px)
 *  4. Bottom-right quadrant, sharpened (images larger than 200x200)
 *
 * @param imageBuffer Encoded image bytes.
 * @returns The decoded digits, or null when no rendering produced a barcode.
 */
async function detectBarcode(imageBuffer: Buffer): Promise<string | null> {
  const meta = await sharp(imageBuffer).metadata();
  const width = meta.width || 0;
  const height = meta.height || 0;
  const halfWidth = Math.floor(width / 2);
  const halfHeight = Math.floor(height / 2);

  const attempts: Array<{ enabled: boolean; render: () => Promise<Buffer> }> = [
    {
      enabled: true,
      render: () => sharp(imageBuffer).png().toBuffer(),
    },
    {
      enabled: true,
      render: () =>
        sharp(imageBuffer).grayscale().sharpen({ sigma: 2 }).normalize().png().toBuffer(),
    },
    {
      // width > 0 guard: a zero-width extract region would make sharp throw.
      enabled: width > 0 && height > 200,
      render: () =>
        sharp(imageBuffer)
          .extract({ left: 0, top: halfHeight, width, height: halfHeight })
          .sharpen({ sigma: 1.5 })
          .png()
          .toBuffer(),
    },
    {
      enabled: width > 200 && height > 200,
      render: () =>
        sharp(imageBuffer)
          .extract({ left: halfWidth, top: halfHeight, width: halfWidth, height: halfHeight })
          .sharpen({ sigma: 1.5 })
          .png()
          .toBuffer(),
    },
  ];

  for (const attempt of attempts) {
    if (!attempt.enabled) continue;
    const upc = await tryDecode(await attempt.render());
    if (upc) return upc;
  }

  return null;
}
|
||||
|
||||
/**
 * Run the barcode reader on one PNG buffer.
 *
 * The first valid result is reduced to its digits and accepted when it has
 * 8 to 14 of them. Reader errors are logged and treated as "nothing found".
 *
 * @param pngBuffer PNG-encoded image bytes.
 * @returns The digit string, or null.
 */
async function tryDecode(pngBuffer: Buffer): Promise<string | null> {
  try {
    const decoded = await readBarcodes(new Uint8Array(pngBuffer), READER_OPTIONS);
    const best = decoded.find((candidate) => candidate.isValid);
    if (best === undefined) {
      return null;
    }

    const digits = best.text.replace(/\D/g, "");
    const plausibleLength = digits.length >= 8 && digits.length <= 14;
    if (plausibleLength) {
      console.log(`Barcode detected: ${digits} (format: ${best.format})`);
      return digits;
    }
  } catch (err) {
    console.warn(`Barcode decode error: ${err instanceof Error ? err.message : err}`);
  }
  return null;
}
|
||||
|
||||
// --- Vision model extraction ---
|
||||
|
||||
/**
 * Clean raw model output: strip markdown fences, JS comments, trailing commas.
 * Small models often wrap JSON in ```json ... ``` or add // comments.
 *
 * Comment and trailing-comma removal skip over double-quoted string
 * literals, so values such as "http://example.com" or "a, }" are preserved.
 *
 * @param raw Raw text returned by the model.
 * @returns The text with fences, `//` comments and trailing commas removed.
 */
function cleanJsonText(raw: string): string {
  // Strip markdown code fences (with or without a "json" tag).
  const unfenced = raw.replace(/```(?:json)?\s*/gi, "");

  // In both passes the first alternative matches a whole string literal and
  // puts it back unchanged; only text outside strings is rewritten.
  const withoutComments = unfenced.replace(
    /("(?:[^"\\\n]|\\.)*")|\/\/[^\n]*/g,
    (_match: string, literal?: string) => literal ?? ""
  );

  // Strip trailing commas before } or ]
  return withoutComments.replace(
    /("(?:[^"\\\n]|\\.)*")|,\s*([}\]])/g,
    (_match: string, literal?: string, closer?: string) => literal ?? closer ?? ""
  );
}
|
||||
|
||||
/**
 * Ask the vision model for product details and convert its reply into an
 * EnrichmentResult.
 *
 * Best-effort: a failed model call or an unparseable reply is logged and
 * produces an empty result. Only fields with a usable value are copied:
 * non-blank strings for name/brand/ingredients_raw, a plain (non-array)
 * object for nutrition, and a string or finite number for upc.
 *
 * @param imageBuffer Encoded image bytes.
 * @returns The fields the model reported; possibly empty.
 */
async function extractViaVision(imageBuffer: Buffer): Promise<EnrichmentResult> {
  let raw: string;
  try {
    raw = await callVision(ENRICHMENT_PROMPT, imageBuffer.toString("base64"));
  } catch (err) {
    console.error("Vision model call failed:", err instanceof Error ? err.message : err);
    return {};
  }

  // Clean markdown fences, comments, then find JSON object
  const cleaned = cleanJsonText(raw);
  const start = cleaned.indexOf("{");
  const end = cleaned.lastIndexOf("}");
  if (start === -1 || end <= start) {
    console.warn(`Failed to parse enrichment response: ${raw.slice(0, 200)}`);
    return {};
  }

  let data: unknown;
  try {
    data = JSON.parse(cleaned.slice(start, end + 1));
  } catch {
    console.warn(`Failed to parse enrichment JSON: ${raw.slice(0, 200)}`);
    return {};
  }
  if (typeof data !== "object" || data === null) {
    return {};
  }
  const record = data as Record<string, unknown>;

  // Trimmed text of a string value, or undefined for anything else / blank.
  const textOf = (value: unknown): string | undefined => {
    if (typeof value !== "string") return undefined;
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : undefined;
  };

  const result: EnrichmentResult = {};

  const name = textOf(record.name);
  if (name) result.name = name;

  const brand = textOf(record.brand);
  if (brand) result.brand = brand;

  const nutrition = record.nutrition;
  if (typeof nutrition === "object" && nutrition !== null && !Array.isArray(nutrition)) {
    result.nutrition = nutrition as Record<string, unknown>;
  }

  const ingredients = textOf(record.ingredients_raw);
  if (ingredients) result.ingredients_raw = ingredients;

  // Models may return the UPC as a JSON number; anything else (objects,
  // booleans) would only stringify to noise such as "[object Object]".
  const upc = record.upc;
  if (typeof upc === "string" || (typeof upc === "number" && Number.isFinite(upc))) {
    const upcText = String(upc).trim();
    if (upcText) result.upc = upcText;
  }

  return result;
}
|
||||
80
src/gemini.ts
Normal file
80
src/gemini.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
import { config } from "./config";
|
||||
|
||||
/** Lazily loaded "@google/generative-ai" module; null until first needed. */
let genaiModule: typeof import("@google/generative-ai") | null = null;

/**
 * Build a Gemini client for the configured API key, importing the SDK on
 * first use.
 *
 * @throws Error when `config.geminiApiKey` is empty.
 */
async function getClient() {
  const apiKey = config.geminiApiKey;
  if (!apiKey) {
    throw new Error("GEMINI_API_KEY not configured");
  }

  genaiModule = genaiModule ?? (await import("@google/generative-ai"));
  return new genaiModule.GoogleGenerativeAI(apiKey);
}
|
||||
|
||||
/**
 * Guess an image MIME type from the first bytes of base64-encoded data.
 * Recognizes JPEG, PNG, GIF, WebP and HEIC/HEIF signatures; anything else
 * falls back to "image/jpeg".
 */
function sniffImageMimeType(imageBase64: string): string {
  // 32 base64 characters decode to 24 bytes, enough for every signature below.
  const head = Buffer.from(imageBase64.slice(0, 32), "base64");
  const ascii = head.toString("latin1");

  if (head.length >= 3 && head[0] === 0xff && head[1] === 0xd8 && head[2] === 0xff) {
    return "image/jpeg";
  }
  if (ascii.startsWith("\x89PNG")) return "image/png";
  if (ascii.startsWith("GIF8")) return "image/gif";
  if (ascii.startsWith("RIFF") && ascii.slice(8, 12) === "WEBP") return "image/webp";
  if (ascii.slice(4, 8) === "ftyp") {
    const brand = ascii.slice(8, 12);
    if (["heic", "heix", "hevc", "hevx"].includes(brand)) return "image/heic";
    if (["mif1", "msf1", "heif"].includes(brand)) return "image/heif";
  }
  return "image/jpeg";
}

/**
 * Send an image + prompt to Gemini vision.
 *
 * @param prompt Instruction text.
 * @param imageBase64 Base64-encoded image bytes.
 * @param mimeType MIME type declared for the image. Defaults to a type
 *   sniffed from the image bytes, since callers may pass non-JPEG uploads.
 * @returns The text of the model response.
 * @throws Error when the API key is not configured or the request fails.
 */
export async function geminiVision(
  prompt: string,
  imageBase64: string,
  mimeType: string = sniffImageMimeType(imageBase64)
): Promise<string> {
  const ai = await getClient();
  const model = ai.getGenerativeModel({ model: config.geminiModel });

  const result = await model.generateContent([
    prompt,
    {
      inlineData: {
        mimeType,
        data: imageBase64,
      },
    },
  ]);

  return result.response.text();
}
|
||||
|
||||
/**
 * Identify ALL food/grocery products visible in a photo.
 *
 * The reply is first read as a JSON array (keeping object entries that have
 * a truthy `name`); if that is not possible it is read as a single JSON
 * object. Replies that fit neither form are logged and yield an empty list.
 *
 * @param imageBase64 Base64-encoded image bytes.
 * @returns The identified products as returned by the model.
 */
export async function geminiIdentifyProducts(
  imageBase64: string
): Promise<Record<string, unknown>[]> {
  const prompt = `Identify ALL food and grocery products visible in this photo. \
There may be one product or many (e.g. a grocery haul, a shelf, a receipt).

Return ONLY a JSON array of objects. Each object must have:
- "name": product name (string)
- "brand": brand name if visible, otherwise "" (string). Produce typically has no brand.
- "category": one of "produce", "dairy", "meat", "seafood", "bakery", "snacks", "beverages", "frozen", "pantry", "condiments", "other" (string)
- "is_organic": "yes" or "no" based on visible labels (string)

Return ONLY the JSON array, no other text.`;

  const raw = await geminiVision(prompt, imageBase64);

  // Parse the text between the first `open` and the last `close`;
  // undefined means "delimiters missing or not valid JSON".
  const parseBetween = (open: string, close: string): unknown => {
    const from = raw.indexOf(open);
    const to = raw.lastIndexOf(close);
    if (from === -1 || to <= from) {
      return undefined;
    }
    try {
      return JSON.parse(raw.slice(from, to + 1));
    } catch {
      return undefined;
    }
  };

  const list = parseBetween("[", "]");
  if (Array.isArray(list)) {
    return (list as Record<string, unknown>[]).filter(
      (entry) => typeof entry === "object" && entry !== null && entry.name
    );
  }

  const single = parseBetween("{", "}") as Record<string, unknown> | undefined;
  if (single !== undefined && single.name) {
    return [single];
  }

  console.warn(`Failed to parse Gemini product identification: ${raw.slice(0, 200)}`);
  return [];
}
|
||||
67
src/matching.ts
Normal file
67
src/matching.ts
Normal file
@@ -0,0 +1,67 @@
|
||||
import { embed } from "./embeddings";
|
||||
import { queryProducts } from "./chroma";
|
||||
|
||||
/**
 * A scanned item plus the outcome of the known-product lookup. All original
 * item fields are carried along unchanged.
 */
export interface MatchedItem extends Record<string, unknown> {
  /** Product id taken from the matched record's metadata, or null when unmatched. */
  matched_product_id: number | null;
  /** Match confidence, computed as 1 - distance and rounded to 3 decimals; 0 when unmatched. */
  match_confidence: number;
  /** Image URL taken from the matched record's metadata, or null. */
  reference_image_url: string | null;
}
|
||||
|
||||
/**
 * For each item, try to match against ChromaDB product_images embeddings.
 * No PostgreSQL — ILIKE fallback happens in the backend.
 *
 * Items are processed one at a time. Each is embedded as "name, brand" and
 * compared with its nearest stored record; a match is recorded only when
 * 1 - distance exceeds 0.5. Lookup failures are logged at debug level and
 * leave the item unmatched.
 *
 * @param items Scanned items; `name` and `brand` are read when present.
 * @returns One MatchedItem per input item, in the same order.
 */
export async function matchToKnownProducts(
  items: Record<string, unknown>[]
): Promise<MatchedItem[]> {
  const results: MatchedItem[] = [];

  for (const item of items) {
    const name = String(item.name || "");
    const brand = String(item.brand || "");

    const matched: MatchedItem = {
      ...item,
      matched_product_id: null,
      match_confidence: 0,
      reference_image_url: null,
    };
    // Added up front; the object is filled in below when a match is found.
    results.push(matched);

    if (!name) {
      continue;
    }

    const queryText = brand ? `${name}, ${brand}` : name;

    try {
      const embedding = await embed(queryText);
      const { distances, metadatas } = await queryProducts([embedding], 3);

      const nearest = distances ? distances[0] : undefined;
      if (!nearest || nearest.length === 0) {
        continue;
      }

      const confidence = Math.max(0, 1 - nearest[0]);
      const meta = metadatas?.[0]?.[0];
      if (confidence > 0.5 && meta) {
        matched.matched_product_id = (meta.product_id as number) || null;
        matched.match_confidence = Math.round(confidence * 1000) / 1000;
        matched.reference_image_url = (meta.image_url as string) || null;
      }
    } catch (err) {
      console.debug(
        `ChromaDB match failed for '${name}':`,
        err instanceof Error ? err.message : err
      );
    }
  }

  return results;
}
|
||||
91
src/pantry.ts
Normal file
91
src/pantry.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
import { tileImage, scanTiles, saveScanImage } from "./tiling";
|
||||
import { parseJsonItems, deduplicateItems } from "./parsing";
|
||||
import { matchToKnownProducts } from "./matching";
|
||||
import { geminiIdentifyProducts } from "./gemini";
|
||||
|
||||
/**
 * Prompt sent to the local vision model for pantry/fridge photos. It asks
 * for a JSON array with name, brand, category and quantity_desc per product.
 */
const PANTRY_PROMPT = `Identify ALL food and grocery products visible in this photo of a pantry, fridge, or kitchen.
For each product, extract:
- Product name (as shown on the label)
- Brand (if visible)
- Category (produce/dairy/meat/seafood/bakery/snacks/beverages/frozen/pantry/condiments/other)
- Approximate quantity (e.g. "2 cans", "1 bottle", "half gallon")

Return ONLY a JSON array of objects with keys: "name", "brand", "category", "quantity_desc"
Example: [{"name": "Greek Yogurt", "brand": "Fage", "category": "dairy", "quantity_desc": "2 containers"}]
Return ONLY the JSON array, no other text.`;
|
||||
|
||||
// Minimum number of items the local model must find; scanPantryPhoto falls back to Gemini below this.
const MIN_LOCAL_RESULTS = 2;
|
||||
|
||||
/** A normalized pantry product as produced by cleanPantryItems. */
interface PantryItem extends Record<string, unknown> {
  /** Trimmed, non-empty product name. */
  name: string;
  /** Trimmed brand; empty string when none was reported. */
  brand: string;
  /** Lower-cased category; "other" when none was reported. */
  category: string;
  /** Free-text quantity description; "1" when none was reported. */
  quantity_desc: string;
}
|
||||
|
||||
/**
 * Normalize raw model items into PantryItem records.
 *
 * Entries that are not objects or have a blank name are dropped. The
 * remaining fields are stringified and trimmed, with "other" as the default
 * category (lower-cased) and "1" as the default quantity description.
 *
 * @param items Parsed model output.
 * @returns Cleaned items in input order.
 */
function cleanPantryItems(items: Record<string, unknown>[]): PantryItem[] {
  const asText = (value: unknown, fallback: string): string =>
    String(value || fallback).trim();

  return items.flatMap((item): PantryItem[] => {
    if (typeof item !== "object" || item === null) {
      return [];
    }

    const name = asText(item.name, "");
    if (name.length === 0) {
      return [];
    }

    return [
      {
        name,
        brand: asText(item.brand, ""),
        category: asText(item.category, "other").toLowerCase(),
        quantity_desc: asText(item.quantity_desc, "1"),
      },
    ];
  });
}
|
||||
|
||||
/**
 * Scan a pantry/fridge photo for products.
 *
 * Steps:
 *  1. Save the image, tile it and scan the tiles with the local model.
 *  2. If fewer than MIN_LOCAL_RESULTS items were found, ask Gemini about the
 *     full image and use its items when it found more. A Gemini failure is
 *     logged and the local result is kept.
 *  3. Match the items against known products and number them.
 *
 * Gemini items go through the same cleanPantryItems normalization as local
 * ones, so both sources produce trimmed names and lower-cased categories.
 *
 * @param imageBuffer Encoded image bytes.
 * @returns `{ scan_id, items, total_found, model_used }`.
 */
export async function scanPantryPhoto(
  imageBuffer: Buffer
): Promise<Record<string, unknown>> {
  const { scanId } = saveScanImage(imageBuffer, "pantry");
  let modelUsed = "local";

  // Step 1: Tile and scan with local model
  const tiles = await tileImage(imageBuffer);
  const rawTexts = await scanTiles(tiles, PANTRY_PROMPT);
  let items: Record<string, unknown>[] = cleanPantryItems(parseJsonItems(rawTexts));
  items = deduplicateItems(items);

  // Step 2: Gemini fallback if too few results
  if (items.length < MIN_LOCAL_RESULTS) {
    console.log(`Local model found ${items.length} pantry items, falling back to Gemini`);
    try {
      const fullBase64 = imageBuffer.toString("base64");
      const geminiItems = await geminiIdentifyProducts(fullBase64);
      // Only name/brand/category are forwarded, so quantity_desc gets its "1" default.
      const normalized = cleanPantryItems(
        geminiItems.map(({ name, brand, category }) => ({ name, brand, category }))
      );
      if (normalized.length > items.length) {
        items = normalized;
        modelUsed = "gemini";
      }
    } catch (err) {
      console.error("Gemini fallback failed for pantry scan:", err instanceof Error ? err.message : err);
    }
  }

  // Step 3: Match against known products (ChromaDB only)
  const matched = await matchToKnownProducts(items);

  // Add index
  const indexed = matched.map((item, i) => ({ ...item, index: i }));

  console.log(`Pantry scan ${scanId}: ${indexed.length} products via ${modelUsed}`);
  return {
    scan_id: scanId,
    items: indexed,
    total_found: indexed.length,
    model_used: modelUsed,
  };
}
|
||||
84
src/parsing.ts
Normal file
84
src/parsing.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
/**
 * Remove `//` comments and trailing commas that sit outside double-quoted
 * string literals, so near-JSON model output becomes parseable.
 */
function stripJsonNoise(text: string): string {
  // The first alternative matches a whole string literal and keeps it as is.
  const withoutComments = text.replace(
    /("(?:[^"\\\n]|\\.)*")|\/\/[^\n]*/g,
    (_match: string, literal?: string) => literal ?? ""
  );
  return withoutComments.replace(
    /("(?:[^"\\\n]|\\.)*")|,\s*([}\]])/g,
    (_match: string, literal?: string, closer?: string) => literal ?? closer ?? ""
  );
}

/**
 * Parse the text between the first `open` and the last `close`.
 * Strict JSON is tried first, then a noise-stripped copy.
 *
 * @returns The parsed value, or undefined when the delimiters are missing or
 *   neither attempt parses.
 */
function parseDelimited(text: string, open: string, close: string): unknown {
  const start = text.indexOf(open);
  const end = text.lastIndexOf(close);
  if (start === -1 || end <= start) {
    return undefined;
  }

  const span = text.slice(start, end + 1);
  try {
    return JSON.parse(span);
  } catch {
    // fall through to the lenient attempt
  }
  try {
    return JSON.parse(stripJsonNoise(span));
  } catch {
    return undefined;
  }
}

/**
 * Extract JSON arrays from raw model output texts, merge into single list.
 *
 * Each text is read as a JSON array first (object entries are kept, other
 * entries skipped). If no array can be parsed, it is read as a single JSON
 * object, which is kept when it has a truthy `name`. Texts that fit neither
 * form contribute nothing. Output with `//` comments or trailing commas is
 * tolerated.
 *
 * @param rawTexts One raw model reply per element.
 * @returns All extracted objects, in input order.
 */
export function parseJsonItems(rawTexts: string[]): Record<string, unknown>[] {
  const allItems: Record<string, unknown>[] = [];

  for (const text of rawTexts) {
    // Try array first
    const list = parseDelimited(text, "[", "]");
    if (Array.isArray(list)) {
      for (const entry of list) {
        if (typeof entry === "object" && entry !== null) {
          allItems.push(entry as Record<string, unknown>);
        }
      }
      continue;
    }

    // Try single object
    const single = parseDelimited(text, "{", "}");
    if (
      typeof single === "object" &&
      single !== null &&
      (single as Record<string, unknown>).name
    ) {
      allItems.push(single as Record<string, unknown>);
    }
  }

  return allItems;
}
|
||||
|
||||
/**
 * Lower-cased, whitespace-separated tokens of a name. Empty tokens (from
 * leading/trailing whitespace or an empty string) are not included.
 */
function tokenSet(name: string): Set<string> {
  return new Set(
    name
      .toLowerCase()
      .split(/\s+/)
      .filter((token) => token.length > 0)
  );
}

/**
 * Remove near-duplicate items based on token overlap (>80% = duplicate).
 *
 * Overlap is the number of shared tokens divided by the size of the larger
 * token set. The first occurrence is kept; items whose `key` value is blank
 * are dropped. Names are trimmed before tokenizing on both sides of the
 * comparison, so stray whitespace cannot hide a duplicate.
 *
 * @param items Items to deduplicate.
 * @param key Property holding the name to compare (default "name").
 * @returns The kept items, in input order.
 */
export function deduplicateItems(
  items: Record<string, unknown>[],
  key: string = "name"
): Record<string, unknown>[] {
  // Token sets are stored next to kept items so each name is tokenized once.
  const kept: { item: Record<string, unknown>; tokens: Set<string> }[] = [];

  for (const item of items) {
    const name = String(item[key] || "").trim();
    if (!name) continue;

    const tokens = tokenSet(name);
    const isDup = kept.some(({ tokens: seen }) => {
      let shared = 0;
      for (const token of tokens) {
        if (seen.has(token)) shared++;
      }
      return shared / Math.max(tokens.size, seen.size) > 0.8;
    });

    if (!isDup) {
      kept.push({ item, tokens });
    }
  }

  return kept.map((entry) => entry.item);
}
|
||||
176
src/server.ts
Normal file
176
src/server.ts
Normal file
@@ -0,0 +1,176 @@
|
||||
import "dotenv/config";
|
||||
import express, { Request, Response, NextFunction } from "express";
|
||||
import multer from "multer";
|
||||
import { config } from "./config";
|
||||
import { scanShelfPhoto } from "./shelf";
|
||||
import { scanPantryPhoto } from "./pantry";
|
||||
import { extractProductInfo } from "./enrich";
|
||||
import { getCount } from "./chroma";
|
||||
|
||||
/** MIME types the upload filter accepts. */
const ALLOWED_MIMES = new Set(
  ["jpeg", "png", "webp", "gif", "heic", "heif"].map((subtype) => `image/${subtype}`)
);
|
||||
|
||||
/** The Express application for this service. */
const app = express();

/**
 * Multipart upload handling: one file per request, held in memory, limited
 * to `config.maxFileSize` bytes and to the MIME types in ALLOWED_MIMES.
 */
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: config.maxFileSize, files: 1 },
  fileFilter: (_req, file, cb) => {
    if (!ALLOWED_MIMES.has(file.mimetype)) {
      // The error handler recognizes this error by its message prefix.
      cb(new Error("Invalid file type. Allowed: JPEG, PNG, WebP, GIF, HEIC, HEIF"));
      return;
    }
    cb(null, true);
  },
});
|
||||
|
||||
// --- Health check ---
|
||||
|
||||
/**
 * GET /health reports the reachability of the vision model, Ollama and
 * ChromaDB.
 *
 * The three probes run concurrently, each HTTP probe with a 5 second
 * timeout. `status` is "ok" only when every dependency answered
 * successfully; an unreachable dependency or a non-2xx answer makes it
 * "degraded".
 */
app.get("/health", async (_req: Request, res: Response) => {
  type Probe = { label: string; healthy: boolean; count?: number };

  const describeError = (err: unknown): string =>
    `unreachable: ${err instanceof Error ? err.message : err}`;

  const probeHttp = async (url: string): Promise<Probe> => {
    try {
      const resp = await fetch(url, { signal: AbortSignal.timeout(5000) });
      return resp.ok
        ? { label: "connected", healthy: true }
        : { label: `status ${resp.status}`, healthy: false };
    } catch (err) {
      return { label: describeError(err), healthy: false };
    }
  };

  const probeChroma = async (): Promise<Probe> => {
    try {
      return { label: "connected", healthy: true, count: await getCount() };
    } catch (err) {
      return { label: describeError(err), healthy: false };
    }
  };

  const visionUrl = config.visionAiUrl.replace("/v1/chat/completions", "/v1/models");
  const [vision, ollama, chroma] = await Promise.all([
    probeHttp(visionUrl),
    probeHttp(`${config.ollamaHost}/api/tags`),
    probeChroma(),
  ]);

  const status: Record<string, unknown> = {
    status: vision.healthy && ollama.healthy && chroma.healthy ? "ok" : "degraded",
    vision_model: config.visionAiModel,
    vision: vision.label,
    ollama: ollama.label,
    chroma: chroma.label,
  };
  if (chroma.count !== undefined) {
    status.chroma_count = chroma.count;
  }

  res.json(status);
});
|
||||
|
||||
// --- Scan endpoints ---
|
||||
|
||||
/**
 * POST /scan/shelf takes a multipart upload with an `image` file and a
 * `store_name` field, and responds with the shelf scan result.
 *
 * Responds 400 when the image is missing or `store_name` is not a non-blank
 * string; other failures are passed to the error handler.
 */
app.post(
  "/scan/shelf",
  upload.single("image"),
  async (req: Request, res: Response, next: NextFunction) => {
    try {
      const file = req.file;
      if (!file) {
        res.status(400).json({ error: "No image provided" });
        return;
      }

      // A repeated form field arrives as an array, and a blank value is as
      // useless as a missing one, so require a non-empty trimmed string.
      const rawStoreName: unknown = req.body?.store_name;
      const storeName = typeof rawStoreName === "string" ? rawStoreName.trim() : "";
      if (!storeName) {
        res.status(400).json({ error: "store_name is required" });
        return;
      }

      const result = await scanShelfPhoto(file.buffer, storeName);
      res.json(result);
    } catch (err) {
      next(err);
    }
  }
);
|
||||
|
||||
/**
 * POST /scan/pantry takes a multipart upload with an `image` file and
 * responds with the pantry scan result, or 400 when no image was sent.
 * Failures are passed to the error handler.
 */
app.post(
  "/scan/pantry",
  upload.single("image"),
  async (req: Request, res: Response, next: NextFunction) => {
    try {
      const { file } = req;
      if (file) {
        res.json(await scanPantryPhoto(file.buffer));
      } else {
        res.status(400).json({ error: "No image provided" });
      }
    } catch (err) {
      next(err);
    }
  }
);
|
||||
|
||||
// --- Product enrichment ---
|
||||
|
||||
/**
 * POST /enrich/product takes a multipart upload with an `image` file and
 * responds with the extracted product info, or 400 when no image was sent.
 * Failures are passed to the error handler.
 */
app.post(
  "/enrich/product",
  upload.single("image"),
  async (req: Request, res: Response, next: NextFunction) => {
    try {
      const { file } = req;
      if (file) {
        res.json(await extractProductInfo(file.buffer));
      } else {
        res.status(400).json({ error: "No image provided" });
      }
    } catch (err) {
      next(err);
    }
  }
);
|
||||
|
||||
// --- 404 ---
|
||||
|
||||
/** Fallback for requests no route handled: respond 404 with a JSON error. */
app.use((_req: Request, res: Response) => {
  const body = { error: "Not found" };
  res.status(404).json(body);
});
|
||||
|
||||
// --- Error handler ---
|
||||
|
||||
/**
 * Central error handler.
 *
 *  - Upload over the size limit: 413
 *  - Any other multer upload error (unexpected field, too many files, ...): 400
 *  - Rejected MIME type (error raised by the upload file filter): 400
 *  - Everything else: logged, 502 "Failed to process image"
 */
app.use((err: Error, _req: Request, res: Response, _next: NextFunction) => {
  if (err instanceof multer.MulterError) {
    if (err.code === "LIMIT_FILE_SIZE") {
      res.status(413).json({ error: `File too large. Maximum size is ${config.maxFileSize / 1024 / 1024}MB.` });
      return;
    }
    // Remaining multer errors are client mistakes, not processing failures.
    res.status(400).json({ error: err.message });
    return;
  }

  // Non-Error values can reach next(); never assume `.message` exists.
  const message = err instanceof Error ? err.message : String(err);

  if (message.startsWith("Invalid file type")) {
    res.status(400).json({ error: message });
    return;
  }

  console.error("Scan error:", message);
  res.status(502).json({ error: "Failed to process image" });
});
|
||||
|
||||
// --- Start ---
|
||||
|
||||
/** Start the HTTP server on all interfaces and log the active endpoints. */
app.listen(config.port, "0.0.0.0", () => {
  const banner = [
    `Vision Scanner Service listening on port ${config.port}`,
    `Vision AI: ${config.visionAiUrl} (model: ${config.visionAiModel})`,
    `Ollama: ${config.ollamaHost} (embed: ${config.ollamaEmbedModel})`,
    `ChromaDB: ${config.chromaHost}`,
  ];
  for (const line of banner) {
    console.log(line);
  }
});
|
||||
|
||||
103
src/shelf.ts
Normal file
103
src/shelf.ts
Normal file
@@ -0,0 +1,103 @@
|
||||
import { tileImage, scanTiles, saveScanImage } from "./tiling";
|
||||
import { parseJsonItems, deduplicateItems } from "./parsing";
|
||||
import { matchToKnownProducts } from "./matching";
|
||||
import { geminiVision } from "./gemini";
|
||||
|
||||
/**
 * Prompt sent to the local vision model for shelf/price-tag photos. It asks
 * for a JSON array with name, price, quantity, unit and unit_price.
 */
const SHELF_PROMPT = `Look at this photo of a store shelf or price display. For each product visible, extract:
- Product name (as shown on the label/tag)
- Price (the number on the price tag)
- Size/weight if visible (e.g. "32 oz", "5 lb", "1 gal")
- Unit price if shown on the tag (e.g. "$0.25/oz")

Return ONLY a JSON array of objects with keys: "name", "price", "quantity", "unit", "unit_price"
Example: [{"name": "Kirkland Organic Eggs", "price": 7.99, "quantity": 24, "unit": "ct", "unit_price": null}]
Return ONLY the JSON array, no other text.`;
|
||||
|
||||
/**
 * Prompt sent to Gemini when the shelf scan falls back to it. It requests
 * the same keys as SHELF_PROMPT, without an example.
 */
const GEMINI_SHELF_PROMPT = `Look at this photo of a store shelf or price display. For each product you can see, extract:
- Product name (as shown on the label/tag)
- Price (the number on the price tag)
- Size/weight if visible (e.g. "32 oz", "5 lb", "1 gal")
- Unit price if shown on the tag (e.g. "$0.25/oz")

Return ONLY a JSON array of objects with keys: "name", "price", "quantity", "unit", "unit_price"
Be precise with numbers. Return ONLY the JSON array, no other text.`;
|
||||
|
||||
// Minimum number of items the local model must find; scanShelfPhoto falls back to Gemini below this.
const MIN_LOCAL_RESULTS = 2;
|
||||
|
||||
/** A normalized shelf product as produced by cleanShelfItems. */
interface ShelfItem extends Record<string, unknown> {
  /** Trimmed, non-empty product name. */
  name: string;
  /** Price; always greater than 0. */
  price: number;
  /** Quantity; 1 when none was reported. */
  quantity: number;
  /** Unit label; "unit" when none was reported. */
  unit: string;
  /** Unit price, or null when none was reported. */
  unit_price: number | null;
  /** Store name supplied with the scan. */
  store_name: string;
}
|
||||
|
||||
/**
 * Convert a model-supplied value to a number.
 *
 * Values that `Number()` already understands are returned unchanged. For
 * other strings (e.g. "$7.99", "$0.25/oz", "$1,299.00") the first numeric
 * run is used, after removing commas that separate groups of three digits.
 *
 * @returns The number, or NaN when no number can be found.
 */
function parseNumeric(value: unknown): number {
  const direct = Number(value);
  if (!Number.isNaN(direct)) {
    return direct;
  }
  if (typeof value === "string") {
    const withoutGroupCommas = value.replace(/(\d),(?=\d{3}(?!\d))/g, "$1");
    const match = withoutGroupCommas.match(/\d+(?:\.\d+)?|\.\d+/);
    if (match) {
      return Number(match[0]);
    }
  }
  return NaN;
}

/**
 * Normalize raw model items into ShelfItem records.
 *
 * Entries without a name or a positive price are dropped. Prices and unit
 * prices may be given as numbers or as strings with currency symbols or
 * unit suffixes; a unit price that cannot be read becomes null.
 *
 * @param items Parsed model output.
 * @param storeName Store name attached to every returned item.
 * @returns Cleaned items in input order.
 */
function cleanShelfItems(
  items: Record<string, unknown>[],
  storeName: string
): ShelfItem[] {
  const cleaned: ShelfItem[] = [];

  for (const item of items) {
    if (typeof item !== "object" || item === null) continue;

    const name = String(item.name || "").trim();
    const price = parseNumeric(item.price);
    if (!name || Number.isNaN(price) || price <= 0) continue;

    const unitPrice = item.unit_price ? parseNumeric(item.unit_price) : NaN;

    cleaned.push({
      name,
      price,
      quantity: Number(item.quantity) || 1,
      unit: String(item.unit || "unit"),
      // Never let NaN through: it is not a meaningful unit price.
      unit_price: Number.isFinite(unitPrice) ? unitPrice : null,
      store_name: storeName,
    });
  }

  return cleaned;
}
|
||||
|
||||
/**
 * Scan a store-shelf photo for products and prices.
 *
 * Steps:
 *  1. Save the image, tile it and scan the tiles with the local model.
 *  2. If fewer than MIN_LOCAL_RESULTS items were found, ask Gemini about the
 *     full image and use its items when it found more. A Gemini failure is
 *     logged and the local result is kept.
 *  3. Match the items against known products and number them.
 *
 * @param imageBuffer Encoded image bytes.
 * @param storeName Store name attached to every item.
 * @returns `{ scan_id, items, total_found, model_used }`.
 */
export async function scanShelfPhoto(
  imageBuffer: Buffer,
  storeName: string
): Promise<Record<string, unknown>> {
  const { scanId } = saveScanImage(imageBuffer, "shelf");

  // Step 1: local model over image tiles
  const tiles = await tileImage(imageBuffer);
  const tileTexts = await scanTiles(tiles, SHELF_PROMPT);
  const localItems = cleanShelfItems(parseJsonItems(tileTexts), storeName);

  let items: Record<string, unknown>[] = deduplicateItems(localItems);
  let modelUsed = "local";

  // Step 2: Gemini fallback when the local model found too little
  const needsFallback = items.length < MIN_LOCAL_RESULTS;
  if (needsFallback) {
    console.log(`Local model found ${items.length} items, falling back to Gemini`);
    try {
      const raw = await geminiVision(GEMINI_SHELF_PROMPT, imageBuffer.toString("base64"));
      const geminiItems = cleanShelfItems(parseJsonItems([raw]), storeName);
      if (geminiItems.length > items.length) {
        items = geminiItems;
        modelUsed = "gemini";
      }
    } catch (err) {
      console.error("Gemini fallback failed for shelf scan:", err instanceof Error ? err.message : err);
    }
  }

  // Step 3: match against known products (ChromaDB only), then number the items
  const matched = await matchToKnownProducts(items);
  const indexed = matched.map((entry, position) => ({ ...entry, index: position }));

  console.log(`Shelf scan ${scanId}: ${indexed.length} products via ${modelUsed}`);
  return {
    scan_id: scanId,
    items: indexed,
    total_found: indexed.length,
    model_used: modelUsed,
  };
}
|
||||
100
src/tiling.ts
Normal file
100
src/tiling.ts
Normal file
@@ -0,0 +1,100 @@
|
||||
import sharp from "sharp";
|
||||
import * as fs from "fs";
|
||||
import * as path from "path";
|
||||
import * as crypto from "crypto";
|
||||
import { config } from "./config";
|
||||
import { callVision } from "./vision";
|
||||
|
||||
/** One image tile produced by tileImage. */
export interface TileResult {
  /** JPEG-encoded tile bytes. */
  buffer: Buffer;
  /** The same bytes, base64-encoded. */
  base64: string;
}
|
||||
|
||||
/**
 * Split image into grid tiles with overlap, returning JPEG buffers.
 *
 * Each tile is grown by `overlap` times the tile size on every side and
 * clamped to the image bounds. Tiles are returned in row-major order. An
 * image whose metadata reports no width or height yields an empty list.
 *
 * NOTE(review): dimensions come from metadata() and tiles from extract()
 * with no rotate() step. Confirm that inputs carrying an EXIF orientation
 * are tiled upright.
 *
 * @param imageBuffer Encoded image bytes.
 * @param grid `[columns, rows]` of the tile grid (default 2x2).
 * @param overlap Fraction of the tile size added on each side (default 0.10).
 * @returns One TileResult per grid cell.
 */
export async function tileImage(
  imageBuffer: Buffer,
  grid: [number, number] = [2, 2],
  overlap: number = 0.10
): Promise<TileResult[]> {
  const { width, height } = await sharp(imageBuffer).metadata();
  const w = width || 0;
  const h = height || 0;
  if (w === 0 || h === 0) {
    return [];
  }

  const [cols, rows] = grid;
  const tileW = Math.floor(w / cols);
  const tileH = Math.floor(h / rows);
  const overlapX = Math.floor(tileW * overlap);
  const overlapY = Math.floor(tileH * overlap);

  // First work out every crop rectangle, then render them all in parallel.
  const regions: { left: number; top: number; width: number; height: number }[] = [];
  for (let row = 0; row < rows; row++) {
    for (let col = 0; col < cols; col++) {
      const left = Math.max(0, col * tileW - overlapX);
      const top = Math.max(0, row * tileH - overlapY);
      const right = Math.min(w, (col + 1) * tileW + overlapX);
      const bottom = Math.min(h, (row + 1) * tileH + overlapY);
      regions.push({ left, top, width: right - left, height: bottom - top });
    }
  }

  return Promise.all(
    regions.map(async (region) => {
      const buffer = await sharp(imageBuffer)
        .extract(region)
        .jpeg({ quality: 85 })
        .toBuffer();
      return { buffer, base64: buffer.toString("base64") };
    })
  );
}
|
||||
|
||||
/**
 * Run the vision model over every tile, a limited number at a time.
 *
 * Tiles are processed in consecutive batches of at most
 * `config.maxConcurrentTiles`; the calls inside a batch run in parallel and
 * the next batch starts only once the whole batch has settled. A tile whose
 * call throws is logged and treated as an empty response.
 *
 * @param tiles - Tiles to scan; only `base64` is sent to the model.
 * @param prompt - Prompt sent with every tile.
 * @returns The non-blank responses in tile order. Blank and failed responses
 *   are removed, so indexes in the result do not map back to tile indexes.
 */
export async function scanTiles(
  tiles: TileResult[],
  prompt: string
): Promise<string[]> {
  // The batch size is also the loop stride, so it must be an integer >= 1:
  // 0 or a negative value would never terminate, and NaN/undefined would end
  // the loop after a single empty batch. Fall back to one tile at a time.
  const configured = Math.floor(Number(config.maxConcurrentTiles));
  const maxConcurrent = configured >= 1 ? configured : 1;
  const results: string[] = [];

  // Process in batches to throttle GPU usage
  for (let i = 0; i < tiles.length; i += maxConcurrent) {
    const batch = tiles.slice(i, i + maxConcurrent);
    const batchResults = await Promise.all(
      batch.map(async (tile) => {
        try {
          return await callVision(prompt, tile.base64);
        } catch (err) {
          // Best effort: one failing tile must not discard the other tiles.
          console.warn("Vision tile scan failed:", err instanceof Error ? err.message : err);
          return "";
        }
      })
    );
    results.push(...batchResults);
  }

  return results.filter((r) => r.trim().length > 0);
}
|
||||
|
||||
/**
 * Write an uploaded image to `<config.uploadDir>/<scanType>/<scanId>.jpg`.
 *
 * The directory is created (recursively) when missing. The bytes are written
 * unchanged; the `.jpg` extension is applied without inspecting the data.
 * All file-system calls are synchronous.
 *
 * @param imageBuffer - Image bytes to persist.
 * @param scanType - Sub-directory of the upload directory to store the file in.
 * @returns `scanId`, the first 12 characters of a random UUID (this includes
 *   the UUID's first hyphen), and `filePath`, the path that was written.
 */
export function saveScanImage(
  imageBuffer: Buffer,
  scanType: string
): { scanId: string; filePath: string } {
  const scanId = crypto.randomUUID().substring(0, 12);
  const targetDir = path.join(config.uploadDir, scanType);
  const filePath = path.join(targetDir, scanId + ".jpg");

  fs.mkdirSync(targetDir, { recursive: true });
  fs.writeFileSync(filePath, imageBuffer);

  return { scanId, filePath };
}
|
||||
46
src/vision.ts
Normal file
46
src/vision.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
import { config } from "./config";
|
||||
|
||||
/**
 * Ask the local vision model about one image through its OpenAI-compatible
 * chat-completions endpoint (`config.visionAiUrl`).
 *
 * The request carries a single user message holding the prompt text and the
 * image as a `data:image/jpeg;base64,...` URL. It is aborted once
 * `config.visionAiTimeout` milliseconds have elapsed; the timer is cleared
 * only after the response body has been parsed or an error was thrown.
 *
 * @param prompt - Instruction text sent alongside the image.
 * @param imageBase64 - Base64 image data, without the `data:` prefix.
 * @returns The trimmed content of the first choice, or `""` when the response
 *   contains none.
 * @throws Error when the endpoint answers with a non-2xx status; errors from
 *   `fetch` (including the abort on timeout) and from JSON parsing propagate.
 */
export async function callVision(prompt: string, imageBase64: string): Promise<string> {
  const abort = new AbortController();
  const timer = setTimeout(() => abort.abort(), config.visionAiTimeout);

  try {
    const imagePart = {
      type: "image_url",
      image_url: { url: `data:image/jpeg;base64,${imageBase64}` },
    };
    const payload = {
      model: config.visionAiModel,
      messages: [
        {
          role: "user",
          content: [{ type: "text", text: prompt }, imagePart],
        },
      ],
      max_tokens: 2048,
    };

    const res = await fetch(config.visionAiUrl, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      signal: abort.signal,
      body: JSON.stringify(payload),
    });

    if (!res.ok) {
      throw new Error(`Vision AI returned ${res.status}`);
    }

    const parsed = (await res.json()) as {
      choices?: { message?: { content?: string } }[];
    };
    const answer = parsed.choices?.[0]?.message?.content?.trim();
    return answer || "";
  } finally {
    clearTimeout(timer);
  }
}
|
||||
Reference in New Issue
Block a user