Initial commit: Vision scanner for shelf/pantry product extraction

This commit is contained in:
2026-03-29 21:58:07 -04:00
commit 5de44e7579
19 changed files with 3673 additions and 0 deletions

39
src/chroma.ts Normal file
View File

@@ -0,0 +1,39 @@
import { ChromaClient, Collection } from "chromadb";
import { config } from "./config";
// Lazily created ChromaDB handles, cached at module scope.
// Both start as null; getCollection() assigns them on its first call.
let client: ChromaClient | null = null;
let collection: Collection | null = null;
/**
 * Return the shared "product_images" collection, connecting on first use.
 *
 * The first call builds a ChromaClient from config.chromaHost and gets or
 * creates the collection with cosine distance; later calls reuse the cached
 * handle stored at module scope.
 */
async function getCollection(): Promise<Collection> {
  if (collection) {
    return collection;
  }
  client = new ChromaClient({ path: config.chromaHost });
  collection = await client.getOrCreateCollection({
    name: "product_images",
    metadata: { "hnsw:space": "cosine" },
  });
  return collection;
}
/**
 * Shape that queryProducts() casts the ChromaDB query response to.
 * NOTE(review): the outer array presumably has one entry per query embedding
 * and the inner array one entry per hit — confirm against the chromadb client.
 */
export interface ChromaQueryResult {
// Ids of the returned entries.
ids: string[][];
// Distances of the returned entries; may be null.
distances: number[][] | null;
// Metadata objects of the returned entries; the list or any entry may be null.
metadatas: (Record<string, unknown> | null)[][] | null;
}
/**
 * Run a nearest-neighbour query against the product_images collection.
 *
 * @param queryEmbeddings Embedding vectors to search with.
 * @param nResults Number of hits requested per query (default 3).
 * @returns The raw query response, cast to ChromaQueryResult.
 */
export async function queryProducts(
  queryEmbeddings: number[][],
  nResults: number = 3
): Promise<ChromaQueryResult> {
  const productImages = await getCollection();
  const response = await productImages.query({ queryEmbeddings, nResults });
  return response as unknown as ChromaQueryResult;
}
/**
 * Count the entries stored in the product_images collection.
 *
 * Connection and query failures are deliberately allowed to propagate: the
 * /health endpoint wraps this call in try/catch to report ChromaDB as
 * unreachable, which it could never do while errors were swallowed here and
 * turned into a count of 0.
 *
 * @returns The number of entries reported by ChromaDB.
 * @throws Whatever the ChromaDB client throws when the server is unavailable.
 */
export async function getCount(): Promise<number> {
  const coll = await getCollection();
  return coll.count();
}

14
src/config.ts Normal file
View File

@@ -0,0 +1,14 @@
/**
 * Read an integer setting from the environment.
 * Falls back to `fallback` when the variable is unset, empty, or not numeric,
 * so a typo in the environment can never put NaN into the configuration.
 */
function intFromEnv(name: string, fallback: number): number {
  const parsed = parseInt(process.env[name] || "", 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Service configuration, resolved once at module load from environment
 * variables with built-in defaults.
 */
export const config = {
  port: intFromEnv("PORT", 8002),
  visionAiUrl: process.env.VISION_AI_URL || "http://localhost:8000/v1/chat/completions",
  visionAiModel: process.env.VISION_AI_MODEL || "qwen2.5vl-it:3b",
  // Milliseconds before a vision request is aborted.
  visionAiTimeout: intFromEnv("VISION_AI_TIMEOUT", 120000),
  ollamaHost: process.env.OLLAMA_HOST || "http://192.168.0.15:11434",
  ollamaEmbedModel: process.env.OLLAMA_EMBED_MODEL || "nomic-embed-text",
  chromaHost: process.env.CHROMA_HOST || "http://192.168.0.15:8000",
  geminiApiKey: process.env.GEMINI_API_KEY || "",
  geminiModel: process.env.GEMINI_MODEL || "gemini-2.5-flash",
  maxConcurrentTiles: intFromEnv("MAX_CONCURRENT_TILES", 4),
  uploadDir: process.env.UPLOAD_DIR || "uploads",
  maxFileSize: 10 * 1024 * 1024, // 10MB
};

30
src/embeddings.ts Normal file
View File

@@ -0,0 +1,30 @@
import { config } from "./config";
/**
 * Call Ollama embed endpoint for text embeddings.
 *
 * The request is aborted after 60 seconds.
 *
 * @param text Text to embed.
 * @returns The first embedding vector of the response.
 * @throws Error when Ollama answers with a non-2xx status, when the request is
 *   aborted, or when the response does not contain a usable embedding.
 */
export async function embed(text: string): Promise<number[]> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 60000);
  try {
    const response = await fetch(`${config.ollamaHost}/api/embed`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      signal: controller.signal,
      body: JSON.stringify({
        model: config.ollamaEmbedModel,
        input: text,
      }),
    });
    if (!response.ok) {
      throw new Error(`Ollama embed returned ${response.status}`);
    }
    const data = (await response.json()) as { embeddings?: unknown } | null;
    // Validate before returning: a missing or empty vector would otherwise be
    // passed on as `undefined` despite the declared number[] return type.
    const embeddings = data?.embeddings;
    const first: unknown = Array.isArray(embeddings) ? embeddings[0] : undefined;
    if (!Array.isArray(first) || first.length === 0) {
      throw new Error("Ollama embed response contained no embedding");
    }
    return first as number[];
  } finally {
    clearTimeout(timeout);
  }
}

192
src/enrich.ts Normal file
View File

@@ -0,0 +1,192 @@
import sharp from "sharp";
import { readBarcodes, type ReaderOptions } from "zxing-wasm/reader";
import { callVision } from "./vision";
// Options handed to readBarcodes() on every decode attempt in this module.
// Decoding is limited to the five listed formats and to a single symbol per image.
const READER_OPTIONS: ReaderOptions = {
tryHarder: true,
tryRotate: true,
tryDownscale: true,
tryDenoise: true,
formats: ["EAN-13", "EAN-8", "UPC-A", "UPC-E", "Code 128"],
maxNumberOfSymbols: 1,
};
// Prompt sent to the vision model together with a single product photo.
// It asks for one JSON object containing only the fields readable in the image.
const ENRICHMENT_PROMPT = `Analyze this food product image. Extract any of the following that are visible:
1. Product name and brand
2. Nutrition Facts (structured): calories, protein_g, carbs_g, fat_g, fiber_g, sugar_g, sodium_mg, serving_size, serving_unit, servings_per_container
3. Ingredients list (raw text as printed on label)
4. UPC/Barcode number
Return a single JSON object:
{"name": "...", "brand": "...", "nutrition": {...}, "ingredients_raw": "...", "upc": "..."}
Only include fields you can read from the image. Return ONLY the JSON, no other text.`;
/**
 * Result of extractProductInfo(). Every field is optional because only the
 * fields that could be read from the photo are set.
 */
interface EnrichmentResult {
name?: string;
brand?: string;
// Nutrition object exactly as returned by the vision model; its keys are not validated here.
nutrition?: Record<string, unknown>;
// Ingredients text as reported by the vision model, trimmed.
ingredients_raw?: string;
// Barcode digits from zxing when a barcode was decoded, otherwise the vision model's value.
upc?: string;
}
/**
 * Extract product info from a single close-up product photo.
 *
 * Two independent phases run concurrently:
 * 1. Barcode detection via zxing-wasm (no AI)
 * 2. Vision model extraction for name, brand, nutrition, ingredients
 *
 * Results are merged — zxing UPC takes priority over vision-detected UPC.
 *
 * @param imageBuffer Raw bytes of the uploaded image.
 * @returns The fields that could be read from the image.
 * @throws If barcode detection rejects (e.g. the image cannot be decoded).
 *   extractViaVision handles its own failures and resolves to {} instead.
 */
export async function extractProductInfo(
  imageBuffer: Buffer
): Promise<EnrichmentResult> {
  // Neither phase needs the other's output, so the barcode scan overlaps with
  // the (much slower) vision request instead of delaying it.
  const [barcodeUpc, visionResult] = await Promise.all([
    detectBarcode(imageBuffer),
    extractViaVision(imageBuffer),
  ]);

  // Merge: barcode UPC wins over vision-detected UPC
  const result: EnrichmentResult = { ...visionResult };
  if (barcodeUpc) {
    result.upc = barcodeUpc;
  }
  return result;
}
// --- Barcode detection via zxing-wasm ---
/**
 * Try a fixed sequence of image renderings until one yields a barcode.
 *
 * Renderings are produced lazily and tried in order: the full image, an
 * enhanced grayscale version, the bottom half, then the bottom-right quadrant.
 * The two crops are only attempted when the image is larger than 200px in the
 * relevant dimension(s).
 *
 * @returns The decoded digits, or null when no rendering produced a barcode.
 */
async function detectBarcode(imageBuffer: Buffer): Promise<string | null> {
  const info = await sharp(imageBuffer).metadata();
  const width = info.width || 0;
  const height = info.height || 0;
  const halfWidth = Math.floor(width / 2);
  const halfHeight = Math.floor(height / 2);

  // Each entry returns a pending PNG buffer, or null when the strategy does not apply.
  const strategies: Array<() => Promise<Buffer> | null> = [
    // Strategy 1: Full image as PNG
    () => sharp(imageBuffer).png().toBuffer(),
    // Strategy 2: Enhanced — grayscale + sharpen + high contrast
    () =>
      sharp(imageBuffer)
        .grayscale()
        .sharpen({ sigma: 2 })
        .normalize()
        .png()
        .toBuffer(),
    // Strategy 3: Crop bottom half (barcodes are usually on the bottom)
    () =>
      height > 200
        ? sharp(imageBuffer)
            .extract({ left: 0, top: halfHeight, width, height: halfHeight })
            .sharpen({ sigma: 1.5 })
            .png()
            .toBuffer()
        : null,
    // Strategy 4: Crop bottom-right quadrant
    () =>
      width > 200 && height > 200
        ? sharp(imageBuffer)
            .extract({
              left: halfWidth,
              top: halfHeight,
              width: halfWidth,
              height: halfHeight,
            })
            .sharpen({ sigma: 1.5 })
            .png()
            .toBuffer()
        : null,
  ];

  for (const render of strategies) {
    const pending = render();
    if (pending === null) continue;
    const decoded = await tryDecode(await pending);
    if (decoded) return decoded;
  }
  return null;
}
/**
 * Decode one PNG buffer with zxing-wasm.
 *
 * Takes the first valid symbol, strips every non-digit character from its
 * text and accepts the result only when it is 8 to 14 digits long. Decoder
 * errors are logged as warnings and reported as "nothing found".
 *
 * @returns The digit string, or null.
 */
async function tryDecode(pngBuffer: Buffer): Promise<string | null> {
  try {
    const symbols = await readBarcodes(new Uint8Array(pngBuffer), READER_OPTIONS);
    const best = symbols.find((symbol) => symbol.isValid);
    if (!best) {
      return null;
    }
    const upc = best.text.replace(/\D/g, "");
    const plausibleLength = upc.length >= 8 && upc.length <= 14;
    if (!plausibleLength) {
      return null;
    }
    console.log(`Barcode detected: ${upc} (format: ${best.format})`);
    return upc;
  } catch (err) {
    console.warn(`Barcode decode error: ${err instanceof Error ? err.message : err}`);
    return null;
  }
}
// --- Vision model extraction ---
/**
 * Clean raw model output: strip markdown fences, JS comments, trailing commas.
 * Small models often wrap JSON in ```json ... ``` or add // comments.
 *
 * Comments and trailing commas are only removed OUTSIDE of double-quoted
 * string literals. Each pass matches string literals first and keeps them
 * verbatim, so values such as "http://example.com" or "a,}" survive intact
 * instead of being truncated into invalid JSON.
 *
 * @param raw Text exactly as returned by the model.
 * @returns The text with fences, line comments and trailing commas removed.
 */
function cleanJsonText(raw: string): string {
  // Strip markdown code fences
  const unfenced = raw.replace(/```(?:json)?\s*/gi, "").replace(/```/g, "");

  // Strip single-line JS comments (// ...), leaving string literals untouched.
  // JSON strings cannot contain raw newlines, so a string never spans lines.
  const uncommented = unfenced.replace(
    /"(?:\\.|[^"\\\n])*"|\/\/[^\n]*/g,
    (match: string) => (match.startsWith('"') ? match : "")
  );

  // Strip trailing commas before } or ], again leaving string literals untouched.
  // `closer` is only defined when the comma alternative matched.
  return uncommented.replace(
    /"(?:\\.|[^"\\\n])*"|,\s*([}\]])/g,
    (match: string, closer: string | undefined) => closer ?? match
  );
}
/**
 * Ask the vision model for product details and validate its JSON answer.
 *
 * Never rejects: a failed model call or an unparseable answer is logged and
 * reported as an empty result.
 *
 * Only well-typed fields are copied over: strings are trimmed, `nutrition`
 * must be a plain (non-array) object, and `upc` must be a string or number —
 * anything else would stringify to garbage such as "[object Object]".
 *
 * @param imageBuffer Raw bytes of the uploaded image.
 * @returns The fields the model reported, possibly empty.
 */
async function extractViaVision(imageBuffer: Buffer): Promise<EnrichmentResult> {
  const base64 = imageBuffer.toString("base64");
  let raw: string;
  try {
    raw = await callVision(ENRICHMENT_PROMPT, base64);
  } catch (err) {
    console.error("Vision model call failed:", err instanceof Error ? err.message : err);
    return {};
  }

  // Clean markdown fences, comments, then find JSON object
  const cleaned = cleanJsonText(raw);
  const start = cleaned.indexOf("{");
  const end = cleaned.lastIndexOf("}");
  if (start === -1 || end <= start) {
    console.warn(`Failed to parse enrichment response: ${raw.slice(0, 200)}`);
    return {};
  }

  let data: Record<string, unknown>;
  try {
    data = JSON.parse(cleaned.slice(start, end + 1)) as Record<string, unknown>;
  } catch {
    console.warn(`Failed to parse enrichment JSON: ${raw.slice(0, 200)}`);
    return {};
  }

  const result: EnrichmentResult = {};
  if (data.name && typeof data.name === "string") {
    result.name = data.name.trim();
  }
  if (data.brand && typeof data.brand === "string") {
    result.brand = data.brand.trim();
  }
  if (
    typeof data.nutrition === "object" &&
    data.nutrition !== null &&
    !Array.isArray(data.nutrition)
  ) {
    result.nutrition = data.nutrition as Record<string, unknown>;
  }
  if (data.ingredients_raw && typeof data.ingredients_raw === "string") {
    result.ingredients_raw = data.ingredients_raw.trim();
  }
  if (data.upc && (typeof data.upc === "string" || typeof data.upc === "number")) {
    const upc = String(data.upc).trim();
    if (upc) {
      result.upc = upc;
    }
  }
  return result;
}

80
src/gemini.ts Normal file
View File

@@ -0,0 +1,80 @@
import { config } from "./config";
// Cached result of the dynamic import; the SDK is only loaded when Gemini is first used.
let genaiModule: typeof import("@google/generative-ai") | null = null;

/**
 * Build a Gemini client from the configured API key.
 *
 * @throws Error when GEMINI_API_KEY is not configured.
 */
async function getClient() {
  const apiKey = config.geminiApiKey;
  if (!apiKey) {
    throw new Error("GEMINI_API_KEY not configured");
  }
  if (genaiModule === null) {
    genaiModule = await import("@google/generative-ai");
  }
  const { GoogleGenerativeAI } = genaiModule;
  return new GoogleGenerativeAI(apiKey);
}
/**
 * Guess the MIME type of a base64-encoded image from its leading bytes.
 * Recognises PNG, WebP and HEIC/HEIF; anything else (including JPEG) is
 * reported as "image/jpeg", which was the previously hard-coded value.
 */
function sniffInlineImageMimeType(imageBase64: string): string {
  // 16 base64 characters decode to the first 12 bytes, enough for every signature below.
  const head = Buffer.from(imageBase64.slice(0, 16), "base64").toString("latin1");
  if (head.startsWith("\x89PNG")) return "image/png";
  if (head.startsWith("RIFF") && head.slice(8, 12) === "WEBP") return "image/webp";
  if (head.slice(4, 8) === "ftyp") {
    const brand = head.slice(8, 12);
    if (["heic", "heix", "hevc", "hevx"].includes(brand)) return "image/heic";
    if (brand === "mif1" || brand === "msf1") return "image/heif";
  }
  return "image/jpeg";
}

/**
 * Send an image + prompt to Gemini vision.
 *
 * The image's MIME type is detected from its content rather than assumed to
 * be JPEG, because callers forward uploads in whatever format was received.
 *
 * @param prompt Instruction text for the model.
 * @param imageBase64 Base64-encoded image bytes (no data-URL prefix).
 * @returns The text of the model's response.
 * @throws Error when GEMINI_API_KEY is not configured, or whatever the SDK throws.
 */
export async function geminiVision(prompt: string, imageBase64: string): Promise<string> {
  const ai = await getClient();
  const model = ai.getGenerativeModel({ model: config.geminiModel });
  const result = await model.generateContent([
    prompt,
    {
      inlineData: {
        mimeType: sniffInlineImageMimeType(imageBase64),
        data: imageBase64,
      },
    },
  ]);
  return result.response.text();
}
/**
 * Identify ALL food/grocery products visible in a photo.
 *
 * The model's answer is parsed leniently: first as a JSON array (from the
 * first "[" to the last "]"), then as a single JSON object (first "{" to last
 * "}"). Only entries with a truthy `name` are returned; an unparseable answer
 * is logged and yields an empty list.
 *
 * @param imageBase64 Base64-encoded image bytes.
 */
export async function geminiIdentifyProducts(
  imageBase64: string
): Promise<Record<string, unknown>[]> {
  const prompt = `Identify ALL food and grocery products visible in this photo. \
There may be one product or many (e.g. a grocery haul, a shelf, a receipt).
Return ONLY a JSON array of objects. Each object must have:
- "name": product name (string)
- "brand": brand name if visible, otherwise "" (string). Produce typically has no brand.
- "category": one of "produce", "dairy", "meat", "seafood", "bakery", "snacks", "beverages", "frozen", "pantry", "condiments", "other" (string)
- "is_organic": "yes" or "no" based on visible labels (string)
Return ONLY the JSON array, no other text.`;
  const raw = await geminiVision(prompt, imageBase64);

  // Text from the first `open` to the last `close`, or null when absent.
  const between = (open: string, close: string): string | null => {
    const from = raw.indexOf(open);
    const to = raw.lastIndexOf(close);
    return from !== -1 && to > from ? raw.slice(from, to + 1) : null;
  };

  try {
    const arrayText = between("[", "]");
    if (arrayText !== null) {
      const parsed = JSON.parse(arrayText);
      return (parsed as Record<string, unknown>[]).filter(
        (entry) => typeof entry === "object" && entry !== null && entry.name
      );
    }
  } catch {
    // try single object
  }

  try {
    const objectText = between("{", "}");
    if (objectText !== null) {
      const single = JSON.parse(objectText) as Record<string, unknown>;
      if (single.name) return [single];
    }
  } catch {
    // unparseable
  }

  console.warn(`Failed to parse Gemini product identification: ${raw.slice(0, 200)}`);
  return [];
}

67
src/matching.ts Normal file
View File

@@ -0,0 +1,67 @@
import { embed } from "./embeddings";
import { queryProducts } from "./chroma";
/**
 * A scanned item plus the outcome of the ChromaDB lookup.
 * matchToKnownProducts() initialises the three fields to null / 0 / null and
 * only overwrites them when a hit has confidence above 0.5.
 */
export interface MatchedItem extends Record<string, unknown> {
// `product_id` from the best hit's metadata, or null.
matched_product_id: number | null;
// 1 - distance of the best hit, rounded to three decimals; 0 when unmatched.
match_confidence: number;
// `image_url` from the best hit's metadata, or null.
reference_image_url: string | null;
}
/**
 * For each item, try to match against ChromaDB product_images embeddings.
 * No PostgreSQL — ILIKE fallback happens in the backend.
 *
 * Items are processed one at a time. An item without a name is passed through
 * unmatched; lookup failures are logged at debug level and also leave the
 * item unmatched.
 *
 * @param items Scanned items; `name` and `brand` are used to build the query text.
 * @returns One MatchedItem per input item, in the same order.
 */
export async function matchToKnownProducts(
  items: Record<string, unknown>[]
): Promise<MatchedItem[]> {
  const output: MatchedItem[] = [];

  for (const source of items) {
    const name = String(source.name || "");
    const brand = String(source.brand || "");
    const entry: MatchedItem = {
      ...source,
      matched_product_id: null,
      match_confidence: 0,
      reference_image_url: null,
    };

    if (name) {
      const queryText = [name, brand].filter(Boolean).join(", ");
      try {
        const vector = await embed(queryText);
        const hits = await queryProducts([vector], 3);
        const distances = hits.distances ? hits.distances[0] : undefined;
        if (distances && distances.length > 0) {
          // Distance 0 means identical, so confidence is its complement, floored at 0.
          const confidence = Math.max(0, 1 - distances[0]);
          const meta = hits.metadatas?.[0]?.[0];
          if (confidence > 0.5 && meta) {
            entry.matched_product_id = (meta.product_id as number) || null;
            entry.match_confidence = Math.round(confidence * 1000) / 1000;
            entry.reference_image_url = (meta.image_url as string) || null;
          }
        }
      } catch (err) {
        console.debug(
          `ChromaDB match failed for '${name}':`,
          err instanceof Error ? err.message : err
        );
      }
    }

    output.push(entry);
  }

  return output;
}

91
src/pantry.ts Normal file
View File

@@ -0,0 +1,91 @@
import { tileImage, scanTiles, saveScanImage } from "./tiling";
import { parseJsonItems, deduplicateItems } from "./parsing";
import { matchToKnownProducts } from "./matching";
import { geminiIdentifyProducts } from "./gemini";
// Prompt sent with every image tile of a pantry scan; it asks for a JSON array
// of objects with the keys "name", "brand", "category" and "quantity_desc".
const PANTRY_PROMPT = `Identify ALL food and grocery products visible in this photo of a pantry, fridge, or kitchen.
For each product, extract:
- Product name (as shown on the label)
- Brand (if visible)
- Category (produce/dairy/meat/seafood/bakery/snacks/beverages/frozen/pantry/condiments/other)
- Approximate quantity (e.g. "2 cans", "1 bottle", "half gallon")
Return ONLY a JSON array of objects with keys: "name", "brand", "category", "quantity_desc"
Example: [{"name": "Greek Yogurt", "brand": "Fage", "category": "dairy", "quantity_desc": "2 containers"}]
Return ONLY the JSON array, no other text.`;
// When the local model yields fewer items than this, scanPantryPhoto() tries Gemini.
const MIN_LOCAL_RESULTS = 2;
/** Normalised pantry item as produced by cleanPantryItems(). */
interface PantryItem extends Record<string, unknown> {
// Trimmed product name; never empty.
name: string;
// Trimmed brand, or "" when none was reported.
brand: string;
// Trimmed, lower-cased category; "other" when none was reported.
category: string;
// Free-text quantity; "1" when none was reported.
quantity_desc: string;
}
/**
 * Normalise raw model items into PantryItem records.
 *
 * Non-object entries and entries whose trimmed name is empty are dropped.
 * Every field is stringified and trimmed; category is lower-cased and
 * defaults to "other", quantity_desc defaults to "1", brand to "".
 */
function cleanPantryItems(items: Record<string, unknown>[]): PantryItem[] {
  // Stringify a raw field, substituting `fallback` for falsy values, then trim.
  const text = (value: unknown, fallback: string): string =>
    String(value || fallback).trim();

  return items.flatMap((candidate): PantryItem[] => {
    if (typeof candidate !== "object" || candidate === null) {
      return [];
    }
    const name = text(candidate.name, "");
    if (name === "") {
      return [];
    }
    return [
      {
        name,
        brand: text(candidate.brand, ""),
        category: text(candidate.category, "other").toLowerCase(),
        quantity_desc: text(candidate.quantity_desc, "1"),
      },
    ];
  });
}
/**
 * Scan a pantry/fridge photo and return the products found in it.
 *
 * Steps:
 * 1. Save the upload, tile it and scan every tile with the local vision model.
 * 2. If fewer than MIN_LOCAL_RESULTS items were found, ask Gemini about the
 *    full image and use its answer when it contains more items.
 * 3. Match the items against known products in ChromaDB.
 *
 * Gemini results go through cleanPantryItems() just like local results, so
 * both paths yield trimmed names, lower-cased categories and no nameless items.
 *
 * @param imageBuffer Raw bytes of the uploaded image.
 * @returns `{ scan_id, items, total_found, model_used }`, where each item
 *   carries its position as `index`.
 */
export async function scanPantryPhoto(
  imageBuffer: Buffer
): Promise<Record<string, unknown>> {
  const { scanId } = saveScanImage(imageBuffer, "pantry");
  let modelUsed = "local";

  // Step 1: Tile and scan with local model
  const tiles = await tileImage(imageBuffer);
  const rawTexts = await scanTiles(tiles, PANTRY_PROMPT);
  let items: Record<string, unknown>[] = cleanPantryItems(parseJsonItems(rawTexts));
  items = deduplicateItems(items);

  // Step 2: Gemini fallback if too few results
  if (items.length < MIN_LOCAL_RESULTS) {
    console.log(`Local model found ${items.length} pantry items, falling back to Gemini`);
    try {
      const fullBase64 = imageBuffer.toString("base64");
      const geminiItems = cleanPantryItems(await geminiIdentifyProducts(fullBase64));
      if (geminiItems.length > items.length) {
        items = geminiItems;
        modelUsed = "gemini";
      }
    } catch (err) {
      // Best effort: keep whatever the local model found.
      console.error("Gemini fallback failed for pantry scan:", err instanceof Error ? err.message : err);
    }
  }

  // Step 3: Match against known products (ChromaDB only)
  const matched = await matchToKnownProducts(items);

  // Add index
  const indexed = matched.map((item, i) => ({ ...item, index: i }));
  console.log(`Pantry scan ${scanId}: ${indexed.length} products via ${modelUsed}`);
  return {
    scan_id: scanId,
    items: indexed,
    total_found: indexed.length,
    model_used: modelUsed,
  };
}

84
src/parsing.ts Normal file
View File

@@ -0,0 +1,84 @@
/** True for non-null, non-array objects. */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Parse the text between the first `open` and the last `close` as JSON.
 * Returns undefined when the delimiters are missing or the slice is not valid JSON.
 */
function parseDelimited(text: string, open: string, close: string): unknown {
  const start = text.indexOf(open);
  const end = text.lastIndexOf(close);
  if (start === -1 || end <= start) return undefined;
  try {
    return JSON.parse(text.slice(start, end + 1));
  } catch {
    return undefined;
  }
}

/**
 * Extract JSON arrays from raw model output texts, merge into single list.
 *
 * Each text is first read as a JSON array of objects. When that yields no
 * objects — including the case where the brackets belong to an array nested
 * inside a single object, e.g. {"name": "x", "tags": ["a"]} — the text is read
 * as a single JSON object instead, which is kept only if it has a truthy
 * `name`. Unparseable texts contribute nothing.
 *
 * @param rawTexts Raw model outputs, one per scanned tile/image.
 * @returns All objects found, in input order.
 */
export function parseJsonItems(rawTexts: string[]): Record<string, unknown>[] {
  const allItems: Record<string, unknown>[] = [];
  for (const text of rawTexts) {
    // Try array first
    const asArray = parseDelimited(text, "[", "]");
    const objects = Array.isArray(asArray) ? asArray.filter(isPlainObject) : [];
    if (objects.length > 0) {
      allItems.push(...objects);
      continue;
    }

    // Try single object
    const asObject = parseDelimited(text, "{", "}");
    if (isPlainObject(asObject) && asObject.name) {
      allItems.push(asObject);
    }
  }
  return allItems;
}
/**
 * Lower-cased, whitespace-separated tokens of a name.
 * Empty tokens (produced by leading/trailing whitespace or an empty string)
 * are discarded so they cannot distort the overlap ratio computed by callers.
 */
function tokenSet(name: string): Set<string> {
  return new Set(
    name
      .toLowerCase()
      .split(/\s+/)
      .filter((token) => token.length > 0)
  );
}
/**
 * Remove near-duplicate items based on token overlap (>80% = duplicate).
 *
 * Overlap is the number of shared tokens divided by the size of the larger
 * token set. The first occurrence is kept; items whose `key` value is empty
 * after trimming are dropped.
 *
 * @param items Items to filter.
 * @param key Property holding the text to compare (default "name").
 */
export function deduplicateItems(
  items: Record<string, unknown>[],
  key: string = "name"
): Record<string, unknown>[] {
  const kept: Record<string, unknown>[] = [];

  for (const candidate of items) {
    const label = String(candidate[key] || "").trim();
    if (label === "") continue;

    const candidateTokens = tokenSet(label);
    const alreadySeen = kept.some((earlier) => {
      const earlierTokens = tokenSet(String(earlier[key] || ""));
      if (candidateTokens.size === 0 || earlierTokens.size === 0) {
        return false;
      }
      let shared = 0;
      for (const token of candidateTokens) {
        if (earlierTokens.has(token)) shared += 1;
      }
      return shared / Math.max(candidateTokens.size, earlierTokens.size) > 0.8;
    });

    if (!alreadySeen) {
      kept.push(candidate);
    }
  }

  return kept;
}

176
src/server.ts Normal file
View File

@@ -0,0 +1,176 @@
import "dotenv/config";
import express, { Request, Response, NextFunction } from "express";
import multer from "multer";
import { config } from "./config";
import { scanShelfPhoto } from "./shelf";
import { scanPantryPhoto } from "./pantry";
import { extractProductInfo } from "./enrich";
import { getCount } from "./chroma";
// MIME types accepted by the upload fileFilter below; any other type is rejected.
// NOTE(review): this checks the client-declared mimetype only, not the file content.
const ALLOWED_MIMES = new Set([
"image/jpeg",
"image/png",
"image/webp",
"image/gif",
"image/heic",
"image/heif",
]);
const app = express();

// Multipart parser: keeps the single uploaded file in memory, enforces the
// configured size limit and rejects MIME types outside ALLOWED_MIMES.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: config.maxFileSize, files: 1 },
  fileFilter: (_req, file, cb) => {
    if (!ALLOWED_MIMES.has(file.mimetype)) {
      cb(new Error("Invalid file type. Allowed: JPEG, PNG, WebP, GIF, HEIC, HEIF"));
      return;
    }
    cb(null, true);
  },
});
// --- Health check ---
// Probes the vision server, Ollama and ChromaDB in turn and reports each one's
// state. The overall status becomes "degraded" when a probe throws; a non-2xx
// HTTP answer is reported as `status <code>` without changing the overall status.
app.get("/health", async (_req: Request, res: Response) => {
  const report: Record<string, unknown> = {
    status: "ok",
    vision_model: config.visionAiModel,
  };

  // GET `url` with a 5 second timeout and record the outcome under `field`.
  const probe = async (field: string, url: string): Promise<void> => {
    try {
      const reply = await fetch(url, { signal: AbortSignal.timeout(5000) });
      report[field] = reply.ok ? "connected" : `status ${reply.status}`;
    } catch (err) {
      report[field] = `unreachable: ${err instanceof Error ? err.message : err}`;
      report.status = "degraded";
    }
  };

  // Check vision model
  await probe("vision", config.visionAiUrl.replace("/v1/chat/completions", "/v1/models"));

  // Check Ollama
  await probe("ollama", `${config.ollamaHost}/api/tags`);

  // Check ChromaDB
  try {
    const count = await getCount();
    report.chroma = "connected";
    report.chroma_count = count;
  } catch (err) {
    report.chroma = `unreachable: ${err instanceof Error ? err.message : err}`;
    report.status = "degraded";
  }

  res.json(report);
});
// --- Scan endpoints ---
// POST /scan/shelf — multipart upload with an `image` file and a `store_name` field.
// Responds 400 when either is missing; processing errors go to the error handler.
app.post(
  "/scan/shelf",
  upload.single("image"),
  async (req: Request, res: Response, next: NextFunction) => {
    try {
      const file = req.file;
      if (!file) {
        res.status(400).json({ error: "No image provided" });
        return;
      }
      // Body fields arrive untyped: accept only a non-blank string so that
      // repeated fields (arrays) or whitespace-only names are rejected up front.
      const rawStoreName: unknown = req.body?.store_name;
      const storeName = typeof rawStoreName === "string" ? rawStoreName.trim() : "";
      if (!storeName) {
        res.status(400).json({ error: "store_name is required" });
        return;
      }
      const result = await scanShelfPhoto(file.buffer, storeName);
      res.json(result);
    } catch (err) {
      next(err);
    }
  }
);
// POST /scan/pantry — multipart upload with an `image` file.
// Responds 400 without an image; processing errors go to the error handler.
app.post(
  "/scan/pantry",
  upload.single("image"),
  async (req: Request, res: Response, next: NextFunction) => {
    try {
      const image = req.file;
      if (image) {
        res.json(await scanPantryPhoto(image.buffer));
      } else {
        res.status(400).json({ error: "No image provided" });
      }
    } catch (err) {
      next(err);
    }
  }
);
// --- Product enrichment ---
// POST /enrich/product — multipart upload with an `image` file.
// Responds 400 without an image; processing errors go to the error handler.
app.post(
  "/enrich/product",
  upload.single("image"),
  async (req: Request, res: Response, next: NextFunction) => {
    try {
      const image = req.file;
      if (image) {
        res.json(await extractProductInfo(image.buffer));
      } else {
        res.status(400).json({ error: "No image provided" });
      }
    } catch (err) {
      next(err);
    }
  }
);
// --- 404 ---
// Registered after every route, so it only sees requests nothing else handled.
app.use((_request: Request, response: Response) => {
  const body = { error: "Not found" };
  response.status(404).json(body);
});
// --- Error handler ---
// Maps upload problems to client errors and everything else to 502:
//   - file too large           -> 413
//   - any other multer error   -> 400 (e.g. unexpected field, too many files)
//   - rejected MIME type       -> 400
//   - processing failure       -> 502
app.use((err: Error, _req: Request, res: Response, _next: NextFunction) => {
  // Anything can be thrown in JavaScript; never assume `err.message` exists.
  const message = err instanceof Error ? err.message : String(err);

  if (err instanceof multer.MulterError) {
    if (err.code === "LIMIT_FILE_SIZE") {
      res.status(413).json({ error: `File too large. Maximum size is ${config.maxFileSize / 1024 / 1024}MB.` });
      return;
    }
    // Remaining multer errors are malformed uploads, i.e. the caller's fault.
    res.status(400).json({ error: message });
    return;
  }
  if (message.startsWith("Invalid file type")) {
    res.status(400).json({ error: message });
    return;
  }
  console.error("Scan error:", message);
  res.status(502).json({ error: "Failed to process image" });
});
// --- Start ---
// Listen on all interfaces and log the effective backend configuration once.
app.listen(config.port, "0.0.0.0", () => {
  const banner = [
    `Vision Scanner Service listening on port ${config.port}`,
    `Vision AI: ${config.visionAiUrl} (model: ${config.visionAiModel})`,
    `Ollama: ${config.ollamaHost} (embed: ${config.ollamaEmbedModel})`,
    `ChromaDB: ${config.chromaHost}`,
  ];
  for (const line of banner) {
    console.log(line);
  }
});

103
src/shelf.ts Normal file
View File

@@ -0,0 +1,103 @@
import { tileImage, scanTiles, saveScanImage } from "./tiling";
import { parseJsonItems, deduplicateItems } from "./parsing";
import { matchToKnownProducts } from "./matching";
import { geminiVision } from "./gemini";
// Prompt sent with every image tile to the local vision model; it asks for a JSON
// array of objects with the keys "name", "price", "quantity", "unit" and "unit_price".
const SHELF_PROMPT = `Look at this photo of a store shelf or price display. For each product visible, extract:
- Product name (as shown on the label/tag)
- Price (the number on the price tag)
- Size/weight if visible (e.g. "32 oz", "5 lb", "1 gal")
- Unit price if shown on the tag (e.g. "$0.25/oz")
Return ONLY a JSON array of objects with keys: "name", "price", "quantity", "unit", "unit_price"
Example: [{"name": "Kirkland Organic Eggs", "price": 7.99, "quantity": 24, "unit": "ct", "unit_price": null}]
Return ONLY the JSON array, no other text.`;
// Prompt used for the Gemini fallback on the full image; same keys as above,
// without the example and with an added request for precise numbers.
const GEMINI_SHELF_PROMPT = `Look at this photo of a store shelf or price display. For each product you can see, extract:
- Product name (as shown on the label/tag)
- Price (the number on the price tag)
- Size/weight if visible (e.g. "32 oz", "5 lb", "1 gal")
- Unit price if shown on the tag (e.g. "$0.25/oz")
Return ONLY a JSON array of objects with keys: "name", "price", "quantity", "unit", "unit_price"
Be precise with numbers. Return ONLY the JSON array, no other text.`;
// When the local model yields fewer items than this, scanShelfPhoto() tries Gemini.
const MIN_LOCAL_RESULTS = 2;
/** Normalised shelf item as produced by cleanShelfItems(). */
interface ShelfItem extends Record<string, unknown> {
// Trimmed product name; never empty.
name: string;
// Price as a number; items without a positive price are discarded.
price: number;
// Numeric quantity; 1 when missing or not numeric.
quantity: number;
// Unit label; "unit" when missing.
unit: string;
// Unit price as a number, or null when none was reported.
unit_price: number | null;
// Store name passed in by the caller, copied onto every item.
store_name: string;
}
/**
 * Convert a model-reported amount into a number.
 *
 * Anything Number() already understands is returned as-is. Otherwise a string
 * of the form "[currency symbol] digits[.digits]", optionally followed by a
 * "/unit" suffix, is accepted — e.g. "$7.99", "$0.25/oz", "1,299.00".
 * Ambiguous text such as "7,99" or "2 for $5" yields NaN rather than a guess.
 */
function parseAmount(value: unknown): number {
  const direct = Number(value);
  if (!Number.isNaN(direct)) return direct;
  if (typeof value !== "string") return NaN;
  const match = value.match(
    /^\s*[$€£¥]?\s*((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?|\.\d+)\s*(?:\/.*)?$/
  );
  return match ? Number(match[1].replace(/,/g, "")) : NaN;
}

/**
 * Normalise raw model items into ShelfItem records.
 *
 * Non-object entries, entries without a name and entries without a positive,
 * finite price are dropped. Prices and unit prices may be numbers or strings
 * with a currency symbol; a unit price that cannot be read becomes null
 * instead of NaN.
 *
 * @param items Raw objects parsed from the model output.
 * @param storeName Store name copied onto every item.
 */
function cleanShelfItems(
  items: Record<string, unknown>[],
  storeName: string
): ShelfItem[] {
  const cleaned: ShelfItem[] = [];
  for (const item of items) {
    if (typeof item !== "object" || item === null) continue;
    const name = String(item.name || "").trim();
    const price = parseAmount(item.price);
    if (!name || Number.isNaN(price) || price <= 0) continue;

    const unitPrice = item.unit_price ? parseAmount(item.unit_price) : NaN;
    cleaned.push({
      name,
      price,
      quantity: Number(item.quantity) || 1,
      unit: String(item.unit || "unit"),
      unit_price: Number.isNaN(unitPrice) ? null : unitPrice,
      store_name: storeName,
    });
  }
  return cleaned;
}
/**
 * Scan a store-shelf photo and return the priced products found in it.
 *
 * The upload is saved, tiled and scanned with the local vision model. When
 * fewer than MIN_LOCAL_RESULTS items come back, Gemini is asked about the full
 * image and its answer is used if it contains more items. Finally the items
 * are matched against known products in ChromaDB.
 *
 * @param imageBuffer Raw bytes of the uploaded image.
 * @param storeName Store name attached to every item.
 * @returns `{ scan_id, items, total_found, model_used }`, where each item
 *   carries its position as `index`.
 */
export async function scanShelfPhoto(
  imageBuffer: Buffer,
  storeName: string
): Promise<Record<string, unknown>> {
  const { scanId } = saveScanImage(imageBuffer, "shelf");

  // Step 1: Tile and scan with local model
  const tiles = await tileImage(imageBuffer);
  const tileTexts = await scanTiles(tiles, SHELF_PROMPT);
  let products: Record<string, unknown>[] = deduplicateItems(
    cleanShelfItems(parseJsonItems(tileTexts), storeName)
  );
  let modelUsed = "local";

  // Step 2: Gemini fallback if too few results
  if (products.length < MIN_LOCAL_RESULTS) {
    console.log(`Local model found ${products.length} items, falling back to Gemini`);
    try {
      const answer = await geminiVision(GEMINI_SHELF_PROMPT, imageBuffer.toString("base64"));
      const fromGemini = cleanShelfItems(parseJsonItems([answer]), storeName);
      if (fromGemini.length > products.length) {
        products = fromGemini;
        modelUsed = "gemini";
      }
    } catch (err) {
      console.error("Gemini fallback failed for shelf scan:", err instanceof Error ? err.message : err);
    }
  }

  // Step 3: Match against known products (ChromaDB only)
  const matched = await matchToKnownProducts(products);

  // Add index
  const indexed = matched.map((entry, position) => ({ ...entry, index: position }));
  console.log(`Shelf scan ${scanId}: ${indexed.length} products via ${modelUsed}`);

  return {
    scan_id: scanId,
    items: indexed,
    total_found: indexed.length,
    model_used: modelUsed,
  };
}

100
src/tiling.ts Normal file
View File

@@ -0,0 +1,100 @@
import sharp from "sharp";
import * as fs from "fs";
import * as path from "path";
import * as crypto from "crypto";
import { config } from "./config";
import { callVision } from "./vision";
/** One image tile produced by tileImage(). */
export interface TileResult {
// JPEG-encoded tile bytes.
buffer: Buffer;
// The same bytes as a base64 string, which is what scanTiles() passes to callVision().
base64: string;
}
/**
 * Split image into grid tiles with overlap, returning JPEG buffers.
 *
 * Tiles are produced row by row. Each tile is widened by `overlap` times the
 * tile size on every side (clamped to the image), and the last column/row
 * always extends to the image edge so that pixels left over by the integer
 * division are never dropped. Degenerate (zero-area) tiles, which can occur
 * for images smaller than the grid, are skipped instead of failing the crop.
 *
 * @param imageBuffer Encoded source image.
 * @param grid `[columns, rows]` of the grid (default 2x2).
 * @param overlap Fraction of the tile size added on each side (default 0.10).
 * @returns The tiles, or an empty list when the image dimensions are unknown.
 */
export async function tileImage(
  imageBuffer: Buffer,
  grid: [number, number] = [2, 2],
  overlap: number = 0.10
): Promise<TileResult[]> {
  const metadata = await sharp(imageBuffer).metadata();
  const w = metadata.width || 0;
  const h = metadata.height || 0;
  if (w === 0 || h === 0) return [];

  const [cols, rows] = grid;
  const tileW = Math.floor(w / cols);
  const tileH = Math.floor(h / rows);
  const overlapX = Math.floor(tileW * overlap);
  const overlapY = Math.floor(tileH * overlap);

  // Crop one region and encode it as JPEG.
  const cropTile = async (
    left: number,
    top: number,
    width: number,
    height: number
  ): Promise<TileResult> => {
    const buffer = await sharp(imageBuffer)
      .extract({ left, top, width, height })
      .jpeg({ quality: 85 })
      .toBuffer();
    return { buffer, base64: buffer.toString("base64") };
  };

  const pending: Promise<TileResult>[] = [];
  for (let row = 0; row < rows; row++) {
    for (let col = 0; col < cols; col++) {
      const x1 = Math.max(0, col * tileW - overlapX);
      const y1 = Math.max(0, row * tileH - overlapY);
      const x2 = col === cols - 1 ? w : Math.min(w, (col + 1) * tileW + overlapX);
      const y2 = row === rows - 1 ? h : Math.min(h, (row + 1) * tileH + overlapY);
      if (x2 <= x1 || y2 <= y1) continue;
      pending.push(cropTile(x1, y1, x2 - x1, y2 - y1));
    }
  }

  // All crops were started above and run concurrently.
  return Promise.all(pending);
}
/**
 * Send all tiles to vision model in parallel (throttled), return raw text per tile.
 *
 * Tiles are processed in batches of config.maxConcurrentTiles. A tile whose
 * request fails is logged and skipped, and blank answers are dropped, so the
 * result may contain fewer entries than there were tiles.
 *
 * @param tiles Tiles from tileImage().
 * @param prompt Prompt sent with every tile.
 * @returns The non-blank model answers, in tile order.
 */
export async function scanTiles(
  tiles: TileResult[],
  prompt: string
): Promise<string[]> {
  // Guard the batch size: 0 would make the loop below spin forever and NaN
  // would silently skip every tile. Fall back to one tile at a time.
  const configured = Math.floor(config.maxConcurrentTiles);
  const maxConcurrent = Number.isFinite(configured) && configured >= 1 ? configured : 1;
  const results: string[] = [];

  // Process in batches to throttle GPU usage
  for (let i = 0; i < tiles.length; i += maxConcurrent) {
    const batch = tiles.slice(i, i + maxConcurrent);
    const batchResults = await Promise.all(
      batch.map(async (tile) => {
        try {
          return await callVision(prompt, tile.base64);
        } catch (err) {
          console.warn("Vision tile scan failed:", err instanceof Error ? err.message : err);
          return "";
        }
      })
    );
    results.push(...batchResults);
  }

  return results.filter((r) => r.trim().length > 0);
}
/**
 * Save uploaded image to disk. Returns scan_id and file_path.
 *
 * The file is written synchronously to `<uploadDir>/<scanType>/<scanId>.jpg`;
 * the directory is created when missing. The bytes are stored unchanged, and
 * the ".jpg" extension is used regardless of the actual image format.
 *
 * @param imageBuffer Raw bytes of the uploaded image.
 * @param scanType Sub-directory name for this kind of scan.
 * @returns The generated scan id (first 12 characters of a random UUID) and the path written.
 */
export function saveScanImage(
  imageBuffer: Buffer,
  scanType: string
): { scanId: string; filePath: string } {
  const targetDir = path.join(config.uploadDir, scanType);
  const scanId = crypto.randomUUID().slice(0, 12);
  const filePath = path.join(targetDir, `${scanId}.jpg`);

  fs.mkdirSync(targetDir, { recursive: true });
  fs.writeFileSync(filePath, imageBuffer);

  return { scanId, filePath };
}

46
src/vision.ts Normal file
View File

@@ -0,0 +1,46 @@
import { config } from "./config";
/**
 * Guess the MIME type of a base64-encoded image from its leading bytes.
 * Recognises PNG, GIF, WebP and HEIC/HEIF; anything else (including JPEG) is
 * reported as "image/jpeg", which was the previously hard-coded value.
 */
function detectImageMime(imageBase64: string): string {
  // 16 base64 characters decode to the first 12 bytes, enough for every signature below.
  const head = Buffer.from(imageBase64.slice(0, 16), "base64").toString("latin1");
  if (head.startsWith("\x89PNG")) return "image/png";
  if (head.startsWith("GIF8")) return "image/gif";
  if (head.startsWith("RIFF") && head.slice(8, 12) === "WEBP") return "image/webp";
  if (head.slice(4, 8) === "ftyp") {
    const brand = head.slice(8, 12);
    if (["heic", "heix", "hevc", "hevx"].includes(brand)) return "image/heic";
    if (brand === "mif1" || brand === "msf1") return "image/heif";
  }
  return "image/jpeg";
}

/**
 * Call local vision model via OpenAI-compatible API.
 *
 * The image is sent as a data URL whose MIME type is detected from the image
 * content, since callers pass both re-encoded JPEG tiles and raw uploads. The
 * request is aborted after config.visionAiTimeout milliseconds.
 *
 * @param prompt Instruction text for the model.
 * @param imageBase64 Base64-encoded image bytes (no data-URL prefix).
 * @returns The trimmed content of the first choice, or "" when the response has none.
 * @throws Error when the server answers with a non-2xx status or the request is aborted.
 */
export async function callVision(prompt: string, imageBase64: string): Promise<string> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), config.visionAiTimeout);
  try {
    const response = await fetch(config.visionAiUrl, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      signal: controller.signal,
      body: JSON.stringify({
        model: config.visionAiModel,
        messages: [
          {
            role: "user",
            content: [
              { type: "text", text: prompt },
              {
                type: "image_url",
                image_url: {
                  url: `data:${detectImageMime(imageBase64)};base64,${imageBase64}`,
                },
              },
            ],
          },
        ],
        max_tokens: 2048,
      }),
    });
    if (!response.ok) {
      throw new Error(`Vision AI returned ${response.status}`);
    }
    const data = (await response.json()) as {
      choices?: { message?: { content?: string } }[];
    };
    return data.choices?.[0]?.message?.content?.trim() || "";
  } finally {
    clearTimeout(timeout);
  }
}