// On-demand proxy in front of a FastFlowLM ("flm.exe serve") model server.
//
// The proxy listens on HOST:PROXY_PORT. The first proxied request spawns the
// model server on 127.0.0.1:FLM_PORT, waits until it answers /v1/models, and
// then forwards traffic to it. After IDLE_TIMEOUT_MS without traffic the model
// server is killed again. Two local endpoints are handled by the proxy itself:
// /status (state as JSON) and /stop (kill the model server now).

const http = require("http");
const { spawn, execSync } = require("child_process");

const FLM_PATH = "C:\\Users\\sshuser\\FastFlowLM\\flm.exe";
const MODEL = "qwen2.5vl-it:3b";
const HOST = "0.0.0.0";
const PROXY_PORT = 8000;
const FLM_PORT = 8001;
const IDLE_TIMEOUT_MS = 5 * 60 * 1000; // 5 minutes
const STARTUP_TIMEOUT_MS = 60 * 1000; // give up on a model start after 60s
const POLL_INTERVAL_MS = 1000; // readiness probe period (and per-probe timeout)

let flmProcess = null; // ChildProcess of the running model server, or null
let idleTimer = null; // pending idle-shutdown timer, or null
let starting = false; // true while a startup attempt is in flight
let ready = false; // true once the model server answered the readiness probe
let startupPromise = null; // promise shared by every caller of the in-flight startup
let abortStartup = null; // settles the in-flight startup with an error, or null
let activeRequests = 0; // proxied requests that have not finished yet

/**
 * Writes a message to stdout, prefixed with the local wall-clock time.
 * @param {string} msg - Text to log.
 */
function log(msg) {
  console.log(`[${new Date().toLocaleTimeString()}] ${msg}`);
}

/**
 * (Re)arms the idle-shutdown timer. When it fires with no request in flight
 * the model server is stopped; if requests are still running the timer simply
 * re-arms so that a long response is not cut off underneath the client.
 */
function resetIdleTimer() {
  if (idleTimer) clearTimeout(idleTimer);
  idleTimer = setTimeout(() => {
    if (activeRequests > 0) {
      resetIdleTimer();
      return;
    }
    log("Idle timeout reached. Stopping model...");
    stopFlm();
  }, IDLE_TIMEOUT_MS);
}

/**
 * Stops the model server (if any), cancels a startup that is still in flight
 * and clears the idle timer. Safe to call when nothing is running.
 */
function stopFlm() {
  ready = false;
  starting = false;
  if (idleTimer) {
    clearTimeout(idleTimer);
    idleTimer = null;
  }
  // Release callers that are still awaiting startFlm().
  if (abortStartup) {
    abortStartup(new Error("Model was stopped before it finished starting"));
  }
  if (flmProcess) {
    try {
      // NOTE: kills by image name, i.e. every flm.exe on the machine.
      execSync("taskkill /IM flm.exe /F", { stdio: "ignore" });
    } catch (err) {
      // Best effort: taskkill exits non-zero when no matching process exists.
      log(`taskkill did not succeed: ${err.message}`);
    }
    flmProcess = null;
    log("Model stopped. RAM freed.");
  }
}

/**
 * Ensures the model server is running and answering requests.
 *
 * Concurrent callers share a single startup attempt. The attempt fails when
 * the process cannot be launched, exits before becoming ready, is stopped via
 * stopFlm(), or does not answer within STARTUP_TIMEOUT_MS.
 *
 * @returns {Promise<void>} Resolves once GET /v1/models on the model server
 *   returned 200; rejects with an Error describing why startup failed.
 */
function startFlm() {
  if (ready) return Promise.resolve();
  if (startupPromise) return startupPromise;

  let resolveStartup;
  let rejectStartup;
  const attempt = new Promise((resolve, reject) => {
    resolveStartup = resolve;
    rejectStartup = reject;
  });
  startupPromise = attempt;
  starting = true;
  log("Starting model on NPU...");

  let pollTimer = null;
  let deadlineTimer = null;
  let settled = false;

  // Single exit point of this attempt; later calls (late probes, exit events
  // of an already-cancelled child) are ignored.
  const finish = (err) => {
    if (settled) return;
    settled = true;
    clearInterval(pollTimer);
    clearTimeout(deadlineTimer);
    abortStartup = null;
    startupPromise = null;
    starting = false;
    if (err) {
      rejectStartup(err);
      return;
    }
    ready = true;
    log("Model ready!");
    resolveStartup();
  };
  abortStartup = finish;

  let child;
  try {
    child = spawn(
      FLM_PATH,
      [
        "serve", MODEL,
        "--host", "127.0.0.1",
        "--port", String(FLM_PORT),
        "--pmode", "performance",
      ],
      { stdio: ["pipe", "pipe", "pipe"] }
    );
  } catch (err) {
    finish(err);
    return attempt;
  }
  flmProcess = child;

  // stdout is piped but unused: drain it so the child never blocks on a full pipe.
  child.stdout?.resume();
  child.stderr?.on("data", (d) => {
    const s = d.toString();
    if (s.includes("ERROR")) log("FLM: " + s.trim());
  });

  // Without this listener a failed launch (e.g. missing executable) would be
  // an unhandled 'error' event and take the whole proxy down.
  child.on("error", (err) => {
    log(`FLM process error: ${err.message}`);
    finish(new Error(`Failed to launch model: ${err.message}`));
    if (flmProcess === child) flmProcess = null;
  });

  child.on("exit", (code) => {
    log(`FLM exited (code ${code})`);
    finish(new Error(`Model process exited during startup (code ${code})`));
    // A newer child may already have replaced this one; leave its state alone.
    if (flmProcess !== child) return;
    flmProcess = null;
    ready = false;
    starting = false;
  });

  // Poll until the server responds
  pollTimer = setInterval(() => {
    const probe = http.get(`http://127.0.0.1:${FLM_PORT}/v1/models`, (res) => {
      res.resume(); // discard the body so the socket is released
      if (res.statusCode === 200) finish();
    });
    probe.on("error", () => {}); // expected while the server is not listening yet
    probe.setTimeout(POLL_INTERVAL_MS, () => probe.destroy());
  }, POLL_INTERVAL_MS);

  // Timeout after 60s
  deadlineTimer = setTimeout(() => {
    finish(new Error(`Model failed to start within ${STARTUP_TIMEOUT_MS / 1000}s`));
    stopFlm(); // do not leave a half-started process behind
  }, STARTUP_TIMEOUT_MS);

  return attempt;
}

/**
 * Forwards one client request to the model server and streams the answer back.
 * Backend failures before any response byte was sent are reported as 502 JSON;
 * failures mid-stream tear the client connection down instead.
 *
 * @param {http.IncomingMessage} clientReq - Incoming request to forward.
 * @param {http.ServerResponse} clientRes - Response to stream the answer into.
 */
function proxy(clientReq, clientRes) {
  const options = {
    hostname: "127.0.0.1",
    port: FLM_PORT,
    path: clientReq.url,
    method: clientReq.method,
    headers: clientReq.headers,
  };

  let clientClosed = false;

  const proxyReq = http.request(options, (proxyRes) => {
    clientRes.writeHead(proxyRes.statusCode, proxyRes.headers);
    proxyRes.pipe(clientRes);
  });

  proxyReq.on("error", (e) => {
    if (clientClosed) return; // nobody left to tell
    if (clientRes.headersSent) {
      // Status line already went out; writeHead() would throw here.
      clientRes.destroy();
      return;
    }
    clientRes.writeHead(502, { "Content-Type": "application/json" });
    clientRes.end(JSON.stringify({ error: "Model backend error: " + e.message }));
  });

  // Stop the backend request when the client is done or has gone away.
  clientRes.on("close", () => {
    clientClosed = true;
    proxyReq.destroy();
  });

  clientReq.pipe(proxyReq);
}

const server = http.createServer(async (req, res) => {
  // CORS headers
  res.setHeader("Access-Control-Allow-Origin", "*");
  res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
  res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");

  if (req.method === "OPTIONS") {
    res.writeHead(204);
    res.end();
    return;
  }

  // Status endpoint
  if (req.url === "/status") {
    res.writeHead(200, { "Content-Type": "application/json" });
    res.end(JSON.stringify({ model: MODEL, ready, starting, pid: flmProcess?.pid ?? null }));
    return;
  }

  // Stop endpoint
  if (req.url === "/stop") {
    stopFlm();
    res.writeHead(200, { "Content-Type": "application/json" });
    res.end(JSON.stringify({ status: "stopped" }));
    return;
  }

  // Everything else is proxied. Track it so the idle timer cannot stop the
  // model while this response is still being produced.
  let clientGone = false;
  activeRequests += 1;
  res.once("close", () => {
    clientGone = true;
    activeRequests -= 1;
    if (ready) resetIdleTimer(); // idle time counts from the end of the response
  });

  try {
    resetIdleTimer();
    if (!ready) {
      log(`Request received. Waking up model...`);
      await startFlm();
    }
    if (clientGone) return; // client hung up while the model was starting
    proxy(req, res);
  } catch (e) {
    if (res.headersSent) {
      res.destroy();
      return;
    }
    res.writeHead(503, { "Content-Type": "application/json" });
    res.end(JSON.stringify({ error: e.message }));
  }
});

server.listen(PROXY_PORT, HOST, () => {
  log(`Proxy listening on ${HOST}:${PROXY_PORT}`);
  log(`Model will auto-start on first request, auto-stop after ${IDLE_TIMEOUT_MS / 60000}m idle`);
  log(`Endpoints: /status, /stop`);
});

// Make sure the model server does not outlive the proxy.
process.on("SIGINT", () => {
  stopFlm();
  process.exit();
});
process.on("SIGTERM", () => {
  stopFlm();
  process.exit();
});