feat(image-processor): switch to OpenRouter Vision for smart crop and remove heavy models

2026-02-22 23:24:22 +01:00
parent 1bbe89c879
commit 2a169f1dfc
4 changed files with 113 additions and 89 deletions
--- a/packages/image-processor/src/processor.ts
+++ b/packages/image-processor/src/processor.ts
@@ -1,51 +1,102 @@
-import * as faceapi from "@vladmandic/face-api";
-// Provide Canvas fallback for face-api in Node.js
-import { Canvas, Image, ImageData } from "canvas";
 import sharp from "sharp";
-import * as path from "node:path";
-import { fileURLToPath } from "node:url";
-
-// @ts-expect-error FaceAPI does not have type definitions for monkeyPatch
-faceapi.env.monkeyPatch({ Canvas, Image, ImageData });
-
-const __filename = fileURLToPath(import.meta.url);
-const __dirname = path.dirname(__filename);
-
-// Path to the downloaded models
-const MODELS_PATH = path.join(__dirname, "..", "models");
-
-let isModelsLoaded = false;
-
-async function loadModels() {
-  if (isModelsLoaded) return;
-  await faceapi.nets.tinyFaceDetector.loadFromDisk(MODELS_PATH);
-  isModelsLoaded = true;
-}

 export interface ProcessImageOptions {
  width: number;
  height: number;
  format?: "webp" | "jpeg" | "png" | "avif";
  quality?: number;
+  openRouterApiKey?: string;
+}
+
+interface FaceDetection {
+  x: number;
+  y: number;
+  width: number;
+  height: number;
+}
+
+/**
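+ * Detects faces via the OpenRouter chat-completions API; the model is asked for normalized (0 to 1) boxes, and any failure is logged and yields [].
+ * Sends a downscaled JPEG preview (at most 512px per side) instead of the original to save bandwidth and tokens.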
+ */
+async function detectFacesWithCloud(
+  inputBuffer: Buffer,
+  apiKey: string,
+): Promise<FaceDetection[]> {
+  try {
+    // Generate a small preview for vision API (max 512px)
+    const preview = await sharp(inputBuffer)
+      .resize(512, 512, { fit: "inside" })
+      .jpeg({ quality: 60 })
+      .toBuffer();
+
+    const base64Image = preview.toString("base64");
+
+    const response = await fetch(
+      "https://openrouter.ai/api/v1/chat/completions",
+      {
+        method: "POST",
+        headers: {
+          Authorization: `Bearer ${apiKey}`,
+          "Content-Type": "application/json",
+          "HTTP-Referer": "https://mintel.me",
+          "X-Title": "Mintel Image Service",
+        },
+        body: JSON.stringify({
+          model: "google/gemini-2.0-flash-001", // Fast, cheap, and supports vision
+          messages: [
+            {
+              role: "user",
+              content: [
+                {
+                  type: "text",
+                  text: 'Detect all human faces in this image. Return ONLY a JSON array of bounding boxes like: [{"x": 0.1, "y": 0.2, "width": 0.05, "height": 0.05}]. Coordinates must be normalized (0 to 1). If no faces, return [].',
+                },
+                {
+                  type: "image_url",
+                  image_url: {
+                    url: `data:image/jpeg;base64,${base64Image}`,
+                  },
+                },
+              ],
+            },
+          ],
+          response_format: { type: "json_object" },
+        }),
+      },
+    );
+
+    if (!response.ok) {
+      throw new Error(`OpenRouter API error: ${response.statusText}`);
+    }
+
+    const data = (await response.json()) as any;
+    const content = data.choices[0]?.message?.content;
+
+    if (!content) return [];
+
+    // The model may return the array directly or wrapped in an object under a "faces" or "detections" key
+    const parsed = typeof content === "string" ? JSON.parse(content) : content;
+    const detections = (parsed.faces || parsed.detections || parsed) as any[];
+
+    if (!Array.isArray(detections)) return [];
+
+    return detections.map((d) => ({
+      x: d.x,
+      y: d.y,
+      width: d.width,
+      height: d.height,
+    }));
+  } catch (error) {
+    console.error("Cloud face detection failed:", error);
+    return [];
+  }
 }

 export async function processImageWithSmartCrop(
  inputBuffer: Buffer,
  options: ProcessImageOptions,
 ): Promise<Buffer> {
-  await loadModels();
-
-  // Load image via Canvas for face-api
-  const img = new Image();
-  img.src = inputBuffer;
-
-  // Detect faces
-  const detections = await faceapi.detectAllFaces(
-    // @ts-expect-error FaceAPI does not have type definitions for monkeyPatch
-    img,
-    new faceapi.TinyFaceDetectorOptions(),
-  );
-
  const sharpImage = sharp(inputBuffer);
  const metadata = await sharpImage.metadata();

@@ -53,35 +104,36 @@ export async function processImageWithSmartCrop(
    throw new Error("Could not read image metadata");
  }

+  const detections = options.openRouterApiKey
+    ? await detectFacesWithCloud(inputBuffer, options.openRouterApiKey)
+    : [];
+
  // If faces are found, calculate the bounding box containing all faces
  if (detections.length > 0) {
+    // Map normalized coordinates back to pixels
+    const pixelDetections = detections.map((d) => ({
+      x: d.x * (metadata.width || 0),
+      y: d.y * (metadata.height || 0),
+      width: d.width * (metadata.width || 0),
+      height: d.height * (metadata.height || 0),
+    }));
+
    let minX = metadata.width;
    let minY = metadata.height;
    let maxX = 0;
    let maxY = 0;

-    for (const det of detections) {
-      const { x, y, width, height } = det.box;
-      if (x < minX) minX = Math.max(0, x);
-      if (y < minY) minY = Math.max(0, y);
-      if (x + width > maxX) maxX = Math.min(metadata.width, x + width);
-      if (y + height > maxY) maxY = Math.min(metadata.height, y + height);
+    for (const det of pixelDetections) {
+      if (det.x < minX) minX = Math.max(0, det.x);
+      if (det.y < minY) minY = Math.max(0, det.y);
+      if (det.x + det.width > maxX)
+        maxX = Math.min(metadata.width, det.x + det.width);
+      if (det.y + det.height > maxY)
+        maxY = Math.min(metadata.height, det.y + det.height);
    }

-    const faceBoxWidth = maxX - minX;
-    const faceBoxHeight = maxY - minY;
-
-    // Calculate center of the faces
-    const centerX = Math.floor(minX + faceBoxWidth / 2);
-    const centerY = Math.floor(minY + faceBoxHeight / 2);
-
-    // Provide this as a focus point for sharp's extract or resize
-    // We can use sharp's resize with `position` focusing on crop options,
-    // or calculate an exact bounding box. However, extracting an exact bounding box
-    // and then resizing usually yields the best results when focusing on a specific coordinate.
-
-    // A simpler approach is to crop a rectangle with the target aspect ratio
-    // centered on the faces, then resize. Let's calculate the crop box.
+    const centerX = Math.floor(minX + (maxX - minX) / 2);
+    const centerY = Math.floor(minY + (maxY - minY) / 2);

    const targetRatio = options.width / options.height;
    const currentRatio = metadata.width / metadata.height;
@@ -90,18 +142,14 @@ export async function processImageWithSmartCrop(
    let cropHeight = metadata.height;

    if (currentRatio > targetRatio) {
-      // Image is wider than target, calculate new width
      cropWidth = Math.floor(metadata.height * targetRatio);
    } else {
-      // Image is taller than target, calculate new height
      cropHeight = Math.floor(metadata.width / targetRatio);
    }

-    // Try to center the crop box around the faces
    let cropX = Math.floor(centerX - cropWidth / 2);
    let cropY = Math.floor(centerY - cropHeight / 2);

-    // Keep crop box within image bounds
    if (cropX < 0) cropX = 0;
    if (cropY < 0) cropY = 0;
    if (cropX + cropWidth > metadata.width) cropX = metadata.width - cropWidth;
@@ -116,9 +164,7 @@ export async function processImageWithSmartCrop(
    });
  }

-  // Finally, resize to the requested dimensions and format
  let finalImage = sharpImage.resize(options.width, options.height, {
-    // If faces weren't found, default to entropy/attention based cropping as fallback
    fit: "cover",
    position: detections.length > 0 ? "center" : "attention",
  });