feat(image-processor): switch to OpenRouter Vision for smart crop and remove heavy models
This commit is contained in:
@@ -6,39 +6,21 @@ RUN npm install -g pnpm@10.30.1
|
||||
FROM base AS build
|
||||
WORKDIR /app
|
||||
COPY . .
|
||||
# Note: Canvas needs build tools on Debian
|
||||
RUN apt-get update && apt-get install -y python3 make g++ libcairo2-dev libpango1.0-dev libjpeg-dev libgif-dev librsvg2-dev
|
||||
# Delete the prebuilt binary and force a clean rebuild from source for the correct container architecture
|
||||
ENV npm_config_arch=arm64
|
||||
ENV npm_config_target_arch=arm64
|
||||
# We only need standard pnpm install now, no C++ tools needed for basic Sharp
|
||||
RUN pnpm install --frozen-lockfile
|
||||
RUN for dir in $(find /app/node_modules -type d -path "*/@tensorflow/tfjs-node"); do \
|
||||
cd $dir && \
|
||||
rm -rf lib/napi-v8/* && \
|
||||
npm_config_build_from_source=true npm_config_arch=arm64 npm_config_target_arch=arm64 npm run install; \
|
||||
done
|
||||
# Generate models explicitly for Docker
|
||||
RUN ls -la packages/image-processor/scripts || true
|
||||
RUN pnpm dlx tsx packages/image-processor/scripts/download-models.ts
|
||||
RUN pnpm --filter @mintel/image-processor build
|
||||
RUN pnpm --filter image-service build
|
||||
# Generated locally for caching
|
||||
|
||||
FROM base
|
||||
WORKDIR /app
|
||||
COPY --from=build /app/node_modules ./node_modules
|
||||
COPY --from=build /app/apps/image-service/node_modules ./apps/image-service/node_modules
|
||||
COPY --from=build /app/packages/image-processor/node_modules ./packages/image-processor/node_modules
|
||||
# Make sure directories exist to prevent COPY errors
|
||||
RUN mkdir -p /app/packages/image-processor/models /app/apps/image-service/dist
|
||||
RUN mkdir -p /app/apps/image-service/dist
|
||||
COPY --from=build /app/apps/image-service/dist ./apps/image-service/dist
|
||||
COPY --from=build /app/apps/image-service/package.json ./apps/image-service/package.json
|
||||
COPY --from=build /app/packages/image-processor/dist ./packages/image-processor/dist
|
||||
COPY --from=build /app/packages/image-processor/package.json ./packages/image-processor/package.json
|
||||
COPY --from=build /app/packages/image-processor/models ./packages/image-processor/models
|
||||
|
||||
# Need runtime dependencies for canvas/sharp on Debian
|
||||
RUN apt-get update && apt-get install -y libcairo2 libpango-1.0-0 libjpeg62-turbo libgif7 librsvg2-2 && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
EXPOSE 8080
|
||||
WORKDIR /app/apps/image-service
|
||||
|
||||
@@ -35,11 +35,9 @@ fastify.get("/process", async (request, reply) => {
|
||||
try {
|
||||
const response = await fetch(url);
|
||||
if (!response.ok) {
|
||||
return reply
|
||||
.status(response.status)
|
||||
.send({
|
||||
error: `Failed to fetch source image: ${response.statusText}`,
|
||||
});
|
||||
return reply.status(response.status).send({
|
||||
error: `Failed to fetch source image: ${response.statusText}`,
|
||||
});
|
||||
}
|
||||
|
||||
const arrayBuffer = await response.arrayBuffer();
|
||||
@@ -50,6 +48,7 @@ fastify.get("/process", async (request, reply) => {
|
||||
height,
|
||||
format,
|
||||
quality,
|
||||
openRouterApiKey: process.env.OPENROUTER_API_KEY,
|
||||
});
|
||||
|
||||
reply.header("Content-Type", `image/${format}`);
|
||||
|
||||
@@ -18,9 +18,6 @@
|
||||
"lint": "eslint src"
|
||||
},
|
||||
"dependencies": {
|
||||
"@tensorflow/tfjs-node": "^4.22.0",
|
||||
"@vladmandic/face-api": "^1.7.13",
|
||||
"canvas": "^2.11.2",
|
||||
"sharp": "^0.33.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
||||
@@ -1,51 +1,102 @@
|
||||
import * as faceapi from "@vladmandic/face-api";
|
||||
// Provide Canvas fallback for face-api in Node.js
|
||||
import { Canvas, Image, ImageData } from "canvas";
|
||||
import sharp from "sharp";
|
||||
import * as path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
// @ts-expect-error FaceAPI does not have type definitions for monkeyPatch
|
||||
faceapi.env.monkeyPatch({ Canvas, Image, ImageData });
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
// Path to the downloaded models
|
||||
const MODELS_PATH = path.join(__dirname, "..", "models");
|
||||
|
||||
let isModelsLoaded = false;
|
||||
|
||||
/**
 * Loads the TinyFaceDetector weights from `MODELS_PATH` the first time it is
 * called; later calls return immediately once `isModelsLoaded` has been set.
 */
async function loadModels() {
  if (!isModelsLoaded) {
    await faceapi.nets.tinyFaceDetector.loadFromDisk(MODELS_PATH);
    // Only flag success after the load resolved, so a failed load is retried.
    isModelsLoaded = true;
  }
}
|
||||
|
||||
/**
 * Options accepted by `processImageWithSmartCrop`.
 */
export interface ProcessImageOptions {
  /** Requested output width; passed to sharp's `resize` and used for the target aspect ratio. */
  width: number;
  /** Requested output height; passed to sharp's `resize` and used for the target aspect ratio. */
  height: number;
  /** Output image format. */
  format?: "avif" | "jpeg" | "png" | "webp";
  /** Encoder quality setting (presumably forwarded to the sharp encoder; not visible here). */
  quality?: number;
  /**
   * OpenRouter API key. When provided, faces are detected through the
   * OpenRouter Vision API; when omitted, no face detection is attempted.
   */
  openRouterApiKey?: string;
}
|
||||
|
||||
/**
 * Bounding box of one detected face.
 *
 * Values are normalized fractions (0 to 1) of the image dimensions, as
 * requested from the vision model; callers scale them by the image's pixel
 * width/height before use.
 */
interface FaceDetection {
  /** Horizontal offset of the box's minimum-x edge, as a fraction of image width. */
  x: number;
  /** Vertical offset of the box's minimum-y edge, as a fraction of image height. */
  y: number;
  /** Box width as a fraction of image width. */
  width: number;
  /** Box height as a fraction of image height. */
  height: number;
}
|
||||
|
||||
/** Upper bound for one vision request so a stalled upstream cannot hang the caller. */
const CLOUD_DETECTION_TIMEOUT_MS = 15000;

/** Clamps a number into the normalized [0, 1] range. */
function clampToUnit(value: number): number {
  return Math.min(1, Math.max(0, value));
}

/**
 * Validates one candidate box from the model and clips it to the image.
 * Returns `null` for anything that is not a finite, non-empty box.
 */
function toNormalizedBox(candidate: unknown): FaceDetection | null {
  if (typeof candidate !== "object" || candidate === null) return null;

  const { x, y, width, height } = candidate as Record<string, unknown>;
  if (
    typeof x !== "number" ||
    typeof y !== "number" ||
    typeof width !== "number" ||
    typeof height !== "number"
  ) {
    return null;
  }
  if (![x, y, width, height].every((value) => Number.isFinite(value))) {
    return null;
  }

  // Clip the box edges rather than the raw values so an overhanging box
  // keeps its visible part instead of being shifted.
  const left = clampToUnit(x);
  const top = clampToUnit(y);
  const right = clampToUnit(x + width);
  const bottom = clampToUnit(y + height);

  // Empty or inverted boxes (including boxes entirely outside the image)
  // carry no usable focus information.
  if (right <= left || bottom <= top) return null;

  return { x: left, y: top, width: right - left, height: bottom - top };
}

/**
 * Locates the list of candidate boxes inside a parsed model reply.
 * Accepts a bare array, an object wrapping an array (preferring the `faces`
 * and `detections` keys), or a single box object.
 */
function pickDetectionList(parsed: unknown): unknown[] {
  if (Array.isArray(parsed)) return parsed;
  if (typeof parsed !== "object" || parsed === null) return [];

  const record = parsed as Record<string, unknown>;
  for (const key of ["faces", "detections"]) {
    const value = record[key];
    if (Array.isArray(value)) return value;
  }

  const firstList = Object.values(record).find((value) =>
    Array.isArray(value),
  );
  if (Array.isArray(firstList)) return firstList;

  // Possibly a single box object; it is validated by the caller.
  return [parsed];
}

/**
 * Turns the raw message content of the model reply into validated boxes.
 * Throws if string content is not valid JSON.
 */
function parseFaceDetections(content: unknown): FaceDetection[] {
  let parsed: unknown = content;
  if (typeof content === "string") {
    // Models sometimes wrap JSON in a Markdown code fence; strip it first.
    const unfenced = content
      .trim()
      .replace(/^```(?:json)?\s*/i, "")
      .replace(/\s*```$/, "");
    parsed = JSON.parse(unfenced);
  }

  const boxes: FaceDetection[] = [];
  for (const candidate of pickDetectionList(parsed)) {
    const box = toNormalizedBox(candidate);
    if (box !== null) boxes.push(box);
  }
  return boxes;
}

/**
 * Detects faces using the OpenRouter Vision API.
 *
 * A downscaled JPEG preview (at most 512px per side) is sent instead of the
 * original image to save bandwidth and tokens. Detection is best-effort: any
 * failure (network, timeout, non-2xx status, unparsable reply) is logged and
 * reported as "no faces".
 *
 * @param inputBuffer - Encoded source image.
 * @param apiKey - OpenRouter API key sent as a Bearer token.
 * @returns Validated bounding boxes in normalized (0 to 1) coordinates,
 *          clipped to the image; an empty array when nothing usable was found.
 */
async function detectFacesWithCloud(
  inputBuffer: Buffer,
  apiKey: string,
): Promise<FaceDetection[]> {
  try {
    // Generate a small preview for the vision API (max 512px).
    const preview = await sharp(inputBuffer)
      .resize(512, 512, { fit: "inside" })
      .jpeg({ quality: 60 })
      .toBuffer();

    const base64Image = preview.toString("base64");

    const response = await fetch(
      "https://openrouter.ai/api/v1/chat/completions",
      {
        method: "POST",
        headers: {
          Authorization: `Bearer ${apiKey}`,
          "Content-Type": "application/json",
          "HTTP-Referer": "https://mintel.me",
          "X-Title": "Mintel Image Service",
        },
        body: JSON.stringify({
          model: "google/gemini-2.0-flash-001", // Fast, cheap, and supports vision
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: 'Detect all human faces in this image. Return ONLY a JSON array of bounding boxes like: [{"x": 0.1, "y": 0.2, "width": 0.05, "height": 0.05}]. Coordinates must be normalized (0 to 1). If no faces, return [].',
                },
                {
                  type: "image_url",
                  image_url: {
                    url: `data:image/jpeg;base64,${base64Image}`,
                  },
                },
              ],
            },
          ],
          // The prompt asks for a bare array while this requests an object,
          // so the reply may arrive in either shape; both are handled below.
          response_format: { type: "json_object" },
        }),
        signal: AbortSignal.timeout(CLOUD_DETECTION_TIMEOUT_MS),
      },
    );

    if (!response.ok) {
      // statusText can be empty, so include the numeric status as well.
      throw new Error(
        `OpenRouter API error: ${response.status} ${response.statusText}`,
      );
    }

    const data: unknown = await response.json();
    const content = (
      data as {
        choices?: Array<{ message?: { content?: unknown } }>;
      } | null
    )?.choices?.[0]?.message?.content;

    if (!content) return [];

    return parseFaceDetections(content);
  } catch (error) {
    console.error("Cloud face detection failed:", error);
    return [];
  }
}
|
||||
|
||||
export async function processImageWithSmartCrop(
|
||||
inputBuffer: Buffer,
|
||||
options: ProcessImageOptions,
|
||||
): Promise<Buffer> {
|
||||
await loadModels();
|
||||
|
||||
// Load image via Canvas for face-api
|
||||
const img = new Image();
|
||||
img.src = inputBuffer;
|
||||
|
||||
// Detect faces
|
||||
const detections = await faceapi.detectAllFaces(
|
||||
// @ts-expect-error FaceAPI does not have type definitions for monkeyPatch
|
||||
img,
|
||||
new faceapi.TinyFaceDetectorOptions(),
|
||||
);
|
||||
|
||||
const sharpImage = sharp(inputBuffer);
|
||||
const metadata = await sharpImage.metadata();
|
||||
|
||||
@@ -53,35 +104,36 @@ export async function processImageWithSmartCrop(
|
||||
throw new Error("Could not read image metadata");
|
||||
}
|
||||
|
||||
const detections = options.openRouterApiKey
|
||||
? await detectFacesWithCloud(inputBuffer, options.openRouterApiKey)
|
||||
: [];
|
||||
|
||||
// If faces are found, calculate the bounding box containing all faces
|
||||
if (detections.length > 0) {
|
||||
// Map normalized coordinates back to pixels
|
||||
const pixelDetections = detections.map((d) => ({
|
||||
x: d.x * (metadata.width || 0),
|
||||
y: d.y * (metadata.height || 0),
|
||||
width: d.width * (metadata.width || 0),
|
||||
height: d.height * (metadata.height || 0),
|
||||
}));
|
||||
|
||||
let minX = metadata.width;
|
||||
let minY = metadata.height;
|
||||
let maxX = 0;
|
||||
let maxY = 0;
|
||||
|
||||
for (const det of detections) {
|
||||
const { x, y, width, height } = det.box;
|
||||
if (x < minX) minX = Math.max(0, x);
|
||||
if (y < minY) minY = Math.max(0, y);
|
||||
if (x + width > maxX) maxX = Math.min(metadata.width, x + width);
|
||||
if (y + height > maxY) maxY = Math.min(metadata.height, y + height);
|
||||
for (const det of pixelDetections) {
|
||||
if (det.x < minX) minX = Math.max(0, det.x);
|
||||
if (det.y < minY) minY = Math.max(0, det.y);
|
||||
if (det.x + det.width > maxX)
|
||||
maxX = Math.min(metadata.width, det.x + det.width);
|
||||
if (det.y + det.height > maxY)
|
||||
maxY = Math.min(metadata.height, det.y + det.height);
|
||||
}
|
||||
|
||||
const faceBoxWidth = maxX - minX;
|
||||
const faceBoxHeight = maxY - minY;
|
||||
|
||||
// Calculate center of the faces
|
||||
const centerX = Math.floor(minX + faceBoxWidth / 2);
|
||||
const centerY = Math.floor(minY + faceBoxHeight / 2);
|
||||
|
||||
// Provide this as a focus point for sharp's extract or resize
|
||||
// We can use sharp's resize with `position` focusing on crop options,
|
||||
// or calculate an exact bounding box. However, extracting an exact bounding box
|
||||
// and then resizing usually yields the best results when focusing on a specific coordinate.
|
||||
|
||||
// A simpler approach is to crop a rectangle with the target aspect ratio
|
||||
// centered on the faces, then resize. Let's calculate the crop box.
|
||||
const centerX = Math.floor(minX + (maxX - minX) / 2);
|
||||
const centerY = Math.floor(minY + (maxY - minY) / 2);
|
||||
|
||||
const targetRatio = options.width / options.height;
|
||||
const currentRatio = metadata.width / metadata.height;
|
||||
@@ -90,18 +142,14 @@ export async function processImageWithSmartCrop(
|
||||
let cropHeight = metadata.height;
|
||||
|
||||
if (currentRatio > targetRatio) {
|
||||
// Image is wider than target, calculate new width
|
||||
cropWidth = Math.floor(metadata.height * targetRatio);
|
||||
} else {
|
||||
// Image is taller than target, calculate new height
|
||||
cropHeight = Math.floor(metadata.width / targetRatio);
|
||||
}
|
||||
|
||||
// Try to center the crop box around the faces
|
||||
let cropX = Math.floor(centerX - cropWidth / 2);
|
||||
let cropY = Math.floor(centerY - cropHeight / 2);
|
||||
|
||||
// Keep crop box within image bounds
|
||||
if (cropX < 0) cropX = 0;
|
||||
if (cropY < 0) cropY = 0;
|
||||
if (cropX + cropWidth > metadata.width) cropX = metadata.width - cropWidth;
|
||||
@@ -116,9 +164,7 @@ export async function processImageWithSmartCrop(
|
||||
});
|
||||
}
|
||||
|
||||
// Finally, resize to the requested dimensions and format
|
||||
let finalImage = sharpImage.resize(options.width, options.height, {
|
||||
// If faces weren't found, default to entropy/attention based cropping as fallback
|
||||
fit: "cover",
|
||||
position: detections.length > 0 ? "center" : "attention",
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user