From 3fcccb858ee3ef22fe9ce479af4ce458ac9a0fe1 Mon Sep 17 00:00:00 2001
From: Mohamed Bassem
Date: Sun, 1 Feb 2026 22:57:11 +0000
Subject: feat: Add LLM-based OCR as alternative to Tesseract (#2442)

* feat(ocr): add LLM-based OCR support alongside Tesseract

Add support for using configured LLM inference providers (OpenAI or
Ollama) for OCR text extraction from images as an alternative to
Tesseract.

Changes:
- Add OCR_USE_LLM environment variable flag (default: false)
- Add buildOCRPrompt function for LLM-based text extraction
- Add readImageTextWithLLM function in the asset preprocessing worker
- Update extractAndSaveImageText to route between Tesseract and LLM OCR
- Update documentation with the new configuration option

When OCR_USE_LLM is enabled, the system uses the configured inference
model to extract text from images. If no inference provider is
configured, it falls back to Tesseract.

https://claude.ai/code/session_01Y7h7kDAmqXKXEWDmWbVkDs

* format

---------

Co-authored-by: Claude
---
 apps/workers/workers/assetPreprocessingWorker.ts | 67 ++++++++++++++++++++----
 1 file changed, 58 insertions(+), 9 deletions(-)

(limited to 'apps')

diff --git a/apps/workers/workers/assetPreprocessingWorker.ts b/apps/workers/workers/assetPreprocessingWorker.ts
index a5b439fc..d12457d3 100644
--- a/apps/workers/workers/assetPreprocessingWorker.ts
+++ b/apps/workers/workers/assetPreprocessingWorker.ts
@@ -23,7 +23,9 @@ import {
 } from "@karakeep/shared-server";
 import { newAssetId, readAsset, saveAsset } from "@karakeep/shared/assetdb";
 import serverConfig from "@karakeep/shared/config";
+import { InferenceClientFactory } from "@karakeep/shared/inference";
 import logger from "@karakeep/shared/logger";
+import { buildOCRPrompt } from "@karakeep/shared/prompts";
 import {
   DequeuedJob,
   EnqueueOptions,
@@ -89,6 +91,36 @@ async function readImageText(buffer: Buffer) {
   }
 }
+async function readImageTextWithLLM(
+  buffer: Buffer,
+  contentType: string,
+): Promise<string | null> {
+  const inferenceClient = InferenceClientFactory.build();
+  if (!inferenceClient) {
+    logger.warn(
+      "[assetPreprocessing] LLM OCR is enabled but no inference client is configured. Falling back to Tesseract.",
+    );
+    return readImageText(buffer);
+  }
+
+  const base64 = buffer.toString("base64");
+  const prompt = buildOCRPrompt();
+
+  const response = await inferenceClient.inferFromImage(
+    prompt,
+    contentType,
+    base64,
+    { schema: null },
+  );
+
+  const extractedText = response.response.trim();
+  if (!extractedText) {
+    return null;
+  }
+
+  return extractedText;
+}
+
 async function readPDFText(buffer: Buffer): Promise<{
   text: string;
   metadata: Record<string, string>;
 }> {
@@ -200,6 +232,7 @@ export async function extractAndSavePDFScreenshot(
 async function extractAndSaveImageText(
   jobId: string,
   asset: Buffer,
+  contentType: string,
   bookmark: NonNullable>>,
   isFixMode: boolean,
 ): Promise<boolean> {
@@ -213,16 +246,31 @@ async function extractAndSaveImageText(
     }
   }
   let imageText = null;
-  logger.info(
-    `[assetPreprocessing][${jobId}] Attempting to extract text from image.`,
-  );
-  try {
-    imageText = await readImageText(asset);
-  } catch (e) {
-    logger.error(
-      `[assetPreprocessing][${jobId}] Failed to read image text: ${e}`,
+
+  if (serverConfig.ocr.useLLM) {
+    logger.info(
+      `[assetPreprocessing][${jobId}] Attempting to extract text from image using LLM OCR.`,
     );
+    try {
+      imageText = await readImageTextWithLLM(asset, contentType);
+    } catch (e) {
+      logger.error(
+        `[assetPreprocessing][${jobId}] Failed to read image text with LLM: ${e}`,
+      );
+    }
+  } else {
+    logger.info(
+      `[assetPreprocessing][${jobId}] Attempting to extract text from image using Tesseract.`,
+    );
+    try {
+      imageText = await readImageText(asset);
+    } catch (e) {
+      logger.error(
+        `[assetPreprocessing][${jobId}] Failed to read image text: ${e}`,
+      );
+    }
   }
+
   if (!imageText) {
     return false;
   }
@@ -314,7 +362,7 @@ async function run(req: DequeuedJob) {
     );
   }
 
-  const { asset } = await readAsset({
+  const { asset, metadata } = await readAsset({
     userId: bookmark.userId,
     assetId: bookmark.asset.assetId,
   });
@@ -331,6 +379,7 @@ async function run(req: DequeuedJob) {
       const extractedText = await extractAndSaveImageText(
         jobId,
         asset,
+        metadata.contentType,
         bookmark,
         isFixMode,
       );
--
cgit v1.2.3-70-g09d2