From 3fcccb858ee3ef22fe9ce479af4ce458ac9a0fe1 Mon Sep 17 00:00:00 2001
From: Mohamed Bassem
Date: Sun, 1 Feb 2026 22:57:11 +0000
Subject: feat: Add LLM-based OCR as alternative to Tesseract (#2442)

* feat(ocr): add LLM-based OCR support alongside Tesseract

Add support for using configured LLM inference providers (OpenAI or
Ollama) for OCR text extraction from images as an alternative to
Tesseract.

Changes:
- Add OCR_USE_LLM environment variable flag (default: false)
- Add buildOCRPrompt function for LLM-based text extraction
- Add readImageTextWithLLM function in the asset preprocessing worker
- Update extractAndSaveImageText to route between Tesseract and LLM OCR
- Update documentation with the new configuration option

When OCR_USE_LLM is enabled, the system uses the configured inference
model to extract text from images. If no inference provider is
configured, it falls back to Tesseract.

https://claude.ai/code/session_01Y7h7kDAmqXKXEWDmWbVkDs

* format

---------

Co-authored-by: Claude
---
 apps/workers/workers/assetPreprocessingWorker.ts | 67 ++++++++++++++++++++----
 1 file changed, 58 insertions(+), 9 deletions(-)

(limited to 'apps')

diff --git a/apps/workers/workers/assetPreprocessingWorker.ts b/apps/workers/workers/assetPreprocessingWorker.ts
index a5b439fc..d12457d3 100644
--- a/apps/workers/workers/assetPreprocessingWorker.ts
+++ b/apps/workers/workers/assetPreprocessingWorker.ts
@@ -23,7 +23,9 @@ import {
 } from "@karakeep/shared-server";
 import { newAssetId, readAsset, saveAsset } from "@karakeep/shared/assetdb";
 import serverConfig from "@karakeep/shared/config";
+import { InferenceClientFactory } from "@karakeep/shared/inference";
 import logger from "@karakeep/shared/logger";
+import { buildOCRPrompt } from "@karakeep/shared/prompts";
 import {
   DequeuedJob,
   EnqueueOptions,
@@ -89,6 +91,36 @@ async function readImageText(buffer: Buffer) {
   }
 }
+async function readImageTextWithLLM(
+  buffer: Buffer,
+  contentType: string,
+): Promise<string | null> {
+  const inferenceClient = InferenceClientFactory.build();
+  if (!inferenceClient) {
+    logger.warn(
+      "[assetPreprocessing] LLM OCR is enabled but no inference client is configured. Falling back to Tesseract.",
+    );
+    return readImageText(buffer);
+  }
+
+  const base64 = buffer.toString("base64");
+  const prompt = buildOCRPrompt();
+
+  const response = await inferenceClient.inferFromImage(
+    prompt,
+    contentType,
+    base64,
+    { schema: null },
+  );
+
+  const extractedText = response.response.trim();
+  if (!extractedText) {
+    return null;
+  }
+
+  return extractedText;
+}
+
 async function readPDFText(buffer: Buffer): Promise<{
   text: string;
   metadata: Record<string, string>;
 }> {
@@ -200,6 +232,7 @@ export async function extractAndSavePDFScreenshot(
 async function extractAndSaveImageText(
   jobId: string,
   asset: Buffer,
+  contentType: string,
   bookmark: NonNullable>>,
   isFixMode: boolean,
 ): Promise<boolean> {
@@ -213,16 +246,31 @@ async function extractAndSaveImageText(
     }
   }
   let imageText = null;
-  logger.info(
-    `[assetPreprocessing][${jobId}] Attempting to extract text from image.`,
-  );
-  try {
-    imageText = await readImageText(asset);
-  } catch (e) {
-    logger.error(
-      `[assetPreprocessing][${jobId}] Failed to read image text: ${e}`,
+
+  if (serverConfig.ocr.useLLM) {
+    logger.info(
+      `[assetPreprocessing][${jobId}] Attempting to extract text from image using LLM OCR.`,
     );
+    try {
+      imageText = await readImageTextWithLLM(asset, contentType);
+    } catch (e) {
+      logger.error(
+        `[assetPreprocessing][${jobId}] Failed to read image text with LLM: ${e}`,
+      );
+    }
+  } else {
+    logger.info(
+      `[assetPreprocessing][${jobId}] Attempting to extract text from image using Tesseract.`,
+    );
+    try {
+      imageText = await readImageText(asset);
+    } catch (e) {
+      logger.error(
+        `[assetPreprocessing][${jobId}] Failed to read image text: ${e}`,
+      );
+    }
   }
+
   if (!imageText) {
     return false;
   }
@@ -314,7 +362,7 @@ async function run(req: DequeuedJob) {
     );
   }
 
-  const { asset } = await readAsset({
+  const { asset, metadata } = await readAsset({
     userId: bookmark.userId,
     assetId: bookmark.asset.assetId,
   });
@@ -331,6 +379,7 @@ async function run(req: DequeuedJob) {
       const extractedText = await extractAndSaveImageText(
         jobId,
         asset,
+        metadata.contentType,
         bookmark,
         isFixMode,
       );
--
cgit v1.2.3-70-g09d2