| author | Mohamed Bassem <me@mbassem.com> | 2026-02-01 22:57:11 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-02-01 22:57:11 +0000 |
| commit | 3fcccb858ee3ef22fe9ce479af4ce458ac9a0fe1 | |
| tree | 0d6ae299126a581f0ccc58afa89b2dd16a9a0925 | |
| parent | 54243b8cc5ccd76fe23821f6e159b954a2166578 | |
feat: Add LLM-based OCR as alternative to Tesseract (#2442)
* feat(ocr): add LLM-based OCR support alongside Tesseract
Add support for using configured LLM inference providers (OpenAI or Ollama)
for OCR text extraction from images as an alternative to Tesseract.
Changes:
- Add OCR_USE_LLM environment variable flag (default: false)
- Add buildOCRPrompt function for LLM-based text extraction
- Add readImageTextWithLLM function in asset preprocessing worker
- Update extractAndSaveImageText to route between Tesseract and LLM OCR
- Update documentation with the new configuration option
When OCR_USE_LLM is enabled, the system uses the configured inference model
to extract text from images. If no inference provider is configured, it
falls back to Tesseract.
https://claude.ai/code/session_01Y7h7kDAmqXKXEWDmWbVkDs
* format
---------
Co-authored-by: Claude <noreply@anthropic.com>
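
The routing the commit message describes is easiest to see in miniature. The sketch below condenses the dispatch added to `extractAndSaveImageText` in the diff further down; `extractText` is an illustrative name, logging and try/catch are trimmed, and the `declare` stubs stand in for the worker's own helpers:

```typescript
import serverConfig from "@karakeep/shared/config";

// Stubs for the worker's own helpers (defined in assetPreprocessingWorker.ts).
declare function readImageText(buffer: Buffer): Promise<string | null>;
declare function readImageTextWithLLM(
  buffer: Buffer,
  contentType: string,
): Promise<string | null>;

// Condensed sketch of the new dispatch; logging and error handling trimmed.
async function extractText(
  asset: Buffer,
  contentType: string,
): Promise<string | null> {
  if (serverConfig.ocr.useLLM) {
    // LLM-based OCR; the helper itself falls back to Tesseract when
    // no inference client is configured.
    return readImageTextWithLLM(asset, contentType);
  }
  // Default path: Tesseract via tesseract.js.
  return readImageText(asset);
}
```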
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | apps/workers/workers/assetPreprocessingWorker.ts | 67 |
| -rw-r--r-- | docs/docs/03-configuration/01-environment-variables.md | 3 |
| -rw-r--r-- | packages/shared/config.ts | 2 |
| -rw-r--r-- | packages/shared/prompts.ts | 16 |
4 files changed, 78 insertions, 10 deletions
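
One detail the diff below uses but does not define is the `stringBool` helper that parses the new flag. A plausible sketch of such a helper, assuming zod (the real definition lives elsewhere in `packages/shared/config.ts` and may differ):

```typescript
import { z } from "zod";

// Hypothetical sketch of the stringBool helper referenced in the diff;
// the actual implementation in packages/shared/config.ts may differ.
const stringBool = (defaultValue: string) =>
  z
    .string()
    .default(defaultValue)
    .transform((val) => val === "true");

// "OCR_USE_LLM=true" in the environment then becomes a boolean:
const schema = z.object({ OCR_USE_LLM: stringBool("false") });
console.log(schema.parse({}).OCR_USE_LLM); // false (the default)
console.log(schema.parse({ OCR_USE_LLM: "true" }).OCR_USE_LLM); // true
```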
```diff
diff --git a/apps/workers/workers/assetPreprocessingWorker.ts b/apps/workers/workers/assetPreprocessingWorker.ts
index a5b439fc..d12457d3 100644
--- a/apps/workers/workers/assetPreprocessingWorker.ts
+++ b/apps/workers/workers/assetPreprocessingWorker.ts
@@ -23,7 +23,9 @@ import {
 } from "@karakeep/shared-server";
 import { newAssetId, readAsset, saveAsset } from "@karakeep/shared/assetdb";
 import serverConfig from "@karakeep/shared/config";
+import { InferenceClientFactory } from "@karakeep/shared/inference";
 import logger from "@karakeep/shared/logger";
+import { buildOCRPrompt } from "@karakeep/shared/prompts";
 import {
   DequeuedJob,
   EnqueueOptions,
@@ -89,6 +91,36 @@ async function readImageText(buffer: Buffer) {
   }
 }
 
+async function readImageTextWithLLM(
+  buffer: Buffer,
+  contentType: string,
+): Promise<string | null> {
+  const inferenceClient = InferenceClientFactory.build();
+  if (!inferenceClient) {
+    logger.warn(
+      "[assetPreprocessing] LLM OCR is enabled but no inference client is configured. Falling back to Tesseract.",
+    );
+    return readImageText(buffer);
+  }
+
+  const base64 = buffer.toString("base64");
+  const prompt = buildOCRPrompt();
+
+  const response = await inferenceClient.inferFromImage(
+    prompt,
+    contentType,
+    base64,
+    { schema: null },
+  );
+
+  const extractedText = response.response.trim();
+  if (!extractedText) {
+    return null;
+  }
+
+  return extractedText;
+}
+
 async function readPDFText(buffer: Buffer): Promise<{
   text: string;
   metadata: Record<string, object>;
@@ -200,6 +232,7 @@ export async function extractAndSavePDFScreenshot(
 async function extractAndSaveImageText(
   jobId: string,
   asset: Buffer,
+  contentType: string,
   bookmark: NonNullable<Awaited<ReturnType<typeof getBookmark>>>,
   isFixMode: boolean,
 ): Promise<boolean> {
@@ -213,16 +246,31 @@ async function extractAndSaveImageText(
     }
   }
   let imageText = null;
-  logger.info(
-    `[assetPreprocessing][${jobId}] Attempting to extract text from image.`,
-  );
-  try {
-    imageText = await readImageText(asset);
-  } catch (e) {
-    logger.error(
-      `[assetPreprocessing][${jobId}] Failed to read image text: ${e}`,
+
+  if (serverConfig.ocr.useLLM) {
+    logger.info(
+      `[assetPreprocessing][${jobId}] Attempting to extract text from image using LLM OCR.`,
     );
+    try {
+      imageText = await readImageTextWithLLM(asset, contentType);
+    } catch (e) {
+      logger.error(
+        `[assetPreprocessing][${jobId}] Failed to read image text with LLM: ${e}`,
+      );
+    }
+  } else {
+    logger.info(
+      `[assetPreprocessing][${jobId}] Attempting to extract text from image using Tesseract.`,
+    );
+    try {
+      imageText = await readImageText(asset);
+    } catch (e) {
+      logger.error(
+        `[assetPreprocessing][${jobId}] Failed to read image text: ${e}`,
+      );
+    }
   }
+
   if (!imageText) {
     return false;
   }
@@ -314,7 +362,7 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) {
     );
   }
 
-  const { asset } = await readAsset({
+  const { asset, metadata } = await readAsset({
     userId: bookmark.userId,
     assetId: bookmark.asset.assetId,
   });
@@ -331,6 +379,7 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) {
       const extractedText = await extractAndSaveImageText(
         jobId,
         asset,
+        metadata.contentType,
         bookmark,
         isFixMode,
       );
diff --git a/docs/docs/03-configuration/01-environment-variables.md b/docs/docs/03-configuration/01-environment-variables.md
index 7a896fe4..dedc3406 100644
--- a/docs/docs/03-configuration/01-environment-variables.md
+++ b/docs/docs/03-configuration/01-environment-variables.md
@@ -176,13 +176,14 @@ Example JSON file:
 
 ## OCR Configs
 
-Karakeep uses [tesseract.js](https://github.com/naptha/tesseract.js) to extract text from images.
+Karakeep uses [tesseract.js](https://github.com/naptha/tesseract.js) to extract text from images by default. Alternatively, you can use an LLM-based OCR by enabling the `OCR_USE_LLM` flag. LLM-based OCR uses the configured inference model (OpenAI or Ollama) to extract text from images, which can provide better results for complex images but requires a configured inference provider.
 
 | Name                     | Required | Default   | Description |
 | ------------------------ | -------- | --------- | ----------- |
 | OCR_CACHE_DIR            | No       | $TEMP_DIR | The dir where tesseract will download its models. By default, those models are not persisted and stored in the OS' temp dir. |
 | OCR_LANGS                | No       | eng       | Comma separated list of the language codes that you want tesseract to support. You can find the language codes [here](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). Set to empty string to disable OCR. |
 | OCR_CONFIDENCE_THRESHOLD | No       | 50        | A number between 0 and 100 indicating the minimum acceptable confidence from tessaract. If tessaract's confidence is lower than this value, extracted text won't be stored. |
+| OCR_USE_LLM              | No       | false     | If set to true, uses the configured inference model (OpenAI or Ollama) for OCR instead of Tesseract. This can provide better results for complex images but requires a configured inference provider (`OPENAI_API_KEY` or `OLLAMA_BASE_URL`). Falls back to Tesseract if no inference provider is configured. |
 
 ## Webhook Configs
 
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 7238e90c..cfcf1532 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -82,6 +82,7 @@ const allEnv = z.object({
     .default("eng")
     .transform((val) => val.split(",")),
   OCR_CONFIDENCE_THRESHOLD: z.coerce.number().default(50),
+  OCR_USE_LLM: stringBool("false"),
   CRAWLER_HEADLESS_BROWSER: stringBool("true"),
   BROWSER_WEB_URL: z.string().optional(),
   BROWSER_WEBSOCKET_URL: z.string().optional(),
@@ -337,6 +338,7 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
       langs: val.OCR_LANGS,
       cacheDir: val.OCR_CACHE_DIR,
       confidenceThreshold: val.OCR_CONFIDENCE_THRESHOLD,
+      useLLM: val.OCR_USE_LLM,
     },
     search: {
       numWorkers: val.SEARCH_NUM_WORKERS,
diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts
index 00963550..e878a18b 100644
--- a/packages/shared/prompts.ts
+++ b/packages/shared/prompts.ts
@@ -106,3 +106,19 @@ export function buildSummaryPromptUntruncated(
     preprocessContent(content),
   );
 }
+
+/**
+ * Build OCR prompt for extracting text from images using LLM
+ */
+export function buildOCRPrompt(): string {
+  return `You are an OCR (Optical Character Recognition) expert. Your task is to extract ALL text from this image.
+
+Rules:
+- Extract every piece of text visible in the image, including titles, body text, captions, labels, watermarks, and any other textual content.
+- Preserve the original structure and formatting as much as possible (e.g., paragraphs, lists, headings).
+- If text appears in multiple columns, read from left to right, top to bottom.
+- If text is partially obscured or unclear, make your best attempt and indicate uncertainty with [unclear] if needed.
+- Do not add any commentary, explanations, or descriptions of non-text elements.
+- If there is no text in the image, respond with an empty string.
+- Output ONLY the extracted text, nothing else.`;
+}
```
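
Putting the new pieces together outside the worker, a self-contained usage sketch follows. The `InferenceClient` interface here is an assumption inferred from the `inferFromImage(prompt, contentType, base64, { schema: null })` call in the diff, not the published shape of `@karakeep/shared/inference`, and `ocrFile` is an illustrative helper:

```typescript
import { readFile } from "node:fs/promises";
import { buildOCRPrompt } from "@karakeep/shared/prompts";

// Assumed client shape, inferred from the inferFromImage call in the diff;
// the real interface in @karakeep/shared/inference may differ.
interface InferenceClient {
  inferFromImage(
    prompt: string,
    contentType: string,
    imageBase64: string,
    opts: { schema: null },
  ): Promise<{ response: string }>;
}

// Run LLM OCR over an image file, mirroring readImageTextWithLLM above.
async function ocrFile(
  client: InferenceClient,
  path: string,
  contentType: string, // e.g. "image/png"; the worker reads this from asset metadata
): Promise<string | null> {
  const buffer = await readFile(path);
  const result = await client.inferFromImage(
    buildOCRPrompt(),          // the prompt added in packages/shared/prompts.ts
    contentType,
    buffer.toString("base64"), // images are sent base64-encoded
    { schema: null },          // free-form text output, no JSON schema
  );
  const text = result.response.trim();
  return text.length > 0 ? text : null;
}
```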
