author     Mohamed Bassem <me@mbassem.com>  2026-02-01 22:57:11 +0000
committer  GitHub <noreply@github.com>      2026-02-01 22:57:11 +0000
commit     3fcccb858ee3ef22fe9ce479af4ce458ac9a0fe1 (patch)
tree       0d6ae299126a581f0ccc58afa89b2dd16a9a0925
parent     54243b8cc5ccd76fe23821f6e159b954a2166578 (diff)
feat: Add LLM-based OCR as alternative to Tesseract (#2442)
* feat(ocr): add LLM-based OCR support alongside Tesseract

  Add support for using configured LLM inference providers (OpenAI or Ollama)
  for OCR text extraction from images as an alternative to Tesseract.

  Changes:
  - Add OCR_USE_LLM environment variable flag (default: false)
  - Add buildOCRPrompt function for LLM-based text extraction
  - Add readImageTextWithLLM function in asset preprocessing worker
  - Update extractAndSaveImageText to route between Tesseract and LLM OCR
  - Update documentation with the new configuration option

  When OCR_USE_LLM is enabled, the system uses the configured inference model
  to extract text from images. If no inference provider is configured, it
  falls back to Tesseract.

  https://claude.ai/code/session_01Y7h7kDAmqXKXEWDmWbVkDs

* format

---------

Co-authored-by: Claude <noreply@anthropic.com>
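Read as a whole, the change boils down to a small routing decision in the asset
preprocessing worker. The sketch below is a simplified, untested illustration
(not the literal worker code; `ocrImage` and its `runTesseract` parameter are
hypothetical stand-ins for the existing `readImageText` Tesseract path): the
Tesseract path stays the default, OCR_USE_LLM=true switches to the configured
inference provider, and the worker falls back to Tesseract when no provider is
configured.

    import serverConfig from "@karakeep/shared/config";
    import { InferenceClientFactory } from "@karakeep/shared/inference";
    import { buildOCRPrompt } from "@karakeep/shared/prompts";

    // Simplified sketch of the routing added by this commit.
    // `runTesseract` stands in for the worker's existing readImageText().
    async function ocrImage(
      image: Buffer,
      contentType: string,
      runTesseract: (img: Buffer) => Promise<string | null>,
    ): Promise<string | null> {
      if (!serverConfig.ocr.useLLM) {
        // Default path: behavior is unchanged when OCR_USE_LLM is false.
        return runTesseract(image);
      }

      const client = InferenceClientFactory.build();
      if (!client) {
        // OCR_USE_LLM=true but no OPENAI_API_KEY / OLLAMA_BASE_URL configured:
        // fall back to Tesseract instead of failing the job.
        return runTesseract(image);
      }

      const response = await client.inferFromImage(
        buildOCRPrompt(),
        contentType,
        image.toString("base64"),
        { schema: null }, // free-text output, mirroring the worker's call
      );
      const text = response.response.trim();
      return text.length > 0 ? text : null;
    }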
-rw-r--r--  apps/workers/workers/assetPreprocessingWorker.ts        | 67
-rw-r--r--  docs/docs/03-configuration/01-environment-variables.md  |  3
-rw-r--r--  packages/shared/config.ts                               |  2
-rw-r--r--  packages/shared/prompts.ts                              | 16
4 files changed, 78 insertions(+), 10 deletions(-)
diff --git a/apps/workers/workers/assetPreprocessingWorker.ts b/apps/workers/workers/assetPreprocessingWorker.ts
index a5b439fc..d12457d3 100644
--- a/apps/workers/workers/assetPreprocessingWorker.ts
+++ b/apps/workers/workers/assetPreprocessingWorker.ts
@@ -23,7 +23,9 @@ import {
} from "@karakeep/shared-server";
import { newAssetId, readAsset, saveAsset } from "@karakeep/shared/assetdb";
import serverConfig from "@karakeep/shared/config";
+import { InferenceClientFactory } from "@karakeep/shared/inference";
import logger from "@karakeep/shared/logger";
+import { buildOCRPrompt } from "@karakeep/shared/prompts";
import {
  DequeuedJob,
  EnqueueOptions,
@@ -89,6 +91,36 @@ async function readImageText(buffer: Buffer) {
  }
}
+async function readImageTextWithLLM(
+  buffer: Buffer,
+  contentType: string,
+): Promise<string | null> {
+  const inferenceClient = InferenceClientFactory.build();
+  if (!inferenceClient) {
+    logger.warn(
+      "[assetPreprocessing] LLM OCR is enabled but no inference client is configured. Falling back to Tesseract.",
+    );
+    return readImageText(buffer);
+  }
+
+  const base64 = buffer.toString("base64");
+  const prompt = buildOCRPrompt();
+
+  const response = await inferenceClient.inferFromImage(
+    prompt,
+    contentType,
+    base64,
+    { schema: null },
+  );
+
+  const extractedText = response.response.trim();
+  if (!extractedText) {
+    return null;
+  }
+
+  return extractedText;
+}
+
async function readPDFText(buffer: Buffer): Promise<{
  text: string;
  metadata: Record<string, object>;
@@ -200,6 +232,7 @@ export async function extractAndSavePDFScreenshot(
async function extractAndSaveImageText(
  jobId: string,
  asset: Buffer,
+  contentType: string,
  bookmark: NonNullable<Awaited<ReturnType<typeof getBookmark>>>,
  isFixMode: boolean,
): Promise<boolean> {
@@ -213,16 +246,31 @@ async function extractAndSaveImageText(
    }
  }
  let imageText = null;
-  logger.info(
-    `[assetPreprocessing][${jobId}] Attempting to extract text from image.`,
-  );
-  try {
-    imageText = await readImageText(asset);
-  } catch (e) {
-    logger.error(
-      `[assetPreprocessing][${jobId}] Failed to read image text: ${e}`,
+
+  if (serverConfig.ocr.useLLM) {
+    logger.info(
+      `[assetPreprocessing][${jobId}] Attempting to extract text from image using LLM OCR.`,
    );
+    try {
+      imageText = await readImageTextWithLLM(asset, contentType);
+    } catch (e) {
+      logger.error(
+        `[assetPreprocessing][${jobId}] Failed to read image text with LLM: ${e}`,
+      );
+    }
+  } else {
+    logger.info(
+      `[assetPreprocessing][${jobId}] Attempting to extract text from image using Tesseract.`,
+    );
+    try {
+      imageText = await readImageText(asset);
+    } catch (e) {
+      logger.error(
+        `[assetPreprocessing][${jobId}] Failed to read image text: ${e}`,
+      );
+    }
  }
+
  if (!imageText) {
    return false;
  }
@@ -314,7 +362,7 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) {
    );
  }
-  const { asset } = await readAsset({
+  const { asset, metadata } = await readAsset({
    userId: bookmark.userId,
    assetId: bookmark.asset.assetId,
  });
@@ -331,6 +379,7 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) {
    const extractedText = await extractAndSaveImageText(
      jobId,
      asset,
+      metadata.contentType,
      bookmark,
      isFixMode,
    );
diff --git a/docs/docs/03-configuration/01-environment-variables.md b/docs/docs/03-configuration/01-environment-variables.md
index 7a896fe4..dedc3406 100644
--- a/docs/docs/03-configuration/01-environment-variables.md
+++ b/docs/docs/03-configuration/01-environment-variables.md
@@ -176,13 +176,14 @@ Example JSON file:
## OCR Configs
-Karakeep uses [tesseract.js](https://github.com/naptha/tesseract.js) to extract text from images.
+Karakeep uses [tesseract.js](https://github.com/naptha/tesseract.js) to extract text from images by default. Alternatively, you can use an LLM-based OCR by enabling the `OCR_USE_LLM` flag. LLM-based OCR uses the configured inference model (OpenAI or Ollama) to extract text from images, which can provide better results for complex images but requires a configured inference provider.
| Name | Required | Default | Description |
| ------------------------ | -------- | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| OCR_CACHE_DIR | No | $TEMP_DIR | The dir where tesseract will download its models. By default, those models are not persisted and stored in the OS' temp dir. |
| OCR_LANGS | No | eng | Comma separated list of the language codes that you want tesseract to support. You can find the language codes [here](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). Set to empty string to disable OCR. |
| OCR_CONFIDENCE_THRESHOLD | No | 50 | A number between 0 and 100 indicating the minimum acceptable confidence from tessaract. If tessaract's confidence is lower than this value, extracted text won't be stored. |
+| OCR_USE_LLM | No | false | If set to true, uses the configured inference model (OpenAI or Ollama) for OCR instead of Tesseract. This can provide better results for complex images but requires a configured inference provider (`OPENAI_API_KEY` or `OLLAMA_BASE_URL`). Falls back to Tesseract if no inference provider is configured. |
## Webhook Configs
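For reference (this note and the fragment below are not part of the diff): the
minimal opt-in is the new flag plus one inference provider, roughly the
following .env fragment, where the key and URL values are placeholders:

    OCR_USE_LLM=true
    OPENAI_API_KEY=sk-...
    # or, to use a local model instead of OpenAI:
    # OLLAMA_BASE_URL=http://ollama:11434

With neither provider variable set, image OCR keeps using Tesseract even when
the flag is true.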
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 7238e90c..cfcf1532 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -82,6 +82,7 @@ const allEnv = z.object({
.default("eng")
.transform((val) => val.split(",")),
OCR_CONFIDENCE_THRESHOLD: z.coerce.number().default(50),
+ OCR_USE_LLM: stringBool("false"),
CRAWLER_HEADLESS_BROWSER: stringBool("true"),
BROWSER_WEB_URL: z.string().optional(),
BROWSER_WEBSOCKET_URL: z.string().optional(),
@@ -337,6 +338,7 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
      langs: val.OCR_LANGS,
      cacheDir: val.OCR_CACHE_DIR,
      confidenceThreshold: val.OCR_CONFIDENCE_THRESHOLD,
+      useLLM: val.OCR_USE_LLM,
    },
    search: {
      numWorkers: val.SEARCH_NUM_WORKERS,
diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts
index 00963550..e878a18b 100644
--- a/packages/shared/prompts.ts
+++ b/packages/shared/prompts.ts
@@ -106,3 +106,19 @@ export function buildSummaryPromptUntruncated(
    preprocessContent(content),
  );
}
+
+/**
+ * Build OCR prompt for extracting text from images using LLM
+ */
+export function buildOCRPrompt(): string {
+  return `You are an OCR (Optical Character Recognition) expert. Your task is to extract ALL text from this image.
+
+Rules:
+- Extract every piece of text visible in the image, including titles, body text, captions, labels, watermarks, and any other textual content.
+- Preserve the original structure and formatting as much as possible (e.g., paragraphs, lists, headings).
+- If text appears in multiple columns, read from left to right, top to bottom.
+- If text is partially obscured or unclear, make your best attempt and indicate uncertainty with [unclear] if needed.
+- Do not add any commentary, explanations, or descriptions of non-text elements.
+- If there is no text in the image, respond with an empty string.
+- Output ONLY the extracted text, nothing else.`;
+}
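As a closing note (not part of the commit), the new prompt and the
`{ schema: null }` free-text call can be exercised outside the worker with a
small script along these lines; the file name, image path, and image type are
illustrative only, and this assumes one of the inference providers is
configured in the environment:

    import { readFile } from "node:fs/promises";
    import { InferenceClientFactory } from "@karakeep/shared/inference";
    import { buildOCRPrompt } from "@karakeep/shared/prompts";

    async function main() {
      const client = InferenceClientFactory.build();
      if (!client) {
        console.error("No inference provider configured (OPENAI_API_KEY / OLLAMA_BASE_URL).");
        return;
      }
      const image = await readFile("./sample.png"); // hypothetical test image
      const result = await client.inferFromImage(
        buildOCRPrompt(),
        "image/png",
        image.toString("base64"),
        { schema: null }, // same free-text mode the worker uses
      );
      console.log(result.response.trim() || "<no text detected>");
    }

    void main();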