From 3fcccb858ee3ef22fe9ce479af4ce458ac9a0fe1 Mon Sep 17 00:00:00 2001 From: Mohamed Bassem Date: Sun, 1 Feb 2026 22:57:11 +0000 Subject: feat: Add LLM-based OCR as alternative to Tesseract (#2442) * feat(ocr): add LLM-based OCR support alongside Tesseract Add support for using configured LLM inference providers (OpenAI or Ollama) for OCR text extraction from images as an alternative to Tesseract. Changes: - Add OCR_USE_LLM environment variable flag (default: false) - Add buildOCRPrompt function for LLM-based text extraction - Add readImageTextWithLLM function in asset preprocessing worker - Update extractAndSaveImageText to route between Tesseract and LLM OCR - Update documentation with the new configuration option When OCR_USE_LLM is enabled, the system uses the configured inference model to extract text from images. If no inference provider is configured, it falls back to Tesseract. https://claude.ai/code/session_01Y7h7kDAmqXKXEWDmWbVkDs * format --------- Co-authored-by: Claude --- packages/shared/prompts.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'packages/shared/prompts.ts') diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts index 00963550..e878a18b 100644 --- a/packages/shared/prompts.ts +++ b/packages/shared/prompts.ts @@ -106,3 +106,19 @@ export function buildSummaryPromptUntruncated( preprocessContent(content), ); } + +/** + * Build OCR prompt for extracting text from images using LLM + */ +export function buildOCRPrompt(): string { + return `You are an OCR (Optical Character Recognition) expert. Your task is to extract ALL text from this image. + +Rules: +- Extract every piece of text visible in the image, including titles, body text, captions, labels, watermarks, and any other textual content. +- Preserve the original structure and formatting as much as possible (e.g., paragraphs, lists, headings). +- If text appears in multiple columns, read from left to right, top to bottom. +- If text is partially obscured or unclear, make your best attempt and indicate uncertainty with [unclear] if needed. +- Do not add any commentary, explanations, or descriptions of non-text elements. +- If there is no text in the image, respond with an empty string. +- Output ONLY the extracted text, nothing else.`; +} -- cgit v1.2.3-70-g09d2