diff options
| author | Mohamed Bassem <me@mbassem.com> | 2024-10-20 21:06:58 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2024-10-20 21:06:58 +0000 |
| commit | 019b5d2f5ea0a78cb6c44be26b1eba60b2a4e88d (patch) | |
| tree | 2b2f99dc9efa90372e277fbc6fe7c3371aafc785 /apps/workers | |
| parent | f793646b0daa007137e2b0bb908be0219c9cfbe8 (diff) | |
| download | karakeep-019b5d2f5ea0a78cb6c44be26b1eba60b2a4e88d.tar.zst | |
feature: Add OCR support for images. Fixes #296
Diffstat (limited to 'apps/workers')
| -rw-r--r-- | apps/workers/openaiWorker.ts | 22 | ||||
| -rw-r--r-- | apps/workers/package.json | 1 | ||||
| -rw-r--r-- | apps/workers/utils.ts | 22 |
3 files changed, 44 insertions, 1 deletions
```diff
diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts
index d51771b2..f436f71b 100644
--- a/apps/workers/openaiWorker.ts
+++ b/apps/workers/openaiWorker.ts
@@ -23,7 +23,7 @@ import {
 
 import type { InferenceClient } from "./inference";
 import { InferenceClientFactory } from "./inference";
-import { readPDFText } from "./utils";
+import { readImageText, readPDFText } from "./utils";
 
 const openAIResponseSchema = z.object({
   tags: z.array(z.string()),
@@ -152,6 +152,26 @@ async function inferTagsFromImage(
       `[inference][${jobId}] AssetId ${bookmark.asset.assetId} for bookmark ${bookmark.id} not found`,
     );
   }
+
+  let imageText = null;
+  try {
+    imageText = await readImageText(asset);
+  } catch (e) {
+    logger.error(`[inference][${jobId}] Failed to read image text: ${e}`);
+  }
+
+  if (imageText) {
+    logger.info(
+      `[inference][${jobId}] Extracted ${imageText.length} characters from image.`,
+    );
+    await db
+      .update(bookmarkAssets)
+      .set({
+        content: imageText,
+      })
+      .where(eq(bookmarkAssets.id, bookmark.id));
+  }
+
   const base64 = asset.toString("base64");
   return inferenceClient.inferFromImage(
     buildImagePrompt(
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 88e803fe..0ab7caa2 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -34,6 +34,7 @@
     "puppeteer-extra": "^3.3.6",
     "puppeteer-extra-plugin-adblocker": "^2.13.6",
     "puppeteer-extra-plugin-stealth": "^2.11.2",
+    "tesseract.js": "^5.1.1",
     "tsx": "^4.7.1",
     "typescript": "^5.3.3",
     "zod": "^3.22.4"
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index 8d297e05..15634902 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -1,4 +1,8 @@
+import os from "os";
 import PDFParser from "pdf2json";
+import { createWorker } from "tesseract.js";
+
+import serverConfig from "@hoarder/shared/config";
 
 export function withTimeout<T, Ret>(
   func: (param: T) => Promise<Ret>,
@@ -17,6 +21,24 @@ export function withTimeout<T, Ret>(
   };
 }
 
+export async function readImageText(buffer: Buffer) {
+  if (serverConfig.ocr.langs.length == 1 && serverConfig.ocr.langs[0] == "") {
+    return null;
+  }
+  const worker = await createWorker(serverConfig.ocr.langs, undefined, {
+    cachePath: serverConfig.ocr.cacheDir ?? os.tmpdir(),
+  });
+  try {
+    const ret = await worker.recognize(buffer);
+    if (ret.data.confidence <= serverConfig.ocr.confidenceThreshold) {
+      return null;
+    }
+    return ret.data.text;
+  } finally {
+    await worker.terminate();
+  }
+}
+
 export async function readPDFText(buffer: Buffer): Promise<{
   text: string;
   metadata: Record<string, string>;
```
