about summary refs log tree commit diff stats
path: root/apps/workers
diff options
context:
space:
mode:
author Mohamed Bassem <me@mbassem.com> 2024-10-20 21:06:58 +0000
committer Mohamed Bassem <me@mbassem.com> 2024-10-20 21:06:58 +0000
commit 019b5d2f5ea0a78cb6c44be26b1eba60b2a4e88d (patch)
tree 2b2f99dc9efa90372e277fbc6fe7c3371aafc785 /apps/workers
parent f793646b0daa007137e2b0bb908be0219c9cfbe8 (diff)
download karakeep-019b5d2f5ea0a78cb6c44be26b1eba60b2a4e88d.tar.zst
feature: Add OCR support for images. Fixes #296
Diffstat (limited to 'apps/workers')
-rw-r--r-- apps/workers/openaiWorker.ts 22
-rw-r--r-- apps/workers/package.json 1
-rw-r--r-- apps/workers/utils.ts 22
3 files changed, 44 insertions, 1 deletions
diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts
index d51771b2..f436f71b 100644
--- a/apps/workers/openaiWorker.ts
+++ b/apps/workers/openaiWorker.ts
@@ -23,7 +23,7 @@ import {
import type { InferenceClient } from "./inference";
import { InferenceClientFactory } from "./inference";
-import { readPDFText } from "./utils";
+import { readImageText, readPDFText } from "./utils";
const openAIResponseSchema = z.object({
tags: z.array(z.string()),
@@ -152,6 +152,26 @@ async function inferTagsFromImage(
`[inference][${jobId}] AssetId ${bookmark.asset.assetId} for bookmark ${bookmark.id} not found`,
);
}
+
+ let imageText = null;
+ try {
+ imageText = await readImageText(asset);
+ } catch (e) {
+ logger.error(`[inference][${jobId}] Failed to read image text: ${e}`);
+ }
+
+ if (imageText) {
+ logger.info(
+ `[inference][${jobId}] Extracted ${imageText.length} characters from image.`,
+ );
+ await db
+ .update(bookmarkAssets)
+ .set({
+ content: imageText,
+ })
+ .where(eq(bookmarkAssets.id, bookmark.id));
+ }
+
const base64 = asset.toString("base64");
return inferenceClient.inferFromImage(
buildImagePrompt(
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 88e803fe..0ab7caa2 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -34,6 +34,7 @@
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-adblocker": "^2.13.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
+ "tesseract.js": "^5.1.1",
"tsx": "^4.7.1",
"typescript": "^5.3.3",
"zod": "^3.22.4"
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index 8d297e05..15634902 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -1,4 +1,8 @@
+import os from "os";
import PDFParser from "pdf2json";
+import { createWorker } from "tesseract.js";
+
+import serverConfig from "@hoarder/shared/config";
export function withTimeout<T, Ret>(
func: (param: T) => Promise<Ret>,
@@ -17,6 +21,24 @@ export function withTimeout<T, Ret>(
};
}
+/**
+ * Extracts text from an image using a tesseract.js OCR worker.
+ *
+ * @param buffer - The image contents handed to the worker's `recognize` call.
+ * @returns The recognized text, or `null` when no OCR language is configured
+ *   or when the recognition confidence does not exceed
+ *   `serverConfig.ocr.confidenceThreshold`.
+ */
+export async function readImageText(buffer: Buffer): Promise<string | null> {
+  // Drop empty entries so that every "no language configured" shape
+  // (`[]`, `[""]`, `["", ""]`) disables OCR instead of reaching the worker.
+  const langs = serverConfig.ocr.langs.filter((lang) => lang !== "");
+  if (langs.length === 0) {
+    return null;
+  }
+  const worker = await createWorker(langs, undefined, {
+    // Fall back to the OS temp directory when no cache directory is configured.
+    cachePath: serverConfig.ocr.cacheDir ?? os.tmpdir(),
+  });
+  try {
+    const ret = await worker.recognize(buffer);
+    // Results at or below the configured confidence threshold are discarded.
+    if (ret.data.confidence <= serverConfig.ocr.confidenceThreshold) {
+      return null;
+    }
+    return ret.data.text;
+  } finally {
+    // Always release the worker, even when recognition throws.
+    await worker.terminate();
+  }
+}
+
export async function readPDFText(buffer: Buffer): Promise<{
text: string;
metadata: Record<string, string>;