import os from "os";
import PDFParser from "pdf2json";
import { createWorker } from "tesseract.js";

import serverConfig from "@hoarder/shared/config";

/**
 * Runs OCR over an image and returns the text that was recognized.
 *
 * OCR is treated as disabled when `serverConfig.ocr.langs` contains no usable
 * language code. Entries are trimmed and blank ones dropped before the check,
 * so a list such as `[""]`, `[]` or `["eng", ""]` never hands an empty
 * language code to tesseract.
 *
 * @param buffer - Raw image bytes passed to the tesseract worker.
 * @returns The recognized text, or `null` when OCR is disabled or the
 *   recognition confidence is at or below
 *   `serverConfig.ocr.confidenceThreshold`.
 */
export async function readImageText(buffer: Buffer): Promise<string | null> {
  // Normalize the configured languages; blank entries are not valid codes.
  const langs = serverConfig.ocr.langs
    .map((lang) => lang.trim())
    .filter((lang) => lang.length > 0);
  if (langs.length === 0) {
    return null;
  }

  const worker = await createWorker(langs, undefined, {
    // Fall back to the OS temp directory when no cache directory is configured.
    cachePath: serverConfig.ocr.cacheDir ?? os.tmpdir(),
  });
  try {
    const { data } = await worker.recognize(buffer);
    // Low-confidence results are discarded rather than returned as text.
    if (data.confidence <= serverConfig.ocr.confidenceThreshold) {
      return null;
    }
    return data.text;
  } finally {
    // Always release the worker, even when recognition throws.
    await worker.terminate();
  }
}