diff options
| author | Mohamed Bassem <me@mbassem.com> | 2024-12-25 23:53:46 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2024-12-26 00:11:19 +0000 |
| commit | 9a950e1068a7309d0cb36ffd33ecd2cd0af5c004 (patch) | |
| tree | 93af30d9aee26995350aaa029f86a8d2abf722d4 /apps/workers/utils.ts | |
| parent | 86a4030c5fcbe2cb6ecaa0bd17136f950af34260 (diff) | |
| download | karakeep-9a950e1068a7309d0cb36ffd33ecd2cd0af5c004.tar.zst | |
refactor: Move asset preprocessing to its own worker out of the inference worker
Diffstat (limited to 'apps/workers/utils.ts')
| -rw-r--r-- | apps/workers/utils.ts | 44 |
1 file changed, 0 insertions, 44 deletions
```diff
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index 15634902..2f56d3f0 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -1,9 +1,3 @@
-import os from "os";
-import PDFParser from "pdf2json";
-import { createWorker } from "tesseract.js";
-
-import serverConfig from "@hoarder/shared/config";
-
 export function withTimeout<T, Ret>(
   func: (param: T) => Promise<Ret>,
   timeoutSec: number,
@@ -20,41 +14,3 @@ export function withTimeout<T, Ret>(
     ]);
   };
 }
-
-export async function readImageText(buffer: Buffer) {
-  if (serverConfig.ocr.langs.length == 1 && serverConfig.ocr.langs[0] == "") {
-    return null;
-  }
-  const worker = await createWorker(serverConfig.ocr.langs, undefined, {
-    cachePath: serverConfig.ocr.cacheDir ?? os.tmpdir(),
-  });
-  try {
-    const ret = await worker.recognize(buffer);
-    if (ret.data.confidence <= serverConfig.ocr.confidenceThreshold) {
-      return null;
-    }
-    return ret.data.text;
-  } finally {
-    await worker.terminate();
-  }
-}
-
-export async function readPDFText(buffer: Buffer): Promise<{
-  text: string;
-  metadata: Record<string, string>;
-}> {
-  return new Promise((resolve, reject) => {
-    // Need raw text flag represents as number (1), reference : https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265
-    const pdfParser = new PDFParser(null, 1);
-    pdfParser.on("pdfParser_dataError", reject);
-    pdfParser.on("pdfParser_dataReady", (pdfData) => {
-      resolve({
-        // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327
-        // eslint-disable-next-line
-        text: (pdfParser as any).getRawTextContent(),
-        metadata: pdfData.Meta,
-      });
-    });
-    pdfParser.parseBuffer(buffer);
-  });
-}
```
