From 9a950e1068a7309d0cb36ffd33ecd2cd0af5c004 Mon Sep 17 00:00:00 2001 From: Mohamed Bassem Date: Wed, 25 Dec 2024 23:53:46 +0000 Subject: refactor: Move asset preprocessing to its own worker out of the inference worker --- apps/workers/utils.ts | 44 -------------------------------------------- 1 file changed, 44 deletions(-) (limited to 'apps/workers/utils.ts') diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts index 15634902..2f56d3f0 100644 --- a/apps/workers/utils.ts +++ b/apps/workers/utils.ts @@ -1,9 +1,3 @@ -import os from "os"; -import PDFParser from "pdf2json"; -import { createWorker } from "tesseract.js"; - -import serverConfig from "@hoarder/shared/config"; - export function withTimeout( func: (param: T) => Promise, timeoutSec: number, @@ -20,41 +14,3 @@ export function withTimeout( ]); }; } - -export async function readImageText(buffer: Buffer) { - if (serverConfig.ocr.langs.length == 1 && serverConfig.ocr.langs[0] == "") { - return null; - } - const worker = await createWorker(serverConfig.ocr.langs, undefined, { - cachePath: serverConfig.ocr.cacheDir ?? os.tmpdir(), - }); - try { - const ret = await worker.recognize(buffer); - if (ret.data.confidence <= serverConfig.ocr.confidenceThreshold) { - return null; - } - return ret.data.text; - } finally { - await worker.terminate(); - } -} - -export async function readPDFText(buffer: Buffer): Promise<{ - text: string; - metadata: Record; -}> { - return new Promise((resolve, reject) => { - // Need raw text flag represents as number (1), reference : https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265 - const pdfParser = new PDFParser(null, 1); - pdfParser.on("pdfParser_dataError", reject); - pdfParser.on("pdfParser_dataReady", (pdfData) => { - resolve({ - // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327 - // eslint-disable-next-line - text: (pdfParser as any).getRawTextContent(), - metadata: pdfData.Meta, - }); - }); - pdfParser.parseBuffer(buffer); - }); -} -- cgit v1.2.3-70-g09d2