diff options
Diffstat (limited to 'apps/workers/utils.ts')
| -rw-r--r-- | apps/workers/utils.ts | 32 |
1 files changed, 32 insertions, 0 deletions
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts index 2f56d3f0..f8c48408 100644 --- a/apps/workers/utils.ts +++ b/apps/workers/utils.ts @@ -1,3 +1,5 @@ +import PDFParser from "pdf2json"; + export function withTimeout<T, Ret>( func: (param: T) => Promise<Ret>, timeoutSec: number, @@ -14,3 +16,33 @@ export function withTimeout<T, Ret>( ]); }; } + +export async function readPDFText(buffer: Buffer): Promise<{ + text: string; + metadata: Record<string, string>; +}> { + return new Promise((resolve, reject) => { + // Need raw text flag represents as number (1), reference : https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265 + const pdfParser = new PDFParser(null, 1); + pdfParser.on("pdfParser_dataError", reject); + pdfParser.on("pdfParser_dataReady", (pdfData) => { + // eslint-disable-next-line + resolve({ + // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327 + // eslint-disable-next-line + text: (pdfParser as any).getRawTextContent(), + metadata: pdfData.Meta, + }); + }); + pdfParser.parseBuffer(buffer); + }); +} + +export function truncateContent(content: string, length = 1500) { + let words = content.split(" "); + if (words.length > length) { + words = words.slice(length); + content = words.join(" "); + } + return content; +} |
