aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers/utils.ts
diff options
context:
space:
mode:
Diffstat (limited to 'apps/workers/utils.ts')
-rw-r--r--apps/workers/utils.ts32
1 files changed, 32 insertions, 0 deletions
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index 2f56d3f0..f8c48408 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -1,3 +1,5 @@
+import PDFParser from "pdf2json";
+
export function withTimeout<T, Ret>(
func: (param: T) => Promise<Ret>,
timeoutSec: number,
@@ -14,3 +16,33 @@ export function withTimeout<T, Ret>(
]);
};
}
+
/**
 * Parses an in-memory PDF with pdf2json and returns its raw text together
 * with the `Meta` object reported by the parser.
 *
 * @param buffer - The PDF file contents.
 * @returns A promise resolving to `{ text, metadata }` once the parser emits
 *   `pdfParser_dataReady`.
 * @throws Rejects with whatever payload pdf2json passes to its
 *   `pdfParser_dataError` event.
 */
export async function readPDFText(buffer: Buffer): Promise<{
  text: string;
  metadata: Record<string, string>;
}> {
  // The raw-text flag has to be passed as the number 1, reference:
  // https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265
  const parser = new PDFParser(null, 1);

  return new Promise((resolve, reject) => {
    parser.on("pdfParser_dataError", reject);
    parser.on("pdfParser_dataReady", (pdfData) => {
      // getRawTextContent is missing from the published typings, reference:
      // https://github.com/modesty/pdf2json/issues/327
      // eslint-disable-next-line
      const rawText = (parser as any).getRawTextContent();
      // eslint-disable-next-line
      resolve({ text: rawText, metadata: pdfData.Meta });
    });
    // Handlers are registered first so no event can be missed.
    parser.parseBuffer(buffer);
  });
}
+
/**
 * Limits `content` to at most `length` space-separated words.
 *
 * Words are delimited by a single space character (" "). Content that has
 * `length` words or fewer is returned unchanged; otherwise only the first
 * `length` words are kept and re-joined with single spaces.
 *
 * @param content - Text to truncate.
 * @param length - Maximum number of words to keep (defaults to 1500).
 * @returns The original text, or its first `length` words.
 */
export function truncateContent(content: string, length = 1500): string {
  const words = content.split(" ");
  if (words.length <= length) {
    return content;
  }
  // Keep the leading words: `slice(length)` would drop them and return the
  // tail of the text instead of its beginning.
  return words.slice(0, length).join(" ");
}