feature: Add PDF support (#88)

* feature: Add PDF support * fix: PDF feature enhancements * fix: Freeze expo-share-intent version to prevent breaking changes * fix: set endOfLine to auto for cross-platform development * fix: Upgrading eslint/parser and eslint-plugin to 7.6.0 to solve the linting issues * fix: enhancing PDF feature * fix: Allowing null in filename for backward compatibility * fix: update pnpm file with pnpm 9.0.0-alpha-8 * fix:(web): PDF Preview for web
author: Ahmad Mujahid <55625580+AhmadMuj@users.noreply.github.com> 2024-04-11 15:29:51 +0400
committer: GitHub <noreply@github.com> 2024-04-11 14:29:51 +0300
commit: be622e5594ecb21c82bb6066a82c86e0917bcc35 (patch)
tree: e77973630b30bb5d51abc2ade6993c523a8413b9 /apps/workers
parent: 2806701318dff77b10a5574d4b26ef6032f6b9bc (diff)
download: karakeep-be622e5594ecb21c82bb6066a82c86e0917bcc35.tar.zst
4 files changed, 98 insertions, 12 deletions
diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts
index c7b519e2..b07e02fe 100644
--- a/apps/workers/openaiWorker.ts
+++ b/apps/workers/openaiWorker.ts
@@ -5,7 +5,12 @@ import { z } from "zod";
 
 import type { ZOpenAIRequest } from "@hoarder/shared/queues";
 import { db } from "@hoarder/db";
-import { bookmarks, bookmarkTags, tagsOnBookmarks } from "@hoarder/db/schema";
+import {
+  bookmarkAssets,
+  bookmarks,
+  bookmarkTags,
+  tagsOnBookmarks,
+} from "@hoarder/db/schema";
 import { readAsset } from "@hoarder/shared/assetdb";
 import serverConfig from "@hoarder/shared/config";
 import logger from "@hoarder/shared/logger";
@@ -18,6 +23,7 @@ import {
 
 import type { InferenceClient } from "./inference";
 import { InferenceClientFactory } from "./inference";
+import { readPDFText, truncateContent } from "./utils";
 
 const openAIResponseSchema = z.object({
   tags: z.array(z.string()),
@@ -91,14 +97,6 @@ CONTENT START HERE:
 function buildPrompt(
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
 ) {
-  const truncateContent = (content: string) => {
-    let words = content.split(" ");
-    if (words.length > 1500) {
-      words = words.slice(1500);
-      content = words.join(" ");
-    }
-    return content;
-  };
   if (bookmark.link) {
     if (!bookmark.link.description && !bookmark.link.content) {
       throw new Error(
@@ -158,14 +156,48 @@ async function inferTagsFromImage(
     );
   }
   const base64 = asset.toString("base64");
-
-  return await inferenceClient.inferFromImage(
+  return inferenceClient.inferFromImage(
     IMAGE_PROMPT_BASE,
     metadata.contentType,
     base64,
   );
 }
 
+// Reads the bookmark's PDF asset, stores its extracted text and metadata on the
+// bookmarkAssets row, then sends a prompt built from that text for inference.
+async function inferTagsFromPDF(
+  jobId: string,
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+  inferenceClient: InferenceClient,
+) {
+  const assetId = bookmark.asset.assetId;
+  const { asset } = await readAsset({ userId: bookmark.userId, assetId });
+  if (!asset) {
+    throw new Error(
+      `[inference][${jobId}] AssetId ${assetId} for bookmark ${bookmark.id} not found`,
+    );
+  }
+  const parsed = await readPDFText(asset);
+  if (!parsed?.text) {
+    throw new Error(
+      `[inference][${jobId}] PDF text is empty. Please make sure that the PDF includes text and not just images.`,
+    );
+  }
+
+  // Persist what was extracted before running inference on it.
+  const { text, metadata } = parsed;
+  const metadataJson = metadata ? JSON.stringify(metadata) : null;
+  await db
+    .update(bookmarkAssets)
+    .set({ content: text, metadata: metadataJson })
+    .where(eq(bookmarkAssets.id, bookmark.id));
+
+  const prompt = `${TEXT_PROMPT_BASE}
+Content: ${truncateContent(text)}
+`;
+  return inferenceClient.inferFromText(prompt);
+}
+
 async function inferTagsFromText(
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
   inferenceClient: InferenceClient,
@@ -182,11 +214,24 @@ async function inferTags(
   if (bookmark.link || bookmark.text) {
     response = await inferTagsFromText(bookmark, inferenceClient);
   } else if (bookmark.asset) {
-    response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
+    switch (bookmark.asset.assetType) {
+      case "image":
+        response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
+        break;
+      case "pdf":
+        response = await inferTagsFromPDF(jobId, bookmark, inferenceClient);
+        break;
+      default:
+        throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
+    }
   } else {
     throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
   }
 
+  if (!response) {
+    throw new Error(`[inference][${jobId}] Inference response is empty`);
+  }
+
   try {
     let tags = openAIResponseSchema.parse(JSON.parse(response.response)).tags;
     logger.info(
diff --git a/apps/workers/package.json b/apps/workers/package.json
index c9de43a4..e14c576b 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -26,6 +26,8 @@
     "metascraper-url": "^5.43.4",
     "ollama": "^0.5.0",
     "openai": "^4.29.0",
+    "pdf2json": "^3.0.5",
+    "pdfjs-dist": "^4.0.379",
     "puppeteer": "^22.0.0",
     "puppeteer-extra": "^3.3.6",
     "puppeteer-extra-plugin-adblocker": "^2.13.6",
diff --git a/apps/workers/searchWorker.ts b/apps/workers/searchWorker.ts
index 79b0c8c1..fcef7a1b 100644
--- a/apps/workers/searchWorker.ts
+++ b/apps/workers/searchWorker.ts
@@ -48,6 +48,7 @@ async function runIndex(
     with: {
       link: true,
       text: true,
+      asset: true,
       tagsOnBookmarks: {
         with: {
           tag: true,
@@ -72,6 +73,12 @@ async function runIndex(
             content: bookmark.link.content,
           }
         : undefined),
+      ...(bookmark.asset
+        ? {
+            content: bookmark.asset.content,
+            metadata: bookmark.asset.metadata,
+          }
+        : undefined),
       ...(bookmark.text ? { content: bookmark.text.text } : undefined),
       note: bookmark.note,
       createdAt: bookmark.createdAt.toISOString(),
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index 2f56d3f0..f8c48408 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -1,3 +1,5 @@
+import PDFParser from "pdf2json";
+
 export function withTimeout<T, Ret>(
   func: (param: T) => Promise<Ret>,
   timeoutSec: number,
@@ -14,3 +16,33 @@ export function withTimeout<T, Ret>(
     ]);
   };
 }
+
+export async function readPDFText(buffer: Buffer): Promise<{
+  text: string;
+  metadata: Record<string, string>;
+}> {
+  return new Promise((resolve, reject) => {
+    // Parse with pdf2json in raw-text mode: the flag must be passed as the number 1, see https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265
+    const pdfParser = new PDFParser(null, 1);
+    pdfParser.on("pdfParser_dataError", reject);
+    pdfParser.on("pdfParser_dataReady", (pdfData) => {
+      // eslint-disable-next-line
+      resolve({
+        // pdf2json's typings for getRawTextContent are not set correctly (hence the `any` cast), see https://github.com/modesty/pdf2json/issues/327
+        // eslint-disable-next-line
+        text: (pdfParser as any).getRawTextContent(),
+        metadata: pdfData.Meta,
+      });
+    });
+    pdfParser.parseBuffer(buffer);
+  });
+}
+
+// Caps `content` at its first `length` space-separated words (default 1500);
+// input with `length` words or fewer is returned unchanged.
+export function truncateContent(content: string, length = 1500) {
+  const words = content.split(" ");
+  // Keep the head of the text: slice(0, length), not slice(length).
+  if (words.length <= length) return content;
+  return words.slice(0, length).join(" ");
+}
author	Ahmad Mujahid <55625580+AhmadMuj@users.noreply.github.com>	2024-04-11 15:29:51 +0400
committer	GitHub <noreply@github.com>	2024-04-11 14:29:51 +0300
commit	be622e5594ecb21c82bb6066a82c86e0917bcc35 (patch)
tree	e77973630b30bb5d51abc2ade6993c523a8413b9 /apps/workers
parent	2806701318dff77b10a5574d4b26ef6032f6b9bc (diff)
download	karakeep-be622e5594ecb21c82bb6066a82c86e0917bcc35.tar.zst