feature: Add PDF support (#88)

* feature: Add PDF support * fix: PDF feature enhancements * fix: Freeze expo-share-intent version to prevent breaking changes * fix: set endOfLine to auto for cross-platform development * fix: Upgrading eslint/parser and eslint-plugin to 7.6.0 to solve the linting issues * fix: enhancing PDF feature * fix: Allowing null in filename for backward compatibility * fix: update pnpm file with pnpm 9.0.0-alpha-8 * fix(web): PDF Preview for web
author: Ahmad Mujahid <55625580+AhmadMuj@users.noreply.github.com> 2024-04-11 15:29:51 +0400
committer: GitHub <noreply@github.com> 2024-04-11 14:29:51 +0300
commit: be622e5594ecb21c82bb6066a82c86e0917bcc35 (patch)
tree: e77973630b30bb5d51abc2ade6993c523a8413b9 /apps/workers/openaiWorker.ts
parent: 2806701318dff77b10a5574d4b26ef6032f6b9bc (diff)
download: karakeep-be622e5594ecb21c82bb6066a82c86e0917bcc35.tar.zst
1 files changed, 57 insertions, 12 deletions
diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts
index c7b519e2..b07e02fe 100644
--- a/apps/workers/openaiWorker.ts
+++ b/apps/workers/openaiWorker.ts
@@ -5,7 +5,12 @@ import { z } from "zod";
 
 import type { ZOpenAIRequest } from "@hoarder/shared/queues";
 import { db } from "@hoarder/db";
-import { bookmarks, bookmarkTags, tagsOnBookmarks } from "@hoarder/db/schema";
+import {
+  bookmarkAssets,
+  bookmarks,
+  bookmarkTags,
+  tagsOnBookmarks,
+} from "@hoarder/db/schema";
 import { readAsset } from "@hoarder/shared/assetdb";
 import serverConfig from "@hoarder/shared/config";
 import logger from "@hoarder/shared/logger";
@@ -18,6 +23,7 @@ import {
 
 import type { InferenceClient } from "./inference";
 import { InferenceClientFactory } from "./inference";
+import { readPDFText, truncateContent } from "./utils";
 
 const openAIResponseSchema = z.object({
   tags: z.array(z.string()),
@@ -91,14 +97,6 @@ CONTENT START HERE:
 function buildPrompt(
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
 ) {
-  const truncateContent = (content: string) => {
-    let words = content.split(" ");
-    if (words.length > 1500) {
-      words = words.slice(1500);
-      content = words.join(" ");
-    }
-    return content;
-  };
   if (bookmark.link) {
     if (!bookmark.link.description && !bookmark.link.content) {
       throw new Error(
@@ -158,14 +156,48 @@ async function inferTagsFromImage(
     );
   }
   const base64 = asset.toString("base64");
-
-  return await inferenceClient.inferFromImage(
+  return inferenceClient.inferFromImage(
     IMAGE_PROMPT_BASE,
     metadata.contentType,
     base64,
   );
 }
 
+async function inferTagsFromPDF(
+  jobId: string,
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+  inferenceClient: InferenceClient,
+) {
+  const { asset } = await readAsset({
+    userId: bookmark.userId,
+    assetId: bookmark.asset.assetId,
+  });
+  if (!asset) {
+    throw new Error(
+      `[inference][${jobId}] AssetId ${bookmark.asset.assetId} for bookmark ${bookmark.id} not found`,
+    );
+  }
+  const pdfParse = await readPDFText(asset);
+  if (!pdfParse?.text) {
+    throw new Error(
+      `[inference][${jobId}] PDF text is empty. Please make sure that the PDF includes text and not just images.`,
+    );
+  }
+
+  await db
+    .update(bookmarkAssets)
+    .set({
+      content: pdfParse.text,
+      metadata: pdfParse.metadata ? JSON.stringify(pdfParse.metadata) : null,
+    })
+    .where(eq(bookmarkAssets.id, bookmark.id));
+
+  const prompt = `${TEXT_PROMPT_BASE}
+Content: ${truncateContent(pdfParse.text)}
+`;
+  return inferenceClient.inferFromText(prompt);
+}
+
 async function inferTagsFromText(
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
   inferenceClient: InferenceClient,
@@ -182,11 +214,24 @@ async function inferTags(
   if (bookmark.link || bookmark.text) {
     response = await inferTagsFromText(bookmark, inferenceClient);
   } else if (bookmark.asset) {
-    response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
+    switch (bookmark.asset.assetType) {
+      case "image":
+        response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
+        break;
+      case "pdf":
+        response = await inferTagsFromPDF(jobId, bookmark, inferenceClient);
+        break;
+      default:
+        throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
+    }
   } else {
     throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
   }
 
+  if (!response) {
+    throw new Error(`[inference][${jobId}] Inference response is empty`);
+  }
+
   try {
     let tags = openAIResponseSchema.parse(JSON.parse(response.response)).tags;
     logger.info(
author	Ahmad Mujahid <55625580+AhmadMuj@users.noreply.github.com>	2024-04-11 15:29:51 +0400
committer	GitHub <noreply@github.com>	2024-04-11 14:29:51 +0300
commit	be622e5594ecb21c82bb6066a82c86e0917bcc35 (patch)
tree	e77973630b30bb5d51abc2ade6993c523a8413b9 /apps/workers/openaiWorker.ts
parent	2806701318dff77b10a5574d4b26ef6032f6b9bc (diff)
download	karakeep-be622e5594ecb21c82bb6066a82c86e0917bcc35.tar.zst