aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers
diff options
context:
space:
mode:
authorAhmad Mujahid <55625580+AhmadMuj@users.noreply.github.com>2024-04-11 15:29:51 +0400
committerGitHub <noreply@github.com>2024-04-11 14:29:51 +0300
commitbe622e5594ecb21c82bb6066a82c86e0917bcc35 (patch)
treee77973630b30bb5d51abc2ade6993c523a8413b9 /apps/workers
parent2806701318dff77b10a5574d4b26ef6032f6b9bc (diff)
downloadkarakeep-be622e5594ecb21c82bb6066a82c86e0917bcc35.tar.zst
feature: Add PDF support (#88)
* feature: Add PDF support * fix: PDF feature enhancements * fix: Freeze expo-share-intent version to prevent breaking changes * fix: set endOfLine to auto for cross-platform development * fix: Upgrading eslint/parser and eslint-plugin to 7.6.0 to solve the linting issues * fix: enhancing PDF feature * fix: Allowing null in filename for backward compatibility * fix: update pnpm file with pnpm 9.0.0-alpha-8 * fix:(web): PDF Preview for web
Diffstat (limited to 'apps/workers')
-rw-r--r--apps/workers/openaiWorker.ts69
-rw-r--r--apps/workers/package.json2
-rw-r--r--apps/workers/searchWorker.ts7
-rw-r--r--apps/workers/utils.ts32
4 files changed, 98 insertions, 12 deletions
diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts
index c7b519e2..b07e02fe 100644
--- a/apps/workers/openaiWorker.ts
+++ b/apps/workers/openaiWorker.ts
@@ -5,7 +5,12 @@ import { z } from "zod";
import type { ZOpenAIRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
-import { bookmarks, bookmarkTags, tagsOnBookmarks } from "@hoarder/db/schema";
+import {
+ bookmarkAssets,
+ bookmarks,
+ bookmarkTags,
+ tagsOnBookmarks,
+} from "@hoarder/db/schema";
import { readAsset } from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
import logger from "@hoarder/shared/logger";
@@ -18,6 +23,7 @@ import {
import type { InferenceClient } from "./inference";
import { InferenceClientFactory } from "./inference";
+import { readPDFText, truncateContent } from "./utils";
const openAIResponseSchema = z.object({
tags: z.array(z.string()),
@@ -91,14 +97,6 @@ CONTENT START HERE:
function buildPrompt(
bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
) {
- const truncateContent = (content: string) => {
- let words = content.split(" ");
- if (words.length > 1500) {
- words = words.slice(1500);
- content = words.join(" ");
- }
- return content;
- };
if (bookmark.link) {
if (!bookmark.link.description && !bookmark.link.content) {
throw new Error(
@@ -158,14 +156,48 @@ async function inferTagsFromImage(
);
}
const base64 = asset.toString("base64");
-
- return await inferenceClient.inferFromImage(
+ return inferenceClient.inferFromImage(
IMAGE_PROMPT_BASE,
metadata.contentType,
base64,
);
}
+/** Reads the bookmark's PDF asset, stores its extracted text/metadata, and infers tags from the text. */
+async function inferTagsFromPDF(
+  jobId: string,
+  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
+  inferenceClient: InferenceClient,
+) {
+  const { assetId } = bookmark.asset;
+  const { asset: pdfBuffer } = await readAsset({
+    userId: bookmark.userId,
+    assetId,
+  });
+  if (!pdfBuffer) {
+    throw new Error(
+      `[inference][${jobId}] AssetId ${assetId} for bookmark ${bookmark.id} not found`,
+    );
+  }
+  const parsed = await readPDFText(pdfBuffer);
+  const text = parsed?.text;
+  if (!text) {
+    throw new Error(
+      `[inference][${jobId}] PDF text is empty. Please make sure that the PDF includes text and not just images.`,
+    );
+  }
+  // Save the full extracted text (and serialized metadata, when present) before building the prompt.
+  const metadata = parsed.metadata ? JSON.stringify(parsed.metadata) : null;
+  await db
+    .update(bookmarkAssets)
+    .set({ content: text, metadata })
+    .where(eq(bookmarkAssets.id, bookmark.id));
+  const prompt = `${TEXT_PROMPT_BASE}
+Content: ${truncateContent(text)}
+`;
+  return inferenceClient.inferFromText(prompt);
+}
+
async function inferTagsFromText(
bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
inferenceClient: InferenceClient,
@@ -182,11 +214,24 @@ async function inferTags(
if (bookmark.link || bookmark.text) {
response = await inferTagsFromText(bookmark, inferenceClient);
} else if (bookmark.asset) {
- response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
+ switch (bookmark.asset.assetType) {
+ case "image":
+ response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
+ break;
+ case "pdf":
+ response = await inferTagsFromPDF(jobId, bookmark, inferenceClient);
+ break;
+ default:
+ throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
+ }
} else {
throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
}
+ if (!response) {
+ throw new Error(`[inference][${jobId}] Inference response is empty`);
+ }
+
try {
let tags = openAIResponseSchema.parse(JSON.parse(response.response)).tags;
logger.info(
diff --git a/apps/workers/package.json b/apps/workers/package.json
index c9de43a4..e14c576b 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -26,6 +26,8 @@
"metascraper-url": "^5.43.4",
"ollama": "^0.5.0",
"openai": "^4.29.0",
+ "pdf2json": "^3.0.5",
+ "pdfjs-dist": "^4.0.379",
"puppeteer": "^22.0.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-adblocker": "^2.13.6",
diff --git a/apps/workers/searchWorker.ts b/apps/workers/searchWorker.ts
index 79b0c8c1..fcef7a1b 100644
--- a/apps/workers/searchWorker.ts
+++ b/apps/workers/searchWorker.ts
@@ -48,6 +48,7 @@ async function runIndex(
with: {
link: true,
text: true,
+ asset: true,
tagsOnBookmarks: {
with: {
tag: true,
@@ -72,6 +73,12 @@ async function runIndex(
content: bookmark.link.content,
}
: undefined),
+ ...(bookmark.asset
+ ? {
+ content: bookmark.asset.content,
+ metadata: bookmark.asset.metadata,
+ }
+ : undefined),
...(bookmark.text ? { content: bookmark.text.text } : undefined),
note: bookmark.note,
createdAt: bookmark.createdAt.toISOString(),
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index 2f56d3f0..f8c48408 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -1,3 +1,5 @@
+import PDFParser from "pdf2json";
+
export function withTimeout<T, Ret>(
func: (param: T) => Promise<Ret>,
timeoutSec: number,
@@ -14,3 +16,33 @@ export function withTimeout<T, Ret>(
]);
};
}
+
+/** Parses a PDF buffer with pdf2json and resolves with its raw text content and the parser's Meta object. */
+export async function readPDFText(buffer: Buffer): Promise<{
+  text: string;
+  metadata: Record<string, string>;
+}> {
+  return new Promise((resolve, reject) => {
+    // The second argument (1) is the raw-text flag, reference: https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265
+    const parser = new PDFParser(null, 1);
+    parser.on("pdfParser_dataReady", (parsed) => {
+      // getRawTextContent is missing from pdf2json's typings, reference: https://github.com/modesty/pdf2json/issues/327
+      // eslint-disable-next-line
+      const text = (parser as any).getRawTextContent();
+      // eslint-disable-next-line
+      resolve({ text, metadata: parsed.Meta });
+    });
+    // Parser failures reject the promise with whatever payload pdf2json emits.
+    parser.on("pdfParser_dataError", reject);
+    parser.parseBuffer(buffer);
+  });
+}
+
+export function truncateContent(content: string, length = 1500) {
+  // Cap the text at its first `length` space-separated words; shorter text is returned untouched.
+  const words = content.split(" ");
+  if (words.length <= length) {
+    return content;
+  }
+  return words.slice(0, length).join(" ");
+}