From be622e5594ecb21c82bb6066a82c86e0917bcc35 Mon Sep 17 00:00:00 2001 From: Ahmad Mujahid <55625580+AhmadMuj@users.noreply.github.com> Date: Thu, 11 Apr 2024 15:29:51 +0400 Subject: feature: Add PDF support (#88) * feature: Add PDF support * fix: PDF feature enhancements * fix: Freeze expo-share-intent version to prevent breaking changes * fix: set endOfLine to auto for cross-platform development * fix: Upgrading eslint/parser and eslint-plugin to 7.6.0 to solve the linting issues * fix: enhancing PDF feature * fix: Allowing null in fiename for backward compatibility * fix: update pnpm file with pnpm 9.0.0-alpha-8 * fix:(web): PDF Preview for web --- apps/workers/openaiWorker.ts | 69 ++++++++++++++++++++++++++++++++++++-------- apps/workers/package.json | 2 ++ apps/workers/searchWorker.ts | 7 +++++ apps/workers/utils.ts | 32 ++++++++++++++++++++ 4 files changed, 98 insertions(+), 12 deletions(-) (limited to 'apps/workers') diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts index c7b519e2..b07e02fe 100644 --- a/apps/workers/openaiWorker.ts +++ b/apps/workers/openaiWorker.ts @@ -5,7 +5,12 @@ import { z } from "zod"; import type { ZOpenAIRequest } from "@hoarder/shared/queues"; import { db } from "@hoarder/db"; -import { bookmarks, bookmarkTags, tagsOnBookmarks } from "@hoarder/db/schema"; +import { + bookmarkAssets, + bookmarks, + bookmarkTags, + tagsOnBookmarks, +} from "@hoarder/db/schema"; import { readAsset } from "@hoarder/shared/assetdb"; import serverConfig from "@hoarder/shared/config"; import logger from "@hoarder/shared/logger"; @@ -18,6 +23,7 @@ import { import type { InferenceClient } from "./inference"; import { InferenceClientFactory } from "./inference"; +import { readPDFText, truncateContent } from "./utils"; const openAIResponseSchema = z.object({ tags: z.array(z.string()), @@ -91,14 +97,6 @@ CONTENT START HERE: function buildPrompt( bookmark: NonNullable>>, ) { - const truncateContent = (content: string) => { - let words = content.split(" "); - if (words.length > 1500) { - words = words.slice(1500); - content = words.join(" "); - } - return content; - }; if (bookmark.link) { if (!bookmark.link.description && !bookmark.link.content) { throw new Error( @@ -158,14 +156,48 @@ async function inferTagsFromImage( ); } const base64 = asset.toString("base64"); - - return await inferenceClient.inferFromImage( + return inferenceClient.inferFromImage( IMAGE_PROMPT_BASE, metadata.contentType, base64, ); } +async function inferTagsFromPDF( + jobId: string, + bookmark: NonNullable>>, + inferenceClient: InferenceClient, +) { + const { asset } = await readAsset({ + userId: bookmark.userId, + assetId: bookmark.asset.assetId, + }); + if (!asset) { + throw new Error( + `[inference][${jobId}] AssetId ${bookmark.asset.assetId} for bookmark ${bookmark.id} not found`, + ); + } + const pdfParse = await readPDFText(asset); + if (!pdfParse?.text) { + throw new Error( + `[inference][${jobId}] PDF text is empty. Please make sure that the PDF includes text and not just images.`, + ); + } + + await db + .update(bookmarkAssets) + .set({ + content: pdfParse.text, + metadata: pdfParse.metadata ? JSON.stringify(pdfParse.metadata) : null, + }) + .where(eq(bookmarkAssets.id, bookmark.id)); + + const prompt = `${TEXT_PROMPT_BASE} +Content: ${truncateContent(pdfParse.text)} +`; + return inferenceClient.inferFromText(prompt); +} + async function inferTagsFromText( bookmark: NonNullable>>, inferenceClient: InferenceClient, @@ -182,11 +214,24 @@ async function inferTags( if (bookmark.link || bookmark.text) { response = await inferTagsFromText(bookmark, inferenceClient); } else if (bookmark.asset) { - response = await inferTagsFromImage(jobId, bookmark, inferenceClient); + switch (bookmark.asset.assetType) { + case "image": + response = await inferTagsFromImage(jobId, bookmark, inferenceClient); + break; + case "pdf": + response = await inferTagsFromPDF(jobId, bookmark, inferenceClient); + break; + default: + throw new Error(`[inference][${jobId}] Unsupported bookmark type`); + } } else { throw new Error(`[inference][${jobId}] Unsupported bookmark type`); } + if (!response) { + throw new Error(`[inference][${jobId}] Inference response is empty`); + } + try { let tags = openAIResponseSchema.parse(JSON.parse(response.response)).tags; logger.info( diff --git a/apps/workers/package.json b/apps/workers/package.json index c9de43a4..e14c576b 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -26,6 +26,8 @@ "metascraper-url": "^5.43.4", "ollama": "^0.5.0", "openai": "^4.29.0", + "pdf2json": "^3.0.5", + "pdfjs-dist": "^4.0.379", "puppeteer": "^22.0.0", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-adblocker": "^2.13.6", diff --git a/apps/workers/searchWorker.ts b/apps/workers/searchWorker.ts index 79b0c8c1..fcef7a1b 100644 --- a/apps/workers/searchWorker.ts +++ b/apps/workers/searchWorker.ts @@ -48,6 +48,7 @@ async function runIndex( with: { link: true, text: true, + asset: true, tagsOnBookmarks: { with: { tag: true, @@ -72,6 +73,12 @@ async function runIndex( content: bookmark.link.content, } : undefined), + ...(bookmark.asset + ? { + content: bookmark.asset.content, + metadata: bookmark.asset.metadata, + } + : undefined), ...(bookmark.text ? { content: bookmark.text.text } : undefined), note: bookmark.note, createdAt: bookmark.createdAt.toISOString(), diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts index 2f56d3f0..f8c48408 100644 --- a/apps/workers/utils.ts +++ b/apps/workers/utils.ts @@ -1,3 +1,5 @@ +import PDFParser from "pdf2json"; + export function withTimeout( func: (param: T) => Promise, timeoutSec: number, @@ -14,3 +16,33 @@ export function withTimeout( ]); }; } + +export async function readPDFText(buffer: Buffer): Promise<{ + text: string; + metadata: Record; +}> { + return new Promise((resolve, reject) => { + // Need raw text flag represents as number (1), reference : https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265 + const pdfParser = new PDFParser(null, 1); + pdfParser.on("pdfParser_dataError", reject); + pdfParser.on("pdfParser_dataReady", (pdfData) => { + // eslint-disable-next-line + resolve({ + // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327 + // eslint-disable-next-line + text: (pdfParser as any).getRawTextContent(), + metadata: pdfData.Meta, + }); + }); + pdfParser.parseBuffer(buffer); + }); +} + +export function truncateContent(content: string, length = 1500) { + let words = content.split(" "); + if (words.length > length) { + words = words.slice(length); + content = words.join(" "); + } + return content; +} -- cgit v1.2.3-70-g09d2