From e5cb9aa848009ea22c1385e4d33b7edf372979fb Mon Sep 17 00:00:00 2001 From: Ahmad Mujahid <55625580+AhmadMuj@users.noreply.github.com> Date: Mon, 17 Feb 2025 13:25:16 +0400 Subject: feat: Add PDF screenshot generation and display (#995) * Updated pdf2json to 3.1.5 * Extract and store a screenshot from PDF files using pdf2pic * Installing graphicsmagick and ghostscript * Generate Missing PDF screenshot with tidyAssets worker for backward support * Display PDF screenshot instead of the PDF in web if it exists. * Display PDF screenshot in mobile app if exists. * Updated pnpm-lock.yaml * Removed console.log * Revert the unnecessary changes in package.json * Revert pnpm-lock changes * Prevent rendering PDF files if the screenshot is not generated * refactor: replace useEffect with useMemo for section initialization * feat: show PDF file download button and handle large PDFs by defaulting to screenshot view * feat: add file size to openapi spec * feature: Add Assets preprocessing in fix mode to admin actions * i18n: add reprocess_assets_fix_mode translation * i18n: Add missing ar translations * A bunch of fixes * Fix openspec schema --------- Co-authored-by: Mohamed Bassem --- apps/workers/assetPreprocessingWorker.ts | 213 +++++++++++++++++++++++++------ 1 file changed, 175 insertions(+), 38 deletions(-) (limited to 'apps/workers/assetPreprocessingWorker.ts') diff --git a/apps/workers/assetPreprocessingWorker.ts b/apps/workers/assetPreprocessingWorker.ts index 5c4937e5..f94eeb9e 100644 --- a/apps/workers/assetPreprocessingWorker.ts +++ b/apps/workers/assetPreprocessingWorker.ts @@ -2,12 +2,18 @@ import os from "os"; import { eq } from "drizzle-orm"; import { DequeuedJob, Runner } from "liteque"; import PDFParser from "pdf2json"; +import { fromBuffer } from "pdf2pic"; import { createWorker } from "tesseract.js"; import type { AssetPreprocessingRequest } from "@hoarder/shared/queues"; import { db } from "@hoarder/db"; -import { bookmarkAssets, bookmarks } from "@hoarder/db/schema"; -import { readAsset } from "@hoarder/shared/assetdb"; +import { + assets, + AssetTypes, + bookmarkAssets, + bookmarks, +} from "@hoarder/db/schema"; +import { newAssetId, readAsset, saveAsset } from "@hoarder/shared/assetdb"; import serverConfig from "@hoarder/shared/config"; import logger from "@hoarder/shared/logger"; import { @@ -67,17 +73,14 @@ async function readImageText(buffer: Buffer) { async function readPDFText(buffer: Buffer): Promise<{ text: string; - metadata: Record; + metadata: Record; }> { return new Promise((resolve, reject) => { - // Need raw text flag represents as number (1), reference : https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265 - const pdfParser = new PDFParser(null, 1); + const pdfParser = new PDFParser(null, true); pdfParser.on("pdfParser_dataError", reject); pdfParser.on("pdfParser_dataReady", (pdfData) => { resolve({ - // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327 - // eslint-disable-next-line - text: (pdfParser as any).getRawTextContent(), + text: pdfParser.getRawTextContent(), metadata: pdfData.Meta, }); }); @@ -85,11 +88,102 @@ async function readPDFText(buffer: Buffer): Promise<{ }); } -async function preprocessImage( +export async function extractAndSavePDFScreenshot( jobId: string, asset: Buffer, -): Promise<{ content: string; metadata: string | null } | undefined> { + bookmark: NonNullable>>, + isFixMode: boolean, +): Promise { + { + const alreadyHasScreenshot = + bookmark.assets.find( + (r) => r.assetType === AssetTypes.ASSET_SCREENSHOT, + ) !== undefined; + if (alreadyHasScreenshot && isFixMode) { + logger.info( + `[assetPreprocessing][${jobId}] Skipping PDF screenshot generation as it's already been generated.`, + ); + return false; + } + } + logger.info( + `[assetPreprocessing][${jobId}] Attempting to generate PDF screenshot for bookmarkId: ${bookmark.id}`, + ); + try { + /** + * If you encountered any issues with this library, make sure you have ghostscript and graphicsmagick installed following this URL + * https://github.com/yakovmeister/pdf2image/blob/HEAD/docs/gm-installation.md + */ + const screenshot = await fromBuffer(asset, { + density: 100, + quality: 100, + format: "png", + preserveAspectRatio: true, + })(1, { responseType: "buffer" }); + + if (!screenshot.buffer) { + logger.error( + `[assetPreprocessing][${jobId}] Failed to generate PDF screenshot`, + ); + return false; + } + + // Store the screenshot + const assetId = newAssetId(); + const fileName = "screenshot.png"; + const contentType = "image/png"; + await saveAsset({ + userId: bookmark.userId, + assetId, + asset: screenshot.buffer, + metadata: { + contentType, + fileName, + }, + }); + + // Insert into database + await db.insert(assets).values({ + id: assetId, + bookmarkId: bookmark.id, + userId: bookmark.userId, + assetType: AssetTypes.ASSET_SCREENSHOT, + contentType, + size: screenshot.buffer.byteLength, + fileName, + }); + + logger.info( + `[assetPreprocessing][${jobId}] Successfully saved PDF screenshot to database`, + ); + return true; + } catch (error) { + logger.error( + `[assetPreprocessing][${jobId}] Failed to process PDF screenshot: ${error}`, + ); + return false; + } +} + +async function extractAndSaveImageText( + jobId: string, + asset: Buffer, + bookmark: NonNullable>>, + isFixMode: boolean, +): Promise { + { + const alreadyHasText = !!bookmark.asset.content; + if (alreadyHasText && isFixMode) { + logger.info( + `[assetPreprocessing][${jobId}] Skipping image text extraction as it's already been extracted.`, + ); + return false; + } + } let imageText = null; + logger.info( + `[assetPreprocessing][${jobId}] Attempting to extract text from image.`, + ); try { imageText = await readImageText(asset); } catch (e) { @@ -98,19 +192,40 @@ async function preprocessImage( ); } if (!imageText) { - return undefined; + return false; } logger.info( `[assetPreprocessing][${jobId}] Extracted ${imageText.length} characters from image.`, ); - return { content: imageText, metadata: null }; + await db + .update(bookmarkAssets) + .set({ + content: imageText, + metadata: null, + }) + .where(eq(bookmarkAssets.id, bookmark.id)); + return true; } -async function preProcessPDF( +async function extractAndSavePDFText( jobId: string, asset: Buffer, -): Promise<{ content: string; metadata: string | null } | undefined> { + bookmark: NonNullable>>, + isFixMode: boolean, +): Promise { + { + const alreadyHasText = !!bookmark.asset.content; + if (alreadyHasText && isFixMode) { + logger.info( + `[assetPreprocessing][${jobId}] Skipping PDF text extraction as it's already been extracted.`, + ); + return false; + } + } + logger.info( + `[assetPreprocessing][${jobId}] Attempting to extract text from pdf.`, + ); const pdfParse = await readPDFText(asset); if (!pdfParse?.text) { throw new Error( @@ -120,13 +235,28 @@ async function preProcessPDF( logger.info( `[assetPreprocessing][${jobId}] Extracted ${pdfParse.text.length} characters from pdf.`, ); - return { - content: pdfParse.text, - metadata: pdfParse.metadata ? JSON.stringify(pdfParse.metadata) : null, - }; + await db + .update(bookmarkAssets) + .set({ + content: pdfParse.text, + metadata: pdfParse.metadata ? JSON.stringify(pdfParse.metadata) : null, + }) + .where(eq(bookmarkAssets.id, bookmark.id)); + return true; +} + +async function getBookmark(bookmarkId: string) { + return db.query.bookmarks.findFirst({ + where: eq(bookmarks.id, bookmarkId), + with: { + asset: true, + assets: true, + }, + }); } async function run(req: DequeuedJob) { + const isFixMode = req.data.fixMode; const jobId = req.id; const bookmarkId = req.data.bookmarkId; @@ -134,6 +264,7 @@ async function run(req: DequeuedJob) { where: eq(bookmarks.id, bookmarkId), with: { asset: true, + assets: true, }, }); @@ -162,15 +293,29 @@ async function run(req: DequeuedJob) { ); } - let result: { content: string; metadata: string | null } | undefined = - undefined; - + let anythingChanged = false; switch (bookmark.asset.assetType) { case "image": - result = await preprocessImage(jobId, asset); + anythingChanged ||= await extractAndSaveImageText( + jobId, + asset, + bookmark, + isFixMode, + ); break; case "pdf": - result = await preProcessPDF(jobId, asset); + anythingChanged ||= await extractAndSavePDFText( + jobId, + asset, + bookmark, + isFixMode, + ); + anythingChanged ||= await extractAndSavePDFScreenshot( + jobId, + asset, + bookmark, + isFixMode, + ); break; default: throw new Error( @@ -178,20 +323,12 @@ async function run(req: DequeuedJob) { ); } - if (result) { - await db - .update(bookmarkAssets) - .set({ - content: result.content, - metadata: result.metadata, - }) - .where(eq(bookmarkAssets.id, bookmarkId)); - } - - await OpenAIQueue.enqueue({ - bookmarkId, - }); + if (anythingChanged) { + await OpenAIQueue.enqueue({ + bookmarkId, + }); - // Update the search index - await triggerSearchReindex(bookmarkId); + // Update the search index + await triggerSearchReindex(bookmarkId); + } } -- cgit v1.2.3-70-g09d2