diff options
| author | Ahmad Mujahid <55625580+AhmadMuj@users.noreply.github.com> | 2025-02-17 13:25:16 +0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-02-17 09:25:16 +0000 |
| commit | e5cb9aa848009ea22c1385e4d33b7edf372979fb (patch) | |
| tree | 89470d8da8aab10f30bbfccea8d1b0cea08a1408 /apps | |
| parent | a14be108736133535e2828b6bbdc8d0a69accd63 (diff) | |
| download | karakeep-e5cb9aa848009ea22c1385e4d33b7edf372979fb.tar.zst | |
feat: Add PDF screenshot generation and display (#995)
* Updated pdf2json to 3.1.5
* Extract and store a screenshot from PDF files using pdf2pic
* Installing graphicsmagick and ghostscript
* Generate missing PDF screenshots with the tidyAssets worker for backward compatibility
* Display PDF screenshot instead of the PDF in web if it exists.
* Display PDF screenshot in mobile app if it exists.
* Updated pnpm-lock.yaml
* Removed console.log
* Revert the unnecessary changes in package.json
* Revert pnpm-lock changes
* Prevent rendering PDF files if the screenshot is not generated
* refactor: replace useEffect with useMemo for section initialization
* feat: show PDF file download button and handle large PDFs by defaulting to screenshot view
* feat: add file size to openapi spec
* feature: Add Assets preprocessing in fix mode to admin actions
* i18n: add reprocess_assets_fix_mode translation
* i18n: Add missing ar translations
* A bunch of fixes
* Fix OpenAPI spec schema
---------
Co-authored-by: Mohamed Bassem <me@mbassem.com>
Diffstat (limited to 'apps')
23 files changed, 389 insertions, 94 deletions
diff --git a/apps/mobile/components/bookmarks/BookmarkCard.tsx b/apps/mobile/components/bookmarks/BookmarkCard.tsx index ce294a6f..3cbd064e 100644 --- a/apps/mobile/components/bookmarks/BookmarkCard.tsx +++ b/apps/mobile/components/bookmarks/BookmarkCard.tsx @@ -1,3 +1,4 @@ +import React from "react"; import { ActivityIndicator, Alert, @@ -300,11 +301,15 @@ function AssetCard({ } const title = bookmark.title ?? bookmark.content.fileName; + const assetImage = + bookmark.assets.find((r) => r.assetType == "assetScreenshot")?.id ?? + bookmark.content.assetId; + return ( <View className="flex gap-2"> <Pressable onPress={onOpenBookmark}> <BookmarkAssetImage - assetId={bookmark.content.assetId} + assetId={assetImage} className="h-56 min-h-56 w-full object-cover" /> </Pressable> diff --git a/apps/web/components/admin/AdminActions.tsx b/apps/web/components/admin/AdminActions.tsx index 34b3d63a..fb151ac8 100644 --- a/apps/web/components/admin/AdminActions.tsx +++ b/apps/web/components/admin/AdminActions.tsx @@ -37,6 +37,21 @@ export default function AdminActions() { }, }); + const { mutate: reprocessAssetsFixMode, isPending: isReprocessingPending } = + api.admin.reprocessAssetsFixMode.useMutation({ + onSuccess: () => { + toast({ + description: "Reprocessing enqueued", + }); + }, + onError: (e) => { + toast({ + variant: "destructive", + description: e.message, + }); + }, + }); + const { mutate: reRunInferenceOnAllBookmarks, isPending: isInferencePending, @@ -126,6 +141,13 @@ export default function AdminActions() { </ActionButton> <ActionButton variant="destructive" + loading={isReprocessingPending} + onClick={() => reprocessAssetsFixMode()} + > + {t("admin.actions.reprocess_assets_fix_mode")} + </ActionButton> + <ActionButton + variant="destructive" loading={isTidyAssetsPending} onClick={() => tidyAssets()} > diff --git a/apps/web/components/dashboard/bookmarks/AssetCard.tsx b/apps/web/components/dashboard/bookmarks/AssetCard.tsx index 61b3bc8d..0cb75b3f 100644 --- 
a/apps/web/components/dashboard/bookmarks/AssetCard.tsx +++ b/apps/web/components/dashboard/bookmarks/AssetCard.tsx @@ -2,6 +2,8 @@ import Image from "next/image"; import Link from "next/link"; +import { cn } from "@/lib/utils"; +import { FileText } from "lucide-react"; import type { ZBookmarkTypeAsset } from "@hoarder/shared/types/bookmarks"; import { getAssetUrl } from "@hoarder/shared-react/utils/assetUtils"; @@ -32,12 +34,28 @@ function AssetImage({ ); } case "pdf": { + const screenshotAssetId = bookmark.assets.find( + (r) => r.assetType === "assetScreenshot", + )?.id; + if (!screenshotAssetId) { + return ( + <div + className={cn(className, "flex items-center justify-center")} + title="PDF screenshot not available. Run asset preprocessing job to generate one screenshot" + > + <FileText size={80} /> + </div> + ); + } return ( - <iframe - title={bookmarkedAsset.assetId} - className={className} - src={getAssetUrl(bookmarkedAsset.assetId)} - /> + <Link href={`/dashboard/preview/${bookmark.id}`}> + <Image + alt="asset" + src={getAssetUrl(screenshotAssetId)} + fill={true} + className={className} + /> + </Link> ); } default: { diff --git a/apps/web/components/dashboard/preview/AssetContentSection.tsx b/apps/web/components/dashboard/preview/AssetContentSection.tsx index 03ab8a43..8590d2ad 100644 --- a/apps/web/components/dashboard/preview/AssetContentSection.tsx +++ b/apps/web/components/dashboard/preview/AssetContentSection.tsx @@ -1,42 +1,117 @@ +import { useMemo, useState } from "react"; import Image from "next/image"; import Link from "next/link"; +import { + Select, + SelectContent, + SelectGroup, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { useTranslation } from "@/lib/i18n/client"; +import { getAssetUrl } from "@hoarder/shared-react/utils/assetUtils"; import { BookmarkTypes, ZBookmark } from "@hoarder/shared/types/bookmarks"; -export function AssetContentSection({ bookmark }: { bookmark: ZBookmark }) { +// 20 MB 
+const BIG_FILE_SIZE = 20 * 1024 * 1024; + +function PDFContentSection({ bookmark }: { bookmark: ZBookmark }) { if (bookmark.content.type != BookmarkTypes.ASSET) { throw new Error("Invalid content type"); } + const { t } = useTranslation(); - switch (bookmark.content.assetType) { - case "image": { - return ( - <div className="relative h-full min-w-full"> - <Link - href={`/api/assets/${bookmark.content.assetId}`} - target="_blank" - > - <Image - alt="asset" - fill={true} - className="object-contain" - src={`/api/assets/${bookmark.content.assetId}`} - /> - </Link> - </div> - ); + const initialSection = useMemo(() => { + if (bookmark.content.type != BookmarkTypes.ASSET) { + throw new Error("Invalid content type"); } - case "pdf": { - return ( - <iframe - title={bookmark.content.assetId} - className="h-full w-full" - src={`/api/assets/${bookmark.content.assetId}`} - /> - ); + + const screenshot = bookmark.assets.find( + (item) => item.assetType === "assetScreenshot", + ); + const bigSize = + bookmark.content.size && bookmark.content.size > BIG_FILE_SIZE; + if (bigSize && screenshot) { + return "screenshot"; } - default: { + return "pdf"; + }, [bookmark]); + const [section, setSection] = useState(initialSection); + + const screenshot = bookmark.assets.find( + (r) => r.assetType === "assetScreenshot", + )?.id; + + const content = + section === "screenshot" && screenshot ? 
( + <div className="relative h-full min-w-full"> + <Image + alt="screenshot" + src={getAssetUrl(screenshot)} + fill={true} + className="object-contain" + /> + </div> + ) : ( + <iframe + title={bookmark.content.assetId} + className="h-full w-full" + src={getAssetUrl(bookmark.content.assetId)} + /> + ); + + return ( + <div className="flex h-full flex-col items-center gap-2"> + <div className="flex w-full items-center justify-center gap-4"> + <Select onValueChange={setSection} value={section}> + <SelectTrigger className="w-fit"> + <SelectValue /> + </SelectTrigger> + <SelectContent> + <SelectGroup> + <SelectItem value="screenshot" disabled={!screenshot}> + {t("common.screenshot")} + </SelectItem> + <SelectItem value="pdf">PDF</SelectItem> + </SelectGroup> + </SelectContent> + </Select> + </div> + {content} + </div> + ); +} + +function ImageContentSection({ bookmark }: { bookmark: ZBookmark }) { + if (bookmark.content.type != BookmarkTypes.ASSET) { + throw new Error("Invalid content type"); + } + return ( + <div className="relative h-full min-w-full"> + <Link href={getAssetUrl(bookmark.content.assetId)} target="_blank"> + <Image + alt="asset" + fill={true} + className="object-contain" + src={getAssetUrl(bookmark.content.assetId)} + /> + </Link> + </div> + ); +} + +export function AssetContentSection({ bookmark }: { bookmark: ZBookmark }) { + if (bookmark.content.type != BookmarkTypes.ASSET) { + throw new Error("Invalid content type"); + } + switch (bookmark.content.assetType) { + case "image": + return <ImageContentSection bookmark={bookmark} />; + case "pdf": + return <PDFContentSection bookmark={bookmark} />; + default: return <div>Unsupported asset type</div>; - } } } diff --git a/apps/web/components/dashboard/preview/AttachmentBox.tsx b/apps/web/components/dashboard/preview/AttachmentBox.tsx index 6547ae51..32939cb0 100644 --- a/apps/web/components/dashboard/preview/AttachmentBox.tsx +++ b/apps/web/components/dashboard/preview/AttachmentBox.tsx @@ -45,6 +45,7 @@ 
export default function AttachmentBox({ bookmark }: { bookmark: ZBookmark }) { const { t } = useTranslation(); const typeToIcon: Record<ZAssetType, React.ReactNode> = { screenshot: <Camera className="size-4" />, + assetScreenshot: <Camera className="size-4" />, fullPageArchive: <Archive className="size-4" />, precrawledArchive: <Archive className="size-4" />, bannerImage: <Image className="size-4" />, diff --git a/apps/web/lib/i18n/locales/ar/translation.json b/apps/web/lib/i18n/locales/ar/translation.json index 7bd0bcad..e9239e70 100644 --- a/apps/web/lib/i18n/locales/ar/translation.json +++ b/apps/web/lib/i18n/locales/ar/translation.json @@ -153,7 +153,7 @@ } }, "admin": { - "admin_settings": "إعدادات المدير", + "admin_settings": "إعدادات المشرف", "server_stats": { "server_stats": "إحصائيات الخادم", "total_users": "إجمالي المستخدمين", @@ -161,15 +161,36 @@ "server_version": "إصدار الخادم" }, "background_jobs": { - "background_jobs": "المهام التلقائية", + "background_jobs": "المهام الخلفية", "crawler_jobs": "مهام الاستكشاف", "indexing_jobs": "مهام الفهرسة", - "inference_jobs": "مهام التحليل الذكي", - "tidy_assets_jobs": "مهام تنظيم الملفات", + "inference_jobs": "مهام الاستدلال", + "tidy_assets_jobs": "مهام تنظيم الوسائط", "job": "مهمة", "queued": "في قائمة الانتظار", - "pending": "معلق", - "failed": "فشل" + "pending": "قيد الانتظار", + "failed": "فشلت" + }, + "actions": { + "recrawl_failed_links_only": "إعادة استكشاف الروابط الفاشلة فقط", + "recrawl_all_links": "إعادة استكشاف جميع الروابط", + "without_inference": "بدون استدلال", + "regenerate_ai_tags_for_failed_bookmarks_only": "إعادة إنشاء علامات الذكاء الاصطناعي للإشارات المرجعية الفاشلة فقط", + "regenerate_ai_tags_for_all_bookmarks": "إعادة إنشاء علامات الذكاء الاصطناعي لجميع الإشارات المرجعية", + "reindex_all_bookmarks": "إعادة فهرسة جميع الإشارات المرجعية", + "compact_assets": "ضغط الوسائط", + "reprocess_assets_fix_mode": "إعادة معالجة الوسائط (وضع الإصلاح)" + }, + "users_list": { + "users_list": "قائمة 
المستخدمين", + "create_user": "إنشاء مستخدم", + "change_role": "تغيير الدور", + "reset_password": "إعادة تعيين كلمة المرور", + "delete_user": "حذف المستخدم", + "num_bookmarks": "عدد الإشارات المرجعية", + "asset_sizes": "أحجام الوسائط", + "local_user": "مستخدم محلي", + "confirm_password": "تأكيد كلمة المرور" } }, "options": { diff --git a/apps/web/lib/i18n/locales/da/translation.json b/apps/web/lib/i18n/locales/da/translation.json index 3822d5c6..4fe69650 100644 --- a/apps/web/lib/i18n/locales/da/translation.json +++ b/apps/web/lib/i18n/locales/da/translation.json @@ -94,7 +94,8 @@ "recrawl_all_links": "Gennemsøg alle links", "without_inference": "Uden inferens", "regenerate_ai_tags_for_all_bookmarks": "Genopret AI-tags for alle bogmærker", - "reindex_all_bookmarks": "Genindeksér alle bogmærker" + "reindex_all_bookmarks": "Genindeksér alle bogmærker", + "reprocess_assets_fix_mode": "Genbehandling af aktiver (Fix Mode)" }, "background_jobs": { "inference_jobs": "Inferensopgaver", diff --git a/apps/web/lib/i18n/locales/de/translation.json b/apps/web/lib/i18n/locales/de/translation.json index c20a2273..ccebf1f1 100644 --- a/apps/web/lib/i18n/locales/de/translation.json +++ b/apps/web/lib/i18n/locales/de/translation.json @@ -175,7 +175,8 @@ "regenerate_ai_tags_for_failed_bookmarks_only": "KI-Tags nur für fehlgeschlagene Lesezeichen neu generieren", "regenerate_ai_tags_for_all_bookmarks": "KI-Tags für alle Lesezeichen neu generieren", "reindex_all_bookmarks": "Alle Lesezeichen neu indizieren", - "compact_assets": "Assets komprimieren" + "compact_assets": "Assets komprimieren", + "reprocess_assets_fix_mode": "Assets neu verarbeiten (Fix-Modus)" }, "users_list": { "users_list": "Benutzerliste", diff --git a/apps/web/lib/i18n/locales/en/translation.json b/apps/web/lib/i18n/locales/en/translation.json index 2e80f2f4..81ef942f 100644 --- a/apps/web/lib/i18n/locales/en/translation.json +++ b/apps/web/lib/i18n/locales/en/translation.json @@ -178,7 +178,8 @@ 
"regenerate_ai_tags_for_failed_bookmarks_only": "Regenerate AI Tags for Failed Bookmarks Only", "regenerate_ai_tags_for_all_bookmarks": "Regenerate AI Tags for All Bookmarks", "reindex_all_bookmarks": "Reindex All Bookmarks", - "compact_assets": "Compact Assets" + "compact_assets": "Compact Assets", + "reprocess_assets_fix_mode": "Reprocess Assets (Fix Mode)" }, "users_list": { "users_list": "Users List", diff --git a/apps/web/lib/i18n/locales/es/translation.json b/apps/web/lib/i18n/locales/es/translation.json index 40c6cb01..3a1a7e3c 100644 --- a/apps/web/lib/i18n/locales/es/translation.json +++ b/apps/web/lib/i18n/locales/es/translation.json @@ -146,7 +146,8 @@ "compact_assets": "Optimizar multimedia", "without_inference": "Sin inferencia", "recrawl_failed_links_only": "Recrawlear solo los enlaces fallidos", - "recrawl_all_links": "Recrawlear todos los enlaces" + "recrawl_all_links": "Recrawlear todos los enlaces", + "reprocess_assets_fix_mode": "Reprocesar assets (modo fijo)" }, "users_list": { "users_list": "Lista de usuarios", diff --git a/apps/web/lib/i18n/locales/fr/translation.json b/apps/web/lib/i18n/locales/fr/translation.json index b7834a7b..1772c2ff 100644 --- a/apps/web/lib/i18n/locales/fr/translation.json +++ b/apps/web/lib/i18n/locales/fr/translation.json @@ -146,7 +146,8 @@ "regenerate_ai_tags_for_failed_bookmarks_only": "Régénérer les tags AI uniquement pour les favoris échoués", "regenerate_ai_tags_for_all_bookmarks": "Régénérer les tags AI pour tous les favoris", "reindex_all_bookmarks": "Réindexer tous les favoris", - "compact_assets": "Compacter les assets" + "compact_assets": "Compacter les assets", + "reprocess_assets_fix_mode": "Reprocesser les assets (mode fix)" }, "users_list": { "users_list": "Liste des utilisateurs", diff --git a/apps/web/lib/i18n/locales/gl/translation.json b/apps/web/lib/i18n/locales/gl/translation.json index eb65ca64..363ffac8 100644 --- a/apps/web/lib/i18n/locales/gl/translation.json +++ 
b/apps/web/lib/i18n/locales/gl/translation.json @@ -178,7 +178,8 @@ "regenerate_ai_tags_for_failed_bookmarks_only": "Rexenerar etiquetas IA so en marcadores errados", "regenerate_ai_tags_for_all_bookmarks": "Rexenerar etiquetas IA para todos os marcadores", "reindex_all_bookmarks": "Reindexar marcadores", - "compact_assets": "Optimizar multimedia" + "compact_assets": "Optimizar multimedia", + "reprocess_assets_fix_mode": "Reprocesar assets (modo fixo)" }, "users_list": { "users_list": "Listado de usuarios", diff --git a/apps/web/lib/i18n/locales/hr/translation.json b/apps/web/lib/i18n/locales/hr/translation.json index 6e250924..7a72d295 100644 --- a/apps/web/lib/i18n/locales/hr/translation.json +++ b/apps/web/lib/i18n/locales/hr/translation.json @@ -36,7 +36,8 @@ "recrawl_all_links": "Ponovno pregledavanje svih veza", "regenerate_ai_tags_for_all_bookmarks": "Ponovno generiranje AI oznaka za sve oznake", "without_inference": "Bez zaključivanja", - "compact_assets": "Kompaktiranje resursa" + "compact_assets": "Kompaktiranje resursa", + "reprocess_assets_fix_mode": "Ponovno postupanje s resursima (fiksni mod)" } }, "layouts": { diff --git a/apps/web/lib/i18n/locales/hu/translation.json b/apps/web/lib/i18n/locales/hu/translation.json index 38ef96b4..439212f4 100644 --- a/apps/web/lib/i18n/locales/hu/translation.json +++ b/apps/web/lib/i18n/locales/hu/translation.json @@ -258,7 +258,8 @@ "regenerate_ai_tags_for_all_bookmarks": "Minden könyvjelző MI címkéjének lecserélése", "regenerate_ai_tags_for_failed_bookmarks_only": "Hibás könyvjelzők MI címkéjének lecserélése", "reindex_all_bookmarks": "Minden könyvjelző újraindexelése", - "compact_assets": "Kompakt tulajdonok" + "compact_assets": "Kompakt tulajdonok", + "reprocess_assets_fix_mode": "Tulajdonok függvényezése (Fix Mod)" }, "users_list": { "asset_sizes": "Tulajdon méretek", diff --git a/apps/web/lib/i18n/locales/it/translation.json b/apps/web/lib/i18n/locales/it/translation.json index e24b6b7f..4b093b72 100644 --- 
a/apps/web/lib/i18n/locales/it/translation.json +++ b/apps/web/lib/i18n/locales/it/translation.json @@ -201,7 +201,8 @@ "regenerate_ai_tags_for_failed_bookmarks_only": "Rigenera tag AI solo per i segnalibri falliti", "regenerate_ai_tags_for_all_bookmarks": "Rigenera tag AI per tutti i segnalibri", "compact_assets": "Compatta asset", - "reindex_all_bookmarks": "Reindicizza tutti i segnalibri" + "reindex_all_bookmarks": "Reindicizza tutti i segnalibri", + "reprocess_assets_fix_mode": "Riprocessa asset (modalità fissa)" }, "users_list": { "users_list": "Lista utenti", diff --git a/apps/web/lib/i18n/locales/pl/translation.json b/apps/web/lib/i18n/locales/pl/translation.json index 0d026542..66921560 100644 --- a/apps/web/lib/i18n/locales/pl/translation.json +++ b/apps/web/lib/i18n/locales/pl/translation.json @@ -148,7 +148,8 @@ "regenerate_ai_tags_for_failed_bookmarks_only": "Regeneruj tagi AI tylko dla nieudanych zakładek", "regenerate_ai_tags_for_all_bookmarks": "Regeneruj tagi AI dla wszystkich zakładek", "reindex_all_bookmarks": "Ponowne indeksowanie wszystkich zakładek", - "compact_assets": "Kompaktuj zasoby" + "compact_assets": "Kompaktuj zasoby", + "reprocess_assets_fix_mode": "Ponowne przetwarzanie zasobów (tryb fiksny)" } }, "tags": { diff --git a/apps/web/lib/i18n/locales/ru/translation.json b/apps/web/lib/i18n/locales/ru/translation.json index 4a8cdd52..1d4c50bd 100644 --- a/apps/web/lib/i18n/locales/ru/translation.json +++ b/apps/web/lib/i18n/locales/ru/translation.json @@ -211,7 +211,8 @@ "compact_assets": "Сжать ресурсы", "regenerate_ai_tags_for_failed_bookmarks_only": "Перегенерировать ИИ метки только для неудачных закладок", "reindex_all_bookmarks": "Переиндексировать все закладки", - "recrawl_all_links": "Пересканировать все ссылки" + "recrawl_all_links": "Пересканировать все ссылки", + "reprocess_assets_fix_mode": "Перепроцессировать ресурсы (фиксный режим)" }, "admin_settings": "Настройки администратора" }, diff --git 
a/apps/web/lib/i18n/locales/tr/translation.json b/apps/web/lib/i18n/locales/tr/translation.json index 9840c6f0..227f6dac 100644 --- a/apps/web/lib/i18n/locales/tr/translation.json +++ b/apps/web/lib/i18n/locales/tr/translation.json @@ -148,7 +148,8 @@ "regenerate_ai_tags_for_failed_bookmarks_only": "Yalnızca Başarısız Yer İşaretleri için Yapay Zeka Etiketlerini Yeniden Oluştur", "regenerate_ai_tags_for_all_bookmarks": "Tüm Yer İşaretleri için Yapay Zeka Etiketlerini Yeniden Oluştur", "reindex_all_bookmarks": "Tüm Yer İşaretlerini Yeniden Dizine Al", - "compact_assets": "Varlıkları Sıkıştır" + "compact_assets": "Varlıkları Sıkıştır", + "reprocess_assets_fix_mode": "Varlıkları Yeniden İşle (Fix Mod)" }, "users_list": { "users_list": "Kullanıcı Listesi", diff --git a/apps/web/lib/i18n/locales/zh/translation.json b/apps/web/lib/i18n/locales/zh/translation.json index 84a9e17a..d798b716 100644 --- a/apps/web/lib/i18n/locales/zh/translation.json +++ b/apps/web/lib/i18n/locales/zh/translation.json @@ -175,7 +175,8 @@ "regenerate_ai_tags_for_failed_bookmarks_only": "仅为失败书签重新生成AI标签", "regenerate_ai_tags_for_all_bookmarks": "为所有书签重新生成AI标签", "reindex_all_bookmarks": "重新索引所有书签", - "compact_assets": "压缩资产" + "compact_assets": "压缩资产", + "reprocess_assets_fix_mode": "重新处理资产(固定模式)" }, "users_list": { "users_list": "用户列表", diff --git a/apps/web/lib/i18n/locales/zhtw/translation.json b/apps/web/lib/i18n/locales/zhtw/translation.json index aada5492..284b5de2 100644 --- a/apps/web/lib/i18n/locales/zhtw/translation.json +++ b/apps/web/lib/i18n/locales/zhtw/translation.json @@ -156,7 +156,8 @@ "regenerate_ai_tags_for_failed_bookmarks_only": "僅重新產生失敗書籤的 AI 標籤", "regenerate_ai_tags_for_all_bookmarks": "重新產生所有書籤的 AI 標籤", "reindex_all_bookmarks": "重新索引所有書籤", - "compact_assets": "壓縮資源" + "compact_assets": "壓縮資源", + "reprocess_assets_fix_mode": "重新處理資源(固定模式)" }, "users_list": { "users_list": "使用者清單", diff --git a/apps/workers/assetPreprocessingWorker.ts 
b/apps/workers/assetPreprocessingWorker.ts index 5c4937e5..f94eeb9e 100644 --- a/apps/workers/assetPreprocessingWorker.ts +++ b/apps/workers/assetPreprocessingWorker.ts @@ -2,12 +2,18 @@ import os from "os"; import { eq } from "drizzle-orm"; import { DequeuedJob, Runner } from "liteque"; import PDFParser from "pdf2json"; +import { fromBuffer } from "pdf2pic"; import { createWorker } from "tesseract.js"; import type { AssetPreprocessingRequest } from "@hoarder/shared/queues"; import { db } from "@hoarder/db"; -import { bookmarkAssets, bookmarks } from "@hoarder/db/schema"; -import { readAsset } from "@hoarder/shared/assetdb"; +import { + assets, + AssetTypes, + bookmarkAssets, + bookmarks, +} from "@hoarder/db/schema"; +import { newAssetId, readAsset, saveAsset } from "@hoarder/shared/assetdb"; import serverConfig from "@hoarder/shared/config"; import logger from "@hoarder/shared/logger"; import { @@ -67,17 +73,14 @@ async function readImageText(buffer: Buffer) { async function readPDFText(buffer: Buffer): Promise<{ text: string; - metadata: Record<string, string>; + metadata: Record<string, object>; }> { return new Promise((resolve, reject) => { - // Need raw text flag represents as number (1), reference : https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265 - const pdfParser = new PDFParser(null, 1); + const pdfParser = new PDFParser(null, true); pdfParser.on("pdfParser_dataError", reject); pdfParser.on("pdfParser_dataReady", (pdfData) => { resolve({ - // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327 - // eslint-disable-next-line - text: (pdfParser as any).getRawTextContent(), + text: pdfParser.getRawTextContent(), metadata: pdfData.Meta, }); }); @@ -85,11 +88,102 @@ async function readPDFText(buffer: Buffer): Promise<{ }); } -async function preprocessImage( +export async function extractAndSavePDFScreenshot( jobId: string, asset: Buffer, -): Promise<{ content: string; metadata: string | null } | 
undefined> { + bookmark: NonNullable<Awaited<ReturnType<typeof getBookmark>>>, + isFixMode: boolean, +): Promise<boolean> { + { + const alreadyHasScreenshot = + bookmark.assets.find( + (r) => r.assetType === AssetTypes.ASSET_SCREENSHOT, + ) !== undefined; + if (alreadyHasScreenshot && isFixMode) { + logger.info( + `[assetPreprocessing][${jobId}] Skipping PDF screenshot generation as it's already been generated.`, + ); + return false; + } + } + logger.info( + `[assetPreprocessing][${jobId}] Attempting to generate PDF screenshot for bookmarkId: ${bookmark.id}`, + ); + try { + /** + * If you encountered any issues with this library, make sure you have ghostscript and graphicsmagick installed following this URL + * https://github.com/yakovmeister/pdf2image/blob/HEAD/docs/gm-installation.md + */ + const screenshot = await fromBuffer(asset, { + density: 100, + quality: 100, + format: "png", + preserveAspectRatio: true, + })(1, { responseType: "buffer" }); + + if (!screenshot.buffer) { + logger.error( + `[assetPreprocessing][${jobId}] Failed to generate PDF screenshot`, + ); + return false; + } + + // Store the screenshot + const assetId = newAssetId(); + const fileName = "screenshot.png"; + const contentType = "image/png"; + await saveAsset({ + userId: bookmark.userId, + assetId, + asset: screenshot.buffer, + metadata: { + contentType, + fileName, + }, + }); + + // Insert into database + await db.insert(assets).values({ + id: assetId, + bookmarkId: bookmark.id, + userId: bookmark.userId, + assetType: AssetTypes.ASSET_SCREENSHOT, + contentType, + size: screenshot.buffer.byteLength, + fileName, + }); + + logger.info( + `[assetPreprocessing][${jobId}] Successfully saved PDF screenshot to database`, + ); + return true; + } catch (error) { + logger.error( + `[assetPreprocessing][${jobId}] Failed to process PDF screenshot: ${error}`, + ); + return false; + } +} + +async function extractAndSaveImageText( + jobId: string, + asset: Buffer, + bookmark: 
NonNullable<Awaited<ReturnType<typeof getBookmark>>>, + isFixMode: boolean, +): Promise<boolean> { + { + const alreadyHasText = !!bookmark.asset.content; + if (alreadyHasText && isFixMode) { + logger.info( + `[assetPreprocessing][${jobId}] Skipping image text extraction as it's already been extracted.`, + ); + return false; + } + } let imageText = null; + logger.info( + `[assetPreprocessing][${jobId}] Attempting to extract text from image.`, + ); try { imageText = await readImageText(asset); } catch (e) { @@ -98,19 +192,40 @@ async function preprocessImage( ); } if (!imageText) { - return undefined; + return false; } logger.info( `[assetPreprocessing][${jobId}] Extracted ${imageText.length} characters from image.`, ); - return { content: imageText, metadata: null }; + await db + .update(bookmarkAssets) + .set({ + content: imageText, + metadata: null, + }) + .where(eq(bookmarkAssets.id, bookmark.id)); + return true; } -async function preProcessPDF( +async function extractAndSavePDFText( jobId: string, asset: Buffer, -): Promise<{ content: string; metadata: string | null } | undefined> { + bookmark: NonNullable<Awaited<ReturnType<typeof getBookmark>>>, + isFixMode: boolean, +): Promise<boolean> { + { + const alreadyHasText = !!bookmark.asset.content; + if (alreadyHasText && isFixMode) { + logger.info( + `[assetPreprocessing][${jobId}] Skipping PDF text extraction as it's already been extracted.`, + ); + return false; + } + } + logger.info( + `[assetPreprocessing][${jobId}] Attempting to extract text from pdf.`, + ); const pdfParse = await readPDFText(asset); if (!pdfParse?.text) { throw new Error( @@ -120,13 +235,28 @@ async function preProcessPDF( logger.info( `[assetPreprocessing][${jobId}] Extracted ${pdfParse.text.length} characters from pdf.`, ); - return { - content: pdfParse.text, - metadata: pdfParse.metadata ? JSON.stringify(pdfParse.metadata) : null, - }; + await db + .update(bookmarkAssets) + .set({ + content: pdfParse.text, + metadata: pdfParse.metadata ? 
JSON.stringify(pdfParse.metadata) : null, + }) + .where(eq(bookmarkAssets.id, bookmark.id)); + return true; +} + +async function getBookmark(bookmarkId: string) { + return db.query.bookmarks.findFirst({ + where: eq(bookmarks.id, bookmarkId), + with: { + asset: true, + assets: true, + }, + }); } async function run(req: DequeuedJob<AssetPreprocessingRequest>) { + const isFixMode = req.data.fixMode; const jobId = req.id; const bookmarkId = req.data.bookmarkId; @@ -134,6 +264,7 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) { where: eq(bookmarks.id, bookmarkId), with: { asset: true, + assets: true, }, }); @@ -162,15 +293,29 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) { ); } - let result: { content: string; metadata: string | null } | undefined = - undefined; - + let anythingChanged = false; switch (bookmark.asset.assetType) { case "image": - result = await preprocessImage(jobId, asset); + anythingChanged ||= await extractAndSaveImageText( + jobId, + asset, + bookmark, + isFixMode, + ); break; case "pdf": - result = await preProcessPDF(jobId, asset); + anythingChanged ||= await extractAndSavePDFText( + jobId, + asset, + bookmark, + isFixMode, + ); + anythingChanged ||= await extractAndSavePDFScreenshot( + jobId, + asset, + bookmark, + isFixMode, + ); break; default: throw new Error( @@ -178,20 +323,12 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) { ); } - if (result) { - await db - .update(bookmarkAssets) - .set({ - content: result.content, - metadata: result.metadata, - }) - .where(eq(bookmarkAssets.id, bookmarkId)); - } - - await OpenAIQueue.enqueue({ - bookmarkId, - }); + if (anythingChanged) { + await OpenAIQueue.enqueue({ + bookmarkId, + }); - // Update the search index - await triggerSearchReindex(bookmarkId); + // Update the search index + await triggerSearchReindex(bookmarkId); + } } diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts index 7611494e..17dba443 100644 --- 
a/apps/workers/crawlerWorker.ts +++ b/apps/workers/crawlerWorker.ts @@ -592,6 +592,7 @@ async function handleAsAssetBookmark( }); await AssetPreprocessingQueue.enqueue({ bookmarkId, + fixMode: false, }); } diff --git a/apps/workers/package.json b/apps/workers/package.json index ebcae757..122c7cb1 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -30,7 +30,8 @@ "metascraper-url": "^5.45.22", "node-cron": "^3.0.3", "node-fetch": "^3.3.2", - "pdf2json": "^3.0.5", + "pdf2json": "^3.1.5", + "pdf2pic": "^3.1.3", "pdfjs-dist": "^4.0.379", "puppeteer": "^22.0.0", "puppeteer-extra": "^3.3.6", @@ -65,4 +66,4 @@ ] }, "prettier": "@hoarder/prettier-config" -} +}
\ No newline at end of file |
