aboutsummaryrefslogtreecommitdiffstats
path: root/apps
diff options
context:
space:
mode:
authorAhmad Mujahid <55625580+AhmadMuj@users.noreply.github.com>2025-02-17 13:25:16 +0400
committerGitHub <noreply@github.com>2025-02-17 09:25:16 +0000
commite5cb9aa848009ea22c1385e4d33b7edf372979fb (patch)
tree89470d8da8aab10f30bbfccea8d1b0cea08a1408 /apps
parenta14be108736133535e2828b6bbdc8d0a69accd63 (diff)
downloadkarakeep-e5cb9aa848009ea22c1385e4d33b7edf372979fb.tar.zst
feat: Add PDF screenshot generation and display (#995)
* Updated pdf2json to 3.1.5 * Extract and store a screenshot from PDF files using pdf2pic * Installing graphicsmagick and ghostscript * Generate Missing PDF screenshot with tidyAssets worker for backward support * Display PDF screenshot instead of the PDF in web if it exists. * Display PDF screenshot in mobile app if exists. * Updated pnpm-lock.yaml * Removed console.log * Revert the unnecessary changes in package.json * Revert pnpm-lock changes * Prevent rendering PDF files if the screenshot is not generated * refactor: replace useEffect with useMemo for section initialization * feat: show PDF file download button and handle large PDFs by defaulting to screenshot view * feat: add file size to openapi spec * feature: Add Assets preprocessing in fix mode to admin actions * i18n: add reprocess_assets_fix_mode translation * i18n: Add missing ar translations * A bunch of fixes * Fix openspec schema --------- Co-authored-by: Mohamed Bassem <me@mbassem.com>
Diffstat (limited to 'apps')
-rw-r--r--apps/mobile/components/bookmarks/BookmarkCard.tsx7
-rw-r--r--apps/web/components/admin/AdminActions.tsx22
-rw-r--r--apps/web/components/dashboard/bookmarks/AssetCard.tsx28
-rw-r--r--apps/web/components/dashboard/preview/AssetContentSection.tsx131
-rw-r--r--apps/web/components/dashboard/preview/AttachmentBox.tsx1
-rw-r--r--apps/web/lib/i18n/locales/ar/translation.json33
-rw-r--r--apps/web/lib/i18n/locales/da/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/de/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/en/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/es/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/fr/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/gl/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/hr/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/hu/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/it/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/pl/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/ru/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/tr/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/zh/translation.json3
-rw-r--r--apps/web/lib/i18n/locales/zhtw/translation.json3
-rw-r--r--apps/workers/assetPreprocessingWorker.ts213
-rw-r--r--apps/workers/crawlerWorker.ts1
-rw-r--r--apps/workers/package.json5
23 files changed, 389 insertions, 94 deletions
diff --git a/apps/mobile/components/bookmarks/BookmarkCard.tsx b/apps/mobile/components/bookmarks/BookmarkCard.tsx
index ce294a6f..3cbd064e 100644
--- a/apps/mobile/components/bookmarks/BookmarkCard.tsx
+++ b/apps/mobile/components/bookmarks/BookmarkCard.tsx
@@ -1,3 +1,4 @@
+import React from "react";
import {
ActivityIndicator,
Alert,
@@ -300,11 +301,15 @@ function AssetCard({
}
const title = bookmark.title ?? bookmark.content.fileName;
+ const assetImage =
+ bookmark.assets.find((r) => r.assetType == "assetScreenshot")?.id ??
+ bookmark.content.assetId;
+
return (
<View className="flex gap-2">
<Pressable onPress={onOpenBookmark}>
<BookmarkAssetImage
- assetId={bookmark.content.assetId}
+ assetId={assetImage}
className="h-56 min-h-56 w-full object-cover"
/>
</Pressable>
diff --git a/apps/web/components/admin/AdminActions.tsx b/apps/web/components/admin/AdminActions.tsx
index 34b3d63a..fb151ac8 100644
--- a/apps/web/components/admin/AdminActions.tsx
+++ b/apps/web/components/admin/AdminActions.tsx
@@ -37,6 +37,21 @@ export default function AdminActions() {
},
});
+ const { mutate: reprocessAssetsFixMode, isPending: isReprocessingPending } =
+ api.admin.reprocessAssetsFixMode.useMutation({
+ onSuccess: () => {
+ toast({
+ description: "Reprocessing enqueued",
+ });
+ },
+ onError: (e) => {
+ toast({
+ variant: "destructive",
+ description: e.message,
+ });
+ },
+ });
+
const {
mutate: reRunInferenceOnAllBookmarks,
isPending: isInferencePending,
@@ -126,6 +141,13 @@ export default function AdminActions() {
</ActionButton>
<ActionButton
variant="destructive"
+ loading={isReprocessingPending}
+ onClick={() => reprocessAssetsFixMode()}
+ >
+ {t("admin.actions.reprocess_assets_fix_mode")}
+ </ActionButton>
+ <ActionButton
+ variant="destructive"
loading={isTidyAssetsPending}
onClick={() => tidyAssets()}
>
diff --git a/apps/web/components/dashboard/bookmarks/AssetCard.tsx b/apps/web/components/dashboard/bookmarks/AssetCard.tsx
index 61b3bc8d..0cb75b3f 100644
--- a/apps/web/components/dashboard/bookmarks/AssetCard.tsx
+++ b/apps/web/components/dashboard/bookmarks/AssetCard.tsx
@@ -2,6 +2,8 @@
import Image from "next/image";
import Link from "next/link";
+import { cn } from "@/lib/utils";
+import { FileText } from "lucide-react";
import type { ZBookmarkTypeAsset } from "@hoarder/shared/types/bookmarks";
import { getAssetUrl } from "@hoarder/shared-react/utils/assetUtils";
@@ -32,12 +34,28 @@ function AssetImage({
);
}
case "pdf": {
+ const screenshotAssetId = bookmark.assets.find(
+ (r) => r.assetType === "assetScreenshot",
+ )?.id;
+ if (!screenshotAssetId) {
+ return (
+ <div
+ className={cn(className, "flex items-center justify-center")}
+ title="PDF screenshot not available. Run asset preprocessing job to generate one screenshot"
+ >
+ <FileText size={80} />
+ </div>
+ );
+ }
return (
- <iframe
- title={bookmarkedAsset.assetId}
- className={className}
- src={getAssetUrl(bookmarkedAsset.assetId)}
- />
+ <Link href={`/dashboard/preview/${bookmark.id}`}>
+ <Image
+ alt="asset"
+ src={getAssetUrl(screenshotAssetId)}
+ fill={true}
+ className={className}
+ />
+ </Link>
);
}
default: {
diff --git a/apps/web/components/dashboard/preview/AssetContentSection.tsx b/apps/web/components/dashboard/preview/AssetContentSection.tsx
index 03ab8a43..8590d2ad 100644
--- a/apps/web/components/dashboard/preview/AssetContentSection.tsx
+++ b/apps/web/components/dashboard/preview/AssetContentSection.tsx
@@ -1,42 +1,117 @@
+import { useMemo, useState } from "react";
import Image from "next/image";
import Link from "next/link";
+import {
+ Select,
+ SelectContent,
+ SelectGroup,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from "@/components/ui/select";
+import { useTranslation } from "@/lib/i18n/client";
+import { getAssetUrl } from "@hoarder/shared-react/utils/assetUtils";
import { BookmarkTypes, ZBookmark } from "@hoarder/shared/types/bookmarks";
-export function AssetContentSection({ bookmark }: { bookmark: ZBookmark }) {
+// 20 MB
+const BIG_FILE_SIZE = 20 * 1024 * 1024;
+
+function PDFContentSection({ bookmark }: { bookmark: ZBookmark }) {
if (bookmark.content.type != BookmarkTypes.ASSET) {
throw new Error("Invalid content type");
}
+ const { t } = useTranslation();
- switch (bookmark.content.assetType) {
- case "image": {
- return (
- <div className="relative h-full min-w-full">
- <Link
- href={`/api/assets/${bookmark.content.assetId}`}
- target="_blank"
- >
- <Image
- alt="asset"
- fill={true}
- className="object-contain"
- src={`/api/assets/${bookmark.content.assetId}`}
- />
- </Link>
- </div>
- );
+ const initialSection = useMemo(() => {
+ if (bookmark.content.type != BookmarkTypes.ASSET) {
+ throw new Error("Invalid content type");
}
- case "pdf": {
- return (
- <iframe
- title={bookmark.content.assetId}
- className="h-full w-full"
- src={`/api/assets/${bookmark.content.assetId}`}
- />
- );
+
+ const screenshot = bookmark.assets.find(
+ (item) => item.assetType === "assetScreenshot",
+ );
+ const bigSize =
+ bookmark.content.size && bookmark.content.size > BIG_FILE_SIZE;
+ if (bigSize && screenshot) {
+ return "screenshot";
}
- default: {
+ return "pdf";
+ }, [bookmark]);
+ const [section, setSection] = useState(initialSection);
+
+ const screenshot = bookmark.assets.find(
+ (r) => r.assetType === "assetScreenshot",
+ )?.id;
+
+ const content =
+ section === "screenshot" && screenshot ? (
+ <div className="relative h-full min-w-full">
+ <Image
+ alt="screenshot"
+ src={getAssetUrl(screenshot)}
+ fill={true}
+ className="object-contain"
+ />
+ </div>
+ ) : (
+ <iframe
+ title={bookmark.content.assetId}
+ className="h-full w-full"
+ src={getAssetUrl(bookmark.content.assetId)}
+ />
+ );
+
+ return (
+ <div className="flex h-full flex-col items-center gap-2">
+ <div className="flex w-full items-center justify-center gap-4">
+ <Select onValueChange={setSection} value={section}>
+ <SelectTrigger className="w-fit">
+ <SelectValue />
+ </SelectTrigger>
+ <SelectContent>
+ <SelectGroup>
+ <SelectItem value="screenshot" disabled={!screenshot}>
+ {t("common.screenshot")}
+ </SelectItem>
+ <SelectItem value="pdf">PDF</SelectItem>
+ </SelectGroup>
+ </SelectContent>
+ </Select>
+ </div>
+ {content}
+ </div>
+ );
+}
+
+function ImageContentSection({ bookmark }: { bookmark: ZBookmark }) {
+ if (bookmark.content.type != BookmarkTypes.ASSET) {
+ throw new Error("Invalid content type");
+ }
+ return (
+ <div className="relative h-full min-w-full">
+ <Link href={getAssetUrl(bookmark.content.assetId)} target="_blank">
+ <Image
+ alt="asset"
+ fill={true}
+ className="object-contain"
+ src={getAssetUrl(bookmark.content.assetId)}
+ />
+ </Link>
+ </div>
+ );
+}
+
+export function AssetContentSection({ bookmark }: { bookmark: ZBookmark }) {
+ if (bookmark.content.type != BookmarkTypes.ASSET) {
+ throw new Error("Invalid content type");
+ }
+ switch (bookmark.content.assetType) {
+ case "image":
+ return <ImageContentSection bookmark={bookmark} />;
+ case "pdf":
+ return <PDFContentSection bookmark={bookmark} />;
+ default:
return <div>Unsupported asset type</div>;
- }
}
}
diff --git a/apps/web/components/dashboard/preview/AttachmentBox.tsx b/apps/web/components/dashboard/preview/AttachmentBox.tsx
index 6547ae51..32939cb0 100644
--- a/apps/web/components/dashboard/preview/AttachmentBox.tsx
+++ b/apps/web/components/dashboard/preview/AttachmentBox.tsx
@@ -45,6 +45,7 @@ export default function AttachmentBox({ bookmark }: { bookmark: ZBookmark }) {
const { t } = useTranslation();
const typeToIcon: Record<ZAssetType, React.ReactNode> = {
screenshot: <Camera className="size-4" />,
+ assetScreenshot: <Camera className="size-4" />,
fullPageArchive: <Archive className="size-4" />,
precrawledArchive: <Archive className="size-4" />,
bannerImage: <Image className="size-4" />,
diff --git a/apps/web/lib/i18n/locales/ar/translation.json b/apps/web/lib/i18n/locales/ar/translation.json
index 7bd0bcad..e9239e70 100644
--- a/apps/web/lib/i18n/locales/ar/translation.json
+++ b/apps/web/lib/i18n/locales/ar/translation.json
@@ -153,7 +153,7 @@
}
},
"admin": {
- "admin_settings": "إعدادات المدير",
+ "admin_settings": "إعدادات المشرف",
"server_stats": {
"server_stats": "إحصائيات الخادم",
"total_users": "إجمالي المستخدمين",
@@ -161,15 +161,36 @@
"server_version": "إصدار الخادم"
},
"background_jobs": {
- "background_jobs": "المهام التلقائية",
+ "background_jobs": "المهام الخلفية",
"crawler_jobs": "مهام الاستكشاف",
"indexing_jobs": "مهام الفهرسة",
- "inference_jobs": "مهام التحليل الذكي",
- "tidy_assets_jobs": "مهام تنظيم الملفات",
+ "inference_jobs": "مهام الاستدلال",
+ "tidy_assets_jobs": "مهام تنظيم الوسائط",
"job": "مهمة",
"queued": "في قائمة الانتظار",
- "pending": "معلق",
- "failed": "فشل"
+ "pending": "قيد الانتظار",
+ "failed": "فشلت"
+ },
+ "actions": {
+ "recrawl_failed_links_only": "إعادة استكشاف الروابط الفاشلة فقط",
+ "recrawl_all_links": "إعادة استكشاف جميع الروابط",
+ "without_inference": "بدون استدلال",
+ "regenerate_ai_tags_for_failed_bookmarks_only": "إعادة إنشاء علامات الذكاء الاصطناعي للإشارات المرجعية الفاشلة فقط",
+ "regenerate_ai_tags_for_all_bookmarks": "إعادة إنشاء علامات الذكاء الاصطناعي لجميع الإشارات المرجعية",
+ "reindex_all_bookmarks": "إعادة فهرسة جميع الإشارات المرجعية",
+ "compact_assets": "ضغط الوسائط",
+ "reprocess_assets_fix_mode": "إعادة معالجة الوسائط (وضع الإصلاح)"
+ },
+ "users_list": {
+ "users_list": "قائمة المستخدمين",
+ "create_user": "إنشاء مستخدم",
+ "change_role": "تغيير الدور",
+ "reset_password": "إعادة تعيين كلمة المرور",
+ "delete_user": "حذف المستخدم",
+ "num_bookmarks": "عدد الإشارات المرجعية",
+ "asset_sizes": "أحجام الوسائط",
+ "local_user": "مستخدم محلي",
+ "confirm_password": "تأكيد كلمة المرور"
}
},
"options": {
diff --git a/apps/web/lib/i18n/locales/da/translation.json b/apps/web/lib/i18n/locales/da/translation.json
index 3822d5c6..4fe69650 100644
--- a/apps/web/lib/i18n/locales/da/translation.json
+++ b/apps/web/lib/i18n/locales/da/translation.json
@@ -94,7 +94,8 @@
"recrawl_all_links": "Gennemsøg alle links",
"without_inference": "Uden inferens",
"regenerate_ai_tags_for_all_bookmarks": "Genopret AI-tags for alle bogmærker",
- "reindex_all_bookmarks": "Genindeksér alle bogmærker"
+ "reindex_all_bookmarks": "Genindeksér alle bogmærker",
+ "reprocess_assets_fix_mode": "Genbehandling af aktiver (Fix Mode)"
},
"background_jobs": {
"inference_jobs": "Inferensopgaver",
diff --git a/apps/web/lib/i18n/locales/de/translation.json b/apps/web/lib/i18n/locales/de/translation.json
index c20a2273..ccebf1f1 100644
--- a/apps/web/lib/i18n/locales/de/translation.json
+++ b/apps/web/lib/i18n/locales/de/translation.json
@@ -175,7 +175,8 @@
"regenerate_ai_tags_for_failed_bookmarks_only": "KI-Tags nur für fehlgeschlagene Lesezeichen neu generieren",
"regenerate_ai_tags_for_all_bookmarks": "KI-Tags für alle Lesezeichen neu generieren",
"reindex_all_bookmarks": "Alle Lesezeichen neu indizieren",
- "compact_assets": "Assets komprimieren"
+ "compact_assets": "Assets komprimieren",
+ "reprocess_assets_fix_mode": "Assets neu verarbeiten (Fix-Modus)"
},
"users_list": {
"users_list": "Benutzerliste",
diff --git a/apps/web/lib/i18n/locales/en/translation.json b/apps/web/lib/i18n/locales/en/translation.json
index 2e80f2f4..81ef942f 100644
--- a/apps/web/lib/i18n/locales/en/translation.json
+++ b/apps/web/lib/i18n/locales/en/translation.json
@@ -178,7 +178,8 @@
"regenerate_ai_tags_for_failed_bookmarks_only": "Regenerate AI Tags for Failed Bookmarks Only",
"regenerate_ai_tags_for_all_bookmarks": "Regenerate AI Tags for All Bookmarks",
"reindex_all_bookmarks": "Reindex All Bookmarks",
- "compact_assets": "Compact Assets"
+ "compact_assets": "Compact Assets",
+ "reprocess_assets_fix_mode": "Reprocess Assets (Fix Mode)"
},
"users_list": {
"users_list": "Users List",
diff --git a/apps/web/lib/i18n/locales/es/translation.json b/apps/web/lib/i18n/locales/es/translation.json
index 40c6cb01..3a1a7e3c 100644
--- a/apps/web/lib/i18n/locales/es/translation.json
+++ b/apps/web/lib/i18n/locales/es/translation.json
@@ -146,7 +146,8 @@
"compact_assets": "Optimizar multimedia",
"without_inference": "Sin inferencia",
"recrawl_failed_links_only": "Recrawlear solo los enlaces fallidos",
- "recrawl_all_links": "Recrawlear todos los enlaces"
+ "recrawl_all_links": "Recrawlear todos los enlaces",
+ "reprocess_assets_fix_mode": "Reprocesar assets (modo fijo)"
},
"users_list": {
"users_list": "Lista de usuarios",
diff --git a/apps/web/lib/i18n/locales/fr/translation.json b/apps/web/lib/i18n/locales/fr/translation.json
index b7834a7b..1772c2ff 100644
--- a/apps/web/lib/i18n/locales/fr/translation.json
+++ b/apps/web/lib/i18n/locales/fr/translation.json
@@ -146,7 +146,8 @@
"regenerate_ai_tags_for_failed_bookmarks_only": "Régénérer les tags AI uniquement pour les favoris échoués",
"regenerate_ai_tags_for_all_bookmarks": "Régénérer les tags AI pour tous les favoris",
"reindex_all_bookmarks": "Réindexer tous les favoris",
- "compact_assets": "Compacter les assets"
+ "compact_assets": "Compacter les assets",
+ "reprocess_assets_fix_mode": "Reprocesser les assets (mode fix)"
},
"users_list": {
"users_list": "Liste des utilisateurs",
diff --git a/apps/web/lib/i18n/locales/gl/translation.json b/apps/web/lib/i18n/locales/gl/translation.json
index eb65ca64..363ffac8 100644
--- a/apps/web/lib/i18n/locales/gl/translation.json
+++ b/apps/web/lib/i18n/locales/gl/translation.json
@@ -178,7 +178,8 @@
"regenerate_ai_tags_for_failed_bookmarks_only": "Rexenerar etiquetas IA so en marcadores errados",
"regenerate_ai_tags_for_all_bookmarks": "Rexenerar etiquetas IA para todos os marcadores",
"reindex_all_bookmarks": "Reindexar marcadores",
- "compact_assets": "Optimizar multimedia"
+ "compact_assets": "Optimizar multimedia",
+ "reprocess_assets_fix_mode": "Reprocesar assets (modo fixo)"
},
"users_list": {
"users_list": "Listado de usuarios",
diff --git a/apps/web/lib/i18n/locales/hr/translation.json b/apps/web/lib/i18n/locales/hr/translation.json
index 6e250924..7a72d295 100644
--- a/apps/web/lib/i18n/locales/hr/translation.json
+++ b/apps/web/lib/i18n/locales/hr/translation.json
@@ -36,7 +36,8 @@
"recrawl_all_links": "Ponovno pregledavanje svih veza",
"regenerate_ai_tags_for_all_bookmarks": "Ponovno generiranje AI oznaka za sve oznake",
"without_inference": "Bez zaključivanja",
- "compact_assets": "Kompaktiranje resursa"
+ "compact_assets": "Kompaktiranje resursa",
+ "reprocess_assets_fix_mode": "Ponovno postupanje s resursima (fiksni mod)"
}
},
"layouts": {
diff --git a/apps/web/lib/i18n/locales/hu/translation.json b/apps/web/lib/i18n/locales/hu/translation.json
index 38ef96b4..439212f4 100644
--- a/apps/web/lib/i18n/locales/hu/translation.json
+++ b/apps/web/lib/i18n/locales/hu/translation.json
@@ -258,7 +258,8 @@
"regenerate_ai_tags_for_all_bookmarks": "Minden könyvjelző MI címkéjének lecserélése",
"regenerate_ai_tags_for_failed_bookmarks_only": "Hibás könyvjelzők MI címkéjének lecserélése",
"reindex_all_bookmarks": "Minden könyvjelző újraindexelése",
- "compact_assets": "Kompakt tulajdonok"
+ "compact_assets": "Kompakt tulajdonok",
+ "reprocess_assets_fix_mode": "Tulajdonok függvényezése (Fix Mod)"
},
"users_list": {
"asset_sizes": "Tulajdon méretek",
diff --git a/apps/web/lib/i18n/locales/it/translation.json b/apps/web/lib/i18n/locales/it/translation.json
index e24b6b7f..4b093b72 100644
--- a/apps/web/lib/i18n/locales/it/translation.json
+++ b/apps/web/lib/i18n/locales/it/translation.json
@@ -201,7 +201,8 @@
"regenerate_ai_tags_for_failed_bookmarks_only": "Rigenera tag AI solo per i segnalibri falliti",
"regenerate_ai_tags_for_all_bookmarks": "Rigenera tag AI per tutti i segnalibri",
"compact_assets": "Compatta asset",
- "reindex_all_bookmarks": "Reindicizza tutti i segnalibri"
+ "reindex_all_bookmarks": "Reindicizza tutti i segnalibri",
+ "reprocess_assets_fix_mode": "Riprocessa asset (modalità fissa)"
},
"users_list": {
"users_list": "Lista utenti",
diff --git a/apps/web/lib/i18n/locales/pl/translation.json b/apps/web/lib/i18n/locales/pl/translation.json
index 0d026542..66921560 100644
--- a/apps/web/lib/i18n/locales/pl/translation.json
+++ b/apps/web/lib/i18n/locales/pl/translation.json
@@ -148,7 +148,8 @@
"regenerate_ai_tags_for_failed_bookmarks_only": "Regeneruj tagi AI tylko dla nieudanych zakładek",
"regenerate_ai_tags_for_all_bookmarks": "Regeneruj tagi AI dla wszystkich zakładek",
"reindex_all_bookmarks": "Ponowne indeksowanie wszystkich zakładek",
- "compact_assets": "Kompaktuj zasoby"
+ "compact_assets": "Kompaktuj zasoby",
+ "reprocess_assets_fix_mode": "Ponowne przetwarzanie zasobów (tryb fiksny)"
}
},
"tags": {
diff --git a/apps/web/lib/i18n/locales/ru/translation.json b/apps/web/lib/i18n/locales/ru/translation.json
index 4a8cdd52..1d4c50bd 100644
--- a/apps/web/lib/i18n/locales/ru/translation.json
+++ b/apps/web/lib/i18n/locales/ru/translation.json
@@ -211,7 +211,8 @@
"compact_assets": "Сжать ресурсы",
"regenerate_ai_tags_for_failed_bookmarks_only": "Перегенерировать ИИ метки только для неудачных закладок",
"reindex_all_bookmarks": "Переиндексировать все закладки",
- "recrawl_all_links": "Пересканировать все ссылки"
+ "recrawl_all_links": "Пересканировать все ссылки",
+ "reprocess_assets_fix_mode": "Перепроцессировать ресурсы (фиксный режим)"
},
"admin_settings": "Настройки администратора"
},
diff --git a/apps/web/lib/i18n/locales/tr/translation.json b/apps/web/lib/i18n/locales/tr/translation.json
index 9840c6f0..227f6dac 100644
--- a/apps/web/lib/i18n/locales/tr/translation.json
+++ b/apps/web/lib/i18n/locales/tr/translation.json
@@ -148,7 +148,8 @@
"regenerate_ai_tags_for_failed_bookmarks_only": "Yalnızca Başarısız Yer İşaretleri için Yapay Zeka Etiketlerini Yeniden Oluştur",
"regenerate_ai_tags_for_all_bookmarks": "Tüm Yer İşaretleri için Yapay Zeka Etiketlerini Yeniden Oluştur",
"reindex_all_bookmarks": "Tüm Yer İşaretlerini Yeniden Dizine Al",
- "compact_assets": "Varlıkları Sıkıştır"
+ "compact_assets": "Varlıkları Sıkıştır",
+ "reprocess_assets_fix_mode": "Varlıkları Yeniden İşle (Fix Mod)"
},
"users_list": {
"users_list": "Kullanıcı Listesi",
diff --git a/apps/web/lib/i18n/locales/zh/translation.json b/apps/web/lib/i18n/locales/zh/translation.json
index 84a9e17a..d798b716 100644
--- a/apps/web/lib/i18n/locales/zh/translation.json
+++ b/apps/web/lib/i18n/locales/zh/translation.json
@@ -175,7 +175,8 @@
"regenerate_ai_tags_for_failed_bookmarks_only": "仅为失败书签重新生成AI标签",
"regenerate_ai_tags_for_all_bookmarks": "为所有书签重新生成AI标签",
"reindex_all_bookmarks": "重新索引所有书签",
- "compact_assets": "压缩资产"
+ "compact_assets": "压缩资产",
+ "reprocess_assets_fix_mode": "重新处理资产(固定模式)"
},
"users_list": {
"users_list": "用户列表",
diff --git a/apps/web/lib/i18n/locales/zhtw/translation.json b/apps/web/lib/i18n/locales/zhtw/translation.json
index aada5492..284b5de2 100644
--- a/apps/web/lib/i18n/locales/zhtw/translation.json
+++ b/apps/web/lib/i18n/locales/zhtw/translation.json
@@ -156,7 +156,8 @@
"regenerate_ai_tags_for_failed_bookmarks_only": "僅重新產生失敗書籤的 AI 標籤",
"regenerate_ai_tags_for_all_bookmarks": "重新產生所有書籤的 AI 標籤",
"reindex_all_bookmarks": "重新索引所有書籤",
- "compact_assets": "壓縮資源"
+ "compact_assets": "壓縮資源",
+ "reprocess_assets_fix_mode": "重新處理資源(固定模式)"
},
"users_list": {
"users_list": "使用者清單",
diff --git a/apps/workers/assetPreprocessingWorker.ts b/apps/workers/assetPreprocessingWorker.ts
index 5c4937e5..f94eeb9e 100644
--- a/apps/workers/assetPreprocessingWorker.ts
+++ b/apps/workers/assetPreprocessingWorker.ts
@@ -2,12 +2,18 @@ import os from "os";
import { eq } from "drizzle-orm";
import { DequeuedJob, Runner } from "liteque";
import PDFParser from "pdf2json";
+import { fromBuffer } from "pdf2pic";
import { createWorker } from "tesseract.js";
import type { AssetPreprocessingRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
-import { bookmarkAssets, bookmarks } from "@hoarder/db/schema";
-import { readAsset } from "@hoarder/shared/assetdb";
+import {
+ assets,
+ AssetTypes,
+ bookmarkAssets,
+ bookmarks,
+} from "@hoarder/db/schema";
+import { newAssetId, readAsset, saveAsset } from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
import logger from "@hoarder/shared/logger";
import {
@@ -67,17 +73,14 @@ async function readImageText(buffer: Buffer) {
async function readPDFText(buffer: Buffer): Promise<{
text: string;
- metadata: Record<string, string>;
+ metadata: Record<string, object>;
}> {
return new Promise((resolve, reject) => {
- // Need raw text flag represents as number (1), reference : https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265
- const pdfParser = new PDFParser(null, 1);
+ const pdfParser = new PDFParser(null, true);
pdfParser.on("pdfParser_dataError", reject);
pdfParser.on("pdfParser_dataReady", (pdfData) => {
resolve({
- // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327
- // eslint-disable-next-line
- text: (pdfParser as any).getRawTextContent(),
+ text: pdfParser.getRawTextContent(),
metadata: pdfData.Meta,
});
});
@@ -85,11 +88,102 @@ async function readPDFText(buffer: Buffer): Promise<{
});
}
-async function preprocessImage(
+export async function extractAndSavePDFScreenshot(
jobId: string,
asset: Buffer,
-): Promise<{ content: string; metadata: string | null } | undefined> {
+ bookmark: NonNullable<Awaited<ReturnType<typeof getBookmark>>>,
+ isFixMode: boolean,
+): Promise<boolean> {
+ {
+ const alreadyHasScreenshot =
+ bookmark.assets.find(
+ (r) => r.assetType === AssetTypes.ASSET_SCREENSHOT,
+ ) !== undefined;
+ if (alreadyHasScreenshot && isFixMode) {
+ logger.info(
+ `[assetPreprocessing][${jobId}] Skipping PDF screenshot generation as it's already been generated.`,
+ );
+ return false;
+ }
+ }
+ logger.info(
+ `[assetPreprocessing][${jobId}] Attempting to generate PDF screenshot for bookmarkId: ${bookmark.id}`,
+ );
+ try {
+ /**
+ * If you encountered any issues with this library, make sure you have ghostscript and graphicsmagick installed following this URL
+ * https://github.com/yakovmeister/pdf2image/blob/HEAD/docs/gm-installation.md
+ */
+ const screenshot = await fromBuffer(asset, {
+ density: 100,
+ quality: 100,
+ format: "png",
+ preserveAspectRatio: true,
+ })(1, { responseType: "buffer" });
+
+ if (!screenshot.buffer) {
+ logger.error(
+ `[assetPreprocessing][${jobId}] Failed to generate PDF screenshot`,
+ );
+ return false;
+ }
+
+ // Store the screenshot
+ const assetId = newAssetId();
+ const fileName = "screenshot.png";
+ const contentType = "image/png";
+ await saveAsset({
+ userId: bookmark.userId,
+ assetId,
+ asset: screenshot.buffer,
+ metadata: {
+ contentType,
+ fileName,
+ },
+ });
+
+ // Insert into database
+ await db.insert(assets).values({
+ id: assetId,
+ bookmarkId: bookmark.id,
+ userId: bookmark.userId,
+ assetType: AssetTypes.ASSET_SCREENSHOT,
+ contentType,
+ size: screenshot.buffer.byteLength,
+ fileName,
+ });
+
+ logger.info(
+ `[assetPreprocessing][${jobId}] Successfully saved PDF screenshot to database`,
+ );
+ return true;
+ } catch (error) {
+ logger.error(
+ `[assetPreprocessing][${jobId}] Failed to process PDF screenshot: ${error}`,
+ );
+ return false;
+ }
+}
+
+async function extractAndSaveImageText(
+ jobId: string,
+ asset: Buffer,
+ bookmark: NonNullable<Awaited<ReturnType<typeof getBookmark>>>,
+ isFixMode: boolean,
+): Promise<boolean> {
+ {
+ const alreadyHasText = !!bookmark.asset.content;
+ if (alreadyHasText && isFixMode) {
+ logger.info(
+ `[assetPreprocessing][${jobId}] Skipping image text extraction as it's already been extracted.`,
+ );
+ return false;
+ }
+ }
let imageText = null;
+ logger.info(
+ `[assetPreprocessing][${jobId}] Attempting to extract text from image.`,
+ );
try {
imageText = await readImageText(asset);
} catch (e) {
@@ -98,19 +192,40 @@ async function preprocessImage(
);
}
if (!imageText) {
- return undefined;
+ return false;
}
logger.info(
`[assetPreprocessing][${jobId}] Extracted ${imageText.length} characters from image.`,
);
- return { content: imageText, metadata: null };
+ await db
+ .update(bookmarkAssets)
+ .set({
+ content: imageText,
+ metadata: null,
+ })
+ .where(eq(bookmarkAssets.id, bookmark.id));
+ return true;
}
-async function preProcessPDF(
+async function extractAndSavePDFText(
jobId: string,
asset: Buffer,
-): Promise<{ content: string; metadata: string | null } | undefined> {
+ bookmark: NonNullable<Awaited<ReturnType<typeof getBookmark>>>,
+ isFixMode: boolean,
+): Promise<boolean> {
+ {
+ const alreadyHasText = !!bookmark.asset.content;
+ if (alreadyHasText && isFixMode) {
+ logger.info(
+ `[assetPreprocessing][${jobId}] Skipping PDF text extraction as it's already been extracted.`,
+ );
+ return false;
+ }
+ }
+ logger.info(
+ `[assetPreprocessing][${jobId}] Attempting to extract text from pdf.`,
+ );
const pdfParse = await readPDFText(asset);
if (!pdfParse?.text) {
throw new Error(
@@ -120,13 +235,28 @@ async function preProcessPDF(
logger.info(
`[assetPreprocessing][${jobId}] Extracted ${pdfParse.text.length} characters from pdf.`,
);
- return {
- content: pdfParse.text,
- metadata: pdfParse.metadata ? JSON.stringify(pdfParse.metadata) : null,
- };
+ await db
+ .update(bookmarkAssets)
+ .set({
+ content: pdfParse.text,
+ metadata: pdfParse.metadata ? JSON.stringify(pdfParse.metadata) : null,
+ })
+ .where(eq(bookmarkAssets.id, bookmark.id));
+ return true;
+}
+
+async function getBookmark(bookmarkId: string) {
+ return db.query.bookmarks.findFirst({
+ where: eq(bookmarks.id, bookmarkId),
+ with: {
+ asset: true,
+ assets: true,
+ },
+ });
}
async function run(req: DequeuedJob<AssetPreprocessingRequest>) {
+ const isFixMode = req.data.fixMode;
const jobId = req.id;
const bookmarkId = req.data.bookmarkId;
@@ -134,6 +264,7 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) {
where: eq(bookmarks.id, bookmarkId),
with: {
asset: true,
+ assets: true,
},
});
@@ -162,15 +293,29 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) {
);
}
- let result: { content: string; metadata: string | null } | undefined =
- undefined;
-
+ let anythingChanged = false;
switch (bookmark.asset.assetType) {
case "image":
- result = await preprocessImage(jobId, asset);
+ anythingChanged ||= await extractAndSaveImageText(
+ jobId,
+ asset,
+ bookmark,
+ isFixMode,
+ );
break;
case "pdf":
- result = await preProcessPDF(jobId, asset);
+ anythingChanged ||= await extractAndSavePDFText(
+ jobId,
+ asset,
+ bookmark,
+ isFixMode,
+ );
+ anythingChanged = (await extractAndSavePDFScreenshot(
+ jobId,
+ asset,
+ bookmark,
+ isFixMode,
+ )) || anythingChanged; // call unconditionally: `||=` would short-circuit and skip the screenshot once text extraction returned true
break;
default:
throw new Error(
@@ -178,20 +323,12 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) {
);
}
- if (result) {
- await db
- .update(bookmarkAssets)
- .set({
- content: result.content,
- metadata: result.metadata,
- })
- .where(eq(bookmarkAssets.id, bookmarkId));
- }
-
- await OpenAIQueue.enqueue({
- bookmarkId,
- });
+ if (anythingChanged) {
+ await OpenAIQueue.enqueue({
+ bookmarkId,
+ });
- // Update the search index
- await triggerSearchReindex(bookmarkId);
+ // Update the search index
+ await triggerSearchReindex(bookmarkId);
+ }
}
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 7611494e..17dba443 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -592,6 +592,7 @@ async function handleAsAssetBookmark(
});
await AssetPreprocessingQueue.enqueue({
bookmarkId,
+ fixMode: false,
});
}
diff --git a/apps/workers/package.json b/apps/workers/package.json
index ebcae757..122c7cb1 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -30,7 +30,8 @@
"metascraper-url": "^5.45.22",
"node-cron": "^3.0.3",
"node-fetch": "^3.3.2",
- "pdf2json": "^3.0.5",
+ "pdf2json": "^3.1.5",
+ "pdf2pic": "^3.1.3",
"pdfjs-dist": "^4.0.379",
"puppeteer": "^22.0.0",
"puppeteer-extra": "^3.3.6",
@@ -65,4 +66,4 @@
]
},
"prettier": "@hoarder/prettier-config"
-}
+} \ No newline at end of file