diff options
| author | Mohamed Bassem <me@mbassem.com> | 2025-12-27 11:59:39 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-12-27 09:59:39 +0000 |
| commit | 267db791290f4f539d7bda113992e3d1690b0e8b (patch) | |
| tree | 0144ea00dcf6a49bdaaf46511cd074651aeeee5a /apps | |
| parent | bb6b742a040a70478d276529774bde67b8f93648 (diff) | |
| download | karakeep-267db791290f4f539d7bda113992e3d1690b0e8b.tar.zst | |
feat: support archiving as pdf (#2309)
* feat: support archiving as pdf
* add support for manually triggering pdf downloads
* fix submenu
* menu cleanup
* fix store pdf
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx | 155 | ||||
| -rw-r--r-- | apps/web/components/dashboard/preview/LinkContentSection.tsx | 19 | ||||
| -rw-r--r-- | apps/web/lib/attachments.tsx | 2 | ||||
| -rw-r--r-- | apps/web/lib/i18n/locales/en/translation.json | 4 | ||||
| -rw-r--r-- | apps/web/lib/i18n/locales/en_US/translation.json | 4 | ||||
| -rw-r--r-- | apps/workers/workerUtils.ts | 2 | ||||
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 111 |
7 files changed, 264 insertions, 33 deletions
diff --git a/apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx b/apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx index 66de6156..eb746efc 100644 --- a/apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx +++ b/apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx @@ -6,13 +6,18 @@ import { DropdownMenu, DropdownMenuContent, DropdownMenuItem, + DropdownMenuSub, + DropdownMenuSubContent, + DropdownMenuSubTrigger, DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; import { useToast } from "@/components/ui/use-toast"; import { useClientConfig } from "@/lib/clientConfig"; import { useTranslation } from "@/lib/i18n/client"; import { + Archive, FileDown, + FileText, Link, List, ListX, @@ -43,6 +48,30 @@ import { EditBookmarkDialog } from "./EditBookmarkDialog"; import { ArchivedActionIcon, FavouritedActionIcon } from "./icons"; import { useManageListsModal } from "./ManageListsModal"; +interface ActionItem { + id: string; + title: string; + icon: React.ReactNode; + visible: boolean; + disabled: boolean; + className?: string; + onClick: () => void; +} + +interface SubsectionItem { + id: string; + title: string; + icon: React.ReactNode; + visible: boolean; + items: ActionItem[]; +} + +type ActionItemType = ActionItem | SubsectionItem; + +function isSubsectionItem(item: ActionItemType): item is SubsectionItem { + return "items" in item; +} + export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) { const { t } = useTranslation(); const { toast } = useToast(); @@ -110,6 +139,15 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) { onError, }); + const preservePdfMutator = useRecrawlBookmark({ + onSuccess: () => { + toast({ + description: t("toasts.bookmarks.preserve_pdf"), + }); + }, + onError, + }); + const removeFromListMutator = useRemoveBookmarkFromList({ onSuccess: () => { toast({ @@ -120,7 +158,7 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark 
}) { }); // Define action items array - const actionItems = [ + const actionItems: ActionItemType[] = [ { id: "edit", title: t("actions.edit"), @@ -174,19 +212,6 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) { }), }, { - id: "download-full-page", - title: t("actions.download_full_page_archive"), - icon: <FileDown className="mr-2 size-4" />, - visible: isOwner && bookmark.content.type === BookmarkTypes.LINK, - disabled: false, - onClick: () => { - fullPageArchiveBookmarkMutator.mutate({ - bookmarkId: bookmark.id, - archiveFullPage: true, - }); - }, - }, - { id: "copy-link", title: t("actions.copy_link"), icon: <Link className="mr-2 size-4" />, @@ -213,14 +238,15 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) { id: "remove-from-list", title: t("actions.remove_from_list"), icon: <ListX className="mr-2 size-4" />, - visible: + visible: Boolean( (isOwner || (withinListContext && (withinListContext.userRole === "editor" || withinListContext.userRole === "owner"))) && - !!listId && - !!withinListContext && - withinListContext.type === "manual", + !!listId && + !!withinListContext && + withinListContext.type === "manual", + ), disabled: demoMode, onClick: () => removeFromListMutator.mutate({ @@ -237,6 +263,40 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) { onClick: () => crawlBookmarkMutator.mutate({ bookmarkId: bookmark.id }), }, { + id: "offline-copies", + title: t("actions.offline_copies"), + icon: <Archive className="mr-2 size-4" />, + visible: isOwner && bookmark.content.type === BookmarkTypes.LINK, + items: [ + { + id: "download-full-page", + title: t("actions.download_full_page_archive"), + icon: <FileDown className="mr-2 size-4" />, + visible: true, + disabled: demoMode, + onClick: () => { + fullPageArchiveBookmarkMutator.mutate({ + bookmarkId: bookmark.id, + archiveFullPage: true, + }); + }, + }, + { + id: "preserve-pdf", + title: 
t("actions.preserve_as_pdf"), + icon: <FileText className="mr-2 size-4" />, + visible: true, + disabled: demoMode, + onClick: () => { + preservePdfMutator.mutate({ + bookmarkId: bookmark.id, + storePdf: true, + }); + }, + }, + ], + }, + { id: "delete", title: t("actions.delete"), icon: <Trash2 className="mr-2 size-4" />, @@ -248,7 +308,12 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) { ]; // Filter visible items - const visibleItems = actionItems.filter((item) => item.visible); + const visibleItems: ActionItemType[] = actionItems.filter((item) => { + if (isSubsectionItem(item)) { + return item.visible && item.items.some((subItem) => subItem.visible); + } + return item.visible; + }); // If no items are visible, don't render the dropdown if (visibleItems.length === 0) { @@ -283,17 +348,47 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) { </Button> </DropdownMenuTrigger> <DropdownMenuContent className="w-fit"> - {visibleItems.map((item) => ( - <DropdownMenuItem - key={item.id} - disabled={item.disabled} - className={item.className} - onClick={item.onClick} - > - {item.icon} - <span>{item.title}</span> - </DropdownMenuItem> - ))} + {visibleItems.map((item) => { + if (isSubsectionItem(item)) { + const visibleSubItems = item.items.filter( + (subItem) => subItem.visible, + ); + if (visibleSubItems.length === 0) { + return null; + } + return ( + <DropdownMenuSub key={item.id}> + <DropdownMenuSubTrigger> + {item.icon} + <span>{item.title}</span> + </DropdownMenuSubTrigger> + <DropdownMenuSubContent> + {visibleSubItems.map((subItem) => ( + <DropdownMenuItem + key={subItem.id} + disabled={subItem.disabled} + onClick={subItem.onClick} + > + {subItem.icon} + <span>{subItem.title}</span> + </DropdownMenuItem> + ))} + </DropdownMenuSubContent> + </DropdownMenuSub> + ); + } + return ( + <DropdownMenuItem + key={item.id} + disabled={item.disabled} + className={item.className} + onClick={item.onClick} + > + 
{item.icon} + <span>{item.title}</span> + </DropdownMenuItem> + ); + })} </DropdownMenuContent> </DropdownMenu> </> diff --git a/apps/web/components/dashboard/preview/LinkContentSection.tsx b/apps/web/components/dashboard/preview/LinkContentSection.tsx index bdf5faf1..5fb51784 100644 --- a/apps/web/components/dashboard/preview/LinkContentSection.tsx +++ b/apps/web/components/dashboard/preview/LinkContentSection.tsx @@ -24,6 +24,7 @@ import { BookOpen, Camera, ExpandIcon, + FileText, Info, Video, } from "lucide-react"; @@ -104,6 +105,16 @@ function VideoSection({ link }: { link: ZBookmarkedLink }) { ); } +function PDFSection({ link }: { link: ZBookmarkedLink }) { + return ( + <iframe + title="PDF Viewer" + src={`/api/assets/${link.pdfAssetId}`} + className="relative h-full min-w-full" + /> + ); +} + export default function LinkContentSection({ bookmark, }: { @@ -154,6 +165,8 @@ export default function LinkContentSection({ content = <FullPageArchiveSection link={bookmark.content} />; } else if (section === "video") { content = <VideoSection link={bookmark.content} />; + } else if (section === "pdf") { + content = <PDFSection link={bookmark.content} />; } else { content = <ScreenshotSection link={bookmark.content} />; } @@ -198,6 +211,12 @@ export default function LinkContentSection({ {t("common.screenshot")} </div> </SelectItem> + <SelectItem value="pdf" disabled={!bookmark.content.pdfAssetId}> + <div className="flex items-center"> + <FileText className="mr-2 h-4 w-4" /> + {t("common.pdf")} + </div> + </SelectItem> <SelectItem value="archive" disabled={ diff --git a/apps/web/lib/attachments.tsx b/apps/web/lib/attachments.tsx index 81b9f12d..5d7175ec 100644 --- a/apps/web/lib/attachments.tsx +++ b/apps/web/lib/attachments.tsx @@ -2,6 +2,7 @@ import { Archive, Camera, FileCode, + FileText, Image, Paperclip, SquareUser, @@ -13,6 +14,7 @@ import { ZAssetType } from "@karakeep/shared/types/bookmarks"; export const ASSET_TYPE_TO_ICON: Record<ZAssetType, React.ReactNode> = 
{ screenshot: <Camera className="size-4" />, + pdf: <FileText className="size-4" />, assetScreenshot: <Camera className="size-4" />, fullPageArchive: <Archive className="size-4" />, precrawledArchive: <Archive className="size-4" />, diff --git a/apps/web/lib/i18n/locales/en/translation.json b/apps/web/lib/i18n/locales/en/translation.json index a4b94e4b..9ec25732 100644 --- a/apps/web/lib/i18n/locales/en/translation.json +++ b/apps/web/lib/i18n/locales/en/translation.json @@ -26,6 +26,7 @@ "highlights": "Highlights", "source": "Source", "screenshot": "Screenshot", + "pdf": "Archived PDF", "video": "Video", "archive": "Archive", "home": "Home", @@ -70,7 +71,9 @@ "toggle_show_archived": "Show Archived", "refresh": "Refresh", "recrawl": "Recrawl", + "offline_copies": "Offline Copies", "download_full_page_archive": "Download Full Page Archive", + "preserve_as_pdf": "Preserve as PDF", "edit_tags": "Edit Tags", "edit_notes": "Edit Notes", "add_to_list": "Add to List", @@ -802,6 +805,7 @@ "deleted": "The bookmark has been deleted!", "refetch": "Re-fetch has been enqueued!", "full_page_archive": "Full Page Archive creation has been triggered", + "preserve_pdf": "PDF preservation has been triggered", "delete_from_list": "The bookmark has been deleted from the list", "clipboard_copied": "Link has been added to your clipboard!" 
}, diff --git a/apps/web/lib/i18n/locales/en_US/translation.json b/apps/web/lib/i18n/locales/en_US/translation.json index 6c3dd62b..2849f930 100644 --- a/apps/web/lib/i18n/locales/en_US/translation.json +++ b/apps/web/lib/i18n/locales/en_US/translation.json @@ -25,6 +25,7 @@ "admin": "Admin" }, "screenshot": "Screenshot", + "pdf": "Archived PDF", "video": "Video", "archive": "Archive", "home": "Home", @@ -62,7 +63,9 @@ "delete": "Delete", "refresh": "Refresh", "recrawl": "Recrawl", + "offline_copies": "Offline Copies", "download_full_page_archive": "Download Full Page Archive", + "preserve_as_pdf": "Preserve as PDF", "edit_tags": "Edit Tags", "add_to_list": "Add to List", "select_all": "Select All", @@ -781,6 +784,7 @@ "deleted": "The bookmark has been deleted!", "refetch": "Re-fetch has been enqueued!", "full_page_archive": "Full Page Archive creation has been triggered", + "preserve_pdf": "PDF preservation has been triggered", "delete_from_list": "The bookmark has been deleted from the list", "clipboard_copied": "Link has been added to your clipboard!", "updated": "The bookmark has been updated!" diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts index 3eaf5b4b..a99f2103 100644 --- a/apps/workers/workerUtils.ts +++ b/apps/workers/workerUtils.ts @@ -34,6 +34,8 @@ export async function getBookmarkDetails(bookmarkId: string) { screenshotAssetId: bookmark.assets.find(
(a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
)?.id,
+ pdfAssetId: bookmark.assets.find((a) => a.assetType == AssetTypes.LINK_PDF)
+ ?.id,
imageAssetId: bookmark.assets.find(
(a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
)?.id,
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index 3591474e..95c91002 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -418,6 +418,7 @@ async function browserlessCrawlPage( htmlContent: await response.text(), statusCode: response.status, screenshot: undefined, + pdf: undefined, url: response.url, }; } @@ -426,10 +427,12 @@ async function crawlPage( jobId: string, url: string, userId: string, + forceStorePdf: boolean, abortSignal: AbortSignal, ): Promise<{ htmlContent: string; screenshot: Buffer | undefined; + pdf: Buffer | undefined; statusCode: number; url: string; }> { @@ -608,10 +611,45 @@ async function crawlPage( } } + // Capture PDF if configured or explicitly requested + let pdf: Buffer | undefined = undefined; + if (serverConfig.crawler.storePdf || forceStorePdf) { + const { data: pdfData, error: pdfError } = await tryCatch( + Promise.race<Buffer>([ + page.pdf({ + format: "A4", + printBackground: true, + }), + new Promise((_, reject) => + setTimeout( + () => + reject( + "TIMED_OUT, consider increasing CRAWLER_SCREENSHOT_TIMEOUT_SEC", + ), + serverConfig.crawler.screenshotTimeoutSec * 1000, + ), + ), + abortPromise(abortSignal).then(() => Buffer.from("")), + ]), + ); + abortSignal.throwIfAborted(); + if (pdfError) { + logger.warn( + `[Crawler][${jobId}] Failed to capture the PDF. Reason: ${pdfError}`, + ); + } else { + logger.info( + `[Crawler][${jobId}] Finished capturing page content as PDF`, + ); + pdf = pdfData; + } + } + return { htmlContent, statusCode: response?.status() ?? 
0, screenshot, + pdf, url: page.url(), }; } finally { @@ -724,6 +762,44 @@ async function storeScreenshot( return { assetId, contentType, fileName, size: screenshot.byteLength }; } +async function storePdf( + pdf: Buffer | undefined, + userId: string, + jobId: string, +) { + if (!pdf) { + logger.info(`[Crawler][${jobId}] Skipping storing the PDF as it's empty.`); + return null; + } + const assetId = newAssetId(); + const contentType = "application/pdf"; + const fileName = "page.pdf"; + + // Check storage quota before saving the PDF + const { data: quotaApproved, error: quotaError } = await tryCatch( + QuotaService.checkStorageQuota(db, userId, pdf.byteLength), + ); + + if (quotaError) { + logger.warn( + `[Crawler][${jobId}] Skipping PDF storage due to quota exceeded: ${quotaError.message}`, + ); + return null; + } + + await saveAsset({ + userId, + assetId, + metadata: { contentType, fileName }, + asset: pdf, + quotaApproved, + }); + logger.info( + `[Crawler][${jobId}] Stored the PDF as assetId: ${assetId} (${pdf.byteLength} bytes)`, + ); + return { assetId, contentType, fileName, size: pdf.byteLength }; +} + async function downloadAndStoreFile( url: string, userId: string, @@ -1079,16 +1155,19 @@ async function crawlAndParseUrl( jobId: string, bookmarkId: string, oldScreenshotAssetId: string | undefined, + oldPdfAssetId: string | undefined, oldImageAssetId: string | undefined, oldFullPageArchiveAssetId: string | undefined, oldContentAssetId: string | undefined, precrawledArchiveAssetId: string | undefined, archiveFullPage: boolean, + forceStorePdf: boolean, abortSignal: AbortSignal, ) { let result: { htmlContent: string; screenshot: Buffer | undefined; + pdf: Buffer | undefined; statusCode: number | null; url: string; }; @@ -1104,15 +1183,16 @@ async function crawlAndParseUrl( result = { htmlContent: asset.asset.toString(), screenshot: undefined, + pdf: undefined, statusCode: 200, url, }; } else { - result = await crawlPage(jobId, url, userId, abortSignal); + 
result = await crawlPage(jobId, url, userId, forceStorePdf, abortSignal); } abortSignal.throwIfAborted(); - const { htmlContent, screenshot, statusCode, url: browserUrl } = result; + const { htmlContent, screenshot, pdf, statusCode, url: browserUrl } = result; // Track status code in Prometheus if (statusCode !== null) { @@ -1146,6 +1226,12 @@ async function crawlAndParseUrl( ]); abortSignal.throwIfAborted(); + const pdfAssetInfo = await Promise.race([ + storePdf(pdf, userId, jobId), + abortPromise(abortSignal), + ]); + abortSignal.throwIfAborted(); + const htmlContentAssetInfo = await storeHtmlContent( readableContent?.content, userId, @@ -1230,6 +1316,22 @@ async function crawlAndParseUrl( ); assetDeletionTasks.push(silentDeleteAsset(userId, oldScreenshotAssetId)); } + if (pdfAssetInfo) { + await updateAsset( + oldPdfAssetId, + { + id: pdfAssetInfo.assetId, + bookmarkId, + userId, + assetType: AssetTypes.LINK_PDF, + contentType: pdfAssetInfo.contentType, + size: pdfAssetInfo.size, + fileName: pdfAssetInfo.fileName, + }, + txn, + ); + assetDeletionTasks.push(silentDeleteAsset(userId, oldPdfAssetId)); + } if (imageAssetInfo) { await updateAsset(oldImageAssetId, imageAssetInfo, txn); assetDeletionTasks.push(silentDeleteAsset(userId, oldImageAssetId)); @@ -1355,11 +1457,12 @@ async function runCrawler( return { status: "completed" }; } - const { bookmarkId, archiveFullPage } = request.data; + const { bookmarkId, archiveFullPage, storePdf } = request.data; const { url, userId, screenshotAssetId: oldScreenshotAssetId, + pdfAssetId: oldPdfAssetId, imageAssetId: oldImageAssetId, fullPageArchiveAssetId: oldFullPageArchiveAssetId, contentAssetId: oldContentAssetId, @@ -1407,11 +1510,13 @@ async function runCrawler( jobId, bookmarkId, oldScreenshotAssetId, + oldPdfAssetId, oldImageAssetId, oldFullPageArchiveAssetId, oldContentAssetId, precrawledArchiveAssetId, archiveFullPage, + storePdf ?? false, job.abortSignal, ); |
