about | summary | refs | log | tree | commit | diff | stats
path: root/apps
diff options
context:
space:
mode:
Diffstat (limited to 'apps')
-rw-r--r-- apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx 155
-rw-r--r-- apps/web/components/dashboard/preview/LinkContentSection.tsx 19
-rw-r--r-- apps/web/lib/attachments.tsx 2
-rw-r--r-- apps/web/lib/i18n/locales/en/translation.json 4
-rw-r--r-- apps/web/lib/i18n/locales/en_US/translation.json 4
-rw-r--r-- apps/workers/workerUtils.ts 2
-rw-r--r-- apps/workers/workers/crawlerWorker.ts 111
7 files changed, 264 insertions, 33 deletions
diff --git a/apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx b/apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx
index 66de6156..eb746efc 100644
--- a/apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx
+++ b/apps/web/components/dashboard/bookmarks/BookmarkOptions.tsx
@@ -6,13 +6,18 @@ import {
DropdownMenu,
DropdownMenuContent,
DropdownMenuItem,
+ DropdownMenuSub,
+ DropdownMenuSubContent,
+ DropdownMenuSubTrigger,
DropdownMenuTrigger,
} from "@/components/ui/dropdown-menu";
import { useToast } from "@/components/ui/use-toast";
import { useClientConfig } from "@/lib/clientConfig";
import { useTranslation } from "@/lib/i18n/client";
import {
+ Archive,
FileDown,
+ FileText,
Link,
List,
ListX,
@@ -43,6 +48,30 @@ import { EditBookmarkDialog } from "./EditBookmarkDialog";
import { ArchivedActionIcon, FavouritedActionIcon } from "./icons";
import { useManageListsModal } from "./ManageListsModal";
+interface ActionItem { // One clickable entry of the bookmark options dropdown.
+ id: string; // Used as the React key of the rendered menu item.
+ title: string; // Label rendered next to the icon.
+ icon: React.ReactNode; // Rendered before the title.
+ visible: boolean; // Entries with visible set to false are filtered out before rendering.
+ disabled: boolean; // Forwarded to the disabled prop of the menu item.
+ className?: string; // Forwarded to top-level menu items only; sub-menu items do not receive it.
+ onClick: () => void; // Forwarded to the onClick prop of the menu item.
+}
+
+interface SubsectionItem { // A dropdown entry that opens a nested sub-menu instead of running an action.
+ id: string; // Used as the React key of the rendered sub-menu.
+ title: string; // Label shown in the sub-menu trigger.
+ icon: React.ReactNode; // Rendered before the title in the sub-menu trigger.
+ visible: boolean; // When false the whole sub-menu is filtered out.
+ items: ActionItem[]; // Nested actions; only visible ones are rendered, and a sub-menu with none is dropped.
+}
+
+type ActionItemType = ActionItem | SubsectionItem; // Element type of the actionItems array: a plain action or a sub-menu.
+
+function isSubsectionItem(item: ActionItemType): item is SubsectionItem { // Type guard separating sub-menus from plain actions.
+ return "items" in item; // Of the two union members only SubsectionItem declares an items property.
+}
+
export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) {
const { t } = useTranslation();
const { toast } = useToast();
@@ -110,6 +139,15 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) {
onError,
});
+ const preservePdfMutator = useRecrawlBookmark({
+ onSuccess: () => {
+ toast({
+ description: t("toasts.bookmarks.preserve_pdf"),
+ });
+ },
+ onError,
+ });
+
const removeFromListMutator = useRemoveBookmarkFromList({
onSuccess: () => {
toast({
@@ -120,7 +158,7 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) {
});
// Define action items array
- const actionItems = [
+ const actionItems: ActionItemType[] = [
{
id: "edit",
title: t("actions.edit"),
@@ -174,19 +212,6 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) {
}),
},
{
- id: "download-full-page",
- title: t("actions.download_full_page_archive"),
- icon: <FileDown className="mr-2 size-4" />,
- visible: isOwner && bookmark.content.type === BookmarkTypes.LINK,
- disabled: false,
- onClick: () => {
- fullPageArchiveBookmarkMutator.mutate({
- bookmarkId: bookmark.id,
- archiveFullPage: true,
- });
- },
- },
- {
id: "copy-link",
title: t("actions.copy_link"),
icon: <Link className="mr-2 size-4" />,
@@ -213,14 +238,15 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) {
id: "remove-from-list",
title: t("actions.remove_from_list"),
icon: <ListX className="mr-2 size-4" />,
- visible:
+ visible: Boolean(
(isOwner ||
(withinListContext &&
(withinListContext.userRole === "editor" ||
withinListContext.userRole === "owner"))) &&
- !!listId &&
- !!withinListContext &&
- withinListContext.type === "manual",
+ !!listId &&
+ !!withinListContext &&
+ withinListContext.type === "manual",
+ ),
disabled: demoMode,
onClick: () =>
removeFromListMutator.mutate({
@@ -237,6 +263,40 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) {
onClick: () => crawlBookmarkMutator.mutate({ bookmarkId: bookmark.id }),
},
{
+ id: "offline-copies",
+ title: t("actions.offline_copies"),
+ icon: <Archive className="mr-2 size-4" />,
+ visible: isOwner && bookmark.content.type === BookmarkTypes.LINK,
+ items: [
+ {
+ id: "download-full-page",
+ title: t("actions.download_full_page_archive"),
+ icon: <FileDown className="mr-2 size-4" />,
+ visible: true,
+ disabled: demoMode,
+ onClick: () => {
+ fullPageArchiveBookmarkMutator.mutate({
+ bookmarkId: bookmark.id,
+ archiveFullPage: true,
+ });
+ },
+ },
+ {
+ id: "preserve-pdf",
+ title: t("actions.preserve_as_pdf"),
+ icon: <FileText className="mr-2 size-4" />,
+ visible: true,
+ disabled: demoMode,
+ onClick: () => {
+ preservePdfMutator.mutate({
+ bookmarkId: bookmark.id,
+ storePdf: true,
+ });
+ },
+ },
+ ],
+ },
+ {
id: "delete",
title: t("actions.delete"),
icon: <Trash2 className="mr-2 size-4" />,
@@ -248,7 +308,12 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) {
];
// Filter visible items
- const visibleItems = actionItems.filter((item) => item.visible);
+ const visibleItems: ActionItemType[] = actionItems.filter((item) => {
+ if (isSubsectionItem(item)) {
+ return item.visible && item.items.some((subItem) => subItem.visible);
+ }
+ return item.visible;
+ });
// If no items are visible, don't render the dropdown
if (visibleItems.length === 0) {
@@ -283,17 +348,47 @@ export default function BookmarkOptions({ bookmark }: { bookmark: ZBookmark }) {
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent className="w-fit">
- {visibleItems.map((item) => (
- <DropdownMenuItem
- key={item.id}
- disabled={item.disabled}
- className={item.className}
- onClick={item.onClick}
- >
- {item.icon}
- <span>{item.title}</span>
- </DropdownMenuItem>
- ))}
+ {visibleItems.map((item) => {
+ if (isSubsectionItem(item)) {
+ const visibleSubItems = item.items.filter(
+ (subItem) => subItem.visible,
+ );
+ if (visibleSubItems.length === 0) {
+ return null;
+ }
+ return (
+ <DropdownMenuSub key={item.id}>
+ <DropdownMenuSubTrigger>
+ {item.icon}
+ <span>{item.title}</span>
+ </DropdownMenuSubTrigger>
+ <DropdownMenuSubContent>
+ {visibleSubItems.map((subItem) => (
+ <DropdownMenuItem
+ key={subItem.id}
+ disabled={subItem.disabled}
+ onClick={subItem.onClick}
+ >
+ {subItem.icon}
+ <span>{subItem.title}</span>
+ </DropdownMenuItem>
+ ))}
+ </DropdownMenuSubContent>
+ </DropdownMenuSub>
+ );
+ }
+ return (
+ <DropdownMenuItem
+ key={item.id}
+ disabled={item.disabled}
+ className={item.className}
+ onClick={item.onClick}
+ >
+ {item.icon}
+ <span>{item.title}</span>
+ </DropdownMenuItem>
+ );
+ })}
</DropdownMenuContent>
</DropdownMenu>
</>
diff --git a/apps/web/components/dashboard/preview/LinkContentSection.tsx b/apps/web/components/dashboard/preview/LinkContentSection.tsx
index bdf5faf1..5fb51784 100644
--- a/apps/web/components/dashboard/preview/LinkContentSection.tsx
+++ b/apps/web/components/dashboard/preview/LinkContentSection.tsx
@@ -24,6 +24,7 @@ import {
BookOpen,
Camera,
ExpandIcon,
+ FileText,
Info,
Video,
} from "lucide-react";
@@ -104,6 +105,16 @@ function VideoSection({ link }: { link: ZBookmarkedLink }) {
);
}
+function PDFSection({ link }: { link: ZBookmarkedLink }) { // Embeds the PDF asset referenced by link.pdfAssetId in an iframe.
+ return ( // NOTE(review): no guard here for a missing pdfAssetId; the pdf select option is disabled in that case, confirm no other path selects this section.
+ <iframe
+ title="PDF Viewer"
+ src={`/api/assets/${link.pdfAssetId}`}
+ className="relative h-full min-w-full"
+ />
+ );
+}
+
export default function LinkContentSection({
bookmark,
}: {
@@ -154,6 +165,8 @@ export default function LinkContentSection({
content = <FullPageArchiveSection link={bookmark.content} />;
} else if (section === "video") {
content = <VideoSection link={bookmark.content} />;
+ } else if (section === "pdf") {
+ content = <PDFSection link={bookmark.content} />;
} else {
content = <ScreenshotSection link={bookmark.content} />;
}
@@ -198,6 +211,12 @@ export default function LinkContentSection({
{t("common.screenshot")}
</div>
</SelectItem>
+ <SelectItem value="pdf" disabled={!bookmark.content.pdfAssetId}>
+ <div className="flex items-center">
+ <FileText className="mr-2 h-4 w-4" />
+ {t("common.pdf")}
+ </div>
+ </SelectItem>
<SelectItem
value="archive"
disabled={
diff --git a/apps/web/lib/attachments.tsx b/apps/web/lib/attachments.tsx
index 81b9f12d..5d7175ec 100644
--- a/apps/web/lib/attachments.tsx
+++ b/apps/web/lib/attachments.tsx
@@ -2,6 +2,7 @@ import {
Archive,
Camera,
FileCode,
+ FileText,
Image,
Paperclip,
SquareUser,
@@ -13,6 +14,7 @@ import { ZAssetType } from "@karakeep/shared/types/bookmarks";
export const ASSET_TYPE_TO_ICON: Record<ZAssetType, React.ReactNode> = {
screenshot: <Camera className="size-4" />,
+ pdf: <FileText className="size-4" />,
assetScreenshot: <Camera className="size-4" />,
fullPageArchive: <Archive className="size-4" />,
precrawledArchive: <Archive className="size-4" />,
diff --git a/apps/web/lib/i18n/locales/en/translation.json b/apps/web/lib/i18n/locales/en/translation.json
index a4b94e4b..9ec25732 100644
--- a/apps/web/lib/i18n/locales/en/translation.json
+++ b/apps/web/lib/i18n/locales/en/translation.json
@@ -26,6 +26,7 @@
"highlights": "Highlights",
"source": "Source",
"screenshot": "Screenshot",
+ "pdf": "Archived PDF",
"video": "Video",
"archive": "Archive",
"home": "Home",
@@ -70,7 +71,9 @@
"toggle_show_archived": "Show Archived",
"refresh": "Refresh",
"recrawl": "Recrawl",
+ "offline_copies": "Offline Copies",
"download_full_page_archive": "Download Full Page Archive",
+ "preserve_as_pdf": "Preserve as PDF",
"edit_tags": "Edit Tags",
"edit_notes": "Edit Notes",
"add_to_list": "Add to List",
@@ -802,6 +805,7 @@
"deleted": "The bookmark has been deleted!",
"refetch": "Re-fetch has been enqueued!",
"full_page_archive": "Full Page Archive creation has been triggered",
+ "preserve_pdf": "PDF preservation has been triggered",
"delete_from_list": "The bookmark has been deleted from the list",
"clipboard_copied": "Link has been added to your clipboard!"
},
diff --git a/apps/web/lib/i18n/locales/en_US/translation.json b/apps/web/lib/i18n/locales/en_US/translation.json
index 6c3dd62b..2849f930 100644
--- a/apps/web/lib/i18n/locales/en_US/translation.json
+++ b/apps/web/lib/i18n/locales/en_US/translation.json
@@ -25,6 +25,7 @@
"admin": "Admin"
},
"screenshot": "Screenshot",
+ "pdf": "Archived PDF",
"video": "Video",
"archive": "Archive",
"home": "Home",
@@ -62,7 +63,9 @@
"delete": "Delete",
"refresh": "Refresh",
"recrawl": "Recrawl",
+ "offline_copies": "Offline Copies",
"download_full_page_archive": "Download Full Page Archive",
+ "preserve_as_pdf": "Preserve as PDF",
"edit_tags": "Edit Tags",
"add_to_list": "Add to List",
"select_all": "Select All",
@@ -781,6 +784,7 @@
"deleted": "The bookmark has been deleted!",
"refetch": "Re-fetch has been enqueued!",
"full_page_archive": "Full Page Archive creation has been triggered",
+ "preserve_pdf": "PDF preservation has been triggered",
"delete_from_list": "The bookmark has been deleted from the list",
"clipboard_copied": "Link has been added to your clipboard!",
"updated": "The bookmark has been updated!"
diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts
index 3eaf5b4b..a99f2103 100644
--- a/apps/workers/workerUtils.ts
+++ b/apps/workers/workerUtils.ts
@@ -34,6 +34,8 @@ export async function getBookmarkDetails(bookmarkId: string) {
screenshotAssetId: bookmark.assets.find(
(a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
)?.id,
+ pdfAssetId: bookmark.assets.find((a) => a.assetType == AssetTypes.LINK_PDF)
+ ?.id,
imageAssetId: bookmark.assets.find(
(a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
)?.id,
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 3591474e..95c91002 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -418,6 +418,7 @@ async function browserlessCrawlPage(
htmlContent: await response.text(),
statusCode: response.status,
screenshot: undefined,
+ pdf: undefined,
url: response.url,
};
}
@@ -426,10 +427,12 @@ async function crawlPage(
jobId: string,
url: string,
userId: string,
+ forceStorePdf: boolean,
abortSignal: AbortSignal,
): Promise<{
htmlContent: string;
screenshot: Buffer | undefined;
+ pdf: Buffer | undefined;
statusCode: number;
url: string;
}> {
@@ -608,10 +611,45 @@ async function crawlPage(
}
}
+ // Capture PDF if configured or explicitly requested
+ let pdf: Buffer | undefined = undefined;
+ if (serverConfig.crawler.storePdf || forceStorePdf) {
+ const { data: pdfData, error: pdfError } = await tryCatch(
+ Promise.race<Buffer>([
+ page.pdf({
+ format: "A4",
+ printBackground: true,
+ }),
+ new Promise((_, reject) =>
+ setTimeout(
+ () =>
+ reject(
+ "TIMED_OUT, consider increasing CRAWLER_SCREENSHOT_TIMEOUT_SEC",
+ ),
+ serverConfig.crawler.screenshotTimeoutSec * 1000,
+ ),
+ ),
+ abortPromise(abortSignal).then(() => Buffer.from("")),
+ ]),
+ );
+ abortSignal.throwIfAborted();
+ if (pdfError) {
+ logger.warn(
+ `[Crawler][${jobId}] Failed to capture the PDF. Reason: ${pdfError}`,
+ );
+ } else {
+ logger.info(
+ `[Crawler][${jobId}] Finished capturing page content as PDF`,
+ );
+ pdf = pdfData;
+ }
+ }
+
return {
htmlContent,
statusCode: response?.status() ?? 0,
screenshot,
+ pdf,
url: page.url(),
};
} finally {
@@ -724,6 +762,44 @@ async function storeScreenshot(
return { assetId, contentType, fileName, size: screenshot.byteLength };
}
+async function storePdf( // Saves a captured PDF buffer as a new asset; resolves to its metadata, or null when nothing was stored.
+ pdf: Buffer | undefined, // PDF bytes produced by the crawl, or undefined when none were captured.
+ userId: string, // Owner passed to the quota check and to saveAsset.
+ jobId: string, // Only used to tag log lines.
+) {
+ if (!pdf) { // Catches undefined only; a zero-length Buffer is truthy and would still be stored.
+ logger.info(`[Crawler][${jobId}] Skipping storing the PDF as it's empty.`);
+ return null;
+ }
+ const assetId = newAssetId(); // Fresh id; the caller is responsible for linking it to the bookmark.
+ const contentType = "application/pdf";
+ const fileName = "page.pdf";
+
+ // Check storage quota before saving the PDF
+ const { data: quotaApproved, error: quotaError } = await tryCatch(
+ QuotaService.checkStorageQuota(db, userId, pdf.byteLength),
+ );
+
+ if (quotaError) { // NOTE(review): presumably any failure of the quota check lands here, yet the warning below always reports an exceeded quota; confirm.
+ logger.warn(
+ `[Crawler][${jobId}] Skipping PDF storage due to quota exceeded: ${quotaError.message}`,
+ );
+ return null;
+ }
+
+ await saveAsset({ // Awaited without tryCatch, so a failure while saving propagates to the caller.
+ userId,
+ assetId,
+ metadata: { contentType, fileName },
+ asset: pdf,
+ quotaApproved,
+ });
+ logger.info(
+ `[Crawler][${jobId}] Stored the PDF as assetId: ${assetId} (${pdf.byteLength} bytes)`,
+ );
+ return { assetId, contentType, fileName, size: pdf.byteLength }; // size is the byte length of the stored buffer.
+}
+
async function downloadAndStoreFile(
url: string,
userId: string,
@@ -1079,16 +1155,19 @@ async function crawlAndParseUrl(
jobId: string,
bookmarkId: string,
oldScreenshotAssetId: string | undefined,
+ oldPdfAssetId: string | undefined,
oldImageAssetId: string | undefined,
oldFullPageArchiveAssetId: string | undefined,
oldContentAssetId: string | undefined,
precrawledArchiveAssetId: string | undefined,
archiveFullPage: boolean,
+ forceStorePdf: boolean,
abortSignal: AbortSignal,
) {
let result: {
htmlContent: string;
screenshot: Buffer | undefined;
+ pdf: Buffer | undefined;
statusCode: number | null;
url: string;
};
@@ -1104,15 +1183,16 @@ async function crawlAndParseUrl(
result = {
htmlContent: asset.asset.toString(),
screenshot: undefined,
+ pdf: undefined,
statusCode: 200,
url,
};
} else {
- result = await crawlPage(jobId, url, userId, abortSignal);
+ result = await crawlPage(jobId, url, userId, forceStorePdf, abortSignal);
}
abortSignal.throwIfAborted();
- const { htmlContent, screenshot, statusCode, url: browserUrl } = result;
+ const { htmlContent, screenshot, pdf, statusCode, url: browserUrl } = result;
// Track status code in Prometheus
if (statusCode !== null) {
@@ -1146,6 +1226,12 @@ async function crawlAndParseUrl(
]);
abortSignal.throwIfAborted();
+ const pdfAssetInfo = await Promise.race([
+ storePdf(pdf, userId, jobId),
+ abortPromise(abortSignal),
+ ]);
+ abortSignal.throwIfAborted();
+
const htmlContentAssetInfo = await storeHtmlContent(
readableContent?.content,
userId,
@@ -1230,6 +1316,22 @@ async function crawlAndParseUrl(
);
assetDeletionTasks.push(silentDeleteAsset(userId, oldScreenshotAssetId));
}
+ if (pdfAssetInfo) {
+ await updateAsset(
+ oldPdfAssetId,
+ {
+ id: pdfAssetInfo.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_PDF,
+ contentType: pdfAssetInfo.contentType,
+ size: pdfAssetInfo.size,
+ fileName: pdfAssetInfo.fileName,
+ },
+ txn,
+ );
+ assetDeletionTasks.push(silentDeleteAsset(userId, oldPdfAssetId));
+ }
if (imageAssetInfo) {
await updateAsset(oldImageAssetId, imageAssetInfo, txn);
assetDeletionTasks.push(silentDeleteAsset(userId, oldImageAssetId));
@@ -1355,11 +1457,12 @@ async function runCrawler(
return { status: "completed" };
}
- const { bookmarkId, archiveFullPage } = request.data;
+ const { bookmarkId, archiveFullPage, storePdf } = request.data;
const {
url,
userId,
screenshotAssetId: oldScreenshotAssetId,
+ pdfAssetId: oldPdfAssetId,
imageAssetId: oldImageAssetId,
fullPageArchiveAssetId: oldFullPageArchiveAssetId,
contentAssetId: oldContentAssetId,
@@ -1407,11 +1510,13 @@ async function runCrawler(
jobId,
bookmarkId,
oldScreenshotAssetId,
+ oldPdfAssetId,
oldImageAssetId,
oldFullPageArchiveAssetId,
oldContentAssetId,
precrawledArchiveAssetId,
archiveFullPage,
+ storePdf ?? false,
job.abortSignal,
);