aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers
diff options
context:
space:
mode:
authorMohamed Bassem <me@mbassem.com>2025-12-27 11:59:39 +0200
committerGitHub <noreply@github.com>2025-12-27 09:59:39 +0000
commit267db791290f4f539d7bda113992e3d1690b0e8b (patch)
tree0144ea00dcf6a49bdaaf46511cd074651aeeee5a /apps/workers
parentbb6b742a040a70478d276529774bde67b8f93648 (diff)
downloadkarakeep-267db791290f4f539d7bda113992e3d1690b0e8b.tar.zst
feat: support archiving as pdf (#2309)
* feat: support archiving as pdf * add support for manually triggering pdf downloads * fix submenu * menu cleanup * fix store pdf
Diffstat (limited to 'apps/workers')
-rw-r--r--apps/workers/workerUtils.ts2
-rw-r--r--apps/workers/workers/crawlerWorker.ts111
2 files changed, 110 insertions, 3 deletions
diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts
index 3eaf5b4b..a99f2103 100644
--- a/apps/workers/workerUtils.ts
+++ b/apps/workers/workerUtils.ts
@@ -34,6 +34,8 @@ export async function getBookmarkDetails(bookmarkId: string) {
screenshotAssetId: bookmark.assets.find(
(a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
)?.id,
+ pdfAssetId: bookmark.assets.find((a) => a.assetType == AssetTypes.LINK_PDF)
+ ?.id,
imageAssetId: bookmark.assets.find(
(a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
)?.id,
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 3591474e..95c91002 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -418,6 +418,7 @@ async function browserlessCrawlPage(
htmlContent: await response.text(),
statusCode: response.status,
screenshot: undefined,
+ pdf: undefined,
url: response.url,
};
}
@@ -426,10 +427,12 @@ async function crawlPage(
jobId: string,
url: string,
userId: string,
+ forceStorePdf: boolean,
abortSignal: AbortSignal,
): Promise<{
htmlContent: string;
screenshot: Buffer | undefined;
+ pdf: Buffer | undefined;
statusCode: number;
url: string;
}> {
@@ -608,10 +611,45 @@ async function crawlPage(
}
}
+ // Capture PDF if configured or explicitly requested
+ let pdf: Buffer | undefined = undefined;
+ if (serverConfig.crawler.storePdf || forceStorePdf) {
+ const { data: pdfData, error: pdfError } = await tryCatch(
+ Promise.race<Buffer>([
+ page.pdf({
+ format: "A4",
+ printBackground: true,
+ }),
+ new Promise((_, reject) =>
+ setTimeout(
+ () =>
+ reject(
+ "TIMED_OUT, consider increasing CRAWLER_SCREENSHOT_TIMEOUT_SEC",
+ ),
+ serverConfig.crawler.screenshotTimeoutSec * 1000,
+ ),
+ ),
+ abortPromise(abortSignal).then(() => Buffer.from("")),
+ ]),
+ );
+ abortSignal.throwIfAborted();
+ if (pdfError) {
+ logger.warn(
+ `[Crawler][${jobId}] Failed to capture the PDF. Reason: ${pdfError}`,
+ );
+ } else {
+ logger.info(
+ `[Crawler][${jobId}] Finished capturing page content as PDF`,
+ );
+ pdf = pdfData;
+ }
+ }
+
return {
htmlContent,
statusCode: response?.status() ?? 0,
screenshot,
+ pdf,
url: page.url(),
};
} finally {
@@ -724,6 +762,44 @@ async function storeScreenshot(
return { assetId, contentType, fileName, size: screenshot.byteLength };
}
+async function storePdf(
+ pdf: Buffer | undefined,
+ userId: string,
+ jobId: string,
+) {
+ if (!pdf) {
+ logger.info(`[Crawler][${jobId}] Skipping storing the PDF as it's empty.`);
+ return null;
+ }
+ const assetId = newAssetId();
+ const contentType = "application/pdf";
+ const fileName = "page.pdf";
+
+ // Check storage quota before saving the PDF
+ const { data: quotaApproved, error: quotaError } = await tryCatch(
+ QuotaService.checkStorageQuota(db, userId, pdf.byteLength),
+ );
+
+ if (quotaError) {
+ logger.warn(
+ `[Crawler][${jobId}] Skipping PDF storage due to quota exceeded: ${quotaError.message}`,
+ );
+ return null;
+ }
+
+ await saveAsset({
+ userId,
+ assetId,
+ metadata: { contentType, fileName },
+ asset: pdf,
+ quotaApproved,
+ });
+ logger.info(
+ `[Crawler][${jobId}] Stored the PDF as assetId: ${assetId} (${pdf.byteLength} bytes)`,
+ );
+ return { assetId, contentType, fileName, size: pdf.byteLength };
+}
+
async function downloadAndStoreFile(
url: string,
userId: string,
@@ -1079,16 +1155,19 @@ async function crawlAndParseUrl(
jobId: string,
bookmarkId: string,
oldScreenshotAssetId: string | undefined,
+ oldPdfAssetId: string | undefined,
oldImageAssetId: string | undefined,
oldFullPageArchiveAssetId: string | undefined,
oldContentAssetId: string | undefined,
precrawledArchiveAssetId: string | undefined,
archiveFullPage: boolean,
+ forceStorePdf: boolean,
abortSignal: AbortSignal,
) {
let result: {
htmlContent: string;
screenshot: Buffer | undefined;
+ pdf: Buffer | undefined;
statusCode: number | null;
url: string;
};
@@ -1104,15 +1183,16 @@ async function crawlAndParseUrl(
result = {
htmlContent: asset.asset.toString(),
screenshot: undefined,
+ pdf: undefined,
statusCode: 200,
url,
};
} else {
- result = await crawlPage(jobId, url, userId, abortSignal);
+ result = await crawlPage(jobId, url, userId, forceStorePdf, abortSignal);
}
abortSignal.throwIfAborted();
- const { htmlContent, screenshot, statusCode, url: browserUrl } = result;
+ const { htmlContent, screenshot, pdf, statusCode, url: browserUrl } = result;
// Track status code in Prometheus
if (statusCode !== null) {
@@ -1146,6 +1226,12 @@ async function crawlAndParseUrl(
]);
abortSignal.throwIfAborted();
+ const pdfAssetInfo = await Promise.race([
+ storePdf(pdf, userId, jobId),
+ abortPromise(abortSignal),
+ ]);
+ abortSignal.throwIfAborted();
+
const htmlContentAssetInfo = await storeHtmlContent(
readableContent?.content,
userId,
@@ -1230,6 +1316,22 @@ async function crawlAndParseUrl(
);
assetDeletionTasks.push(silentDeleteAsset(userId, oldScreenshotAssetId));
}
+ if (pdfAssetInfo) {
+ await updateAsset(
+ oldPdfAssetId,
+ {
+ id: pdfAssetInfo.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_PDF,
+ contentType: pdfAssetInfo.contentType,
+ size: pdfAssetInfo.size,
+ fileName: pdfAssetInfo.fileName,
+ },
+ txn,
+ );
+ assetDeletionTasks.push(silentDeleteAsset(userId, oldPdfAssetId));
+ }
if (imageAssetInfo) {
await updateAsset(oldImageAssetId, imageAssetInfo, txn);
assetDeletionTasks.push(silentDeleteAsset(userId, oldImageAssetId));
@@ -1355,11 +1457,12 @@ async function runCrawler(
return { status: "completed" };
}
- const { bookmarkId, archiveFullPage } = request.data;
+ const { bookmarkId, archiveFullPage, storePdf } = request.data;
const {
url,
userId,
screenshotAssetId: oldScreenshotAssetId,
+ pdfAssetId: oldPdfAssetId,
imageAssetId: oldImageAssetId,
fullPageArchiveAssetId: oldFullPageArchiveAssetId,
contentAssetId: oldContentAssetId,
@@ -1407,11 +1510,13 @@ async function runCrawler(
jobId,
bookmarkId,
oldScreenshotAssetId,
+ oldPdfAssetId,
oldImageAssetId,
oldFullPageArchiveAssetId,
oldContentAssetId,
precrawledArchiveAssetId,
archiveFullPage,
+ storePdf ?? false,
job.abortSignal,
);