| field | value | date |
|---|---|---|
| author | Mohamed Bassem <me@mbassem.com> | 2025-12-27 11:59:39 +0200 |
| committer | GitHub <noreply@github.com> | 2025-12-27 09:59:39 +0000 |
| commit | 267db791290f4f539d7bda113992e3d1690b0e8b (patch) | |
| tree | 0144ea00dcf6a49bdaaf46511cd074651aeeee5a /apps/workers | |
| parent | bb6b742a040a70478d276529774bde67b8f93648 (diff) | |
| download | karakeep-267db791290f4f539d7bda113992e3d1690b0e8b.tar.zst | |
feat: support archiving as pdf (#2309)
* feat: support archiving as pdf
* add support for manually triggering pdf downloads
* fix submenu
* menu cleanup
* fix store pdf
Diffstat (limited to 'apps/workers')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | apps/workers/workerUtils.ts | 2 |
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 111 |

2 files changed, 110 insertions, 3 deletions

```diff
diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts
index 3eaf5b4b..a99f2103 100644
--- a/apps/workers/workerUtils.ts
+++ b/apps/workers/workerUtils.ts
@@ -34,6 +34,8 @@ export async function getBookmarkDetails(bookmarkId: string) {
     screenshotAssetId: bookmark.assets.find(
       (a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
     )?.id,
+    pdfAssetId: bookmark.assets.find((a) => a.assetType == AssetTypes.LINK_PDF)
+      ?.id,
     imageAssetId: bookmark.assets.find(
       (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
     )?.id,
```

```diff
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 3591474e..95c91002 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -418,6 +418,7 @@ async function browserlessCrawlPage(
     htmlContent: await response.text(),
     statusCode: response.status,
     screenshot: undefined,
+    pdf: undefined,
     url: response.url,
   };
 }
@@ -426,10 +427,12 @@ async function crawlPage(
   jobId: string,
   url: string,
   userId: string,
+  forceStorePdf: boolean,
   abortSignal: AbortSignal,
 ): Promise<{
   htmlContent: string;
   screenshot: Buffer | undefined;
+  pdf: Buffer | undefined;
   statusCode: number;
   url: string;
 }> {
@@ -608,10 +611,45 @@ async function crawlPage(
       }
     }
 
+    // Capture PDF if configured or explicitly requested
+    let pdf: Buffer | undefined = undefined;
+    if (serverConfig.crawler.storePdf || forceStorePdf) {
+      const { data: pdfData, error: pdfError } = await tryCatch(
+        Promise.race<Buffer>([
+          page.pdf({
+            format: "A4",
+            printBackground: true,
+          }),
+          new Promise((_, reject) =>
+            setTimeout(
+              () =>
+                reject(
+                  "TIMED_OUT, consider increasing CRAWLER_SCREENSHOT_TIMEOUT_SEC",
+                ),
+              serverConfig.crawler.screenshotTimeoutSec * 1000,
+            ),
+          ),
+          abortPromise(abortSignal).then(() => Buffer.from("")),
+        ]),
+      );
+      abortSignal.throwIfAborted();
+      if (pdfError) {
+        logger.warn(
+          `[Crawler][${jobId}] Failed to capture the PDF. Reason: ${pdfError}`,
+        );
+      } else {
+        logger.info(
+          `[Crawler][${jobId}] Finished capturing page content as PDF`,
+        );
+        pdf = pdfData;
+      }
+    }
+
     return {
       htmlContent,
       statusCode: response?.status() ?? 0,
       screenshot,
+      pdf,
       url: page.url(),
     };
   } finally {
@@ -724,6 +762,44 @@ async function storeScreenshot(
   return { assetId, contentType, fileName, size: screenshot.byteLength };
 }
 
+async function storePdf(
+  pdf: Buffer | undefined,
+  userId: string,
+  jobId: string,
+) {
+  if (!pdf) {
+    logger.info(`[Crawler][${jobId}] Skipping storing the PDF as it's empty.`);
+    return null;
+  }
+  const assetId = newAssetId();
+  const contentType = "application/pdf";
+  const fileName = "page.pdf";
+
+  // Check storage quota before saving the PDF
+  const { data: quotaApproved, error: quotaError } = await tryCatch(
+    QuotaService.checkStorageQuota(db, userId, pdf.byteLength),
+  );
+
+  if (quotaError) {
+    logger.warn(
+      `[Crawler][${jobId}] Skipping PDF storage due to quota exceeded: ${quotaError.message}`,
+    );
+    return null;
+  }
+
+  await saveAsset({
+    userId,
+    assetId,
+    metadata: { contentType, fileName },
+    asset: pdf,
+    quotaApproved,
+  });
+  logger.info(
+    `[Crawler][${jobId}] Stored the PDF as assetId: ${assetId} (${pdf.byteLength} bytes)`,
+  );
+  return { assetId, contentType, fileName, size: pdf.byteLength };
+}
+
 async function downloadAndStoreFile(
   url: string,
   userId: string,
@@ -1079,16 +1155,19 @@ async function crawlAndParseUrl(
   jobId: string,
   bookmarkId: string,
   oldScreenshotAssetId: string | undefined,
+  oldPdfAssetId: string | undefined,
   oldImageAssetId: string | undefined,
   oldFullPageArchiveAssetId: string | undefined,
   oldContentAssetId: string | undefined,
   precrawledArchiveAssetId: string | undefined,
   archiveFullPage: boolean,
+  forceStorePdf: boolean,
   abortSignal: AbortSignal,
 ) {
   let result: {
     htmlContent: string;
     screenshot: Buffer | undefined;
+    pdf: Buffer | undefined;
     statusCode: number | null;
     url: string;
   };
@@ -1104,15 +1183,16 @@ async function crawlAndParseUrl(
     result = {
       htmlContent: asset.asset.toString(),
       screenshot: undefined,
+      pdf: undefined,
       statusCode: 200,
       url,
     };
   } else {
-    result = await crawlPage(jobId, url, userId, abortSignal);
+    result = await crawlPage(jobId, url, userId, forceStorePdf, abortSignal);
   }
   abortSignal.throwIfAborted();
 
-  const { htmlContent, screenshot, statusCode, url: browserUrl } = result;
+  const { htmlContent, screenshot, pdf, statusCode, url: browserUrl } = result;
 
   // Track status code in Prometheus
   if (statusCode !== null) {
@@ -1146,6 +1226,12 @@
   ]);
   abortSignal.throwIfAborted();
 
+  const pdfAssetInfo = await Promise.race([
+    storePdf(pdf, userId, jobId),
+    abortPromise(abortSignal),
+  ]);
+  abortSignal.throwIfAborted();
+
   const htmlContentAssetInfo = await storeHtmlContent(
     readableContent?.content,
     userId,
@@ -1230,6 +1316,22 @@
       );
       assetDeletionTasks.push(silentDeleteAsset(userId, oldScreenshotAssetId));
     }
+    if (pdfAssetInfo) {
+      await updateAsset(
+        oldPdfAssetId,
+        {
+          id: pdfAssetInfo.assetId,
+          bookmarkId,
+          userId,
+          assetType: AssetTypes.LINK_PDF,
+          contentType: pdfAssetInfo.contentType,
+          size: pdfAssetInfo.size,
+          fileName: pdfAssetInfo.fileName,
+        },
+        txn,
+      );
+      assetDeletionTasks.push(silentDeleteAsset(userId, oldPdfAssetId));
+    }
     if (imageAssetInfo) {
       await updateAsset(oldImageAssetId, imageAssetInfo, txn);
       assetDeletionTasks.push(silentDeleteAsset(userId, oldImageAssetId));
@@ -1355,11 +1457,12 @@ async function runCrawler(
     return { status: "completed" };
   }
 
-  const { bookmarkId, archiveFullPage } = request.data;
+  const { bookmarkId, archiveFullPage, storePdf } = request.data;
   const {
     url,
     userId,
     screenshotAssetId: oldScreenshotAssetId,
+    pdfAssetId: oldPdfAssetId,
     imageAssetId: oldImageAssetId,
     fullPageArchiveAssetId: oldFullPageArchiveAssetId,
     contentAssetId: oldContentAssetId,
@@ -1407,11 +1510,13 @@
     jobId,
     bookmarkId,
     oldScreenshotAssetId,
+    oldPdfAssetId,
     oldImageAssetId,
     oldFullPageArchiveAssetId,
     oldContentAssetId,
     precrawledArchiveAssetId,
     archiveFullPage,
+    storePdf ?? false,
     job.abortSignal,
  );
```
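
The PDF capture in `crawlPage` follows the same guard pattern the worker already uses for screenshots: the Puppeteer `page.pdf()` call is raced against a timeout and against the job's abort signal, so a hung renderer can neither stall the worker nor outlive a cancelled job. Below is a minimal standalone sketch of that pattern; `withTimeoutAndAbort` and `capturePdf` are hypothetical names, the 30-second default stands in for `serverConfig.crawler.screenshotTimeoutSec * 1000`, and `Uint8Array` is used instead of `Buffer` to stay agnostic to the Puppeteer version (newer releases return `Uint8Array` from `page.pdf()`).

```typescript
import type { Page } from "puppeteer";

// Race a unit of work against a timeout and an abort signal.
// Hypothetical helper, not part of karakeep.
function withTimeoutAndAbort<T>(
  work: Promise<T>,
  timeoutMs: number,
  signal: AbortSignal,
  abortSentinel: T,
): Promise<T> {
  return Promise.race<T>([
    work,
    // Reject once the time budget is exhausted, like the setTimeout race above.
    new Promise<T>((_, reject) =>
      setTimeout(() => reject(new Error("TIMED_OUT")), timeoutMs),
    ),
    // Resolve with a sentinel on abort; the caller re-checks throwIfAborted()
    // afterwards, as crawlPage does after its Promise.race.
    new Promise<T>((resolve) =>
      signal.addEventListener("abort", () => resolve(abortSentinel), {
        once: true,
      }),
    ),
  ]);
}

// A PDF capture that degrades gracefully: on timeout or renderer failure the
// crawl continues without a PDF instead of failing the whole job.
async function capturePdf(
  page: Page,
  signal: AbortSignal,
  timeoutMs = 30_000, // stand-in for screenshotTimeoutSec * 1000
): Promise<Uint8Array | undefined> {
  let pdf: Uint8Array | undefined;
  try {
    pdf = await withTimeoutAndAbort<Uint8Array>(
      page.pdf({ format: "A4", printBackground: true }),
      timeoutMs,
      signal,
      new Uint8Array(), // sentinel returned when the job is aborted mid-capture
    );
  } catch (err) {
    // A timeout or renderer crash is logged and skipped, matching the
    // logger.warn branch in the diff.
    console.warn(`Failed to capture the PDF. Reason: ${err}`);
  }
  // Re-check outside the catch so a genuine abort still propagates upward.
  signal.throwIfAborted();
  return pdf;
}
```

Note that the abort branch resolves with an empty sentinel rather than rejecting; the explicit `throwIfAborted()` afterwards converts it into the canonical abort error, mirroring the `abortPromise(abortSignal).then(() => Buffer.from(""))` branch in the diff.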

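Both the capture and the quota check route failures through the repo's `tryCatch` helper rather than try/catch blocks, which keeps the happy path flat and makes "log and continue" handling explicit. The helper's definition is not part of this diff; a minimal Result-style version consistent with how the diff destructures `{ data, error }` might look like this sketch:

```typescript
// Minimal Result-style wrapper compatible with the destructuring used in the
// diff ({ data, error }). A sketch; karakeep's own tryCatch may differ.
type Result<T, E = Error> =
  | { data: T; error: null }
  | { data: null; error: E };

export async function tryCatch<T, E = Error>(
  promise: Promise<T>,
): Promise<Result<T, E>> {
  try {
    // Success: surface the value with a null error.
    return { data: await promise, error: null };
  } catch (error) {
    // Failure: surface the thrown value; callers decide whether it is fatal.
    return { data: null, error: error as E };
  }
}
```

With this shape, checking `if (pdfError)` narrows the union, so in the else branch `pdfData` is known to be present, which is why the diff can assign `pdf = pdfData` without a further null check.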