From 10506173cd5309e7c63d83055243abc67cecad4f Mon Sep 17 00:00:00 2001 From: Mohamed Bassem Date: Sat, 11 Jan 2025 18:09:51 +0000 Subject: feat: Add support for singlefile extension uploads. #172 --- apps/workers/crawlerWorker.ts | 36 ++++++++++++++++++++++++++++++------ apps/workers/workerUtils.ts | 3 +++ 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'apps/workers') diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts index 252da3b2..16b1f4ae 100644 --- a/apps/workers/crawlerWorker.ts +++ b/apps/workers/crawlerWorker.ts @@ -41,6 +41,7 @@ import { getAssetSize, IMAGE_ASSET_TYPES, newAssetId, + readAsset, saveAsset, saveAssetFromFile, silentDeleteAsset, @@ -582,14 +583,35 @@ async function crawlAndParseUrl( oldScreenshotAssetId: string | undefined, oldImageAssetId: string | undefined, oldFullPageArchiveAssetId: string | undefined, + precrawledArchiveAssetId: string | undefined, archiveFullPage: boolean, ) { - const { - htmlContent, - screenshot, - statusCode, - url: browserUrl, - } = await crawlPage(jobId, url); + let result: { + htmlContent: string; + screenshot: Buffer | undefined; + statusCode: number | null; + url: string; + }; + + if (precrawledArchiveAssetId) { + logger.info( + `[Crawler][${jobId}] The page has been precrawled. Will use the precrawled archive instead.`, + ); + const asset = await readAsset({ + userId, + assetId: precrawledArchiveAssetId, + }); + result = { + htmlContent: asset.asset.toString(), + screenshot: undefined, + statusCode: 200, + url, + }; + } else { + result = await crawlPage(jobId, url); + } + + const { htmlContent, screenshot, statusCode, url: browserUrl } = result; const [meta, readableContent, screenshotAssetInfo] = await Promise.all([ extractMetadata(htmlContent, browserUrl, jobId), @@ -701,6 +723,7 @@ async function runCrawler(job: DequeuedJob) { screenshotAssetId: oldScreenshotAssetId, imageAssetId: oldImageAssetId, fullPageArchiveAssetId: oldFullPageArchiveAssetId, + precrawledArchiveAssetId, } = await getBookmarkDetails(bookmarkId); logger.info( @@ -730,6 +753,7 @@ async function runCrawler(job: DequeuedJob) { oldScreenshotAssetId, oldImageAssetId, oldFullPageArchiveAssetId, + precrawledArchiveAssetId, archiveFullPage, ); diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts index e93d241b..2b365c73 100644 --- a/apps/workers/workerUtils.ts +++ b/apps/workers/workerUtils.ts @@ -44,5 +44,8 @@ export async function getBookmarkDetails(bookmarkId: string) { videoAssetId: bookmark.assets.find( (a) => a.assetType == AssetTypes.LINK_VIDEO, )?.id, + precrawledArchiveAssetId: bookmark.assets.find( + (a) => a.assetType == AssetTypes.LINK_PRECRAWLED_ARCHIVE, + )?.id, }; } -- cgit v1.2.3-70-g09d2