diff options
| author | Mohamed Bassem <me@mbassem.com> | 2025-01-11 18:09:51 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2025-01-11 18:09:51 +0000 |
| commit | 10506173cd5309e7c63d83055243abc67cecad4f (patch) | |
| tree | f37f7dd704c63e34a1e5b0bffdda442b03179d9c /apps/workers | |
| parent | 107d923b3abd60329463957ca4604107b3427b2c (diff) | |
| download | karakeep-10506173cd5309e7c63d83055243abc67cecad4f.tar.zst | |
feat: Add support for singlefile extension uploads. #172
Diffstat (limited to 'apps/workers')
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 36 | ||||
| -rw-r--r-- | apps/workers/workerUtils.ts | 3 |
2 files changed, 33 insertions, 6 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 252da3b2..16b1f4ae 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -41,6 +41,7 @@ import {
   getAssetSize,
   IMAGE_ASSET_TYPES,
   newAssetId,
+  readAsset,
   saveAsset,
   saveAssetFromFile,
   silentDeleteAsset,
@@ -582,14 +583,35 @@ async function crawlAndParseUrl(
   oldScreenshotAssetId: string | undefined,
   oldImageAssetId: string | undefined,
   oldFullPageArchiveAssetId: string | undefined,
+  precrawledArchiveAssetId: string | undefined,
   archiveFullPage: boolean,
 ) {
-  const {
-    htmlContent,
-    screenshot,
-    statusCode,
-    url: browserUrl,
-  } = await crawlPage(jobId, url);
+  let result: {
+    htmlContent: string;
+    screenshot: Buffer | undefined;
+    statusCode: number | null;
+    url: string;
+  };
+
+  if (precrawledArchiveAssetId) {
+    logger.info(
+      `[Crawler][${jobId}] The page has been precrawled. Will use the precrawled archive instead.`,
+    );
+    const asset = await readAsset({
+      userId,
+      assetId: precrawledArchiveAssetId,
+    });
+    result = {
+      htmlContent: asset.asset.toString(),
+      screenshot: undefined,
+      statusCode: 200,
+      url,
+    };
+  } else {
+    result = await crawlPage(jobId, url);
+  }
+
+  const { htmlContent, screenshot, statusCode, url: browserUrl } = result;
 
   const [meta, readableContent, screenshotAssetInfo] = await Promise.all([
     extractMetadata(htmlContent, browserUrl, jobId),
@@ -701,6 +723,7 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
     screenshotAssetId: oldScreenshotAssetId,
     imageAssetId: oldImageAssetId,
     fullPageArchiveAssetId: oldFullPageArchiveAssetId,
+    precrawledArchiveAssetId,
   } = await getBookmarkDetails(bookmarkId);
 
   logger.info(
@@ -730,6 +753,7 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
       oldScreenshotAssetId,
       oldImageAssetId,
       oldFullPageArchiveAssetId,
+      precrawledArchiveAssetId,
       archiveFullPage,
     );
 
diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts
index e93d241b..2b365c73 100644
--- a/apps/workers/workerUtils.ts
+++ b/apps/workers/workerUtils.ts
@@ -44,5 +44,8 @@ export async function getBookmarkDetails(bookmarkId: string) {
     videoAssetId: bookmark.assets.find(
       (a) => a.assetType == AssetTypes.LINK_VIDEO,
     )?.id,
+    precrawledArchiveAssetId: bookmark.assets.find(
+      (a) => a.assetType == AssetTypes.LINK_PRECRAWLED_ARCHIVE,
+    )?.id,
   };
 }
|
