From dedc5fb24536832eae2c18d84efa2a92272c955c Mon Sep 17 00:00:00 2001
From: MohamedBassem
Date: Sun, 26 May 2024 00:06:32 +0000
Subject: feature: Full page archival with monolith. Fixes #132

---
 apps/workers/crawlerWorker.ts | 66 ++++++++++++++++++++++++++++++++++++++++++-
 apps/workers/package.json     |  1 +
 2 files changed, 66 insertions(+), 1 deletion(-)

(limited to 'apps/workers')

diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index fe5bc43b..87632019 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -7,6 +7,7 @@ import { Mutex } from "async-mutex";
 import { Worker } from "bullmq";
 import DOMPurify from "dompurify";
 import { eq } from "drizzle-orm";
+import { execa } from "execa";
 import { isShuttingDown } from "exit";
 import { JSDOM } from "jsdom";
 import metascraper from "metascraper";
@@ -26,7 +27,12 @@ import { withTimeout } from "utils";
 import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
 import { db } from "@hoarder/db";
 import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
-import { deleteAsset, newAssetId, saveAsset } from "@hoarder/shared/assetdb";
+import {
+  deleteAsset,
+  newAssetId,
+  saveAsset,
+  saveAssetFromFile,
+} from "@hoarder/shared/assetdb";
 import serverConfig from "@hoarder/shared/config";
 import logger from "@hoarder/shared/logger";
 import {
@@ -197,6 +203,7 @@ async function getBookmarkDetails(bookmarkId: string) {
     userId: bookmark.userId,
     screenshotAssetId: bookmark.link.screenshotAssetId,
     imageAssetId: bookmark.link.imageAssetId,
+    fullPageArchiveAssetId: bookmark.link.fullPageArchiveAssetId,
   };
 }
 
@@ -375,6 +382,42 @@ async function downloadAndStoreImage(
   }
 }
 
+async function archiveWebpage(
+  html: string,
+  url: string,
+  userId: string,
+  jobId: string,
+) {
+  if (!serverConfig.crawler.fullPageArchive) {
+    return;
+  }
+  logger.info(`[Crawler][${jobId}] Will attempt to archive page ...`);
+  const urlParsed = new URL(url);
+  const baseUrl = `${urlParsed.protocol}//${urlParsed.host}`;
+
+  const assetId = newAssetId();
+  const assetPath = `/tmp/${assetId}`;
+
+  await execa({
+    input: html,
+  })`monolith - -Ije -t 5 -b ${baseUrl} -o ${assetPath}`;
+
+  await saveAssetFromFile({
+    userId,
+    assetId,
+    assetPath,
+    metadata: {
+      contentType: "text/html",
+    },
+  });
+
+  logger.info(
+    `[Crawler][${jobId}] Done archiving the page as assetId: ${assetId}`,
+  );
+
+  return assetId;
+}
+
 async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
   const jobId = job.id ?? "unknown";
 
@@ -392,6 +435,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
     userId,
     screenshotAssetId: oldScreenshotAssetId,
     imageAssetId: oldImageAssetId,
+    fullPageArchiveAssetId: oldFullPageArchiveAssetId,
   } = await getBookmarkDetails(bookmarkId);
 
   logger.info(
@@ -453,4 +497,24 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
     bookmarkId,
     type: "index",
   });
+
+  // Do the archival as a separate last step as it has the potential for failure
+  const fullPageArchiveAssetId = await archiveWebpage(
+    htmlContent,
+    browserUrl,
+    userId,
+    jobId,
+  );
+  await db
+    .update(bookmarkLinks)
+    .set({
+      fullPageArchiveAssetId,
+    })
+    .where(eq(bookmarkLinks.id, bookmarkId));
+
+  if (oldFullPageArchiveAssetId) {
+    deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
+      () => ({}),
+    );
+  }
 }
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 7975cc84..b74f9ec9 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -14,6 +14,7 @@
     "dompurify": "^3.0.9",
     "dotenv": "^16.4.1",
     "drizzle-orm": "^0.29.4",
+    "execa": "^9.1.0",
     "jsdom": "^24.0.0",
     "metascraper": "^5.43.4",
     "metascraper-amazon": "^5.45.0",
-- 
cgit v1.2.3-70-g09d2