From dedc5fb24536832eae2c18d84efa2a92272c955c Mon Sep 17 00:00:00 2001
From: MohamedBassem
Date: Sun, 26 May 2024 00:06:32 +0000
Subject: feature: Full page archival with monolith. Fixes #132

---
 apps/workers/crawlerWorker.ts | 66 ++++++++++++++++++++++++++++++++++++++++++-
 apps/workers/package.json     |  1 +
 2 files changed, 66 insertions(+), 1 deletion(-)

(limited to 'apps/workers')

diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index fe5bc43b..87632019 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -7,6 +7,7 @@ import { Mutex } from "async-mutex";
 import { Worker } from "bullmq";
 import DOMPurify from "dompurify";
 import { eq } from "drizzle-orm";
+import { execa } from "execa";
 import { isShuttingDown } from "exit";
 import { JSDOM } from "jsdom";
 import metascraper from "metascraper";
@@ -26,7 +27,12 @@ import { withTimeout } from "utils";
 import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
 import { db } from "@hoarder/db";
 import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
-import { deleteAsset, newAssetId, saveAsset } from "@hoarder/shared/assetdb";
+import {
+  deleteAsset,
+  newAssetId,
+  saveAsset,
+  saveAssetFromFile,
+} from "@hoarder/shared/assetdb";
 import serverConfig from "@hoarder/shared/config";
 import logger from "@hoarder/shared/logger";
 import {
@@ -197,6 +203,7 @@ async function getBookmarkDetails(bookmarkId: string) {
     userId: bookmark.userId,
     screenshotAssetId: bookmark.link.screenshotAssetId,
     imageAssetId: bookmark.link.imageAssetId,
+    fullPageArchiveAssetId: bookmark.link.fullPageArchiveAssetId,
   };
 }
 
@@ -375,6 +382,42 @@ async function downloadAndStoreImage(
   }
 }
 
+async function archiveWebpage(
+  html: string,
+  url: string,
+  userId: string,
+  jobId: string,
+) {
+  if (!serverConfig.crawler.fullPageArchive) {
+    return;
+  }
+  logger.info(`[Crawler][${jobId}] Will attempt to archive page ...`);
+  const urlParsed = new URL(url);
+  const baseUrl = `${urlParsed.protocol}//${urlParsed.host}`;
+
+  const assetId = newAssetId();
+  const assetPath = `/tmp/${assetId}`;
+
+  await execa({
+    input: html,
+  })`monolith - -Ije -t 5 -b ${baseUrl} -o ${assetPath}`;
+
+  await saveAssetFromFile({
+    userId,
+    assetId,
+    assetPath,
+    metadata: {
+      contentType: "text/html",
+    },
+  });
+
+  logger.info(
+    `[Crawler][${jobId}] Done archiving the page as assetId: ${assetId}`,
+  );
+
+  return assetId;
+}
+
 async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
   const jobId = job.id ?? "unknown";
 
@@ -392,6 +435,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
     userId,
     screenshotAssetId: oldScreenshotAssetId,
     imageAssetId: oldImageAssetId,
+    fullPageArchiveAssetId: oldFullPageArchiveAssetId,
   } = await getBookmarkDetails(bookmarkId);
 
   logger.info(
@@ -453,4 +497,24 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
     bookmarkId,
     type: "index",
   });
+
+  // Do the archival as a separate last step as it has the potential for failure
+  const fullPageArchiveAssetId = await archiveWebpage(
+    htmlContent,
+    browserUrl,
+    userId,
+    jobId,
+  );
+  await db
+    .update(bookmarkLinks)
+    .set({
+      fullPageArchiveAssetId,
+    })
+    .where(eq(bookmarkLinks.id, bookmarkId));
+
+  if (oldFullPageArchiveAssetId) {
+    deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
+      () => ({}),
+    );
+  }
 }
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 7975cc84..b74f9ec9 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -14,6 +14,7 @@
     "dompurify": "^3.0.9",
     "dotenv": "^16.4.1",
     "drizzle-orm": "^0.29.4",
+    "execa": "^9.1.0",
     "jsdom": "^24.0.0",
     "metascraper": "^5.43.4",
     "metascraper-amazon": "^5.45.0",
-- 
cgit v1.2.3-70-g09d2