author    MohamedBassem <me@mbassem.com>  2024-05-26 00:06:32 +0000
committer MohamedBassem <me@mbassem.com>  2024-05-26 10:11:53 +0000
commit    dedc5fb24536832eae2c18d84efa2a92272c955c (patch)
tree      4b9540b819db892fa6bc66a29cf8fc790d06ea67 /apps/workers
parent    033e8a2d26bb0ecaa8301609960d35d3467a88f4 (diff)
feature: Full page archival with monolith. Fixes #132
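
The crawler already holds the page's HTML by the time this step runs, so the
archive is produced by piping that HTML into monolith over stdin rather than
letting monolith re-fetch the URL. A minimal standalone sketch of the same
invocation (the HTML, base URL, and output path below are placeholders; the
flag comments reflect monolith's documented CLI options):

    import { execa } from "execa";

    const html = "<html><body>hello</body></html>"; // captured page HTML
    const baseUrl = "https://example.com"; // origin for resolving relative links
    const outFile = "/tmp/archive.html";

    // "-" tells monolith to read the document from stdin instead of fetching it.
    // -I isolates the page with a strict CSP, -j strips JavaScript, -e ignores
    // network errors on subresources, -t 5 sets a 5-second network timeout,
    // -b sets the base URL, and -o names the output file.
    await execa({ input: html })`monolith - -Ije -t 5 -b ${baseUrl} -o ${outFile}`;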
Diffstat (limited to 'apps/workers')
-rw-r--r--  apps/workers/crawlerWorker.ts  66
-rw-r--r--  apps/workers/package.json       1
2 files changed, 66 insertions(+), 1 deletion(-)
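
The new step is opt-in: archiveWebpage returns early unless
serverConfig.crawler.fullPageArchive is set (see the gate at the top of the
function below). As a rough illustration of how such a flag is typically wired
up from an environment variable (the variable name and shape here are
assumptions; the real definition lives in @hoarder/shared/config):

    // Hypothetical config wiring, not the actual @hoarder/shared/config code.
    const crawler = {
      // Enable with e.g. CRAWLER_FULL_PAGE_ARCHIVE=true in the worker's env.
      fullPageArchive:
        (process.env.CRAWLER_FULL_PAGE_ARCHIVE ?? "false") === "true",
    };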
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index fe5bc43b..87632019 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -7,6 +7,7 @@ import { Mutex } from "async-mutex";
import { Worker } from "bullmq";
import DOMPurify from "dompurify";
import { eq } from "drizzle-orm";
+import { execa } from "execa";
import { isShuttingDown } from "exit";
import { JSDOM } from "jsdom";
import metascraper from "metascraper";
@@ -26,7 +27,12 @@ import { withTimeout } from "utils";
import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
-import { deleteAsset, newAssetId, saveAsset } from "@hoarder/shared/assetdb";
+import {
+ deleteAsset,
+ newAssetId,
+ saveAsset,
+ saveAssetFromFile,
+} from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
import logger from "@hoarder/shared/logger";
import {
@@ -197,6 +203,7 @@ async function getBookmarkDetails(bookmarkId: string) {
userId: bookmark.userId,
screenshotAssetId: bookmark.link.screenshotAssetId,
imageAssetId: bookmark.link.imageAssetId,
+ fullPageArchiveAssetId: bookmark.link.fullPageArchiveAssetId,
};
}
@@ -375,6 +382,42 @@ async function downloadAndStoreImage(
}
}
+async function archiveWebpage(
+ html: string,
+ url: string,
+ userId: string,
+ jobId: string,
+) {
+ if (!serverConfig.crawler.fullPageArchive) {
+ return;
+ }
+ logger.info(`[Crawler][${jobId}] Will attempt to archive the page ...`);
+ const urlParsed = new URL(url);
+ const baseUrl = `${urlParsed.protocol}//${urlParsed.host}`;
+
+ const assetId = newAssetId();
+ const assetPath = `/tmp/${assetId}`;
+
+ await execa({
+ input: html,
+ })`monolith - -Ije -t 5 -b ${baseUrl} -o ${assetPath}`;
+
+ await saveAssetFromFile({
+ userId,
+ assetId,
+ assetPath,
+ metadata: {
+ contentType: "text/html",
+ },
+ });
+
+ logger.info(
+    `[Crawler][${jobId}] Done archiving the page as assetId: ${assetId}`,
+ );
+
+ return assetId;
+}
+
async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
const jobId = job.id ?? "unknown";
@@ -392,6 +435,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
userId,
screenshotAssetId: oldScreenshotAssetId,
imageAssetId: oldImageAssetId,
+ fullPageArchiveAssetId: oldFullPageArchiveAssetId,
} = await getBookmarkDetails(bookmarkId);
logger.info(
@@ -453,4 +497,24 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
bookmarkId,
type: "index",
});
+
+  // Run the archival as a separate last step, since it is the step most likely to fail
+ const fullPageArchiveAssetId = await archiveWebpage(
+ htmlContent,
+ browserUrl,
+ userId,
+ jobId,
+ );
+ await db
+ .update(bookmarkLinks)
+ .set({
+ fullPageArchiveAssetId,
+ })
+ .where(eq(bookmarkLinks.id, bookmarkId));
+
+ if (oldFullPageArchiveAssetId) {
+ deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
+ () => ({}),
+ );
+ }
}
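
Note: saveAssetFromFile is consumed above but defined in @hoarder/shared/assetdb,
which is outside this diff. For orientation, a hypothetical sketch of the
contract the call assumes, i.e. a simple per-user on-disk store (ASSET_ROOT,
asset.bin, and metadata.json are invented names; the real implementation may
differ, notably in whether it deletes the temp file):

    import * as fs from "node:fs/promises";
    import * as path from "node:path";

    const ASSET_ROOT = "/data/assets"; // assumed storage root

    async function saveAssetFromFileSketch(opts: {
      userId: string;
      assetId: string;
      assetPath: string; // temp file produced by monolith under /tmp
      metadata: { contentType: string };
    }): Promise<void> {
      const dir = path.join(ASSET_ROOT, opts.userId, opts.assetId);
      await fs.mkdir(dir, { recursive: true });
      // Copy the archive into the store and record its content type, then
      // remove the temp file so /tmp does not accumulate page archives.
      await fs.copyFile(opts.assetPath, path.join(dir, "asset.bin"));
      await fs.writeFile(
        path.join(dir, "metadata.json"),
        JSON.stringify(opts.metadata),
      );
      await fs.rm(opts.assetPath, { force: true });
    }

The only dependency change this requires is execa, added below; version 9 is
what provides the options-bound template-tag form (execa({ input })`...`) used
in archiveWebpage.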
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 7975cc84..b74f9ec9 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -14,6 +14,7 @@
"dompurify": "^3.0.9",
"dotenv": "^16.4.1",
"drizzle-orm": "^0.29.4",
+ "execa": "^9.1.0",
"jsdom": "^24.0.0",
"metascraper": "^5.43.4",
"metascraper-amazon": "^5.45.0",