Diffstat (limited to 'apps')
 apps/web/app/api/assets/route.ts                             |  4 ++--
 apps/web/components/dashboard/preview/LinkContentSection.tsx | 18 ++++++++++
 apps/workers/crawlerWorker.ts                                | 66 +++++++++++++-
 apps/workers/package.json                                    |  1 +
 4 files changed, 86 insertions(+), 3 deletions(-)
diff --git a/apps/web/app/api/assets/route.ts b/apps/web/app/api/assets/route.ts
index f1a17fc9..9028f556 100644
--- a/apps/web/app/api/assets/route.ts
+++ b/apps/web/app/api/assets/route.ts
@@ -5,7 +5,7 @@ import type { ZUploadResponse } from "@hoarder/shared/types/uploads";
import {
newAssetId,
saveAsset,
- SUPPORTED_ASSET_TYPES,
+ SUPPORTED_UPLOAD_ASSET_TYPES,
} from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
@@ -29,7 +29,7 @@ export async function POST(request: Request) {
let contentType;
if (data instanceof File) {
contentType = data.type;
- if (!SUPPORTED_ASSET_TYPES.has(contentType)) {
+ if (!SUPPORTED_UPLOAD_ASSET_TYPES.has(contentType)) {
return Response.json(
{ error: "Unsupported asset type" },
{ status: 400 },
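
The rename narrows the MIME types accepted at the upload endpoint without shrinking what the asset store itself can hold; the new constant's definition lives in `@hoarder/shared/assetdb` and is not part of this diff. A minimal sketch of one plausible shape, assuming archives add `text/html` as a store-only type that users must not upload directly (the concrete type list here is an assumption):

```ts
// Hypothetical sketch -- the real definitions live in @hoarder/shared/assetdb.
// Everything the asset store can hold:
export const SUPPORTED_ASSET_TYPES = new Set<string>([
  "image/jpeg",
  "image/png",
  "application/pdf",
  "text/html", // full-page archives, produced only by the crawler worker
]);

// The subset a user may upload directly; archives stay worker-only:
export const SUPPORTED_UPLOAD_ASSET_TYPES = new Set<string>(
  [...SUPPORTED_ASSET_TYPES].filter((t) => t !== "text/html"),
);
```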
diff --git a/apps/web/components/dashboard/preview/LinkContentSection.tsx b/apps/web/components/dashboard/preview/LinkContentSection.tsx
index 29001c7f..3aeacdcd 100644
--- a/apps/web/components/dashboard/preview/LinkContentSection.tsx
+++ b/apps/web/components/dashboard/preview/LinkContentSection.tsx
@@ -12,6 +12,16 @@ import { ScrollArea } from "@radix-ui/react-scroll-area";
import { ZBookmark, ZBookmarkedLink } from "@hoarder/shared/types/bookmarks";
+function FullPageArchiveSection({ link }: { link: ZBookmarkedLink }) {
+ return (
+ <iframe
+ title={link.url}
+ src={`/api/assets/${link.fullPageArchiveAssetId}`}
+ className="relative h-full min-w-full"
+ />
+ );
+}
+
function ScreenshotSection({ link }: { link: ZBookmarkedLink }) {
return (
<div className="relative h-full min-w-full">
@@ -60,6 +70,8 @@ export default function LinkContentSection({
let content;
if (section === "cached") {
content = <CachedContentSection link={bookmark.content} />;
+ } else if (section === "archive") {
+ content = <FullPageArchiveSection link={bookmark.content} />;
} else {
content = <ScreenshotSection link={bookmark.content} />;
}
@@ -79,6 +91,12 @@ export default function LinkContentSection({
>
Screenshot
</SelectItem>
+ <SelectItem
+ value="archive"
+ disabled={!bookmark.content.fullPageArchiveAssetId}
+ >
+ Archive
+ </SelectItem>
</SelectGroup>
</SelectContent>
</Select>
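
For orientation, the `section` value that drives both the `content` switch and the new `SelectItem` is plain component state; a minimal sketch of the wiring this diff plugs into, assuming a `useState` hook (the `Section` type alias and hook name are hypothetical):

```tsx
import { useState } from "react";

// Hypothetical sketch; only the "archive" variant is new in this diff.
type Section = "cached" | "archive" | "screenshot";

function useSectionState() {
  // The Select in the diff would be wired roughly as
  // <Select value={section} onValueChange={(v) => setSection(v as Section)}>.
  const [section, setSection] = useState<Section>("cached");
  return { section, setSection };
}
```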
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index fe5bc43b..87632019 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -7,6 +7,7 @@ import { Mutex } from "async-mutex";
import { Worker } from "bullmq";
import DOMPurify from "dompurify";
import { eq } from "drizzle-orm";
+import { execa } from "execa";
import { isShuttingDown } from "exit";
import { JSDOM } from "jsdom";
import metascraper from "metascraper";
@@ -26,7 +27,12 @@ import { withTimeout } from "utils";
import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
-import { deleteAsset, newAssetId, saveAsset } from "@hoarder/shared/assetdb";
+import {
+ deleteAsset,
+ newAssetId,
+ saveAsset,
+ saveAssetFromFile,
+} from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
import logger from "@hoarder/shared/logger";
import {
@@ -197,6 +203,7 @@ async function getBookmarkDetails(bookmarkId: string) {
userId: bookmark.userId,
screenshotAssetId: bookmark.link.screenshotAssetId,
imageAssetId: bookmark.link.imageAssetId,
+ fullPageArchiveAssetId: bookmark.link.fullPageArchiveAssetId,
};
}
@@ -375,6 +382,42 @@ async function downloadAndStoreImage(
}
}
+async function archiveWebpage(
+ html: string,
+ url: string,
+ userId: string,
+ jobId: string,
+) {
+ if (!serverConfig.crawler.fullPageArchive) {
+ return;
+ }
+ logger.info(`[Crawler][${jobId}] Will attempt to archive page ...`);
+ const urlParsed = new URL(url);
+ const baseUrl = `${urlParsed.protocol}//${urlParsed.host}`;
+
+ const assetId = newAssetId();
+ const assetPath = `/tmp/${assetId}`;
+
+ await execa({
+ input: html,
+ })`monolith - -Ije -t 5 -b ${baseUrl} -o ${assetPath}`;
+
+ await saveAssetFromFile({
+    userId,
+    assetId,
+    assetPath,
+ metadata: {
+      contentType: "text/html",
+ },
+ });
+
+ logger.info(
+    `[Crawler][${jobId}] Done archiving the page as assetId: ${assetId}`,
+ );
+
+ return assetId;
+}
+
async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
const jobId = job.id ?? "unknown";
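
The `execa({ input: html })`-tagged template above uses execa 9's template-literal API: interpolations become individual arguments and the options object pipes the rendered HTML to monolith's stdin (the `-` argument). An equivalent long-form call, with the flags glossed from monolith's CLI help (worth re-checking against the installed version):

```ts
import { execa } from "execa";

// Stand-in values; in the worker these come from the crawl job.
const html = "<html><body>rendered page</body></html>";
const baseUrl = "https://example.com";
const assetPath = "/tmp/example-archive.html";

// monolith reads the page from stdin ("-"); -I isolates the document from
// the network, -j strips JavaScript, -e ignores network errors, -t 5 caps
// network requests at 5 seconds, -b sets the base URL for resolving
// relative links, and -o writes the self-contained bundle to assetPath.
await execa(
  "monolith",
  ["-", "-Ije", "-t", "5", "-b", baseUrl, "-o", assetPath],
  { input: html },
);
```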
@@ -392,6 +435,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
userId,
screenshotAssetId: oldScreenshotAssetId,
imageAssetId: oldImageAssetId,
+ fullPageArchiveAssetId: oldFullPageArchiveAssetId,
} = await getBookmarkDetails(bookmarkId);
logger.info(
@@ -453,4 +497,24 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
bookmarkId,
type: "index",
});
+
+  // Do the archival as a separate last step, since it's the step most likely to fail
+ const fullPageArchiveAssetId = await archiveWebpage(
+    htmlContent,
+    browserUrl,
+    userId,
+    jobId,
+ );
+ await db
+    .update(bookmarkLinks)
+    .set({
+      fullPageArchiveAssetId,
+    })
+    .where(eq(bookmarkLinks.id, bookmarkId));
+
+ if (oldFullPageArchiveAssetId) {
+    deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
+      () => ({}),
+    );
+ }
}
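
`saveAssetFromFile` is imported above but defined elsewhere in `@hoarder/shared/assetdb`; a rough sketch of the shape implied by the call site, assuming a directory-per-asset layout (the storage root, file names, and layout are assumptions, not the repo's actual implementation):

```ts
// Hypothetical sketch -- not the actual @hoarder/shared/assetdb code.
import * as fs from "node:fs/promises";
import * as path from "node:path";

const ASSET_ROOT = "/data/assets"; // assumed storage root

export async function saveAssetFromFile(opts: {
  userId: string;
  assetId: string;
  assetPath: string; // temp file produced by the worker, e.g. under /tmp
  metadata: { contentType: string };
}) {
  const dir = path.join(ASSET_ROOT, opts.userId, opts.assetId);
  await fs.mkdir(dir, { recursive: true });
  // Copy + unlink rather than rename: /tmp is often a separate filesystem,
  // and rename(2) cannot cross filesystem boundaries.
  await fs.copyFile(opts.assetPath, path.join(dir, "asset.bin"));
  await fs.unlink(opts.assetPath);
  await fs.writeFile(
    path.join(dir, "metadata.json"),
    JSON.stringify(opts.metadata),
  );
}
```

Note also the fire-and-forget `deleteAsset(...).catch(() => ({}))` above: a stale archive left behind is tolerable, so cleanup failures deliberately do not fail the crawl job.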
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 7975cc84..b74f9ec9 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -14,6 +14,7 @@
"dompurify": "^3.0.9",
"dotenv": "^16.4.1",
"drizzle-orm": "^0.29.4",
+ "execa": "^9.1.0",
"jsdom": "^24.0.0",
"metascraper": "^5.43.4",
"metascraper-amazon": "^5.45.0",