author     MohamedBassem <me@mbassem.com>    2024-04-19 20:01:51 +0100
committer  Mohamed Bassem <me@mbassem.com>   2024-04-20 00:05:31 +0100
commit     4402e6f04170cbb0613d35fe94471162253e91b2 (patch)
tree       696f6511cefa7d1c6bc3a1f8bc2de755870310cc /apps/workers
parent     b4a13ce3d92ee505124fc98804935c1122978550 (diff)
download   karakeep-4402e6f04170cbb0613d35fe94471162253e91b2.tar.zst
feature: Download images and screenshots
Diffstat (limited to 'apps/workers')
-rw-r--r--  apps/workers/crawlerWorker.ts  158
1 file changed, 130 insertions(+), 28 deletions(-)
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 91b0a03f..27e9e14c 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -24,7 +24,8 @@ import { withTimeout } from "utils";
import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
-import { bookmarkLinks } from "@hoarder/db/schema";
+import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
+import { newAssetId, saveAsset } from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
import logger from "@hoarder/shared/logger";
import {
@@ -155,15 +156,16 @@ async function changeBookmarkStatus(
.where(eq(bookmarkLinks.id, bookmarkId));
}
-async function getBookmarkUrl(bookmarkId: string) {
- const bookmark = await db.query.bookmarkLinks.findFirst({
- where: eq(bookmarkLinks.id, bookmarkId),
+async function getBookmarkDetails(bookmarkId: string) {
+ const bookmark = await db.query.bookmarks.findFirst({
+ where: eq(bookmarks.id, bookmarkId),
+ with: { link: true },
});
- if (!bookmark) {
+ if (!bookmark || !bookmark.link) {
throw new Error("The bookmark either doesn't exist or not a link");
}
- return bookmark.url;
+ return { url: bookmark.link.url, userId: bookmark.userId };
}
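
A note on the query above: `with: { link: true }` relies on a Drizzle relational mapping between `bookmarks` and `bookmarkLinks`. A sketch of the relation this presumes (the actual definition lives in @hoarder/db/schema, not in this diff, so the names here are assumptions):

    import { relations } from "drizzle-orm";

    export const bookmarksRelations = relations(bookmarks, ({ one }) => ({
      link: one(bookmarkLinks, {
        fields: [bookmarks.id],
        references: [bookmarkLinks.id],
      }),
    }));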
/**
@@ -208,13 +210,116 @@ async function crawlPage(jobId: string, url: string) {
logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`);
- const htmlContent = await page.content();
- return htmlContent;
+ const [htmlContent, screenshot] = await Promise.all([
+ page.content(),
+ page.screenshot({
+ // If you change this, you need to change the asset type in the store function.
+ type: "png",
+ encoding: "binary",
+ }),
+ ]);
+ logger.info(
+ `[Crawler][${jobId}] Finished capturing page content and a screenshot.`,
+ );
+ return { htmlContent, screenshot, url: page.url() };
} finally {
await context.close();
}
}
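
The screenshot options here (`type: "png"`, `encoding: "binary"`) match Puppeteer's page.screenshot API, where `encoding: "binary"` resolves to a Buffer. A minimal self-contained sketch of the capture step under that assumption (the real worker reuses a shared browser context and its own wait logic):

    import puppeteer from "puppeteer";

    async function capture(url: string) {
      const browser = await puppeteer.launch();
      const page = await browser.newPage();
      try {
        await page.goto(url, { waitUntil: "networkidle2" });
        const [htmlContent, screenshot] = await Promise.all([
          page.content(),                                        // serialized DOM
          page.screenshot({ type: "png", encoding: "binary" }),  // Buffer
        ]);
        // page.url() is the final URL after any redirects, which is why
        // crawlPage returns it alongside the content.
        return { htmlContent, screenshot, url: page.url() };
      } finally {
        await browser.close();
      }
    }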
+async function extractMetadata(
+ htmlContent: string,
+ url: string,
+ jobId: string,
+) {
+ logger.info(
+ `[Crawler][${jobId}] Will attempt to extract metadata from page ...`,
+ );
+ const meta = await metascraperParser({
+ url,
+ html: htmlContent,
+ // We don't want to validate the URL again as we've already done it by visiting the page.
+ // This was added because URL validation fails if the URL ends with a question mark (e.g. empty query params).
+ validateUrl: false,
+ });
+ logger.info(`[Crawler][${jobId}] Done extracting metadata from the page.`);
+ return meta;
+}
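
`metascraperParser` itself isn't defined in this hunk. With the metascraper package it is typically composed from rule bundles; a sketch under that assumption (the bundle list is a guess based on the fields used later in the diff: `meta.image`, `meta.logo`):

    import metascraper from "metascraper";
    import metascraperDescription from "metascraper-description";
    import metascraperImage from "metascraper-image";
    import metascraperLogo from "metascraper-logo-favicon";
    import metascraperTitle from "metascraper-title";

    // The returned function is what extractMetadata calls with
    // { url, html, validateUrl }.
    const metascraperParser = metascraper([
      metascraperTitle(),
      metascraperDescription(),
      metascraperImage(),
      metascraperLogo(),
    ]);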
+
+function extractReadableContent(
+ htmlContent: string,
+ url: string,
+ jobId: string,
+) {
+ logger.info(
+ `[Crawler][${jobId}] Will attempt to extract readable content ...`,
+ );
+ const window = new JSDOM("").window;
+ const purify = DOMPurify(window);
+ const purifiedHTML = purify.sanitize(htmlContent);
+ const purifiedDOM = new JSDOM(purifiedHTML, { url });
+ const readableContent = new Readability(purifiedDOM.window.document).parse();
+ logger.info(`[Crawler][${jobId}] Done extracting readable content.`);
+ return readableContent;
+}
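
This helper sanitizes before parsing: DOMPurify strips scripts and other unsafe markup first, and only then does Readability walk the DOM. Its dependencies are presumably imported at the top of the file (outside this hunk); for reference, a sketch of the assumed imports and the shape `.parse()` returns:

    import { Readability } from "@mozilla/readability";
    import DOMPurify from "dompurify";
    import { JSDOM } from "jsdom";

    // new Readability(doc).parse() yields null on failure, or an object
    // including content (simplified HTML) and textContent (plain text),
    // the two fields the DB update below persists.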
+
+async function storeScreenshot(
+ screenshot: Buffer,
+ userId: string,
+ jobId: string,
+) {
+ const assetId = newAssetId();
+ await saveAsset({
+ userId,
+ assetId,
+ metadata: { contentType: "image/png", fileName: "screenshot.png" },
+ asset: screenshot,
+ });
+ logger.info(
+ `[Crawler][${jobId}] Stored the screenshot as assetId: ${assetId}`,
+ );
+ return assetId;
+}
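
The `"image/png"` literal here is what the inline comment in crawlPage warns about: it must stay in sync with the `type: "png"` passed to page.screenshot. A hypothetical sketch of making that coupling explicit (SCREENSHOT_TYPE is not in the commit):

    const SCREENSHOT_TYPE = {
      format: "png" as const,    // for page.screenshot({ type: ... })
      contentType: "image/png",  // for saveAsset metadata.contentType
      fileName: "screenshot.png",
    };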
+
+async function downloadAndStoreImage(
+ url: string,
+ userId: string,
+ jobId: string,
+) {
+ try {
+ logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`);
+ const response = await fetch(url);
+ if (!response.ok) {
+ throw new Error(`Failed to download image: ${response.status}`);
+ }
+ const buffer = await response.arrayBuffer();
+ const assetId = newAssetId();
+
+ const contentType = response.headers.get("content-type");
+ if (!contentType) {
+ throw new Error("No content type in the response");
+ }
+
+ await saveAsset({
+ userId,
+ assetId,
+ metadata: { contentType },
+ asset: Buffer.from(buffer),
+ });
+
+ logger.info(
+ `[Crawler][${jobId}] Downloaded the image as assetId: ${assetId}`,
+ );
+
+ return assetId;
+ } catch (e) {
+ logger.error(
+ `[Crawler][${jobId}] Failed to download and store image: ${e}`,
+ );
+ return null;
+ }
+}
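
This fetch has no timeout or size cap, a gap in the same spirit as the TODO on content size further down. A sketch of one way to bound it, assuming the standard AbortController API (the 10s default is an arbitrary example, not part of the commit):

    async function fetchWithTimeout(url: string, ms = 10_000) {
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), ms);
      try {
        // fetch rejects with an AbortError once the controller fires.
        return await fetch(url, { signal: controller.signal });
      } finally {
        clearTimeout(timer);
      }
    }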
+
async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
const jobId = job.id ?? "unknown";
@@ -227,35 +332,30 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
}
const { bookmarkId } = request.data;
- const url = await getBookmarkUrl(bookmarkId);
+ const { url, userId } = await getBookmarkDetails(bookmarkId);
logger.info(
`[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
);
validateUrl(url);
- const htmlContent = await crawlPage(jobId, url);
-
- logger.info(
- `[Crawler][${jobId}] Will attempt to parse the content of the page ...`,
- );
- const meta = await metascraperParser({
- url,
- html: htmlContent,
- // We don't want to validate the URL again as we've already done it by visiting the page.
- // This was added because URL validation fails if the URL ends with a question mark (e.g. empty query params).
- validateUrl: false,
- });
- logger.info(`[Crawler][${jobId}] Done parsing the content of the page.`);
+ const {
+ htmlContent,
+ screenshot,
+ url: browserUrl,
+ } = await crawlPage(jobId, url);
- const window = new JSDOM("").window;
- const purify = DOMPurify(window);
- const purifiedHTML = purify.sanitize(htmlContent);
- const purifiedDOM = new JSDOM(purifiedHTML, { url });
- const readableContent = new Readability(purifiedDOM.window.document).parse();
+ const [meta, readableContent, screenshotAssetId] = await Promise.all([
+ extractMetadata(htmlContent, browserUrl, jobId),
+ extractReadableContent(htmlContent, browserUrl, jobId),
+ storeScreenshot(screenshot, userId, jobId),
+ ]);
+ let imageAssetId: string | null = null;
+ if (meta.image) {
+ imageAssetId = await downloadAndStoreImage(meta.image, userId, jobId);
+ }
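
The image download sits outside the Promise.all because it depends on `meta.image`. If the extra latency mattered, it could overlap with the other work by chaining off the metadata promise; a sketch of that alternative shape (not what the commit does):

    const metaPromise = extractMetadata(htmlContent, browserUrl, jobId);
    const [meta, readableContent, screenshotAssetId, imageAssetId] =
      await Promise.all([
        metaPromise,
        extractReadableContent(htmlContent, browserUrl, jobId),
        storeScreenshot(screenshot, userId, jobId),
        metaPromise.then((m) =>
          m.image ? downloadAndStoreImage(m.image, userId, jobId) : null,
        ),
      ]);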
// TODO(important): Restrict the size of content to store
-
await db
.update(bookmarkLinks)
.set({
@@ -265,6 +365,8 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
favicon: meta.logo,
content: readableContent?.textContent,
htmlContent: readableContent?.content,
+ screenshotAssetId,
+ imageAssetId,
crawledAt: new Date(),
})
.where(eq(bookmarkLinks.id, bookmarkId));
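
Since the diffstat is limited to apps/workers, the matching schema change isn't shown here, but the `.set()` call above implies bookmarkLinks gained two nullable columns. Presumably something along these lines (table and column spellings are assumptions; Hoarder's schema is Drizzle on SQLite):

    import { sqliteTable, text } from "drizzle-orm/sqlite-core";

    export const bookmarkLinks = sqliteTable("bookmarkLinks", {
      // ...existing columns...
      screenshotAssetId: text("screenshotAssetId"),
      imageAssetId: text("imageAssetId"),
    });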