From be1b7f7e1c0cb3d905e13aa1a95e295b816cbdeb Mon Sep 17 00:00:00 2001
From: kamtschatka
Date: Sat, 22 Jun 2024 18:52:40 +0200
Subject: feature: add support for PDF links. Fixes #28 (#216)

* feature request: pdf support #28

Added a new sourceUrl column to the asset bookmarks
Added transforming a link bookmark pointing at a pdf to an asset bookmark
Made sure the "View Original" link is also shown for asset bookmarks that have a sourceUrl
Updated gitignore for IDEA

* remove pdf parsing from the crawler

* extract the HTTP logic into its own function to avoid duplicating the post-processing actions (openai/index)

* Add 5s timeout to the content type fetch

---------

Co-authored-by: MohamedBassem
---
 apps/workers/crawlerWorker.ts | 220 +++++++++++++++++++++++++++++++-----------
 1 file changed, 163 insertions(+), 57 deletions(-)

(limited to 'apps/workers/crawlerWorker.ts')

diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 58f1aa85..eedb7b1e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,5 +1,6 @@
 import assert from "assert";
 import * as dns from "dns";
+import * as path from "node:path";
 import type { Job } from "bullmq";
 import type { Browser } from "puppeteer";
 import { Readability } from "@mozilla/readability";
@@ -26,8 +27,9 @@ import { withTimeout } from "utils";
 
 import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
 import { db } from "@hoarder/db";
-import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
+import { bookmarkAssets, bookmarkLinks, bookmarks } from "@hoarder/db/schema";
 import {
+  ASSET_TYPES,
   deleteAsset,
   newAssetId,
   saveAsset,
@@ -68,7 +70,7 @@ async function startBrowserInstance() {
     logger.info(
       `[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
     );
-    return await puppeteer.connect({
+    return puppeteer.connect({
       browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
       defaultViewport,
     });
@@ -83,13 +85,13 @@ async function startBrowserInstance() {
     logger.info(
       `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
     );
-    return await puppeteer.connect({
+    return puppeteer.connect({
       browserURL: webUrl.toString(),
       defaultViewport,
     });
   } else {
     logger.info(`Launching a new browser instance`);
-    return await puppeteer.launch({
+    return puppeteer.launch({
       headless: serverConfig.crawler.headlessBrowser,
       defaultViewport,
     });
@@ -271,7 +273,11 @@ async function crawlPage(jobId: string, url: string) {
     logger.info(
       `[Crawler][${jobId}] Finished capturing page content and a screenshot. FullPageScreenshot: ${serverConfig.crawler.fullPageScreenshot}`,
     );
-    return { htmlContent, screenshot, url: page.url() };
+    return {
+      htmlContent,
+      screenshot,
+      url: page.url(),
+    };
   } finally {
     await context.close();
   }
@@ -337,22 +343,17 @@ async function storeScreenshot(
   return assetId;
 }
 
-async function downloadAndStoreImage(
+async function downloadAndStoreFile(
   url: string,
   userId: string,
   jobId: string,
+  fileType: string,
 ) {
-  if (!serverConfig.crawler.downloadBannerImage) {
-    logger.info(
-      `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
-    );
-    return null;
-  }
   try {
-    logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`);
+    logger.info(`[Crawler][${jobId}] Downloading ${fileType} from "${url}"`);
     const response = await fetch(url);
     if (!response.ok) {
-      throw new Error(`Failed to download image: ${response.status}`);
+      throw new Error(`Failed to download ${fileType}: ${response.status}`);
     }
     const buffer = await response.arrayBuffer();
     const assetId = newAssetId();
@@ -370,18 +371,32 @@ async function downloadAndStoreImage(
     });
 
     logger.info(
-      `[Crawler][${jobId}] Downloaded the image as assetId: ${assetId}`,
+      `[Crawler][${jobId}] Downloaded ${fileType} as assetId: ${assetId}`,
     );
 
     return assetId;
   } catch (e) {
     logger.error(
-      `[Crawler][${jobId}] Failed to download and store image: ${e}`,
+      `[Crawler][${jobId}] Failed to download and store ${fileType}: ${e}`,
     );
     return null;
   }
 }
 
+async function downloadAndStoreImage(
+  url: string,
+  userId: string,
+  jobId: string,
+) {
+  if (!serverConfig.crawler.downloadBannerImage) {
+    logger.info(
+      `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
+    );
+    return null;
+  }
+  return downloadAndStoreFile(url, userId, jobId, "image");
+}
+
 async function archiveWebpage(
   html: string,
   url: string,
@@ -415,31 +430,70 @@ async function archiveWebpage(
   return assetId;
 }
 
-async function runCrawler(job: Job) {
-  const jobId = job.id ?? "unknown";
-
-  const request = zCrawlLinkRequestSchema.safeParse(job.data);
-  if (!request.success) {
+async function getContentType(
+  url: string,
+  jobId: string,
+): Promise<string | null> {
+  try {
+    logger.info(
+      `[Crawler][${jobId}] Attempting to determine the content-type for the url ${url}`,
+    );
+    const response = await fetch(url, {
+      method: "HEAD",
+      signal: AbortSignal.timeout(5000),
+    });
+    const contentType = response.headers.get("content-type");
+    logger.info(
+      `[Crawler][${jobId}] Content-type for the url ${url} is "${contentType}"`,
+    );
+    return contentType;
+  } catch (e) {
     logger.error(
-      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+      `[Crawler][${jobId}] Failed to determine the content-type for the url ${url}: ${e}`,
     );
-    return;
+    return null;
   }
+}
 
-  const { bookmarkId } = request.data;
-  const {
-    url,
-    userId,
-    screenshotAssetId: oldScreenshotAssetId,
-    imageAssetId: oldImageAssetId,
-    fullPageArchiveAssetId: oldFullPageArchiveAssetId,
-  } = await getBookmarkDetails(bookmarkId);
-
-  logger.info(
-    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
-  );
-  validateUrl(url);
+/**
+ * Downloads the pdf asset from the URL and transforms the linkBookmark to an assetBookmark
+ * @param url the url the user provided
+ * @param userId the id of the user
+ * @param jobId the id of the job for logging
+ * @param bookmarkId the id of the bookmark
+ */
+async function handlePDFAsAssetBookmark(
+  url: string,
+  userId: string,
+  jobId: string,
+  bookmarkId: string,
+) {
+  const assetId = await downloadAndStoreFile(url, userId, jobId, "pdf");
+  if (!assetId) {
+    return;
+  }
+  await db.transaction(async (trx) => {
+    await trx.insert(bookmarkAssets).values({
+      id: bookmarkId,
+      assetType: "pdf",
+      assetId,
+      content: null,
+      fileName: path.basename(new URL(url).pathname),
+      sourceUrl: url,
+    });
+    await trx.delete(bookmarkLinks).where(eq(bookmarkLinks.id, bookmarkId));
+  });
+}
+async function crawlAndParseUrl(
+  url: string,
+  userId: string,
+  jobId: string,
+  bookmarkId: string,
+  oldScreenshotAssetId: string | null,
+  oldImageAssetId: string | null,
+  oldFullPageArchiveAssetId: string | null,
+) {
 
   const {
     htmlContent,
     screenshot,
@@ -482,6 +536,78 @@ async function runCrawler(job: Job) {
       : {},
   ]);
 
+  return async () => {
+    if (serverConfig.crawler.fullPageArchive) {
+      const fullPageArchiveAssetId = await archiveWebpage(
+        htmlContent,
+        browserUrl,
+        userId,
+        jobId,
+      );
+
+      await db
+        .update(bookmarkLinks)
+        .set({
+          fullPageArchiveAssetId,
+        })
+        .where(eq(bookmarkLinks.id, bookmarkId));
+
+      if (oldFullPageArchiveAssetId) {
+        deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
+          () => ({}),
+        );
+      }
+    }
+  };
+}
+
+async function runCrawler(job: Job) {
+  const jobId = job.id ?? "unknown";
+
+  const request = zCrawlLinkRequestSchema.safeParse(job.data);
+  if (!request.success) {
+    logger.error(
+      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+    );
+    return;
+  }
+
+  const { bookmarkId } = request.data;
+  const {
+    url,
+    userId,
+    screenshotAssetId: oldScreenshotAssetId,
+    imageAssetId: oldImageAssetId,
+    fullPageArchiveAssetId: oldFullPageArchiveAssetId,
+  } = await getBookmarkDetails(bookmarkId);
+
+  logger.info(
+    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
+  );
+  validateUrl(url);
+
+  const contentType = await getContentType(url, jobId);
+
+  // Link bookmarks get transformed into asset bookmarks if they point to a pdf asset instead of a webpage
+  const isPdf = contentType === ASSET_TYPES.APPLICATION_PDF;
+
+  let archivalLogic: () => Promise<void> = () => {
+    return Promise.resolve();
+  };
+  if (isPdf) {
+    await handlePDFAsAssetBookmark(url, userId, jobId, bookmarkId);
+  } else {
+    archivalLogic = await crawlAndParseUrl(
+      url,
+      userId,
+      jobId,
+      bookmarkId,
+      oldScreenshotAssetId,
+      oldImageAssetId,
+      oldFullPageArchiveAssetId,
+    );
+  }
+
   // Enqueue openai job (if not set, assume it's true for backward compatibility)
   if (job.data.runInference !== false) {
     OpenAIQueue.add("openai", {
@@ -493,25 +619,5 @@ async function runCrawler(job: Job) {
   triggerSearchReindex(bookmarkId);
 
   // Do the archival as a separate last step as it has the potential for failure
-  if (serverConfig.crawler.fullPageArchive) {
-    const fullPageArchiveAssetId = await archiveWebpage(
-      htmlContent,
-      browserUrl,
-      userId,
-      jobId,
-    );
-
-    await db
-      .update(bookmarkLinks)
-      .set({
-        fullPageArchiveAssetId,
-      })
-      .where(eq(bookmarkLinks.id, bookmarkId));
-
-    if (oldFullPageArchiveAssetId) {
-      deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
-        () => ({}),
-      );
-    }
-  }
+  await archivalLogic();
 }
--
cgit v1.2.3-70-g09d2