rcgit

/ karakeep

Commit 18101001

SHA 18101001d334c96772ee56ec82ede33790f34fa8
Author Mohamed Bassem <me at mbassem dot com>
Author Date 2024-11-30 18:05 +0000
Committer Mohamed Bassem <me at mbassem dot com>
Commit Date 2024-11-30 18:05 +0000
Parent(s) 5a496916c386 (diff)
Tree 6b28e44768eb

patch snapshot

feature(workers): Allow running hoarder without chrome as a hard dependency. Fixes #650
File + - Graph
M apps/workers/crawlerWorker.ts +35 -11
1 file(s) changed, 35 insertions(+), 11 deletions(-)

apps/workers/crawlerWorker.ts

diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 376e50ea..3952a287 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,4 +1,3 @@
-import assert from "assert";
 import * as dns from "dns";
 import { promises as fs } from "fs";
 import * as path from "node:path";
@@ -104,11 +103,8 @@ async function startBrowserInstance() {
       defaultViewport,
     });
   } else {
-    logger.info(`Launching a new browser instance`);
-    return puppeteer.launch({
-      headless: serverConfig.crawler.headlessBrowser,
-      defaultViewport,
-    });
+    logger.info(`Running in browserless mode`);
+    return undefined;
   }
 }
 
@@ -130,7 +126,7 @@ async function launchBrowser() {
       }, 5000);
       return;
     }
-    globalBrowser.on("disconnected", () => {
+    globalBrowser?.on("disconnected", () => {
       if (isShuttingDown) {
         logger.info(
           "[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
@@ -238,15 +234,43 @@ function validateUrl(url: string) {
   }
 }
 
-async function crawlPage(jobId: string, url: string) {
-  let browser: Browser;
+async function browserlessCrawlPage(jobId: string, url: string) {
+  logger.info(
+    `[Crawler][${jobId}] Running in browserless mode. Will do a plain http request to "${url}". Screenshots will be disabled.`,
+  );
+  const response = await fetch(url, {
+    signal: AbortSignal.timeout(5000),
+  });
+  if (!response.ok) {
+    throw new Error(`Failed to crawl page: ${response.status}`);
+  }
+  logger.info(
+    `[Crawler][${jobId}] Successfully fetched the content of "${url}". Status: ${response.status}, Size: ${response.size}`,
+  );
+  return {
+    htmlContent: await response.text(),
+    screenshot: undefined,
+    url: response.url,
+  };
+}
+
+async function crawlPage(
+  jobId: string,
+  url: string,
+): Promise<{
+  htmlContent: string;
+  screenshot: Buffer | undefined;
+  url: string;
+}> {
+  let browser: Browser | undefined;
   if (serverConfig.crawler.browserConnectOnDemand) {
     browser = await startBrowserInstance();
   } else {
-    assert(globalBrowser);
     browser = globalBrowser;
   }
-  assert(browser);
+  if (!browser) {
+    return browserlessCrawlPage(jobId, url);
+  }
   const context = await browser.createBrowserContext();
 
   try {