author    MohamedBassem <me@mbassem.com>  2024-05-15 08:08:38 +0100
committer MohamedBassem <me@mbassem.com>  2024-05-15 08:14:16 +0100
commit    39025a83e041347a4c8206704e7dc2cd1e0cadd5 (patch)
tree      53c26b0655757bdc5b5ac94ba48d24d578dc47de
parent    f64a5f3237c41b600f7047c477fbf9e79eae4297 (diff)
download  karakeep-39025a83e041347a4c8206704e7dc2cd1e0cadd5.tar.zst
feature(crawler): Allow connecting to browser's websocket address and launching the browser on demand. This enables support for browserless
Diffstat
-rw-r--r--  apps/workers/crawlerWorker.ts  83
-rw-r--r--  docs/docs/03-configuration.md  19
-rw-r--r--  packages/shared/config.ts       4
3 files changed, 70 insertions(+), 36 deletions(-)
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index e7ed854b..fe5bc43b 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -48,39 +48,53 @@ const metascraperParser = metascraper([
metascraperUrl(),
]);
-let browser: Browser | undefined;
+let globalBrowser: Browser | undefined;
// Guards the interactions with the browser instance.
// This is needed given that most of the browser APIs are async.
const browserMutex = new Mutex();
+async function startBrowserInstance() {
+ const defaultViewport = {
+ width: 1440,
+ height: 900,
+ };
+ if (serverConfig.crawler.browserWebSocketUrl) {
+ logger.info(
+ `[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
+ );
+ return await puppeteer.connect({
+ browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
+ defaultViewport,
+ });
+ } else if (serverConfig.crawler.browserWebUrl) {
+ logger.info(
+ `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+ );
+ const webUrl = new URL(serverConfig.crawler.browserWebUrl);
+ // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
+ const { address: address } = await dns.promises.lookup(webUrl.hostname);
+ webUrl.hostname = address;
+ logger.info(
+ `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
+ );
+ return await puppeteer.connect({
+ browserURL: webUrl.toString(),
+ defaultViewport,
+ });
+ } else {
+ logger.info(`Launching a new browser instance`);
+ return await puppeteer.launch({
+ headless: serverConfig.crawler.headlessBrowser,
+ defaultViewport,
+ });
+ }
+}
+
async function launchBrowser() {
- browser = undefined;
+ globalBrowser = undefined;
await browserMutex.runExclusive(async () => {
try {
- if (serverConfig.crawler.browserWebUrl) {
- logger.info(
- `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
- );
- const webUrl = new URL(serverConfig.crawler.browserWebUrl);
- // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
- const { address: address } = await dns.promises.lookup(webUrl.hostname);
- webUrl.hostname = address;
- logger.info(
- `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
- );
- browser = await puppeteer.connect({
- browserURL: webUrl.toString(),
- defaultViewport: {
- width: 1440,
- height: 900,
- },
- });
- } else {
- logger.info(`Launching a new browser instance`);
- browser = await puppeteer.launch({
- headless: serverConfig.crawler.headlessBrowser,
- });
- }
+ globalBrowser = await startBrowserInstance();
} catch (e) {
logger.error(
"[Crawler] Failed to connect to the browser instance, will retry in 5 secs",
@@ -90,7 +104,7 @@ async function launchBrowser() {
}, 5000);
return;
}
- browser.on("disconnected", () => {
+ globalBrowser.on("disconnected", () => {
if (isShuttingDown) {
logger.info(
"[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
@@ -113,7 +127,13 @@ export class CrawlerWorker {
blockTrackersAndAnnoyances: true,
}),
);
- await launchBrowser();
+ if (!serverConfig.crawler.browserConnectOnDemand) {
+ await launchBrowser();
+ } else {
+ logger.info(
+ "[Crawler] Browser connect on demand is enabled, won't proactively start the browser instance",
+ );
+ }
logger.info("Starting crawler worker ...");
const worker = new Worker<ZCrawlLinkRequest, void>(
@@ -197,6 +217,13 @@ function validateUrl(url: string) {
}
async function crawlPage(jobId: string, url: string) {
+ let browser: Browser;
+ if (serverConfig.crawler.browserConnectOnDemand) {
+ browser = await startBrowserInstance();
+ } else {
+ assert(globalBrowser);
+ browser = globalBrowser;
+ }
assert(browser);
const context = await browser.createBrowserContext();
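The hunk above ends before `crawlPage` finishes, so the cleanup of the per-job connection isn't visible here. The sketch below is a hedged illustration, not the actual karakeep code, of how an on-demand connection could be obtained and released around a single crawl; `serverConfig`, `startBrowserInstance`, and `globalBrowser` are declared as stand-ins for the real module-level definitions, and the disconnect-after-each-job cleanup is an assumption about what happens outside the shown hunk.

```ts
import assert from "node:assert";
import type { Browser } from "puppeteer";

// Stand-ins for the real module-level definitions (assumptions, not the
// actual karakeep implementation).
declare const serverConfig: {
  crawler: { browserConnectOnDemand: boolean };
};
declare function startBrowserInstance(): Promise<Browser>;
declare let globalBrowser: Browser | undefined;

async function crawlPageSketch(url: string) {
  const onDemand = serverConfig.crawler.browserConnectOnDemand;
  let browser: Browser;
  if (onDemand) {
    // Fresh connection per job, e.g. to a browserless-style pool.
    browser = await startBrowserInstance();
  } else {
    // Reuse the long-lived connection maintained by launchBrowser().
    assert(globalBrowser);
    browser = globalBrowser;
  }
  const context = await browser.createBrowserContext();
  try {
    const page = await context.newPage();
    await page.goto(url);
    // ... content extraction, screenshots, etc. happen here ...
  } finally {
    await context.close();
    if (onDemand) {
      // Release the per-job connection; for a locally launched instance,
      // browser.close() would be the counterpart.
      await browser.disconnect();
    }
  }
}
```

The key point is that in on-demand mode each job gets its own connection, while in the default mode the shared `globalBrowser` stays connected and is only re-established by `launchBrowser` when it drops.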
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 83546ec8..08405a0f 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -38,11 +38,14 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
## Crawler Configs
-| Name | Required | Default | Description |
-| ----------------------------- | -------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
-| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
-| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
-| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
-| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
-| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+| Name | Required | Default | Description |
+| ----------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
+| BROWSER_WEB_URL               | No       | Not set | The browser's HTTP debugging address. The worker will talk to this endpoint to resolve the debugging console's websocket address. If you already have the websocket address, use `BROWSER_WEBSOCKET_URL` instead. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` is set, the worker will launch its own browser instance (assuming it has access to the Chrome binary). |
+| BROWSER_WEBSOCKET_URL         | No       | Not set | The websocket address of the browser's debugging console. If you want to use [browserless](https://browserless.io), use their websocket address here. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` is set, the worker will launch its own browser instance (assuming it has access to the Chrome binary). |
+| BROWSER_CONNECT_ONDEMAND      | No       | false   | If set to false, the crawler will proactively connect to the browser instance and always maintain an active connection. If set to true, the browser will be launched on demand only when a crawl is requested. Set this to true if you're using a service that provides browser instances on demand. |
+| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
+| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
+| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
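For the browserless setup mentioned in the `BROWSER_WEBSOCKET_URL` row, the worker ends up calling `puppeteer.connect` with that websocket address, as in the `startBrowserInstance` change above. The snippet below is a minimal, standalone sketch of that wiring; the `ws://browserless:3000?token=...` value is a placeholder for whatever endpoint and token your browserless deployment actually exposes, not something taken from this patch.

```ts
import puppeteer from "puppeteer";

// Placeholder endpoint; substitute your own BROWSER_WEBSOCKET_URL value.
const browserWSEndpoint =
  process.env.BROWSER_WEBSOCKET_URL ?? "ws://browserless:3000?token=CHANGE_ME";

async function main() {
  // Same call the crawler worker makes when BROWSER_WEBSOCKET_URL is set.
  const browser = await puppeteer.connect({
    browserWSEndpoint,
    defaultViewport: { width: 1440, height: 900 },
  });
  const page = await browser.newPage();
  await page.goto("https://example.com");
  console.log(await page.title());
  // Disconnect rather than close: the remote browser service stays running.
  await browser.disconnect();
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
```

In a docker-compose deployment the worker and the browser service typically just share a network, so the hostname in `BROWSER_WEBSOCKET_URL` is the browser container's service name.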
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 664ad7d9..388a6660 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -21,6 +21,8 @@ const allEnv = z.object({
REDIS_PASSWORD: z.string().optional(),
CRAWLER_HEADLESS_BROWSER: stringBool("true"),
BROWSER_WEB_URL: z.string().url().optional(),
+ BROWSER_WEBSOCKET_URL: z.string().url().optional(),
+ BROWSER_CONNECT_ONDEMAND: stringBool("false"),
CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
CRAWLER_NAVIGATE_TIMEOUT_SEC: z.coerce.number().default(30),
CRAWLER_NUM_WORKERS: z.coerce.number().default(1),
@@ -65,6 +67,8 @@ const serverConfigSchema = allEnv.transform((val) => {
numWorkers: val.CRAWLER_NUM_WORKERS,
headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
browserWebUrl: val.BROWSER_WEB_URL,
+ browserWebSocketUrl: val.BROWSER_WEBSOCKET_URL,
+ browserConnectOnDemand: val.BROWSER_CONNECT_ONDEMAND,
jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
downloadBannerImage: val.CRAWLER_DOWNLOAD_BANNER_IMAGE,
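`BROWSER_CONNECT_ONDEMAND` goes through the existing `stringBool` helper, which isn't part of this diff. Purely to illustrate why the schema passes its default as the string `"false"`, here is a rough, hedged approximation of such a helper in zod; the real implementation in `packages/shared/config.ts` may differ.

```ts
import { z } from "zod";

// Hedged approximation of the stringBool helper referenced above: accept
// "true"/"false" from the environment, apply a string default, and produce
// a boolean after parsing. Not the actual karakeep implementation.
const stringBool = (defaultValue: "true" | "false") =>
  z
    .enum(["true", "false"])
    .default(defaultValue)
    .transform((v) => v === "true");

const crawlerEnv = z.object({
  BROWSER_WEB_URL: z.string().url().optional(),
  BROWSER_WEBSOCKET_URL: z.string().url().optional(),
  BROWSER_CONNECT_ONDEMAND: stringBool("false"),
});

// With no environment variables set, BROWSER_CONNECT_ONDEMAND parses to false.
console.log(crawlerEnv.parse({}));
// => { BROWSER_CONNECT_ONDEMAND: false }
```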