author    MohamedBassem <me@mbassem.com>  2024-04-26 11:06:54 +0100
committer MohamedBassem <me@mbassem.com>  2024-04-26 11:06:54 +0100
commit    a91aff23eaa3616aec331ef0267863ed0b0e594b
tree      09a449ec43b632762af1ef7125cf707b8fe45d81
parent    9dace185acff4002aec8265fc010db49d91c7d7f
feature(crawler): Allow increasing crawler concurrency and configure storing images and screenshots
 apps/workers/crawlerWorker.ts | 13 +++++++++++++
 docs/docs/03-configuration.md | 11 +++++++----
 packages/shared/config.ts     |  6 ++++++
 3 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index b583864d..6b4d39f0 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -123,6 +123,7 @@ export class CrawlerWorker {
/* timeoutSec */ serverConfig.crawler.jobTimeoutSec,
),
{
+ concurrency: serverConfig.crawler.numWorkers,
connection: queueConnectionDetails,
autorun: false,
},
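The single added line above is what unlocks parallel crawling. The surrounding options object (`connection`, `autorun`, and now `concurrency`) matches BullMQ's `Worker` options, so, assuming a plain BullMQ worker rather than karakeep's own queue wrapper, the same knob looks like this (a minimal sketch, not the repository's actual code):

```typescript
// Minimal sketch, assuming plain BullMQ (karakeep's wrapper is not shown in
// this hunk). `concurrency` lets one worker process N jobs in parallel.
import { Worker } from "bullmq";

const concurrency = Number(process.env.CRAWLER_NUM_WORKERS ?? "1");

const worker = new Worker(
  "crawler",
  async (job) => {
    // crawl job.data.url here
  },
  {
    connection: { host: "localhost", port: 6379 },
    concurrency, // up to CRAWLER_NUM_WORKERS jobs at once
    autorun: false, // as in the hunk above: the worker is started explicitly
  },
);

void worker.run();
```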
@@ -282,6 +283,12 @@ async function storeScreenshot(
userId: string,
jobId: string,
) {
+ if (!serverConfig.crawler.storeScreenshot) {
+ logger.info(
+ `[Crawler][${jobId}] Skipping storing the screenshot as per the config.`,
+ );
+ return null;
+ }
const assetId = newAssetId();
await saveAsset({
userId,
@@ -300,6 +307,12 @@ async function downloadAndStoreImage(
userId: string,
jobId: string,
) {
+ if (!serverConfig.crawler.downloadBannerImage) {
+ logger.info(
+ `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
+ );
+ return null;
+ }
try {
logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`);
const response = await fetch(url);
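With both guards in place, `storeScreenshot` and `downloadAndStoreImage` can now return `null`, so any caller has to treat the resulting asset ids as optional. A hypothetical call site (variable names are illustrative, not taken from this diff):

```typescript
// Hypothetical caller: both helpers may return null when disabled via config.
// `screenshot`, `imageUrl`, `userId`, and `jobId` are assumed to be in scope.
const screenshotAssetId = await storeScreenshot(screenshot, userId, jobId); // string | null
const imageAssetId = imageUrl
  ? await downloadAndStoreImage(imageUrl, userId, jobId) // string | null
  : null;
// Downstream code should only attach assets whose ids are non-null.
```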
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 28ead2f1..3d44f359 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -37,7 +37,10 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
## Crawler Configs
-| Name | Required | Default | Description |
-| ---------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
-| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+| Name | Required | Default | Description |
+| ----------------------------- | -------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
+| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
+| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low-powered device, you might want to bump this up a bit. |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection. |
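Taken together, the three new rows are independent knobs. A hypothetical `.env` fragment for a deployment that crawls two links at a time and skips caching banner images (values are illustrative):

```
CRAWLER_NUM_WORKERS=2
CRAWLER_DOWNLOAD_BANNER_IMAGE=false
CRAWLER_STORE_SCREENSHOT=true
```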
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 41173433..a31c48b4 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -22,6 +22,9 @@ const allEnv = z.object({
BROWSER_WEB_URL: z.string().url().optional(),
CRAWLER_JOB_TIMEOUT_SEC: z.number().default(60),
CRAWLER_NAVIGATE_TIMEOUT_SEC: z.number().default(30),
+ CRAWLER_NUM_WORKERS: z.coerce.number().default(1),
+ CRAWLER_DOWNLOAD_BANNER_IMAGE: stringBool("true"),
+ CRAWLER_STORE_SCREENSHOT: stringBool("true"),
MEILI_ADDR: z.string().optional(),
MEILI_MASTER_KEY: z.string().default(""),
LOG_LEVEL: z.string().default("debug"),
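Note that `CRAWLER_NUM_WORKERS` goes through `z.coerce.number()`, which converts the raw environment string to a number before validation. The `stringBool` helper used by the two new boolean flags is defined elsewhere in `config.ts` and not shown in this hunk; a plausible zod-based sketch with the same call shape (an assumption, not the repository's actual implementation) would be:

```typescript
import { z } from "zod";

// Assumed shape of `stringBool`: accept only the literal strings
// "true"/"false", apply the given default, then yield a real boolean.
const stringBool = (defaultValue: "true" | "false") =>
  z
    .enum(["true", "false"])
    .default(defaultValue)
    .transform((s) => s === "true");
```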
@@ -56,10 +59,13 @@ const serverConfigSchema = allEnv.transform((val) => {
redisDBIdx: val.REDIS_DB_IDX,
},
crawler: {
+ numWorkers: val.CRAWLER_NUM_WORKERS,
headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
browserWebUrl: val.BROWSER_WEB_URL,
jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
+ downloadBannerImage: val.CRAWLER_DOWNLOAD_BANNER_IMAGE,
+ storeScreenshot: val.CRAWLER_STORE_SCREENSHOT,
},
meilisearch: val.MEILI_ADDR
? {