| author | Mohamed Bassem <me@mbassem.com> | 2025-10-26 11:17:16 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2025-10-26 11:17:16 +0000 |
| commit | cf3ffff0a5800168c3e4c446a0735084f62f8216 | |
| tree | 15e327911f4aff70e1c0d03737364a57c1b9d237 | |
| parent | 2b769cba822506c1572793385993737d4ffff478 | |
| download | karakeep-cf3ffff0a5800168c3e4c446a0735084f62f8216.tar.zst | |
feat: Allow configuring inline asset size threshold
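This change replaces the hard-coded `HTML_CONTENT_SIZE_THRESHOLD` constant in `apps/workers/constants.ts` with a new `HTML_CONTENT_SIZE_INLINE_THRESHOLD_BYTES` environment variable, parsed in `packages/shared/config.ts` and exposed to the workers as `serverConfig.crawler.htmlContentSizeThreshold`. Note that the new default is `5 * 1024` bytes, a tenth of the removed constant's `50 * 1024`. A minimal sketch of how the variable is read, mirroring the `config.ts` hunk below (the standalone schema name `inlineThresholdEnv` is illustrative, not from the codebase):

```typescript
import { z } from "zod";

// Illustrative standalone schema; in the real code this field sits inside
// the much larger `allEnv` object in packages/shared/config.ts.
const inlineThresholdEnv = z.object({
  // z.coerce.number() converts the raw string from process.env to a number.
  HTML_CONTENT_SIZE_INLINE_THRESHOLD_BYTES: z.coerce.number().default(5 * 1024),
});

const { HTML_CONTENT_SIZE_INLINE_THRESHOLD_BYTES: threshold } =
  inlineThresholdEnv.parse(process.env);

// unset          -> 5120  (the new 5KB default)
// "51200" in env -> 51200 (restores the old 50KB cutoff)
console.log(threshold);
```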
Diffstat
| -rw-r--r-- | apps/workers/constants.ts | 1 |
| -rw-r--r-- | apps/workers/workers/adminMaintenance/tasks/migrateLinkHtmlContent.ts | 8 |
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 3 |
| -rw-r--r-- | docs/docs/03-configuration.md | 49 |
| -rw-r--r-- | packages/shared/config.ts | 2 |
5 files changed, 32 insertions, 31 deletions
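The runtime decision keeps its shape: HTML below the threshold stays inline in the database, anything at or above it moves to asset storage. A simplified sketch of the comparison now made in `storeHtmlContent` (see the crawlerWorker.ts hunk below; the helper name `shouldStoreInline` is illustrative, not the worker's actual structure):

```typescript
import serverConfig from "@karakeep/shared/config";

// Illustrative helper; in crawlerWorker.ts the check is inlined.
function shouldStoreInline(htmlContent: string): boolean {
  // Measure the UTF-8 byte length rather than the character count, so
  // multi-byte pages are sized the same way the database stores them.
  const contentSize = Buffer.byteLength(htmlContent, "utf8");
  return contentSize < serverConfig.crawler.htmlContentSizeThreshold;
}
```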
```diff
diff --git a/apps/workers/constants.ts b/apps/workers/constants.ts
deleted file mode 100644
index 954466bf..00000000
--- a/apps/workers/constants.ts
+++ /dev/null
@@ -1 +0,0 @@
-export const HTML_CONTENT_SIZE_THRESHOLD = 50 * 1024; // 50KB
diff --git a/apps/workers/workers/adminMaintenance/tasks/migrateLinkHtmlContent.ts b/apps/workers/workers/adminMaintenance/tasks/migrateLinkHtmlContent.ts
index c5336bce..467f2aa7 100644
--- a/apps/workers/workers/adminMaintenance/tasks/migrateLinkHtmlContent.ts
+++ b/apps/workers/workers/adminMaintenance/tasks/migrateLinkHtmlContent.ts
@@ -11,10 +11,10 @@ import {
   newAssetId,
   saveAsset,
 } from "@karakeep/shared/assetdb";
+import serverConfig from "@karakeep/shared/config";
 import logger from "@karakeep/shared/logger";
 import { tryCatch } from "@karakeep/shared/tryCatch";
 
-import { HTML_CONTENT_SIZE_THRESHOLD } from "../../../constants";
 import { updateAsset } from "../../../workerUtils";
 
 const BATCH_SIZE = 25;
@@ -40,12 +40,12 @@ async function getBookmarksWithLargeInlineHtml(limit: number, cursor?: string) {
           gt(bookmarkLinks.id, cursor),
           isNotNull(bookmarkLinks.htmlContent),
           isNull(bookmarkLinks.contentAssetId),
-          sql`length(CAST(${bookmarkLinks.htmlContent} AS BLOB)) > ${HTML_CONTENT_SIZE_THRESHOLD}`,
+          sql`length(CAST(${bookmarkLinks.htmlContent} AS BLOB)) > ${serverConfig.crawler.htmlContentSizeThreshold}`,
         )
       : and(
           isNotNull(bookmarkLinks.htmlContent),
           isNull(bookmarkLinks.contentAssetId),
-          sql`length(CAST(${bookmarkLinks.htmlContent} AS BLOB)) > ${HTML_CONTENT_SIZE_THRESHOLD}`,
+          sql`length(CAST(${bookmarkLinks.htmlContent} AS BLOB)) > ${serverConfig.crawler.htmlContentSizeThreshold}`,
         ),
     )
     .orderBy(asc(bookmarkLinks.id))
@@ -62,7 +62,7 @@ async function migrateBookmarkHtml(
   const contentSize = Buffer.byteLength(htmlContent, "utf8");
 
-  if (contentSize <= HTML_CONTENT_SIZE_THRESHOLD) {
+  if (contentSize <= serverConfig.crawler.htmlContentSizeThreshold) {
     logger.debug(
       `[adminMaintenance:migrate_large_link_html][${jobId}] Bookmark ${bookmarkId} inline HTML (${contentSize} bytes) below threshold, skipping`,
     );
 
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 4e02d73a..def0ae88 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -73,7 +73,6 @@ import {
 import { tryCatch } from "@karakeep/shared/tryCatch";
 import { BookmarkTypes } from "@karakeep/shared/types/bookmarks";
 
-import { HTML_CONTENT_SIZE_THRESHOLD } from "../constants";
 import metascraperReddit from "../metascraper-plugins/metascraper-reddit";
 
 function abortPromise(signal: AbortSignal): Promise<never> {
@@ -934,7 +933,7 @@ async function storeHtmlContent(
   const contentSize = contentBuffer.byteLength;
 
   // Only store in assets if content is >= 50KB
-  if (contentSize < HTML_CONTENT_SIZE_THRESHOLD) {
+  if (contentSize < serverConfig.crawler.htmlContentSizeThreshold) {
     logger.info(
       `[Crawler][${jobId}] HTML content size (${contentSize} bytes) is below threshold, storing inline`,
     );
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index a7682e72..26760d6c 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -7,7 +7,7 @@ The app is mainly configured by environment variables. All the used environment
 | PORT | No | 3000 | The port on which the web server will listen. DON'T CHANGE THIS IF YOU'RE USING DOCKER, instead changed the docker bound external port. |
 | WORKERS_PORT | No | 0 (Random Port) | The port on which the worker will export its prometheus metrics on `/metrics`. By default it's a random unused port. If you want to utilize those metrics, fix the port to a value (and export it in docker if you're using docker). |
 | WORKERS_HOST | No | 127.0.0.1 | Host to listen to for requests to WORKERS_PORT. You will need to set this if running in a container, since localhost will not be reachable from outside |
-| WORKERS_ENABLED_WORKERS | No | Not set | Comma separated list of worker names to enable. If set, only these workers will run. Valid values: crawler,inference,search,adminMaintenance,video,feed,assetPreprocessing,webhook,ruleEngine. |
+| WORKERS_ENABLED_WORKERS | No | Not set | Comma separated list of worker names to enable. If set, only these workers will run. Valid values: crawler,inference,search,adminMaintenance,video,feed,assetPreprocessing,webhook,ruleEngine. |
 | WORKERS_DISABLED_WORKERS | No | Not set | Comma separated list of worker names to disable. Takes precedence over `WORKERS_ENABLED_WORKERS`. |
 | DATA_DIR | Yes | Not set | The path for the persistent data directory. This is where the db lives. Assets are stored here by default unless `ASSETS_DIR` is set. |
 | ASSETS_DIR | No | Not set | The path where crawled assets will be stored. If not set, defaults to `${DATA_DIR}/assets`. |
@@ -95,7 +95,7 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
 | EMBEDDING_TEXT_MODEL | No | text-embedding-3-small | The model to be used for generating embeddings for the text. |
 | INFERENCE_CONTEXT_LENGTH | No | 2048 | The max number of tokens that we'll pass to the inference model. If your content is larger than this size, it'll be truncated to fit. The larger this value, the more of the content will be used in tag inference, but the more expensive the inference will be (money-wise on openAI and resource-wise on ollama). Check the model you're using for its max supported content size. |
 | INFERENCE_MAX_OUTPUT_TOKENS | No | 2048 | The maximum number of tokens that the inference model is allowed to generate in its response. This controls the length of AI-generated content like tags and summaries. Increase this if you need longer responses, but be aware that higher values will increase costs (for OpenAI) and processing time. |
-| INFERENCE_USE_MAX_COMPLETION_TOKENS | No | false | \[OpenAI Only\] Whether to use the newer `max_completion_tokens` parameter instead of the deprecated `max_tokens` parameter. Set to `true` if using GPT-5 or o-series models which require this. Will become the default in a future release. |
+| INFERENCE_USE_MAX_COMPLETION_TOKENS | No | false | \[OpenAI Only\] Whether to use the newer `max_completion_tokens` parameter instead of the deprecated `max_tokens` parameter. Set to `true` if using GPT-5 or o-series models which require this. Will become the default in a future release. |
 | INFERENCE_LANG | No | english | The language in which the tags will be generated. |
 | INFERENCE_NUM_WORKERS | No | 1 | Number of concurrent workers for AI inference tasks (tagging and summarization). Increase this if you have multiple AI inference requests and want to process them in parallel. |
 | INFERENCE_ENABLE_AUTO_TAGGING | No | true | Whether automatic AI tagging is enabled or disabled. |
@@ -113,25 +113,26 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
 
 ## Crawler Configs
 
-| Name | Required | Default | Description |
-| ---------------------------------- | -------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
-| BROWSER_WEB_URL | No | Not set | The browser's http debugging address. The worker will talk to this endpoint to resolve the debugging console's websocket address. If you already have the websocket address, use `BROWSER_WEBSOCKET_URL` instead. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will use plain http requests skipping screenshotting and javascript execution. |
-| BROWSER_WEBSOCKET_URL | No | Not set | The websocket address of browser's debugging console. If you want to use [browserless](https://browserless.io), use their websocket address here. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will use plain http requests skipping screenshotting and javascript execution. |
-| BROWSER_CONNECT_ONDEMAND | No | false | If set to false, the crawler will proactively connect to the browser instance and always maintain an active connection. If set to true, the browser will be launched on demand only whenever a crawling is requested. Set to true if you're using a service that provides you with browser instances on demand. |
-| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
-| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
-| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
-| CRAWLER_SCREENSHOT_TIMEOUT_SEC | No | 5 | How long to wait for the screenshot finish before timing out. If you are capturing full-page screenshots of long webpages, consider increasing this value. |
-| CRAWLER_FULL_PAGE_ARCHIVE | No | false | Whether to store a full local copy of the page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, only the readable text of the page is archived. |
-| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
-| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
-| CRAWLER_VIDEO_DOWNLOAD | No | false | Whether to download videos from the page or not (using yt-dlp) |
-| CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE | No | 50 | The maximum file size for the downloaded video. The quality will be chosen accordingly. Use -1 to disable the limit. |
-| CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC | No | 600 | How long to wait for the video download to finish |
-| CRAWLER_ENABLE_ADBLOCKER | No | true | Whether to enable an adblocker in the crawler or not. If you're facing troubles downloading the adblocking lists on worker startup, you can disable this. |
-| CRAWLER_YTDLP_ARGS | No | [] | Include additional yt-dlp arguments to be passed at crawl time separated by %%: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options |
-| BROWSER_COOKIE_PATH | No | Not set | Path to a JSON file containing cookies to be loaded into the browser context. The file should be an array of cookie objects, each with name and value (required), and optional fields like domain, path, expires, httpOnly, secure, and sameSite (e.g., `[{"name": "session", "value": "xxx", "domain": ".example.com"}`]). |
+| Name | Required | Default | Description |
+| ---------------------------------------- | -------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
+| BROWSER_WEB_URL | No | Not set | The browser's http debugging address. The worker will talk to this endpoint to resolve the debugging console's websocket address. If you already have the websocket address, use `BROWSER_WEBSOCKET_URL` instead. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will use plain http requests skipping screenshotting and javascript execution. |
+| BROWSER_WEBSOCKET_URL | No | Not set | The websocket address of browser's debugging console. If you want to use [browserless](https://browserless.io), use their websocket address here. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will use plain http requests skipping screenshotting and javascript execution. |
+| BROWSER_CONNECT_ONDEMAND | No | false | If set to false, the crawler will proactively connect to the browser instance and always maintain an active connection. If set to true, the browser will be launched on demand only whenever a crawling is requested. Set to true if you're using a service that provides you with browser instances on demand. |
+| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
+| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
+| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
+| CRAWLER_SCREENSHOT_TIMEOUT_SEC | No | 5 | How long to wait for the screenshot finish before timing out. If you are capturing full-page screenshots of long webpages, consider increasing this value. |
+| CRAWLER_FULL_PAGE_ARCHIVE | No | false | Whether to store a full local copy of the page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, only the readable text of the page is archived. |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+| CRAWLER_VIDEO_DOWNLOAD | No | false | Whether to download videos from the page or not (using yt-dlp) |
+| CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE | No | 50 | The maximum file size for the downloaded video. The quality will be chosen accordingly. Use -1 to disable the limit. |
+| CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC | No | 600 | How long to wait for the video download to finish |
+| CRAWLER_ENABLE_ADBLOCKER | No | true | Whether to enable an adblocker in the crawler or not. If you're facing troubles downloading the adblocking lists on worker startup, you can disable this. |
+| CRAWLER_YTDLP_ARGS | No | [] | Include additional yt-dlp arguments to be passed at crawl time separated by %%: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options |
+| BROWSER_COOKIE_PATH | No | Not set | Path to a JSON file containing cookies to be loaded into the browser context. The file should be an array of cookie objects, each with name and value (required), and optional fields like domain, path, expires, httpOnly, secure, and sameSite (e.g., `[{"name": "session", "value": "xxx", "domain": ".example.com"}`]). |
+| HTML_CONTENT_SIZE_INLINE_THRESHOLD_BYTES | No | 5 \* 1024 | The thresholds in bytes after which larger assets will be stored in the assetdb (folder/s3) instead of inline in the database. |
 
 <details>
 
@@ -221,11 +222,11 @@ Karakeep can send emails for various purposes such as email verification during
 
 If your Karakeep instance needs to connect through a proxy server, you can configure the following settings:
 
-| Name | Required | Default | Description |
-| ------------------- | -------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Name | Required | Default | Description |
+| ------------------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | CRAWLER_HTTP_PROXY | No | Not set | HTTP proxy server URL for outgoing HTTP requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. |
 | CRAWLER_HTTPS_PROXY | No | Not set | HTTPS proxy server URL for outgoing HTTPS requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. |
-| CRAWLER_NO_PROXY | No | Not set | Comma-separated list of hostnames/IPs that should bypass the proxy (e.g., `localhost,127.0.0.1,.local`) |
+| CRAWLER_NO_PROXY | No | Not set | Comma-separated list of hostnames/IPs that should bypass the proxy (e.g., `localhost,127.0.0.1,.local`) |
 
 :::info
 These proxy settings will be used by the crawler and other components that make outgoing HTTP requests.
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 8e7d0252..d54b7589 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -112,6 +112,7 @@ const allEnv = z.object({
   DATA_DIR: z.string().default(""),
   ASSETS_DIR: z.string().optional(),
   MAX_ASSET_SIZE_MB: z.coerce.number().default(50),
+  HTML_CONTENT_SIZE_INLINE_THRESHOLD_BYTES: z.coerce.number().default(5 * 1024),
   INFERENCE_LANG: z.string().default("english"),
   WEBHOOK_TIMEOUT_SEC: z.coerce.number().default(5),
   WEBHOOK_RETRY_TIMES: z.coerce.number().int().min(0).default(3),
@@ -274,6 +275,7 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
       enableAdblocker: val.CRAWLER_ENABLE_ADBLOCKER,
       ytDlpArguments: val.CRAWLER_YTDLP_ARGS,
       screenshotTimeoutSec: val.CRAWLER_SCREENSHOT_TIMEOUT_SEC,
+      htmlContentSizeThreshold: val.HTML_CONTENT_SIZE_INLINE_THRESHOLD_BYTES,
     },
     ocr: {
       langs: val.OCR_LANGS,
```