| author | Mohamed Bassem <me@mbassem.com> | 2025-09-07 15:47:38 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-07 15:47:38 +0100 |
| commit | c57fd5137cc29870667777a371a4d1fcdf69436b | |
| tree | 845bec5a60ee2b43fc33d653965a6571fa92d84b | |
| parent | 492b15203807b4ceb00af4b301958344cc5a668f | |
feat: Add cookie support for browser page access
* feat: Add cookie support for browser page access
Implemented cookie support for browser page access, including a new BROWSER_COOKIE_PATH setting that specifies the path to a JSON file of cookies to load into the browser context.
* fix the docs
---------
Co-authored-by: lizz <lizong1204@gmail.com>
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 59 |
| -rw-r--r-- | docs/docs/03-configuration.md | 35 |
| -rw-r--r-- | packages/shared/config.ts | 2 |
3 files changed, 94 insertions, 2 deletions
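
Before the diffs, here is a condensed, standalone sketch of the flow this change wires into the crawler worker: read the file named by `BROWSER_COOKIE_PATH`, validate it with zod, and inject the cookies into a Playwright browser context. This is an illustration rather than the worker code itself; the target URL is a placeholder.

```ts
import { promises as fs } from "fs";
import { chromium } from "playwright";
import { z } from "zod";

// The same cookie shape the commit validates (see crawlerWorker.ts below).
const cookiesSchema = z.array(
  z.object({
    name: z.string(),
    value: z.string(),
    domain: z.string().optional(),
    path: z.string().optional(),
    expires: z.number().optional(),
    httpOnly: z.boolean().optional(),
    secure: z.boolean().optional(),
    sameSite: z.enum(["Strict", "Lax", "None"]).optional(),
  }),
);

async function run() {
  const cookiePath = process.env.BROWSER_COOKIE_PATH;
  // Without the env var, the worker simply skips cookie loading.
  if (!cookiePath) return;

  const raw = await fs.readFile(cookiePath, "utf8");
  const cookies = cookiesSchema.parse(JSON.parse(raw)); // throws on bad input

  const browser = await chromium.launch();
  const context = await browser.newContext();
  // Note: each entry needs `domain` + `path` (or `url`) for Playwright to accept it.
  await context.addCookies(cookies); // applies to every page in this context
  const page = await context.newPage();
  await page.goto("https://example.com"); // placeholder URL
  await browser.close();
}

run().catch(console.error);
```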
```diff
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index e011b826..79d8e06a 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -28,6 +28,7 @@ import { chromium } from "playwright-extra";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { fetchWithProxy } from "utils";
 import { getBookmarkDetails, updateAsset } from "workerUtils";
+import { z } from "zod";
 
 import type { ZCrawlLinkRequest } from "@karakeep/shared/queues";
 import { db } from "@karakeep/db";
@@ -95,6 +96,30 @@ const metascraperParser = metascraper([
   metascraperUrl(),
 ]);
 
+interface Cookie {
+  name: string;
+  value: string;
+  domain?: string;
+  path?: string;
+  expires?: number;
+  httpOnly?: boolean;
+  secure?: boolean;
+  sameSite?: "Strict" | "Lax" | "None";
+}
+
+const cookieSchema = z.object({
+  name: z.string(),
+  value: z.string(),
+  domain: z.string().optional(),
+  path: z.string().optional(),
+  expires: z.number().optional(),
+  httpOnly: z.boolean().optional(),
+  secure: z.boolean().optional(),
+  sameSite: z.enum(["Strict", "Lax", "None"]).optional(),
+});
+
+const cookiesSchema = z.array(cookieSchema);
+
 function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
   const { proxy } = serverConfig;
 
@@ -121,6 +146,8 @@
 
 let globalBrowser: Browser | undefined;
 let globalBlocker: PlaywrightBlocker | undefined;
+// Global variable to store parsed cookies
+let globalCookies: Cookie[] = [];
 // Guards the interactions with the browser instance.
 // This is needed given that most of the browser APIs are async.
 const browserMutex = new Mutex();
@@ -252,10 +279,35 @@
       },
     );
 
+    await loadCookiesFromFile();
+
     return worker;
   }
 }
 
+async function loadCookiesFromFile(): Promise<void> {
+  try {
+    const path = serverConfig.crawler.browserCookiePath;
+    if (!path) {
+      logger.info(
+        "[Crawler] Not defined in the server configuration BROWSER_COOKIE_PATH",
+      );
+      return;
+    }
+    const data = await fs.readFile(path, "utf8");
+    const cookies = JSON.parse(data);
+    globalCookies = cookiesSchema.parse(cookies);
+  } catch (error) {
+    logger.error("Failed to read or parse cookies file:", error);
+    if (error instanceof z.ZodError) {
+      logger.error("[Crawler] Invalid cookie file format:", error.errors);
+    } else {
+      logger.error("[Crawler] Failed to read or parse cookies file:", error);
+    }
+    throw error;
+  }
+}
+
 type DBAssetType = typeof assets.$inferInsert;
 
 async function changeBookmarkStatus(
@@ -352,6 +404,13 @@ async function crawlPage(
     proxy: getPlaywrightProxyConfig(),
   });
   try {
+    if (globalCookies.length > 0) {
+      await context.addCookies(globalCookies);
+      logger.info(
+        `[Crawler][${jobId}] Cookies successfully loaded into browser context`,
+      );
+    }
+
     // Create a new page in the context
     const page = await context.newPage();
```
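
A Playwright detail worth flagging (an observation, not part of the commit): `BrowserContext.addCookies()` accepts a cookie only if it carries either a `url` or both a `domain` and a `path`. Since the schema above leaves `domain` and `path` optional, an entry with neither will pass zod validation but be rejected by Playwright at injection time. Both accepted shapes, as a sketch with placeholder names and values:

```ts
import type { BrowserContext } from "playwright";

// Both cookie shapes Playwright accepts; names and values are placeholders.
async function seedCookies(context: BrowserContext) {
  await context.addCookies([
    // Explicit scope: domain + path.
    { name: "session", value: "xxx", domain: ".example.com", path: "/" },
    // Scope derived from a URL.
    { name: "csrf", value: "yyy", url: "https://example.com" },
  ]);
}
```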
````diff
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index aae1ffa3..0f61360f 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -6,8 +6,8 @@ The app is mainly configured by environment variables. All the used environment
 | ------------------------------- | ------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | PORT | No | 3000 | The port on which the web server will listen. DON'T CHANGE THIS IF YOU'RE USING DOCKER, instead changed the docker bound external port. |
 | WORKERS_PORT | No | 0 (Random Port) | The port on which the worker will export its prometheus metrics on `/metrics`. By default it's a random unused port. If you want to utilize those metrics, fix the port to a value (and export it in docker if you're using docker). |
-| WORKERS_ENABLED_WORKERS | No | Not set | Comma separated list of worker names to enable. If set, only these workers will run. Valid values: crawler,inference,search,tidyAssets,video,feed,assetPreprocessing,webhook,ruleEngine.  |
-| WORKERS_DISABLED_WORKERS | No | Not set | Comma separated list of worker names to disable. Takes precedence over `WORKERS_ENABLED_WORKERS`.  |
+| WORKERS_ENABLED_WORKERS | No | Not set | Comma separated list of worker names to enable. If set, only these workers will run. Valid values: crawler,inference,search,tidyAssets,video,feed,assetPreprocessing,webhook,ruleEngine. |
+| WORKERS_DISABLED_WORKERS | No | Not set | Comma separated list of worker names to disable. Takes precedence over `WORKERS_ENABLED_WORKERS`. |
 | DATA_DIR | Yes | Not set | The path for the persistent data directory. This is where the db lives. Assets are stored here by default unless `ASSETS_DIR` is set. |
 | ASSETS_DIR | No | Not set | The path where crawled assets will be stored. If not set, defaults to `${DATA_DIR}/assets`. |
 | NEXTAUTH_URL | Yes | Not set | Should point to the address of your server. The app will function without it, but will redirect you to wrong addresses on signout for example. |
@@ -129,6 +129,37 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic tagging
 | CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC | No | 600 | How long to wait for the video download to finish |
 | CRAWLER_ENABLE_ADBLOCKER | No | true | Whether to enable an adblocker in the crawler or not. If you're facing troubles downloading the adblocking lists on worker startup, you can disable this. |
 | CRAWLER_YTDLP_ARGS | No | [] | Include additional yt-dlp arguments to be passed at crawl time separated by %%: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options |
+| BROWSER_COOKIE_PATH | No | Not set | Path to a JSON file containing cookies to be loaded into the browser context. The file should be an array of cookie objects, each with name and value (required), and optional fields like domain, path, expires, httpOnly, secure, and sameSite (e.g., `[{"name": "session", "value": "xxx", "domain": ".example.com"}]`). |
+
+<details>
+
+  <summary>More info on BROWSER_COOKIE_PATH</summary>
+
+BROWSER_COOKIE_PATH specifies the path to a JSON file containing cookies to be loaded into the browser context for crawling.
+
+The JSON file must be an array of cookie objects, each with:
+- name: The cookie name (required).
+- value: The cookie value (required).
+- Optional fields: domain, path, expires, httpOnly, secure, sameSite (values: "Strict", "Lax", or "None").
+
+Example JSON file:
+
+```json
+[
+  {
+    "name": "session",
+    "value": "xxx",
+    "domain": ".example.com",
+    "path": "/",
+    "expires": 1735689600,
+    "httpOnly": true,
+    "secure": true,
+    "sameSite": "Lax"
+  }
+]
+```
+
+</details>
 
 ## OCR Configs
 
````

```diff
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 3cc65f4c..99a43da7 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -81,6 +81,7 @@ const allEnv = z.object({
   BROWSER_WEB_URL: z.string().optional(),
   BROWSER_WEBSOCKET_URL: z.string().optional(),
   BROWSER_CONNECT_ONDEMAND: stringBool("false"),
+  BROWSER_COOKIE_PATH: z.string().optional(),
   CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
   CRAWLER_NAVIGATE_TIMEOUT_SEC: z.coerce.number().default(30),
   CRAWLER_NUM_WORKERS: z.coerce.number().default(1),
@@ -242,6 +243,7 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
       browserWebUrl: val.BROWSER_WEB_URL,
       browserWebSocketUrl: val.BROWSER_WEBSOCKET_URL,
       browserConnectOnDemand: val.BROWSER_CONNECT_ONDEMAND,
+      browserCookiePath: val.BROWSER_COOKIE_PATH,
       jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
       navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
       downloadBannerImage: val.CRAWLER_DOWNLOAD_BANNER_IMAGE,
```
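
To round out the error path in `loadCookiesFromFile`, a small sketch of how a malformed cookie file surfaces: `cookiesSchema.parse` throws a `ZodError` whose `errors` array lists one entry per problem, and that array is what lands in the `[Crawler] Invalid cookie file format:` log line. The schema here is trimmed to the required fields for brevity.

```ts
import { z } from "zod";

// Trimmed version of the worker's schema: only the required fields.
const cookiesSchema = z.array(
  z.object({ name: z.string(), value: z.string() }),
);

try {
  // Two problems: `name` has the wrong type and `value` is missing.
  cookiesSchema.parse([{ name: 42 }]);
} catch (error) {
  if (error instanceof z.ZodError) {
    // Each issue carries a path like [0, "name"] and a message such as
    // "Expected string, received number".
    console.error(error.errors);
  }
}
```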
