| author | Mohamed Bassem <me@mbassem.com> | 2025-09-07 15:47:38 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-07 15:47:38 +0100 |
| commit | c57fd5137cc29870667777a371a4d1fcdf69436b | |
| tree | 845bec5a60ee2b43fc33d653965a6571fa92d84b | |
| parent | 492b15203807b4ceb00af4b301958344cc5a668f | |
feat: Add cookie support for browser page access
* feat: Add cookie support for browser page access
Implemented cookie support for browser page access, including a new BROWSER_COOKIE_PATH setting that specifies the path to a JSON file of cookies to load into the browser context.
* fix the docs
---------
Co-authored-by: lizz <lizong1204@gmail.com>
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 59 |
| -rw-r--r-- | docs/docs/03-configuration.md | 35 |
| -rw-r--r-- | packages/shared/config.ts | 2 |
3 files changed, 94 insertions, 2 deletions
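
Before the diffs, here is a condensed, standalone sketch of the flow this change wires into the crawler worker: read the file named by `BROWSER_COOKIE_PATH`, validate it with zod, and inject the cookies into a Playwright browser context. This is an illustration rather than the worker code itself; the target URL is a placeholder.

```ts
import { promises as fs } from "fs";
import { chromium } from "playwright";
import { z } from "zod";

// The same cookie shape the commit validates (see crawlerWorker.ts below).
const cookiesSchema = z.array(
  z.object({
    name: z.string(),
    value: z.string(),
    domain: z.string().optional(),
    path: z.string().optional(),
    expires: z.number().optional(),
    httpOnly: z.boolean().optional(),
    secure: z.boolean().optional(),
    sameSite: z.enum(["Strict", "Lax", "None"]).optional(),
  }),
);

async function run() {
  const cookiePath = process.env.BROWSER_COOKIE_PATH;
  // Without the env var, the worker simply skips cookie loading.
  if (!cookiePath) return;

  const raw = await fs.readFile(cookiePath, "utf8");
  const cookies = cookiesSchema.parse(JSON.parse(raw)); // throws on bad input

  const browser = await chromium.launch();
  const context = await browser.newContext();
  // Note: each entry needs `domain` + `path` (or `url`) for Playwright to accept it.
  await context.addCookies(cookies); // applies to every page in this context
  const page = await context.newPage();
  await page.goto("https://example.com"); // placeholder URL
  await browser.close();
}

run().catch(console.error);
```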
```diff
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index e011b826..79d8e06a 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -28,6 +28,7 @@ import { chromium } from "playwright-extra";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { fetchWithProxy } from "utils";
 import { getBookmarkDetails, updateAsset } from "workerUtils";
+import { z } from "zod";
 
 import type { ZCrawlLinkRequest } from "@karakeep/shared/queues";
 import { db } from "@karakeep/db";
@@ -95,6 +96,30 @@ const metascraperParser = metascraper([
   metascraperUrl(),
 ]);
 
+interface Cookie {
+  name: string;
+  value: string;
+  domain?: string;
+  path?: string;
+  expires?: number;
+  httpOnly?: boolean;
+  secure?: boolean;
+  sameSite?: "Strict" | "Lax" | "None";
+}
+
+const cookieSchema = z.object({
+  name: z.string(),
+  value: z.string(),
+  domain: z.string().optional(),
+  path: z.string().optional(),
+  expires: z.number().optional(),
+  httpOnly: z.boolean().optional(),
+  secure: z.boolean().optional(),
+  sameSite: z.enum(["Strict", "Lax", "None"]).optional(),
+});
+
+const cookiesSchema = z.array(cookieSchema);
+
 function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
   const { proxy } = serverConfig;
 
@@ -121,6 +146,8 @@
 
 let globalBrowser: Browser | undefined;
 let globalBlocker: PlaywrightBlocker | undefined;
+// Global variable to store parsed cookies
+let globalCookies: Cookie[] = [];
 // Guards the interactions with the browser instance.
 // This is needed given that most of the browser APIs are async.
 const browserMutex = new Mutex();
@@ -252,10 +279,35 @@
       },
     );
 
+    await loadCookiesFromFile();
+
     return worker;
   }
 }
 
+async function loadCookiesFromFile(): Promise<void> {
+  try {
+    const path = serverConfig.crawler.browserCookiePath;
+    if (!path) {
+      logger.info(
+        "[Crawler] Not defined in the server configuration BROWSER_COOKIE_PATH",
+      );
+      return;
+    }
+    const data = await fs.readFile(path, "utf8");
+    const cookies = JSON.parse(data);
+    globalCookies = cookiesSchema.parse(cookies);
+  } catch (error) {
+    logger.error("Failed to read or parse cookies file:", error);
+    if (error instanceof z.ZodError) {
+      logger.error("[Crawler] Invalid cookie file format:", error.errors);
+    } else {
+      logger.error("[Crawler] Failed to read or parse cookies file:", error);
+    }
+    throw error;
+  }
+}
+
 type DBAssetType = typeof assets.$inferInsert;
 
 async function changeBookmarkStatus(
@@ -352,6 +404,13 @@ async function crawlPage(
     proxy: getPlaywrightProxyConfig(),
   });
   try {
+    if (globalCookies.length > 0) {
+      await context.addCookies(globalCookies);
+      logger.info(
+        `[Crawler][${jobId}] Cookies successfully loaded into browser context`,
+      );
+    }
+
     // Create a new page in the context
     const page = await context.newPage();
```
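
A Playwright detail worth flagging (an observation, not part of the commit): `BrowserContext.addCookies()` accepts a cookie only if it carries either a `url` or both a `domain` and a `path`. Since the schema above leaves `domain` and `path` optional, an entry with neither will pass zod validation but be rejected by Playwright at injection time. Both accepted shapes, as a sketch with placeholder names and values:

```ts
import type { BrowserContext } from "playwright";

// Both cookie shapes Playwright accepts; names and values are placeholders.
async function seedCookies(context: BrowserContext) {
  await context.addCookies([
    // Explicit scope: domain + path.
    { name: "session", value: "xxx", domain: ".example.com", path: "/" },
    // Scope derived from a URL.
    { name: "csrf", value: "yyy", url: "https://example.com" },
  ]);
}
```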
````diff
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index aae1ffa3..0f61360f 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -6,8 +6,8 @@ The app is mainly configured by environment variables. All the used environment
 | ------------------------------- | ------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | PORT | No | 3000 | The port on which the web server will listen. DON'T CHANGE THIS IF YOU'RE USING DOCKER, instead changed the docker bound external port. |
 | WORKERS_PORT | No | 0 (Random Port) | The port on which the worker will export its prometheus metrics on `/metrics`. By default it's a random unused port. If you want to utilize those metrics, fix the port to a value (and export it in docker if you're using docker). |
-| WORKERS_ENABLED_WORKERS | No | Not set | Comma separated list of worker names to enable. If set, only these workers will run. Valid values: crawler,inference,search,tidyAssets,video,feed,assetPreprocessing,webhook,ruleEngine.  |
-| WORKERS_DISABLED_WORKERS | No | Not set | Comma separated list of worker names to disable. Takes precedence over `WORKERS_ENABLED_WORKERS`.  |
+| WORKERS_ENABLED_WORKERS | No | Not set | Comma separated list of worker names to enable. If set, only these workers will run. Valid values: crawler,inference,search,tidyAssets,video,feed,assetPreprocessing,webhook,ruleEngine. |
+| WORKERS_DISABLED_WORKERS | No | Not set | Comma separated list of worker names to disable. Takes precedence over `WORKERS_ENABLED_WORKERS`. |
 | DATA_DIR | Yes | Not set | The path for the persistent data directory. This is where the db lives. Assets are stored here by default unless `ASSETS_DIR` is set. |
 | ASSETS_DIR | No | Not set | The path where crawled assets will be stored. If not set, defaults to `${DATA_DIR}/assets`. |
 | NEXTAUTH_URL | Yes | Not set | Should point to the address of your server. The app will function without it, but will redirect you to wrong addresses on signout for example. |
@@ -129,6 +129,37 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic tagging
 | CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC | No | 600 | How long to wait for the video download to finish |
 | CRAWLER_ENABLE_ADBLOCKER | No | true | Whether to enable an adblocker in the crawler or not. If you're facing troubles downloading the adblocking lists on worker startup, you can disable this. |
 | CRAWLER_YTDLP_ARGS | No | [] | Include additional yt-dlp arguments to be passed at crawl time separated by %%: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options |
+| BROWSER_COOKIE_PATH | No | Not set | Path to a JSON file containing cookies to be loaded into the browser context. The file should be an array of cookie objects, each with name and value (required), and optional fields like domain, path, expires, httpOnly, secure, and sameSite (e.g., `[{"name": "session", "value": "xxx", "domain": ".example.com"}]`). |
+
+<details>
+
+  <summary>More info on BROWSER_COOKIE_PATH</summary>
+
+BROWSER_COOKIE_PATH specifies the path to a JSON file containing cookies to be loaded into the browser context for crawling.
+
+The JSON file must be an array of cookie objects, each with:
+- name: The cookie name (required).
+- value: The cookie value (required).
+- Optional fields: domain, path, expires, httpOnly, secure, sameSite (values: "Strict", "Lax", or "None").
+
+Example JSON file:
+
+```json
+[
+  {
+    "name": "session",
+    "value": "xxx",
+    "domain": ".example.com",
+    "path": "/",
+    "expires": 1735689600,
+    "httpOnly": true,
+    "secure": true,
+    "sameSite": "Lax"
+  }
+]
+```
+
+</details>
 
 ## OCR Configs
 
````

```diff
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 3cc65f4c..99a43da7 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -81,6 +81,7 @@ const allEnv = z.object({
   BROWSER_WEB_URL: z.string().optional(),
   BROWSER_WEBSOCKET_URL: z.string().optional(),
   BROWSER_CONNECT_ONDEMAND: stringBool("false"),
+  BROWSER_COOKIE_PATH: z.string().optional(),
   CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
   CRAWLER_NAVIGATE_TIMEOUT_SEC: z.coerce.number().default(30),
   CRAWLER_NUM_WORKERS: z.coerce.number().default(1),
@@ -242,6 +243,7 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
       browserWebUrl: val.BROWSER_WEB_URL,
       browserWebSocketUrl: val.BROWSER_WEBSOCKET_URL,
       browserConnectOnDemand: val.BROWSER_CONNECT_ONDEMAND,
+      browserCookiePath: val.BROWSER_COOKIE_PATH,
       jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
       navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
       downloadBannerImage: val.CRAWLER_DOWNLOAD_BANNER_IMAGE,
```
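
To round out the error path in `loadCookiesFromFile`, a small sketch of how a malformed cookie file surfaces: `cookiesSchema.parse` throws a `ZodError` whose `errors` array lists one entry per problem, and that array is what lands in the `[Crawler] Invalid cookie file format:` log line. The schema here is trimmed to the required fields for brevity.

```ts
import { z } from "zod";

// Trimmed version of the worker's schema: only the required fields.
const cookiesSchema = z.array(
  z.object({ name: z.string(), value: z.string() }),
);

try {
  // Two problems: `name` has the wrong type and `value` is missing.
  cookiesSchema.parse([{ name: 42 }]);
} catch (error) {
  if (error instanceof z.ZodError) {
    // Each issue carries a path like [0, "name"] and a message such as
    // "Expected string, received number".
    console.error(error.errors);
  }
}
```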
