diff options
| author | Mohamed Bassem <me@mbassem.com> | 2025-09-07 15:47:38 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-07 15:47:38 +0100 |
| commit | c57fd5137cc29870667777a371a4d1fcdf69436b (patch) | |
| tree | 845bec5a60ee2b43fc33d653965a6571fa92d84b /apps/workers | |
| parent | 492b15203807b4ceb00af4b301958344cc5a668f (diff) | |
| download | karakeep-c57fd5137cc29870667777a371a4d1fcdf69436b.tar.zst | |
feat: Add cookie support for browser page access
* feat: Add cookie support for browser page access
Implemented cookie functionality for browser page access, including BROWSER_COOKIE_PATH configuration to specify the cookies JSON file path.
* fix the docs
---------
Co-authored-by: lizz <lizong1204@gmail.com>
Diffstat (limited to 'apps/workers')
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 59 |
1 file changed, 59 insertions, 0 deletions
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index e011b826..79d8e06a 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -28,6 +28,7 @@ import { chromium } from "playwright-extra"; import StealthPlugin from "puppeteer-extra-plugin-stealth"; import { fetchWithProxy } from "utils"; import { getBookmarkDetails, updateAsset } from "workerUtils"; +import { z } from "zod"; import type { ZCrawlLinkRequest } from "@karakeep/shared/queues"; import { db } from "@karakeep/db"; @@ -95,6 +96,30 @@ const metascraperParser = metascraper([ metascraperUrl(), ]); +interface Cookie { + name: string; + value: string; + domain?: string; + path?: string; + expires?: number; + httpOnly?: boolean; + secure?: boolean; + sameSite?: "Strict" | "Lax" | "None"; +} + +const cookieSchema = z.object({ + name: z.string(), + value: z.string(), + domain: z.string().optional(), + path: z.string().optional(), + expires: z.number().optional(), + httpOnly: z.boolean().optional(), + secure: z.boolean().optional(), + sameSite: z.enum(["Strict", "Lax", "None"]).optional(), +}); + +const cookiesSchema = z.array(cookieSchema); + function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] { const { proxy } = serverConfig; @@ -121,6 +146,8 @@ function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] { let globalBrowser: Browser | undefined; let globalBlocker: PlaywrightBlocker | undefined; +// Global variable to store parsed cookies +let globalCookies: Cookie[] = []; // Guards the interactions with the browser instance. // This is needed given that most of the browser APIs are async. 
const browserMutex = new Mutex(); @@ -252,10 +279,35 @@ export class CrawlerWorker { }, ); + await loadCookiesFromFile(); + return worker; } } +async function loadCookiesFromFile(): Promise<void> { + try { + const path = serverConfig.crawler.browserCookiePath; + if (!path) { + logger.info( + "[Crawler] Not defined in the server configuration BROWSER_COOKIE_PATH", + ); + return; + } + const data = await fs.readFile(path, "utf8"); + const cookies = JSON.parse(data); + globalCookies = cookiesSchema.parse(cookies); + } catch (error) { + logger.error("Failed to read or parse cookies file:", error); + if (error instanceof z.ZodError) { + logger.error("[Crawler] Invalid cookie file format:", error.errors); + } else { + logger.error("[Crawler] Failed to read or parse cookies file:", error); + } + throw error; + } +} + type DBAssetType = typeof assets.$inferInsert; async function changeBookmarkStatus( @@ -352,6 +404,13 @@ async function crawlPage( proxy: getPlaywrightProxyConfig(), }); try { + if (globalCookies.length > 0) { + await context.addCookies(globalCookies); + logger.info( + `[Crawler][${jobId}] Cookies successfully loaded into browser context`, + ); + } + // Create a new page in the context const page = await context.newPage(); |
