diff options
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 59 |
1 file changed, 59 insertions, 0 deletions
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index e011b826..79d8e06a 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -28,6 +28,7 @@ import { chromium } from "playwright-extra"; import StealthPlugin from "puppeteer-extra-plugin-stealth"; import { fetchWithProxy } from "utils"; import { getBookmarkDetails, updateAsset } from "workerUtils"; +import { z } from "zod"; import type { ZCrawlLinkRequest } from "@karakeep/shared/queues"; import { db } from "@karakeep/db"; @@ -95,6 +96,30 @@ const metascraperParser = metascraper([ metascraperUrl(), ]); +interface Cookie { + name: string; + value: string; + domain?: string; + path?: string; + expires?: number; + httpOnly?: boolean; + secure?: boolean; + sameSite?: "Strict" | "Lax" | "None"; +} + +const cookieSchema = z.object({ + name: z.string(), + value: z.string(), + domain: z.string().optional(), + path: z.string().optional(), + expires: z.number().optional(), + httpOnly: z.boolean().optional(), + secure: z.boolean().optional(), + sameSite: z.enum(["Strict", "Lax", "None"]).optional(), +}); + +const cookiesSchema = z.array(cookieSchema); + function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] { const { proxy } = serverConfig; @@ -121,6 +146,8 @@ function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] { let globalBrowser: Browser | undefined; let globalBlocker: PlaywrightBlocker | undefined; +// Global variable to store parsed cookies +let globalCookies: Cookie[] = []; // Guards the interactions with the browser instance. // This is needed given that most of the browser APIs are async. 
const browserMutex = new Mutex(); @@ -252,10 +279,35 @@ export class CrawlerWorker { }, ); + await loadCookiesFromFile(); + return worker; } } +async function loadCookiesFromFile(): Promise<void> { + try { + const path = serverConfig.crawler.browserCookiePath; + if (!path) { + logger.info( + "[Crawler] Not defined in the server configuration BROWSER_COOKIE_PATH", + ); + return; + } + const data = await fs.readFile(path, "utf8"); + const cookies = JSON.parse(data); + globalCookies = cookiesSchema.parse(cookies); + } catch (error) { + logger.error("Failed to read or parse cookies file:", error); + if (error instanceof z.ZodError) { + logger.error("[Crawler] Invalid cookie file format:", error.errors); + } else { + logger.error("[Crawler] Failed to read or parse cookies file:", error); + } + throw error; + } +} + type DBAssetType = typeof assets.$inferInsert; async function changeBookmarkStatus( @@ -352,6 +404,13 @@ async function crawlPage( proxy: getPlaywrightProxyConfig(), }); try { + if (globalCookies.length > 0) { + await context.addCookies(globalCookies); + logger.info( + `[Crawler][${jobId}] Cookies successfully loaded into browser context`, + ); + } + // Create a new page in the context const page = await context.newPage(); |
