Diffstat (limited to 'apps')
-rw-r--r--  apps/workers/workers/crawlerWorker.ts  59
1 file changed, 59 insertions, 0 deletions
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index e011b826..79d8e06a 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -28,6 +28,7 @@ import { chromium } from "playwright-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { fetchWithProxy } from "utils";
import { getBookmarkDetails, updateAsset } from "workerUtils";
+import { z } from "zod";
import type { ZCrawlLinkRequest } from "@karakeep/shared/queues";
import { db } from "@karakeep/db";
@@ -95,6 +96,30 @@ const metascraperParser = metascraper([
metascraperUrl(),
]);
+interface Cookie {
+ name: string;
+ value: string;
+ domain?: string;
+ path?: string;
+ expires?: number;
+ httpOnly?: boolean;
+ secure?: boolean;
+ sameSite?: "Strict" | "Lax" | "None";
+}
+
+const cookieSchema = z.object({
+ name: z.string(),
+ value: z.string(),
+ domain: z.string().optional(),
+ path: z.string().optional(),
+ expires: z.number().optional(),
+ httpOnly: z.boolean().optional(),
+ secure: z.boolean().optional(),
+ sameSite: z.enum(["Strict", "Lax", "None"]).optional(),
+});
+
+const cookiesSchema = z.array(cookieSchema);
+
function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
const { proxy } = serverConfig;
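
For reference, a cookie file that passes this validation could look like the standalone TypeScript sketch below (the schema is repeated so the snippet runs on its own; all cookie names and values are illustrative). As a side note, the hand-written Cookie interface above could also be derived with z.infer<typeof cookieSchema> so the type and the schema cannot drift apart.

import { z } from "zod";

// Schema repeated from the patch so this snippet is self-contained.
const cookieSchema = z.object({
  name: z.string(),
  value: z.string(),
  domain: z.string().optional(),
  path: z.string().optional(),
  expires: z.number().optional(),
  httpOnly: z.boolean().optional(),
  secure: z.boolean().optional(),
  sameSite: z.enum(["Strict", "Lax", "None"]).optional(),
});
const cookiesSchema = z.array(cookieSchema);

// Hypothetical contents of a cookie file; every value is illustrative.
const fileContents = `[
  {
    "name": "session_id",
    "value": "abc123",
    "domain": ".example.com",
    "path": "/",
    "expires": 1735689600,
    "httpOnly": true,
    "secure": true,
    "sameSite": "Lax"
  }
]`;

// parse() returns the typed array or throws a ZodError on mismatch.
const cookies = cookiesSchema.parse(JSON.parse(fileContents));
console.log(cookies[0].name); // "session_id"
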
@@ -121,6 +146,8 @@ function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
let globalBrowser: Browser | undefined;
let globalBlocker: PlaywrightBlocker | undefined;
+// Cookies parsed from the configured cookie file at startup; applied to each new browser context
+let globalCookies: Cookie[] = [];
// Guards the interactions with the browser instance.
// This is needed given that most of the browser APIs are async.
const browserMutex = new Mutex();
@@ -252,10 +279,34 @@ export class CrawlerWorker {
},
);
+ await loadCookiesFromFile();
+
return worker;
}
}
+async function loadCookiesFromFile(): Promise<void> {
+ try {
+ const path = serverConfig.crawler.browserCookiePath;
+ if (!path) {
+ logger.info(
+ "[Crawler] BROWSER_COOKIE_PATH is not set in the server configuration; skipping cookie loading",
+ );
+ return;
+ }
+ const data = await fs.readFile(path, "utf8");
+ const cookies = JSON.parse(data);
+ globalCookies = cookiesSchema.parse(cookies);
+ } catch (error) {
+ if (error instanceof z.ZodError) {
+ logger.error("[Crawler] Invalid cookie file format:", error.errors);
+ } else {
+ logger.error("[Crawler] Failed to read or parse cookies file:", error);
+ }
+ throw error;
+ }
+}
+
type DBAssetType = typeof assets.$inferInsert;
async function changeBookmarkStatus(
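
The catch block above distinguishes schema-validation failures (z.ZodError) from I/O failures, then rethrows, so a malformed cookie file aborts worker startup rather than failing silently. For comparison, zod also offers a non-throwing safeParse that reports the same issues; a minimal sketch, assuming zod v3 and a schema trimmed to two required fields:

import { z } from "zod";

// safeParse returns { success: false, error: ZodError } instead of throwing.
const cookiesSchema = z.array(z.object({ name: z.string(), value: z.string() }));

const result = cookiesSchema.safeParse(JSON.parse('[{"name":"sid"}]')); // "value" missing
if (!result.success) {
  // Each issue carries a path into the offending element,
  // e.g. { path: [0, "value"], message: "Required", ... }
  console.error(result.error.issues);
}
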
@@ -352,6 +404,13 @@ async function crawlPage(
proxy: getPlaywrightProxyConfig(),
});
try {
+ if (globalCookies.length > 0) {
+ await context.addCookies(globalCookies);
+ logger.info(
+ `[Crawler][${jobId}] Cookies successfully loaded into browser context`,
+ );
+ }
+
// Create a new page in the context
const page = await context.newPage();
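
For context, Playwright's context.addCookies requires each cookie to carry either a url or a domain/path pair, which the optional fields in the schema above can satisfy. A standalone sketch of the same seeding pattern (assumes the playwright package is installed; the domain, URL, and cookie values are placeholders):

import { chromium } from "playwright";

// Seed a fresh browser context with cookies before opening any page,
// mirroring the flow in crawlPage above.
async function main() {
  const browser = await chromium.launch();
  const context = await browser.newContext();
  await context.addCookies([
    {
      name: "session_id", // illustrative values only
      value: "abc123",
      domain: ".example.com",
      path: "/",
      secure: true,
      sameSite: "Lax",
    },
  ]);
  const page = await context.newPage();
  await page.goto("https://example.com"); // request now carries session_id
  await browser.close();
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
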