From 360ef9dbbe68f2b87fcb59ff0100de7527cc88ba Mon Sep 17 00:00:00 2001 From: Mohamed Bassem Date: Sun, 13 Jul 2025 01:04:55 +0000 Subject: feat: Add proper proxy support. fixes #1265 --- apps/workers/package.json | 2 + apps/workers/workers/crawlerWorker.ts | 94 +++++++++++++++++++++++++++++++---- docs/docs/03-configuration.md | 15 ++++++ packages/shared/config.ts | 10 ++++ pnpm-lock.yaml | 6 +++ 5 files changed, 118 insertions(+), 9 deletions(-) diff --git a/apps/workers/package.json b/apps/workers/package.json index 93a26718..43e479c8 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -16,6 +16,8 @@ "dotenv": "^16.4.1", "drizzle-orm": "^0.44.2", "execa": "9.3.1", + "http-proxy-agent": "^7.0.2", + "https-proxy-agent": "^7.0.6", "jsdom": "^24.0.0", "liteque": "^0.4.1", "metascraper": "^5.46.18", diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index edd1d8f1..05bce103 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -9,6 +9,8 @@ import DOMPurify from "dompurify"; import { eq } from "drizzle-orm"; import { execa } from "execa"; import { isShuttingDown } from "exit"; +import { HttpProxyAgent } from "http-proxy-agent"; +import { HttpsProxyAgent } from "https-proxy-agent"; import { JSDOM, VirtualConsole } from "jsdom"; import { DequeuedJob, EnqueueOptions, Runner } from "liteque"; import metascraper from "metascraper"; @@ -23,7 +25,7 @@ import metascraperTitle from "metascraper-title"; import metascraperTwitter from "metascraper-twitter"; import metascraperUrl from "metascraper-url"; import fetch from "node-fetch"; -import { Browser } from "playwright"; +import { Browser, BrowserContextOptions } from "playwright"; import { chromium } from "playwright-extra"; import StealthPlugin from "puppeteer-extra-plugin-stealth"; import { withTimeout } from "utils"; @@ -85,6 +87,76 @@ const metascraperParser = metascraper([ metascraperUrl(), ]); +function getProxyAgent(url: string) { + const { proxy } = serverConfig; + + if (!proxy.httpProxy && !proxy.httpsProxy) { + return undefined; + } + + const urlObj = new URL(url); + const protocol = urlObj.protocol; + + // Check if URL should bypass proxy + if (proxy.noProxy) { + const noProxyList = proxy.noProxy.split(",").map((host) => host.trim()); + const hostname = urlObj.hostname; + + for (const noProxyHost of noProxyList) { + if ( + noProxyHost === hostname || + (noProxyHost.startsWith(".") && hostname.endsWith(noProxyHost)) || + hostname.endsWith("." + noProxyHost) + ) { + return undefined; + } + } + } + + if (protocol === "https:" && proxy.httpsProxy) { + return new HttpsProxyAgent(proxy.httpsProxy); + } else if (protocol === "http:" && proxy.httpProxy) { + return new HttpProxyAgent(proxy.httpProxy); + } else if (proxy.httpProxy) { + // Fallback to HTTP proxy for HTTPS if HTTPS proxy not configured + return new HttpProxyAgent(proxy.httpProxy); + } + + return undefined; +} + +function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] { + const { proxy } = serverConfig; + + if (!proxy.httpProxy && !proxy.httpsProxy) { + return undefined; + } + + // Use HTTPS proxy if available, otherwise fall back to HTTP proxy + const proxyUrl = proxy.httpsProxy || proxy.httpProxy; + if (!proxyUrl) { + // Unreachable, but TypeScript doesn't know that + return undefined; + } + + const parsed = new URL(proxyUrl); + + return { + server: proxyUrl, + username: parsed.username, + password: parsed.password, + bypass: proxy.noProxy, + }; +} + +const fetchWithProxy = (url: string, options: Record = {}) => { + const agent = getProxyAgent(url); + if (agent) { + options.agent = agent; + } + return fetch(url, options); +}; + let globalBrowser: Browser | undefined; let globalBlocker: PlaywrightBlocker | undefined; // Guards the interactions with the browser instance. @@ -163,11 +235,14 @@ export class CrawlerWorker { if (serverConfig.crawler.enableAdblocker) { try { logger.info("[crawler] Loading adblocker ..."); - globalBlocker = await PlaywrightBlocker.fromPrebuiltFull(fetch, { - path: path.join(os.tmpdir(), "karakeep_adblocker.bin"), - read: fs.readFile, - write: fs.writeFile, - }); + globalBlocker = await PlaywrightBlocker.fromPrebuiltFull( + fetchWithProxy, + { + path: path.join(os.tmpdir(), "karakeep_adblocker.bin"), + read: fs.readFile, + write: fs.writeFile, + }, + ); } catch (e) { logger.error( `[crawler] Failed to load adblocker. Will not be blocking ads: ${e}`, @@ -258,7 +333,7 @@ async function browserlessCrawlPage( logger.info( `[Crawler][${jobId}] Running in browserless mode. Will do a plain http request to "${url}". Screenshots will be disabled.`, ); - const response = await fetch(url, { + const response = await fetchWithProxy(url, { signal: AbortSignal.any([AbortSignal.timeout(5000), abortSignal]), }); logger.info( @@ -296,6 +371,7 @@ async function crawlPage( viewport: { width: 1440, height: 900 }, userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + proxy: getPlaywrightProxyConfig(), }); try { // Create a new page in the context @@ -479,7 +555,7 @@ async function downloadAndStoreFile( ) { try { logger.info(`[Crawler][${jobId}] Downloading ${fileType} from "${url}"`); - const response = await fetch(url, { + const response = await fetchWithProxy(url, { signal: abortSignal, }); if (!response.ok) { @@ -617,7 +693,7 @@ async function getContentType( logger.info( `[Crawler][${jobId}] Attempting to determine the content-type for the url ${url}`, ); - const response = await fetch(url, { + const response = await fetchWithProxy(url, { method: "HEAD", signal: AbortSignal.any([AbortSignal.timeout(5000), abortSignal]), }); diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md index 5f5d79ef..54a077b5 100644 --- a/docs/docs/03-configuration.md +++ b/docs/docs/03-configuration.md @@ -183,3 +183,18 @@ Karakeep can send emails for various purposes such as email verification during | SMTP_USER | No | Not set | The username for SMTP authentication. Usually your email address. | | SMTP_PASSWORD | No | Not set | The password for SMTP authentication. For services like Gmail, use an app-specific password. | | SMTP_FROM | No | Not set | The "from" email address that will appear in sent emails. This should be a valid email address. | + + +## Proxy Configuration + +If your Karakeep instance needs to connect through a proxy server, you can configure the following settings: + +| Name | Required | Default | Description | +| ----------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------- | +| HTTP_PROXY | No | Not set | HTTP proxy server URL for outgoing HTTP requests (e.g., `http://proxy.example.com:8080`) | +| HTTPS_PROXY | No | Not set | HTTPS proxy server URL for outgoing HTTPS requests (e.g., `http://proxy.example.com:8080`) | +| NO_PROXY | No | Not set | Comma-separated list of hostnames/IPs that should bypass the proxy (e.g., `localhost,127.0.0.1,.local`) | + +:::info +These proxy settings will be used by the crawler and other components that make outgoing HTTP requests. +::: diff --git a/packages/shared/config.ts b/packages/shared/config.ts index 87914529..ed17bb90 100644 --- a/packages/shared/config.ts +++ b/packages/shared/config.ts @@ -116,6 +116,11 @@ const allEnv = z.object({ // Rate limiting configuration RATE_LIMITING_ENABLED: stringBool("false"), + + // Proxy configuration + HTTP_PROXY: z.string().optional(), + HTTPS_PROXY: z.string().optional(), + NO_PROXY: z.string().optional(), }); const serverConfigSchema = allEnv @@ -232,6 +237,11 @@ const serverConfigSchema = allEnv retryTimes: val.WEBHOOK_RETRY_TIMES, numWorkers: val.WEBHOOK_NUM_WORKERS, }, + proxy: { + httpProxy: val.HTTP_PROXY, + httpsProxy: val.HTTPS_PROXY, + noProxy: val.NO_PROXY, + }, assetPreprocessing: { numWorkers: val.ASSET_PREPROCESSING_NUM_WORKERS, }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 81a4b8b6..796b8a2d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -784,6 +784,12 @@ importers: execa: specifier: 9.3.1 version: 9.3.1 + http-proxy-agent: + specifier: ^7.0.2 + version: 7.0.2 + https-proxy-agent: + specifier: ^7.0.6 + version: 7.0.6(supports-color@10.0.0) jsdom: specifier: ^24.0.0 version: 24.1.3 -- cgit v1.2.3-70-g09d2