aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers
diff options
context:
space:
mode:
authorMohamed Bassem <me@mbassem.com>2025-07-13 01:04:55 +0000
committerMohamed Bassem <me@mbassem.com>2025-07-13 01:05:54 +0000
commit360ef9dbbe68f2b87fcb59ff0100de7527cc88ba (patch)
tree2adcca8a38444dc352dd850c68e0bd33b363517e /apps/workers
parent1105b4a41b2a91a24a164c70264b294a80afe97b (diff)
downloadkarakeep-360ef9dbbe68f2b87fcb59ff0100de7527cc88ba.tar.zst
feat: Add proper proxy support. fixes #1265
Diffstat (limited to 'apps/workers')
-rw-r--r--apps/workers/package.json2
-rw-r--r--apps/workers/workers/crawlerWorker.ts94
2 files changed, 87 insertions, 9 deletions
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 93a26718..43e479c8 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -16,6 +16,8 @@
"dotenv": "^16.4.1",
"drizzle-orm": "^0.44.2",
"execa": "9.3.1",
+ "http-proxy-agent": "^7.0.2",
+ "https-proxy-agent": "^7.0.6",
"jsdom": "^24.0.0",
"liteque": "^0.4.1",
"metascraper": "^5.46.18",
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index edd1d8f1..05bce103 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -9,6 +9,8 @@ import DOMPurify from "dompurify";
import { eq } from "drizzle-orm";
import { execa } from "execa";
import { isShuttingDown } from "exit";
+import { HttpProxyAgent } from "http-proxy-agent";
+import { HttpsProxyAgent } from "https-proxy-agent";
import { JSDOM, VirtualConsole } from "jsdom";
import { DequeuedJob, EnqueueOptions, Runner } from "liteque";
import metascraper from "metascraper";
@@ -23,7 +25,7 @@ import metascraperTitle from "metascraper-title";
import metascraperTwitter from "metascraper-twitter";
import metascraperUrl from "metascraper-url";
import fetch from "node-fetch";
-import { Browser } from "playwright";
+import { Browser, BrowserContextOptions } from "playwright";
import { chromium } from "playwright-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { withTimeout } from "utils";
@@ -85,6 +87,76 @@ const metascraperParser = metascraper([
metascraperUrl(),
]);
+function getProxyAgent(url: string) {
+ const { proxy } = serverConfig;
+
+ if (!proxy.httpProxy && !proxy.httpsProxy) {
+ return undefined;
+ }
+
+ const urlObj = new URL(url);
+ const protocol = urlObj.protocol;
+
+ // Check if URL should bypass proxy
+ if (proxy.noProxy) {
+ const noProxyList = proxy.noProxy.split(",").map((host) => host.trim());
+ const hostname = urlObj.hostname;
+
+ for (const noProxyHost of noProxyList) {
+ if (
+ noProxyHost === hostname ||
+ (noProxyHost.startsWith(".") && hostname.endsWith(noProxyHost)) ||
+ hostname.endsWith("." + noProxyHost)
+ ) {
+ return undefined;
+ }
+ }
+ }
+
+ if (protocol === "https:" && proxy.httpsProxy) {
+ return new HttpsProxyAgent(proxy.httpsProxy);
+ } else if (protocol === "http:" && proxy.httpProxy) {
+ return new HttpProxyAgent(proxy.httpProxy);
+ } else if (proxy.httpProxy) {
+ // Fallback to HTTP proxy for HTTPS if HTTPS proxy not configured
+ return new HttpProxyAgent(proxy.httpProxy);
+ }
+
+ return undefined;
+}
+
+function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
+ const { proxy } = serverConfig;
+
+ if (!proxy.httpProxy && !proxy.httpsProxy) {
+ return undefined;
+ }
+
+ // Use HTTPS proxy if available, otherwise fall back to HTTP proxy
+ const proxyUrl = proxy.httpsProxy || proxy.httpProxy;
+ if (!proxyUrl) {
+ // Unreachable, but TypeScript doesn't know that
+ return undefined;
+ }
+
+ const parsed = new URL(proxyUrl);
+
+ return {
+ server: proxyUrl,
+ username: parsed.username,
+ password: parsed.password,
+ bypass: proxy.noProxy,
+ };
+}
+
+const fetchWithProxy = (url: string, options: Record<string, unknown> = {}) => {
+ const agent = getProxyAgent(url);
+ if (agent) {
+ options.agent = agent;
+ }
+ return fetch(url, options);
+};
+
let globalBrowser: Browser | undefined;
let globalBlocker: PlaywrightBlocker | undefined;
// Guards the interactions with the browser instance.
@@ -163,11 +235,14 @@ export class CrawlerWorker {
if (serverConfig.crawler.enableAdblocker) {
try {
logger.info("[crawler] Loading adblocker ...");
- globalBlocker = await PlaywrightBlocker.fromPrebuiltFull(fetch, {
- path: path.join(os.tmpdir(), "karakeep_adblocker.bin"),
- read: fs.readFile,
- write: fs.writeFile,
- });
+ globalBlocker = await PlaywrightBlocker.fromPrebuiltFull(
+ fetchWithProxy,
+ {
+ path: path.join(os.tmpdir(), "karakeep_adblocker.bin"),
+ read: fs.readFile,
+ write: fs.writeFile,
+ },
+ );
} catch (e) {
logger.error(
`[crawler] Failed to load adblocker. Will not be blocking ads: ${e}`,
@@ -258,7 +333,7 @@ async function browserlessCrawlPage(
logger.info(
`[Crawler][${jobId}] Running in browserless mode. Will do a plain http request to "${url}". Screenshots will be disabled.`,
);
- const response = await fetch(url, {
+ const response = await fetchWithProxy(url, {
signal: AbortSignal.any([AbortSignal.timeout(5000), abortSignal]),
});
logger.info(
@@ -296,6 +371,7 @@ async function crawlPage(
viewport: { width: 1440, height: 900 },
userAgent:
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+ proxy: getPlaywrightProxyConfig(),
});
try {
// Create a new page in the context
@@ -479,7 +555,7 @@ async function downloadAndStoreFile(
) {
try {
logger.info(`[Crawler][${jobId}] Downloading ${fileType} from "${url}"`);
- const response = await fetch(url, {
+ const response = await fetchWithProxy(url, {
signal: abortSignal,
});
if (!response.ok) {
@@ -617,7 +693,7 @@ async function getContentType(
logger.info(
`[Crawler][${jobId}] Attempting to determine the content-type for the url ${url}`,
);
- const response = await fetch(url, {
+ const response = await fetchWithProxy(url, {
method: "HEAD",
signal: AbortSignal.any([AbortSignal.timeout(5000), abortSignal]),
});