diff options
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/workers/utils.ts | 55 | ||||
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 50 | ||||
| -rw-r--r-- | apps/workers/workers/feedWorker.ts | 3 |
3 files changed, 58 insertions, 50 deletions
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts index 2f56d3f0..55204570 100644 --- a/apps/workers/utils.ts +++ b/apps/workers/utils.ts @@ -1,3 +1,9 @@ +import { HttpProxyAgent } from "http-proxy-agent"; +import { HttpsProxyAgent } from "https-proxy-agent"; +import fetch from "node-fetch"; + +import serverConfig from "@karakeep/shared/config"; + export function withTimeout<T, Ret>( func: (param: T) => Promise<Ret>, timeoutSec: number, @@ -14,3 +20,52 @@ export function withTimeout<T, Ret>( ]); }; } + +function getProxyAgent(url: string) { + const { proxy } = serverConfig; + + if (!proxy.httpProxy && !proxy.httpsProxy) { + return undefined; + } + + const urlObj = new URL(url); + const protocol = urlObj.protocol; + + // Check if URL should bypass proxy + if (proxy.noProxy) { + const noProxyList = proxy.noProxy.split(",").map((host) => host.trim()); + const hostname = urlObj.hostname; + + for (const noProxyHost of noProxyList) { + if ( + noProxyHost === hostname || + (noProxyHost.startsWith(".") && hostname.endsWith(noProxyHost)) || + hostname.endsWith("." + noProxyHost) + ) { + return undefined; + } + } + } + + if (protocol === "https:" && proxy.httpsProxy) { + return new HttpsProxyAgent(proxy.httpsProxy); + } else if (protocol === "http:" && proxy.httpProxy) { + return new HttpProxyAgent(proxy.httpProxy); + } else if (proxy.httpProxy) { + // Fallback to HTTP proxy for HTTPS if HTTPS proxy not configured + return new HttpProxyAgent(proxy.httpProxy); + } + + return undefined; +} + +export const fetchWithProxy = ( + url: string, + options: Record<string, unknown> = {}, +) => { + const agent = getProxyAgent(url); + if (agent) { + options.agent = agent; + } + return fetch(url, options); +}; diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index 2aaab776..625c92d9 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -9,8 +9,6 @@ import DOMPurify from "dompurify"; import { eq } from "drizzle-orm"; import { execa } from "execa"; import { exitAbortController } from "exit"; -import { HttpProxyAgent } from "http-proxy-agent"; -import { HttpsProxyAgent } from "https-proxy-agent"; import { JSDOM, VirtualConsole } from "jsdom"; import { DequeuedJob, EnqueueOptions, Runner } from "liteque"; import metascraper from "metascraper"; @@ -25,10 +23,10 @@ import metascraperTitle from "metascraper-title"; import metascraperTwitter from "metascraper-twitter"; import metascraperUrl from "metascraper-url"; import { workerStatsCounter } from "metrics"; -import fetch from "node-fetch"; import { Browser, BrowserContextOptions } from "playwright"; import { chromium } from "playwright-extra"; import StealthPlugin from "puppeteer-extra-plugin-stealth"; +import { fetchWithProxy } from "utils"; import { getBookmarkDetails, updateAsset } from "workerUtils"; import type { ZCrawlLinkRequest } from "@karakeep/shared/queues"; @@ -86,44 +84,6 @@ const metascraperParser = metascraper([ metascraperUrl(), ]); -function getProxyAgent(url: string) { - const { proxy } = serverConfig; - - if (!proxy.httpProxy && !proxy.httpsProxy) { - return undefined; - } - - const urlObj = new URL(url); - const protocol = urlObj.protocol; - - // Check if URL should bypass proxy - if (proxy.noProxy) { - const noProxyList = proxy.noProxy.split(",").map((host) => host.trim()); - const hostname = urlObj.hostname; - - for (const noProxyHost of noProxyList) { - if ( - noProxyHost === hostname || - (noProxyHost.startsWith(".") && hostname.endsWith(noProxyHost)) || - hostname.endsWith("." + noProxyHost) - ) { - return undefined; - } - } - } - - if (protocol === "https:" && proxy.httpsProxy) { - return new HttpsProxyAgent(proxy.httpsProxy); - } else if (protocol === "http:" && proxy.httpProxy) { - return new HttpProxyAgent(proxy.httpProxy); - } else if (proxy.httpProxy) { - // Fallback to HTTP proxy for HTTPS if HTTPS proxy not configured - return new HttpProxyAgent(proxy.httpProxy); - } - - return undefined; -} - function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] { const { proxy } = serverConfig; @@ -148,14 +108,6 @@ function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] { }; } -const fetchWithProxy = (url: string, options: Record<string, unknown> = {}) => { - const agent = getProxyAgent(url); - if (agent) { - options.agent = agent; - } - return fetch(url, options); -}; - let globalBrowser: Browser | undefined; let globalBlocker: PlaywrightBlocker | undefined; // Guards the interactions with the browser instance. diff --git a/apps/workers/workers/feedWorker.ts b/apps/workers/workers/feedWorker.ts index 74b5f65c..2ece4890 100644 --- a/apps/workers/workers/feedWorker.ts +++ b/apps/workers/workers/feedWorker.ts @@ -4,6 +4,7 @@ import { workerStatsCounter } from "metrics"; import cron from "node-cron"; import Parser from "rss-parser"; import { buildImpersonatingTRPCClient } from "trpc"; +import { fetchWithProxy } from "utils"; import { z } from "zod"; import type { ZFeedRequestSchema } from "@karakeep/shared/queues"; @@ -111,7 +112,7 @@ async function run(req: DequeuedJob<ZFeedRequestSchema>) { `[feed][${jobId}] Starting fetching feed "${feed.name}" (${feed.id}) ...`, ); - const response = await fetch(feed.url, { + const response = await fetchWithProxy(feed.url, { signal: AbortSignal.timeout(5000), headers: { UserAgent: |
