Diffstat (limited to 'apps/workers')
-rw-r--r--  apps/workers/utils.ts                  | 55
-rw-r--r--  apps/workers/workers/crawlerWorker.ts  | 50
-rw-r--r--  apps/workers/workers/feedWorker.ts     |  3
3 files changed, 58 insertions, 50 deletions
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index 2f56d3f0..55204570 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -1,3 +1,9 @@
+import { HttpProxyAgent } from "http-proxy-agent";
+import { HttpsProxyAgent } from "https-proxy-agent";
+import fetch from "node-fetch";
+
+import serverConfig from "@karakeep/shared/config";
+
export function withTimeout<T, Ret>(
func: (param: T) => Promise<Ret>,
timeoutSec: number,
@@ -14,3 +20,52 @@ export function withTimeout<T, Ret>(
]);
};
}
+
+function getProxyAgent(url: string) {
+ const { proxy } = serverConfig;
+
+ if (!proxy.httpProxy && !proxy.httpsProxy) {
+ return undefined;
+ }
+
+ const urlObj = new URL(url);
+ const protocol = urlObj.protocol;
+
+ // Check if URL should bypass proxy
+ if (proxy.noProxy) {
+ const noProxyList = proxy.noProxy.split(",").map((host) => host.trim());
+ const hostname = urlObj.hostname;
+
+ for (const noProxyHost of noProxyList) {
+ if (
+ noProxyHost === hostname ||
+ (noProxyHost.startsWith(".") && hostname.endsWith(noProxyHost)) ||
+ hostname.endsWith("." + noProxyHost)
+ ) {
+ return undefined;
+ }
+ }
+ }
+
+ if (protocol === "https:" && proxy.httpsProxy) {
+ return new HttpsProxyAgent(proxy.httpsProxy);
+ } else if (protocol === "http:" && proxy.httpProxy) {
+ return new HttpProxyAgent(proxy.httpProxy);
+ } else if (proxy.httpProxy) {
+ // Fallback to HTTP proxy for HTTPS if HTTPS proxy not configured
+ return new HttpProxyAgent(proxy.httpProxy);
+ }
+
+ return undefined;
+}
+
+export const fetchWithProxy = (
+ url: string,
+ options: Record<string, unknown> = {},
+) => {
+ const agent = getProxyAgent(url);
+ if (agent) {
+ options.agent = agent;
+ }
+ return fetch(url, options);
+};
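
Usage-wise, fetchWithProxy is a drop-in for node-fetch: callers pass the same options object, and the proxy agent is only attached when a proxy is configured and the host is not excluded by noProxy. A minimal sketch, assuming hypothetical settings httpProxy = "http://proxy.internal:3128" and noProxy = "localhost,.example.com" in serverConfig.proxy (URLs below are placeholders):

import { fetchWithProxy } from "utils";

async function probe() {
  // Goes through the proxy agent: the host matches no noProxy entry.
  const external = await fetchWithProxy("https://example.org/feed.xml", {
    signal: AbortSignal.timeout(5000),
  });

  // Bypasses the proxy: "api.example.com" matches the ".example.com" suffix rule.
  const internal = await fetchWithProxy("https://api.example.com/health");

  return { external: external.status, internal: internal.status };
}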
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 2aaab776..625c92d9 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -9,8 +9,6 @@ import DOMPurify from "dompurify";
import { eq } from "drizzle-orm";
import { execa } from "execa";
import { exitAbortController } from "exit";
-import { HttpProxyAgent } from "http-proxy-agent";
-import { HttpsProxyAgent } from "https-proxy-agent";
import { JSDOM, VirtualConsole } from "jsdom";
import { DequeuedJob, EnqueueOptions, Runner } from "liteque";
import metascraper from "metascraper";
@@ -25,10 +23,10 @@ import metascraperTitle from "metascraper-title";
import metascraperTwitter from "metascraper-twitter";
import metascraperUrl from "metascraper-url";
import { workerStatsCounter } from "metrics";
-import fetch from "node-fetch";
import { Browser, BrowserContextOptions } from "playwright";
import { chromium } from "playwright-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
+import { fetchWithProxy } from "utils";
import { getBookmarkDetails, updateAsset } from "workerUtils";
import type { ZCrawlLinkRequest } from "@karakeep/shared/queues";
@@ -86,44 +84,6 @@ const metascraperParser = metascraper([
metascraperUrl(),
]);
-function getProxyAgent(url: string) {
- const { proxy } = serverConfig;
-
- if (!proxy.httpProxy && !proxy.httpsProxy) {
- return undefined;
- }
-
- const urlObj = new URL(url);
- const protocol = urlObj.protocol;
-
- // Check if URL should bypass proxy
- if (proxy.noProxy) {
- const noProxyList = proxy.noProxy.split(",").map((host) => host.trim());
- const hostname = urlObj.hostname;
-
- for (const noProxyHost of noProxyList) {
- if (
- noProxyHost === hostname ||
- (noProxyHost.startsWith(".") && hostname.endsWith(noProxyHost)) ||
- hostname.endsWith("." + noProxyHost)
- ) {
- return undefined;
- }
- }
- }
-
- if (protocol === "https:" && proxy.httpsProxy) {
- return new HttpsProxyAgent(proxy.httpsProxy);
- } else if (protocol === "http:" && proxy.httpProxy) {
- return new HttpProxyAgent(proxy.httpProxy);
- } else if (proxy.httpProxy) {
- // Fallback to HTTP proxy for HTTPS if HTTPS proxy not configured
- return new HttpProxyAgent(proxy.httpProxy);
- }
-
- return undefined;
-}
-
function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
const { proxy } = serverConfig;
@@ -148,14 +108,6 @@ function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
};
}
-const fetchWithProxy = (url: string, options: Record<string, unknown> = {}) => {
- const agent = getProxyAgent(url);
- if (agent) {
- options.agent = agent;
- }
- return fetch(url, options);
-};
-
let globalBrowser: Browser | undefined;
let globalBlocker: PlaywrightBlocker | undefined;
// Guards the interactions with the browser instance.
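
With the duplicated helpers gone, the crawler imports fetchWithProxy from utils and keeps only its Playwright-specific getPlaywrightProxyConfig, whose body is unchanged and not shown in this hunk. As a rough, hypothetical sketch of how serverConfig.proxy values typically map onto Playwright's proxy option (assumed logic, not the project's actual function):

import { BrowserContextOptions } from "playwright";

// Hypothetical mapping; field names mirror serverConfig.proxy from the diff above.
function sketchPlaywrightProxy(cfg: {
  httpProxy?: string;
  httpsProxy?: string;
  noProxy?: string;
}): BrowserContextOptions["proxy"] {
  const server = cfg.httpsProxy ?? cfg.httpProxy;
  if (!server) {
    return undefined;
  }
  return {
    server, // e.g. "http://proxy.internal:3128"
    bypass: cfg.noProxy, // Playwright accepts a comma-separated bypass list here
  };
}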
diff --git a/apps/workers/workers/feedWorker.ts b/apps/workers/workers/feedWorker.ts
index 74b5f65c..2ece4890 100644
--- a/apps/workers/workers/feedWorker.ts
+++ b/apps/workers/workers/feedWorker.ts
@@ -4,6 +4,7 @@ import { workerStatsCounter } from "metrics";
import cron from "node-cron";
import Parser from "rss-parser";
import { buildImpersonatingTRPCClient } from "trpc";
+import { fetchWithProxy } from "utils";
import { z } from "zod";
import type { ZFeedRequestSchema } from "@karakeep/shared/queues";
@@ -111,7 +112,7 @@ async function run(req: DequeuedJob<ZFeedRequestSchema>) {
`[feed][${jobId}] Starting fetching feed "${feed.name}" (${feed.id}) ...`,
);
- const response = await fetch(feed.url, {
+ const response = await fetchWithProxy(feed.url, {
signal: AbortSignal.timeout(5000),
headers: {
UserAgent: