rcgit

/ karakeep

Commit b63a49fc

SHA b63a49fc3980296c6a6ea6ac0624142e8af94d52
Author Mohamed Bassem <me at mbassem dot com>
Author Date 2025-11-02 17:19 +0000
Committer GitHub <noreply at github dot com>
Commit Date 2025-11-02 17:19 +0000
Parent(s) c6ebceb9f0b1 (diff)
Tree 1b1266f09f78

patch snapshot

fix: Stricter SSRF validation (#2082)
* fix: Stricter SSRF validation

* skip dns resolution if running in proxy context

* more fixes

* Add LRU cache

* change the env variable for internal hostnames

* make dns resolution timeout configurable

* upgrade ipaddr

* handle ipv6

* handle proxy bypass for request interceptor
File + - Graph
A apps/workers/network.ts +419 -0
M apps/workers/package.json +2 -0
M apps/workers/utils.ts +0 -61
M apps/workers/workers/crawlerWorker.ts +54 -27
M apps/workers/workers/feedWorker.ts +1 -1
M apps/workers/workers/videoWorker.ts +29 -7
M apps/workers/workers/webhookWorker.ts +2 -2
M docs/docs/03-configuration.md +6 -5
M packages/shared/config.ts +24 -1
M pnpm-lock.yaml +65 -31
10 file(s) changed, 602 insertions(+), 135 deletions(-)

apps/workers/network.ts

diff --git a/apps/workers/network.ts b/apps/workers/network.ts
new file mode 100644
index 00000000..acfd2439
--- /dev/null
+++ b/apps/workers/network.ts
@@ -0,0 +1,419 @@
+import dns from "node:dns/promises";
+import type { HeadersInit, RequestInit, Response } from "node-fetch";
+import { HttpProxyAgent } from "http-proxy-agent";
+import { HttpsProxyAgent } from "https-proxy-agent";
+import ipaddr from "ipaddr.js";
+import { LRUCache } from "lru-cache";
+import fetch, { Headers } from "node-fetch";
+
+import serverConfig from "@karakeep/shared/config";
+
+const DISALLOWED_IP_RANGES = new Set([
+  // IPv4 ranges
+  "unspecified",
+  "broadcast",
+  "multicast",
+  "linkLocal",
+  "loopback",
+  "private",
+  "reserved",
+  "carrierGradeNat",
+  // IPv6 ranges
+  "uniqueLocal",
+  "6to4", // RFC 3056 - IPv6 transition mechanism
+  "teredo", // RFC 4380 - IPv6 tunneling
+  "benchmarking", // RFC 5180 - benchmarking addresses
+  "deprecated", // RFC 3879 - deprecated IPv6 addresses
+  "discard", // RFC 6666 - discard-only prefix
+]);
+
+// DNS cache with 5 minute TTL and max 1000 entries
+const dnsCache = new LRUCache<string, string[]>({
+  max: 1000,
+  ttl: 5 * 60 * 1000, // 5 minutes in milliseconds
+});
+
+async function resolveHostAddresses(hostname: string): Promise<string[]> {
+  const resolver = new dns.Resolver({
+    timeout: serverConfig.crawler.ipValidation.dnsResolverTimeoutSec * 1000,
+  });
+
+  const results = await Promise.allSettled([
+    resolver.resolve4(hostname),
+    resolver.resolve6(hostname),
+  ]);
+
+  const addresses: string[] = [];
+  const errors: string[] = [];
+
+  for (const result of results) {
+    if (result.status === "fulfilled") {
+      addresses.push(...result.value);
+    } else {
+      const reason = result.reason;
+      if (reason instanceof Error) {
+        errors.push(reason.message);
+      } else {
+        errors.push(String(reason));
+      }
+    }
+  }
+
+  if (addresses.length > 0) {
+    return addresses;
+  }
+
+  const errorMessage =
+    errors.length > 0
+      ? errors.join("; ")
+      : "DNS lookup did not return any A or AAAA records";
+  throw new Error(errorMessage);
+}
+
+function isAddressForbidden(address: string): boolean {
+  if (!ipaddr.isValid(address)) {
+    return true;
+  }
+  const parsed = ipaddr.parse(address);
+  if (
+    parsed.kind() === "ipv6" &&
+    (parsed as ipaddr.IPv6).isIPv4MappedAddress()
+  ) {
+    const mapped = (parsed as ipaddr.IPv6).toIPv4Address();
+    return DISALLOWED_IP_RANGES.has(mapped.range());
+  }
+  return DISALLOWED_IP_RANGES.has(parsed.range());
+}
+
+export type UrlValidationResult =
+  | { ok: true; url: URL }
+  | { ok: false; reason: string };
+
+function hostnameMatchesAnyPattern(
+  hostname: string,
+  patterns: string[],
+): boolean {
+  function hostnameMatchesPattern(hostname: string, pattern: string): boolean {
+    return (
+      pattern === hostname ||
+      (pattern.startsWith(".") && hostname.endsWith(pattern)) ||
+      hostname.endsWith("." + pattern)
+    );
+  }
+
+  for (const pattern of patterns) {
+    if (hostnameMatchesPattern(hostname, pattern)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+function isHostnameAllowedForInternalAccess(hostname: string): boolean {
+  if (!serverConfig.allowedInternalHostnames) {
+    return false;
+  }
+  return hostnameMatchesAnyPattern(
+    hostname,
+    serverConfig.allowedInternalHostnames,
+  );
+}
+
+export async function validateUrl(
+  urlCandidate: string,
+  runningInProxyContext: boolean,
+): Promise<UrlValidationResult> {
+  let parsedUrl: URL;
+  try {
+    parsedUrl = new URL(urlCandidate);
+  } catch (error) {
+    return {
+      ok: false,
+      reason: `Invalid URL "${urlCandidate}": ${
+        error instanceof Error ? error.message : String(error)
+      }`,
+    } as const;
+  }
+
+  if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
+    return {
+      ok: false,
+      reason: `Unsupported protocol for URL: ${parsedUrl.toString()}`,
+    } as const;
+  }
+
+  const hostname = parsedUrl.hostname;
+  if (!hostname) {
+    return {
+      ok: false,
+      reason: `URL ${parsedUrl.toString()} must include a hostname`,
+    } as const;
+  }
+
+  if (isHostnameAllowedForInternalAccess(hostname)) {
+    return { ok: true, url: parsedUrl } as const;
+  }
+
+  if (ipaddr.isValid(hostname)) {
+    if (isAddressForbidden(hostname)) {
+      return {
+        ok: false,
+        reason: `Refusing to access disallowed IP address ${hostname} (requested via ${parsedUrl.toString()})`,
+      } as const;
+    }
+    return { ok: true, url: parsedUrl } as const;
+  }
+
+  if (runningInProxyContext) {
+    // If we're running in a proxy context, we must skip DNS resolution
+    // as the DNS resolution will be handled by the proxy
+    return { ok: true, url: parsedUrl } as const;
+  }
+
+  // Check cache first
+  let records = dnsCache.get(hostname);
+
+  if (!records) {
+    // Cache miss or expired - perform DNS resolution
+    try {
+      records = await resolveHostAddresses(hostname);
+      dnsCache.set(hostname, records);
+    } catch (error) {
+      return {
+        ok: false,
+        reason: `Failed to resolve hostname ${hostname}: ${
+          error instanceof Error ? error.message : String(error)
+        }`,
+      } as const;
+    }
+  }
+
+  if (!records || records.length === 0) {
+    return {
+      ok: false,
+      reason: `DNS lookup for ${hostname} did not return any addresses (requested via ${parsedUrl.toString()})`,
+    } as const;
+  }
+
+  for (const record of records) {
+    if (isAddressForbidden(record)) {
+      return {
+        ok: false,
+        reason: `Refusing to access disallowed resolved address ${record} for host ${hostname}`,
+      } as const;
+    }
+  }
+
+  return { ok: true, url: parsedUrl } as const;
+}
+
+export function getRandomProxy(proxyList: string[]): string {
+  return proxyList[Math.floor(Math.random() * proxyList.length)].trim();
+}
+
+export function matchesNoProxy(url: string, noProxy: string[]) {
+  const urlObj = new URL(url);
+  const hostname = urlObj.hostname;
+  return hostnameMatchesAnyPattern(hostname, noProxy);
+}
+
+export function getProxyAgent(url: string) {
+  const { proxy } = serverConfig;
+
+  if (!proxy.httpProxy && !proxy.httpsProxy) {
+    return undefined;
+  }
+
+  const urlObj = new URL(url);
+  const protocol = urlObj.protocol;
+
+  // Check if URL should bypass proxy
+  if (proxy.noProxy && matchesNoProxy(url, proxy.noProxy)) {
+    return undefined;
+  }
+
+  if (protocol === "https:" && proxy.httpsProxy) {
+    const selectedProxy = getRandomProxy(proxy.httpsProxy);
+    return new HttpsProxyAgent(selectedProxy);
+  } else if (protocol === "http:" && proxy.httpProxy) {
+    const selectedProxy = getRandomProxy(proxy.httpProxy);
+    return new HttpProxyAgent(selectedProxy);
+  } else if (proxy.httpProxy) {
+    const selectedProxy = getRandomProxy(proxy.httpProxy);
+    return new HttpProxyAgent(selectedProxy);
+  }
+
+  return undefined;
+}
+
+function cloneHeaders(init?: HeadersInit): Headers {
+  const headers = new Headers();
+  if (!init) {
+    return headers;
+  }
+  if (init instanceof Headers) {
+    init.forEach((value, key) => {
+      headers.set(key, value);
+    });
+    return headers;
+  }
+
+  if (Array.isArray(init)) {
+    for (const [key, value] of init) {
+      headers.append(key, value);
+    }
+    return headers;
+  }
+
+  for (const [key, value] of Object.entries(init)) {
+    if (Array.isArray(value)) {
+      headers.set(key, value.join(", "));
+    } else if (value !== undefined) {
+      headers.set(key, value);
+    }
+  }
+
+  return headers;
+}
+
+function isRedirectResponse(response: Response): boolean {
+  return (
+    response.status === 301 ||
+    response.status === 302 ||
+    response.status === 303 ||
+    response.status === 307 ||
+    response.status === 308
+  );
+}
+
+export type FetchWithProxyOptions = Omit<
+  RequestInit & {
+    maxRedirects?: number;
+  },
+  "agent"
+>;
+
+interface PreparedFetchOptions {
+  maxRedirects: number;
+  baseHeaders: Headers;
+  method: string;
+  body?: RequestInit["body"];
+  baseOptions: RequestInit;
+}
+
+export function prepareFetchOptions(
+  options: FetchWithProxyOptions = {},
+): PreparedFetchOptions {
+  const {
+    maxRedirects = 5,
+    headers: initHeaders,
+    method: initMethod,
+    body: initBody,
+    redirect: _ignoredRedirect,
+    ...restOptions
+  } = options;
+
+  const baseOptions = restOptions as RequestInit;
+
+  return {
+    maxRedirects,
+    baseHeaders: cloneHeaders(initHeaders),
+    method: initMethod?.toUpperCase?.() ?? "GET",
+    body: initBody,
+    baseOptions,
+  };
+}
+
+interface BuildFetchOptionsInput {
+  method: string;
+  body?: RequestInit["body"];
+  headers: Headers;
+  agent?: RequestInit["agent"];
+  baseOptions: RequestInit;
+}
+
+export function buildFetchOptions({
+  method,
+  body,
+  headers,
+  agent,
+  baseOptions,
+}: BuildFetchOptionsInput): RequestInit {
+  return {
+    ...baseOptions,
+    method,
+    body,
+    headers,
+    agent,
+    redirect: "manual",
+  };
+}
+
+export const fetchWithProxy = async (
+  url: string,
+  options: FetchWithProxyOptions = {},
+) => {
+  const {
+    maxRedirects,
+    baseHeaders,
+    method: preparedMethod,
+    body: preparedBody,
+    baseOptions,
+  } = prepareFetchOptions(options);
+
+  let redirectsRemaining = maxRedirects;
+  let currentUrl = url;
+  let currentMethod = preparedMethod;
+  let currentBody = preparedBody;
+
+  while (true) {
+    const agent = getProxyAgent(currentUrl);
+
+    const validation = await validateUrl(currentUrl, !!agent);
+    if (!validation.ok) {
+      throw new Error(validation.reason);
+    }
+    const requestUrl = validation.url;
+    currentUrl = requestUrl.toString();
+
+    const response = await fetch(
+      currentUrl,
+      buildFetchOptions({
+        method: currentMethod,
+        body: currentBody,
+        headers: baseHeaders,
+        agent,
+        baseOptions,
+      }),
+    );
+
+    if (!isRedirectResponse(response)) {
+      return response;
+    }
+
+    const locationHeader = response.headers.get("location");
+    if (!locationHeader) {
+      return response;
+    }
+
+    if (redirectsRemaining <= 0) {
+      throw new Error(`Too many redirects while fetching ${url}`);
+    }
+
+    const nextUrl = new URL(locationHeader, currentUrl);
+
+    if (
+      response.status === 303 ||
+      ((response.status === 301 || response.status === 302) &&
+        currentMethod !== "GET" &&
+        currentMethod !== "HEAD")
+    ) {
+      currentMethod = "GET";
+      currentBody = undefined;
+      baseHeaders.delete("content-length");
+    }
+
+    currentUrl = nextUrl.toString();
+    redirectsRemaining -= 1;
+  }
+};

apps/workers/package.json

diff --git a/apps/workers/package.json b/apps/workers/package.json
index b02c3bc9..f35a52f4 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -23,8 +23,10 @@
     "hono": "^4.7.10",
     "http-proxy-agent": "^7.0.2",
     "https-proxy-agent": "^7.0.6",
+    "ipaddr.js": "^2.2.0",
     "jsdom": "^24.0.0",
     "liteque": "^0.6.2",
+    "lru-cache": "^11.2.2",
     "metascraper": "^5.49.5",
     "metascraper-amazon": "^5.49.5",
     "metascraper-author": "^5.49.5",

apps/workers/utils.ts

diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index a82dd12d..2f56d3f0 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -1,9 +1,3 @@
-import { HttpProxyAgent } from "http-proxy-agent";
-import { HttpsProxyAgent } from "https-proxy-agent";
-import fetch from "node-fetch";
-
-import serverConfig from "@karakeep/shared/config";
-
 export function withTimeout<T, Ret>(
   func: (param: T) => Promise<Ret>,
   timeoutSec: number,
@@ -20,58 +14,3 @@ export function withTimeout<T, Ret>(
     ]);
   };
 }
-
-export function getRandomProxy(proxyList: string[]): string {
-  return proxyList[Math.floor(Math.random() * proxyList.length)].trim();
-}
-
-function getProxyAgent(url: string) {
-  const { proxy } = serverConfig;
-
-  if (!proxy.httpProxy && !proxy.httpsProxy) {
-    return undefined;
-  }
-
-  const urlObj = new URL(url);
-  const protocol = urlObj.protocol;
-
-  // Check if URL should bypass proxy
-  if (proxy.noProxy) {
-    const noProxyList = proxy.noProxy.split(",").map((host) => host.trim());
-    const hostname = urlObj.hostname;
-
-    for (const noProxyHost of noProxyList) {
-      if (
-        noProxyHost === hostname ||
-        (noProxyHost.startsWith(".") && hostname.endsWith(noProxyHost)) ||
-        hostname.endsWith("." + noProxyHost)
-      ) {
-        return undefined;
-      }
-    }
-  }
-
-  if (protocol === "https:" && proxy.httpsProxy) {
-    const selectedProxy = getRandomProxy(proxy.httpsProxy);
-    return new HttpsProxyAgent(selectedProxy);
-  } else if (protocol === "http:" && proxy.httpProxy) {
-    const selectedProxy = getRandomProxy(proxy.httpProxy);
-    return new HttpProxyAgent(selectedProxy);
-  } else if (proxy.httpProxy) {
-    const selectedProxy = getRandomProxy(proxy.httpProxy);
-    return new HttpProxyAgent(selectedProxy);
-  }
-
-  return undefined;
-}
-
-export const fetchWithProxy = (
-  url: string,
-  options: Record<string, unknown> = {},
-) => {
-  const agent = getProxyAgent(url);
-  if (agent) {
-    options.agent = agent;
-  }
-  return fetch(url, options);
-};

apps/workers/workers/crawlerWorker.ts

diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 33ff2851..70b2e644 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -25,10 +25,15 @@ import metascraperTitle from "metascraper-title";
 import metascraperTwitter from "metascraper-twitter";
 import metascraperUrl from "metascraper-url";
 import { workerStatsCounter } from "metrics";
+import {
+  fetchWithProxy,
+  getRandomProxy,
+  matchesNoProxy,
+  validateUrl,
+} from "network";
 import { Browser, BrowserContextOptions } from "playwright";
 import { chromium } from "playwright-extra";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
-import { fetchWithProxy, getRandomProxy } from "utils";
 import { getBookmarkDetails, updateAsset } from "workerUtils";
 import { z } from "zod";
 
@@ -173,7 +178,7 @@ function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
     server: proxyUrl,
     username: parsed.username,
     password: parsed.password,
-    bypass: proxy.noProxy,
+    bypass: proxy.noProxy?.join(","),
   };
 }
 
@@ -355,22 +360,6 @@ async function changeBookmarkStatus(
     .where(eq(bookmarkLinks.id, bookmarkId));
 }
 
-/**
- * This provides some "basic" protection from malicious URLs. However, all of those
- * can be easily circumvented by pointing dns of origin to localhost, or with
- * redirects.
- */
-function validateUrl(url: string) {
-  const urlParsed = new URL(url);
-  if (urlParsed.protocol != "http:" && urlParsed.protocol != "https:") {
-    throw new Error(`Unsupported URL protocol: ${urlParsed.protocol}`);
-  }
-
-  if (["localhost", "127.0.0.1", "0.0.0.0"].includes(urlParsed.hostname)) {
-    throw new Error(`Link hostname rejected: ${urlParsed.hostname}`);
-  }
-}
-
 async function browserlessCrawlPage(
   jobId: string,
   url: string,
@@ -430,11 +419,15 @@ async function crawlPage(
     return browserlessCrawlPage(jobId, url, abortSignal);
   }
 
+  const proxyConfig = getPlaywrightProxyConfig();
+  const isRunningInProxyContext =
+    proxyConfig !== undefined &&
+    !matchesNoProxy(url, proxyConfig.bypass?.split(",") ?? []);
   const context = await browser.newContext({
     viewport: { width: 1440, height: 900 },
     userAgent:
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-    proxy: getPlaywrightProxyConfig(),
+    proxy: proxyConfig,
   });
 
   try {
@@ -453,8 +446,12 @@ async function crawlPage(
       await globalBlocker.enableBlockingInPage(page);
     }
 
-    // Block audio/video resources
-    await page.route("**/*", (route) => {
+    // Block audio/video resources and disallowed sub-requests
+    await page.route("**/*", async (route) => {
+      if (abortSignal.aborted) {
+        await route.abort("aborted");
+        return;
+      }
       const request = route.request();
       const resourceType = request.resourceType();
 
@@ -464,18 +461,49 @@ async function crawlPage(
         request.headers()["content-type"]?.includes("video/") ||
         request.headers()["content-type"]?.includes("audio/")
       ) {
-        route.abort();
+        await route.abort("aborted");
         return;
       }
 
+      const requestUrl = request.url();
+      const requestIsRunningInProxyContext =
+        proxyConfig !== undefined &&
+        !matchesNoProxy(requestUrl, proxyConfig.bypass?.split(",") ?? []);
+      if (
+        requestUrl.startsWith("http://") ||
+        requestUrl.startsWith("https://")
+      ) {
+        const validation = await validateUrl(
+          requestUrl,
+          requestIsRunningInProxyContext,
+        );
+        if (!validation.ok) {
+          logger.warn(
+            `[Crawler][${jobId}] Blocking sub-request to disallowed URL "${requestUrl}": ${validation.reason}`,
+          );
+          await route.abort("blockedbyclient");
+          return;
+        }
+      }
+
       // Continue with other requests
-      route.continue();
+      await route.continue();
     });
 
     // Navigate to the target URL
-    logger.info(`[Crawler][${jobId}] Navigating to "${url}"`);
+    const navigationValidation = await validateUrl(
+      url,
+      isRunningInProxyContext,
+    );
+    if (!navigationValidation.ok) {
+      throw new Error(
+        `Disallowed navigation target "${url}": ${navigationValidation.reason}`,
+      );
+    }
+    const targetUrl = navigationValidation.url.toString();
+    logger.info(`[Crawler][${jobId}] Navigating to "${targetUrl}"`);
     const response = await Promise.race([
-      page.goto(url, {
+      page.goto(targetUrl, {
         timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
         waitUntil: "domcontentloaded",
       }),
@@ -483,7 +511,7 @@ async function crawlPage(
     ]);
 
     logger.info(
-      `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
+      `[Crawler][${jobId}] Successfully navigated to "${targetUrl}". Waiting for the page to load ...`,
     );
 
     // Wait until network is relatively idle or timeout after 5 seconds
@@ -1231,7 +1259,6 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
   logger.info(
     `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
   );
-  validateUrl(url);
 
   const contentType = await getContentType(url, jobId, job.abortSignal);
   job.abortSignal.throwIfAborted();

apps/workers/workers/feedWorker.ts

diff --git a/apps/workers/workers/feedWorker.ts b/apps/workers/workers/feedWorker.ts
index 38b06c47..f86e7424 100644
--- a/apps/workers/workers/feedWorker.ts
+++ b/apps/workers/workers/feedWorker.ts
@@ -1,9 +1,9 @@
 import { and, eq, inArray } from "drizzle-orm";
 import { workerStatsCounter } from "metrics";
+import { fetchWithProxy } from "network";
 import cron from "node-cron";
 import Parser from "rss-parser";
 import { buildImpersonatingTRPCClient } from "trpc";
-import { fetchWithProxy } from "utils";
 import { z } from "zod";
 
 import type { ZFeedRequestSchema } from "@karakeep/shared-server";

apps/workers/workers/videoWorker.ts

diff --git a/apps/workers/workers/videoWorker.ts b/apps/workers/workers/videoWorker.ts
index a41eb069..8d3ac666 100644
--- a/apps/workers/workers/videoWorker.ts
+++ b/apps/workers/workers/videoWorker.ts
@@ -3,6 +3,7 @@ import * as os from "os";
 import path from "path";
 import { execa } from "execa";
 import { workerStatsCounter } from "metrics";
+import { getProxyAgent, validateUrl } from "network";
 
 import { db } from "@karakeep/db";
 import { AssetTypes } from "@karakeep/db/schema";
@@ -62,7 +63,11 @@ export class VideoWorker {
   }
 }
 
-function prepareYtDlpArguments(url: string, assetPath: string) {
+function prepareYtDlpArguments(
+  url: string,
+  proxy: string | undefined,
+  assetPath: string,
+) {
   const ytDlpArguments = [url];
   if (serverConfig.crawler.maxVideoDownloadSize > 0) {
     ytDlpArguments.push(
@@ -74,6 +79,9 @@ function prepareYtDlpArguments(url: string, assetPath: string) {
   ytDlpArguments.push(...serverConfig.crawler.ytDlpArguments);
   ytDlpArguments.push("-o", assetPath);
   ytDlpArguments.push("--no-playlist");
+  if (proxy) {
+    ytDlpArguments.push("--proxy", proxy);
+  }
   return ytDlpArguments;
 }
 
@@ -94,15 +102,29 @@ async function runWorker(job: DequeuedJob<ZVideoRequest>) {
     return;
   }
 
+  const proxy = getProxyAgent(url);
+  const validation = await validateUrl(url, !!proxy);
+  if (!validation.ok) {
+    logger.warn(
+      `[VideoCrawler][${jobId}] Skipping video download to disallowed URL "${url}": ${validation.reason}`,
+    );
+    return;
+  }
+  const normalizedUrl = validation.url.toString();
+
   const videoAssetId = newAssetId();
   let assetPath = `${TMP_FOLDER}/${videoAssetId}`;
   await fs.promises.mkdir(TMP_FOLDER, { recursive: true });
 
-  const ytDlpArguments = prepareYtDlpArguments(url, assetPath);
+  const ytDlpArguments = prepareYtDlpArguments(
+    normalizedUrl,
+    proxy?.proxy.toString(),
+    assetPath,
+  );
 
   try {
     logger.info(
-      `[VideoCrawler][${jobId}] Attempting to download a file from "${url}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`,
+      `[VideoCrawler][${jobId}] Attempting to download a file from "${normalizedUrl}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`,
     );
 
     await execa("yt-dlp", ytDlpArguments, {
@@ -123,11 +145,11 @@ async function runWorker(job: DequeuedJob<ZVideoRequest>) {
       err.message.includes("No media found")
     ) {
       logger.info(
-        `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it's not one of the supported yt-dlp URLs`,
+        `[VideoCrawler][${jobId}] Skipping video download from "${normalizedUrl}", because it's not one of the supported yt-dlp URLs`,
       );
       return;
     }
-    const genericError = `[VideoCrawler][${jobId}] Failed to download a file from "${url}" to "${assetPath}"`;
+    const genericError = `[VideoCrawler][${jobId}] Failed to download a file from "${normalizedUrl}" to "${assetPath}"`;
     if ("stderr" in err) {
       logger.error(`${genericError}: ${err.stderr}`);
     } else {
@@ -138,7 +160,7 @@ async function runWorker(job: DequeuedJob<ZVideoRequest>) {
   }
 
   logger.info(
-    `[VideoCrawler][${jobId}] Finished downloading a file from "${url}" to "${assetPath}"`,
+    `[VideoCrawler][${jobId}] Finished downloading a file from "${normalizedUrl}" to "${assetPath}"`,
   );
 
   // Get file size and check quota before saving
@@ -177,7 +199,7 @@ async function runWorker(job: DequeuedJob<ZVideoRequest>) {
     await silentDeleteAsset(userId, oldVideoAssetId);
 
     logger.info(
-      `[VideoCrawler][${jobId}] Finished downloading video from "${url}" and adding it to the database`,
+      `[VideoCrawler][${jobId}] Finished downloading video from "${normalizedUrl}" and adding it to the database`,
     );
   } catch (error) {
     if (error instanceof StorageQuotaError) {

apps/workers/workers/webhookWorker.ts

diff --git a/apps/workers/workers/webhookWorker.ts b/apps/workers/workers/webhookWorker.ts
index 2bbef160..472a27ed 100644
--- a/apps/workers/workers/webhookWorker.ts
+++ b/apps/workers/workers/webhookWorker.ts
@@ -1,6 +1,6 @@
 import { eq } from "drizzle-orm";
 import { workerStatsCounter } from "metrics";
-import fetch from "node-fetch";
+import { fetchWithProxy } from "network";
 
 import { db } from "@karakeep/db";
 import { bookmarks, webhooksTable } from "@karakeep/db/schema";
@@ -102,7 +102,7 @@ async function runWebhook(job: DequeuedJob<ZWebhookRequest>) {
 
         while (attempt < maxRetries && !success) {
           try {
-            const response = await fetch(url, {
+            const response = await fetchWithProxy(url, {
               method: "POST",
               headers: {
                 "Content-Type": "application/json",

docs/docs/03-configuration.md

diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 26760d6c..50280a55 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -222,11 +222,12 @@ Karakeep can send emails for various purposes such as email verification during
 
 If your Karakeep instance needs to connect through a proxy server, you can configure the following settings:
 
-| Name                | Required | Default | Description                                                                                                                                                                          |
-| ------------------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| CRAWLER_HTTP_PROXY  | No       | Not set | HTTP proxy server URL for outgoing HTTP requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random.   |
-| CRAWLER_HTTPS_PROXY | No       | Not set | HTTPS proxy server URL for outgoing HTTPS requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. |
-| CRAWLER_NO_PROXY    | No       | Not set | Comma-separated list of hostnames/IPs that should bypass the proxy (e.g., `localhost,127.0.0.1,.local`)                                                                              |
+| Name                               | Required | Default | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| ---------------------------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_HTTP_PROXY                 | No       | Not set | HTTP proxy server URL for outgoing HTTP requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. The proxy is used for crawling, RSS feed fetches and webhooks.                                                                                                                                                                                                                                       |
+| CRAWLER_HTTPS_PROXY                | No       | Not set | HTTPS proxy server URL for outgoing HTTPS requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. The proxy is used for crawling, RSS feed fetches and webhooks.                                                                                                                                                                                                                                     |
+| CRAWLER_NO_PROXY                   | No       | Not set | Comma-separated list of hostnames/IPs that should bypass the proxy (e.g., `localhost,127.0.0.1,.local`)                                                                                                                                                                                                                                                                                                                                                                                 |
+| CRAWLER_ALLOWED_INTERNAL_HOSTNAMES | No       | Not set | By default, Karakeep blocks worker-initiated requests whose DNS resolves to private, loopback, or link-local IP addresses. Use this to allowlist specific hostnames for internal access (e.g., `internal.company.com,.local`). Supports domain wildcards by prefixing with a dot (e.g., `.internal.company.com`). Note: Internal IP validation is bypassed when a proxy is configured for the URL as the local DNS resolver won't necessarily be the same as the one used by the proxy. |
 
 :::info
 These proxy settings will be used by the crawler and other components that make outgoing HTTP requests.

packages/shared/config.ts

diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index d54b7589..51b591ad 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -104,6 +104,7 @@ const allEnv = z.object({
     .default("")
     .transform((t) => t.split("%%").filter((a) => a)),
   CRAWLER_SCREENSHOT_TIMEOUT_SEC: z.coerce.number().default(5),
+  CRAWLER_IP_VALIDATION_DNS_RESOLVER_TIMEOUT_SEC: z.coerce.number().default(1),
   LOG_LEVEL: z.string().default("debug"),
   NO_COLOR: stringBool("false"),
   DEMO_MODE: stringBool("false"),
@@ -178,7 +179,24 @@ const allEnv = z.object({
         .filter((p) => p),
     )
     .optional(),
-  CRAWLER_NO_PROXY: z.string().optional(),
+  CRAWLER_NO_PROXY: z
+    .string()
+    .transform((val) =>
+      val
+        .split(",")
+        .map((p) => p.trim())
+        .filter((p) => p),
+    )
+    .optional(),
+  CRAWLER_ALLOWED_INTERNAL_HOSTNAMES: z
+    .string()
+    .transform((val) =>
+      val
+        .split(",")
+        .map((p) => p.trim())
+        .filter((p) => p),
+    )
+    .optional(),
 
   // Database configuration
   DB_WAL_MODE: stringBool("false"),
@@ -276,6 +294,10 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
       ytDlpArguments: val.CRAWLER_YTDLP_ARGS,
       screenshotTimeoutSec: val.CRAWLER_SCREENSHOT_TIMEOUT_SEC,
       htmlContentSizeThreshold: val.HTML_CONTENT_SIZE_INLINE_THRESHOLD_BYTES,
+      ipValidation: {
+        dnsResolverTimeoutSec:
+          val.CRAWLER_IP_VALIDATION_DNS_RESOLVER_TIMEOUT_SEC,
+      },
     },
     ocr: {
       langs: val.OCR_LANGS,
@@ -309,6 +331,7 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
       httpsProxy: val.CRAWLER_HTTPS_PROXY,
       noProxy: val.CRAWLER_NO_PROXY,
     },
+    allowedInternalHostnames: val.CRAWLER_ALLOWED_INTERNAL_HOSTNAMES,
     assetPreprocessing: {
       numWorkers: val.ASSET_PREPROCESSING_NUM_WORKERS,
     },

pnpm-lock.yaml

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 8d068c65..a3ca5ec8 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -826,12 +826,18 @@ importers:
       https-proxy-agent:
         specifier: ^7.0.6
         version: 7.0.6(supports-color@10.0.0)
+      ipaddr.js:
+        specifier: ^2.2.0
+        version: 2.2.0
       jsdom:
         specifier: ^24.0.0
         version: 24.1.3
       liteque:
         specifier: ^0.6.2
         version: 0.6.2(@opentelemetry/api@1.9.0)(@types/better-sqlite3@7.6.13)(@types/react@19.2.2)(better-sqlite3@11.3.0)(kysely@0.28.5)(react@19.1.0)
+      lru-cache:
+        specifier: ^11.2.2
+        version: 11.2.2
       metascraper:
         specifier: ^5.49.5
         version: 5.49.5(postcss@8.5.6)
@@ -1797,8 +1803,8 @@ packages:
     resolution: {integrity: sha512-lJjzvrbEeWrhB4P3QBsH7tey117PjLZnDbLiQEKjQ/fNJTjuq4HSqgFA+UNSwZT8D7dxxbnuSBMsa1lrWzKlQg==}
     engines: {node: '>=6.9.0'}
 
-  '@babel/generator@7.28.3':
-    resolution: {integrity: sha512-3lSpxGgvnmZznmBkCRnVREPUFJv2wrv9iAoFDvADJc0ypmdOxdUtcLeBgBJ6zE0PMeTKnxeQzyk0xTBq4Ep7zw==}
+  '@babel/generator@7.28.5':
+    resolution: {integrity: sha512-3EwLFhZ38J4VyIP6WNtt2kUdW9dokXA9Cr4IVIFHuCpZ3H8/YFOl5JjZHisrn1fATPBmKKqXzDFvh9fUwHz6CQ==}
     engines: {node: '>=6.9.0'}
 
   '@babel/helper-annotate-as-pure@7.27.3':
@@ -1876,6 +1882,10 @@ packages:
     resolution: {integrity: sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==}
     engines: {node: '>=6.9.0'}
 
+  '@babel/helper-validator-identifier@7.28.5':
+    resolution: {integrity: sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==}
+    engines: {node: '>=6.9.0'}
+
   '@babel/helper-validator-option@7.27.1':
     resolution: {integrity: sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==}
     engines: {node: '>=6.9.0'}
@@ -1902,8 +1912,8 @@ packages:
     engines: {node: '>=6.0.0'}
     hasBin: true
 
-  '@babel/parser@7.28.4':
-    resolution: {integrity: sha512-yZbBqeM6TkpP9du/I2pUZnJsRMGGvOuIrhjzC1AwHwW+6he4mni6Bp/m8ijn0iOuZuPI2BfkCoSRunpyjnrQKg==}
+  '@babel/parser@7.28.5':
+    resolution: {integrity: sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ==}
     engines: {node: '>=6.0.0'}
     hasBin: true
 
@@ -2488,8 +2498,8 @@ packages:
     resolution: {integrity: sha512-mGe7UK5wWyh0bKRfupsUchrQGqvDbZDbKJw+kcRGSmdHVYrv+ltd0pnpDTVpiTqnaBru9iEvA8pz8W46v0Amwg==}
     engines: {node: '>=6.9.0'}
 
-  '@babel/traverse@7.28.4':
-    resolution: {integrity: sha512-YEzuboP2qvQavAcjgQNVgsvHIDv6ZpwXvcvjmyySP2DIMuByS/6ioU5G9pYrWHM6T2YDfc7xga9iNzYOs12CFQ==}
+  '@babel/traverse@7.28.5':
+    resolution: {integrity: sha512-TCCj4t55U90khlYkVV/0TfkJkAkUg3jZFA3Neb7unZT8CPok7iiRfaX0F+WnqWqt7OxhOn0uBKXCw4lbL8W0aQ==}
     engines: {node: '>=6.9.0'}
 
   '@babel/types@7.27.6':
@@ -2500,8 +2510,8 @@ packages:
     resolution: {integrity: sha512-x0LvFTekgSX+83TI28Y9wYPUfzrnl2aT5+5QLnO6v7mSJYtEEevuDRN0F0uSHRk1G1IWZC43o00Y0xDDrpBGPQ==}
     engines: {node: '>=6.9.0'}
 
-  '@babel/types@7.28.4':
-    resolution: {integrity: sha512-bkFqkLhh3pMBUQQkpVgWDWq/lqzc2678eUyDlTBhRqhCHFguYYGM0Efga7tYk4TogG/3x0EEl66/OQ+WGbWB/Q==}
+  '@babel/types@7.28.5':
+    resolution: {integrity: sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==}
     engines: {node: '>=6.9.0'}
 
   '@colors/colors@1.5.0':
@@ -10064,10 +10074,6 @@ packages:
   lru-cache@10.4.3:
     resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==}
 
-  lru-cache@11.1.0:
-    resolution: {integrity: sha512-QIXZUBJUx+2zHUdQujWejBkcD9+cs94tLn0+YL8UrCh+D5sCXZ4c7LaEH48pNwRY3MLDgqUFyhlCyjJPf1WP0A==}
-    engines: {node: 20 || >=22}
-
   lru-cache@11.2.2:
     resolution: {integrity: sha512-F9ODfyqML2coTIsQpSkRHnLSZMtkU8Q+mSfcaIyKwy58u+8k5nvAYeiNhsyMARvzNcXJ9QfWVrcPsC9e9rAxtg==}
     engines: {node: 20 || >=22}
@@ -12998,6 +13004,27 @@ packages:
       webpack:
         optional: true
 
+  sass-loader@16.0.6:
+    resolution: {integrity: sha512-sglGzId5gmlfxNs4gK2U3h7HlVRfx278YK6Ono5lwzuvi1jxig80YiuHkaDBVsYIKFhx8wN7XSCI0M2IDS/3qA==}
+    engines: {node: '>= 18.12.0'}
+    peerDependencies:
+      '@rspack/core': 0.x || 1.x
+      node-sass: ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0 || ^9.0.0
+      sass: ^1.3.0
+      sass-embedded: '*'
+      webpack: ^5.0.0
+    peerDependenciesMeta:
+      '@rspack/core':
+        optional: true
+      node-sass:
+        optional: true
+      sass:
+        optional: true
+      sass-embedded:
+        optional: true
+      webpack:
+        optional: true
+
   sass@1.89.1:
     resolution: {integrity: sha512-eMLLkl+qz7tx/0cJ9wI+w09GQ2zodTkcE/aVfywwdlRcI3EO19xGnbmJwg/JMIm+5MxVJ6outddLZ4Von4E++Q==}
     engines: {node: '>=14.0.0'}
@@ -15578,10 +15605,10 @@ snapshots:
       '@jridgewell/trace-mapping': 0.3.29
       jsesc: 3.1.0
 
-  '@babel/generator@7.28.3':
+  '@babel/generator@7.28.5':
     dependencies:
-      '@babel/parser': 7.28.4
-      '@babel/types': 7.28.4
+      '@babel/parser': 7.28.5
+      '@babel/types': 7.28.5
       '@jridgewell/gen-mapping': 0.3.13
       '@jridgewell/trace-mapping': 0.3.31
       jsesc: 3.1.0
@@ -15727,13 +15754,15 @@ snapshots:
 
   '@babel/helper-validator-identifier@7.27.1': {}
 
+  '@babel/helper-validator-identifier@7.28.5': {}
+
   '@babel/helper-validator-option@7.27.1': {}
 
   '@babel/helper-wrap-function@7.27.1':
     dependencies:
       '@babel/template': 7.27.2
       '@babel/traverse': 7.28.0
-      '@babel/types': 7.28.4
+      '@babel/types': 7.28.5
     transitivePeerDependencies:
       - supports-color
 
@@ -15757,9 +15786,9 @@ snapshots:
     dependencies:
       '@babel/types': 7.28.1
 
-  '@babel/parser@7.28.4':
+  '@babel/parser@7.28.5':
     dependencies:
-      '@babel/types': 7.28.4
+      '@babel/types': 7.28.5
 
   '@babel/plugin-bugfix-firefox-class-in-computed-class-key@7.27.1(@babel/core@7.26.0)':
     dependencies:
@@ -16644,14 +16673,14 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
-  '@babel/traverse@7.28.4':
+  '@babel/traverse@7.28.5':
     dependencies:
       '@babel/code-frame': 7.27.1
-      '@babel/generator': 7.28.3
+      '@babel/generator': 7.28.5
       '@babel/helper-globals': 7.28.0
-      '@babel/parser': 7.28.4
+      '@babel/parser': 7.28.5
       '@babel/template': 7.27.2
-      '@babel/types': 7.28.4
+      '@babel/types': 7.28.5
       debug: 4.4.3
     transitivePeerDependencies:
       - supports-color
@@ -16666,10 +16695,10 @@ snapshots:
       '@babel/helper-string-parser': 7.27.1
       '@babel/helper-validator-identifier': 7.27.1
 
-  '@babel/types@7.28.4':
+  '@babel/types@7.28.5':
     dependencies:
       '@babel/helper-string-parser': 7.27.1
-      '@babel/helper-validator-identifier': 7.27.1
+      '@babel/helper-validator-identifier': 7.28.5
 
   '@colors/colors@1.5.0':
     optional: true
@@ -17566,7 +17595,7 @@ snapshots:
 
   '@docusaurus/react-loadable@6.0.0(react@19.1.0)':
     dependencies:
-      '@types/react': 19.2.2
+      '@types/react': 19.1.8
       react: 19.1.0
 
   '@docusaurus/theme-classic@3.8.1(@types/react@19.2.2)(acorn@8.15.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(typescript@5.8.3)':
@@ -21758,7 +21787,7 @@ snapshots:
 
   babel-plugin-macros@3.1.0:
     dependencies:
-      '@babel/runtime': 7.28.4
+      '@babel/runtime': 7.27.6
       cosmiconfig: 7.1.0
       resolve: 1.22.10
 
@@ -23045,7 +23074,7 @@ snapshots:
     dependencies:
       '@docusaurus/core': 3.8.1(@mdx-js/react@3.1.0(@types/react@19.2.2)(react@19.1.0))(acorn@8.15.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(typescript@5.8.3)
       sass: 1.89.1
-      sass-loader: 16.0.5(sass@1.89.1)(webpack@5.99.9)
+      sass-loader: 16.0.6(sass@1.89.1)(webpack@5.99.9)
     transitivePeerDependencies:
       - '@rspack/core'
       - node-sass
@@ -25774,8 +25803,6 @@ snapshots:
 
   lru-cache@10.4.3: {}
 
-  lru-cache@11.1.0: {}
-
   lru-cache@11.2.2: {}
 
   lru-cache@5.1.1:
@@ -26418,7 +26445,7 @@ snapshots:
   metro-source-map@0.82.5:
     dependencies:
       '@babel/traverse': 7.28.0
-      '@babel/traverse--for-generate-function-map': '@babel/traverse@7.28.4'
+      '@babel/traverse--for-generate-function-map': '@babel/traverse@7.28.5'
       '@babel/types': 7.28.1
       flow-enums-runtime: 0.0.6
       invariant: 2.2.4
@@ -27798,7 +27825,7 @@ snapshots:
 
   path-scurry@2.0.0:
     dependencies:
-      lru-cache: 11.1.0
+      lru-cache: 11.2.2
       minipass: 7.1.2
 
   path-to-regexp@0.1.12: {}
@@ -29756,6 +29783,13 @@ snapshots:
       sass: 1.89.1
       webpack: 5.99.9
 
+  sass-loader@16.0.6(sass@1.89.1)(webpack@5.99.9):
+    dependencies:
+      neo-async: 2.6.2
+    optionalDependencies:
+      sass: 1.89.1
+      webpack: 5.99.9
+
   sass@1.89.1:
     dependencies:
       chokidar: 4.0.3