 apps/workers/workers/crawlerWorker.ts               | 84
 docs/docs/03-configuration.md                       | 48
 packages/plugins/ratelimit-memory/src/index.test.ts |  4
 packages/shared/config.ts                           | 10
 packages/shared/ratelimiting.ts                     |  7
 5 files changed, 121 insertions(+), 32 deletions(-)
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 5b49b23e..07a74757 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -77,6 +77,7 @@ import {
   EnqueueOptions,
   getQueueClient,
 } from "@karakeep/shared/queueing";
+import { getRateLimitClient } from "@karakeep/shared/ratelimiting";
 import { tryCatch } from "@karakeep/shared/tryCatch";
 import { BookmarkTypes } from "@karakeep/shared/types/bookmarks";
 
@@ -170,6 +171,10 @@ const cookieSchema = z.object({
 
 const cookiesSchema = z.array(cookieSchema);
 
+interface CrawlerRunResult {
+  status: "completed" | "rescheduled";
+}
+
 function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
   const { proxy } = serverConfig;
 
@@ -298,11 +303,20 @@ export class CrawlerWorker {
     }
 
     logger.info("Starting crawler worker ...");
-    const worker = (await getQueueClient())!.createRunner<ZCrawlLinkRequest>(
+    const worker = (await getQueueClient())!.createRunner<
+      ZCrawlLinkRequest,
+      CrawlerRunResult
+    >(
       LinkCrawlerQueue,
       {
         run: runCrawler,
-        onComplete: async (job) => {
+        onComplete: async (job, result) => {
+          if (result.status === "rescheduled") {
+            logger.info(
+              `[Crawler][${job.id}] Rescheduled due to domain rate limiting`,
+            );
+            return;
+          }
           workerStatsCounter.labels("crawler", "completed").inc();
           const jobId = job.id;
           logger.info(`[Crawler][${jobId}] Completed successfully`);
@@ -1259,7 +1273,57 @@ async function crawlAndParseUrl(
   };
 }
 
-async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
+/**
+ * Checks if the domain should be rate limited and reschedules the job if needed.
+ * @returns true if the job should continue, false if it was rescheduled
+ */
+async function checkDomainRateLimit(
+  url: string,
+  jobId: string,
+  jobData: ZCrawlLinkRequest,
+  jobPriority?: number,
+): Promise<boolean> {
+  const crawlerDomainRateLimitConfig = serverConfig.crawler.domainRatelimiting;
+  if (!crawlerDomainRateLimitConfig) {
+    return true;
+  }
+
+  const rateLimitClient = await getRateLimitClient();
+  if (!rateLimitClient) {
+    return true;
+  }
+
+  const hostname = new URL(url).hostname;
+  const rateLimitResult = rateLimitClient.checkRateLimit(
+    {
+      name: "domain-ratelimit",
+      maxRequests: crawlerDomainRateLimitConfig.maxRequests,
+      windowMs: crawlerDomainRateLimitConfig.windowMs,
+    },
+    hostname,
+  );
+
+  if (!rateLimitResult.allowed) {
+    const resetInSeconds = rateLimitResult.resetInSeconds;
+    // Add jitter to prevent thundering herd: +40% random variation
+    const jitterFactor = 1.0 + Math.random() * 0.4; // Random value between 1.0 and 1.4
+    const delayMs = Math.floor(resetInSeconds * 1000 * jitterFactor);
+    logger.info(
+      `[Crawler][${jobId}] Domain "${hostname}" is rate limited. Rescheduling in ${(delayMs / 1000).toFixed(2)} seconds (with jitter).`,
+    );
+    await LinkCrawlerQueue.enqueue(jobData, {
+      priority: jobPriority,
+      delayMs,
+    });
+    return false;
+  }
+
+  return true;
+}
+
+async function runCrawler(
+  job: DequeuedJob<ZCrawlLinkRequest>,
+): Promise<CrawlerRunResult> {
   const jobId = `${job.id}:${job.runNumber}`;
 
   const request = zCrawlLinkRequestSchema.safeParse(job.data);
@@ -1267,7 +1331,7 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
     logger.error(
       `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
     );
-    return;
+    return { status: "completed" };
   }
 
   const { bookmarkId, archiveFullPage } = request.data;
@@ -1281,6 +1345,17 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
     precrawledArchiveAssetId,
   } = await getBookmarkDetails(bookmarkId);
 
+  const shouldContinue = await checkDomainRateLimit(
+    url,
+    jobId,
+    job.data,
+    job.priority,
+  );
+
+  if (!shouldContinue) {
+    return { status: "rescheduled" };
+  }
+
   logger.info(
     `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
   );
@@ -1371,4 +1446,5 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
     // Do the archival as a separate last step as it has the potential for failure
     await archivalLogic();
   }
+  return { status: "completed" };
 }
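
Editor's note: the reschedule delay in the hunk above is the limiter's reset time stretched by up to 40% of random jitter. A minimal standalone sketch of that computation (the function name is illustrative, not part of the diff):

```ts
// Sketch of the jitter math from checkDomainRateLimit above.
// computeRescheduleDelayMs is a hypothetical name, used here for illustration.
function computeRescheduleDelayMs(resetInSeconds: number): number {
  const jitterFactor = 1.0 + Math.random() * 0.4; // uniform in [1.0, 1.4)
  return Math.floor(resetInSeconds * 1000 * jitterFactor);
}

// With resetInSeconds = 30, jobs land back on the queue after 30s to 42s,
// so many bookmarks for the same domain do not all retry at the same instant.
```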
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 6e8d8994..beea98f2 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -2,29 +2,31 @@
 
 The app is mainly configured by environment variables. All the used environment variables are listed in [packages/shared/config.ts](https://github.com/karakeep-app/karakeep/blob/main/packages/shared/config.ts). The most important ones are:
 
-| Name | Required | Default | Description |
-| ---- | -------- | ------- | ----------- |
-| PORT | No | 3000 | The port on which the web server will listen. DON'T CHANGE THIS IF YOU'RE USING DOCKER, instead changed the docker bound external port. |
-| WORKERS_PORT | No | 0 (Random Port) | The port on which the worker will export its prometheus metrics on `/metrics`. By default it's a random unused port. If you want to utilize those metrics, fix the port to a value (and export it in docker if you're using docker). |
-| WORKERS_HOST | No | 127.0.0.1 | Host to listen to for requests to WORKERS_PORT. You will need to set this if running in a container, since localhost will not be reachable from outside |
-| WORKERS_ENABLED_WORKERS | No | Not set | Comma separated list of worker names to enable. If set, only these workers will run. Valid values: crawler,inference,search,adminMaintenance,video,feed,assetPreprocessing,webhook,ruleEngine. |
-| WORKERS_DISABLED_WORKERS | No | Not set | Comma separated list of worker names to disable. Takes precedence over `WORKERS_ENABLED_WORKERS`. |
-| DATA_DIR | Yes | Not set | The path for the persistent data directory. This is where the db lives. Assets are stored here by default unless `ASSETS_DIR` is set. |
-| ASSETS_DIR | No | Not set | The path where crawled assets will be stored. If not set, defaults to `${DATA_DIR}/assets`. |
-| NEXTAUTH_URL | Yes | Not set | Should point to the address of your server. The app will function without it, but will redirect you to wrong addresses on signout for example. |
-| NEXTAUTH_SECRET | Yes | Not set | Random string used to sign the JWT tokens. Generate one with `openssl rand -base64 36`. |
-| MEILI_ADDR | No | Not set | The address of meilisearch. If not set, Search will be disabled. E.g. (`http://meilisearch:7700`) |
-| MEILI_MASTER_KEY | Only in Prod and if search is enabled | Not set | The master key configured for meilisearch. Not needed in development environment. Generate one with `openssl rand -base64 36 \| tr -dc 'A-Za-z0-9'` |
-| MAX_ASSET_SIZE_MB | No | 50 | Sets the maximum allowed asset size (in MB) to be uploaded |
-| DISABLE_NEW_RELEASE_CHECK | No | false | If set to true, latest release check will be disabled in the admin panel. |
-| PROMETHEUS_AUTH_TOKEN | No | Random | Enable a prometheus metrics endpoint at `/api/metrics`. This endpoint will require this token being passed in the Authorization header as a Bearer token. If not set, a new random token is generated everytime at startup. This cannot contain any special characters or you may encounter a 400 Bad Request response. |
-| RATE_LIMITING_ENABLED | No | false | If set to true, API rate limiting will be enabled. |
-| DB_WAL_MODE | No | false | Enables WAL mode for the sqlite database. This should improve the performance of the database. There's no reason why you shouldn't set this to true unless you're running the db on a network attached drive. This will become the default at some time in the future. |
-| SEARCH_NUM_WORKERS | No | 1 | Number of concurrent workers for search indexing tasks. Increase this if you have a high volume of content being indexed for search. |
-| SEARCH_JOB_TIMEOUT_SEC | No | 30 | How long to wait for a search indexing job to finish before timing out. Increase this if you have large bookmarks with extensive content that takes longer to index. |
-| WEBHOOK_NUM_WORKERS | No | 1 | Number of concurrent workers for webhook delivery. Increase this if you have multiple webhook endpoints or high webhook traffic. |
-| ASSET_PREPROCESSING_NUM_WORKERS | No | 1 | Number of concurrent workers for asset preprocessing tasks (image processing, OCR, etc.). Increase this if you have many images or documents that need processing. |
-| RULE_ENGINE_NUM_WORKERS | No | 1 | Number of concurrent workers for rule engine processing. Increase this if you have complex automation rules that need to be processed quickly. |
+| Name | Required | Default | Description |
+| ---- | -------- | ------- | ----------- |
+| PORT | No | 3000 | The port on which the web server will listen. DON'T CHANGE THIS IF YOU'RE USING DOCKER, instead changed the docker bound external port. |
+| WORKERS_PORT | No | 0 (Random Port) | The port on which the worker will export its prometheus metrics on `/metrics`. By default it's a random unused port. If you want to utilize those metrics, fix the port to a value (and export it in docker if you're using docker). |
+| WORKERS_HOST | No | 127.0.0.1 | Host to listen to for requests to WORKERS_PORT. You will need to set this if running in a container, since localhost will not be reachable from outside |
+| WORKERS_ENABLED_WORKERS | No | Not set | Comma separated list of worker names to enable. If set, only these workers will run. Valid values: crawler,inference,search,adminMaintenance,video,feed,assetPreprocessing,webhook,ruleEngine. |
+| WORKERS_DISABLED_WORKERS | No | Not set | Comma separated list of worker names to disable. Takes precedence over `WORKERS_ENABLED_WORKERS`. |
+| DATA_DIR | Yes | Not set | The path for the persistent data directory. This is where the db lives. Assets are stored here by default unless `ASSETS_DIR` is set. |
+| ASSETS_DIR | No | Not set | The path where crawled assets will be stored. If not set, defaults to `${DATA_DIR}/assets`. |
+| NEXTAUTH_URL | Yes | Not set | Should point to the address of your server. The app will function without it, but will redirect you to wrong addresses on signout for example. |
+| NEXTAUTH_SECRET | Yes | Not set | Random string used to sign the JWT tokens. Generate one with `openssl rand -base64 36`. |
+| MEILI_ADDR | No | Not set | The address of meilisearch. If not set, Search will be disabled. E.g. (`http://meilisearch:7700`) |
+| MEILI_MASTER_KEY | Only in Prod and if search is enabled | Not set | The master key configured for meilisearch. Not needed in development environment. Generate one with `openssl rand -base64 36 \| tr -dc 'A-Za-z0-9'` |
+| MAX_ASSET_SIZE_MB | No | 50 | Sets the maximum allowed asset size (in MB) to be uploaded |
+| DISABLE_NEW_RELEASE_CHECK | No | false | If set to true, latest release check will be disabled in the admin panel. |
+| PROMETHEUS_AUTH_TOKEN | No | Random | Enable a prometheus metrics endpoint at `/api/metrics`. This endpoint will require this token being passed in the Authorization header as a Bearer token. If not set, a new random token is generated everytime at startup. This cannot contain any special characters or you may encounter a 400 Bad Request response. |
+| RATE_LIMITING_ENABLED | No | false | If set to true, API rate limiting will be enabled. |
+| CRAWLER_DOMAIN_RATE_LIMIT_WINDOW_MS | No | Not set | Time window in milliseconds for per-domain crawler rate limiting. |
+| CRAWLER_DOMAIN_RATE_LIMIT_MAX_REQUESTS | No | Not set | Maximum crawler requests allowed per domain inside the configured window. |
+| DB_WAL_MODE | No | false | Enables WAL mode for the sqlite database. This should improve the performance of the database. There's no reason why you shouldn't set this to true unless you're running the db on a network attached drive. This will become the default at some time in the future. |
+| SEARCH_NUM_WORKERS | No | 1 | Number of concurrent workers for search indexing tasks. Increase this if you have a high volume of content being indexed for search. |
+| SEARCH_JOB_TIMEOUT_SEC | No | 30 | How long to wait for a search indexing job to finish before timing out. Increase this if you have large bookmarks with extensive content that takes longer to index. |
+| WEBHOOK_NUM_WORKERS | No | 1 | Number of concurrent workers for webhook delivery. Increase this if you have multiple webhook endpoints or high webhook traffic. |
+| ASSET_PREPROCESSING_NUM_WORKERS | No | 1 | Number of concurrent workers for asset preprocessing tasks (image processing, OCR, etc.). Increase this if you have many images or documents that need processing. |
+| RULE_ENGINE_NUM_WORKERS | No | 1 | Number of concurrent workers for rule engine processing. Increase this if you have complex automation rules that need to be processed quickly. |
 
 ## Asset Storage
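
Editor's note: "per domain" in the two new variables means per URL hostname, which follows from the `new URL(url).hostname` call in the worker hunk above, so subdomains get separate buckets. A small sketch:

```ts
// Rate-limit buckets are keyed by the URL hostname, so:
const a = new URL("https://example.com/page-1").hostname; // "example.com"
const b = new URL("https://example.com/page-2").hostname; // "example.com", same bucket as `a`
const c = new URL("https://blog.example.com/").hostname; // "blog.example.com", its own bucket
```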
diff --git a/packages/plugins/ratelimit-memory/src/index.test.ts b/packages/plugins/ratelimit-memory/src/index.test.ts
index 5bbed769..74989aab 100644
--- a/packages/plugins/ratelimit-memory/src/index.test.ts
+++ b/packages/plugins/ratelimit-memory/src/index.test.ts
@@ -1,3 +1,4 @@
+import assert from "assert";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 
 import { RateLimiter } from "./index";
@@ -46,7 +47,7 @@ describe("RateLimiter", () => {
     expect(result1.allowed).toBe(true);
     expect(result2.allowed).toBe(true);
     expect(result3.allowed).toBe(false);
-    expect(result3.resetInSeconds).toBeDefined();
+    assert(!result3.allowed);
     expect(result3.resetInSeconds).toBeGreaterThan(0);
   });
 
@@ -139,6 +140,7 @@ describe("RateLimiter", () => {
     const result = rateLimiter.checkRateLimit(config, "user1");
     expect(result.allowed).toBe(false);
     // Should have ~30 seconds remaining
+    assert(!result.allowed);
     expect(result.resetInSeconds).toBeGreaterThan(29);
     expect(result.resetInSeconds).toBeLessThanOrEqual(30);
   });
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index a62f0fb6..d4a951f1 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -106,6 +106,8 @@ const allEnv = z.object({
     .transform((t) => t.split("%%").filter((a) => a)),
   CRAWLER_SCREENSHOT_TIMEOUT_SEC: z.coerce.number().default(5),
   CRAWLER_IP_VALIDATION_DNS_RESOLVER_TIMEOUT_SEC: z.coerce.number().default(1),
+  CRAWLER_DOMAIN_RATE_LIMIT_WINDOW_MS: z.coerce.number().min(1).optional(),
+  CRAWLER_DOMAIN_RATE_LIMIT_MAX_REQUESTS: z.coerce.number().min(1).optional(),
   LOG_LEVEL: z.string().default("debug"),
   NO_COLOR: stringBool("false"),
   DEMO_MODE: stringBool("false"),
@@ -299,6 +301,14 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
       dnsResolverTimeoutSec:
         val.CRAWLER_IP_VALIDATION_DNS_RESOLVER_TIMEOUT_SEC,
     },
+    domainRatelimiting:
+      val.CRAWLER_DOMAIN_RATE_LIMIT_WINDOW_MS !== undefined &&
+      val.CRAWLER_DOMAIN_RATE_LIMIT_MAX_REQUESTS !== undefined
+        ? {
+            windowMs: val.CRAWLER_DOMAIN_RATE_LIMIT_WINDOW_MS,
+            maxRequests: val.CRAWLER_DOMAIN_RATE_LIMIT_MAX_REQUESTS,
+          }
+        : null,
   },
   ocr: {
     langs: val.OCR_LANGS,
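
Editor's note: both variables must be set for the feature to turn on; if either is missing, `domainRatelimiting` is `null` and the worker skips the check entirely. A hedged consumption sketch (the import style and the example values are assumptions, not part of the diff):

```ts
import serverConfig from "@karakeep/shared/config";

// Assuming, for illustration, CRAWLER_DOMAIN_RATE_LIMIT_WINDOW_MS=60000 and
// CRAWLER_DOMAIN_RATE_LIMIT_MAX_REQUESTS=10 in the environment:
const cfg = serverConfig.crawler.domainRatelimiting;
if (cfg) {
  // Prints "at most 10 crawls per domain every 60000 ms" under the values above.
  console.log(`at most ${cfg.maxRequests} crawls per domain every ${cfg.windowMs} ms`);
}
```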
diff --git a/packages/shared/ratelimiting.ts b/packages/shared/ratelimiting.ts
index 3b22310b..ee5988b4 100644
--- a/packages/shared/ratelimiting.ts
+++ b/packages/shared/ratelimiting.ts
@@ -6,10 +6,9 @@ export interface RateLimitConfig {
   maxRequests: number;
 }
 
-export interface RateLimitResult {
-  allowed: boolean;
-  resetInSeconds?: number;
-}
+export type RateLimitResult =
+  | { allowed: true }
+  | { allowed: false; resetInSeconds: number };
 
 export interface RateLimitClient {
   /**
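
Editor's note: the switch from an interface with an optional `resetInSeconds` to a discriminated union is what motivates the `assert(!result.allowed)` lines in the tests above: `expect(...).toBe(false)` does not narrow the type for the compiler, while `assert` does. A minimal sketch of how callers consume the new shape:

```ts
import type { RateLimitResult } from "@karakeep/shared/ratelimiting";

// describeResult is a hypothetical helper for illustration only.
function describeResult(result: RateLimitResult): string {
  if (result.allowed) {
    return "allowed";
  }
  // In this branch TypeScript has narrowed the union, so resetInSeconds
  // is a plain number rather than `number | undefined`.
  return `blocked, retry in ${result.resetInSeconds}s`;
}
```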
