| author | MohamedBassem <me@mbassem.com> | 2024-04-26 11:06:54 +0100 |
|---|---|---|
| committer | MohamedBassem <me@mbassem.com> | 2024-04-26 11:06:54 +0100 |
| commit | a91aff23eaa3616aec331ef0267863ed0b0e594b | |
| tree | 09a449ec43b632762af1ef7125cf707b8fe45d81 | |
| parent | 9dace185acff4002aec8265fc010db49d91c7d7f | |
| download | karakeep-a91aff23eaa3616aec331ef0267863ed0b0e594b.tar.zst | |
feature(crawler): Allow increasing crawler concurrency and configuring whether to store images and screenshots
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 13 |
| -rw-r--r-- | docs/docs/03-configuration.md | 11 |
| -rw-r--r-- | packages/shared/config.ts | 6 |
3 files changed, 26 insertions, 4 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index b583864d..6b4d39f0 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -123,6 +123,7 @@ export class CrawlerWorker {
         /* timeoutSec */ serverConfig.crawler.jobTimeoutSec,
       ),
       {
+        concurrency: serverConfig.crawler.numWorkers,
         connection: queueConnectionDetails,
         autorun: false,
       },
@@ -282,6 +283,12 @@ async function storeScreenshot(
   userId: string,
   jobId: string,
 ) {
+  if (!serverConfig.crawler.storeScreenshot) {
+    logger.info(
+      `[Crawler][${jobId}] Skipping storing the screenshot as per the config.`,
+    );
+    return null;
+  }
   const assetId = newAssetId();
   await saveAsset({
     userId,
@@ -300,6 +307,12 @@ async function downloadAndStoreImage(
   userId: string,
   jobId: string,
 ) {
+  if (!serverConfig.crawler.downloadBannerImage) {
+    logger.info(
+      `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
+    );
+    return null;
+  }
   try {
     logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`);
     const response = await fetch(url);
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 28ead2f1..3d44f359 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -37,7 +37,10 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
 
 ## Crawler Configs
 
-| Name                         | Required | Default | Description                                                                                                                                                            |
-| ---------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| CRAWLER_JOB_TIMEOUT_SEC      | No       | 60      | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit  |
-| CRAWLER_NAVIGATE_TIMEOUT_SEC | No       | 30      | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection                                               |
+| Name                          | Required | Default | Description                                                                                                                                                                                                                 |
+| ----------------------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_NUM_WORKERS           | No       | 1       | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources.                                                                              |
+| CRAWLER_DOWNLOAD_BANNER_IMAGE | No       | true    | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites.  |
+| CRAWLER_STORE_SCREENSHOT      | No       | true    | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link.                    |
+| CRAWLER_JOB_TIMEOUT_SEC       | No       | 60      | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit                                                      |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC  | No       | 30      | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection                                                                                                   |
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 41173433..a31c48b4 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -22,6 +22,9 @@ const allEnv = z.object({
   BROWSER_WEB_URL: z.string().url().optional(),
   CRAWLER_JOB_TIMEOUT_SEC: z.number().default(60),
   CRAWLER_NAVIGATE_TIMEOUT_SEC: z.number().default(30),
+  CRAWLER_NUM_WORKERS: z.coerce.number().default(1),
+  CRAWLER_DOWNLOAD_BANNER_IMAGE: stringBool("true"),
+  CRAWLER_STORE_SCREENSHOT: stringBool("true"),
   MEILI_ADDR: z.string().optional(),
   MEILI_MASTER_KEY: z.string().default(""),
   LOG_LEVEL: z.string().default("debug"),
@@ -56,10 +59,13 @@ const serverConfigSchema = allEnv.transform((val) => {
       redisDBIdx: val.REDIS_DB_IDX,
     },
     crawler: {
+      numWorkers: val.CRAWLER_NUM_WORKERS,
       headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
       browserWebUrl: val.BROWSER_WEB_URL,
       jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
       navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
+      downloadBannerImage: val.CRAWLER_DOWNLOAD_BANNER_IMAGE,
+      storeScreenshot: val.CRAWLER_STORE_SCREENSHOT,
     },
     meilisearch: val.MEILI_ADDR
       ? {
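The `concurrency` option added in the first hunk sits next to `connection` and `autorun`, which match BullMQ's `Worker` options, so the worker can now pull several crawl jobs off the queue in parallel. Assuming BullMQ is indeed the underlying queue library, the sketch below shows how that option behaves; the queue name, processor body, and connection details are placeholders, not the repo's actual code:

```ts
import { Worker } from "bullmq";

// Minimal sketch: queue name, processor, and Redis details are placeholders.
const worker = new Worker(
  "crawler_queue",
  async (job) => {
    // ... crawl the page referenced by job.data ...
  },
  {
    connection: { host: "localhost", port: 6379 },
    concurrency: 4, // up to 4 crawl jobs in flight at once (CRAWLER_NUM_WORKERS)
    autorun: false, // don't start consuming jobs until run() is called
  },
);

worker.run();
```

With the default of 1, jobs are processed strictly one at a time, which matches the rationale in the new docs entry about avoiding resource spikes on low-powered devices.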
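On the schema side, environment variables always arrive as strings, which is why `z.coerce.number()` is used to turn `CRAWLER_NUM_WORKERS="4"` into the number 4. The `stringBool` helper used for the two boolean flags is defined elsewhere in `packages/shared/config.ts` and isn't shown in this diff; the version below is a hypothetical reconstruction, included only to make the sketch self-contained:

```ts
import { z } from "zod";

// Hypothetical reconstruction of stringBool (the real helper lives elsewhere
// in packages/shared/config.ts and may differ): accepts "true"/"false" from
// the environment and yields a boolean, with a string-typed default.
const stringBool = (defaultValue: string) =>
  z
    .string()
    .default(defaultValue)
    .refine((s) => s === "true" || s === "false", "expected 'true' or 'false'")
    .transform((s) => s === "true");

const crawlerEnv = z.object({
  CRAWLER_NUM_WORKERS: z.coerce.number().default(1),
  CRAWLER_DOWNLOAD_BANNER_IMAGE: stringBool("true"),
  CRAWLER_STORE_SCREENSHOT: stringBool("true"),
});

// Unset variables fall back to their defaults; set ones are coerced.
const parsed = crawlerEnv.parse({
  CRAWLER_NUM_WORKERS: "4",
  CRAWLER_STORE_SCREENSHOT: "false",
});
// => { CRAWLER_NUM_WORKERS: 4, CRAWLER_DOWNLOAD_BANNER_IMAGE: true,
//      CRAWLER_STORE_SCREENSHOT: false }
```

In deployment terms, that corresponds to putting something like `CRAWLER_NUM_WORKERS=4` or `CRAWLER_STORE_SCREENSHOT=false` in the environment described by the configuration docs.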
