author    MohamedBassem <me@mbassem.com>  2024-04-26 11:06:54 +0100
committer MohamedBassem <me@mbassem.com>  2024-04-26 11:06:54 +0100
commit    a91aff23eaa3616aec331ef0267863ed0b0e594b
tree      09a449ec43b632762af1ef7125cf707b8fe45d81
parent    9dace185acff4002aec8265fc010db49d91c7d7f
feature(crawler): Allow increasing crawler concurrency and configure storing images and screenshots
 apps/workers/crawlerWorker.ts | 13 +++++++++++++
 docs/docs/03-configuration.md | 11 +++++++----
 packages/shared/config.ts     |  6 ++++++
 3 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index b583864d..6b4d39f0 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -123,6 +123,7 @@ export class CrawlerWorker {
/* timeoutSec */ serverConfig.crawler.jobTimeoutSec,
),
{
+ concurrency: serverConfig.crawler.numWorkers,
connection: queueConnectionDetails,
autorun: false,
},
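The single added line above is what unlocks parallel crawling. The surrounding options object (`connection`, `autorun`, and now `concurrency`) matches BullMQ's `Worker` options, so, assuming a plain BullMQ worker rather than karakeep's own queue wrapper, the same knob looks like this (a minimal sketch, not the repository's actual code):

```typescript
// Minimal sketch, assuming plain BullMQ (karakeep's wrapper is not shown in
// this hunk). `concurrency` lets one worker process N jobs in parallel.
import { Worker } from "bullmq";

const concurrency = Number(process.env.CRAWLER_NUM_WORKERS ?? "1");

const worker = new Worker(
  "crawler",
  async (job) => {
    // crawl job.data.url here
  },
  {
    connection: { host: "localhost", port: 6379 },
    concurrency, // up to CRAWLER_NUM_WORKERS jobs at once
    autorun: false, // as in the hunk above: the worker is started explicitly
  },
);

void worker.run();
```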
@@ -282,6 +283,12 @@ async function storeScreenshot(
userId: string,
jobId: string,
) {
+ if (!serverConfig.crawler.storeScreenshot) {
+ logger.info(
+ `[Crawler][${jobId}] Skipping storing the screenshot as per the config.`,
+ );
+ return null;
+ }
const assetId = newAssetId();
await saveAsset({
userId,
@@ -300,6 +307,12 @@ async function downloadAndStoreImage(
userId: string,
jobId: string,
) {
+ if (!serverConfig.crawler.downloadBannerImage) {
+ logger.info(
+ `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
+ );
+ return null;
+ }
try {
logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`);
const response = await fetch(url);
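With both guards in place, `storeScreenshot` and `downloadAndStoreImage` can now return `null`, so any caller has to treat the resulting asset ids as optional. A hypothetical call site (variable names are illustrative, not taken from this diff):

```typescript
// Hypothetical caller: both helpers may return null when disabled via config.
// `screenshot`, `imageUrl`, `userId`, and `jobId` are assumed to be in scope.
const screenshotAssetId = await storeScreenshot(screenshot, userId, jobId); // string | null
const imageAssetId = imageUrl
  ? await downloadAndStoreImage(imageUrl, userId, jobId) // string | null
  : null;
// Downstream code should only attach assets whose ids are non-null.
```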
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 28ead2f1..3d44f359 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -37,7 +37,10 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
## Crawler Configs
-| Name | Required | Default | Description |
-| ---------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
-| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+| Name | Required | Default | Description |
+| ----------------------------- | -------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
+| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
+| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low-powered device, you might want to bump this up a bit. |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection. |
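Taken together, the three new rows are independent knobs. A hypothetical `.env` fragment for a deployment that crawls two links at a time and skips caching banner images (values are illustrative):

```
CRAWLER_NUM_WORKERS=2
CRAWLER_DOWNLOAD_BANNER_IMAGE=false
CRAWLER_STORE_SCREENSHOT=true
```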
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 41173433..a31c48b4 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -22,6 +22,9 @@ const allEnv = z.object({
BROWSER_WEB_URL: z.string().url().optional(),
CRAWLER_JOB_TIMEOUT_SEC: z.number().default(60),
CRAWLER_NAVIGATE_TIMEOUT_SEC: z.number().default(30),
+ CRAWLER_NUM_WORKERS: z.coerce.number().default(1),
+ CRAWLER_DOWNLOAD_BANNER_IMAGE: stringBool("true"),
+ CRAWLER_STORE_SCREENSHOT: stringBool("true"),
MEILI_ADDR: z.string().optional(),
MEILI_MASTER_KEY: z.string().default(""),
LOG_LEVEL: z.string().default("debug"),
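Note that `CRAWLER_NUM_WORKERS` goes through `z.coerce.number()`, which converts the raw environment string to a number before validation. The `stringBool` helper used by the two new boolean flags is defined elsewhere in `config.ts` and not shown in this hunk; a plausible zod-based sketch with the same call shape (an assumption, not the repository's actual implementation) would be:

```typescript
import { z } from "zod";

// Assumed shape of `stringBool`: accept only the literal strings
// "true"/"false", apply the given default, then yield a real boolean.
const stringBool = (defaultValue: "true" | "false") =>
  z
    .enum(["true", "false"])
    .default(defaultValue)
    .transform((s) => s === "true");
```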
@@ -56,10 +59,13 @@ const serverConfigSchema = allEnv.transform((val) => {
redisDBIdx: val.REDIS_DB_IDX,
},
crawler: {
+ numWorkers: val.CRAWLER_NUM_WORKERS,
headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
browserWebUrl: val.BROWSER_WEB_URL,
jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
+ downloadBannerImage: val.CRAWLER_DOWNLOAD_BANNER_IMAGE,
+ storeScreenshot: val.CRAWLER_STORE_SCREENSHOT,
},
meilisearch: val.MEILI_ADDR
? {