diff options
| author | MohamedBassem <me@mbassem.com> | 2024-04-11 15:03:31 +0300 |
|---|---|---|
| committer | MohamedBassem <me@mbassem.com> | 2024-04-11 15:03:31 +0300 |
| commit | 238c2967b269ca0f66d8e759c6a0234107e1fd1e (patch) | |
| tree | 1e8590a829bd6b24950ac56eb0a21450c8ce3332 | |
| parent | be622e5594ecb21c82bb6066a82c86e0917bcc35 (diff) | |
| download | karakeep-238c2967b269ca0f66d8e759c6a0234107e1fd1e.tar.zst | |
fix: Increase default navigation timeout to 30s, make it configurable and add retries to crawling jobs
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 2 | ||||
| -rw-r--r-- | apps/workers/utils.ts | 1 | ||||
| -rw-r--r-- | docs/docs/03-configuration.md | 7 | ||||
| -rw-r--r-- | packages/shared/config.ts | 2 | ||||
| -rw-r--r-- | packages/shared/queues.ts | 11 |
5 files changed, 17 insertions, 6 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts index c9a1189c..eec8cd98 100644 --- a/apps/workers/crawlerWorker.ts +++ b/apps/workers/crawlerWorker.ts @@ -170,7 +170,7 @@ async function crawlPage(jobId: string, url: string) { const page = await context.newPage(); await page.goto(url, { - timeout: 10000, // 10 seconds + timeout: serverConfig.crawler.navigateTimeoutSec * 1000, }); logger.info( `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`, diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts index f8c48408..8e69dcd2 100644 --- a/apps/workers/utils.ts +++ b/apps/workers/utils.ts @@ -26,7 +26,6 @@ export async function readPDFText(buffer: Buffer): Promise<{ const pdfParser = new PDFParser(null, 1); pdfParser.on("pdfParser_dataError", reject); pdfParser.on("pdfParser_dataReady", (pdfData) => { - // eslint-disable-next-line resolve({ // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327 // eslint-disable-next-line diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md index 5bf1612c..28ead2f1 100644 --- a/docs/docs/03-configuration.md +++ b/docs/docs/03-configuration.md @@ -37,6 +37,7 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin ## Crawler Configs -| Name | Required | Default | Description | -| ----------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit | +| Name | Required | Default | Description | +| ---------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit | +| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection | diff --git a/packages/shared/config.ts b/packages/shared/config.ts index 4e444908..41173433 100644 --- a/packages/shared/config.ts +++ b/packages/shared/config.ts @@ -21,6 +21,7 @@ const allEnv = z.object({ CRAWLER_HEADLESS_BROWSER: stringBool("true"), BROWSER_WEB_URL: z.string().url().optional(), CRAWLER_JOB_TIMEOUT_SEC: z.number().default(60), + CRAWLER_NAVIGATE_TIMEOUT_SEC: z.number().default(30), MEILI_ADDR: z.string().optional(), MEILI_MASTER_KEY: z.string().default(""), LOG_LEVEL: z.string().default("debug"), @@ -58,6 +59,7 @@ const serverConfigSchema = allEnv.transform((val) => { headlessBrowser: val.CRAWLER_HEADLESS_BROWSER, browserWebUrl: val.BROWSER_WEB_URL, jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC, + navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC, }, meilisearch: val.MEILI_ADDR ? { diff --git a/packages/shared/queues.ts b/packages/shared/queues.ts index 146c19c6..6d5fdd5f 100644 --- a/packages/shared/queues.ts +++ b/packages/shared/queues.ts @@ -17,7 +17,16 @@ export type ZCrawlLinkRequest = z.infer<typeof zCrawlLinkRequestSchema>; export const LinkCrawlerQueue = new Queue<ZCrawlLinkRequest, void>( "link_crawler_queue", - { connection: queueConnectionDetails }, + { + connection: queueConnectionDetails, + defaultJobOptions: { + attempts: 5, + backoff: { + type: "exponential", + delay: 1000, + }, + }, + }, ); // OpenAI Worker |
