author     MohamedBassem <me@mbassem.com>    2024-04-11 15:03:31 +0300
committer  MohamedBassem <me@mbassem.com>    2024-04-11 15:03:31 +0300
commit     238c2967b269ca0f66d8e759c6a0234107e1fd1e (patch)
tree       1e8590a829bd6b24950ac56eb0a21450c8ce3332
parent     be622e5594ecb21c82bb6066a82c86e0917bcc35 (diff)
download   karakeep-238c2967b269ca0f66d8e759c6a0234107e1fd1e.tar.zst
fix: Increase default navigation timeout to 30s, make it configurable and add retries to crawling jobs
 apps/workers/crawlerWorker.ts |  2 +-
 apps/workers/utils.ts         |  1 -
 docs/docs/03-configuration.md |  7 ++++---
 packages/shared/config.ts     |  2 ++
 packages/shared/queues.ts     | 11 ++++++++++-
 5 files changed, 17 insertions(+), 6 deletions(-)
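
The navigation-timeout half of the change threads a new CRAWLER_NAVIGATE_TIMEOUT_SEC setting (default 30) through serverConfig.crawler.navigateTimeoutSec into Playwright's page.goto() timeout, which was previously hard-coded to 10 seconds. Below is a minimal standalone sketch of that flow, assuming zod's z.coerce.number() for env parsing and a local navigateTimeoutMs variable; both are illustrative choices, not the repo's exact wiring (which is in the diff that follows).

```ts
import { z } from "zod";

// Illustrative sketch only: parse the new env var with a 30s default and
// convert it to the millisecond value that page.goto() expects.
const crawlerEnv = z.object({
  // z.coerce.number() is an assumption here; the repo's schema uses z.number().
  CRAWLER_NAVIGATE_TIMEOUT_SEC: z.coerce.number().positive().default(30),
});

const env = crawlerEnv.parse(process.env);
const navigateTimeoutMs = env.CRAWLER_NAVIGATE_TIMEOUT_SEC * 1000;

console.log(`navigation timeout: ${navigateTimeoutMs} ms`); // 30000 by default
```

Setting CRAWLER_NAVIGATE_TIMEOUT_SEC=60 in the environment would then translate to a 60000 ms goto timeout.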
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index c9a1189c..eec8cd98 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -170,7 +170,7 @@ async function crawlPage(jobId: string, url: string) {
const page = await context.newPage();
await page.goto(url, {
- timeout: 10000, // 10 seconds
+ timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
});
logger.info(
`[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index f8c48408..8e69dcd2 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -26,7 +26,6 @@ export async function readPDFText(buffer: Buffer): Promise<{
const pdfParser = new PDFParser(null, 1);
pdfParser.on("pdfParser_dataError", reject);
pdfParser.on("pdfParser_dataReady", (pdfData) => {
- // eslint-disable-next-line
resolve({
// The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327
// eslint-disable-next-line
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 5bf1612c..28ead2f1 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -37,6 +37,7 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
## Crawler Configs
-| Name | Required | Default | Description |
-| ----------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
+| Name | Required | Default | Description |
+| ---------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 4e444908..41173433 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -21,6 +21,7 @@ const allEnv = z.object({
CRAWLER_HEADLESS_BROWSER: stringBool("true"),
BROWSER_WEB_URL: z.string().url().optional(),
CRAWLER_JOB_TIMEOUT_SEC: z.number().default(60),
+ CRAWLER_NAVIGATE_TIMEOUT_SEC: z.number().default(30),
MEILI_ADDR: z.string().optional(),
MEILI_MASTER_KEY: z.string().default(""),
LOG_LEVEL: z.string().default("debug"),
@@ -58,6 +59,7 @@ const serverConfigSchema = allEnv.transform((val) => {
headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
browserWebUrl: val.BROWSER_WEB_URL,
jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
+ navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
},
meilisearch: val.MEILI_ADDR
? {
diff --git a/packages/shared/queues.ts b/packages/shared/queues.ts
index 146c19c6..6d5fdd5f 100644
--- a/packages/shared/queues.ts
+++ b/packages/shared/queues.ts
@@ -17,7 +17,16 @@ export type ZCrawlLinkRequest = z.infer<typeof zCrawlLinkRequestSchema>;
export const LinkCrawlerQueue = new Queue<ZCrawlLinkRequest, void>(
"link_crawler_queue",
- { connection: queueConnectionDetails },
+ {
+ connection: queueConnectionDetails,
+ defaultJobOptions: {
+ attempts: 5,
+ backoff: {
+ type: "exponential",
+ delay: 1000,
+ },
+ },
+ },
);
// OpenAI Worker
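
The retry half of the change gives LinkCrawlerQueue defaultJobOptions with attempts: 5 and an exponential backoff starting at 1000 ms, so a crawl that throws (for example on a navigation timeout) is re-enqueued with growing delays instead of failing permanently on the first error. The sketch below approximates the resulting retry schedule, assuming the exponential strategy doubles the base delay on each subsequent attempt; that formula is an assumption about BullMQ's behavior, not something shown in this diff.

```ts
// Illustrative only: approximate the wait before each retry under an
// "exponential" backoff with a 1000 ms base delay.
// Assumption: the delay grows as baseDelayMs * 2 ** (attemptsMade - 1).
function retryDelayMs(attemptsMade: number, baseDelayMs = 1000): number {
  return baseDelayMs * 2 ** (attemptsMade - 1);
}

// With attempts: 5, a job runs once and can be retried up to 4 times.
for (let attempt = 1; attempt <= 4; attempt++) {
  console.log(`retry #${attempt} after ~${retryDelayMs(attempt)} ms`);
}
// retry #1 after ~1000 ms, #2 after ~2000 ms, #3 after ~4000 ms, #4 after ~8000 ms
```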