-rw-r--r--  apps/workers/crawlerWorker.ts  32
-rw-r--r--  docs/docs/03-configuration.md   6
-rw-r--r--  packages/shared/config.ts       2
3 files changed, 29 insertions, 11 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index a969ab86..cce409e5 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -56,14 +56,14 @@ async function launchBrowser() {
try {
if (serverConfig.crawler.browserWebUrl) {
logger.info(
- `Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+ `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
);
const webUrl = new URL(serverConfig.crawler.browserWebUrl);
// We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
const { address: address } = await dns.promises.lookup(webUrl.hostname);
webUrl.hostname = address;
logger.info(
- `Successfully resolved IP address, new address: ${webUrl.toString()}`,
+ `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
);
browser = await puppeteer.connect({
browserURL: webUrl.toString(),
@@ -76,7 +76,7 @@ async function launchBrowser() {
}
} catch (e) {
logger.error(
- "Failed to connect to the browser instance, will retry in 5 secs",
+ "[Crawler] Failed to connect to the browser instance, will retry in 5 secs",
);
setTimeout(() => {
launchBrowser();
@@ -86,12 +86,12 @@ async function launchBrowser() {
browser.on("disconnected", () => {
if (isShuttingDown) {
logger.info(
- "The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
+ "[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
);
return;
}
logger.info(
- "The puppeteer browser got disconnected. Will attempt to launch it again.",
+ "[Crawler] The puppeteer browser got disconnected. Will attempt to launch it again.",
);
launchBrowser();
});
@@ -111,7 +111,10 @@ export class CrawlerWorker {
logger.info("Starting crawler worker ...");
const worker = new Worker<ZCrawlLinkRequest, void>(
LinkCrawlerQueue.name,
- withTimeout(runCrawler, /* timeoutSec */ 30),
+ withTimeout(
+ runCrawler,
+ /* timeoutSec */ serverConfig.crawler.jobTimeoutSec,
+ ),
{
connection: queueConnectionDetails,
autorun: false,
@@ -125,9 +128,7 @@ export class CrawlerWorker {
worker.on("failed", (job, error) => {
const jobId = job?.id ?? "unknown";
- logger.error(
- `[Crawler][${jobId}] Crawling job failed: ${JSON.stringify(error)}`,
- );
+ logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
});
return worker;
@@ -161,7 +162,7 @@ function validateUrl(url: string) {
}
}
-async function crawlPage(url: string) {
+async function crawlPage(jobId: string, url: string) {
assert(browser);
const context = await browser.createBrowserContext();
@@ -171,6 +172,9 @@ async function crawlPage(url: string) {
await page.goto(url, {
timeout: 10000, // 10 seconds
});
+ logger.info(
+ `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
+ );
// Wait until there's at most two connections for 2 seconds
// Attempt to wait only for 5 seconds
@@ -182,6 +186,8 @@ async function crawlPage(url: string) {
new Promise((f) => setTimeout(f, 5000)),
]);
+ logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`);
+
const htmlContent = await page.content();
return htmlContent;
} finally {
@@ -208,12 +214,16 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
);
validateUrl(url);
- const htmlContent = await crawlPage(url);
+ const htmlContent = await crawlPage(jobId, url);
+ logger.info(
+ `[Crawler][${jobId}] Will attempt to parse the content of the page ...`,
+ );
const meta = await metascraperParser({
url,
html: htmlContent,
});
+ logger.info(`[Crawler][${jobId}] Done parsing the content of the page.`);
const window = new JSDOM("").window;
const purify = DOMPurify(window);
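
The hunk above swaps the hard-coded 30-second timeout for the configurable serverConfig.crawler.jobTimeoutSec. As a rough, illustrative sketch of what a per-job timeout wrapper in the spirit of withTimeout can look like (assumed names and shape, not the project's actual helper), the handler is raced against a timer:

    import { Job } from "bullmq";

    // Hypothetical sketch only: wraps a BullMQ job handler and fails it
    // if it does not settle within timeoutSec seconds.
    type Handler<T> = (job: Job<T, void>) => Promise<void>;

    function withJobTimeout<T>(handler: Handler<T>, timeoutSec: number): Handler<T> {
      return async (job) => {
        let timer: NodeJS.Timeout | undefined;
        const timeout = new Promise<never>((_, reject) => {
          timer = setTimeout(
            () => reject(new Error(`Job ${job.id} timed out after ${timeoutSec}s`)),
            timeoutSec * 1000,
          );
        });
        try {
          // Whichever settles first wins: the crawl result or the timeout rejection.
          await Promise.race([handler(job), timeout]);
        } finally {
          if (timer) clearTimeout(timer);
        }
      };
    }

In that model, raising CRAWLER_JOB_TIMEOUT_SEC simply gives the crawl handler more time before the race is lost and the job is marked failed.
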
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 8bf8a069..1307bcfd 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -34,3 +34,9 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
| INFERENCE_TEXT_MODEL | No | gpt-3.5-turbo-0125 | The model to use for text inference. You'll need to change this to some other model if you're using ollama. |
| INFERENCE_IMAGE_MODEL | No | gpt-4-vision-preview | The model to use for image inference. You'll need to change this to some other model if you're using ollama and that model needs to support vision APIs (e.g. llava). |
| INFERENCE_LANG | No | english | The language in which the tags will be generated. |
+
+## Crawler Configs
+
+| Name | Required | Default | Description |
+| ----------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long (in seconds) to wait for a crawler job to finish before timing out. If you have a slow internet connection or a low-powered device, you might want to increase this value. |
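
For instance, in a docker compose deployment the new variable would be passed to the container that runs the crawler workers; the service name below is only an assumed example, use whatever your compose file calls that service:

      workers:
        environment:
          # Example: allow slow pages up to two minutes before the crawl job times out
          - CRAWLER_JOB_TIMEOUT_SEC=120
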
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 11140c3b..75274a4e 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -20,6 +20,7 @@ const allEnv = z.object({
REDIS_DB_IDX: z.coerce.number().optional(),
CRAWLER_HEADLESS_BROWSER: stringBool("true"),
BROWSER_WEB_URL: z.string().url().optional(),
+ CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
MEILI_ADDR: z.string().optional(),
MEILI_MASTER_KEY: z.string().default(""),
LOG_LEVEL: z.string().default("debug"),
@@ -56,6 +57,7 @@ const serverConfigSchema = allEnv.transform((val) => {
crawler: {
headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
browserWebUrl: val.BROWSER_WEB_URL,
+ jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
},
meilisearch: val.MEILI_ADDR
? {