From e247b141a98ab7b55d54ca5c7be8347fd076bda2 Mon Sep 17 00:00:00 2001 From: MohamedBassem Date: Sat, 17 Feb 2024 13:35:16 +0000 Subject: fix: Let the crawler wait a bit more for page load --- .env.sample | 4 +++- packages/shared/config.ts | 3 +++ packages/workers/crawler.ts | 14 ++++++++++++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/.env.sample b/.env.sample index fd160adb..befdb5c3 100644 --- a/.env.sample +++ b/.env.sample @@ -1,6 +1,8 @@ # Must have a scheme in the beginning (e.g. file://) # DATABASE_URL= -# LOG_LEVEL=debug +# LOG_LEVEL=debug +# CRAWLER_HEADLESS_BROWSER=true + # Redis for BullMQ diff --git a/packages/shared/config.ts b/packages/shared/config.ts index 8cf0d620..35014823 100644 --- a/packages/shared/config.ts +++ b/packages/shared/config.ts @@ -24,6 +24,9 @@ const serverConfig = { redisHost: process.env.REDIS_HOST || "localhost", redisPort: parseInt(process.env.REDIS_PORT || "6379"), }, + crawler: { + headlessBrowser: (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true", + }, logLevel: process.env.LOG_LEVEL || "debug", }; diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts index 353f9056..4ba6aedc 100644 --- a/packages/workers/crawler.ts +++ b/packages/workers/crawler.ts @@ -27,6 +27,7 @@ import metascraperTwitter from "metascraper-twitter"; import metascraperReadability from "metascraper-readability"; import { Mutex } from "async-mutex"; import assert from "assert"; +import serverConfig from "@remember/shared/config"; const metascraperParser = metascraper([ metascraperReadability(), @@ -46,7 +47,7 @@ const browserMutex = new Mutex(); async function launchBrowser() { browser = undefined; await browserMutex.runExclusive(async () => { - browser = await puppeteer.launch({ headless: true }); + browser = await puppeteer.launch({ headless: serverConfig.crawler.headlessBrowser }); browser.on("disconnected", async () => { logger.info( "The puppeteer browser got disconnected. 
Will attempt to launch it again.", @@ -105,9 +106,18 @@ async function crawlPage(url: string) { await page.goto(url, { timeout: 10000, // 10 seconds - waitUntil: "networkidle2", }); + // Wait until there's at most two connections for 1 second + // Attempt to wait only for 5 seconds + await Promise.race([ + page.waitForNetworkIdle({ + idleTime: 1000, // 1 sec + concurrency: 2, + }), + new Promise((f) => setTimeout(f, 5000)), + ]); + const htmlContent = await page.content(); return htmlContent; } finally { -- cgit v1.2.3-70-g09d2