From e247b141a98ab7b55d54ca5c7be8347fd076bda2 Mon Sep 17 00:00:00 2001 From: MohamedBassem Date: Sat, 17 Feb 2024 13:35:16 +0000 Subject: fix: Let the crawler wait a bit more for page load --- packages/workers/crawler.ts | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'packages/workers') diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts index 353f9056..4ba6aedc 100644 --- a/packages/workers/crawler.ts +++ b/packages/workers/crawler.ts @@ -27,6 +27,7 @@ import metascraperTwitter from "metascraper-twitter"; import metascraperReadability from "metascraper-readability"; import { Mutex } from "async-mutex"; import assert from "assert"; +import serverConfig from "@remember/shared/config"; const metascraperParser = metascraper([ metascraperReadability(), @@ -46,7 +47,7 @@ const browserMutex = new Mutex(); async function launchBrowser() { browser = undefined; await browserMutex.runExclusive(async () => { - browser = await puppeteer.launch({ headless: true }); + browser = await puppeteer.launch({ headless: serverConfig.crawler.headlessBrowser }); browser.on("disconnected", async () => { logger.info( "The puppeteer browser got disconnected. Will attempt to launch it again.", @@ -105,9 +106,18 @@ async function crawlPage(url: string) { await page.goto(url, { timeout: 10000, // 10 seconds - waitUntil: "networkidle2", }); + // Wait until there's at most two connections for 1 second + // Attempt to wait only for 5 seconds + await Promise.race([ + page.waitForNetworkIdle({ + idleTime: 1000, // 1 sec + concurrency: 2, + }), + new Promise((f) => setTimeout(f, 5000)), + ]); + const htmlContent = await page.content(); return htmlContent; } finally { -- cgit v1.2.3-70-g09d2