| author    | MohamedBassem <me@mbassem.com>                            | 2024-02-17 13:35:16 +0000 |
|-----------|-----------------------------------------------------------|---------------------------|
| committer | MohamedBassem <me@mbassem.com>                            | 2024-02-17 13:35:16 +0000 |
| commit    | e247b141a98ab7b55d54ca5c7be8347fd076bda2                  |                           |
| tree      | c531b93ad3451943ad288452cc72ef65190b13c2                  |                           |
| parent    | 561bef94d9229b3125228d7a6110847a4899591d                  |                           |
| download  | karakeep-e247b141a98ab7b55d54ca5c7be8347fd076bda2.tar.zst |                           |
fix: Let the crawler wait a bit more for page load
| -rw-r--r-- | .env.sample                 | 4  |
| -rw-r--r-- | packages/shared/config.ts   | 3  |
| -rw-r--r-- | packages/workers/crawler.ts | 14 |
3 files changed, 18 insertions, 3 deletions
diff --git a/.env.sample b/.env.sample
index fd160adb..befdb5c3 100644
--- a/.env.sample
+++ b/.env.sample
@@ -1,6 +1,8 @@
 # Must have a scheme in the beginning (e.g. file://)
 # DATABASE_URL=
-# LOG_LEVEL="debug"
+# LOG_LEVEL=debug
+# CRAWLER_HEADLESS_BROWSER=true
+
 
 # Redis for BullMQ
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 8cf0d620..35014823 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -24,6 +24,9 @@ const serverConfig = {
     redisHost: process.env.REDIS_HOST || "localhost",
     redisPort: parseInt(process.env.REDIS_PORT || "6379"),
   },
+  crawler: {
+    headlessBrowser: (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true",
+  },
   logLevel: process.env.LOG_LEVEL || "debug",
 };
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 353f9056..4ba6aedc 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -27,6 +27,7 @@ import metascraperTwitter from "metascraper-twitter";
 import metascraperReadability from "metascraper-readability";
 import { Mutex } from "async-mutex";
 import assert from "assert";
+import serverConfig from "@remember/shared/config";
 
 const metascraperParser = metascraper([
   metascraperReadability(),
@@ -46,7 +47,7 @@ const browserMutex = new Mutex();
 async function launchBrowser() {
   browser = undefined;
   await browserMutex.runExclusive(async () => {
-    browser = await puppeteer.launch({ headless: true });
+    browser = await puppeteer.launch({ headless: serverConfig.crawler.headlessBrowser });
     browser.on("disconnected", async () => {
       logger.info(
         "The puppeteer browser got disconnected. Will attempt to launch it again.",
@@ -105,9 +106,18 @@ async function crawlPage(url: string) {
 
     await page.goto(url, {
       timeout: 10000, // 10 seconds
-      waitUntil: "networkidle2",
     });
 
+    // Wait until there's at most two connections for 2 seconds
+    // Attempt to wait only for 5 seconds
+    await Promise.race([
+      page.waitForNetworkIdle({
+        idleTime: 1000, // 1 sec
+        concurrency: 2,
+      }),
+      new Promise((f) => setTimeout(f, 5000)),
+    ]);
+
     const htmlContent = await page.content();
     return htmlContent;
   } finally {
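The crawler hunk is the heart of the commit: instead of asking page.goto() to block until "networkidle2" (which let slow or chatty pages exhaust the whole 10-second navigation timeout), navigation now returns as soon as the page loads, and a separate best-effort idle wait is capped at 5 seconds via Promise.race. Below is a minimal standalone sketch of the same pattern. The URL and per-call browser are hypothetical (the real worker shares one mutex-guarded browser), and the .catch() on waitForNetworkIdle is an extra guard not present in the commit, added so a lost race cannot surface later as an unhandled TimeoutError rejection.

```typescript
import puppeteer from "puppeteer";

// Mirrors the config.ts hunk: headless unless CRAWLER_HEADLESS_BROWSER
// is set to something other than the exact string "true".
const headless = (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true";

async function crawlPage(url: string): Promise<string> {
  const browser = await puppeteer.launch({ headless });
  const page = await browser.newPage();
  try {
    // Navigation no longer waits for network idle, so goto() resolves
    // on load and can't burn its 10-second budget on stray requests.
    await page.goto(url, { timeout: 10_000 });

    // Best-effort settle: wait until at most 2 requests have been in
    // flight for 1 second, but give up after 5 seconds rather than
    // failing the crawl.
    await Promise.race([
      page
        .waitForNetworkIdle({ idleTime: 1000, concurrency: 2 })
        // Guard not in the commit: swallow the TimeoutError this call
        // raises on its own (30s default) timeout if it loses the race.
        .catch(() => {}),
      new Promise((resolve) => setTimeout(resolve, 5000)),
    ]);

    return await page.content();
  } finally {
    await page.close();
    await browser.close();
  }
}

// Hypothetical usage:
crawlPage("https://example.com").then((html) =>
  console.log(`fetched ${html.length} bytes of HTML`),
);
```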

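One detail of the new flag worth noting, based on the config.ts hunk above: the value is compared against the literal string "true", so anything else (including "TRUE" or "1") disables headless mode, while an unset variable falls back to "true" through the ?? default. A tiny illustration, with a hypothetical helper name:

```typescript
// Hypothetical helper reproducing the flag's parsing semantics.
const parseHeadless = (v: string | undefined) => (v ?? "true") == "true";

console.log(parseHeadless(undefined)); // true  — unset defaults to headless
console.log(parseHeadless("true"));    // true
console.log(parseHeadless("false"));   // false
console.log(parseHeadless("TRUE"));    // false — comparison is case-sensitive
```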