diff options
Diffstat (limited to 'packages')
| -rw-r--r-- | packages/shared/config.ts | 3 | ||||
| -rw-r--r-- | packages/workers/crawler.ts | 14 |
2 files changed, 15 insertions, 2 deletions
diff --git a/packages/shared/config.ts b/packages/shared/config.ts index 8cf0d620..35014823 100644 --- a/packages/shared/config.ts +++ b/packages/shared/config.ts @@ -24,6 +24,9 @@ const serverConfig = { redisHost: process.env.REDIS_HOST || "localhost", redisPort: parseInt(process.env.REDIS_PORT || "6379"), }, + crawler: { + headlessBrowser: (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true", + }, logLevel: process.env.LOG_LEVEL || "debug", }; diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts index 353f9056..4ba6aedc 100644 --- a/packages/workers/crawler.ts +++ b/packages/workers/crawler.ts @@ -27,6 +27,7 @@ import metascraperTwitter from "metascraper-twitter"; import metascraperReadability from "metascraper-readability"; import { Mutex } from "async-mutex"; import assert from "assert"; +import serverConfig from "@remember/shared/config"; const metascraperParser = metascraper([ metascraperReadability(), @@ -46,7 +47,7 @@ const browserMutex = new Mutex(); async function launchBrowser() { browser = undefined; await browserMutex.runExclusive(async () => { - browser = await puppeteer.launch({ headless: true }); + browser = await puppeteer.launch({ headless: serverConfig.crawler.headlessBrowser }); browser.on("disconnected", async () => { logger.info( "The puppeteer browser got disconnected. Will attempt to launch it again.", @@ -105,9 +106,18 @@ async function crawlPage(url: string) { await page.goto(url, { timeout: 10000, // 10 seconds - waitUntil: "networkidle2", }); + // Wait until there's at most two connections for 1 second + // Attempt to wait only for 5 seconds + await Promise.race([ + page.waitForNetworkIdle({ + idleTime: 1000, // 1 sec + concurrency: 2, + }), + new Promise((f) => setTimeout(f, 5000)), + ]); + const htmlContent = await page.content(); return htmlContent; } finally { |
