aboutsummaryrefslogtreecommitdiffstats
path: root/packages
diff options
context:
space:
mode:
Diffstat (limited to 'packages')
-rw-r--r-- packages/shared/config.ts 3
-rw-r--r-- packages/workers/crawler.ts 14
2 files changed, 15 insertions, 2 deletions
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 8cf0d620..35014823 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -24,6 +24,9 @@ const serverConfig = {
redisHost: process.env.REDIS_HOST || "localhost",
redisPort: parseInt(process.env.REDIS_PORT || "6379"),
},
+ crawler: {
+ headlessBrowser: (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true",
+ },
logLevel: process.env.LOG_LEVEL || "debug",
};
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 353f9056..4ba6aedc 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -27,6 +27,7 @@ import metascraperTwitter from "metascraper-twitter";
import metascraperReadability from "metascraper-readability";
import { Mutex } from "async-mutex";
import assert from "assert";
+import serverConfig from "@remember/shared/config";
const metascraperParser = metascraper([
metascraperReadability(),
@@ -46,7 +47,7 @@ const browserMutex = new Mutex();
async function launchBrowser() {
browser = undefined;
await browserMutex.runExclusive(async () => {
- browser = await puppeteer.launch({ headless: true });
+ browser = await puppeteer.launch({ headless: serverConfig.crawler.headlessBrowser });
browser.on("disconnected", async () => {
logger.info(
"The puppeteer browser got disconnected. Will attempt to launch it again.",
@@ -105,9 +106,18 @@ async function crawlPage(url: string) {
await page.goto(url, {
timeout: 10000, // 10 seconds
- waitUntil: "networkidle2",
});
+ // Wait until there have been at most two in-flight network connections for 1 second,
+ // but give up after 5 seconds and continue with whatever content has loaded by then.
+ await Promise.race([
+ page.waitForNetworkIdle({
+ idleTime: 1000, // 1 sec
+ concurrency: 2,
+ }),
+ new Promise((f) => setTimeout(f, 5000)),
+ ]);
+
const htmlContent = await page.content();
return htmlContent;
} finally {