author    MohamedBassem <me@mbassem.com>  2024-02-17 13:35:16 +0000
committer MohamedBassem <me@mbassem.com>  2024-02-17 13:35:16 +0000
commit    e247b141a98ab7b55d54ca5c7be8347fd076bda2 (patch)
tree      c531b93ad3451943ad288452cc72ef65190b13c2
parent    561bef94d9229b3125228d7a6110847a4899591d (diff)
download  karakeep-e247b141a98ab7b55d54ca5c7be8347fd076bda2.tar.zst
fix: Let the crawler wait a bit more for page load
-rw-r--r--  .env.sample                  4
-rw-r--r--  packages/shared/config.ts    3
-rw-r--r--  packages/workers/crawler.ts  14
3 files changed, 18 insertions(+), 3 deletions(-)
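
The change replaces the networkidle2 condition on page.goto with an explicit, capped wait after navigation: the network-idle wait is raced against a fixed timer, so a page whose network never settles cannot stall the crawler. A minimal sketch of that pattern, using a hypothetical waitWithCap helper that is not part of this repo:

// Hypothetical helper illustrating the Promise.race pattern used in
// packages/workers/crawler.ts below: resolve when `work` finishes or
// after `capMs` milliseconds, whichever comes first.
async function waitWithCap<T>(work: Promise<T>, capMs: number): Promise<T | undefined> {
  return Promise.race([
    work,
    new Promise<undefined>((resolve) => setTimeout(() => resolve(undefined), capMs)),
  ]);
}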
diff --git a/.env.sample b/.env.sample
index fd160adb..befdb5c3 100644
--- a/.env.sample
+++ b/.env.sample
@@ -1,6 +1,8 @@
# Must have a scheme in the beginning (e.g. file://)
# DATABASE_URL=
-# LOG_LEVEL="debug"
+# LOG_LEVEL=debug
+# CRAWLER_HEADLESS_BROWSER=true
+
# Redis for BullMQ
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 8cf0d620..35014823 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -24,6 +24,9 @@ const serverConfig = {
redisHost: process.env.REDIS_HOST || "localhost",
redisPort: parseInt(process.env.REDIS_PORT || "6379"),
},
+ crawler: {
+ headlessBrowser: (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true",
+ },
logLevel: process.env.LOG_LEVEL || "debug",
};
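
For reference, the new crawler.headlessBrowser flag defaults to true; any value of CRAWLER_HEADLESS_BROWSER other than the exact string "true" turns headless mode off. A small sketch of the same parsing logic in isolation (the parseHeadlessFlag name is illustrative, not part of the repo):

// Mirrors (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true" from config.ts.
function parseHeadlessFlag(raw: string | undefined): boolean {
  return (raw ?? "true") === "true";
}

parseHeadlessFlag(undefined); // true  -> headless (default when the variable is unset)
parseHeadlessFlag("true");    // true  -> headless
parseHeadlessFlag("false");   // false -> visible browser window
parseHeadlessFlag("1");       // false -> any other value also disables headless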
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 353f9056..4ba6aedc 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -27,6 +27,7 @@ import metascraperTwitter from "metascraper-twitter";
import metascraperReadability from "metascraper-readability";
import { Mutex } from "async-mutex";
import assert from "assert";
+import serverConfig from "@remember/shared/config";
const metascraperParser = metascraper([
metascraperReadability(),
@@ -46,7 +47,7 @@ const browserMutex = new Mutex();
async function launchBrowser() {
browser = undefined;
await browserMutex.runExclusive(async () => {
- browser = await puppeteer.launch({ headless: true });
+ browser = await puppeteer.launch({ headless: serverConfig.crawler.headlessBrowser });
browser.on("disconnected", async () => {
logger.info(
"The puppeteer browser got disconnected. Will attempt to launch it again.",
@@ -105,9 +106,18 @@ async function crawlPage(url: string) {
await page.goto(url, {
timeout: 10000, // 10 seconds
- waitUntil: "networkidle2",
});
+ // Wait until there are at most two in-flight connections for 1 second,
+ // but give up after at most 5 seconds
+ await Promise.race([
+ page.waitForNetworkIdle({
+ idleTime: 1000, // 1 sec
+ concurrency: 2,
+ }),
+ new Promise((f) => setTimeout(f, 5000)),
+ ]);
+
const htmlContent = await page.content();
return htmlContent;
} finally {
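
Putting both pieces together, the post-patch crawl flow launches the browser with the configured headless flag, navigates without a network-idle condition, and then gives the network up to five extra seconds to settle. A condensed sketch assuming Puppeteer's Page API, omitting the repo's logging, mutex, and browser-reuse logic:

import puppeteer from "puppeteer";
import serverConfig from "@remember/shared/config";

// Condensed view of the post-patch behavior in crawlPage: goto resolves on
// load, then the bounded waitForNetworkIdle race caps the extra wait at 5s.
async function fetchHtml(url: string): Promise<string> {
  const browser = await puppeteer.launch({
    headless: serverConfig.crawler.headlessBrowser,
  });
  const page = await browser.newPage();
  try {
    await page.goto(url, { timeout: 10000 });
    await Promise.race([
      page.waitForNetworkIdle({ idleTime: 1000, concurrency: 2 }),
      new Promise((resolve) => setTimeout(resolve, 5000)),
    ]);
    return await page.content();
  } finally {
    await browser.close();
  }
}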