about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: MohamedBassem <me@mbassem.com> 2024-02-14 11:49:57 +0000
committer: MohamedBassem <me@mbassem.com> 2024-02-14 11:49:57 +0000
commit: c80ac83dabd7482b394585b2822cc921d76e17f9 (patch)
tree: 1862f17bbd00baf91d67919dff32d1ae8603393e
parent: 686c677edc3a739cd8afdf6d20837fef202cbfe7 (diff)
download: karakeep-c80ac83dabd7482b394585b2822cc921d76e17f9.tar.zst
fix: Harden puppeteer against browser disconnections and exceptions
-rw-r--r-- packages/workers/crawler.ts 49
-rw-r--r-- packages/workers/package.json 1
-rw-r--r-- yarn.lock 10
3 files changed, 44 insertions, 16 deletions
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 4febc1ca..353f9056 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -25,6 +25,8 @@ import metascraperTitle from "metascraper-title";
import metascraperUrl from "metascraper-url";
import metascraperTwitter from "metascraper-twitter";
import metascraperReadability from "metascraper-readability";
+import { Mutex } from "async-mutex";
+import assert from "assert";
const metascraperParser = metascraper([
metascraperReadability(),
@@ -37,14 +39,27 @@ const metascraperParser = metascraper([
]);
let browser: Browser | undefined;
+// Guards the interactions with the browser instance.
+// This is needed given that most of the browser APIs are async.
+const browserMutex = new Mutex();
+
+async function launchBrowser() {
+ browser = undefined;
+ await browserMutex.runExclusive(async () => {
+ browser = await puppeteer.launch({ headless: true });
+ browser.on("disconnected", async () => {
+ logger.info(
+ "The puppeteer browser got disconnected. Will attempt to launch it again.",
+ );
+ await launchBrowser();
+ });
+ });
+}
export class CrawlerWorker {
static async build() {
- if (!browser) {
- puppeteer.use(StealthPlugin());
- console.log("HERE");
- browser = await puppeteer.launch({ headless: true });
- }
+ puppeteer.use(StealthPlugin());
+ await launchBrowser();
logger.info("Starting crawler worker ...");
const worker = new Worker<ZCrawlLinkRequest, void>(
@@ -82,20 +97,22 @@ async function getBookmarkUrl(bookmarkId: string) {
}
async function crawlPage(url: string) {
- if (!browser) {
- throw new Error("The browser must have been initalized by this point.");
- }
+ assert(browser);
const context = await browser.createBrowserContext();
- const page = await context.newPage();
- await page.goto(url, {
- timeout: 10000, // 10 seconds
- waitUntil: "networkidle2",
- });
+ try {
+ const page = await context.newPage();
+
+ await page.goto(url, {
+ timeout: 10000, // 10 seconds
+ waitUntil: "networkidle2",
+ });
- const htmlContent = await page.content();
- await context.close();
- return htmlContent;
+ const htmlContent = await page.content();
+ return htmlContent;
+ } finally {
+ await context.close();
+ }
}
async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
diff --git a/packages/workers/package.json b/packages/workers/package.json
index e20dc7f2..f84737a2 100644
--- a/packages/workers/package.json
+++ b/packages/workers/package.json
@@ -6,6 +6,7 @@
"dependencies": {
"@remember/db": "0.1.0",
"@remember/shared": "0.1.0",
+ "async-mutex": "^0.4.1",
"dotenv": "^16.4.1",
"metascraper": "^5.43.4",
"metascraper-description": "^5.43.4",
diff --git a/yarn.lock b/yarn.lock
index 8f424166..592ed96e 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1684,6 +1684,7 @@ __metadata:
"@remember/shared": "npm:0.1.0"
"@tsconfig/node21": "npm:^21.0.1"
"@types/metascraper": "npm:^5.14.3"
+ async-mutex: "npm:^0.4.1"
dotenv: "npm:^16.4.1"
metascraper: "npm:^5.43.4"
metascraper-description: "npm:^5.43.4"
@@ -2803,6 +2804,15 @@ __metadata:
languageName: node
linkType: hard
+"async-mutex@npm:^0.4.1":
+ version: 0.4.1
+ resolution: "async-mutex@npm:0.4.1"
+ dependencies:
+ tslib: "npm:^2.4.0"
+ checksum: 10c0/3c412736c0bc4a9a2cfd948276a8caab8686aa615866a5bd20986e616f8945320acb310058a17afa1b31b8de6f634a78b7ec2217a33d7559b38f68bb85a95854
+ languageName: node
+ linkType: hard
+
"async@npm:^3.2.3":
version: 3.2.5
resolution: "async@npm:3.2.5"