From c80ac83dabd7482b394585b2822cc921d76e17f9 Mon Sep 17 00:00:00 2001
From: MohamedBassem
Date: Wed, 14 Feb 2024 11:49:57 +0000
Subject: fix: Harden puppeteer against browser disconnections and exceptions

---
 packages/workers/crawler.ts   | 49 +++++++++++++++++++++++++++++--------------
 packages/workers/package.json |  1 +
 2 files changed, 34 insertions(+), 16 deletions(-)

(limited to 'packages/workers')

diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 4febc1ca..353f9056 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -25,6 +25,8 @@ import metascraperTitle from "metascraper-title";
 import metascraperUrl from "metascraper-url";
 import metascraperTwitter from "metascraper-twitter";
 import metascraperReadability from "metascraper-readability";
+import { Mutex } from "async-mutex";
+import assert from "assert";
 
 const metascraperParser = metascraper([
   metascraperReadability(),
@@ -37,14 +39,27 @@ const metascraperParser = metascraper([
 ]);
 
 let browser: Browser | undefined;
+// Guards the interactions with the browser instance.
+// This is needed given that most of the browser APIs are async.
+const browserMutex = new Mutex();
+
+async function launchBrowser() {
+  browser = undefined;
+  await browserMutex.runExclusive(async () => {
+    browser = await puppeteer.launch({ headless: true });
+    browser.on("disconnected", async () => {
+      logger.info(
+        "The puppeteer browser got disconnected. Will attempt to launch it again.",
+      );
+      await launchBrowser();
+    });
+  });
+}
 
 export class CrawlerWorker {
   static async build() {
-    if (!browser) {
-      puppeteer.use(StealthPlugin());
-      console.log("HERE");
-      browser = await puppeteer.launch({ headless: true });
-    }
+    puppeteer.use(StealthPlugin());
+    await launchBrowser();
     logger.info("Starting crawler worker ...");
 
     const worker = new Worker(
@@ -82,20 +97,22 @@ async function getBookmarkUrl(bookmarkId: string) {
 }
 
 async function crawlPage(url: string) {
-  if (!browser) {
-    throw new Error("The browser must have been initalized by this point.");
-  }
+  assert(browser);
   const context = await browser.createBrowserContext();
-  const page = await context.newPage();
 
-  await page.goto(url, {
-    timeout: 10000, // 10 seconds
-    waitUntil: "networkidle2",
-  });
+  try {
+    const page = await context.newPage();
+
+    await page.goto(url, {
+      timeout: 10000, // 10 seconds
+      waitUntil: "networkidle2",
+    });
 
-  const htmlContent = await page.content();
-  await context.close();
-  return htmlContent;
+    const htmlContent = await page.content();
+    return htmlContent;
+  } finally {
+    await context.close();
+  }
 }
 
 async function runCrawler(job: Job) {
diff --git a/packages/workers/package.json b/packages/workers/package.json
index e20dc7f2..f84737a2 100644
--- a/packages/workers/package.json
+++ b/packages/workers/package.json
@@ -6,6 +6,7 @@
   "dependencies": {
     "@remember/db": "0.1.0",
     "@remember/shared": "0.1.0",
+    "async-mutex": "^0.4.1",
     "dotenv": "^16.4.1",
     "metascraper": "^5.43.4",
     "metascraper-description": "^5.43.4",
-- 
cgit v1.2.3-70-g09d2