about | summary | refs | log | tree | commit | diff | stats
path: root/packages
diff options
context:
space:
mode:
author MohamedBassem <me@mbassem.com> 2024-02-14 11:49:57 +0000
committer MohamedBassem <me@mbassem.com> 2024-02-14 11:49:57 +0000
commit c80ac83dabd7482b394585b2822cc921d76e17f9 (patch)
tree 1862f17bbd00baf91d67919dff32d1ae8603393e /packages
parent 686c677edc3a739cd8afdf6d20837fef202cbfe7 (diff)
download karakeep-c80ac83dabd7482b394585b2822cc921d76e17f9.tar.zst
fix: Harden puppeteer against browser disconnections and exceptions
Diffstat (limited to 'packages')
-rw-r--r-- packages/workers/crawler.ts 49
-rw-r--r-- packages/workers/package.json 1
2 files changed, 34 insertions, 16 deletions
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 4febc1ca..353f9056 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -25,6 +25,8 @@ import metascraperTitle from "metascraper-title";
import metascraperUrl from "metascraper-url";
import metascraperTwitter from "metascraper-twitter";
import metascraperReadability from "metascraper-readability";
+import { Mutex } from "async-mutex";
+import assert from "assert";
const metascraperParser = metascraper([
metascraperReadability(),
@@ -37,14 +39,27 @@ const metascraperParser = metascraper([
]);
let browser: Browser | undefined;
+// Guards the interactions with the browser instance.
+// This is needed given that most of the browser APIs are async.
+const browserMutex = new Mutex();
+
+async function launchBrowser() {
+ browser = undefined;
+ await browserMutex.runExclusive(async () => {
+ browser = await puppeteer.launch({ headless: true });
+ browser.on("disconnected", async () => {
+ logger.info(
+ "The puppeteer browser got disconnected. Will attempt to launch it again.",
+ );
+ await launchBrowser();
+ });
+ });
+}
export class CrawlerWorker {
static async build() {
- if (!browser) {
- puppeteer.use(StealthPlugin());
- console.log("HERE");
- browser = await puppeteer.launch({ headless: true });
- }
+ puppeteer.use(StealthPlugin());
+ await launchBrowser();
logger.info("Starting crawler worker ...");
const worker = new Worker<ZCrawlLinkRequest, void>(
@@ -82,20 +97,22 @@ async function getBookmarkUrl(bookmarkId: string) {
}
async function crawlPage(url: string) {
- if (!browser) {
- throw new Error("The browser must have been initalized by this point.");
- }
+ assert(browser);
const context = await browser.createBrowserContext();
- const page = await context.newPage();
- await page.goto(url, {
- timeout: 10000, // 10 seconds
- waitUntil: "networkidle2",
- });
+ try {
+ const page = await context.newPage();
+
+ await page.goto(url, {
+ timeout: 10000, // 10 seconds
+ waitUntil: "networkidle2",
+ });
- const htmlContent = await page.content();
- await context.close();
- return htmlContent;
+ const htmlContent = await page.content();
+ return htmlContent;
+ } finally {
+ await context.close();
+ }
}
async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
diff --git a/packages/workers/package.json b/packages/workers/package.json
index e20dc7f2..f84737a2 100644
--- a/packages/workers/package.json
+++ b/packages/workers/package.json
@@ -6,6 +6,7 @@
"dependencies": {
"@remember/db": "0.1.0",
"@remember/shared": "0.1.0",
+ "async-mutex": "^0.4.1",
"dotenv": "^16.4.1",
"metascraper": "^5.43.4",
"metascraper-description": "^5.43.4",