From c70d64d4cde2bf2acc5c4164eef79c40fd58aa42 Mon Sep 17 00:00:00 2001
From: Mael
Date: Sun, 22 Jun 2025 19:08:21 +0200
Subject: feat(workers): migrate from puppeteer to playwright (#1296)

* feat: convert to playwright

Convert crawling to use Playwright instead of Puppeteer.

- Update Dockerfile to include Playwright
- Update the crawler worker to use the Playwright API
- Update dependencies

* feat: convert from Puppeteer to Playwright for crawling

* feat: update docker-compose

* use separate browser context for better isolation

* skip chrome download in linux script

* re-add the stealth plugin

---------

Co-authored-by: Mohamed Bassem
---
 apps/workers/package.json             |  6 ++--
 apps/workers/workers/crawlerWorker.ts | 67 +++++++++++++++++++----------------
 2 files changed, 39 insertions(+), 34 deletions(-)

(limited to 'apps/workers')

diff --git a/apps/workers/package.json b/apps/workers/package.json
index c888350e..2ed6f9df 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -4,7 +4,7 @@
   "version": "0.1.0",
   "private": true,
   "dependencies": {
-    "@ghostery/adblocker-puppeteer": "^2.5.1",
+    "@ghostery/adblocker-playwright": "^2.5.1",
     "@karakeep/db": "workspace:^0.1.0",
     "@karakeep/shared": "workspace:^0.1.0",
     "@karakeep/trpc": "workspace:^0.1.0",
@@ -36,8 +36,8 @@
     "pdf2json": "^3.1.5",
     "pdf2pic": "^3.1.3",
     "pdfjs-dist": "^4.2.67",
-    "puppeteer": "^22.0.0",
-    "puppeteer-extra": "^3.3.6",
+    "playwright": "^1.42.1",
+    "playwright-extra": "^4.3.6",
     "puppeteer-extra-plugin-stealth": "^2.11.2",
     "rss-parser": "^3.13.0",
     "tesseract.js": "^5.1.1",
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index f0e831c2..d884d149 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -2,8 +2,7 @@ import * as dns from "dns";
 import { promises as fs } from "fs";
 import * as path from "node:path";
 import * as os from "os";
-import type { Browser } from "puppeteer";
-import { PuppeteerBlocker } from "@ghostery/adblocker-puppeteer";
+import { PlaywrightBlocker } from "@ghostery/adblocker-playwright";
 import { Readability } from "@mozilla/readability";
 import { Mutex } from "async-mutex";
 import DOMPurify from "dompurify";
@@ -25,7 +24,8 @@ import metascraperTitle from "metascraper-title";
 import metascraperTwitter from "metascraper-twitter";
 import metascraperUrl from "metascraper-url";
 import fetch from "node-fetch";
-import puppeteer from "puppeteer-extra";
+import { Browser } from "playwright";
+import { chromium } from "playwright-extra";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { withTimeout } from "utils";
 import { getBookmarkDetails, updateAsset } from "workerUtils";
@@ -81,38 +81,37 @@ const metascraperParser = metascraper([
 ]);
 
 let globalBrowser: Browser | undefined;
-let globalBlocker: PuppeteerBlocker | undefined;
+let globalBlocker: PlaywrightBlocker | undefined;
 // Guards the interactions with the browser instance.
 // This is needed given that most of the browser APIs are async.
 const browserMutex = new Mutex();
 
 async function startBrowserInstance() {
-  const defaultViewport = {
-    width: 1440,
-    height: 900,
-  };
   if (serverConfig.crawler.browserWebSocketUrl) {
     logger.info(
       `[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
     );
-    return puppeteer.connect({
-      browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
-      defaultViewport,
+    return await chromium.connect(serverConfig.crawler.browserWebSocketUrl, {
+      // Important: using slowMo to ensure stability with remote browser
+      slowMo: 100,
+      timeout: 5000,
     });
   } else if (serverConfig.crawler.browserWebUrl) {
     logger.info(
       `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
     );
+
     const webUrl = new URL(serverConfig.crawler.browserWebUrl);
-    // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
     const { address } = await dns.promises.lookup(webUrl.hostname);
     webUrl.hostname = address;
     logger.info(
       `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
     );
-    return puppeteer.connect({
-      browserURL: webUrl.toString(),
-      defaultViewport,
+
+    return await chromium.connectOverCDP(webUrl.toString(), {
+      // Important: using slowMo to ensure stability with remote browser
+      slowMo: 100,
+      timeout: 5000,
     });
   } else {
     logger.info(`Running in browserless mode`);
@@ -141,12 +140,12 @@ async function launchBrowser() {
     globalBrowser?.on("disconnected", () => {
       if (isShuttingDown) {
         logger.info(
-          "[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
+          "[Crawler] The Playwright browser got disconnected. But we're shutting down so won't restart it.",
         );
         return;
       }
       logger.info(
-        "[Crawler] The puppeteer browser got disconnected. Will attempt to launch it again.",
+        "[Crawler] The Playwright browser got disconnected. Will attempt to launch it again.",
       );
       launchBrowser();
     });
 }
@@ -155,11 +154,11 @@
 
 export class CrawlerWorker {
   static async build() {
-    puppeteer.use(StealthPlugin());
+    chromium.use(StealthPlugin());
     if (serverConfig.crawler.enableAdblocker) {
       try {
         logger.info("[crawler] Loading adblocker ...");
-        globalBlocker = await PuppeteerBlocker.fromPrebuiltFull(fetch, {
+        globalBlocker = await PlaywrightBlocker.fromPrebuiltFull(fetch, {
           path: path.join(os.tmpdir(), "karakeep_adblocker.bin"),
           read: fs.readFile,
           write: fs.writeFile,
@@ -287,39 +286,45 @@ async function crawlPage(
   if (!browser) {
     return browserlessCrawlPage(jobId, url, abortSignal);
   }
-  const context = await browser.createBrowserContext();
+  const context = await browser.newContext({
+    viewport: { width: 1440, height: 900 },
+    userAgent:
+      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+  });
 
   try {
+    // Create a new page in the context
     const page = await context.newPage();
+
+    // Apply ad blocking
     if (globalBlocker) {
       await globalBlocker.enableBlockingInPage(page);
     }
-    await page.setUserAgent(
-      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-    );
 
+    // Navigate to the target URL
+    logger.info(`[Crawler][${jobId}] Navigating to "${url}"`);
     const response = await page.goto(url, {
       timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
+      waitUntil: "domcontentloaded",
     });
+
     logger.info(
       `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
     );
 
-    // Wait until there's at most two connections for 2 seconds
-    // Attempt to wait only for 5 seconds
+    // Wait until the network is relatively idle, or time out after 5 seconds
     await Promise.race([
-      page.waitForNetworkIdle({
-        idleTime: 1000, // 1 sec
-        concurrency: 2,
-      }),
-      new Promise((f) => setTimeout(f, 5000)),
+      page.waitForLoadState("networkidle", { timeout: 5000 }).catch(() => ({})),
+      new Promise((resolve) => setTimeout(resolve, 5000)),
     ]);
 
     logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`);
 
+    // Extract content from the page
     const htmlContent = await page.content();
     logger.info(`[Crawler][${jobId}] Successfully fetched the page content.`);
 
+    // Take a screenshot if configured
     let screenshot: Buffer | undefined = undefined;
     if (serverConfig.crawler.storeScreenshot) {
       try {
@@ -327,7 +332,6 @@ async function crawlPage(
           page.screenshot({
             // If you change this, you need to change the asset type in the store function.
             type: "png",
-            encoding: "binary",
             fullPage: serverConfig.crawler.fullPageScreenshot,
           }),
           new Promise((_, reject) =>
@@ -358,6 +362,7 @@ async function crawlPage(
     };
   } finally {
     await context.close();
+    // Only close the browser if it was created on demand
     if (serverConfig.crawler.browserConnectOnDemand) {
       await browser.close();
     }
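
A note for anyone porting similar code: the two remote-connection branches above are not interchangeable. Puppeteer's single puppeteer.connect({ browserWSEndpoint | browserURL }) splits into two distinct Playwright entry points, and picking the wrong one fails at the protocol handshake. Below is a minimal sketch of the distinction, under stated assumptions: wsUrl and cdpUrl are hypothetical stand-ins for serverConfig.crawler.browserWebSocketUrl and serverConfig.crawler.browserWebUrl, and the slowMo/timeout values mirror the ones the patch uses.

    import { chromium, type Browser } from "playwright";

    // wsUrl and cdpUrl are hypothetical stand-ins for the two
    // serverConfig.crawler.* endpoints used by the worker.
    async function connectToRemoteBrowser(
      wsUrl?: string,
      cdpUrl?: string,
    ): Promise<Browser> {
      if (wsUrl) {
        // chromium.connect() speaks Playwright's own wire protocol; the
        // remote end must be a Playwright server (e.g. `npx playwright run-server`).
        return chromium.connect(wsUrl, { slowMo: 100, timeout: 5000 });
      }
      if (cdpUrl) {
        // chromium.connectOverCDP() speaks the raw Chrome DevTools Protocol,
        // i.e. what a plain `chrome --remote-debugging-port=9222` exposes.
        // This is the equivalent of Puppeteer's `browserURL` option.
        return chromium.connectOverCDP(cdpUrl, { slowMo: 100, timeout: 5000 });
      }
      throw new Error("No remote browser endpoint configured");
    }

The patch also keeps the manual DNS lookup before connectOverCDP even though the linked Puppeteer issue no longer applies; that still appears necessary in practice, since Chrome's debug endpoint rejects Host headers that are neither an IP address nor localhost.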
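The page-level migration follows one pattern: settings Puppeteer applied per page (setUserAgent, defaultViewport) become options on the isolated browser context, and waitForNetworkIdle becomes a load state. Below is a minimal, self-contained sketch of the crawl flow the patch converges on, not the worker's actual code: the serverConfig wiring, adblocker, stealth plugin, and error handling are omitted, and the URL and user agent string are placeholders.

    import { chromium, type Browser } from "playwright";

    async function crawlOnce(browser: Browser, url: string) {
      // Per-page Puppeteer settings become per-context options here.
      const context = await browser.newContext({
        viewport: { width: 1440, height: 900 },
        userAgent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", // placeholder
      });
      try {
        const page = await context.newPage();
        await page.goto(url, { timeout: 30000, waitUntil: "domcontentloaded" });

        // Puppeteer's waitForNetworkIdle() maps to waitForLoadState("networkidle").
        // Swallow the timeout so a page that never goes quiet doesn't fail the crawl.
        await page
          .waitForLoadState("networkidle", { timeout: 5000 })
          .catch(() => undefined);

        const htmlContent = await page.content();

        // page.screenshot() returns a Buffer directly; Puppeteer's
        // `encoding: "binary"` option has no counterpart and is simply dropped.
        const screenshot = await page.screenshot({ type: "png", fullPage: false });

        return { htmlContent, screenshot };
      } finally {
        // Closing the context discards its pages, cookies, and cache without
        // touching the shared browser instance.
        await context.close();
      }
    }

    async function main() {
      const browser = await chromium.launch({ headless: true });
      try {
        const { htmlContent } = await crawlOnce(browser, "https://example.com");
        console.log(`fetched ${htmlContent.length} characters`);
      } finally {
        await browser.close();
      }
    }

    void main();

One design note: with both racers set to five seconds, the Promise.race in the patch is effectively redundant with waitForLoadState's own timeout; the .catch is what keeps either path non-fatal.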