From 378ad9bc157fb7741e09cdb687a97c82c2851578 Mon Sep 17 00:00:00 2001 From: Mohamed Bassem Date: Thu, 21 Nov 2024 23:39:12 +0000 Subject: fix(workers): Don't block connection to chrome when failing to download adblock list. #674 --- apps/workers/crawlerWorker.ts | 28 ++++++++++++++++++++++------ apps/workers/package.json | 3 ++- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'apps') diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts index 70fda993..2dad98b7 100644 --- a/apps/workers/crawlerWorker.ts +++ b/apps/workers/crawlerWorker.ts @@ -1,7 +1,10 @@ import assert from "assert"; import * as dns from "dns"; +import { promises as fs } from "fs"; import * as path from "node:path"; +import * as os from "os"; import type { Browser } from "puppeteer"; +import { PuppeteerBlocker } from "@ghostery/adblocker-puppeteer"; import { Readability } from "@mozilla/readability"; import { Mutex } from "async-mutex"; import DOMPurify from "dompurify"; @@ -19,8 +22,8 @@ import metascraperReadability from "metascraper-readability"; import metascraperTitle from "metascraper-title"; import metascraperTwitter from "metascraper-twitter"; import metascraperUrl from "metascraper-url"; +import fetch from "node-fetch"; import puppeteer from "puppeteer-extra"; -import AdblockerPlugin from "puppeteer-extra-plugin-adblocker"; import StealthPlugin from "puppeteer-extra-plugin-stealth"; import { withTimeout } from "utils"; import { getBookmarkDetails, updateAsset } from "workerUtils"; @@ -67,6 +70,7 @@ const metascraperParser = metascraper([ ]); let globalBrowser: Browser | undefined; +let globalBlocker: PuppeteerBlocker | undefined; // Guards the interactions with the browser instance. // This is needed given that most of the browser APIs are async. const browserMutex = new Mutex(); @@ -144,11 +148,20 @@ async function launchBrowser() { export class CrawlerWorker { static async build() { puppeteer.use(StealthPlugin()); - puppeteer.use( - AdblockerPlugin({ - blockTrackersAndAnnoyances: true, - }), - ); + if (serverConfig.crawler.enableAdblocker) { + try { + logger.info("[crawler] Loading adblocker ..."); + globalBlocker = await PuppeteerBlocker.fromPrebuiltFull(fetch, { + path: path.join(os.tmpdir(), "hoarder_adblocker.bin"), + read: fs.readFile, + write: fs.writeFile, + }); + } catch (e) { + logger.error( + `[crawler] Failed to load adblocker. Will not be blocking ads: ${e}`, + ); + } + } if (!serverConfig.crawler.browserConnectOnDemand) { await launchBrowser(); } else { @@ -238,6 +251,9 @@ async function crawlPage(jobId: string, url: string) { try { const page = await context.newPage(); + if (globalBlocker) { + await globalBlocker.enableBlockingInPage(page); + } await page.setUserAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", ); diff --git a/apps/workers/package.json b/apps/workers/package.json index e05b2350..1ab2a934 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -4,6 +4,7 @@ "version": "0.1.0", "private": true, "dependencies": { + "@ghostery/adblocker-puppeteer": "^2.1.1", "@hoarder/db": "workspace:^0.1.0", "@hoarder/shared": "workspace:^0.1.0", "@hoarder/trpc": "workspace:^0.1.0", @@ -28,11 +29,11 @@ "metascraper-twitter": "^5.45.6", "metascraper-url": "^5.45.22", "node-cron": "^3.0.3", + "node-fetch": "^3.3.2", "pdf2json": "^3.0.5", "pdfjs-dist": "^4.0.379", "puppeteer": "^22.0.0", "puppeteer-extra": "^3.3.6", - "puppeteer-extra-plugin-adblocker": "^2.13.6", "puppeteer-extra-plugin-stealth": "^2.11.2", "rss-parser": "^3.13.0", "tesseract.js": "^5.1.1", -- cgit v1.2.3-70-g09d2