diff options
| author | Mohamed Bassem <me@mbassem.com> | 2024-11-21 23:39:12 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2024-11-21 23:39:37 +0000 |
| commit | 378ad9bc157fb7741e09cdb687a97c82c2851578 (patch) | |
| tree | 0aad23fe71a78fbf02c271598c78ae70828f2ce3 /apps | |
| parent | 393d097c965c9bc223e9660b689df6a0312e9222 (diff) | |
| download | karakeep-378ad9bc157fb7741e09cdb687a97c82c2851578.tar.zst | |
fix(workers): Don't block connection to chrome when failing to download adblock list. #674
Diffstat (limited to 'apps')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 28 |
| -rw-r--r-- | apps/workers/package.json | 3 |
2 files changed, 24 insertions, 7 deletions
```diff
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 70fda993..2dad98b7 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,7 +1,10 @@
 import assert from "assert";
 import * as dns from "dns";
+import { promises as fs } from "fs";
 import * as path from "node:path";
+import * as os from "os";
 import type { Browser } from "puppeteer";
+import { PuppeteerBlocker } from "@ghostery/adblocker-puppeteer";
 import { Readability } from "@mozilla/readability";
 import { Mutex } from "async-mutex";
 import DOMPurify from "dompurify";
@@ -19,8 +22,8 @@ import metascraperReadability from "metascraper-readability";
 import metascraperTitle from "metascraper-title";
 import metascraperTwitter from "metascraper-twitter";
 import metascraperUrl from "metascraper-url";
+import fetch from "node-fetch";
 import puppeteer from "puppeteer-extra";
-import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { withTimeout } from "utils";
 import { getBookmarkDetails, updateAsset } from "workerUtils";
@@ -67,6 +70,7 @@ const metascraperParser = metascraper([
 ]);
 
 let globalBrowser: Browser | undefined;
+let globalBlocker: PuppeteerBlocker | undefined;
 // Guards the interactions with the browser instance.
 // This is needed given that most of the browser APIs are async.
 const browserMutex = new Mutex();
@@ -144,11 +148,20 @@ async function launchBrowser() {
 export class CrawlerWorker {
   static async build() {
     puppeteer.use(StealthPlugin());
-    puppeteer.use(
-      AdblockerPlugin({
-        blockTrackersAndAnnoyances: true,
-      }),
-    );
+    if (serverConfig.crawler.enableAdblocker) {
+      try {
+        logger.info("[crawler] Loading adblocker ...");
+        globalBlocker = await PuppeteerBlocker.fromPrebuiltFull(fetch, {
+          path: path.join(os.tmpdir(), "hoarder_adblocker.bin"),
+          read: fs.readFile,
+          write: fs.writeFile,
+        });
+      } catch (e) {
+        logger.error(
+          `[crawler] Failed to load adblocker. Will not be blocking ads: ${e}`,
+        );
+      }
+    }
     if (!serverConfig.crawler.browserConnectOnDemand) {
       await launchBrowser();
     } else {
@@ -238,6 +251,9 @@ async function crawlPage(jobId: string, url: string) {
 
   try {
     const page = await context.newPage();
+    if (globalBlocker) {
+      await globalBlocker.enableBlockingInPage(page);
+    }
     await page.setUserAgent(
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
     );
diff --git a/apps/workers/package.json b/apps/workers/package.json
index e05b2350..1ab2a934 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -4,6 +4,7 @@
   "version": "0.1.0",
   "private": true,
   "dependencies": {
+    "@ghostery/adblocker-puppeteer": "^2.1.1",
     "@hoarder/db": "workspace:^0.1.0",
     "@hoarder/shared": "workspace:^0.1.0",
     "@hoarder/trpc": "workspace:^0.1.0",
@@ -28,11 +29,11 @@
     "metascraper-twitter": "^5.45.6",
     "metascraper-url": "^5.45.22",
     "node-cron": "^3.0.3",
+    "node-fetch": "^3.3.2",
     "pdf2json": "^3.0.5",
     "pdfjs-dist": "^4.0.379",
     "puppeteer": "^22.0.0",
     "puppeteer-extra": "^3.3.6",
-    "puppeteer-extra-plugin-adblocker": "^2.13.6",
     "puppeteer-extra-plugin-stealth": "^2.11.2",
     "rss-parser": "^3.13.0",
     "tesseract.js": "^5.1.1",
```
