about | summary | refs | log | tree | commit | diff | stats
path: root/apps/workers
diff options
context:
space:
mode:
author: Mohamed Bassem <me@mbassem.com> 2024-11-21 23:39:12 +0000
committer: Mohamed Bassem <me@mbassem.com> 2024-11-21 23:39:37 +0000
commit: 378ad9bc157fb7741e09cdb687a97c82c2851578 (patch)
tree: 0aad23fe71a78fbf02c271598c78ae70828f2ce3 /apps/workers
parent: 393d097c965c9bc223e9660b689df6a0312e9222 (diff)
download: karakeep-378ad9bc157fb7741e09cdb687a97c82c2851578.tar.zst
fix(workers): Don't block connection to chrome when failing to download adblock list. #674
Diffstat (limited to 'apps/workers')
-rw-r--r-- apps/workers/crawlerWorker.ts 28
-rw-r--r-- apps/workers/package.json 3
2 files changed, 24 insertions, 7 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 70fda993..2dad98b7 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,7 +1,10 @@
import assert from "assert";
import * as dns from "dns";
+import { promises as fs } from "fs";
import * as path from "node:path";
+import * as os from "os";
import type { Browser } from "puppeteer";
+import { PuppeteerBlocker } from "@ghostery/adblocker-puppeteer";
import { Readability } from "@mozilla/readability";
import { Mutex } from "async-mutex";
import DOMPurify from "dompurify";
@@ -19,8 +22,8 @@ import metascraperReadability from "metascraper-readability";
import metascraperTitle from "metascraper-title";
import metascraperTwitter from "metascraper-twitter";
import metascraperUrl from "metascraper-url";
+import fetch from "node-fetch";
import puppeteer from "puppeteer-extra";
-import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { withTimeout } from "utils";
import { getBookmarkDetails, updateAsset } from "workerUtils";
@@ -67,6 +70,7 @@ const metascraperParser = metascraper([
]);
let globalBrowser: Browser | undefined;
+let globalBlocker: PuppeteerBlocker | undefined;
// Guards the interactions with the browser instance.
// This is needed given that most of the browser APIs are async.
const browserMutex = new Mutex();
@@ -144,11 +148,20 @@ async function launchBrowser() {
export class CrawlerWorker {
static async build() {
puppeteer.use(StealthPlugin());
- puppeteer.use(
- AdblockerPlugin({
- blockTrackersAndAnnoyances: true,
- }),
- );
+ if (serverConfig.crawler.enableAdblocker) {
+ try {
+ logger.info("[crawler] Loading adblocker ...");
+ globalBlocker = await PuppeteerBlocker.fromPrebuiltFull(fetch, {
+ path: path.join(os.tmpdir(), "hoarder_adblocker.bin"),
+ read: fs.readFile,
+ write: fs.writeFile,
+ });
+ } catch (e) {
+ logger.error(
+ `[crawler] Failed to load adblocker. Will not be blocking ads: ${e}`,
+ );
+ }
+ }
if (!serverConfig.crawler.browserConnectOnDemand) {
await launchBrowser();
} else {
@@ -238,6 +251,9 @@ async function crawlPage(jobId: string, url: string) {
try {
const page = await context.newPage();
+ if (globalBlocker) {
+ await globalBlocker.enableBlockingInPage(page);
+ }
await page.setUserAgent(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
);
diff --git a/apps/workers/package.json b/apps/workers/package.json
index e05b2350..1ab2a934 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -4,6 +4,7 @@
"version": "0.1.0",
"private": true,
"dependencies": {
+ "@ghostery/adblocker-puppeteer": "^2.1.1",
"@hoarder/db": "workspace:^0.1.0",
"@hoarder/shared": "workspace:^0.1.0",
"@hoarder/trpc": "workspace:^0.1.0",
@@ -28,11 +29,11 @@
"metascraper-twitter": "^5.45.6",
"metascraper-url": "^5.45.22",
"node-cron": "^3.0.3",
+ "node-fetch": "^3.3.2",
"pdf2json": "^3.0.5",
"pdfjs-dist": "^4.0.379",
"puppeteer": "^22.0.0",
"puppeteer-extra": "^3.3.6",
- "puppeteer-extra-plugin-adblocker": "^2.13.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"rss-parser": "^3.13.0",
"tesseract.js": "^5.1.1",