about | summary | refs | log | tree | commit | diff | stats
path: root/apps/workers/crawlerWorker.ts
diff options
context:
space:
mode:
author: Mohamed Bassem <me@mbassem.com> 2024-11-21 23:39:12 +0000
committer: Mohamed Bassem <me@mbassem.com> 2024-11-21 23:39:37 +0000
commit: 378ad9bc157fb7741e09cdb687a97c82c2851578 (patch)
tree: 0aad23fe71a78fbf02c271598c78ae70828f2ce3 /apps/workers/crawlerWorker.ts
parent: 393d097c965c9bc223e9660b689df6a0312e9222 (diff)
download: karakeep-378ad9bc157fb7741e09cdb687a97c82c2851578.tar.zst
fix(workers): Don't block connection to chrome when failing to download adblock list. #674
Diffstat (limited to 'apps/workers/crawlerWorker.ts')
-rw-r--r-- apps/workers/crawlerWorker.ts 28
1 files changed, 22 insertions, 6 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 70fda993..2dad98b7 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,7 +1,10 @@
import assert from "assert";
import * as dns from "dns";
+import { promises as fs } from "fs";
import * as path from "node:path";
+import * as os from "os";
import type { Browser } from "puppeteer";
+import { PuppeteerBlocker } from "@ghostery/adblocker-puppeteer";
import { Readability } from "@mozilla/readability";
import { Mutex } from "async-mutex";
import DOMPurify from "dompurify";
@@ -19,8 +22,8 @@ import metascraperReadability from "metascraper-readability";
import metascraperTitle from "metascraper-title";
import metascraperTwitter from "metascraper-twitter";
import metascraperUrl from "metascraper-url";
+import fetch from "node-fetch";
import puppeteer from "puppeteer-extra";
-import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { withTimeout } from "utils";
import { getBookmarkDetails, updateAsset } from "workerUtils";
@@ -67,6 +70,7 @@ const metascraperParser = metascraper([
]);
let globalBrowser: Browser | undefined;
+let globalBlocker: PuppeteerBlocker | undefined;
// Guards the interactions with the browser instance.
// This is needed given that most of the browser APIs are async.
const browserMutex = new Mutex();
@@ -144,11 +148,20 @@ async function launchBrowser() {
export class CrawlerWorker {
static async build() {
puppeteer.use(StealthPlugin());
- puppeteer.use(
- AdblockerPlugin({
- blockTrackersAndAnnoyances: true,
- }),
- );
+ if (serverConfig.crawler.enableAdblocker) {
+ try {
+ logger.info("[crawler] Loading adblocker ...");
+ globalBlocker = await PuppeteerBlocker.fromPrebuiltFull(fetch, {
+ path: path.join(os.tmpdir(), "hoarder_adblocker.bin"),
+ read: fs.readFile,
+ write: fs.writeFile,
+ });
+ } catch (e) {
+ logger.error(
+ `[crawler] Failed to load adblocker. Will not be blocking ads: ${e}`,
+ );
+ }
+ }
if (!serverConfig.crawler.browserConnectOnDemand) {
await launchBrowser();
} else {
@@ -238,6 +251,9 @@ async function crawlPage(jobId: string, url: string) {
try {
const page = await context.newPage();
+ if (globalBlocker) {
+ await globalBlocker.enableBlockingInPage(page);
+ }
await page.setUserAgent(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
);