aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--packages/workers/crawler.ts6
-rw-r--r--packages/workers/package.json1
-rw-r--r--pnpm-lock.yaml113
3 files changed, 120 insertions, 0 deletions
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index f1ee07f3..fbbee730 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -18,6 +18,7 @@ import { db } from "@hoarder/db";
import { Browser } from "puppeteer";
import puppeteer from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
+import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
import metascraper from "metascraper";
@@ -70,6 +71,11 @@ async function launchBrowser() {
export class CrawlerWorker {
static async build() {
puppeteer.use(StealthPlugin());
+ puppeteer.use(
+ AdblockerPlugin({
+ blockTrackersAndAnnoyances: true,
+ }),
+ );
await launchBrowser();
logger.info("Starting crawler worker ...");
diff --git a/packages/workers/package.json b/packages/workers/package.json
index a7b62462..f2fc164c 100644
--- a/packages/workers/package.json
+++ b/packages/workers/package.json
@@ -26,6 +26,7 @@
"openai": "^4.26.1",
"puppeteer": "^22.0.0",
"puppeteer-extra": "^3.3.6",
+ "puppeteer-extra-plugin-adblocker": "^2.13.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"tsx": "^4.7.1",
"typescript": "^5",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index ea3891da..4f7a22a6 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -420,6 +420,9 @@ importers:
puppeteer-extra:
specifier: ^3.3.6
version: 3.3.6(puppeteer@22.3.0)
+ puppeteer-extra-plugin-adblocker:
+ specifier: ^2.13.6
+ version: 2.13.6(puppeteer-extra@3.3.6)(puppeteer@22.3.0)
puppeteer-extra-plugin-stealth:
specifier: ^2.11.2
version: 2.11.2(puppeteer-extra@3.3.6)
@@ -1721,6 +1724,40 @@ packages:
'@babel/helper-validator-identifier': 7.22.20
to-fast-properties: 2.0.0
+ /@cliqz/adblocker-content@1.26.16:
+ resolution: {integrity: sha512-N1pKg1gxfpnz47w2Sjs2sg3fxFZb113ClUhitgAFSVXeIhZ+S+bCaQtvwtP0mJT+SDfUx2NsPiLwZoPjVRI3wQ==}
+ dependencies:
+ '@cliqz/adblocker-extended-selectors': 1.26.16
+ dev: false
+
+ /@cliqz/adblocker-extended-selectors@1.26.16:
+ resolution: {integrity: sha512-ePXS3aD1R+0XfCnOj0L2ms0NA5AxKHfFLfw92cZ87IPY8ZEZK/sWwQCv5wawbwBmXksr0YkMfFVCiH/IQgUNBQ==}
+ dev: false
+
+ /@cliqz/adblocker-puppeteer@1.23.8(puppeteer@22.3.0):
+ resolution: {integrity: sha512-Ca1/DBqQXsOpKTFVAHX6OpLTSEupXmUkUWHj6iXhLLleC7RPISN5B0b801VDmaGRqoC5zKRxn0vYbIfpgCWVug==}
+ peerDependencies:
+ puppeteer: '>5'
+ dependencies:
+ '@cliqz/adblocker': 1.26.16
+ '@cliqz/adblocker-content': 1.26.16
+ puppeteer: 22.3.0(typescript@5.3.3)
+ tldts-experimental: 5.7.112
+ dev: false
+
+ /@cliqz/adblocker@1.26.16:
+ resolution: {integrity: sha512-NQ5WdNeiWiggDhhT/IXbsjKgH44nA9k5GlW00gUWRUpfKHCCInyDJYjM5pbHqxhgC3LkMVmXmU5vIsMUZ4RxFQ==}
+ dependencies:
+ '@cliqz/adblocker-content': 1.26.16
+ '@cliqz/adblocker-extended-selectors': 1.26.16
+ '@remusao/guess-url-type': 1.2.1
+ '@remusao/small': 1.2.1
+ '@remusao/smaz': 1.9.1
+ '@types/chrome': 0.0.260
+ '@types/firefox-webext-browser': 120.0.1
+ tldts-experimental: 6.1.11
+ dev: false
+
/@colors/colors@1.6.0:
resolution: {integrity: sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA==}
engines: {node: '>=0.1.90'}
@@ -3375,6 +3412,35 @@ packages:
engines: {node: '>=14.0.0'}
dev: false
+ /@remusao/guess-url-type@1.2.1:
+ resolution: {integrity: sha512-rbOqre2jW8STjheOsOaQHLgYBaBZ9Owbdt8NO7WvNZftJlaG3y/K9oOkl8ZUpuFBisIhmBuMEW6c+YrQl5inRA==}
+ dev: false
+
+ /@remusao/small@1.2.1:
+ resolution: {integrity: sha512-7MjoGt0TJMVw1GPKgWq6SJPws1SLsUXQRa43Umht+nkyw2jnpy3WpiLNqGdwo5rHr5Wp9B2W/Pm5RQp656UJdw==}
+ dev: false
+
+ /@remusao/smaz-compress@1.9.1:
+ resolution: {integrity: sha512-E2f48TwloQu3r6BdLOGF2aczeH7bJ/32oJGqvzT9SKur0cuUnLcZ7ZXP874E2fwmdE+cXzfC7bKzp79cDnmeyw==}
+ dependencies:
+ '@remusao/trie': 1.4.1
+ dev: false
+
+ /@remusao/smaz-decompress@1.9.1:
+ resolution: {integrity: sha512-TfjKKprYe3n47od8auhvJ/Ikj9kQTbDTe71ynKlxslrvvUhlIV3VQSuwYuMWMbdz1fIs0H/fxCN1Z8/H3km6/A==}
+ dev: false
+
+ /@remusao/smaz@1.9.1:
+ resolution: {integrity: sha512-e6BLuP8oaXCZ9+v46Is4ilAZ/Vq6YLgmBP204Ixgk1qTjXmqvFYG7+AS7v9nsZdGOy96r9DWGFbbDVgMxwu1rA==}
+ dependencies:
+ '@remusao/smaz-compress': 1.9.1
+ '@remusao/smaz-decompress': 1.9.1
+ dev: false
+
+ /@remusao/trie@1.4.1:
+ resolution: {integrity: sha512-yvwa+aCyYI/UjeD39BnpMypG8N06l86wIDW1/PAc6ihBRnodIfZDwccxQN3n1t74wduzaz74m4ZMHZnB06567Q==}
+ dev: false
+
/@rollup/plugin-babel@5.3.1(@babel/core@7.23.9)(rollup@2.79.1):
resolution: {integrity: sha512-WFfdLWU/xVWKeRQnKmIAQULUI7Il0gZnBIH/ZFO069wYIfPu+8zrfp/KMW0atmELoRDq8FbiP3VCss9MhCut7Q==}
engines: {node: '>= 10.0.0'}
@@ -3946,6 +4012,10 @@ packages:
resolution: {integrity: sha512-xFU8ZXTw4gd358lb2jw25nxY9QAgqn2+bKKjKOYfNCzN4DKCFetK7sPtrlpg66Ywe3vWY9FNxprZawAh9wfJ3g==}
dev: false
+ /@types/firefox-webext-browser@120.0.1:
+ resolution: {integrity: sha512-IR+NpPC+/o9TSTelcvT/w3fXTanX3LrpVxC5EQrlQyTjyWOKFz8O2mCJQ9VuejBz4NtovCGGKacXQ/VyY63L0A==}
+ dev: false
+
/@types/glob@7.2.0:
resolution: {integrity: sha512-ZUxbzKl0IfJILTS6t7ip5fQQM/J3TJYubDm3nMbgubNNYS62eXeUpoLUC8/7fJNiFYHTrGPQn7hspDUzIHX3UA==}
dependencies:
@@ -9651,6 +9721,33 @@ packages:
- utf-8-validate
dev: false
+ /puppeteer-extra-plugin-adblocker@2.13.6(puppeteer-extra@3.3.6)(puppeteer@22.3.0):
+ resolution: {integrity: sha512-AftgnUZ1rg2RPe9RpX6rkYAxEohwp3iFeGIyjsAuTaIiw4VLZqOb1LSY8/S60vAxpeat60fbCajxoUetmLy4Dw==}
+ engines: {node: '>=8'}
+ peerDependencies:
+ puppeteer: '*'
+ puppeteer-core: '*'
+ puppeteer-extra: '*'
+ peerDependenciesMeta:
+ puppeteer:
+ optional: true
+ puppeteer-core:
+ optional: true
+ puppeteer-extra:
+ optional: true
+ dependencies:
+ '@cliqz/adblocker-puppeteer': 1.23.8(puppeteer@22.3.0)
+ debug: 4.3.4
+ node-fetch: 2.7.0
+ puppeteer: 22.3.0(typescript@5.3.3)
+ puppeteer-extra: 3.3.6(puppeteer@22.3.0)
+ puppeteer-extra-plugin: 3.2.3(puppeteer-extra@3.3.6)
+ transitivePeerDependencies:
+ - encoding
+ - playwright-extra
+ - supports-color
+ dev: false
+
/puppeteer-extra-plugin-stealth@2.11.2(puppeteer-extra@3.3.6):
resolution: {integrity: sha512-bUemM5XmTj9i2ZerBzsk2AN5is0wHMNE6K0hXBzBXOzP5m5G3Wl0RHhiqKeHToe/uIH8AoZiGhc1tCkLZQPKTQ==}
engines: {node: '>=8'}
@@ -10883,10 +10980,26 @@ packages:
hasBin: true
dev: false
+ /tldts-core@5.7.112:
+ resolution: {integrity: sha512-mutrEUgG2sp0e/MIAnv9TbSLR0IPbvmAImpzqul5O/HJ2XM1/I1sajchQ/fbj0fPdA31IiuWde8EUhfwyldY1Q==}
+ dev: false
+
/tldts-core@6.1.11:
resolution: {integrity: sha512-ZFcT+/fdEc5VRndQIJtArNBHsaq4udRoeE4E6cwLzGaH0dq7Ng2L7cAoea6riM2uhNFD09EDa1bN8lrfrOBCLg==}
dev: false
+ /tldts-experimental@5.7.112:
+ resolution: {integrity: sha512-Nq5qWN4OiLziAOOOEoSME7cZI4Hz8Srt+9q6cl8mZ5EAhCfmeE6l7K5XjuIKN+pySuGUvthE5aPiD185YU1/lg==}
+ dependencies:
+ tldts-core: 5.7.112
+ dev: false
+
+ /tldts-experimental@6.1.11:
+ resolution: {integrity: sha512-4Ij/BzPUYS33PcAo9cprPm8qmKNBeYw2U7WsBAMtseqbQvCIyDsnXlOWy/SKmldalPdMPsL2CLjt27+KlWBH7g==}
+ dependencies:
+ tldts-core: 6.1.11
+ dev: false
+
/tldts@6.1.11:
resolution: {integrity: sha512-AAgE/IWvbsg4Lr4KGFNR7bL/MhQfBlgGV9UBg2uy5mCwSGi5f12eZ7ZydAqv4ACys6pUYjNoV2qfZdcCn4RS+Q==}
hasBin: true