diff options
| author | MohamedBassem <me@mbassem.com> | 2024-02-09 22:17:31 +0000 |
|---|---|---|
| committer | MohamedBassem <me@mbassem.com> | 2024-02-09 22:17:31 +0000 |
| commit | 70f15d04b28f396b3b4e3ab4710b9bf568a9ab64 (patch) | |
| tree | f28f1b25c177e1214ca7d97128627f52f5960b4c /packages/workers | |
| parent | 347aa0c1bce7b53ab2f19b4b4904e4382e9ca6f7 (diff) | |
| download | karakeep-70f15d04b28f396b3b4e3ab4710b9bf568a9ab64.tar.zst | |
[feature] Use puppeteer for fetching websites
Diffstat (limited to 'packages/workers')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | packages/workers/crawler.ts | 36 |
| -rw-r--r-- | packages/workers/package.json | 7 |
2 files changed, 38 insertions, 5 deletions
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts index d5479025..1cb82f31 100644 --- a/packages/workers/crawler.ts +++ b/packages/workers/crawler.ts @@ -8,6 +8,10 @@ import { Job } from "bullmq"; import prisma from "@remember/db"; +import { Browser } from "puppeteer"; +import puppeteer from "puppeteer-extra"; +import StealthPlugin from "puppeteer-extra-plugin-stealth"; + import metascraper from "metascraper"; import metascraperDescription from "metascraper-description"; @@ -15,15 +19,40 @@ import metascraperImage from "metascraper-image"; import metascraperLogo from "metascraper-logo-favicon"; import metascraperTitle from "metascraper-title"; import metascraperUrl from "metascraper-url"; +import metascraperTwitter from "metascraper-twitter"; +import metascraperReadability from "metascraper-readability"; const metascraperParser = metascraper([ + metascraperReadability(), + metascraperTitle(), metascraperDescription(), + metascraperTwitter(), metascraperImage(), metascraperLogo(), - metascraperTitle(), metascraperUrl(), ]); +let browser: Browser; +(async () => { + puppeteer.use(StealthPlugin()); + // TODO: Configure the browser mode via an env variable + browser = await puppeteer.launch({ headless: true }); +})(); + +async function crawlPage(url: string) { + const context = await browser.createBrowserContext(); + const page = await context.newPage(); + + await page.goto(url, { + timeout: 10000, // 10 seconds + waitUntil: "networkidle2", + }); + + const htmlContent = await page.content(); + await context.close(); + return htmlContent; +} + export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) { const jobId = job.id || "unknown"; @@ -42,12 +71,11 @@ export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) { ); // TODO(IMPORTANT): Run security validations on the input URL (e.g. 
deny localhost, etc) - const resp = await fetch(url); - const respBody = await resp.text(); + const htmlContent = await crawlPage(url); const meta = await metascraperParser({ url, - html: respBody, + html: htmlContent, }); await prisma.bookmarkedLink.update({ diff --git a/packages/workers/package.json b/packages/workers/package.json index c9ec1a10..65648f4e 100644 --- a/packages/workers/package.json +++ b/packages/workers/package.json @@ -11,9 +11,14 @@ "metascraper-image": "^5.43.4", "metascraper-logo": "^5.43.4", "metascraper-logo-favicon": "^5.43.4", + "metascraper-readability": "^5.43.4", "metascraper-title": "^5.43.4", + "metascraper-twitter": "^5.43.4", "metascraper-url": "^5.43.4", - "openai": "^4.26.1" + "openai": "^4.26.1", + "puppeteer": "^22.0.0", + "puppeteer-extra": "^3.3.6", + "puppeteer-extra-plugin-stealth": "^2.11.2" }, "devDependencies": { "@types/metascraper": "^5.14.3", |
