aboutsummaryrefslogtreecommitdiffstats
path: root/packages/workers
diff options
context:
space:
mode:
author MohamedBassem <me@mbassem.com> 2024-02-09 22:17:31 +0000
committer MohamedBassem <me@mbassem.com> 2024-02-09 22:17:31 +0000
commit 70f15d04b28f396b3b4e3ab4710b9bf568a9ab64 (patch)
tree f28f1b25c177e1214ca7d97128627f52f5960b4c /packages/workers
parent 347aa0c1bce7b53ab2f19b4b4904e4382e9ca6f7 (diff)
download karakeep-70f15d04b28f396b3b4e3ab4710b9bf568a9ab64.tar.zst
[feature] Use puppeteer for fetching websites
Diffstat (limited to 'packages/workers')
-rw-r--r-- packages/workers/crawler.ts 36
-rw-r--r-- packages/workers/package.json 7
2 files changed, 38 insertions, 5 deletions
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index d5479025..1cb82f31 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -8,6 +8,10 @@ import { Job } from "bullmq";
import prisma from "@remember/db";
+import { Browser } from "puppeteer";
+import puppeteer from "puppeteer-extra";
+import StealthPlugin from "puppeteer-extra-plugin-stealth";
+
import metascraper from "metascraper";
import metascraperDescription from "metascraper-description";
@@ -15,15 +19,40 @@ import metascraperImage from "metascraper-image";
import metascraperLogo from "metascraper-logo-favicon";
import metascraperTitle from "metascraper-title";
import metascraperUrl from "metascraper-url";
+import metascraperTwitter from "metascraper-twitter";
+import metascraperReadability from "metascraper-readability";
const metascraperParser = metascraper([
+ metascraperReadability(),
+ metascraperTitle(),
metascraperDescription(),
+ metascraperTwitter(),
metascraperImage(),
metascraperLogo(),
- metascraperTitle(),
metascraperUrl(),
]);
+// Shared browser instance; assigned once the launch started below resolves.
+let browser: Browser;
+
+/** Registers the stealth plugin, then launches the headless browser into `browser`. */
+async function launchBrowser(): Promise<void> {
+  puppeteer.use(StealthPlugin());
+  // TODO: Configure the browser mode via an env variable
+  browser = await puppeteer.launch({ headless: true });
+}
+// Started at module load and intentionally not awaited.
+void launchBrowser();
+
+/**
+ * Loads `url` in a fresh browser context and returns the page's HTML.
+ *
+ * Navigation waits for the "networkidle2" condition and is capped at 10 seconds.
+ * The context is closed in a `finally` block, so a failed or timed-out crawl
+ * does not leave its context open in the shared browser.
+ *
+ * NOTE(review): `browser` is assigned asynchronously after module load; a call
+ * made before the launch has finished will fail. Confirm jobs cannot start
+ * that early.
+ *
+ * @param url - Address of the page to load.
+ * @returns The markup produced by `page.content()` after navigation.
+ */
+async function crawlPage(url: string): Promise<string> {
+  const context = await browser.createBrowserContext();
+  try {
+    const page = await context.newPage();
+
+    await page.goto(url, {
+      timeout: 10000, // 10 seconds
+      waitUntil: "networkidle2",
+    });
+
+    return await page.content();
+  } finally {
+    // Release the context on success and on failure alike.
+    await context.close();
+  }
+}
+
export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
const jobId = job.id || "unknown";
@@ -42,12 +71,11 @@ export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
);
// TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)
- const resp = await fetch(url);
- const respBody = await resp.text();
+ const htmlContent = await crawlPage(url);
const meta = await metascraperParser({
url,
- html: respBody,
+ html: htmlContent,
});
await prisma.bookmarkedLink.update({
diff --git a/packages/workers/package.json b/packages/workers/package.json
index c9ec1a10..65648f4e 100644
--- a/packages/workers/package.json
+++ b/packages/workers/package.json
@@ -11,9 +11,14 @@
"metascraper-image": "^5.43.4",
"metascraper-logo": "^5.43.4",
"metascraper-logo-favicon": "^5.43.4",
+ "metascraper-readability": "^5.43.4",
"metascraper-title": "^5.43.4",
+ "metascraper-twitter": "^5.43.4",
"metascraper-url": "^5.43.4",
- "openai": "^4.26.1"
+ "openai": "^4.26.1",
+ "puppeteer": "^22.0.0",
+ "puppeteer-extra": "^3.3.6",
+ "puppeteer-extra-plugin-stealth": "^2.11.2"
},
"devDependencies": {
"@types/metascraper": "^5.14.3",