author    MohamedBassem <me@mbassem.com>    2024-02-09 22:17:31 +0000
committer MohamedBassem <me@mbassem.com>    2024-02-09 22:17:31 +0000
commit    70f15d04b28f396b3b4e3ab4710b9bf568a9ab64 (patch)
tree      f28f1b25c177e1214ca7d97128627f52f5960b4c /packages/workers/crawler.ts
parent    347aa0c1bce7b53ab2f19b4b4904e4382e9ca6f7 (diff)
download  karakeep-70f15d04b28f396b3b4e3ab4710b9bf568a9ab64.tar.zst
[feature] Use puppeteer for fetching websites
Diffstat (limited to 'packages/workers/crawler.ts')
-rw-r--r--  packages/workers/crawler.ts  36
1 file changed, 32 insertions(+), 4 deletions(-)
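
For context on how the new Puppeteer-based fetch path fits together, and how the in-diff TODO about configuring the browser mode via an env variable might be addressed, here is a minimal sketch. The CRAWLER_HEADLESS variable and the launchBrowser/fetchHtml helpers are assumptions for illustration only; they are not part of this commit.

import type { Browser } from "puppeteer";
import puppeteer from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";

// Hypothetical helper: launch a single shared browser, with headless mode
// driven by an env variable (the TODO in the diff below). The CRAWLER_HEADLESS
// name is an assumption, not something this commit defines.
async function launchBrowser(): Promise<Browser> {
  puppeteer.use(StealthPlugin());
  return puppeteer.launch({
    headless: process.env.CRAWLER_HEADLESS !== "false",
  });
}

// Usage sketch: await the browser before fetching, rather than relying on a
// fire-and-forget async IIFE as the commit does.
async function fetchHtml(url: string): Promise<string> {
  const browser = await launchBrowser();
  const page = await browser.newPage();
  await page.goto(url, { timeout: 10000, waitUntil: "networkidle2" });
  const html = await page.content();
  await browser.close();
  return html;
}
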
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index d5479025..1cb82f31 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -8,6 +8,10 @@ import { Job } from "bullmq";
import prisma from "@remember/db";
+import { Browser } from "puppeteer";
+import puppeteer from "puppeteer-extra";
+import StealthPlugin from "puppeteer-extra-plugin-stealth";
+
import metascraper from "metascraper";
import metascraperDescription from "metascraper-description";
@@ -15,15 +19,40 @@ import metascraperImage from "metascraper-image";
import metascraperLogo from "metascraper-logo-favicon";
import metascraperTitle from "metascraper-title";
import metascraperUrl from "metascraper-url";
+import metascraperTwitter from "metascraper-twitter";
+import metascraperReadability from "metascraper-readability";
const metascraperParser = metascraper([
+ metascraperReadability(),
+ metascraperTitle(),
metascraperDescription(),
+ metascraperTwitter(),
metascraperImage(),
metascraperLogo(),
- metascraperTitle(),
metascraperUrl(),
]);
+let browser: Browser;
+(async () => {
+ puppeteer.use(StealthPlugin());
+ // TODO: Configure the browser mode via an env variable
+ browser = await puppeteer.launch({ headless: true });
+})();
+
+async function crawlPage(url: string) {
+ const context = await browser.createBrowserContext();
+ const page = await context.newPage();
+
+ await page.goto(url, {
+ timeout: 10000, // 10 seconds
+ waitUntil: "networkidle2",
+ });
+
+ const htmlContent = await page.content();
+ await context.close();
+ return htmlContent;
+}
+
export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
const jobId = job.id || "unknown";
@@ -42,12 +71,11 @@ export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
);
// TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)
- const resp = await fetch(url);
- const respBody = await resp.text();
+ const htmlContent = await crawlPage(url);
const meta = await metascraperParser({
url,
- html: respBody,
+ html: htmlContent,
});
await prisma.bookmarkedLink.update({