about | summary | refs | log | tree | commit | diff | stats
path: root/packages/workers/crawler.ts
diff options
context:
space:
mode:
Diffstat (limited to 'packages/workers/crawler.ts')
-rw-r--r-- packages/workers/crawler.ts 13
1 files changed, 12 insertions, 1 deletions
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 7be014a7..f1ee07f3 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -7,6 +7,8 @@ import {
queueConnectionDetails,
zCrawlLinkRequestSchema,
} from "@hoarder/shared/queues";
+import DOMPurify from "dompurify";
+import { JSDOM } from "jsdom";
import { Worker } from "bullmq";
import { Job } from "bullmq";
@@ -31,7 +33,7 @@ import assert from "assert";
import serverConfig from "@hoarder/shared/config";
import { bookmarkLinks } from "@hoarder/db/schema";
import { eq } from "drizzle-orm";
-import { SearchIndexingWorker } from "./search";
+import { Readability } from "@mozilla/readability";
const metascraperParser = metascraper([
metascraperReadability(),
@@ -159,6 +161,14 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
html: htmlContent,
});
+ const window = new JSDOM("").window;
+ const purify = DOMPurify(window);
+ const purifiedHTML = purify.sanitize(htmlContent);
+ const purifiedDOM = new JSDOM(purifiedHTML, { url });
+ const readableContent = new Readability(purifiedDOM.window.document).parse();
+
+ // TODO(important): Restrict the size of content to store
+
await db
.update(bookmarkLinks)
.set({
@@ -166,6 +176,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
description: meta.description,
imageUrl: meta.image,
favicon: meta.logo,
+ content: readableContent?.textContent,
crawledAt: new Date(),
})
.where(eq(bookmarkLinks.id, bookmarkId));