diff options
| author | MohamedBassem <me@mbassem.com> | 2024-03-02 00:52:18 +0000 |
|---|---|---|
| committer | MohamedBassem <me@mbassem.com> | 2024-03-02 00:52:18 +0000 |
| commit | 8ab2747e23256106b115aa3823ad25e2c2d466d4 (patch) | |
| tree | 73a98b0c62695d4657a463808264acd043959db8 /packages/workers/crawler.ts | |
| parent | 3f5f1850b17eb0f5c4cd0970c22421f85d5a2bd6 (diff) | |
| download | karakeep-8ab2747e23256106b115aa3823ad25e2c2d466d4.tar.zst | |
feature: Store full link content and index them
Diffstat (limited to 'packages/workers/crawler.ts')
| -rw-r--r-- | packages/workers/crawler.ts | 13 |
1 file changed, 12 insertions, 1 deletion
```diff
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 7be014a7..f1ee07f3 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -7,6 +7,8 @@ import {
   queueConnectionDetails,
   zCrawlLinkRequestSchema,
 } from "@hoarder/shared/queues";
+import DOMPurify from "dompurify";
+import { JSDOM } from "jsdom";
 
 import { Worker } from "bullmq";
 import { Job } from "bullmq";
@@ -31,7 +33,7 @@ import assert from "assert";
 import serverConfig from "@hoarder/shared/config";
 import { bookmarkLinks } from "@hoarder/db/schema";
 import { eq } from "drizzle-orm";
-import { SearchIndexingWorker } from "./search";
+import { Readability } from "@mozilla/readability";
 
 const metascraperParser = metascraper([
   metascraperReadability(),
@@ -159,6 +161,14 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
     html: htmlContent,
   });
 
+  const window = new JSDOM("").window;
+  const purify = DOMPurify(window);
+  const purifiedHTML = purify.sanitize(htmlContent);
+  const purifiedDOM = new JSDOM(purifiedHTML, { url });
+  const readableContent = new Readability(purifiedDOM.window.document).parse();
+
+  // TODO(important): Restrict the size of content to store
+
   await db
     .update(bookmarkLinks)
     .set({
@@ -166,6 +176,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
       description: meta.description,
       imageUrl: meta.image,
       favicon: meta.logo,
+      content: readableContent?.textContent,
       crawledAt: new Date(),
     })
     .where(eq(bookmarkLinks.id, bookmarkId));
```
