diff options
| author | MohamedBassem <me@mbassem.com> | 2024-03-02 00:52:18 +0000 |
|---|---|---|
| committer | MohamedBassem <me@mbassem.com> | 2024-03-02 00:52:18 +0000 |
| commit | 8ab2747e23256106b115aa3823ad25e2c2d466d4 (patch) | |
| tree | 73a98b0c62695d4657a463808264acd043959db8 /packages/workers | |
| parent | 3f5f1850b17eb0f5c4cd0970c22421f85d5a2bd6 (diff) | |
| download | karakeep-8ab2747e23256106b115aa3823ad25e2c2d466d4.tar.zst | |
feature: Store full link content and index them
Diffstat (limited to 'packages/workers')
| -rw-r--r-- | packages/workers/crawler.ts | 13 | ||||
| -rw-r--r-- | packages/workers/openai.ts | 10 | ||||
| -rw-r--r-- | packages/workers/package.json | 5 | ||||
| -rw-r--r-- | packages/workers/search.ts | 1 |
4 files changed, 28 insertions, 1 deletion
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts index 7be014a7..f1ee07f3 100644 --- a/packages/workers/crawler.ts +++ b/packages/workers/crawler.ts @@ -7,6 +7,8 @@ import { queueConnectionDetails, zCrawlLinkRequestSchema, } from "@hoarder/shared/queues"; +import DOMPurify from "dompurify"; +import { JSDOM } from "jsdom"; import { Worker } from "bullmq"; import { Job } from "bullmq"; @@ -31,7 +33,7 @@ import assert from "assert"; import serverConfig from "@hoarder/shared/config"; import { bookmarkLinks } from "@hoarder/db/schema"; import { eq } from "drizzle-orm"; -import { SearchIndexingWorker } from "./search"; +import { Readability } from "@mozilla/readability"; const metascraperParser = metascraper([ metascraperReadability(), @@ -159,6 +161,14 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) { html: htmlContent, }); + const window = new JSDOM("").window; + const purify = DOMPurify(window); + const purifiedHTML = purify.sanitize(htmlContent); + const purifiedDOM = new JSDOM(purifiedHTML, { url }); + const readableContent = new Readability(purifiedDOM.window.document).parse(); + + // TODO(important): Restrict the size of content to store + await db .update(bookmarkLinks) .set({ @@ -166,6 +176,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) { description: meta.description, imageUrl: meta.image, favicon: meta.logo, + content: readableContent?.textContent, crawledAt: new Date(), }) .where(eq(bookmarkLinks.id, bookmarkId)); diff --git a/packages/workers/openai.ts b/packages/workers/openai.ts index cc456616..7dda1f9b 100644 --- a/packages/workers/openai.ts +++ b/packages/workers/openai.ts @@ -63,10 +63,20 @@ function buildPrompt( `No description found for link "${bookmark.id}". 
Skipping ...`, ); } + + let content = bookmark.link.content; + if (content) { + let words = content.split(" "); + if (words.length > 2000) { + words = words.slice(2000); + content = words.join(" "); + } + } return ` ${PROMPT_BASE} URL: ${bookmark.link.url} Description: ${bookmark.link.description} +Content: ${content || ""} `; } diff --git a/packages/workers/package.json b/packages/workers/package.json index 078f6c54..a7b62462 100644 --- a/packages/workers/package.json +++ b/packages/workers/package.json @@ -6,11 +6,14 @@ "dependencies": { "@hoarder/db": "workspace:*", "@hoarder/shared": "workspace:*", + "@mozilla/readability": "^0.5.0", "@tsconfig/node21": "^21.0.1", "async-mutex": "^0.4.1", "bullmq": "^5.1.9", + "dompurify": "^3.0.9", "dotenv": "^16.4.1", "drizzle-orm": "^0.29.4", + "jsdom": "^24.0.0", "metascraper": "^5.43.4", "metascraper-description": "^5.43.4", "metascraper-image": "^5.43.4", @@ -29,6 +32,8 @@ "zod": "^3.22.4" }, "devDependencies": { + "@types/dompurify": "^3.0.5", + "@types/jsdom": "^21.1.6", "@types/metascraper": "^5.14.3" }, "scripts": { diff --git a/packages/workers/search.ts b/packages/workers/search.ts index a628b2ed..618e7c89 100644 --- a/packages/workers/search.ts +++ b/packages/workers/search.ts @@ -68,6 +68,7 @@ async function runIndex( url: bookmark.link.url, title: bookmark.link.title, description: bookmark.link.description, + content: bookmark.link.content, } : undefined), ...(bookmark.text ? { content: bookmark.text.text } : undefined), |
