aboutsummaryrefslogtreecommitdiffstats
path: root/packages/workers
diff options
context:
space:
mode:
author MohamedBassem <me@mbassem.com> 2024-03-02 00:52:18 +0000
committer MohamedBassem <me@mbassem.com> 2024-03-02 00:52:18 +0000
commit 8ab2747e23256106b115aa3823ad25e2c2d466d4 (patch)
tree 73a98b0c62695d4657a463808264acd043959db8 /packages/workers
parent 3f5f1850b17eb0f5c4cd0970c22421f85d5a2bd6 (diff)
download karakeep-8ab2747e23256106b115aa3823ad25e2c2d466d4.tar.zst
feature: Store full link content and index them
Diffstat (limited to 'packages/workers')
-rw-r--r-- packages/workers/crawler.ts 13
-rw-r--r-- packages/workers/openai.ts 10
-rw-r--r-- packages/workers/package.json 5
-rw-r--r-- packages/workers/search.ts 1
4 files changed, 28 insertions, 1 deletions
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 7be014a7..f1ee07f3 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -7,6 +7,8 @@ import {
queueConnectionDetails,
zCrawlLinkRequestSchema,
} from "@hoarder/shared/queues";
+import DOMPurify from "dompurify";
+import { JSDOM } from "jsdom";
import { Worker } from "bullmq";
import { Job } from "bullmq";
@@ -31,7 +33,7 @@ import assert from "assert";
import serverConfig from "@hoarder/shared/config";
import { bookmarkLinks } from "@hoarder/db/schema";
import { eq } from "drizzle-orm";
-import { SearchIndexingWorker } from "./search";
+import { Readability } from "@mozilla/readability";
const metascraperParser = metascraper([
metascraperReadability(),
@@ -159,6 +161,14 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
html: htmlContent,
});
+ const window = new JSDOM("").window;
+ const purify = DOMPurify(window);
+ const purifiedHTML = purify.sanitize(htmlContent);
+ const purifiedDOM = new JSDOM(purifiedHTML, { url });
+ const readableContent = new Readability(purifiedDOM.window.document).parse();
+
+ // TODO(important): Restrict the size of content to store
+
await db
.update(bookmarkLinks)
.set({
@@ -166,6 +176,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
description: meta.description,
imageUrl: meta.image,
favicon: meta.logo,
+ content: readableContent?.textContent,
crawledAt: new Date(),
})
.where(eq(bookmarkLinks.id, bookmarkId));
diff --git a/packages/workers/openai.ts b/packages/workers/openai.ts
index cc456616..7dda1f9b 100644
--- a/packages/workers/openai.ts
+++ b/packages/workers/openai.ts
@@ -63,10 +63,20 @@ function buildPrompt(
`No description found for link "${bookmark.id}". Skipping ...`,
);
}
+
+ let content = bookmark.link.content; // stored page text; may be null/empty
+ if (content) {
+ let words = content.split(" "); // rough word count used to bound prompt size
+ if (words.length > 2000) {
+ words = words.slice(0, 2000); // keep the FIRST 2000 words; slice(2000) dropped them and kept the tail
+ content = words.join(" ");
+ }
+ }
return `
${PROMPT_BASE}
URL: ${bookmark.link.url}
Description: ${bookmark.link.description}
+Content: ${content || ""}
`;
}
diff --git a/packages/workers/package.json b/packages/workers/package.json
index 078f6c54..a7b62462 100644
--- a/packages/workers/package.json
+++ b/packages/workers/package.json
@@ -6,11 +6,14 @@
"dependencies": {
"@hoarder/db": "workspace:*",
"@hoarder/shared": "workspace:*",
+ "@mozilla/readability": "^0.5.0",
"@tsconfig/node21": "^21.0.1",
"async-mutex": "^0.4.1",
"bullmq": "^5.1.9",
+ "dompurify": "^3.0.9",
"dotenv": "^16.4.1",
"drizzle-orm": "^0.29.4",
+ "jsdom": "^24.0.0",
"metascraper": "^5.43.4",
"metascraper-description": "^5.43.4",
"metascraper-image": "^5.43.4",
@@ -29,6 +32,8 @@
"zod": "^3.22.4"
},
"devDependencies": {
+ "@types/dompurify": "^3.0.5",
+ "@types/jsdom": "^21.1.6",
"@types/metascraper": "^5.14.3"
},
"scripts": {
diff --git a/packages/workers/search.ts b/packages/workers/search.ts
index a628b2ed..618e7c89 100644
--- a/packages/workers/search.ts
+++ b/packages/workers/search.ts
@@ -68,6 +68,7 @@ async function runIndex(
url: bookmark.link.url,
title: bookmark.link.title,
description: bookmark.link.description,
+ content: bookmark.link.content,
}
: undefined),
...(bookmark.text ? { content: bookmark.text.text } : undefined),