aboutsummaryrefslogtreecommitdiffstats
path: root/packages
diff options
context:
space:
mode:
authorMohamedBassem <me@mbassem.com>2024-03-02 00:52:18 +0000
committerMohamedBassem <me@mbassem.com>2024-03-02 00:52:18 +0000
commit8ab2747e23256106b115aa3823ad25e2c2d466d4 (patch)
tree73a98b0c62695d4657a463808264acd043959db8 /packages
parent3f5f1850b17eb0f5c4cd0970c22421f85d5a2bd6 (diff)
downloadkarakeep-8ab2747e23256106b115aa3823ad25e2c2d466d4.tar.zst
feature: Store full link content and index them
Diffstat (limited to 'packages')
-rw-r--r--packages/db/drizzle/0004_skinny_vengeance.sql1
-rw-r--r--packages/db/drizzle/meta/0004_snapshot.json799
-rw-r--r--packages/db/drizzle/meta/_journal.json7
-rw-r--r--packages/db/schema.ts1
-rw-r--r--packages/workers/crawler.ts13
-rw-r--r--packages/workers/openai.ts10
-rw-r--r--packages/workers/package.json5
-rw-r--r--packages/workers/search.ts1
8 files changed, 836 insertions, 1 deletions
diff --git a/packages/db/drizzle/0004_skinny_vengeance.sql b/packages/db/drizzle/0004_skinny_vengeance.sql
new file mode 100644
index 00000000..4380d8d5
--- /dev/null
+++ b/packages/db/drizzle/0004_skinny_vengeance.sql
@@ -0,0 +1 @@
+ALTER TABLE bookmarkLinks ADD `content` text; \ No newline at end of file
diff --git a/packages/db/drizzle/meta/0004_snapshot.json b/packages/db/drizzle/meta/0004_snapshot.json
new file mode 100644
index 00000000..50f21cf3
--- /dev/null
+++ b/packages/db/drizzle/meta/0004_snapshot.json
@@ -0,0 +1,799 @@
+{
+ "version": "5",
+ "dialect": "sqlite",
+ "id": "c8e820c3-3f51-4109-a4b8-b11741e956b4",
+ "prevId": "28b3c23e-65eb-44b4-bf9c-8573a6ccde8a",
+ "tables": {
+ "account": {
+ "name": "account",
+ "columns": {
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "type": {
+ "name": "type",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "provider": {
+ "name": "provider",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "providerAccountId": {
+ "name": "providerAccountId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "refresh_token": {
+ "name": "refresh_token",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "access_token": {
+ "name": "access_token",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "expires_at": {
+ "name": "expires_at",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "token_type": {
+ "name": "token_type",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "scope": {
+ "name": "scope",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "id_token": {
+ "name": "id_token",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "session_state": {
+ "name": "session_state",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "account_userId_user_id_fk": {
+ "name": "account_userId_user_id_fk",
+ "tableFrom": "account",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {
+ "account_provider_providerAccountId_pk": {
+ "columns": [
+ "provider",
+ "providerAccountId"
+ ],
+ "name": "account_provider_providerAccountId_pk"
+ }
+ },
+ "uniqueConstraints": {}
+ },
+ "apiKey": {
+ "name": "apiKey",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "name": {
+ "name": "name",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "createdAt": {
+ "name": "createdAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "keyId": {
+ "name": "keyId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "keyHash": {
+ "name": "keyHash",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {
+ "apiKey_name_unique": {
+ "name": "apiKey_name_unique",
+ "columns": [
+ "name"
+ ],
+ "isUnique": true
+ },
+ "apiKey_keyId_unique": {
+ "name": "apiKey_keyId_unique",
+ "columns": [
+ "keyId"
+ ],
+ "isUnique": true
+ },
+ "apiKey_name_userId_unique": {
+ "name": "apiKey_name_userId_unique",
+ "columns": [
+ "name",
+ "userId"
+ ],
+ "isUnique": true
+ }
+ },
+ "foreignKeys": {
+ "apiKey_userId_user_id_fk": {
+ "name": "apiKey_userId_user_id_fk",
+ "tableFrom": "apiKey",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarkLinks": {
+ "name": "bookmarkLinks",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "url": {
+ "name": "url",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "title": {
+ "name": "title",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "description": {
+ "name": "description",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "imageUrl": {
+ "name": "imageUrl",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "favicon": {
+ "name": "favicon",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "content": {
+ "name": "content",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "crawledAt": {
+ "name": "crawledAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "bookmarkLinks_id_bookmarks_id_fk": {
+ "name": "bookmarkLinks_id_bookmarks_id_fk",
+ "tableFrom": "bookmarkLinks",
+ "tableTo": "bookmarks",
+ "columnsFrom": [
+ "id"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarkLists": {
+ "name": "bookmarkLists",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "name": {
+ "name": "name",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "icon": {
+ "name": "icon",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "createdAt": {
+ "name": "createdAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {
+ "bookmarkLists_name_userId_unique": {
+ "name": "bookmarkLists_name_userId_unique",
+ "columns": [
+ "name",
+ "userId"
+ ],
+ "isUnique": true
+ }
+ },
+ "foreignKeys": {
+ "bookmarkLists_userId_user_id_fk": {
+ "name": "bookmarkLists_userId_user_id_fk",
+ "tableFrom": "bookmarkLists",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarkTags": {
+ "name": "bookmarkTags",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "name": {
+ "name": "name",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "createdAt": {
+ "name": "createdAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {
+ "bookmarkTags_userId_name_unique": {
+ "name": "bookmarkTags_userId_name_unique",
+ "columns": [
+ "userId",
+ "name"
+ ],
+ "isUnique": true
+ }
+ },
+ "foreignKeys": {
+ "bookmarkTags_userId_user_id_fk": {
+ "name": "bookmarkTags_userId_user_id_fk",
+ "tableFrom": "bookmarkTags",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarkTexts": {
+ "name": "bookmarkTexts",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "text": {
+ "name": "text",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "bookmarkTexts_id_bookmarks_id_fk": {
+ "name": "bookmarkTexts_id_bookmarks_id_fk",
+ "tableFrom": "bookmarkTexts",
+ "tableTo": "bookmarks",
+ "columnsFrom": [
+ "id"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarks": {
+ "name": "bookmarks",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "createdAt": {
+ "name": "createdAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "archived": {
+ "name": "archived",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false,
+ "default": false
+ },
+ "favourited": {
+ "name": "favourited",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false,
+ "default": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "bookmarks_userId_user_id_fk": {
+ "name": "bookmarks_userId_user_id_fk",
+ "tableFrom": "bookmarks",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarksInLists": {
+ "name": "bookmarksInLists",
+ "columns": {
+ "bookmarkId": {
+ "name": "bookmarkId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "listId": {
+ "name": "listId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "addedAt": {
+ "name": "addedAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "bookmarksInLists_bookmarkId_bookmarks_id_fk": {
+ "name": "bookmarksInLists_bookmarkId_bookmarks_id_fk",
+ "tableFrom": "bookmarksInLists",
+ "tableTo": "bookmarks",
+ "columnsFrom": [
+ "bookmarkId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ },
+ "bookmarksInLists_listId_bookmarkLists_id_fk": {
+ "name": "bookmarksInLists_listId_bookmarkLists_id_fk",
+ "tableFrom": "bookmarksInLists",
+ "tableTo": "bookmarkLists",
+ "columnsFrom": [
+ "listId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {
+ "bookmarksInLists_bookmarkId_listId_pk": {
+ "columns": [
+ "bookmarkId",
+ "listId"
+ ],
+ "name": "bookmarksInLists_bookmarkId_listId_pk"
+ }
+ },
+ "uniqueConstraints": {}
+ },
+ "session": {
+ "name": "session",
+ "columns": {
+ "sessionToken": {
+ "name": "sessionToken",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "expires": {
+ "name": "expires",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "session_userId_user_id_fk": {
+ "name": "session_userId_user_id_fk",
+ "tableFrom": "session",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "tagsOnBookmarks": {
+ "name": "tagsOnBookmarks",
+ "columns": {
+ "bookmarkId": {
+ "name": "bookmarkId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "tagId": {
+ "name": "tagId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "attachedAt": {
+ "name": "attachedAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "attachedBy": {
+ "name": "attachedBy",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "tagsOnBookmarks_bookmarkId_bookmarks_id_fk": {
+ "name": "tagsOnBookmarks_bookmarkId_bookmarks_id_fk",
+ "tableFrom": "tagsOnBookmarks",
+ "tableTo": "bookmarks",
+ "columnsFrom": [
+ "bookmarkId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ },
+ "tagsOnBookmarks_tagId_bookmarkTags_id_fk": {
+ "name": "tagsOnBookmarks_tagId_bookmarkTags_id_fk",
+ "tableFrom": "tagsOnBookmarks",
+ "tableTo": "bookmarkTags",
+ "columnsFrom": [
+ "tagId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {
+ "tagsOnBookmarks_bookmarkId_tagId_pk": {
+ "columns": [
+ "bookmarkId",
+ "tagId"
+ ],
+ "name": "tagsOnBookmarks_bookmarkId_tagId_pk"
+ }
+ },
+ "uniqueConstraints": {}
+ },
+ "user": {
+ "name": "user",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "name": {
+ "name": "name",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "email": {
+ "name": "email",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "emailVerified": {
+ "name": "emailVerified",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "image": {
+ "name": "image",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "password": {
+ "name": "password",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "role": {
+ "name": "role",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false,
+ "default": "'user'"
+ }
+ },
+ "indexes": {
+ "user_email_unique": {
+ "name": "user_email_unique",
+ "columns": [
+ "email"
+ ],
+ "isUnique": true
+ }
+ },
+ "foreignKeys": {},
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "verificationToken": {
+ "name": "verificationToken",
+ "columns": {
+ "identifier": {
+ "name": "identifier",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "token": {
+ "name": "token",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "expires": {
+ "name": "expires",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {},
+ "compositePrimaryKeys": {
+ "verificationToken_identifier_token_pk": {
+ "columns": [
+ "identifier",
+ "token"
+ ],
+ "name": "verificationToken_identifier_token_pk"
+ }
+ },
+ "uniqueConstraints": {}
+ }
+ },
+ "enums": {},
+ "_meta": {
+ "schemas": {},
+ "tables": {},
+ "columns": {}
+ }
+} \ No newline at end of file
diff --git a/packages/db/drizzle/meta/_journal.json b/packages/db/drizzle/meta/_journal.json
index a8b97da4..839d3918 100644
--- a/packages/db/drizzle/meta/_journal.json
+++ b/packages/db/drizzle/meta/_journal.json
@@ -29,6 +29,13 @@
"when": 1709331420929,
"tag": "0003_parallel_supernaut",
"breakpoints": true
+ },
+ {
+ "idx": 4,
+ "version": "5",
+ "when": 1709339866036,
+ "tag": "0004_skinny_vengeance",
+ "breakpoints": true
}
]
} \ No newline at end of file
diff --git a/packages/db/schema.ts b/packages/db/schema.ts
index d1e3fa0d..94f48ac3 100644
--- a/packages/db/schema.ts
+++ b/packages/db/schema.ts
@@ -123,6 +123,7 @@ export const bookmarkLinks = sqliteTable("bookmarkLinks", {
description: text("description"),
imageUrl: text("imageUrl"),
favicon: text("favicon"),
+ content: text("content"),
crawledAt: integer("crawledAt", { mode: "timestamp" }),
});
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 7be014a7..f1ee07f3 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -7,6 +7,8 @@ import {
queueConnectionDetails,
zCrawlLinkRequestSchema,
} from "@hoarder/shared/queues";
+import DOMPurify from "dompurify";
+import { JSDOM } from "jsdom";
import { Worker } from "bullmq";
import { Job } from "bullmq";
@@ -31,7 +33,7 @@ import assert from "assert";
import serverConfig from "@hoarder/shared/config";
import { bookmarkLinks } from "@hoarder/db/schema";
import { eq } from "drizzle-orm";
-import { SearchIndexingWorker } from "./search";
+import { Readability } from "@mozilla/readability";
const metascraperParser = metascraper([
metascraperReadability(),
@@ -159,6 +161,14 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
html: htmlContent,
});
+ const window = new JSDOM("").window;
+ const purify = DOMPurify(window);
+ const purifiedHTML = purify.sanitize(htmlContent);
+ const purifiedDOM = new JSDOM(purifiedHTML, { url });
+ const readableContent = new Readability(purifiedDOM.window.document).parse();
+
+ // TODO(important): Restrict the size of content to store
+
await db
.update(bookmarkLinks)
.set({
@@ -166,6 +176,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
description: meta.description,
imageUrl: meta.image,
favicon: meta.logo,
+ content: readableContent?.textContent,
crawledAt: new Date(),
})
.where(eq(bookmarkLinks.id, bookmarkId));
diff --git a/packages/workers/openai.ts b/packages/workers/openai.ts
index cc456616..7dda1f9b 100644
--- a/packages/workers/openai.ts
+++ b/packages/workers/openai.ts
@@ -63,10 +63,20 @@ function buildPrompt(
`No description found for link "${bookmark.id}". Skipping ...`,
);
}
+
+ let content = bookmark.link.content;
+ if (content) {
+ let words = content.split(" ");
+ if (words.length > 2000) {
+ words = words.slice(2000);
+ content = words.join(" ");
+ }
+ }
return `
${PROMPT_BASE}
URL: ${bookmark.link.url}
Description: ${bookmark.link.description}
+Content: ${content || ""}
`;
}
diff --git a/packages/workers/package.json b/packages/workers/package.json
index 078f6c54..a7b62462 100644
--- a/packages/workers/package.json
+++ b/packages/workers/package.json
@@ -6,11 +6,14 @@
"dependencies": {
"@hoarder/db": "workspace:*",
"@hoarder/shared": "workspace:*",
+ "@mozilla/readability": "^0.5.0",
"@tsconfig/node21": "^21.0.1",
"async-mutex": "^0.4.1",
"bullmq": "^5.1.9",
+ "dompurify": "^3.0.9",
"dotenv": "^16.4.1",
"drizzle-orm": "^0.29.4",
+ "jsdom": "^24.0.0",
"metascraper": "^5.43.4",
"metascraper-description": "^5.43.4",
"metascraper-image": "^5.43.4",
@@ -29,6 +32,8 @@
"zod": "^3.22.4"
},
"devDependencies": {
+ "@types/dompurify": "^3.0.5",
+ "@types/jsdom": "^21.1.6",
"@types/metascraper": "^5.14.3"
},
"scripts": {
diff --git a/packages/workers/search.ts b/packages/workers/search.ts
index a628b2ed..618e7c89 100644
--- a/packages/workers/search.ts
+++ b/packages/workers/search.ts
@@ -68,6 +68,7 @@ async function runIndex(
url: bookmark.link.url,
title: bookmark.link.title,
description: bookmark.link.description,
+ content: bookmark.link.content,
}
: undefined),
...(bookmark.text ? { content: bookmark.text.text } : undefined),