aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMohamedBassem <me@mbassem.com>2024-03-02 00:52:18 +0000
committerMohamedBassem <me@mbassem.com>2024-03-02 00:52:18 +0000
commit8ab2747e23256106b115aa3823ad25e2c2d466d4 (patch)
tree73a98b0c62695d4657a463808264acd043959db8
parent3f5f1850b17eb0f5c4cd0970c22421f85d5a2bd6 (diff)
downloadkarakeep-8ab2747e23256106b115aa3823ad25e2c2d466d4.tar.zst
feature: Store full link content and index them
-rw-r--r--packages/db/drizzle/0004_skinny_vengeance.sql1
-rw-r--r--packages/db/drizzle/meta/0004_snapshot.json799
-rw-r--r--packages/db/drizzle/meta/_journal.json7
-rw-r--r--packages/db/schema.ts1
-rw-r--r--packages/workers/crawler.ts13
-rw-r--r--packages/workers/openai.ts10
-rw-r--r--packages/workers/package.json5
-rw-r--r--packages/workers/search.ts1
-rw-r--r--pnpm-lock.yaml42
9 files changed, 878 insertions, 1 deletions
diff --git a/packages/db/drizzle/0004_skinny_vengeance.sql b/packages/db/drizzle/0004_skinny_vengeance.sql
new file mode 100644
index 00000000..4380d8d5
--- /dev/null
+++ b/packages/db/drizzle/0004_skinny_vengeance.sql
@@ -0,0 +1 @@
+ALTER TABLE bookmarkLinks ADD `content` text; \ No newline at end of file
diff --git a/packages/db/drizzle/meta/0004_snapshot.json b/packages/db/drizzle/meta/0004_snapshot.json
new file mode 100644
index 00000000..50f21cf3
--- /dev/null
+++ b/packages/db/drizzle/meta/0004_snapshot.json
@@ -0,0 +1,799 @@
+{
+ "version": "5",
+ "dialect": "sqlite",
+ "id": "c8e820c3-3f51-4109-a4b8-b11741e956b4",
+ "prevId": "28b3c23e-65eb-44b4-bf9c-8573a6ccde8a",
+ "tables": {
+ "account": {
+ "name": "account",
+ "columns": {
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "type": {
+ "name": "type",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "provider": {
+ "name": "provider",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "providerAccountId": {
+ "name": "providerAccountId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "refresh_token": {
+ "name": "refresh_token",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "access_token": {
+ "name": "access_token",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "expires_at": {
+ "name": "expires_at",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "token_type": {
+ "name": "token_type",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "scope": {
+ "name": "scope",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "id_token": {
+ "name": "id_token",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "session_state": {
+ "name": "session_state",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "account_userId_user_id_fk": {
+ "name": "account_userId_user_id_fk",
+ "tableFrom": "account",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {
+ "account_provider_providerAccountId_pk": {
+ "columns": [
+ "provider",
+ "providerAccountId"
+ ],
+ "name": "account_provider_providerAccountId_pk"
+ }
+ },
+ "uniqueConstraints": {}
+ },
+ "apiKey": {
+ "name": "apiKey",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "name": {
+ "name": "name",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "createdAt": {
+ "name": "createdAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "keyId": {
+ "name": "keyId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "keyHash": {
+ "name": "keyHash",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {
+ "apiKey_name_unique": {
+ "name": "apiKey_name_unique",
+ "columns": [
+ "name"
+ ],
+ "isUnique": true
+ },
+ "apiKey_keyId_unique": {
+ "name": "apiKey_keyId_unique",
+ "columns": [
+ "keyId"
+ ],
+ "isUnique": true
+ },
+ "apiKey_name_userId_unique": {
+ "name": "apiKey_name_userId_unique",
+ "columns": [
+ "name",
+ "userId"
+ ],
+ "isUnique": true
+ }
+ },
+ "foreignKeys": {
+ "apiKey_userId_user_id_fk": {
+ "name": "apiKey_userId_user_id_fk",
+ "tableFrom": "apiKey",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarkLinks": {
+ "name": "bookmarkLinks",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "url": {
+ "name": "url",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "title": {
+ "name": "title",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "description": {
+ "name": "description",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "imageUrl": {
+ "name": "imageUrl",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "favicon": {
+ "name": "favicon",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "content": {
+ "name": "content",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "crawledAt": {
+ "name": "crawledAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "bookmarkLinks_id_bookmarks_id_fk": {
+ "name": "bookmarkLinks_id_bookmarks_id_fk",
+ "tableFrom": "bookmarkLinks",
+ "tableTo": "bookmarks",
+ "columnsFrom": [
+ "id"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarkLists": {
+ "name": "bookmarkLists",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "name": {
+ "name": "name",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "icon": {
+ "name": "icon",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "createdAt": {
+ "name": "createdAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {
+ "bookmarkLists_name_userId_unique": {
+ "name": "bookmarkLists_name_userId_unique",
+ "columns": [
+ "name",
+ "userId"
+ ],
+ "isUnique": true
+ }
+ },
+ "foreignKeys": {
+ "bookmarkLists_userId_user_id_fk": {
+ "name": "bookmarkLists_userId_user_id_fk",
+ "tableFrom": "bookmarkLists",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarkTags": {
+ "name": "bookmarkTags",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "name": {
+ "name": "name",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "createdAt": {
+ "name": "createdAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {
+ "bookmarkTags_userId_name_unique": {
+ "name": "bookmarkTags_userId_name_unique",
+ "columns": [
+ "userId",
+ "name"
+ ],
+ "isUnique": true
+ }
+ },
+ "foreignKeys": {
+ "bookmarkTags_userId_user_id_fk": {
+ "name": "bookmarkTags_userId_user_id_fk",
+ "tableFrom": "bookmarkTags",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarkTexts": {
+ "name": "bookmarkTexts",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "text": {
+ "name": "text",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "bookmarkTexts_id_bookmarks_id_fk": {
+ "name": "bookmarkTexts_id_bookmarks_id_fk",
+ "tableFrom": "bookmarkTexts",
+ "tableTo": "bookmarks",
+ "columnsFrom": [
+ "id"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarks": {
+ "name": "bookmarks",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "createdAt": {
+ "name": "createdAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "archived": {
+ "name": "archived",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false,
+ "default": false
+ },
+ "favourited": {
+ "name": "favourited",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false,
+ "default": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "bookmarks_userId_user_id_fk": {
+ "name": "bookmarks_userId_user_id_fk",
+ "tableFrom": "bookmarks",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "bookmarksInLists": {
+ "name": "bookmarksInLists",
+ "columns": {
+ "bookmarkId": {
+ "name": "bookmarkId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "listId": {
+ "name": "listId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "addedAt": {
+ "name": "addedAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "bookmarksInLists_bookmarkId_bookmarks_id_fk": {
+ "name": "bookmarksInLists_bookmarkId_bookmarks_id_fk",
+ "tableFrom": "bookmarksInLists",
+ "tableTo": "bookmarks",
+ "columnsFrom": [
+ "bookmarkId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ },
+ "bookmarksInLists_listId_bookmarkLists_id_fk": {
+ "name": "bookmarksInLists_listId_bookmarkLists_id_fk",
+ "tableFrom": "bookmarksInLists",
+ "tableTo": "bookmarkLists",
+ "columnsFrom": [
+ "listId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {
+ "bookmarksInLists_bookmarkId_listId_pk": {
+ "columns": [
+ "bookmarkId",
+ "listId"
+ ],
+ "name": "bookmarksInLists_bookmarkId_listId_pk"
+ }
+ },
+ "uniqueConstraints": {}
+ },
+ "session": {
+ "name": "session",
+ "columns": {
+ "sessionToken": {
+ "name": "sessionToken",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "userId": {
+ "name": "userId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "expires": {
+ "name": "expires",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "session_userId_user_id_fk": {
+ "name": "session_userId_user_id_fk",
+ "tableFrom": "session",
+ "tableTo": "user",
+ "columnsFrom": [
+ "userId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "tagsOnBookmarks": {
+ "name": "tagsOnBookmarks",
+ "columns": {
+ "bookmarkId": {
+ "name": "bookmarkId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "tagId": {
+ "name": "tagId",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "attachedAt": {
+ "name": "attachedAt",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "attachedBy": {
+ "name": "attachedBy",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "tagsOnBookmarks_bookmarkId_bookmarks_id_fk": {
+ "name": "tagsOnBookmarks_bookmarkId_bookmarks_id_fk",
+ "tableFrom": "tagsOnBookmarks",
+ "tableTo": "bookmarks",
+ "columnsFrom": [
+ "bookmarkId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ },
+ "tagsOnBookmarks_tagId_bookmarkTags_id_fk": {
+ "name": "tagsOnBookmarks_tagId_bookmarkTags_id_fk",
+ "tableFrom": "tagsOnBookmarks",
+ "tableTo": "bookmarkTags",
+ "columnsFrom": [
+ "tagId"
+ ],
+ "columnsTo": [
+ "id"
+ ],
+ "onDelete": "cascade",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {
+ "tagsOnBookmarks_bookmarkId_tagId_pk": {
+ "columns": [
+ "bookmarkId",
+ "tagId"
+ ],
+ "name": "tagsOnBookmarks_bookmarkId_tagId_pk"
+ }
+ },
+ "uniqueConstraints": {}
+ },
+ "user": {
+ "name": "user",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "text",
+ "primaryKey": true,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "name": {
+ "name": "name",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "email": {
+ "name": "email",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "emailVerified": {
+ "name": "emailVerified",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "image": {
+ "name": "image",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "password": {
+ "name": "password",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false
+ },
+ "role": {
+ "name": "role",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false,
+ "autoincrement": false,
+ "default": "'user'"
+ }
+ },
+ "indexes": {
+ "user_email_unique": {
+ "name": "user_email_unique",
+ "columns": [
+ "email"
+ ],
+ "isUnique": true
+ }
+ },
+ "foreignKeys": {},
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {}
+ },
+ "verificationToken": {
+ "name": "verificationToken",
+ "columns": {
+ "identifier": {
+ "name": "identifier",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "token": {
+ "name": "token",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ },
+ "expires": {
+ "name": "expires",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "autoincrement": false
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {},
+ "compositePrimaryKeys": {
+ "verificationToken_identifier_token_pk": {
+ "columns": [
+ "identifier",
+ "token"
+ ],
+ "name": "verificationToken_identifier_token_pk"
+ }
+ },
+ "uniqueConstraints": {}
+ }
+ },
+ "enums": {},
+ "_meta": {
+ "schemas": {},
+ "tables": {},
+ "columns": {}
+ }
+} \ No newline at end of file
diff --git a/packages/db/drizzle/meta/_journal.json b/packages/db/drizzle/meta/_journal.json
index a8b97da4..839d3918 100644
--- a/packages/db/drizzle/meta/_journal.json
+++ b/packages/db/drizzle/meta/_journal.json
@@ -29,6 +29,13 @@
"when": 1709331420929,
"tag": "0003_parallel_supernaut",
"breakpoints": true
+ },
+ {
+ "idx": 4,
+ "version": "5",
+ "when": 1709339866036,
+ "tag": "0004_skinny_vengeance",
+ "breakpoints": true
}
]
} \ No newline at end of file
diff --git a/packages/db/schema.ts b/packages/db/schema.ts
index d1e3fa0d..94f48ac3 100644
--- a/packages/db/schema.ts
+++ b/packages/db/schema.ts
@@ -123,6 +123,7 @@ export const bookmarkLinks = sqliteTable("bookmarkLinks", {
description: text("description"),
imageUrl: text("imageUrl"),
favicon: text("favicon"),
+ content: text("content"),
crawledAt: integer("crawledAt", { mode: "timestamp" }),
});
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index 7be014a7..f1ee07f3 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -7,6 +7,8 @@ import {
queueConnectionDetails,
zCrawlLinkRequestSchema,
} from "@hoarder/shared/queues";
+import DOMPurify from "dompurify";
+import { JSDOM } from "jsdom";
import { Worker } from "bullmq";
import { Job } from "bullmq";
@@ -31,7 +33,7 @@ import assert from "assert";
import serverConfig from "@hoarder/shared/config";
import { bookmarkLinks } from "@hoarder/db/schema";
import { eq } from "drizzle-orm";
-import { SearchIndexingWorker } from "./search";
+import { Readability } from "@mozilla/readability";
const metascraperParser = metascraper([
metascraperReadability(),
@@ -159,6 +161,14 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
html: htmlContent,
});
+ const window = new JSDOM("").window;
+ const purify = DOMPurify(window);
+ const purifiedHTML = purify.sanitize(htmlContent);
+ const purifiedDOM = new JSDOM(purifiedHTML, { url });
+ const readableContent = new Readability(purifiedDOM.window.document).parse();
+
+ // TODO(important): Restrict the size of content to store
+
await db
.update(bookmarkLinks)
.set({
@@ -166,6 +176,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
description: meta.description,
imageUrl: meta.image,
favicon: meta.logo,
+ content: readableContent?.textContent,
crawledAt: new Date(),
})
.where(eq(bookmarkLinks.id, bookmarkId));
diff --git a/packages/workers/openai.ts b/packages/workers/openai.ts
index cc456616..7dda1f9b 100644
--- a/packages/workers/openai.ts
+++ b/packages/workers/openai.ts
@@ -63,10 +63,20 @@ function buildPrompt(
`No description found for link "${bookmark.id}". Skipping ...`,
);
}
+
+ let content = bookmark.link.content;
+ if (content) {
+ let words = content.split(" ");
+ if (words.length > 2000) {
+ words = words.slice(2000);
+ content = words.join(" ");
+ }
+ }
return `
${PROMPT_BASE}
URL: ${bookmark.link.url}
Description: ${bookmark.link.description}
+Content: ${content || ""}
`;
}
diff --git a/packages/workers/package.json b/packages/workers/package.json
index 078f6c54..a7b62462 100644
--- a/packages/workers/package.json
+++ b/packages/workers/package.json
@@ -6,11 +6,14 @@
"dependencies": {
"@hoarder/db": "workspace:*",
"@hoarder/shared": "workspace:*",
+ "@mozilla/readability": "^0.5.0",
"@tsconfig/node21": "^21.0.1",
"async-mutex": "^0.4.1",
"bullmq": "^5.1.9",
+ "dompurify": "^3.0.9",
"dotenv": "^16.4.1",
"drizzle-orm": "^0.29.4",
+ "jsdom": "^24.0.0",
"metascraper": "^5.43.4",
"metascraper-description": "^5.43.4",
"metascraper-image": "^5.43.4",
@@ -29,6 +32,8 @@
"zod": "^3.22.4"
},
"devDependencies": {
+ "@types/dompurify": "^3.0.5",
+ "@types/jsdom": "^21.1.6",
"@types/metascraper": "^5.14.3"
},
"scripts": {
diff --git a/packages/workers/search.ts b/packages/workers/search.ts
index a628b2ed..618e7c89 100644
--- a/packages/workers/search.ts
+++ b/packages/workers/search.ts
@@ -68,6 +68,7 @@ async function runIndex(
url: bookmark.link.url,
title: bookmark.link.title,
description: bookmark.link.description,
+ content: bookmark.link.content,
}
: undefined),
...(bookmark.text ? { content: bookmark.text.text } : undefined),
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 5b2ab5fa..0e33a74c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -360,6 +360,9 @@ importers:
'@hoarder/shared':
specifier: workspace:*
version: link:../shared
+ '@mozilla/readability':
+ specifier: ^0.5.0
+ version: 0.5.0
'@tsconfig/node21':
specifier: ^21.0.1
version: 21.0.1
@@ -369,12 +372,18 @@ importers:
bullmq:
specifier: ^5.1.9
version: 5.3.3
+ dompurify:
+ specifier: ^3.0.9
+ version: 3.0.9
dotenv:
specifier: ^16.4.1
version: 16.4.5
drizzle-orm:
specifier: ^0.29.4
version: 0.29.4(@types/react@18.2.58)(better-sqlite3@9.4.3)(react@18.2.0)
+ jsdom:
+ specifier: ^24.0.0
+ version: 24.0.0
metascraper:
specifier: ^5.43.4
version: 5.45.0
@@ -424,6 +433,12 @@ importers:
specifier: ^3.22.4
version: 3.22.4
devDependencies:
+ '@types/dompurify':
+ specifier: ^3.0.5
+ version: 3.0.5
+ '@types/jsdom':
+ specifier: ^21.1.6
+ version: 21.1.6
'@types/metascraper':
specifier: ^5.14.3
version: 5.14.3
@@ -2842,6 +2857,12 @@ packages:
'@types/ms': 0.7.34
dev: false
+ /@types/dompurify@3.0.5:
+ resolution: {integrity: sha512-1Wg0g3BtQF7sSb27fJQAKck1HECM6zV1EB66j8JH9i3LCjYabJa0FSdiSgsD5K/RbrsR0SiraKacLB+T8ZVYAg==}
+ dependencies:
+ '@types/trusted-types': 2.0.7
+ dev: true
+
/@types/emoji-mart@3.0.14:
resolution: {integrity: sha512-/vMkVnet466bK37ugf2jry9ldCZklFPXYMB2m+qNo3vkP2I7L0cvtNFPKAjfcHgPg9Z8pbYqVqZn7AgsC0qf+g==}
dependencies:
@@ -2881,6 +2902,14 @@ packages:
resolution: {integrity: sha512-1m0bIFVc7eJWyve9S0RnuRgcQqF/Xd5QsUZAZeQFr1Q3/p9JWoQQEqmVy+DPTNpGXwhgIetAoYF8JSc33q29QA==}
dev: false
+ /@types/jsdom@21.1.6:
+ resolution: {integrity: sha512-/7kkMsC+/kMs7gAYmmBR9P0vGTnOoLhQhyhQJSlXGI5bzTHp6xdo0TtKWQAsz6pmSAeVqKSbqeyP6hytqr9FDw==}
+ dependencies:
+ '@types/node': 20.11.20
+ '@types/tough-cookie': 4.0.5
+ parse5: 7.1.2
+ dev: true
+
/@types/json-schema@7.0.15:
resolution: {integrity: sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==}
@@ -2953,10 +2982,18 @@ packages:
/@types/semver@7.5.8:
resolution: {integrity: sha512-I8EUhyrgfLrcTkzV3TSsGyl1tSuPrEDzr0yd5m90UgNxQkyDXULk3b6MlQqTCpZpNtWe1K0hzclnZkTcLBe2UQ==}
+ /@types/tough-cookie@4.0.5:
+ resolution: {integrity: sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==}
+ dev: true
+
/@types/triple-beam@1.3.5:
resolution: {integrity: sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==}
dev: false
+ /@types/trusted-types@2.0.7:
+ resolution: {integrity: sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==}
+ dev: true
+
/@types/unist@2.0.10:
resolution: {integrity: sha512-IfYcSBWE3hLpBg8+X2SEa8LVkJdJEkT2Ese2aaLs3ptGdVtABxndrMaxuFlQ1qdFf9Q5rDvDpxI3WwgvKFAsQA==}
dev: false
@@ -4337,6 +4374,10 @@ packages:
dependencies:
domelementtype: 2.3.0
+ /dompurify@3.0.9:
+ resolution: {integrity: sha512-uyb4NDIvQ3hRn6NiC+SIFaP4mJ/MdXlvtunaqK9Bn6dD3RuB/1S/gasEjDHD8eiaqdSael2vBv+hOs7Y+jhYOQ==}
+ dev: false
+
/domutils@3.1.0:
resolution: {integrity: sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==}
dependencies:
@@ -8532,6 +8573,7 @@ packages:
/safer-buffer@2.1.2:
resolution: {integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==}
+ requiresBuild: true
dev: false
/saxes@6.0.0: