diff options
| author | MohamedBassem <me@mbassem.com> | 2024-03-02 00:52:18 +0000 |
|---|---|---|
| committer | MohamedBassem <me@mbassem.com> | 2024-03-02 00:52:18 +0000 |
| commit | 8ab2747e23256106b115aa3823ad25e2c2d466d4 (patch) | |
| tree | 73a98b0c62695d4657a463808264acd043959db8 /packages | |
| parent | 3f5f1850b17eb0f5c4cd0970c22421f85d5a2bd6 (diff) | |
| download | karakeep-8ab2747e23256106b115aa3823ad25e2c2d466d4.tar.zst | |
feature: Store full link content and index them
Diffstat (limited to 'packages')
| -rw-r--r-- | packages/db/drizzle/0004_skinny_vengeance.sql | 1 | ||||
| -rw-r--r-- | packages/db/drizzle/meta/0004_snapshot.json | 799 | ||||
| -rw-r--r-- | packages/db/drizzle/meta/_journal.json | 7 | ||||
| -rw-r--r-- | packages/db/schema.ts | 1 | ||||
| -rw-r--r-- | packages/workers/crawler.ts | 13 | ||||
| -rw-r--r-- | packages/workers/openai.ts | 10 | ||||
| -rw-r--r-- | packages/workers/package.json | 5 | ||||
| -rw-r--r-- | packages/workers/search.ts | 1 |
8 files changed, 836 insertions, 1 deletions
diff --git a/packages/db/drizzle/0004_skinny_vengeance.sql b/packages/db/drizzle/0004_skinny_vengeance.sql new file mode 100644 index 00000000..4380d8d5 --- /dev/null +++ b/packages/db/drizzle/0004_skinny_vengeance.sql @@ -0,0 +1 @@ +ALTER TABLE bookmarkLinks ADD `content` text;
\ No newline at end of file diff --git a/packages/db/drizzle/meta/0004_snapshot.json b/packages/db/drizzle/meta/0004_snapshot.json new file mode 100644 index 00000000..50f21cf3 --- /dev/null +++ b/packages/db/drizzle/meta/0004_snapshot.json @@ -0,0 +1,799 @@ +{ + "version": "5", + "dialect": "sqlite", + "id": "c8e820c3-3f51-4109-a4b8-b11741e956b4", + "prevId": "28b3c23e-65eb-44b4-bf9c-8573a6ccde8a", + "tables": { + "account": { + "name": "account", + "columns": { + "userId": { + "name": "userId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "provider": { + "name": "provider", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "providerAccountId": { + "name": "providerAccountId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "refresh_token": { + "name": "refresh_token", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "access_token": { + "name": "access_token", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "expires_at": { + "name": "expires_at", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "token_type": { + "name": "token_type", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "scope": { + "name": "scope", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "id_token": { + "name": "id_token", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "session_state": { + "name": "session_state", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": { + "account_userId_user_id_fk": { + "name": "account_userId_user_id_fk", + "tableFrom": "account", + "tableTo": "user", + "columnsFrom": [ + "userId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "account_provider_providerAccountId_pk": { + "columns": [ + "provider", + "providerAccountId" + ], + "name": "account_provider_providerAccountId_pk" + } + }, + "uniqueConstraints": {} + }, + "apiKey": { + "name": "apiKey", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "createdAt": { + "name": "createdAt", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "keyId": { + "name": "keyId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "keyHash": { + "name": "keyHash", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "userId": { + "name": "userId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "apiKey_name_unique": { + "name": "apiKey_name_unique", + "columns": [ + "name" + ], + "isUnique": true + }, + "apiKey_keyId_unique": { + "name": "apiKey_keyId_unique", + "columns": [ + "keyId" + ], + "isUnique": true + }, + "apiKey_name_userId_unique": { + "name": "apiKey_name_userId_unique", + "columns": [ + "name", + "userId" + ], + "isUnique": true + } + }, + "foreignKeys": { + "apiKey_userId_user_id_fk": { + "name": "apiKey_userId_user_id_fk", + "tableFrom": "apiKey", + "tableTo": "user", + "columnsFrom": [ + "userId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "bookmarkLinks": { + "name": "bookmarkLinks", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "imageUrl": { + "name": "imageUrl", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "favicon": { + "name": "favicon", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "content": { + "name": "content", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "crawledAt": { + "name": "crawledAt", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": { + "bookmarkLinks_id_bookmarks_id_fk": { + "name": "bookmarkLinks_id_bookmarks_id_fk", + "tableFrom": "bookmarkLinks", + "tableTo": "bookmarks", + "columnsFrom": [ + "id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "bookmarkLists": { + "name": "bookmarkLists", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "icon": { + "name": "icon", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "createdAt": { + "name": "createdAt", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "userId": { + "name": "userId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "bookmarkLists_name_userId_unique": { + "name": "bookmarkLists_name_userId_unique", + "columns": [ + "name", + "userId" + ], + "isUnique": true + } + }, + "foreignKeys": { + "bookmarkLists_userId_user_id_fk": { + "name": "bookmarkLists_userId_user_id_fk", + "tableFrom": "bookmarkLists", + "tableTo": "user", + "columnsFrom": [ + "userId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "bookmarkTags": { + "name": "bookmarkTags", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "createdAt": { + "name": "createdAt", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "userId": { + "name": "userId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "bookmarkTags_userId_name_unique": { + "name": "bookmarkTags_userId_name_unique", + "columns": [ + "userId", + "name" + ], + "isUnique": true + } + }, + "foreignKeys": { + "bookmarkTags_userId_user_id_fk": { + "name": "bookmarkTags_userId_user_id_fk", + "tableFrom": "bookmarkTags", + "tableTo": "user", + "columnsFrom": [ + "userId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "bookmarkTexts": { + "name": "bookmarkTexts", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "text": { + "name": "text", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": { + "bookmarkTexts_id_bookmarks_id_fk": { + "name": "bookmarkTexts_id_bookmarks_id_fk", + "tableFrom": "bookmarkTexts", + "tableTo": "bookmarks", + "columnsFrom": [ + "id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "bookmarks": { + "name": "bookmarks", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "createdAt": { + "name": "createdAt", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "archived": { + "name": "archived", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": false + }, + "favourited": { + "name": "favourited", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": false + }, + "userId": { + "name": "userId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": { + "bookmarks_userId_user_id_fk": { + "name": "bookmarks_userId_user_id_fk", + "tableFrom": "bookmarks", + "tableTo": "user", + "columnsFrom": [ + "userId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "bookmarksInLists": { + "name": "bookmarksInLists", + "columns": { + "bookmarkId": { + "name": "bookmarkId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "listId": { + "name": "listId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "addedAt": { + "name": "addedAt", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": { + "bookmarksInLists_bookmarkId_bookmarks_id_fk": { + "name": "bookmarksInLists_bookmarkId_bookmarks_id_fk", + "tableFrom": "bookmarksInLists", + "tableTo": "bookmarks", + "columnsFrom": [ + "bookmarkId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "bookmarksInLists_listId_bookmarkLists_id_fk": { + "name": "bookmarksInLists_listId_bookmarkLists_id_fk", + "tableFrom": "bookmarksInLists", + "tableTo": "bookmarkLists", + "columnsFrom": [ + "listId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "bookmarksInLists_bookmarkId_listId_pk": { + "columns": [ + "bookmarkId", + "listId" + ], + "name": "bookmarksInLists_bookmarkId_listId_pk" + } + }, + "uniqueConstraints": {} + }, + "session": { + "name": "session", + "columns": { + "sessionToken": { + "name": "sessionToken", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "userId": { + "name": "userId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "expires": { + "name": "expires", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": { + "session_userId_user_id_fk": { + "name": "session_userId_user_id_fk", + "tableFrom": "session", + "tableTo": "user", + "columnsFrom": [ + "userId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "tagsOnBookmarks": { + "name": "tagsOnBookmarks", + "columns": { + "bookmarkId": { + "name": "bookmarkId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "tagId": { + "name": "tagId", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "attachedAt": { + "name": "attachedAt", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "attachedBy": { + "name": "attachedBy", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": { + "tagsOnBookmarks_bookmarkId_bookmarks_id_fk": { + "name": "tagsOnBookmarks_bookmarkId_bookmarks_id_fk", + "tableFrom": "tagsOnBookmarks", + "tableTo": "bookmarks", + "columnsFrom": [ + "bookmarkId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "tagsOnBookmarks_tagId_bookmarkTags_id_fk": { + "name": "tagsOnBookmarks_tagId_bookmarkTags_id_fk", + "tableFrom": "tagsOnBookmarks", + "tableTo": "bookmarkTags", + "columnsFrom": [ + "tagId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "tagsOnBookmarks_bookmarkId_tagId_pk": { + "columns": [ + "bookmarkId", + "tagId" + ], + "name": "tagsOnBookmarks_bookmarkId_tagId_pk" + } + }, + "uniqueConstraints": {} + }, + "user": { + "name": "user", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "emailVerified": { + "name": "emailVerified", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "image": { + "name": "image", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "password": { + "name": "password", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "role": { + "name": "role", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false, + "default": "'user'" + } + }, + "indexes": { + "user_email_unique": { + "name": "user_email_unique", + "columns": [ + "email" + ], + "isUnique": true + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "verificationToken": { + "name": "verificationToken", + "columns": { + "identifier": { + "name": "identifier", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "token": { + "name": "token", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "expires": { + "name": "expires", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": { + "verificationToken_identifier_token_pk": { + "columns": [ + "identifier", + "token" + ], + "name": "verificationToken_identifier_token_pk" + } + }, + "uniqueConstraints": {} + } + }, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + } +}
\ No newline at end of file diff --git a/packages/db/drizzle/meta/_journal.json b/packages/db/drizzle/meta/_journal.json index a8b97da4..839d3918 100644 --- a/packages/db/drizzle/meta/_journal.json +++ b/packages/db/drizzle/meta/_journal.json @@ -29,6 +29,13 @@ "when": 1709331420929, "tag": "0003_parallel_supernaut", "breakpoints": true + }, + { + "idx": 4, + "version": "5", + "when": 1709339866036, + "tag": "0004_skinny_vengeance", + "breakpoints": true } ] }
\ No newline at end of file diff --git a/packages/db/schema.ts b/packages/db/schema.ts index d1e3fa0d..94f48ac3 100644 --- a/packages/db/schema.ts +++ b/packages/db/schema.ts @@ -123,6 +123,7 @@ export const bookmarkLinks = sqliteTable("bookmarkLinks", { description: text("description"), imageUrl: text("imageUrl"), favicon: text("favicon"), + content: text("content"), crawledAt: integer("crawledAt", { mode: "timestamp" }), }); diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts index 7be014a7..f1ee07f3 100644 --- a/packages/workers/crawler.ts +++ b/packages/workers/crawler.ts @@ -7,6 +7,8 @@ import { queueConnectionDetails, zCrawlLinkRequestSchema, } from "@hoarder/shared/queues"; +import DOMPurify from "dompurify"; +import { JSDOM } from "jsdom"; import { Worker } from "bullmq"; import { Job } from "bullmq"; @@ -31,7 +33,7 @@ import assert from "assert"; import serverConfig from "@hoarder/shared/config"; import { bookmarkLinks } from "@hoarder/db/schema"; import { eq } from "drizzle-orm"; -import { SearchIndexingWorker } from "./search"; +import { Readability } from "@mozilla/readability"; const metascraperParser = metascraper([ metascraperReadability(), @@ -159,6 +161,14 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) { html: htmlContent, }); + const window = new JSDOM("").window; + const purify = DOMPurify(window); + const purifiedHTML = purify.sanitize(htmlContent); + const purifiedDOM = new JSDOM(purifiedHTML, { url }); + const readableContent = new Readability(purifiedDOM.window.document).parse(); + + // TODO(important): Restrict the size of content to store + await db .update(bookmarkLinks) .set({ @@ -166,6 +176,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) { description: meta.description, imageUrl: meta.image, favicon: meta.logo, + content: readableContent?.textContent, crawledAt: new Date(), }) .where(eq(bookmarkLinks.id, bookmarkId)); diff --git a/packages/workers/openai.ts b/packages/workers/openai.ts index cc456616..7dda1f9b 100644 --- a/packages/workers/openai.ts +++ b/packages/workers/openai.ts @@ -63,10 +63,20 @@ function buildPrompt( `No description found for link "${bookmark.id}". Skipping ...`, ); } + + let content = bookmark.link.content; + if (content) { + let words = content.split(" "); + if (words.length > 2000) { + words = words.slice(2000); + content = words.join(" "); + } + } return ` ${PROMPT_BASE} URL: ${bookmark.link.url} Description: ${bookmark.link.description} +Content: ${content || ""} `; } diff --git a/packages/workers/package.json b/packages/workers/package.json index 078f6c54..a7b62462 100644 --- a/packages/workers/package.json +++ b/packages/workers/package.json @@ -6,11 +6,14 @@ "dependencies": { "@hoarder/db": "workspace:*", "@hoarder/shared": "workspace:*", + "@mozilla/readability": "^0.5.0", "@tsconfig/node21": "^21.0.1", "async-mutex": "^0.4.1", "bullmq": "^5.1.9", + "dompurify": "^3.0.9", "dotenv": "^16.4.1", "drizzle-orm": "^0.29.4", + "jsdom": "^24.0.0", "metascraper": "^5.43.4", "metascraper-description": "^5.43.4", "metascraper-image": "^5.43.4", @@ -29,6 +32,8 @@ "zod": "^3.22.4" }, "devDependencies": { + "@types/dompurify": "^3.0.5", + "@types/jsdom": "^21.1.6", "@types/metascraper": "^5.14.3" }, "scripts": { diff --git a/packages/workers/search.ts b/packages/workers/search.ts index a628b2ed..618e7c89 100644 --- a/packages/workers/search.ts +++ b/packages/workers/search.ts @@ -68,6 +68,7 @@ async function runIndex( url: bookmark.link.url, title: bookmark.link.title, description: bookmark.link.description, + content: bookmark.link.content, } : undefined), ...(bookmark.text ? { content: bookmark.text.text } : undefined), |
