aboutsummaryrefslogtreecommitdiffstats
path: root/packages/shared
diff options
context:
space:
mode:
authorMohamed Bassem <me@mbassem.com>2025-07-06 21:50:23 +0000
committerMohamed Bassem <me@mbassem.com>2025-07-06 22:04:56 +0000
commitdee3a4d44ddb1999e7dec383889246e87f202d92 (patch)
tree1984234f17eed886bc834543e1505ddbfb43228f /packages/shared
parent362be3008aa8b036c4c448a86e459044af8784c2 (diff)
downloadkarakeep-dee3a4d44ddb1999e7dec383889246e87f202d92.tar.zst
feat: Store large html content in the asset db
Diffstat (limited to 'packages/shared')
-rw-r--r--packages/shared/package.json2
-rw-r--r--packages/shared/types/bookmarks.ts2
-rw-r--r--packages/shared/utils/htmlUtils.ts17
3 files changed, 21 insertions, 0 deletions
diff --git a/packages/shared/package.json b/packages/shared/package.json
index f4e521b6..6f22865f 100644
--- a/packages/shared/package.json
+++ b/packages/shared/package.json
@@ -7,6 +7,7 @@
"dependencies": {
"@aws-sdk/client-s3": "^3.842.0",
"glob": "^11.0.0",
+ "html-to-text": "^9.0.5",
"js-tiktoken": "^1.0.20",
"liteque": "^0.3.2",
"meilisearch": "^0.37.0",
@@ -20,6 +21,7 @@
"devDependencies": {
"@karakeep/prettier-config": "workspace:^0.1.0",
"@karakeep/tsconfig": "workspace:^0.1.0",
+ "@types/html-to-text": "^9.0.4",
"vitest": "^1.6.1"
},
"scripts": {
diff --git a/packages/shared/types/bookmarks.ts b/packages/shared/types/bookmarks.ts
index ea1ab717..f648bce5 100644
--- a/packages/shared/types/bookmarks.ts
+++ b/packages/shared/types/bookmarks.ts
@@ -16,6 +16,7 @@ export const zSortOrder = z.enum(["asc", "desc", "relevance"]);
export type ZSortOrder = z.infer<typeof zSortOrder>;
export const zAssetTypesSchema = z.enum([
+ "linkHtmlContent",
"screenshot",
"assetScreenshot",
"bannerImage",
@@ -45,6 +46,7 @@ export const zBookmarkedLinkSchema = z.object({
videoAssetId: z.string().nullish(),
favicon: z.string().nullish(),
htmlContent: z.string().nullish(),
+ contentAssetId: z.string().nullish(),
crawledAt: z.date().nullish(),
author: z.string().nullish(),
publisher: z.string().nullish(),
diff --git a/packages/shared/utils/htmlUtils.ts b/packages/shared/utils/htmlUtils.ts
new file mode 100644
index 00000000..60272899
--- /dev/null
+++ b/packages/shared/utils/htmlUtils.ts
@@ -0,0 +1,17 @@
+import { compile } from "html-to-text";
+
+const compiledConvert = compile({
+ selectors: [{ selector: "img", format: "skip" }],
+});
+
+/**
+ * Converts HTML content to plain text, skipping <img> elements; returns "" for empty input.
+ */
+export function htmlToPlainText(htmlContent: string): string {
+ if (!htmlContent) {
+ return "";
+ }
+
+ // TODO: we should probably also remove SingleFile inline images from the content
+ return compiledConvert(htmlContent);
+}