From dee3a4d44ddb1999e7dec383889246e87f202d92 Mon Sep 17 00:00:00 2001
From: Mohamed Bassem
Date: Sun, 6 Jul 2025 21:50:23 +0000
Subject: feat: Store large html content in the asset db

---
 packages/shared/package.json       |  2 ++
 packages/shared/types/bookmarks.ts |  2 ++
 packages/shared/utils/htmlUtils.ts | 17 +++++++++++++++++
 3 files changed, 21 insertions(+)
 create mode 100644 packages/shared/utils/htmlUtils.ts

(limited to 'packages/shared')

diff --git a/packages/shared/package.json b/packages/shared/package.json
index f4e521b6..6f22865f 100644
--- a/packages/shared/package.json
+++ b/packages/shared/package.json
@@ -7,6 +7,7 @@
   "dependencies": {
     "@aws-sdk/client-s3": "^3.842.0",
     "glob": "^11.0.0",
+    "html-to-text": "^9.0.5",
     "js-tiktoken": "^1.0.20",
     "liteque": "^0.3.2",
     "meilisearch": "^0.37.0",
@@ -20,6 +21,7 @@
   "devDependencies": {
     "@karakeep/prettier-config": "workspace:^0.1.0",
     "@karakeep/tsconfig": "workspace:^0.1.0",
+    "@types/html-to-text": "^9.0.4",
     "vitest": "^1.6.1"
   },
   "scripts": {
diff --git a/packages/shared/types/bookmarks.ts b/packages/shared/types/bookmarks.ts
index ea1ab717..f648bce5 100644
--- a/packages/shared/types/bookmarks.ts
+++ b/packages/shared/types/bookmarks.ts
@@ -16,6 +16,7 @@ export const zSortOrder = z.enum(["asc", "desc", "relevance"]);
 export type ZSortOrder = z.infer<typeof zSortOrder>;
 
 export const zAssetTypesSchema = z.enum([
+  "linkHtmlContent",
   "screenshot",
   "assetScreenshot",
   "bannerImage",
@@ -45,6 +46,7 @@ export const zBookmarkedLinkSchema = z.object({
   videoAssetId: z.string().nullish(),
   favicon: z.string().nullish(),
   htmlContent: z.string().nullish(),
+  contentAssetId: z.string().nullish(),
   crawledAt: z.date().nullish(),
   author: z.string().nullish(),
   publisher: z.string().nullish(),
diff --git a/packages/shared/utils/htmlUtils.ts b/packages/shared/utils/htmlUtils.ts
new file mode 100644
index 00000000..60272899
--- /dev/null
+++ b/packages/shared/utils/htmlUtils.ts
@@ -0,0 +1,17 @@
+import { compile } from "html-to-text";
+
+const compiledConvert = compile({
+  selectors: [{ selector: "img", format: "skip" }],
+});
+
+/**
+ * Converts HTML content to plain text
+ */
+export function htmlToPlainText(htmlContent: string): string {
+  if (!htmlContent) {
+    return "";
+  }
+
+  // TODO, we probably should also remove singlefile inline images from the content
+  return compiledConvert(htmlContent);
+}
-- 
cgit v1.2.3-70-g09d2