diff options
Diffstat (limited to 'packages/shared')
| -rw-r--r-- | packages/shared/package.json | 2 | ||||
| -rw-r--r-- | packages/shared/types/bookmarks.ts | 2 | ||||
| -rw-r--r-- | packages/shared/utils/htmlUtils.ts | 17 |
3 files changed, 21 insertions, 0 deletions
diff --git a/packages/shared/package.json b/packages/shared/package.json index f4e521b6..6f22865f 100644 --- a/packages/shared/package.json +++ b/packages/shared/package.json @@ -7,6 +7,7 @@ "dependencies": { "@aws-sdk/client-s3": "^3.842.0", "glob": "^11.0.0", + "html-to-text": "^9.0.5", "js-tiktoken": "^1.0.20", "liteque": "^0.3.2", "meilisearch": "^0.37.0", @@ -20,6 +21,7 @@ "devDependencies": { "@karakeep/prettier-config": "workspace:^0.1.0", "@karakeep/tsconfig": "workspace:^0.1.0", + "@types/html-to-text": "^9.0.4", "vitest": "^1.6.1" }, "scripts": { diff --git a/packages/shared/types/bookmarks.ts b/packages/shared/types/bookmarks.ts index ea1ab717..f648bce5 100644 --- a/packages/shared/types/bookmarks.ts +++ b/packages/shared/types/bookmarks.ts @@ -16,6 +16,7 @@ export const zSortOrder = z.enum(["asc", "desc", "relevance"]); export type ZSortOrder = z.infer<typeof zSortOrder>; export const zAssetTypesSchema = z.enum([ + "linkHtmlContent", "screenshot", "assetScreenshot", "bannerImage", @@ -45,6 +46,7 @@ export const zBookmarkedLinkSchema = z.object({ videoAssetId: z.string().nullish(), favicon: z.string().nullish(), htmlContent: z.string().nullish(), + contentAssetId: z.string().nullish(), crawledAt: z.date().nullish(), author: z.string().nullish(), publisher: z.string().nullish(), diff --git a/packages/shared/utils/htmlUtils.ts b/packages/shared/utils/htmlUtils.ts new file mode 100644 index 00000000..60272899 --- /dev/null +++ b/packages/shared/utils/htmlUtils.ts @@ -0,0 +1,17 @@ +import { compile } from "html-to-text"; + +const compiledConvert = compile({ + selectors: [{ selector: "img", format: "skip" }], +}); + +/** + * Converts HTML content to plain text + */ +export function htmlToPlainText(htmlContent: string): string { + if (!htmlContent) { + return ""; + } + + // TODO, we probably should also remove singlefile inline images from the content + return compiledConvert(htmlContent); +} |
