about | summary | refs | log | tree | commit | diff | stats
path: root/packages/shared
diff options
context:
space:
mode:
author: Mohamed Bassem <me@mbassem.com> 2025-05-18 13:43:48 +0000
committer: Mohamed Bassem <me@mbassem.com> 2025-05-18 13:43:48 +0000
commit: 70d572096706cb2d7f5d3f29b51e6ba1d86578c6 (patch)
tree: 867d9305ef00733a55e6954aeb004a7f9f3f7c5f /packages/shared
parent: 523a251b6f330f72adb13119a2577a4220d5a078 (diff)
download: karakeep-70d572096706cb2d7f5d3f29b51e6ba1d86578c6.tar.zst
fix: Use proper tokenizer when truncating for inference. Fixes #1405
Diffstat (limited to 'packages/shared')
-rw-r--r-- packages/shared/package.json 1
-rw-r--r-- packages/shared/prompts.ts 16
2 files changed, 9 insertions, 8 deletions
diff --git a/packages/shared/package.json b/packages/shared/package.json
index a262267a..478fb77d 100644
--- a/packages/shared/package.json
+++ b/packages/shared/package.json
@@ -6,6 +6,7 @@
"type": "module",
"dependencies": {
"glob": "^11.0.0",
+ "js-tiktoken": "^1.0.20",
"liteque": "^0.3.2",
"meilisearch": "^0.37.0",
"ollama": "^0.5.14",
diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts
index 40e706b2..b7957dd4 100644
--- a/packages/shared/prompts.ts
+++ b/packages/shared/prompts.ts
@@ -1,15 +1,15 @@
-// TODO: Use a proper tokenizer
+import { getEncoding } from "js-tiktoken";
+
+const encoding = getEncoding("o200k_base");
+
function calculateNumTokens(text: string) {
- return text.split(" ").length;
+ return encoding.encode(text).length;
}
function truncateContent(content: string, length: number) {
- let words = content.split(" ");
- if (words.length > length) {
- words = words.slice(0, length);
- content = words.join(" ");
- }
- return content;
+ const tokens = encoding.encode(content);
+ const truncatedTokens = tokens.slice(0, length);
+ return encoding.decode(truncatedTokens);
}
export function buildImagePrompt(lang: string, customPrompts: string[]) {