about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Mohamed Bassem <me@mbassem.com> 2025-05-18 13:43:48 +0000
committer Mohamed Bassem <me@mbassem.com> 2025-05-18 13:43:48 +0000
commit 70d572096706cb2d7f5d3f29b51e6ba1d86578c6 (patch)
tree 867d9305ef00733a55e6954aeb004a7f9f3f7c5f
parent 523a251b6f330f72adb13119a2577a4220d5a078 (diff)
download karakeep-70d572096706cb2d7f5d3f29b51e6ba1d86578c6.tar.zst
fix: Use proper tokenizer when truncating for inference. Fixes #1405
-rw-r--r-- packages/shared/package.json 1
-rw-r--r-- packages/shared/prompts.ts 16
-rw-r--r-- pnpm-lock.yaml 12
3 files changed, 21 insertions, 8 deletions
diff --git a/packages/shared/package.json b/packages/shared/package.json
index a262267a..478fb77d 100644
--- a/packages/shared/package.json
+++ b/packages/shared/package.json
@@ -6,6 +6,7 @@
"type": "module",
"dependencies": {
"glob": "^11.0.0",
+ "js-tiktoken": "^1.0.20",
"liteque": "^0.3.2",
"meilisearch": "^0.37.0",
"ollama": "^0.5.14",
diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts
index 40e706b2..b7957dd4 100644
--- a/packages/shared/prompts.ts
+++ b/packages/shared/prompts.ts
@@ -1,15 +1,15 @@
-// TODO: Use a proper tokenizer
+import { getEncoding } from "js-tiktoken";
+
+const encoding = getEncoding("o200k_base");
+
function calculateNumTokens(text: string) {
- return text.split(" ").length;
+ return encoding.encode(text).length;
}
function truncateContent(content: string, length: number) {
- let words = content.split(" ");
- if (words.length > length) {
- words = words.slice(0, length);
- content = words.join(" ");
- }
- return content;
+ const tokens = encoding.encode(content);
+ const truncatedTokens = tokens.slice(0, length);
+ return encoding.decode(truncatedTokens);
}
export function buildImagePrompt(lang: string, customPrompts: string[]) {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 42850157..36cc420a 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1092,6 +1092,9 @@ importers:
glob:
specifier: ^11.0.0
version: 11.0.0
+ js-tiktoken:
+ specifier: ^1.0.20
+ version: 1.0.20
liteque:
specifier: ^0.3.2
version: 0.3.2(better-sqlite3@11.3.0)
@@ -10055,6 +10058,9 @@ packages:
resolution: {integrity: sha512-X2BB11YZtrRqY4EnQcLX5Rh373zbK4alC1FW7D7MBhL2gtcC17cTnr6DmfHZeS0s2rTHjUTMMHfG7gO8SSdw+g==}
engines: {node: '>=0.10.0'}
+ js-tiktoken@1.0.20:
+ resolution: {integrity: sha512-Xlaqhhs8VfCd6Sh7a1cFkZHQbYTLCwVJJWiHVxBYzLPxW0XsoxBy1hitmjkdIjD3Aon5BXLHFwU5O8WUx6HH+A==}
+
js-tokens@4.0.0:
resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==}
@@ -11350,6 +11356,7 @@ packages:
node-domexception@1.0.0:
resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==}
engines: {node: '>=10.5.0'}
+ deprecated: Use your platform's native DOMException instead
node-emoji@2.1.3:
resolution: {integrity: sha512-E2WEOVsgs7O16zsURJ/eH8BqhF029wGpEOnv7Urwdo2wmQanOACwJQh0devF9D9RhoZru0+9JXIS0dBXIAz+lA==}
@@ -27540,6 +27547,11 @@ snapshots:
js-levenshtein@1.1.6: {}
+ js-tiktoken@1.0.20:
+ dependencies:
+ base64-js: 1.5.1
+ dev: false
+
js-tokens@4.0.0: {}
js-tokens@8.0.3: