diff options
| author | Mohamed Bassem <me@mbassem.com> | 2025-05-18 13:43:48 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2025-05-18 13:43:48 +0000 |
| commit | 70d572096706cb2d7f5d3f29b51e6ba1d86578c6 (patch) | |
| tree | 867d9305ef00733a55e6954aeb004a7f9f3f7c5f | |
| parent | 523a251b6f330f72adb13119a2577a4220d5a078 (diff) | |
| download | karakeep-70d572096706cb2d7f5d3f29b51e6ba1d86578c6.tar.zst | |
fix: Use proper tokenizer when truncating for inference. Fixes #1405
| -rw-r--r-- | packages/shared/package.json | 1 | ||||
| -rw-r--r-- | packages/shared/prompts.ts | 16 | ||||
| -rw-r--r-- | pnpm-lock.yaml | 12 |
3 files changed, 21 insertions, 8 deletions
```diff
diff --git a/packages/shared/package.json b/packages/shared/package.json
index a262267a..478fb77d 100644
--- a/packages/shared/package.json
+++ b/packages/shared/package.json
@@ -6,6 +6,7 @@
   "type": "module",
   "dependencies": {
     "glob": "^11.0.0",
+    "js-tiktoken": "^1.0.20",
     "liteque": "^0.3.2",
     "meilisearch": "^0.37.0",
     "ollama": "^0.5.14",
diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts
index 40e706b2..b7957dd4 100644
--- a/packages/shared/prompts.ts
+++ b/packages/shared/prompts.ts
@@ -1,15 +1,15 @@
-// TODO: Use a proper tokenizer
+import { getEncoding } from "js-tiktoken";
+
+const encoding = getEncoding("o200k_base");
+
 function calculateNumTokens(text: string) {
-  return text.split(" ").length;
+  return encoding.encode(text).length;
 }
 
 function truncateContent(content: string, length: number) {
-  let words = content.split(" ");
-  if (words.length > length) {
-    words = words.slice(0, length);
-    content = words.join(" ");
-  }
-  return content;
+  const tokens = encoding.encode(content);
+  const truncatedTokens = tokens.slice(0, length);
+  return encoding.decode(truncatedTokens);
 }
 
 export function buildImagePrompt(lang: string, customPrompts: string[]) {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 42850157..36cc420a 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1092,6 +1092,9 @@ importers:
       glob:
         specifier: ^11.0.0
         version: 11.0.0
+      js-tiktoken:
+        specifier: ^1.0.20
+        version: 1.0.20
       liteque:
         specifier: ^0.3.2
         version: 0.3.2(better-sqlite3@11.3.0)
@@ -10055,6 +10058,9 @@ packages:
     resolution: {integrity: sha512-X2BB11YZtrRqY4EnQcLX5Rh373zbK4alC1FW7D7MBhL2gtcC17cTnr6DmfHZeS0s2rTHjUTMMHfG7gO8SSdw+g==}
     engines: {node: '>=0.10.0'}
 
+  js-tiktoken@1.0.20:
+    resolution: {integrity: sha512-Xlaqhhs8VfCd6Sh7a1cFkZHQbYTLCwVJJWiHVxBYzLPxW0XsoxBy1hitmjkdIjD3Aon5BXLHFwU5O8WUx6HH+A==}
+
   js-tokens@4.0.0:
     resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==}
 
@@ -11350,6 +11356,7 @@ packages:
   node-domexception@1.0.0:
     resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==}
     engines: {node: '>=10.5.0'}
+    deprecated: Use your platform's native DOMException instead
 
   node-emoji@2.1.3:
     resolution: {integrity: sha512-E2WEOVsgs7O16zsURJ/eH8BqhF029wGpEOnv7Urwdo2wmQanOACwJQh0devF9D9RhoZru0+9JXIS0dBXIAz+lA==}
@@ -27540,6 +27547,11 @@ snapshots:
 
   js-levenshtein@1.1.6: {}
 
+  js-tiktoken@1.0.20:
+    dependencies:
+      base64-js: 1.5.1
+    dev: false
+
   js-tokens@4.0.0: {}
 
   js-tokens@8.0.3:
```
