| author | Mohamed Bassem <me@mbassem.com> | 2025-05-18 13:43:48 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2025-05-18 13:43:48 +0000 |
| commit | 70d572096706cb2d7f5d3f29b51e6ba1d86578c6 (patch) | |
| tree | 867d9305ef00733a55e6954aeb004a7f9f3f7c5f /packages/shared/prompts.ts | |
| parent | 523a251b6f330f72adb13119a2577a4220d5a078 (diff) | |
| download | karakeep-70d572096706cb2d7f5d3f29b51e6ba1d86578c6.tar.zst | |
fix: Use proper tokenizer when truncating for inference. Fixes #1405
Diffstat (limited to 'packages/shared/prompts.ts')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | packages/shared/prompts.ts | 16 |
1 file changed, 8 insertions, 8 deletions
```diff
diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts
index 40e706b2..b7957dd4 100644
--- a/packages/shared/prompts.ts
+++ b/packages/shared/prompts.ts
@@ -1,15 +1,15 @@
-// TODO: Use a proper tokenizer
+import { getEncoding } from "js-tiktoken";
+
+const encoding = getEncoding("o200k_base");
+
 function calculateNumTokens(text: string) {
-  return text.split(" ").length;
+  return encoding.encode(text).length;
 }
 
 function truncateContent(content: string, length: number) {
-  let words = content.split(" ");
-  if (words.length > length) {
-    words = words.slice(0, length);
-    content = words.join(" ");
-  }
-  return content;
+  const tokens = encoding.encode(content);
+  const truncatedTokens = tokens.slice(0, length);
+  return encoding.decode(truncatedTokens);
 }
 
 export function buildImagePrompt(lang: string, customPrompts: string[]) {
```
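For context, below is a minimal, self-contained sketch of the token-based truncation this commit introduces. The `calculateNumTokens` and `truncateContent` bodies mirror the patched functions above; the trailing usage example (the sample string and the 50-token limit) is illustrative and not taken from the repository.

```typescript
import { getEncoding } from "js-tiktoken";

// Same encoding as in the patch. o200k_base matches the tokenizer used by
// newer OpenAI models, so counts line up with the model's context accounting.
const encoding = getEncoding("o200k_base");

// Count tokens with the real tokenizer instead of approximating with
// whitespace-separated words.
function calculateNumTokens(text: string): number {
  return encoding.encode(text).length;
}

// Truncate by token count: encode, slice the token array, decode back to a
// string. Unlike word-based slicing, this respects the model's token limits.
function truncateContent(content: string, length: number): string {
  const tokens = encoding.encode(content);
  const truncatedTokens = tokens.slice(0, length);
  return encoding.decode(truncatedTokens);
}

// Illustrative usage (values are hypothetical):
const sample = "Some long page content extracted for inference. ".repeat(20);
console.log(calculateNumTokens(sample)); // token count, not word count
console.log(truncateContent(sample, 50)); // first 50 tokens, decoded
```

The key difference from the previous word-splitting approach is that the truncation budget is now expressed in the same units the model itself uses, which is what issue #1405 called for.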
