about | summary | refs | log | tree | commit | diff | stats
path: root/packages
diff options
context:
space:
mode:
author: Mohamed Bassem <me@mbassem.com> 2025-06-21 12:43:46 +0000
committer: Mohamed Bassem <me@mbassem.com> 2025-06-21 12:43:46 +0000
commit: 426beff15aad0c164e81996b3f6754fdc6ecfc29 (patch)
tree: c8ae6772691fd7494420430ed0798a5606c0f189 /packages
parent: 10d45e8d14cdc3672cc65dc7f5ae79e63fb2da1a (diff)
download: karakeep-426beff15aad0c164e81996b3f6754fdc6ecfc29.tar.zst
fix: Collapse long runs of repeated whitespaces before tokenization to avoid choking the tokenizer. Fixes #1622
Diffstat (limited to 'packages')
-rw-r--r-- packages/shared/prompts.ts 9
1 files changed, 9 insertions, 0 deletions
diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts
index b7957dd4..7ee6f12a 100644
--- a/packages/shared/prompts.ts
+++ b/packages/shared/prompts.ts
@@ -2,6 +2,13 @@ import { getEncoding } from "js-tiktoken";
// Module-level tokenizer obtained from js-tiktoken for the "o200k_base" encoding.
const encoding = getEncoding("o200k_base");
+/**
+ * Collapse each run of 10 or more consecutive whitespace characters into a
+ * single whitespace character (the last one of the run, since `$1` refers to
+ * the final iteration of the repeated capture group). Runs shorter than 10
+ * are left untouched. This is done before tokenization so that very long
+ * whitespace runs do not choke the tokenizer.
+ */
+function preprocessContent(content: string) {
+ return content.replace(/(\s){10,}/g, "$1");
+}
+
/**
 * Returns the number of tokens produced by encoding `text` with the
 * module-level `encoding`.
 */
function calculateNumTokens(text: string) {
return encoding.encode(text).length;
}
@@ -31,6 +38,7 @@ export function buildTextPrompt(
content: string,
contextLength: number,
) {
+ content = preprocessContent(content);
const constructPrompt = (c: string) => `
You are a bot in a read-it-later app and your responsibility is to help with automatic tagging.
Please analyze the text between the sentences "CONTENT START HERE" and "CONTENT END HERE" and suggest relevant tags that describe its key themes, topics, and main ideas. The rules are:
@@ -57,6 +65,7 @@ export function buildSummaryPrompt(
content: string,
contextLength: number,
) {
+ content = preprocessContent(content);
const constructPrompt = (c: string) => `
Summarize the following content responding ONLY with the summary. You MUST follow the following rules:
- Summary must be in 3-4 sentences.