From 426beff15aad0c164e81996b3f6754fdc6ecfc29 Mon Sep 17 00:00:00 2001 From: Mohamed Bassem Date: Sat, 21 Jun 2025 12:43:46 +0000 Subject: fix: Collapse long runs of repeated whitespaces before tokenization to avoid choking the tokenizer. Fixes #1622 --- packages/shared/prompts.ts | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'packages') diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts index b7957dd4..7ee6f12a 100644 --- a/packages/shared/prompts.ts +++ b/packages/shared/prompts.ts @@ -2,6 +2,13 @@ import { getEncoding } from "js-tiktoken"; const encoding = getEncoding("o200k_base"); +/** + * Collapse runs of 10 or more consecutive whitespace characters into a single whitespace character to avoid tokenization issues + */ +function preprocessContent(content: string) { + return content.replace(/(\s){10,}/g, "$1"); +} + function calculateNumTokens(text: string) { return encoding.encode(text).length; } @@ -31,6 +38,7 @@ export function buildTextPrompt( content: string, contextLength: number, ) { + content = preprocessContent(content); const constructPrompt = (c: string) => ` You are a bot in a read-it-later app and your responsibility is to help with automatic tagging. Please analyze the text between the sentences "CONTENT START HERE" and "CONTENT END HERE" and suggest relevant tags that describe its key themes, topics, and main ideas. The rules are: @@ -57,6 +65,7 @@ export function buildSummaryPrompt( content: string, contextLength: number, ) { + content = preprocessContent(content); const constructPrompt = (c: string) => ` Summarize the following content responding ONLY with the summary. You MUST follow the following rules: - Summary must be in 3-4 sentences. -- cgit v1.2.3-70-g09d2