diff options
Diffstat (limited to 'packages')
| -rw-r--r-- | packages/shared/prompts.ts | 126 | ||||
| -rw-r--r-- | packages/trpc/routers/bookmarks.ts | 2 |
2 files changed, 101 insertions, 27 deletions
diff --git a/packages/shared/prompts.ts b/packages/shared/prompts.ts index 0b79eb9a..5a6a705e 100644 --- a/packages/shared/prompts.ts +++ b/packages/shared/prompts.ts @@ -1,6 +1,19 @@ -import { getEncoding } from "js-tiktoken"; +import type { Tiktoken } from "js-tiktoken"; -const encoding = getEncoding("o200k_base"); +let encoding: Tiktoken | null = null; + +/** + * Lazy load the encoding to avoid loading the tiktoken data into memory + * until it's actually needed + */ +async function getEncodingInstance(): Promise<Tiktoken> { + if (!encoding) { + // Dynamic import to lazy load the tiktoken module + const { getEncoding } = await import("js-tiktoken"); + encoding = getEncoding("o200k_base"); + } + return encoding; +} /** * Remove duplicate whitespaces to avoid tokenization issues @@ -9,17 +22,22 @@ function preprocessContent(content: string) { return content.replace(/(\s){10,}/g, "$1"); } -function calculateNumTokens(text: string) { - return encoding.encode(text).length; +async function calculateNumTokens(text: string): Promise<number> { + const enc = await getEncodingInstance(); + return enc.encode(text).length; } -function truncateContent(content: string, length: number) { - const tokens = encoding.encode(content); +async function truncateContent( + content: string, + length: number, +): Promise<string> { + const enc = await getEncodingInstance(); + const tokens = enc.encode(content); if (tokens.length <= length) { return content; } const truncatedTokens = tokens.slice(0, length); - return encoding.decode(truncatedTokens); + return enc.decode(truncatedTokens); } export function buildImagePrompt(lang: string, customPrompts: string[]) { @@ -35,14 +53,15 @@ ${customPrompts && customPrompts.map((p) => `- ${p}`).join("\n")} You must respond in valid JSON with the key "tags" and the value is list of tags. Don't wrap the response in a markdown code.`; } -export function buildTextPrompt( +/** + * Construct tagging prompt for text content + */ +function constructTextTaggingPrompt( lang: string, customPrompts: string[], content: string, - contextLength: number, -) { - content = preprocessContent(content); - const constructPrompt = (c: string) => ` +): string { + return ` You are an expert whose responsibility is to help with automatic tagging for a read-it-later app. Please analyze the TEXT_CONTENT below and suggest relevant tags that describe its key themes, topics, and main ideas. The rules are: - Aim for a variety of tags, including broad categories, specific keywords, and potential sub-genres. @@ -54,30 +73,85 @@ Please analyze the TEXT_CONTENT below and suggest relevant tags that describe it ${customPrompts && customPrompts.map((p) => `- ${p}`).join("\n")} <TEXT_CONTENT> -${c} +${content} </TEXT_CONTENT> You must respond in JSON with the key "tags" and the value is an array of string tags.`; - - const promptSize = calculateNumTokens(constructPrompt("")); - const truncatedContent = truncateContent(content, contextLength - promptSize); - return constructPrompt(truncatedContent); } -export function buildSummaryPrompt( +/** + * Construct summary prompt + */ +function constructSummaryPrompt( lang: string, customPrompts: string[], content: string, - contextLength: number, -) { - content = preprocessContent(content); - const constructPrompt = (c: string) => ` +): string { + return ` Summarize the following content responding ONLY with the summary. You MUST follow the following rules: - Summary must be in 3-4 sentences. - The summary must be in ${lang}. ${customPrompts && customPrompts.map((p) => `- ${p}`).join("\n")} - ${c}`; + ${content}`; +} + +/** + * Build text tagging prompt without truncation (for previews/UI) + */ +export function buildTextPromptUntruncated( + lang: string, + customPrompts: string[], + content: string, +): string { + return constructTextTaggingPrompt( + lang, + customPrompts, + preprocessContent(content), + ); +} + +export async function buildTextPrompt( + lang: string, + customPrompts: string[], + content: string, + contextLength: number, +): Promise<string> { + content = preprocessContent(content); + const promptTemplate = constructTextTaggingPrompt(lang, customPrompts, ""); + const promptSize = await calculateNumTokens(promptTemplate); + const truncatedContent = await truncateContent( + content, + contextLength - promptSize, + ); + return constructTextTaggingPrompt(lang, customPrompts, truncatedContent); +} + +export async function buildSummaryPrompt( + lang: string, + customPrompts: string[], + content: string, + contextLength: number, +): Promise<string> { + content = preprocessContent(content); + const promptTemplate = constructSummaryPrompt(lang, customPrompts, ""); + const promptSize = await calculateNumTokens(promptTemplate); + const truncatedContent = await truncateContent( + content, + contextLength - promptSize, + ); + return constructSummaryPrompt(lang, customPrompts, truncatedContent); +} - const promptSize = calculateNumTokens(constructPrompt("")); - const truncatedContent = truncateContent(content, contextLength - promptSize); - return constructPrompt(truncatedContent); +/** + * Build summary prompt without truncation (for previews/UI) + */ +export function buildSummaryPromptUntruncated( + lang: string, + customPrompts: string[], + content: string, +): string { + return constructSummaryPrompt( + lang, + customPrompts, + preprocessContent(content), + ); } diff --git a/packages/trpc/routers/bookmarks.ts b/packages/trpc/routers/bookmarks.ts index 05ffa240..f68d5ada 100644 --- a/packages/trpc/routers/bookmarks.ts +++ b/packages/trpc/routers/bookmarks.ts @@ -954,7 +954,7 @@ Author: ${bookmark.author ?? ""} }, }); - const summaryPrompt = buildSummaryPrompt( + const summaryPrompt = await buildSummaryPrompt( serverConfig.inference.inferredTagLang, prompts.map((p) => p.text), bookmarkDetails, |
