-rw-r--r--  docs/docs/03-configuration.md  1
-rw-r--r--  packages/shared/config.ts      2
-rw-r--r--  packages/shared/inference.ts   4
3 files changed, 6 insertions, 1 deletion
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 54762ee6..a1c46b7d 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -89,6 +89,7 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
| INFERENCE_IMAGE_MODEL | No | gpt-4o-mini | The model to use for image inference. You'll need to change this to some other model if you're using ollama and that model needs to support vision APIs (e.g. llava). |
| EMBEDDING_TEXT_MODEL | No | text-embedding-3-small | The model to be used for generating embeddings for the text. |
| INFERENCE_CONTEXT_LENGTH | No | 2048 | The max number of tokens that we'll pass to the inference model. If your content is larger than this size, it'll be truncated to fit. The larger this value, the more of the content will be used in tag inference, but the more expensive the inference will be (money-wise on openAI and resource-wise on ollama). Check the model you're using for its max supported content size. |
+| INFERENCE_MAX_OUTPUT_TOKENS | No | 2048 | The maximum number of tokens that the inference model is allowed to generate in its response. This controls the length of AI-generated content like tags and summaries. Increase this if you need longer responses, but be aware that higher values will increase costs (for OpenAI) and processing time. |
| INFERENCE_LANG | No | english | The language in which the tags will be generated. |
| INFERENCE_NUM_WORKERS | No | 1 | Number of concurrent workers for AI inference tasks (tagging and summarization). Increase this if you have multiple AI inference requests and want to process them in parallel. |
| INFERENCE_ENABLE_AUTO_TAGGING | No | true | Whether automatic AI tagging is enabled or disabled. |
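Note on how the two limits relate: `INFERENCE_CONTEXT_LENGTH` bounds how much of the bookmark content is sent to the model, while the new `INFERENCE_MAX_OUTPUT_TOKENS` bounds how long the model's reply (tags, summary) may be. The sketch below is a hypothetical illustration only, not code from this repository; the `approxTokens` helper and the 4-characters-per-token estimate are assumptions for clarity.

```ts
// Hypothetical illustration only; the real logic lives in packages/shared.
interface InferenceLimits {
  contextLength: number; // INFERENCE_CONTEXT_LENGTH: cap on input tokens
  maxOutputTokens: number; // INFERENCE_MAX_OUTPUT_TOKENS: cap on response tokens
}

// Assumed rough estimate: ~4 characters per token.
const approxTokens = (text: string) => Math.ceil(text.length / 4);

function buildRequest(content: string, limits: InferenceLimits) {
  // Input side: oversized content is truncated so it fits the context budget.
  const maxChars = limits.contextLength * 4;
  const prompt =
    approxTokens(content) > limits.contextLength
      ? content.slice(0, maxChars)
      : content;

  // Output side: the cap is sent with the request (max_tokens / num_predict),
  // so the provider stops generating once the response reaches it.
  return { prompt, maxOutputTokens: limits.maxOutputTokens };
}
```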
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index a18482c8..ea90ffcb 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -38,6 +38,7 @@ const allEnv = z.object({
INFERENCE_IMAGE_MODEL: z.string().default("gpt-4o-mini"),
EMBEDDING_TEXT_MODEL: z.string().default("text-embedding-3-small"),
INFERENCE_CONTEXT_LENGTH: z.coerce.number().default(2048),
+ INFERENCE_MAX_OUTPUT_TOKENS: z.coerce.number().default(2048),
INFERENCE_SUPPORTS_STRUCTURED_OUTPUT: optionalStringBool(),
INFERENCE_OUTPUT_SCHEMA: z
.enum(["structured", "json", "plain"])
@@ -190,6 +191,7 @@ const serverConfigSchema = allEnv
imageModel: val.INFERENCE_IMAGE_MODEL,
inferredTagLang: val.INFERENCE_LANG,
contextLength: val.INFERENCE_CONTEXT_LENGTH,
+ maxOutputTokens: val.INFERENCE_MAX_OUTPUT_TOKENS,
outputSchema:
val.INFERENCE_SUPPORTS_STRUCTURED_OUTPUT !== undefined
? val.INFERENCE_SUPPORTS_STRUCTURED_OUTPUT
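For reference, a minimal sketch of the zod behaviour the new schema line relies on: `z.coerce.number()` converts the env string into a number, and `.default(2048)` supplies the value when the variable is unset.

```ts
import { z } from "zod";

// Standalone sketch of the coercion used above.
const schema = z.object({
  INFERENCE_MAX_OUTPUT_TOKENS: z.coerce.number().default(2048),
});

schema.parse({}); // => { INFERENCE_MAX_OUTPUT_TOKENS: 2048 }
schema.parse({ INFERENCE_MAX_OUTPUT_TOKENS: "4096" }); // => { INFERENCE_MAX_OUTPUT_TOKENS: 4096 }
```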
diff --git a/packages/shared/inference.ts b/packages/shared/inference.ts
index 04fa8cfc..41026fbd 100644
--- a/packages/shared/inference.ts
+++ b/packages/shared/inference.ts
@@ -90,6 +90,7 @@ class OpenAIInferenceClient implements InferenceClient {
{
messages: [{ role: "user", content: prompt }],
model: serverConfig.inference.textModel,
+ max_tokens: serverConfig.inference.maxOutputTokens,
response_format: mapInferenceOutputSchema(
{
structured: optsWithDefaults.schema
@@ -126,6 +127,7 @@ class OpenAIInferenceClient implements InferenceClient {
const chatCompletion = await this.openAI.chat.completions.create(
{
model: serverConfig.inference.imageModel,
+ max_tokens: serverConfig.inference.maxOutputTokens,
response_format: mapInferenceOutputSchema(
{
structured: optsWithDefaults.schema
@@ -151,7 +153,6 @@ class OpenAIInferenceClient implements InferenceClient {
],
},
],
- max_tokens: 2000,
},
{
signal: optsWithDefaults.abortSignal,
@@ -224,6 +225,7 @@ class OllamaInferenceClient implements InferenceClient {
keep_alive: serverConfig.inference.ollamaKeepAlive,
options: {
num_ctx: serverConfig.inference.contextLength,
+ num_predict: serverConfig.inference.maxOutputTokens,
},
messages: [
{ role: "user", content: prompt, images: image ? [image] : undefined },