Diffstat
-rw-r--r--  apps/workers/openaiWorker.ts        9
-rw-r--r--  docs/docs/03-configuration.md      27
-rw-r--r--  packages/shared/config.ts           2
-rw-r--r--  packages/shared/inference.ts       30
-rw-r--r--  packages/shared/package.json        7
-rw-r--r--  packages/trpc/routers/bookmarks.ts  2
-rw-r--r--  pnpm-lock.yaml                     36

7 files changed, 74 insertions(+), 39 deletions(-)
diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts
index ec5681c6..f8a775e6 100644
--- a/apps/workers/openaiWorker.ts
+++ b/apps/workers/openaiWorker.ts
@@ -162,7 +162,7 @@ async function inferTagsFromImage(
),
metadata.contentType,
base64,
- { json: true, abortSignal },
+ { schema: openAIResponseSchema, abortSignal },
);
}
@@ -235,7 +235,10 @@ async function inferTagsFromPDF(
`Content: ${bookmark.asset.content}`,
serverConfig.inference.contextLength,
);
- return inferenceClient.inferFromText(prompt, { json: true, abortSignal });
+ return inferenceClient.inferFromText(prompt, {
+ schema: openAIResponseSchema,
+ abortSignal,
+ });
}
async function inferTagsFromText(
@@ -244,7 +247,7 @@ async function inferTagsFromText(
abortSignal: AbortSignal,
) {
return await inferenceClient.inferFromText(await buildPrompt(bookmark), {
- json: true,
+ schema: openAIResponseSchema,
abortSignal,
});
}
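
All three tagging paths (image, PDF, plain text) now hand the inference client a Zod schema instead of the bare `json: true` flag. `openAIResponseSchema` is defined elsewhere in the worker and is not shown in this diff; as a sketch only, a tag-inference response schema would plausibly look like this (the field name is an assumption, not the actual definition):

```ts
import { z } from "zod";

// Hypothetical stand-in for the real openAIResponseSchema defined elsewhere
// in apps/workers/openaiWorker.ts: the model must answer with a JSON object
// whose "tags" field is an array of strings.
const openAIResponseSchema = z.object({
  tags: z.array(z.string()),
});
```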
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index de433c07..27f9f14e 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -48,19 +48,20 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
- You might want to tune the `INFERENCE_CONTEXT_LENGTH` as the default is quite small. The larger the value, the better the quality of the tags, but the more expensive the inference will be (money-wise on OpenAI and resource-wise on ollama).
:::
-| Name | Required | Default | Description |
-| --------------------------- | -------- | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| OPENAI_API_KEY | No | Not set | The OpenAI key used for automatic tagging. More on that in [here](/openai). |
-| OPENAI_BASE_URL | No | Not set | If you just want to use OpenAI you don't need to pass this variable. If, however, you want to use some other openai compatible API (e.g. azure openai service), set this to the url of the API. |
-| OLLAMA_BASE_URL | No | Not set | If you want to use ollama for local inference, set the address of ollama API here. |
-| OLLAMA_KEEP_ALIVE | No | Not set | Controls how long the model will stay loaded into memory following the request (example value: "5m"). |
-| INFERENCE_TEXT_MODEL | No | gpt-4o-mini | The model to use for text inference. You'll need to change this to some other model if you're using ollama. |
-| INFERENCE_IMAGE_MODEL | No | gpt-4o-mini | The model to use for image inference. You'll need to change this to some other model if you're using ollama and that model needs to support vision APIs (e.g. llava). |
-| EMBEDDING_TEXT_MODEL | No | text-embedding-3-small | The model to be used for generating embeddings for the text. |
-| INFERENCE_CONTEXT_LENGTH | No | 2048 | The max number of tokens that we'll pass to the inference model. If your content is larger than this size, it'll be truncated to fit. The larger this value, the more of the content will be used in tag inference, but the more expensive the inference will be (money-wise on openAI and resource-wise on ollama). Check the model you're using for its max supported content size. |
-| INFERENCE_LANG | No | english | The language in which the tags will be generated. |
-| INFERENCE_JOB_TIMEOUT_SEC | No | 30 | How long to wait for the inference job to finish before timing out. If you're running ollama without powerful GPUs, you might want to increase the timeout a bit. |
-| INFERENCE_FETCH_TIMEOUT_SEC | No | 300 | \[Ollama Only\] The timeout of the fetch request to the ollama server. If your inference requests take longer than the default 5mins, you might want to increase this timeout. |
+| Name | Required | Default | Description |
+| ------------------------------------ | -------- | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| OPENAI_API_KEY | No | Not set | The OpenAI key used for automatic tagging. More on that in [here](/openai). |
+| OPENAI_BASE_URL | No | Not set | If you just want to use OpenAI you don't need to pass this variable. If, however, you want to use some other openai compatible API (e.g. azure openai service), set this to the url of the API. |
+| OLLAMA_BASE_URL | No | Not set | If you want to use ollama for local inference, set the address of ollama API here. |
+| OLLAMA_KEEP_ALIVE | No | Not set | Controls how long the model will stay loaded into memory following the request (example value: "5m"). |
+| INFERENCE_TEXT_MODEL | No | gpt-4o-mini | The model to use for text inference. You'll need to change this to some other model if you're using ollama. |
+| INFERENCE_IMAGE_MODEL | No | gpt-4o-mini | The model to use for image inference. You'll need to change this to some other model if you're using ollama and that model needs to support vision APIs (e.g. llava). |
+| EMBEDDING_TEXT_MODEL | No | text-embedding-3-small | The model to be used for generating embeddings for the text. |
+| INFERENCE_CONTEXT_LENGTH | No | 2048 | The max number of tokens that we'll pass to the inference model. If your content is larger than this size, it'll be truncated to fit. The larger this value, the more of the content will be used in tag inference, but the more expensive the inference will be (money-wise on openAI and resource-wise on ollama). Check the model you're using for its max supported content size. |
+| INFERENCE_LANG | No | english | The language in which the tags will be generated. |
+| INFERENCE_JOB_TIMEOUT_SEC | No | 30 | How long to wait for the inference job to finish before timing out. If you're running ollama without powerful GPUs, you might want to increase the timeout a bit. |
+| INFERENCE_FETCH_TIMEOUT_SEC | No | 300 | \[Ollama Only\] The timeout of the fetch request to the ollama server. If your inference requests take longer than the default 5mins, you might want to increase this timeout. |
+| INFERENCE_SUPPORTS_STRUCTURED_OUTPUT | No | true | Whether the inference model supports structured output or not. |
:::info
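
The only substantive change in the table is the new last row, `INFERENCE_SUPPORTS_STRUCTURED_OUTPUT` (default `true`); the other rows were merely re-aligned for the wider name column. Per the `inference.ts` hunks below, turning the flag off means no `response_format`/`format` constraint is sent at all, so the prompt alone has to elicit valid JSON. A sketch of an environment where you would disable it (host and model values are placeholders):

```
OLLAMA_BASE_URL=http://localhost:11434
INFERENCE_TEXT_MODEL=llama3.1
# Disable schema-constrained responses for backends that reject them:
INFERENCE_SUPPORTS_STRUCTURED_OUTPUT=false
```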
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 6e5a4404..1295fdbf 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -27,6 +27,7 @@ const allEnv = z.object({
INFERENCE_IMAGE_MODEL: z.string().default("gpt-4o-mini"),
EMBEDDING_TEXT_MODEL: z.string().default("text-embedding-3-small"),
INFERENCE_CONTEXT_LENGTH: z.coerce.number().default(2048),
+ INFERENCE_SUPPORTS_STRUCTURED_OUTPUT: stringBool("true"),
OCR_CACHE_DIR: z.string().optional(),
OCR_LANGS: z
.string()
@@ -94,6 +95,7 @@ const serverConfigSchema = allEnv.transform((val) => {
imageModel: val.INFERENCE_IMAGE_MODEL,
inferredTagLang: val.INFERENCE_LANG,
contextLength: val.INFERENCE_CONTEXT_LENGTH,
+ supportsStructuredOutput: val.INFERENCE_SUPPORTS_STRUCTURED_OUTPUT,
},
embedding: {
textModel: val.EMBEDDING_TEXT_MODEL,
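
`stringBool` is an existing helper in `config.ts` whose import sits outside this hunk. A minimal sketch of what it presumably does, assuming it maps a `"true"`/`"false"` environment string with a default onto a boolean:

```ts
import { z } from "zod";

// Assumed shape of the existing stringBool helper (not part of this diff):
// take a string default, accept only "true"/"false", and emit a boolean.
const stringBool = (defaultValue: string) =>
  z
    .string()
    .default(defaultValue)
    .refine((s) => s === "true" || s === "false", "expected true or false")
    .transform((s) => s === "true");
```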
diff --git a/packages/shared/inference.ts b/packages/shared/inference.ts
index 92d9dd94..43a14410 100644
--- a/packages/shared/inference.ts
+++ b/packages/shared/inference.ts
@@ -1,5 +1,8 @@
import { Ollama } from "ollama";
import OpenAI from "openai";
+import { zodResponseFormat } from "openai/helpers/zod";
+import { z } from "zod";
+import { zodToJsonSchema } from "zod-to-json-schema";
import serverConfig from "./config";
import { customFetch } from "./customFetch";
@@ -15,12 +18,13 @@ export interface EmbeddingResponse {
}
export interface InferenceOptions {
- json: boolean;
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ schema: z.ZodSchema<any> | null;
abortSignal?: AbortSignal;
}
const defaultInferenceOptions: InferenceOptions = {
- json: true,
+ schema: null,
};
export interface InferenceClient {
@@ -72,9 +76,11 @@ class OpenAIInferenceClient implements InferenceClient {
{
messages: [{ role: "user", content: prompt }],
model: serverConfig.inference.textModel,
- response_format: optsWithDefaults.json
- ? { type: "json_object" }
- : undefined,
+ response_format:
+ optsWithDefaults.schema &&
+ serverConfig.inference.supportsStructuredOutput
+ ? zodResponseFormat(optsWithDefaults.schema, "schema")
+ : undefined,
},
{
signal: optsWithDefaults.abortSignal,
@@ -101,9 +107,11 @@ class OpenAIInferenceClient implements InferenceClient {
const chatCompletion = await this.openAI.chat.completions.create(
{
model: serverConfig.inference.imageModel,
- response_format: optsWithDefaults.json
- ? { type: "json_object" }
- : undefined,
+ response_format:
+ optsWithDefaults.schema &&
+ serverConfig.inference.supportsStructuredOutput
+ ? zodResponseFormat(optsWithDefaults.schema, "schema")
+ : undefined,
messages: [
{
role: "user",
@@ -178,7 +186,11 @@ class OllamaInferenceClient implements InferenceClient {
}
const chatCompletion = await this.ollama.chat({
model: model,
- format: optsWithDefaults.json ? "json" : undefined,
+ format:
+ optsWithDefaults.schema &&
+ serverConfig.inference.supportsStructuredOutput
+ ? zodToJsonSchema(optsWithDefaults.schema)
+ : undefined,
stream: true,
keep_alive: serverConfig.inference.ollamaKeepAlive,
options: {
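
Both clients now receive the same `InferenceOptions.schema` but translate it into their backend's native constraint: the OpenAI SDK wraps the Zod schema into a `json_schema` `response_format` via `zodResponseFormat`, while Ollama's `format` field takes the raw JSON Schema document produced by `zod-to-json-schema`. Side by side, with a stand-in schema (illustrative only):

```ts
import { zodResponseFormat } from "openai/helpers/zod";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";

const tagSchema = z.object({ tags: z.array(z.string()) });

// OpenAI path: yields { type: "json_schema", json_schema: { name: "schema", ... } }.
const openAIFormat = zodResponseFormat(tagSchema, "schema");

// Ollama path: the plain JSON Schema object, passed through `format`.
const ollamaFormat = zodToJsonSchema(tagSchema);
```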
diff --git a/packages/shared/package.json b/packages/shared/package.json
index ecb16013..b868f9e3 100644
--- a/packages/shared/package.json
+++ b/packages/shared/package.json
@@ -8,11 +8,12 @@
"glob": "^11.0.0",
"liteque": "^0.3.2",
"meilisearch": "^0.37.0",
- "ollama": "^0.5.9",
- "openai": "^4.67.1",
+ "ollama": "^0.5.14",
+ "openai": "^4.86.1",
"typescript-parsec": "^0.3.4",
"winston": "^3.11.0",
- "zod": "^3.22.4"
+ "zod": "^3.22.4",
+ "zod-to-json-schema": "^3.24.3"
},
"devDependencies": {
"@hoarder/eslint-config": "workspace:^0.2.0",
diff --git a/packages/trpc/routers/bookmarks.ts b/packages/trpc/routers/bookmarks.ts
index 63d20625..7025c3da 100644
--- a/packages/trpc/routers/bookmarks.ts
+++ b/packages/trpc/routers/bookmarks.ts
@@ -1109,7 +1109,7 @@ Content: ${bookmark.content ?? ""}
);
const summary = await inferenceClient.inferFromText(summaryPrompt, {
- json: false,
+ schema: null,
});
if (!summary.response) {
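
Summaries stay free-form: `schema: null` (also the new default) means no output constraint is attached, so the model replies in plain prose. The contrast with the tagging call, with placeholder prompt variables:

```ts
// Tagging: constrained to the schema (when the backend supports it).
const tags = await inferenceClient.inferFromText(tagPrompt, {
  schema: openAIResponseSchema,
});

// Summarization: schema: null, so no response_format/format is sent.
const summary = await inferenceClient.inferFromText(summaryPrompt, {
  schema: null,
});
```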
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 6c313683..a8a143c2 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1035,11 +1035,11 @@ importers:
specifier: ^0.37.0
version: 0.37.0
ollama:
- specifier: ^0.5.9
- version: 0.5.9
+ specifier: ^0.5.14
+ version: 0.5.14
openai:
- specifier: ^4.67.1
- version: 4.67.1(zod@3.22.4)
+ specifier: ^4.86.1
+ version: 4.86.1(zod@3.22.4)
typescript-parsec:
specifier: ^0.3.4
version: 0.3.4
@@ -1049,6 +1049,9 @@ importers:
zod:
specifier: ^3.22.4
version: 3.22.4
+ zod-to-json-schema:
+ specifier: ^3.24.3
+ version: 3.24.3(zod@3.22.4)
devDependencies:
'@hoarder/eslint-config':
specifier: workspace:^0.2.0
@@ -10279,8 +10282,8 @@ packages:
resolution: {integrity: sha512-IF4PcGgzAr6XXSff26Sk/+P4KZFJVuHAJZj3wgO3vX2bMdNVp/QXTP3P7CEm9V1IdG8lDLY3HhiqpsE/nOwpPw==}
engines: {node: ^10.13.0 || >=12.0.0}
- ollama@0.5.9:
- resolution: {integrity: sha512-F/KZuDRC+ZsVCuMvcOYuQ6zj42/idzCkkuknGyyGVmNStMZ/sU3jQpvhnl4SyC0+zBzLiKNZJnJeuPFuieWZvQ==}
+ ollama@0.5.14:
+ resolution: {integrity: sha512-pvOuEYa2WkkAumxzJP0RdEYHkbZ64AYyyUszXVX7ruLvk5L+EiO2G71da2GqEQ4IAk4j6eLoUbGk5arzFT1wJA==}
on-finished@2.3.0:
resolution: {integrity: sha512-ikqdkGAAyf/X/gPhXGvfgAytDZtDbr+bkNUJ0N9h5MI/dmdgCs3l6hoHrcUv41sRKew3jIwrp4qQDXiK99Utww==}
@@ -10320,12 +10323,15 @@ packages:
resolution: {integrity: sha512-7x81NCL719oNbsq/3mh+hVrAWmFuEYUqrq/Iw3kUzH8ReypT9QQ0BLoJS7/G9k6N81XjW4qHWtjWwe/9eLy1EQ==}
engines: {node: '>=12'}
- openai@4.67.1:
- resolution: {integrity: sha512-2YbRFy6qaYRJabK2zLMn4txrB2xBy0KP5g/eoqeSPTT31mIJMnkT75toagvfE555IKa2RdrzJrZwdDsUipsAMw==}
+ openai@4.86.1:
+ resolution: {integrity: sha512-x3iCLyaC3yegFVZaxOmrYJjitKxZ9hpVbLi+ZlT5UHuHTMlEQEbKXkGOM78z9qm2T5GF+XRUZCP2/aV4UPFPJQ==}
hasBin: true
peerDependencies:
+ ws: ^8.18.0
zod: ^3.23.8
peerDependenciesMeta:
+ ws:
+ optional: true
zod:
optional: true
@@ -13703,6 +13709,11 @@ packages:
zlibjs@0.3.1:
resolution: {integrity: sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==}
+ zod-to-json-schema@3.24.3:
+ resolution: {integrity: sha512-HIAfWdYIt1sssHfYZFCXp4rU1w2r8hVVXYIlmoa0r0gABLs5di3RCqPU5DDROogVz1pAdYBaz7HK5n9pSUNs3A==}
+ peerDependencies:
+ zod: ^3.24.1
+
zod@3.22.4:
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
@@ -26595,7 +26606,7 @@ snapshots:
oidc-token-hash@5.0.3:
dev: false
- ollama@0.5.9:
+ ollama@0.5.14:
dependencies:
whatwg-fetch: 3.6.20
dev: false
@@ -26650,7 +26661,7 @@ snapshots:
is-wsl: 2.2.0
dev: false
- openai@4.67.1(zod@3.22.4):
+ openai@4.86.1(zod@3.22.4):
dependencies:
'@types/node': 18.19.18
'@types/node-fetch': 2.6.11
@@ -31125,6 +31136,11 @@ snapshots:
zlibjs@0.3.1:
dev: false
+ zod-to-json-schema@3.24.3(zod@3.22.4):
+ dependencies:
+ zod: 3.22.4
+ dev: false
+
zod@3.22.4:
dev: false