aboutsummaryrefslogtreecommitdiffstats
path: root/tools/compare-models
diff options
context:
space:
mode:
Diffstat (limited to 'tools/compare-models')
-rw-r--r--tools/compare-models/.env.example31
-rw-r--r--tools/compare-models/.gitignore3
-rw-r--r--tools/compare-models/README.md230
-rw-r--r--tools/compare-models/package.json24
-rw-r--r--tools/compare-models/src/apiClient.ts71
-rw-r--r--tools/compare-models/src/bookmarkProcessor.ts68
-rw-r--r--tools/compare-models/src/config.ts34
-rw-r--r--tools/compare-models/src/index.ts229
-rw-r--r--tools/compare-models/src/inferenceClient.ts46
-rw-r--r--tools/compare-models/src/interactive.ts128
-rw-r--r--tools/compare-models/src/types.ts38
-rw-r--r--tools/compare-models/tsconfig.json24
12 files changed, 926 insertions, 0 deletions
diff --git a/tools/compare-models/.env.example b/tools/compare-models/.env.example
new file mode 100644
index 00000000..f2f4c10c
--- /dev/null
+++ b/tools/compare-models/.env.example
@@ -0,0 +1,31 @@
+# Karakeep API configuration
+KARAKEEP_API_KEY=your_karakeep_api_key
+KARAKEEP_SERVER_ADDR=https://your-karakeep-instance.com
+
+# Comparison mode:
+# - "model-vs-model": Compare two models (requires MODEL1_NAME and MODEL2_NAME)
+# - "model-vs-existing": Compare a model against existing AI tags (requires only MODEL1_NAME)
+# Default: model-vs-model
+COMPARISON_MODE=model-vs-model
+
+# Models to compare
+# MODEL1_NAME: The new model to test (always required)
+# MODEL2_NAME: The second model to compare against (required only for model-vs-model mode)
+MODEL1_NAME=gpt-4o-mini
+MODEL2_NAME=claude-3-5-sonnet
+
+# OpenAI/OpenRouter API configuration
+OPENAI_API_KEY=your_openai_or_openrouter_key
+OPENAI_BASE_URL=https://openrouter.ai/api/v1
+
+# Optional: Number of bookmarks to test (default: 10)
+COMPARE_LIMIT=10
+
+# Optional: Context length for inference (default: 8000)
+INFERENCE_CONTEXT_LENGTH=8000
+
+# Optional: Max output tokens (default: 2048)
+INFERENCE_MAX_OUTPUT_TOKENS=2048
+
+# Optional: Use max_completion_tokens parameter (default: false)
+INFERENCE_USE_MAX_COMPLETION_TOKENS=false
diff --git a/tools/compare-models/.gitignore b/tools/compare-models/.gitignore
new file mode 100644
index 00000000..9c97bbd4
--- /dev/null
+++ b/tools/compare-models/.gitignore
@@ -0,0 +1,3 @@
+node_modules
+dist
+.env
diff --git a/tools/compare-models/README.md b/tools/compare-models/README.md
new file mode 100644
index 00000000..85c7c6ec
--- /dev/null
+++ b/tools/compare-models/README.md
@@ -0,0 +1,230 @@
+# Model Comparison Tool
+
+A standalone CLI tool to compare the tagging performance of AI models using your existing Karakeep bookmarks.
+
+## Features
+
+- **Two comparison modes:**
+ - **Model vs Model**: Compare two AI models against each other
+ - **Model vs Existing**: Compare a new model against existing AI-generated tags on your bookmarks
+- Fetches existing bookmarks from your Karakeep instance
+- Runs tagging inference with AI models
+- **Random shuffling**: Models/tags are randomly assigned to "Model A" or "Model B" for each bookmark to eliminate bias
+- Blind comparison: Model names are hidden during voting (only shown as "Model A" and "Model B")
+- Interactive voting interface
+- Shows final results with winner
+
+## Setup
+
+### Environment Variables
+
+Required environment variables:
+
+```bash
+# Karakeep API configuration
+KARAKEEP_API_KEY=your_api_key_here
+KARAKEEP_SERVER_ADDR=https://your-karakeep-instance.com
+
+# Comparison mode (default: model-vs-model)
+# - "model-vs-model": Compare two models against each other
+# - "model-vs-existing": Compare a model against existing AI tags
+COMPARISON_MODE=model-vs-model
+
+# Models to compare
+# MODEL1_NAME: The new model to test (always required)
+# MODEL2_NAME: The second model to compare against (required only for model-vs-model mode)
+MODEL1_NAME=gpt-4o-mini
+MODEL2_NAME=claude-3-5-sonnet
+
+# OpenAI/OpenRouter API configuration (for running inference)
+OPENAI_API_KEY=your_openai_or_openrouter_key
+OPENAI_BASE_URL=https://openrouter.ai/api/v1 # Optional, defaults to OpenAI
+
+# Optional: Number of bookmarks to test (default: 10)
+COMPARE_LIMIT=10
+```
+
+### Using OpenRouter
+
+For OpenRouter, set:
+```bash
+OPENAI_BASE_URL=https://openrouter.ai/api/v1
+OPENAI_API_KEY=your_openrouter_key
+```
+
+### Using OpenAI Directly
+
+For OpenAI directly:
+```bash
+OPENAI_API_KEY=your_openai_key
+# OPENAI_BASE_URL can be omitted for direct OpenAI
+```
+
+## Usage
+
+### Run with pnpm (Recommended)
+
+```bash
+cd tools/compare-models
+pnpm install
+pnpm run run
+```
+
+### Run with environment file
+
+Create a `.env` file:
+
+```env
+KARAKEEP_API_KEY=your_api_key
+KARAKEEP_SERVER_ADDR=https://your-karakeep-instance.com
+MODEL1_NAME=gpt-4o-mini
+MODEL2_NAME=claude-3-5-sonnet
+OPENAI_API_KEY=your_openai_key
+COMPARE_LIMIT=10
+```
+
+Then run:
+```bash
+pnpm run run
+```
+
+### Using directly with node
+
+If you prefer to run the compiled JavaScript directly:
+
+```bash
+pnpm build
+export KARAKEEP_API_KEY=your_api_key
+export KARAKEEP_SERVER_ADDR=https://your-karakeep-instance.com
+export MODEL1_NAME=gpt-4o-mini
+export MODEL2_NAME=claude-3-5-sonnet
+export OPENAI_API_KEY=your_openai_key
+node dist/index.js
+```
+
+## Comparison Modes
+
+### Model vs Model Mode
+
+Compare two different AI models against each other:
+
+```bash
+COMPARISON_MODE=model-vs-model
+MODEL1_NAME=gpt-4o-mini
+MODEL2_NAME=claude-3-5-sonnet
+```
+
+This mode runs inference with both models on each bookmark and lets you choose which tags are better.
+
+### Model vs Existing Mode
+
+Compare a new model against existing AI-generated tags on your bookmarks:
+
+```bash
+COMPARISON_MODE=model-vs-existing
+MODEL1_NAME=gpt-4o-mini
+# MODEL2_NAME is not required in this mode
+```
+
+This mode is useful for:
+- Testing if a new model produces better tags than your current model
+- Evaluating whether to switch from one model to another
+- Quality assurance on existing AI tags
+
+**Note:** This mode only compares bookmarks that already have AI-generated tags (tags with `attachedBy: "ai"`). Bookmarks without AI tags are automatically filtered out.
+
+## Usage Flow
+
+1. The tool fetches your latest link bookmarks from Karakeep
+ - In **model-vs-existing** mode, only bookmarks with existing AI tags are included
+2. For each bookmark, it randomly assigns the options to "Model A" or "Model B" and runs tagging
+3. You'll see a side-by-side comparison (randomly shuffled each time):
+ ```
+ === Bookmark 1/10 ===
+ How to Build Better AI Systems
+ https://example.com/article
+ This article explores modern approaches to...
+
+ ─────────────────────────────────────
+
+ Model A (blind):
+ • ai
+ • machine-learning
+ • engineering
+
+ Model B (blind):
+ • artificial-intelligence
+ • ML
+ • software-development
+
+ ─────────────────────────────────────
+
+ Which tags do you prefer? [1=Model A, 2=Model B, s=skip, q=quit] >
+ ```
+
+4. Choose your preference:
+ - `1` - Vote for Model A
+ - `2` - Vote for Model B
+ - `s` or `skip` - Skip this comparison
+ - `q` or `quit` - Exit early and show current results
+
+5. After completing all comparisons (or quitting early), results are displayed:
+ ```
+ ───────────────────────────────────────
+ === FINAL RESULTS ===
+ ───────────────────────────────────────
+ gpt-4o-mini: 6 votes
+ claude-3-5-sonnet: 3 votes
+ Skipped: 1
+ Errors: 0
+ ───────────────────────────────────────
+ Total bookmarks tested: 10
+
+ 🏆 WINNER: gpt-4o-mini
+ ───────────────────────────────────────
+ ```
+
+6. The actual model names are only shown in the final results - during voting you see only "Model A" and "Model B"
+
+## Bookmark Filtering
+
+The tool currently tests only:
+- **Link-type bookmarks** (not text notes or assets)
+- **Non-archived** bookmarks
+- **Latest N bookmarks** (where N is COMPARE_LIMIT)
+- **In model-vs-existing mode**: Only bookmarks with existing AI tags (tags with `attachedBy: "ai"`)
+
+## Architecture
+
+This tool leverages Karakeep's shared infrastructure:
+- **API Client**: Uses `@karakeep/sdk` for type-safe API interactions with proper authentication
+- **Inference**: Reuses `@karakeep/shared/inference` for OpenAI client with structured output support
+- **Prompts**: Uses `@karakeep/shared/prompts` for consistent tagging prompt generation with token management
+- No code duplication - all core functionality is shared with the main Karakeep application
+
+
+## Error Handling
+
+- If a model fails to generate tags for a bookmark, an error is shown and comparison continues
+- Errors are counted separately in final results
+- Missing required environment variables will cause the tool to exit with a clear error message
+
+## Build
+
+To compile the tool to standalone JavaScript:
+
+```bash
+pnpm build
+```
+
+The compiled entry point will be at `dist/index.js`.
+
+## Notes
+
+- The tool is designed for manual, human-in-the-loop evaluation
+- No results are persisted - they're only displayed in console
+- Content is fetched with `includeContent=true` from Karakeep API
+- Uses Karakeep SDK (`@karakeep/sdk`) for type-safe API interactions
+- Inference runs sequentially to keep state management simple
+- Recommended to use `pnpm run run` for the best experience (uses tsx for development)
+- **Random shuffling**: For each bookmark, models are randomly assigned to "Model A" or "Model B" to eliminate position bias. The actual model names are only revealed in the final results.
diff --git a/tools/compare-models/package.json b/tools/compare-models/package.json
new file mode 100644
index 00000000..5a493bd2
--- /dev/null
+++ b/tools/compare-models/package.json
@@ -0,0 +1,24 @@
+{
+ "name": "@karakeep/compare-models",
+ "version": "0.1.0",
+ "description": "Standalone tool to compare tagging performance between AI models",
+ "bin": {
+ "compare-models": "dist/index.js"
+ },
+ "scripts": {
+ "build": "tsc && chmod +x dist/index.js",
+ "run": "tsx --env-file=./.env src/index.ts",
+ "typecheck": "tsc --noEmit"
+ },
+ "dependencies": {
+ "@karakeep/sdk": "workspace:^",
+ "@karakeep/shared": "workspace:^",
+ "chalk": "^5.3.0",
+ "zod": "^3.24.2"
+ },
+ "devDependencies": {
+ "@types/node": "^24",
+ "tsx": "^4.8.1",
+ "typescript": "^5.9"
+ }
+}
diff --git a/tools/compare-models/src/apiClient.ts b/tools/compare-models/src/apiClient.ts
new file mode 100644
index 00000000..1d9f799d
--- /dev/null
+++ b/tools/compare-models/src/apiClient.ts
@@ -0,0 +1,71 @@
+import { createKarakeepClient } from "@karakeep/sdk";
+
+import type { Bookmark } from "./types";
+import { config } from "./config";
+
+export class KarakeepAPIClient {
+ private readonly client: ReturnType<typeof createKarakeepClient>;
+
+ constructor() {
+ this.client = createKarakeepClient({
+ baseUrl: `${config.KARAKEEP_SERVER_ADDR}/api/v1/`,
+ headers: {
+ "Content-Type": "application/json",
+ authorization: `Bearer ${config.KARAKEEP_API_KEY}`,
+ },
+ });
+ }
+
+ async fetchBookmarks(limit: number): Promise<Bookmark[]> {
+ const bookmarks: Bookmark[] = [];
+ let cursor: string | null = null;
+ let hasMore = true;
+
+ while (hasMore && bookmarks.length < limit) {
+ const params: {
+ limit: number;
+ includeContent: true;
+ archived?: boolean;
+ cursor?: string;
+ } = {
+ limit: Math.min(limit - bookmarks.length, 50),
+ includeContent: true,
+ archived: false,
+ };
+
+ if (cursor) {
+ params.cursor = cursor;
+ }
+
+ const { data, response, error } = await this.client.GET("/bookmarks", {
+ params: {
+ query: params,
+ },
+ });
+
+ if (error) {
+ throw new Error(`Failed to fetch bookmarks: ${String(error)}`);
+ }
+
+ if (!response.ok) {
+ throw new Error(`Failed to fetch bookmarks: ${response.status}`);
+ }
+
+ const batchBookmarks = (data?.bookmarks || [])
+ .filter((b) => b.content?.type === "link")
+ .map((b) => ({
+ ...b,
+ tags: (b.tags || []).map((tag) => ({
+ name: tag.name,
+ attachedBy: tag.attachedBy,
+ })),
+ })) as Bookmark[];
+
+ bookmarks.push(...batchBookmarks);
+ cursor = data?.nextCursor || null;
+ hasMore = !!cursor;
+ }
+
+ return bookmarks.slice(0, limit);
+ }
+}
diff --git a/tools/compare-models/src/bookmarkProcessor.ts b/tools/compare-models/src/bookmarkProcessor.ts
new file mode 100644
index 00000000..4a1bbf0a
--- /dev/null
+++ b/tools/compare-models/src/bookmarkProcessor.ts
@@ -0,0 +1,68 @@
+import type { InferenceClient } from "@karakeep/shared/inference";
+import { buildTextPrompt } from "@karakeep/shared/prompts.server";
+
+import { inferTags } from "./inferenceClient";
+import type { Bookmark } from "./types";
+
+export async function extractBookmarkContent(
+ bookmark: Bookmark,
+): Promise<string> {
+ if (bookmark.content.type === "link") {
+ const parts = [];
+
+ if (bookmark.content.url) {
+ parts.push(`URL: ${bookmark.content.url}`);
+ }
+
+ if (bookmark.title) {
+ parts.push(`Title: ${bookmark.title}`);
+ }
+
+ if (bookmark.content.description) {
+ parts.push(`Description: ${bookmark.content.description}`);
+ }
+
+ if (bookmark.content.htmlContent) {
+ parts.push(`Content: ${bookmark.content.htmlContent}`);
+ }
+
+ return parts.join("\n");
+ }
+
+ if (bookmark.content.type === "text" && bookmark.content.text) {
+ return bookmark.content.text;
+ }
+
+ return "";
+}
+
+export async function runTaggingForModel(
+ bookmark: Bookmark,
+ inferenceClient: InferenceClient,
+ lang: string = "english",
+ contextLength: number = 8000,
+): Promise<string[]> {
+ const content = await extractBookmarkContent(bookmark);
+
+ if (!content) {
+ return [];
+ }
+
+ try {
+ // Use the shared prompt builder with empty custom prompts and default tag style
+ const prompt = await buildTextPrompt(
+ lang,
+ [], // No custom prompts for comparison tool
+ content,
+ contextLength,
+ "as-generated", // Use tags as generated by the model
+ );
+
+ const tags = await inferTags(inferenceClient, prompt);
+ return tags;
+ } catch (error) {
+ throw new Error(
+ `Failed to generate tags: ${error instanceof Error ? error.message : String(error)}`,
+ );
+ }
+}
diff --git a/tools/compare-models/src/config.ts b/tools/compare-models/src/config.ts
new file mode 100644
index 00000000..3a2d0d41
--- /dev/null
+++ b/tools/compare-models/src/config.ts
@@ -0,0 +1,34 @@
+import { z } from "zod";
+
+// Local config schema for compare-models tool
+const envSchema = z.object({
+ KARAKEEP_API_KEY: z.string().min(1),
+ KARAKEEP_SERVER_ADDR: z.string().url(),
+ MODEL1_NAME: z.string().min(1),
+ MODEL2_NAME: z.string().min(1).optional(),
+ OPENAI_API_KEY: z.string().min(1),
+ OPENAI_BASE_URL: z.string().url().optional(),
+ OPENAI_SERVICE_TIER: z.enum(["auto", "default", "flex"]).optional(),
+ COMPARISON_MODE: z
+ .enum(["model-vs-model", "model-vs-existing"])
+ .default("model-vs-model"),
+ COMPARE_LIMIT: z
+ .string()
+ .optional()
+ .transform((val) => (val ? parseInt(val, 10) : 10)),
+ INFERENCE_CONTEXT_LENGTH: z
+ .string()
+ .optional()
+ .transform((val) => (val ? parseInt(val, 10) : 8000)),
+ INFERENCE_MAX_OUTPUT_TOKENS: z
+ .string()
+ .optional()
+ .transform((val) => (val ? parseInt(val, 10) : 2048)),
+ INFERENCE_USE_MAX_COMPLETION_TOKENS: z
+ .string()
+ .optional()
+ .transform((val) => val === "true")
+ .default("false"),
+});
+
+export const config = envSchema.parse(process.env);
diff --git a/tools/compare-models/src/index.ts b/tools/compare-models/src/index.ts
new file mode 100644
index 00000000..88fc9249
--- /dev/null
+++ b/tools/compare-models/src/index.ts
@@ -0,0 +1,229 @@
+import chalk from "chalk";
+
+import type { ComparisonResult } from "./types";
+import { KarakeepAPIClient } from "./apiClient";
+import { runTaggingForModel } from "./bookmarkProcessor";
+import { config } from "./config";
+import { createInferenceClient } from "./inferenceClient";
+import {
+ askQuestion,
+ clearProgress,
+ close,
+ displayComparison,
+ displayError,
+ displayFinalResults,
+ displayProgress,
+} from "./interactive";
+
+interface VoteCounters {
+ model1Votes: number;
+ model2Votes: number;
+ skipped: number;
+ errors: number;
+ total: number;
+}
+
+interface ShuffleResult {
+ modelA: string;
+ modelB: string;
+ modelAIsModel1: boolean;
+}
+
+async function main() {
+ console.log(chalk.cyan("\n🚀 Karakeep Model Comparison Tool\n"));
+
+ const isExistingMode = config.COMPARISON_MODE === "model-vs-existing";
+
+ if (isExistingMode) {
+ console.log(
+ chalk.yellow(
+ `Mode: Comparing ${config.MODEL1_NAME} against existing AI tags\n`,
+ ),
+ );
+ } else {
+ if (!config.MODEL2_NAME) {
+ console.log(
+ chalk.red(
+ "\n✗ Error: MODEL2_NAME is required for model-vs-model comparison mode\n",
+ ),
+ );
+ process.exit(1);
+ }
+ console.log(
+ chalk.yellow(
+ `Mode: Comparing ${config.MODEL1_NAME} vs ${config.MODEL2_NAME}\n`,
+ ),
+ );
+ }
+
+ const apiClient = new KarakeepAPIClient();
+
+ displayProgress("Fetching bookmarks from Karakeep...");
+ let bookmarks = await apiClient.fetchBookmarks(config.COMPARE_LIMIT);
+ clearProgress();
+
+ // Filter bookmarks with AI tags if in existing mode
+ if (isExistingMode) {
+ bookmarks = bookmarks.filter(
+ (b) => b.tags.some((t) => t.attachedBy === "ai"),
+ );
+ console.log(
+ chalk.green(
+ `✓ Fetched ${bookmarks.length} link bookmarks with existing AI tags\n`,
+ ),
+ );
+ } else {
+ console.log(chalk.green(`✓ Fetched ${bookmarks.length} link bookmarks\n`));
+ }
+
+ if (bookmarks.length === 0) {
+ console.log(
+ chalk.yellow(
+ "\n⚠ No bookmarks found with AI tags. Please add some bookmarks with AI tags first.\n",
+ ),
+ );
+ return;
+ }
+
+ const counters: VoteCounters = {
+ model1Votes: 0,
+ model2Votes: 0,
+ skipped: 0,
+ errors: 0,
+ total: bookmarks.length,
+ };
+
+ const detailedResults: ComparisonResult[] = [];
+
+ for (let i = 0; i < bookmarks.length; i++) {
+ const bookmark = bookmarks[i];
+
+ displayProgress(
+ `[${i + 1}/${bookmarks.length}] Running inference on: ${bookmark.title || bookmark.content.title || "Untitled"}`,
+ );
+
+ let model1Tags: string[] = [];
+ let model2Tags: string[] = [];
+
+ // Get tags for model 1 (new model)
+ try {
+ const model1Client = createInferenceClient(config.MODEL1_NAME);
+ model1Tags = await runTaggingForModel(
+ bookmark,
+ model1Client,
+ "english",
+ config.INFERENCE_CONTEXT_LENGTH,
+ );
+ } catch (error) {
+ clearProgress();
+ displayError(
+ `${config.MODEL1_NAME} failed: ${error instanceof Error ? error.message : String(error)}`,
+ );
+ counters.errors++;
+ continue;
+ }
+
+ // Get tags for model 2 or existing AI tags
+ if (isExistingMode) {
+ // Use existing AI tags from the bookmark
+ model2Tags = bookmark.tags
+ .filter((t) => t.attachedBy === "ai")
+ .map((t) => t.name);
+ } else {
+ // Run inference with model 2
+ try {
+ const model2Client = createInferenceClient(config.MODEL2_NAME!);
+ model2Tags = await runTaggingForModel(
+ bookmark,
+ model2Client,
+ "english",
+ config.INFERENCE_CONTEXT_LENGTH,
+ );
+ } catch (error) {
+ clearProgress();
+ displayError(
+ `${config.MODEL2_NAME} failed: ${error instanceof Error ? error.message : String(error)}`,
+ );
+ counters.errors++;
+ continue;
+ }
+ }
+
+ clearProgress();
+
+ const model2Label = isExistingMode
+ ? "Existing AI Tags"
+ : config.MODEL2_NAME!;
+
+ const shuffleResult: ShuffleResult = {
+ modelA: config.MODEL1_NAME,
+ modelB: model2Label,
+ modelAIsModel1: Math.random() < 0.5,
+ };
+
+ if (!shuffleResult.modelAIsModel1) {
+ shuffleResult.modelA = model2Label;
+ shuffleResult.modelB = config.MODEL1_NAME;
+ }
+
+ const comparison: ComparisonResult = {
+ bookmark,
+ modelA: shuffleResult.modelA,
+ modelATags: shuffleResult.modelAIsModel1 ? model1Tags : model2Tags,
+ modelB: shuffleResult.modelB,
+ modelBTags: shuffleResult.modelAIsModel1 ? model2Tags : model1Tags,
+ };
+
+ displayComparison(i + 1, bookmarks.length, comparison, true);
+
+ const answer = await askQuestion(
+ "Which tags do you prefer? [1=Model A, 2=Model B, s=skip, q=quit] > ",
+ );
+
+ const normalizedAnswer = answer.toLowerCase();
+
+ if (normalizedAnswer === "q" || normalizedAnswer === "quit") {
+ console.log(chalk.yellow("\n⏸ Quitting early...\n"));
+ break;
+ }
+
+ if (normalizedAnswer === "1") {
+ comparison.winner = "modelA";
+ if (shuffleResult.modelAIsModel1) {
+ counters.model1Votes++;
+ } else {
+ counters.model2Votes++;
+ }
+ detailedResults.push(comparison);
+ } else if (normalizedAnswer === "2") {
+ comparison.winner = "modelB";
+ if (shuffleResult.modelAIsModel1) {
+ counters.model2Votes++;
+ } else {
+ counters.model1Votes++;
+ }
+ detailedResults.push(comparison);
+ } else {
+ comparison.winner = "skip";
+ counters.skipped++;
+ detailedResults.push(comparison);
+ }
+ }
+
+ close();
+
+ displayFinalResults({
+ model1Name: config.MODEL1_NAME,
+ model2Name: isExistingMode ? "Existing AI Tags" : config.MODEL2_NAME!,
+ model1Votes: counters.model1Votes,
+ model2Votes: counters.model2Votes,
+ skipped: counters.skipped,
+ errors: counters.errors,
+ total: counters.total,
+ });
+}
+
+main().catch((error) => {
+ console.error(chalk.red(`\n✗ Fatal error: ${error}\n`));
+ process.exit(1);
+});
diff --git a/tools/compare-models/src/inferenceClient.ts b/tools/compare-models/src/inferenceClient.ts
new file mode 100644
index 00000000..0a5ed8b5
--- /dev/null
+++ b/tools/compare-models/src/inferenceClient.ts
@@ -0,0 +1,46 @@
+import type { InferenceClient } from "@karakeep/shared/inference";
+import {
+ OpenAIInferenceClient,
+ type OpenAIInferenceConfig,
+} from "@karakeep/shared/inference";
+import { z } from "zod";
+
+import { config } from "./config";
+
+export function createInferenceClient(modelName: string): InferenceClient {
+ const inferenceConfig: OpenAIInferenceConfig = {
+ apiKey: config.OPENAI_API_KEY,
+ baseURL: config.OPENAI_BASE_URL,
+ serviceTier: config.OPENAI_SERVICE_TIER,
+ textModel: modelName,
+ imageModel: modelName, // Use same model for images if needed
+ contextLength: config.INFERENCE_CONTEXT_LENGTH,
+ maxOutputTokens: config.INFERENCE_MAX_OUTPUT_TOKENS,
+ useMaxCompletionTokens: config.INFERENCE_USE_MAX_COMPLETION_TOKENS,
+ outputSchema: "structured",
+ };
+
+ return new OpenAIInferenceClient(inferenceConfig);
+}
+
+export async function inferTags(
+ inferenceClient: InferenceClient,
+ prompt: string,
+): Promise<string[]> {
+ const tagsSchema = z.object({
+ tags: z.array(z.string()),
+ });
+
+ const response = await inferenceClient.inferFromText(prompt, {
+ schema: tagsSchema,
+ });
+
+ const parsed = tagsSchema.safeParse(JSON.parse(response.response));
+ if (!parsed.success) {
+ throw new Error(
+ `Failed to parse model response: ${parsed.error.message}`,
+ );
+ }
+
+ return parsed.data.tags;
+}
diff --git a/tools/compare-models/src/interactive.ts b/tools/compare-models/src/interactive.ts
new file mode 100644
index 00000000..b93fc1d7
--- /dev/null
+++ b/tools/compare-models/src/interactive.ts
@@ -0,0 +1,128 @@
+import * as readline from "node:readline";
+import chalk from "chalk";
+
+import type { ComparisonResult } from "./types";
+
+const rl = readline.createInterface({
+ input: process.stdin,
+ output: process.stdout,
+});
+
+export async function askQuestion(question: string): Promise<string> {
+ return new Promise((resolve) => {
+ rl.question(question, (answer) => {
+ resolve(answer.trim());
+ });
+ });
+}
+
+export function displayComparison(
+ index: number,
+ total: number,
+ result: ComparisonResult,
+ blind: boolean = true,
+): void {
+ const divider = chalk.gray("─".repeat(80));
+ const header = chalk.bold.cyan(`\n=== Bookmark ${index}/${total} ===`);
+ const title = chalk.bold.white(result.bookmark.title || "Untitled");
+ const url = result.bookmark.content.url
+ ? chalk.gray(result.bookmark.content.url)
+ : "";
+ const content = chalk.gray(
+ result.bookmark.content.description
+ ? result.bookmark.content.description.substring(0, 200) + "..."
+ : "",
+ );
+
+ const modelAName = blind ? "Model A" : result.modelA;
+ const modelBName = blind ? "Model B" : result.modelB;
+
+ const modelATags = result.modelATags
+ .map((tag) => chalk.green(` • ${tag}`))
+ .join("\n");
+ const modelBTags = result.modelBTags
+ .map((tag) => chalk.yellow(` • ${tag}`))
+ .join("\n");
+
+ console.log(header);
+ console.log(title);
+ if (url) console.log(url);
+ if (content) console.log(content);
+ console.log(divider);
+ console.log();
+ console.log(chalk.green(`${modelAName}:`));
+ if (modelATags) {
+ console.log(modelATags);
+ } else {
+ console.log(chalk.gray(" (no tags)"));
+ }
+ console.log();
+ console.log(chalk.yellow(`${modelBName}:`));
+ if (modelBTags) {
+ console.log(modelBTags);
+ } else {
+ console.log(chalk.gray(" (no tags)"));
+ }
+ console.log();
+}
+
+export function displayError(message: string): void {
+ console.log(chalk.red(`\n✗ Error: ${message}\n`));
+}
+
+export function displayProgress(message: string): void {
+ process.stdout.write(chalk.gray(message));
+}
+
+export function clearProgress(): void {
+ process.stdout.write("\r\x1b[K");
+}
+
+export function close(): void {
+ rl.close();
+}
+
+export function displayFinalResults(results: {
+ model1Name: string;
+ model2Name: string;
+ model1Votes: number;
+ model2Votes: number;
+ skipped: number;
+ errors: number;
+ total: number;
+}): void {
+ const winner =
+ results.model1Votes > results.model2Votes
+ ? results.model1Name
+ : results.model2Votes > results.model1Votes
+ ? results.model2Name
+ : "TIE";
+
+ const divider = chalk.gray("─".repeat(80));
+ const header = chalk.bold.cyan("\n=== FINAL RESULTS ===");
+ const model1Line = chalk.green(
+ `${results.model1Name}: ${results.model1Votes} votes`,
+ );
+ const model2Line = chalk.yellow(
+ `${results.model2Name}: ${results.model2Votes} votes`,
+ );
+ const skippedLine = chalk.gray(`Skipped: ${results.skipped}`);
+ const errorsLine = chalk.red(`Errors: ${results.errors}`);
+ const totalLine = chalk.bold(`Total bookmarks tested: ${results.total}`);
+ const winnerLine =
+ winner === "TIE"
+ ? chalk.bold.cyan(`\n🏁 RESULT: TIE`)
+ : chalk.bold.green(`\n🏆 WINNER: ${winner}`);
+
+ console.log(divider);
+ console.log(header);
+ console.log(divider);
+ console.log(model1Line);
+ console.log(model2Line);
+ console.log(skippedLine);
+ console.log(errorsLine);
+ console.log(divider);
+ console.log(totalLine);
+ console.log(winnerLine);
+ console.log(divider);
+}
diff --git a/tools/compare-models/src/types.ts b/tools/compare-models/src/types.ts
new file mode 100644
index 00000000..35a677ae
--- /dev/null
+++ b/tools/compare-models/src/types.ts
@@ -0,0 +1,38 @@
+export interface Bookmark {
+ id: string;
+ title: string | null;
+ content: {
+ type: string;
+ title: string;
+ url?: string;
+ text?: string;
+ htmlContent?: string;
+ description?: string;
+ };
+ tags: Array<{ name: string; attachedBy?: "ai" | "human" }>;
+}
+
+export interface ModelConfig {
+ name: string;
+ apiKey: string;
+ baseUrl?: string;
+}
+
+export interface ComparisonResult {
+ bookmark: Bookmark;
+ modelA: string;
+ modelATags: string[];
+ modelB: string;
+ modelBTags: string[];
+ winner?: "modelA" | "modelB" | "skip";
+}
+
+export interface FinalResults {
+ model1Name: string;
+ model2Name: string;
+ model1Votes: number;
+ model2Votes: number;
+ skipped: number;
+ errors: number;
+ total: number;
+}
diff --git a/tools/compare-models/tsconfig.json b/tools/compare-models/tsconfig.json
new file mode 100644
index 00000000..edeec809
--- /dev/null
+++ b/tools/compare-models/tsconfig.json
@@ -0,0 +1,24 @@
+{
+ "$schema": "https://json.schemastore.org/tsconfig",
+ "extends": "@tsconfig/node22/tsconfig.json",
+ "compilerOptions": {
+ "target": "ES2022",
+ "module": "CommonJS",
+ "lib": ["ES2022"],
+ "outDir": "dist",
+ "rootDir": "src",
+ "moduleResolution": "node",
+ "esModuleInterop": true,
+ "allowSyntheticDefaultImports": true,
+ "strict": true,
+ "skipLibCheck": true,
+ "allowJs": true,
+ "resolveJsonModule": true,
+ "isolatedModules": true,
+ "noEmit": false,
+ "declaration": false,
+ "sourceMap": false
+ },
+ "include": ["src/**/*"],
+ "exclude": ["node_modules", "dist"]
+}