aboutsummaryrefslogtreecommitdiffstats
path: root/tools/compare-models
diff options
context:
space:
mode:
Diffstat (limited to 'tools/compare-models')
-rw-r--r--tools/compare-models/.env.example31
-rw-r--r--tools/compare-models/.gitignore3
-rw-r--r--tools/compare-models/README.md230
-rw-r--r--tools/compare-models/package.json24
-rw-r--r--tools/compare-models/src/apiClient.ts71
-rw-r--r--tools/compare-models/src/bookmarkProcessor.ts68
-rw-r--r--tools/compare-models/src/config.ts34
-rw-r--r--tools/compare-models/src/index.ts229
-rw-r--r--tools/compare-models/src/inferenceClient.ts46
-rw-r--r--tools/compare-models/src/interactive.ts128
-rw-r--r--tools/compare-models/src/types.ts38
-rw-r--r--tools/compare-models/tsconfig.json24
12 files changed, 926 insertions, 0 deletions
diff --git a/tools/compare-models/.env.example b/tools/compare-models/.env.example
new file mode 100644
index 00000000..f2f4c10c
--- /dev/null
+++ b/tools/compare-models/.env.example
@@ -0,0 +1,31 @@
+# Karakeep API configuration
+KARAKEEP_API_KEY=your_karakeep_api_key
+KARAKEEP_SERVER_ADDR=https://your-karakeep-instance.com
+
+# Comparison mode:
+# - "model-vs-model": Compare two models (requires MODEL1_NAME and MODEL2_NAME)
+# - "model-vs-existing": Compare a model against existing AI tags (requires only MODEL1_NAME)
+# Default: model-vs-model
+COMPARISON_MODE=model-vs-model
+
+# Models to compare
+# MODEL1_NAME: The new model to test (always required)
+# MODEL2_NAME: The second model to compare against (required only for model-vs-model mode)
+MODEL1_NAME=gpt-4o-mini
+MODEL2_NAME=claude-3-5-sonnet
+
+# OpenAI/OpenRouter API configuration
+OPENAI_API_KEY=your_openai_or_openrouter_key
+OPENAI_BASE_URL=https://openrouter.ai/api/v1
+
+# Optional: Number of bookmarks to test (default: 10)
+COMPARE_LIMIT=10
+
+# Optional: Context length for inference (default: 8000)
+INFERENCE_CONTEXT_LENGTH=8000
+
+# Optional: Max output tokens (default: 2048)
+INFERENCE_MAX_OUTPUT_TOKENS=2048
+
+# Optional: Use max_completion_tokens parameter (default: false)
+INFERENCE_USE_MAX_COMPLETION_TOKENS=false
diff --git a/tools/compare-models/.gitignore b/tools/compare-models/.gitignore
new file mode 100644
index 00000000..9c97bbd4
--- /dev/null
+++ b/tools/compare-models/.gitignore
@@ -0,0 +1,3 @@
+node_modules
+dist
+.env
diff --git a/tools/compare-models/README.md b/tools/compare-models/README.md
new file mode 100644
index 00000000..85c7c6ec
--- /dev/null
+++ b/tools/compare-models/README.md
@@ -0,0 +1,230 @@
+# Model Comparison Tool
+
+A standalone CLI tool to compare the tagging performance of AI models using your existing Karakeep bookmarks.
+
+## Features
+
+- **Two comparison modes:**
+ - **Model vs Model**: Compare two AI models against each other
+ - **Model vs Existing**: Compare a new model against existing AI-generated tags on your bookmarks
+- Fetches existing bookmarks from your Karakeep instance
+- Runs tagging inference with AI models
+- **Random shuffling**: Models/tags are randomly assigned to "Model A" or "Model B" for each bookmark to eliminate bias
+- Blind comparison: Model names are hidden during voting (only shown as "Model A" and "Model B")
+- Interactive voting interface
+- Shows final results with winner
+
+## Setup
+
+### Environment Variables
+
+Required environment variables:
+
+```bash
+# Karakeep API configuration
+KARAKEEP_API_KEY=your_api_key_here
+KARAKEEP_SERVER_ADDR=https://your-karakeep-instance.com
+
+# Comparison mode (default: model-vs-model)
+# - "model-vs-model": Compare two models against each other
+# - "model-vs-existing": Compare a model against existing AI tags
+COMPARISON_MODE=model-vs-model
+
+# Models to compare
+# MODEL1_NAME: The new model to test (always required)
+# MODEL2_NAME: The second model to compare against (required only for model-vs-model mode)
+MODEL1_NAME=gpt-4o-mini
+MODEL2_NAME=claude-3-5-sonnet
+
+# OpenAI/OpenRouter API configuration (for running inference)
+OPENAI_API_KEY=your_openai_or_openrouter_key
+OPENAI_BASE_URL=https://openrouter.ai/api/v1 # Optional, defaults to OpenAI
+
+# Optional: Number of bookmarks to test (default: 10)
+COMPARE_LIMIT=10
+```
+
+### Using OpenRouter
+
+For OpenRouter, set:
+```bash
+OPENAI_BASE_URL=https://openrouter.ai/api/v1
+OPENAI_API_KEY=your_openrouter_key
+```
+
+### Using OpenAI Directly
+
+For OpenAI directly:
+```bash
+OPENAI_API_KEY=your_openai_key
+# OPENAI_BASE_URL can be omitted for direct OpenAI
+```
+
+## Usage
+
+### Run with pnpm (Recommended)
+
+```bash
+cd tools/compare-models
+pnpm install
+pnpm run run
+```
+
+### Run with environment file
+
+Create a `.env` file:
+
+```env
+KARAKEEP_API_KEY=your_api_key
+KARAKEEP_SERVER_ADDR=https://your-karakeep-instance.com
+MODEL1_NAME=gpt-4o-mini
+MODEL2_NAME=claude-3-5-sonnet
+OPENAI_API_KEY=your_openai_key
+COMPARE_LIMIT=10
+```
+
+Then run:
+```bash
+pnpm run run
+```
+
+### Using directly with node
+
+If you prefer to run the compiled JavaScript directly:
+
+```bash
+pnpm build
+export KARAKEEP_API_KEY=your_api_key
+export KARAKEEP_SERVER_ADDR=https://your-karakeep-instance.com
+export MODEL1_NAME=gpt-4o-mini
+export MODEL2_NAME=claude-3-5-sonnet
+export OPENAI_API_KEY=your_openai_key
+node dist/index.js
+```
+
+## Comparison Modes
+
+### Model vs Model Mode
+
+Compare two different AI models against each other:
+
+```bash
+COMPARISON_MODE=model-vs-model
+MODEL1_NAME=gpt-4o-mini
+MODEL2_NAME=claude-3-5-sonnet
+```
+
+This mode runs inference with both models on each bookmark and lets you choose which tags are better.
+
+### Model vs Existing Mode
+
+Compare a new model against existing AI-generated tags on your bookmarks:
+
+```bash
+COMPARISON_MODE=model-vs-existing
+MODEL1_NAME=gpt-4o-mini
+# MODEL2_NAME is not required in this mode
+```
+
+This mode is useful for:
+- Testing if a new model produces better tags than your current model
+- Evaluating whether to switch from one model to another
+- Quality assurance on existing AI tags
+
+**Note:** This mode only compares bookmarks that already have AI-generated tags (tags with `attachedBy: "ai"`). Bookmarks without AI tags are automatically filtered out.
+
+## Usage Flow
+
+1. The tool fetches your latest link bookmarks from Karakeep
+ - In **model-vs-existing** mode, only bookmarks with existing AI tags are included
+2. For each bookmark, it randomly assigns the options to "Model A" or "Model B" and runs tagging
+3. You'll see a side-by-side comparison (randomly shuffled each time):
+ ```
+ === Bookmark 1/10 ===
+ How to Build Better AI Systems
+ https://example.com/article
+ This article explores modern approaches to...
+
+ ─────────────────────────────────────
+
+ Model A (blind):
+ • ai
+ • machine-learning
+ • engineering
+
+ Model B (blind):
+ • artificial-intelligence
+ • ML
+ • software-development
+
+ ─────────────────────────────────────
+
+ Which tags do you prefer? [1=Model A, 2=Model B, s=skip, q=quit] >
+ ```
+
+4. Choose your preference:
+ - `1` - Vote for Model A
+ - `2` - Vote for Model B
+ - `s` or `skip` - Skip this comparison
+ - `q` or `quit` - Exit early and show current results
+
+5. After completing all comparisons (or quitting early), results are displayed:
+ ```
+ ───────────────────────────────────────
+ === FINAL RESULTS ===
+ ───────────────────────────────────────
+ gpt-4o-mini: 6 votes
+ claude-3-5-sonnet: 3 votes
+ Skipped: 1
+ Errors: 0
+ ───────────────────────────────────────
+ Total bookmarks tested: 10
+
+ 🏆 WINNER: gpt-4o-mini
+ ───────────────────────────────────────
+ ```
+
+6. The actual model names are only shown in the final results - during voting you see only "Model A" and "Model B"
+
+## Bookmark Filtering
+
+The tool currently tests only:
+- **Link-type bookmarks** (not text notes or assets)
+- **Non-archived** bookmarks
+- **Latest N bookmarks** (where N is COMPARE_LIMIT)
+- **In model-vs-existing mode**: Only bookmarks with existing AI tags (tags with `attachedBy: "ai"`)
+
+## Architecture
+
+This tool leverages Karakeep's shared infrastructure:
+- **API Client**: Uses `@karakeep/sdk` for type-safe API interactions with proper authentication
+- **Inference**: Reuses `@karakeep/shared/inference` for OpenAI client with structured output support
+- **Prompts**: Uses `@karakeep/shared/prompts` for consistent tagging prompt generation with token management
+- No code duplication - all core functionality is shared with the main Karakeep application
+
+
+## Error Handling
+
+- If a model fails to generate tags for a bookmark, an error is shown and comparison continues
+- Errors are counted separately in final results
+- Missing required environment variables will cause the tool to exit with a clear error message
+
+## Build
+
+To compile the tool to standalone JavaScript:
+
+```bash
+pnpm build
+```
+
+The compiled entry point will be at `dist/index.js`.
+
+## Notes
+
+- The tool is designed for manual, human-in-the-loop evaluation
+- No results are persisted - they're only displayed in console
+- Content is fetched with `includeContent=true` from Karakeep API
+- Uses Karakeep SDK (`@karakeep/sdk`) for type-safe API interactions
+- Inference runs sequentially to keep state management simple
+- Recommended to use `pnpm run run` for the best experience (uses tsx for development)
+- **Random shuffling**: For each bookmark, models are randomly assigned to "Model A" or "Model B" to eliminate position bias. The actual model names are only revealed in the final results.
diff --git a/tools/compare-models/package.json b/tools/compare-models/package.json
new file mode 100644
index 00000000..5a493bd2
--- /dev/null
+++ b/tools/compare-models/package.json
@@ -0,0 +1,24 @@
+{
+ "name": "@karakeep/compare-models",
+ "version": "0.1.0",
+ "description": "Standalone tool to compare tagging performance between AI models",
+ "bin": {
+ "compare-models": "dist/index.js"
+ },
+ "scripts": {
+ "build": "tsc && chmod +x dist/index.js",
+ "run": "tsx --env-file=./.env src/index.ts",
+ "typecheck": "tsc --noEmit"
+ },
+ "dependencies": {
+ "@karakeep/sdk": "workspace:^",
+ "@karakeep/shared": "workspace:^",
+ "chalk": "^5.3.0",
+ "zod": "^3.24.2"
+ },
+ "devDependencies": {
+ "@types/node": "^24",
+ "tsx": "^4.8.1",
+ "typescript": "^5.9"
+ }
+}
diff --git a/tools/compare-models/src/apiClient.ts b/tools/compare-models/src/apiClient.ts
new file mode 100644
index 00000000..1d9f799d
--- /dev/null
+++ b/tools/compare-models/src/apiClient.ts
@@ -0,0 +1,71 @@
+import { createKarakeepClient } from "@karakeep/sdk";
+
+import type { Bookmark } from "./types";
+import { config } from "./config";
+
+export class KarakeepAPIClient {
+ private readonly client: ReturnType<typeof createKarakeepClient>;
+
+ constructor() {
+ this.client = createKarakeepClient({
+ baseUrl: `${config.KARAKEEP_SERVER_ADDR}/api/v1/`,
+ headers: {
+ "Content-Type": "application/json",
+ authorization: `Bearer ${config.KARAKEEP_API_KEY}`,
+ },
+ });
+ }
+
+ async fetchBookmarks(limit: number): Promise<Bookmark[]> {
+ const bookmarks: Bookmark[] = [];
+ let cursor: string | null = null;
+ let hasMore = true;
+
+ while (hasMore && bookmarks.length < limit) {
+ const params: {
+ limit: number;
+ includeContent: true;
+ archived?: boolean;
+ cursor?: string;
+ } = {
+ limit: Math.min(limit - bookmarks.length, 50),
+ includeContent: true,
+ archived: false,
+ };
+
+ if (cursor) {
+ params.cursor = cursor;
+ }
+
+ const { data, response, error } = await this.client.GET("/bookmarks", {
+ params: {
+ query: params,
+ },
+ });
+
+ if (error) {
+ throw new Error(`Failed to fetch bookmarks: ${String(error)}`);
+ }
+
+ if (!response.ok) {
+ throw new Error(`Failed to fetch bookmarks: ${response.status}`);
+ }
+
+ const batchBookmarks = (data?.bookmarks || [])
+ .filter((b) => b.content?.type === "link")
+ .map((b) => ({
+ ...b,
+ tags: (b.tags || []).map((tag) => ({
+ name: tag.name,
+ attachedBy: tag.attachedBy,
+ })),
+ })) as Bookmark[];
+
+ bookmarks.push(...batchBookmarks);
+ cursor = data?.nextCursor || null;
+ hasMore = !!cursor;
+ }
+
+ return bookmarks.slice(0, limit);
+ }
+}
diff --git a/tools/compare-models/src/bookmarkProcessor.ts b/tools/compare-models/src/bookmarkProcessor.ts
new file mode 100644
index 00000000..4a1bbf0a
--- /dev/null
+++ b/tools/compare-models/src/bookmarkProcessor.ts
@@ -0,0 +1,68 @@
+import type { InferenceClient } from "@karakeep/shared/inference";
+import { buildTextPrompt } from "@karakeep/shared/prompts.server";
+
+import { inferTags } from "./inferenceClient";
+import type { Bookmark } from "./types";
+
+export async function extractBookmarkContent(
+ bookmark: Bookmark,
+): Promise<string> {
+ if (bookmark.content.type === "link") {
+ const parts = [];
+
+ if (bookmark.content.url) {
+ parts.push(`URL: ${bookmark.content.url}`);
+ }
+
+ if (bookmark.title) {
+ parts.push(`Title: ${bookmark.title}`);
+ }
+
+ if (bookmark.content.description) {
+ parts.push(`Description: ${bookmark.content.description}`);
+ }
+
+ if (bookmark.content.htmlContent) {
+ parts.push(`Content: ${bookmark.content.htmlContent}`);
+ }
+
+ return parts.join("\n");
+ }
+
+ if (bookmark.content.type === "text" && bookmark.content.text) {
+ return bookmark.content.text;
+ }
+
+ return "";
+}
+
+export async function runTaggingForModel(
+ bookmark: Bookmark,
+ inferenceClient: InferenceClient,
+ lang: string = "english",
+ contextLength: number = 8000,
+): Promise<string[]> {
+ const content = await extractBookmarkContent(bookmark);
+
+ if (!content) {
+ return [];
+ }
+
+ try {
+ // Use the shared prompt builder with empty custom prompts and default tag style
+ const prompt = await buildTextPrompt(
+ lang,
+ [], // No custom prompts for comparison tool
+ content,
+ contextLength,
+ "as-generated", // Use tags as generated by the model
+ );
+
+ const tags = await inferTags(inferenceClient, prompt);
+ return tags;
+ } catch (error) {
+ throw new Error(
+ `Failed to generate tags: ${error instanceof Error ? error.message : String(error)}`,
+ );
+ }
+}
diff --git a/tools/compare-models/src/config.ts b/tools/compare-models/src/config.ts
new file mode 100644
index 00000000..3a2d0d41
--- /dev/null
+++ b/tools/compare-models/src/config.ts
@@ -0,0 +1,34 @@
+import { z } from "zod";
+
+// Local config schema for compare-models tool
+const envSchema = z.object({
+ KARAKEEP_API_KEY: z.string().min(1),
+ KARAKEEP_SERVER_ADDR: z.string().url(),
+ MODEL1_NAME: z.string().min(1),
+ MODEL2_NAME: z.string().min(1).optional(),
+ OPENAI_API_KEY: z.string().min(1),
+ OPENAI_BASE_URL: z.string().url().optional(),
+ OPENAI_SERVICE_TIER: z.enum(["auto", "default", "flex"]).optional(),
+ COMPARISON_MODE: z
+ .enum(["model-vs-model", "model-vs-existing"])
+ .default("model-vs-model"),
+ COMPARE_LIMIT: z
+ .string()
+ .optional()
+ .transform((val) => (val ? parseInt(val, 10) : 10)),
+ INFERENCE_CONTEXT_LENGTH: z
+ .string()
+ .optional()
+ .transform((val) => (val ? parseInt(val, 10) : 8000)),
+ INFERENCE_MAX_OUTPUT_TOKENS: z
+ .string()
+ .optional()
+ .transform((val) => (val ? parseInt(val, 10) : 2048)),
+ INFERENCE_USE_MAX_COMPLETION_TOKENS: z
+ .string()
+ .optional()
+ .transform((val) => val === "true")
+ .default("false"),
+});
+
+export const config = envSchema.parse(process.env);
diff --git a/tools/compare-models/src/index.ts b/tools/compare-models/src/index.ts
new file mode 100644
index 00000000..88fc9249
--- /dev/null
+++ b/tools/compare-models/src/index.ts
@@ -0,0 +1,229 @@
+import chalk from "chalk";
+
+import type { ComparisonResult } from "./types";
+import { KarakeepAPIClient } from "./apiClient";
+import { runTaggingForModel } from "./bookmarkProcessor";
+import { config } from "./config";
+import { createInferenceClient } from "./inferenceClient";
+import {
+ askQuestion,
+ clearProgress,
+ close,
+ displayComparison,
+ displayError,
+ displayFinalResults,
+ displayProgress,
+} from "./interactive";
+
+interface VoteCounters {
+ model1Votes: number;
+ model2Votes: number;
+ skipped: number;
+ errors: number;
+ total: number;
+}
+
+interface ShuffleResult {
+ modelA: string;
+ modelB: string;
+ modelAIsModel1: boolean;
+}
+
+async function main() {
+ console.log(chalk.cyan("\n🚀 Karakeep Model Comparison Tool\n"));
+
+ const isExistingMode = config.COMPARISON_MODE === "model-vs-existing";
+
+ if (isExistingMode) {
+ console.log(
+ chalk.yellow(
+ `Mode: Comparing ${config.MODEL1_NAME} against existing AI tags\n`,
+ ),
+ );
+ } else {
+ if (!config.MODEL2_NAME) {
+ console.log(
+ chalk.red(
+ "\n✗ Error: MODEL2_NAME is required for model-vs-model comparison mode\n",
+ ),
+ );
+ process.exit(1);
+ }
+ console.log(
+ chalk.yellow(
+ `Mode: Comparing ${config.MODEL1_NAME} vs ${config.MODEL2_NAME}\n`,
+ ),
+ );
+ }
+
+ const apiClient = new KarakeepAPIClient();
+
+ displayProgress("Fetching bookmarks from Karakeep...");
+ let bookmarks = await apiClient.fetchBookmarks(config.COMPARE_LIMIT);
+ clearProgress();
+
+ // Filter bookmarks with AI tags if in existing mode
+ if (isExistingMode) {
+ bookmarks = bookmarks.filter(
+ (b) => b.tags.some((t) => t.attachedBy === "ai"),
+ );
+ console.log(
+ chalk.green(
+ `✓ Fetched ${bookmarks.length} link bookmarks with existing AI tags\n`,
+ ),
+ );
+ } else {
+ console.log(chalk.green(`✓ Fetched ${bookmarks.length} link bookmarks\n`));
+ }
+
+ if (bookmarks.length === 0) {
+ console.log(
+ chalk.yellow(
+ "\n⚠ No bookmarks found with AI tags. Please add some bookmarks with AI tags first.\n",
+ ),
+ );
+ return;
+ }
+
+ const counters: VoteCounters = {
+ model1Votes: 0,
+ model2Votes: 0,
+ skipped: 0,
+ errors: 0,
+ total: bookmarks.length,
+ };
+
+ const detailedResults: ComparisonResult[] = [];
+
+ for (let i = 0; i < bookmarks.length; i++) {
+ const bookmark = bookmarks[i];
+
+ displayProgress(
+ `[${i + 1}/${bookmarks.length}] Running inference on: ${bookmark.title || bookmark.content.title || "Untitled"}`,
+ );
+
+ let model1Tags: string[] = [];
+ let model2Tags: string[] = [];
+
+ // Get tags for model 1 (new model)
+ try {
+ const model1Client = createInferenceClient(config.MODEL1_NAME);
+ model1Tags = await runTaggingForModel(
+ bookmark,
+ model1Client,
+ "english",
+ config.INFERENCE_CONTEXT_LENGTH,
+ );
+ } catch (error) {
+ clearProgress();
+ displayError(
+ `${config.MODEL1_NAME} failed: ${error instanceof Error ? error.message : String(error)}`,
+ );
+ counters.errors++;
+ continue;
+ }
+
+ // Get tags for model 2 or existing AI tags
+ if (isExistingMode) {
+ // Use existing AI tags from the bookmark
+ model2Tags = bookmark.tags
+ .filter((t) => t.attachedBy === "ai")
+ .map((t) => t.name);
+ } else {
+ // Run inference with model 2
+ try {
+ const model2Client = createInferenceClient(config.MODEL2_NAME!);
+ model2Tags = await runTaggingForModel(
+ bookmark,
+ model2Client,
+ "english",
+ config.INFERENCE_CONTEXT_LENGTH,
+ );
+ } catch (error) {
+ clearProgress();
+ displayError(
+ `${config.MODEL2_NAME} failed: ${error instanceof Error ? error.message : String(error)}`,
+ );
+ counters.errors++;
+ continue;
+ }
+ }
+
+ clearProgress();
+
+ const model2Label = isExistingMode
+ ? "Existing AI Tags"
+ : config.MODEL2_NAME!;
+
+ const shuffleResult: ShuffleResult = {
+ modelA: config.MODEL1_NAME,
+ modelB: model2Label,
+ modelAIsModel1: Math.random() < 0.5,
+ };
+
+ if (!shuffleResult.modelAIsModel1) {
+ shuffleResult.modelA = model2Label;
+ shuffleResult.modelB = config.MODEL1_NAME;
+ }
+
+ const comparison: ComparisonResult = {
+ bookmark,
+ modelA: shuffleResult.modelA,
+ modelATags: shuffleResult.modelAIsModel1 ? model1Tags : model2Tags,
+ modelB: shuffleResult.modelB,
+ modelBTags: shuffleResult.modelAIsModel1 ? model2Tags : model1Tags,
+ };
+
+ displayComparison(i + 1, bookmarks.length, comparison, true);
+
+ const answer = await askQuestion(
+ "Which tags do you prefer? [1=Model A, 2=Model B, s=skip, q=quit] > ",
+ );
+
+ const normalizedAnswer = answer.toLowerCase();
+
+ if (normalizedAnswer === "q" || normalizedAnswer === "quit") {
+ console.log(chalk.yellow("\n⏸ Quitting early...\n"));
+ break;
+ }
+
+ if (normalizedAnswer === "1") {
+ comparison.winner = "modelA";
+ if (shuffleResult.modelAIsModel1) {
+ counters.model1Votes++;
+ } else {
+ counters.model2Votes++;
+ }
+ detailedResults.push(comparison);
+ } else if (normalizedAnswer === "2") {
+ comparison.winner = "modelB";
+ if (shuffleResult.modelAIsModel1) {
+ counters.model2Votes++;
+ } else {
+ counters.model1Votes++;
+ }
+ detailedResults.push(comparison);
+ } else {
+ comparison.winner = "skip";
+ counters.skipped++;
+ detailedResults.push(comparison);
+ }
+ }
+
+ close();
+
+ displayFinalResults({
+ model1Name: config.MODEL1_NAME,
+ model2Name: isExistingMode ? "Existing AI Tags" : config.MODEL2_NAME!,
+ model1Votes: counters.model1Votes,
+ model2Votes: counters.model2Votes,
+ skipped: counters.skipped,
+ errors: counters.errors,
+ total: counters.total,
+ });
+}
+
+main().catch((error) => {
+ console.error(chalk.red(`\n✗ Fatal error: ${error}\n`));
+ process.exit(1);
+});
diff --git a/tools/compare-models/src/inferenceClient.ts b/tools/compare-models/src/inferenceClient.ts
new file mode 100644
index 00000000..0a5ed8b5
--- /dev/null
+++ b/tools/compare-models/src/inferenceClient.ts
@@ -0,0 +1,46 @@
+import type { InferenceClient } from "@karakeep/shared/inference";
+import {
+ OpenAIInferenceClient,
+ type OpenAIInferenceConfig,
+} from "@karakeep/shared/inference";
+import { z } from "zod";
+
+import { config } from "./config";
+
+export function createInferenceClient(modelName: string): InferenceClient {
+ const inferenceConfig: OpenAIInferenceConfig = {
+ apiKey: config.OPENAI_API_KEY,
+ baseURL: config.OPENAI_BASE_URL,
+ serviceTier: config.OPENAI_SERVICE_TIER,
+ textModel: modelName,
+ imageModel: modelName, // Use same model for images if needed
+ contextLength: config.INFERENCE_CONTEXT_LENGTH,
+ maxOutputTokens: config.INFERENCE_MAX_OUTPUT_TOKENS,
+ useMaxCompletionTokens: config.INFERENCE_USE_MAX_COMPLETION_TOKENS,
+ outputSchema: "structured",
+ };
+
+ return new OpenAIInferenceClient(inferenceConfig);
+}
+
+export async function inferTags(
+ inferenceClient: InferenceClient,
+ prompt: string,
+): Promise<string[]> {
+ const tagsSchema = z.object({
+ tags: z.array(z.string()),
+ });
+
+ const response = await inferenceClient.inferFromText(prompt, {
+ schema: tagsSchema,
+ });
+
+ const parsed = tagsSchema.safeParse(JSON.parse(response.response));
+ if (!parsed.success) {
+ throw new Error(
+ `Failed to parse model response: ${parsed.error.message}`,
+ );
+ }
+
+ return parsed.data.tags;
+}
diff --git a/tools/compare-models/src/interactive.ts b/tools/compare-models/src/interactive.ts
new file mode 100644
index 00000000..b93fc1d7
--- /dev/null
+++ b/tools/compare-models/src/interactive.ts
@@ -0,0 +1,128 @@
+import * as readline from "node:readline";
+import chalk from "chalk";
+
+import type { ComparisonResult } from "./types";
+
+const rl = readline.createInterface({
+ input: process.stdin,
+ output: process.stdout,
+});
+
+export async function askQuestion(question: string): Promise<string> {
+ return new Promise((resolve) => {
+ rl.question(question, (answer) => {
+ resolve(answer.trim());
+ });
+ });
+}
+
+export function displayComparison(
+ index: number,
+ total: number,
+ result: ComparisonResult,
+ blind: boolean = true,
+): void {
+ const divider = chalk.gray("─".repeat(80));
+ const header = chalk.bold.cyan(`\n=== Bookmark ${index}/${total} ===`);
+ const title = chalk.bold.white(result.bookmark.title || "Untitled");
+ const url = result.bookmark.content.url
+ ? chalk.gray(result.bookmark.content.url)
+ : "";
+ const content = chalk.gray(
+ result.bookmark.content.description
+ ? result.bookmark.content.description.substring(0, 200) + "..."
+ : "",
+ );
+
+ const modelAName = blind ? "Model A" : result.modelA;
+ const modelBName = blind ? "Model B" : result.modelB;
+
+ const modelATags = result.modelATags
+ .map((tag) => chalk.green(` • ${tag}`))
+ .join("\n");
+ const modelBTags = result.modelBTags
+ .map((tag) => chalk.yellow(` • ${tag}`))
+ .join("\n");
+
+ console.log(header);
+ console.log(title);
+ if (url) console.log(url);
+ if (content) console.log(content);
+ console.log(divider);
+ console.log();
+ console.log(chalk.green(`${modelAName}:`));
+ if (modelATags) {
+ console.log(modelATags);
+ } else {
+ console.log(chalk.gray(" (no tags)"));
+ }
+ console.log();
+ console.log(chalk.yellow(`${modelBName}:`));
+ if (modelBTags) {
+ console.log(modelBTags);
+ } else {
+ console.log(chalk.gray(" (no tags)"));
+ }
+ console.log();
+}
+
+export function displayError(message: string): void {
+ console.log(chalk.red(`\n✗ Error: ${message}\n`));
+}
+
+export function displayProgress(message: string): void {
+ process.stdout.write(chalk.gray(message));
+}
+
+export function clearProgress(): void {
+ process.stdout.write("\r\x1b[K");
+}
+
+export function close(): void {
+ rl.close();
+}
+
+export function displayFinalResults(results: {
+ model1Name: string;
+ model2Name: string;
+ model1Votes: number;
+ model2Votes: number;
+ skipped: number;
+ errors: number;
+ total: number;
+}): void {
+ const winner =
+ results.model1Votes > results.model2Votes
+ ? results.model1Name
+ : results.model2Votes > results.model1Votes
+ ? results.model2Name
+ : "TIE";
+
+ const divider = chalk.gray("─".repeat(80));
+ const header = chalk.bold.cyan("\n=== FINAL RESULTS ===");
+ const model1Line = chalk.green(
+ `${results.model1Name}: ${results.model1Votes} votes`,
+ );
+ const model2Line = chalk.yellow(
+ `${results.model2Name}: ${results.model2Votes} votes`,
+ );
+ const skippedLine = chalk.gray(`Skipped: ${results.skipped}`);
+ const errorsLine = chalk.red(`Errors: ${results.errors}`);
+ const totalLine = chalk.bold(`Total bookmarks tested: ${results.total}`);
+ const winnerLine =
+ winner === "TIE"
+ ? chalk.bold.cyan(`\n🏁 RESULT: TIE`)
+ : chalk.bold.green(`\n🏆 WINNER: ${winner}`);
+
+ console.log(divider);
+ console.log(header);
+ console.log(divider);
+ console.log(model1Line);
+ console.log(model2Line);
+ console.log(skippedLine);
+ console.log(errorsLine);
+ console.log(divider);
+ console.log(totalLine);
+ console.log(winnerLine);
+ console.log(divider);
+}
diff --git a/tools/compare-models/src/types.ts b/tools/compare-models/src/types.ts
new file mode 100644
index 00000000..35a677ae
--- /dev/null
+++ b/tools/compare-models/src/types.ts
@@ -0,0 +1,38 @@
+export interface Bookmark {
+ id: string;
+ title: string | null;
+ content: {
+ type: string;
+ title: string;
+ url?: string;
+ text?: string;
+ htmlContent?: string;
+ description?: string;
+ };
+ tags: Array<{ name: string; attachedBy?: "ai" | "human" }>;
+}
+
+export interface ModelConfig {
+ name: string;
+ apiKey: string;
+ baseUrl?: string;
+}
+
+export interface ComparisonResult {
+ bookmark: Bookmark;
+ modelA: string;
+ modelATags: string[];
+ modelB: string;
+ modelBTags: string[];
+ winner?: "modelA" | "modelB" | "skip";
+}
+
+export interface FinalResults {
+ model1Name: string;
+ model2Name: string;
+ model1Votes: number;
+ model2Votes: number;
+ skipped: number;
+ errors: number;
+ total: number;
+}
diff --git a/tools/compare-models/tsconfig.json b/tools/compare-models/tsconfig.json
new file mode 100644
index 00000000..edeec809
--- /dev/null
+++ b/tools/compare-models/tsconfig.json
@@ -0,0 +1,24 @@
+{
+ "$schema": "https://json.schemastore.org/tsconfig",
+ "extends": "@tsconfig/node22/tsconfig.json",
+ "compilerOptions": {
+ "target": "ES2022",
+ "module": "CommonJS",
+ "lib": ["ES2022"],
+ "outDir": "dist",
+ "rootDir": "src",
+ "moduleResolution": "node",
+ "esModuleInterop": true,
+ "allowSyntheticDefaultImports": true,
+ "strict": true,
+ "skipLibCheck": true,
+ "allowJs": true,
+ "resolveJsonModule": true,
+ "isolatedModules": true,
+ "noEmit": false,
+ "declaration": false,
+ "sourceMap": false
+ },
+ "include": ["src/**/*"],
+ "exclude": ["node_modules", "dist"]
+}