author     Mohamed Bassem <me@mbassem.com>    2026-02-04 13:45:32 +0000
committer  GitHub <noreply@github.com>        2026-02-04 13:45:32 +0000
commit     538035c452bfbc042961b199c0f44e733c88bfab (patch)
tree       b401a004a0ca3103264261170602fbf2822df14e /apps
parent     93ad2e2001eb7070df50b0ab51dfd3e1ab377629 (diff)
download   karakeep-538035c452bfbc042961b199c0f44e733c88bfab.tar.zst
feat: add extra instrumentation in the otel traces (#2453)
Diffstat (limited to 'apps')
-rw-r--r--  apps/workers/network.ts                        9
-rw-r--r--  apps/workers/workers/crawlerWorker.ts        126
-rw-r--r--  apps/workers/workers/inference/summarize.ts   32
-rw-r--r--  apps/workers/workers/inference/tagging.ts     37
4 files changed, 178 insertions(+), 26 deletions(-)
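
The change wraps each worker step in withSpan and enriches in-flight spans with setSpanAttributes, both imported from "@karakeep/shared-server". Their implementations are not part of this diff; the following is a minimal sketch of what such helpers could look like on top of "@opentelemetry/api", with names and signatures assumed from the call sites below:

    import { SpanStatusCode, trace, type Tracer } from "@opentelemetry/api";

    type Attrs = Record<string, string | number | boolean | undefined>;

    // Drop undefined values: OTel attribute values must be defined.
    function defined(attrs: Attrs) {
      return Object.fromEntries(
        Object.entries(attrs).filter(([, v]) => v !== undefined),
      ) as Record<string, string | number | boolean>;
    }

    // Assumed shape of withSpan: start an active span, mark failures, always end it.
    async function withSpan<T>(
      tracer: Tracer,
      name: string,
      opts: { attributes: Attrs },
      fn: () => Promise<T>,
    ): Promise<T> {
      return tracer.startActiveSpan(
        name,
        { attributes: defined(opts.attributes) },
        async (span) => {
          try {
            return await fn();
          } catch (e) {
            span.setStatus({ code: SpanStatusCode.ERROR });
            throw e;
          } finally {
            span.end();
          }
        },
      );
    }

    // Assumed shape of setSpanAttributes: attach attributes to the active span, if any.
    function setSpanAttributes(attrs: Attrs) {
      trace.getActiveSpan()?.setAttributes(defined(attrs));
    }
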
diff --git a/apps/workers/network.ts b/apps/workers/network.ts
index 4bc30562..2ef8483f 100644
--- a/apps/workers/network.ts
+++ b/apps/workers/network.ts
@@ -86,6 +86,15 @@ function isAddressForbidden(address: string): boolean {
return DISALLOWED_IP_RANGES.has(parsed.range());
}
+export function getBookmarkDomain(url?: string | null): string | undefined {
+ if (!url) return undefined;
+ try {
+ return new URL(url).hostname;
+ } catch {
+ return undefined;
+ }
+}
+
export type UrlValidationResult =
| { ok: true; url: URL }
| { ok: false; reason: string };
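
For reference, the new getBookmarkDomain helper above degrades gracefully: it yields a hostname for valid URLs and undefined otherwise, so a malformed link never throws inside span setup. Illustrative calls:

    getBookmarkDomain("https://news.ycombinator.com/item?id=1"); // "news.ycombinator.com"
    getBookmarkDomain("not a url"); // undefined (the URL constructor throws, caught)
    getBookmarkDomain(null);        // undefined
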
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 597d45d3..d3c20b7c 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -30,6 +30,7 @@ import metascraperYoutube from "metascraper-youtube";
import { crawlerStatusCodeCounter, workerStatsCounter } from "metrics";
import {
fetchWithProxy,
+ getBookmarkDomain,
getRandomProxy,
matchesNoProxy,
validateUrl,
@@ -57,6 +58,7 @@ import {
LinkCrawlerQueue,
OpenAIQueue,
QuotaService,
+ setSpanAttributes,
triggerSearchReindex,
triggerWebhook,
VideoWorkerQueue,
@@ -435,7 +437,13 @@ async function browserlessCrawlPage(
return await withSpan(
tracer,
"crawlerWorker.browserlessCrawlPage",
- { attributes: { url, jobId } },
+ {
+ attributes: {
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ },
+ },
async () => {
logger.info(
`[Crawler][${jobId}] Running in browserless mode. Will do a plain http request to "${url}". Screenshots will be disabled.`,
@@ -473,7 +481,15 @@ async function crawlPage(
return await withSpan(
tracer,
"crawlerWorker.crawlPage",
- { attributes: { url, jobId, userId, forceStorePdf } },
+ {
+ attributes: {
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ "user.id": userId,
+ "crawler.forceStorePdf": forceStorePdf,
+ },
+ },
async () => {
// Check user's browser crawling setting
const userData = await db.query.users.findFirst({
@@ -717,7 +733,13 @@ async function extractMetadata(
return await withSpan(
tracer,
"crawlerWorker.extractMetadata",
- { attributes: { url, jobId } },
+ {
+ attributes: {
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ },
+ },
async () => {
logger.info(
`[Crawler][${jobId}] Will attempt to extract metadata from page ...`,
@@ -745,7 +767,13 @@ async function extractReadableContent(
return await withSpan(
tracer,
"crawlerWorker.extractReadableContent",
- { attributes: { url, jobId } },
+ {
+ attributes: {
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ },
+ },
async () => {
logger.info(
`[Crawler][${jobId}] Will attempt to extract readable content ...`,
@@ -790,9 +818,9 @@ async function storeScreenshot(
"crawlerWorker.storeScreenshot",
{
attributes: {
- jobId,
- userId,
- size: screenshot?.byteLength ?? 0,
+ "job.id": jobId,
+ "user.id": userId,
+ "asset.size": screenshot?.byteLength ?? 0,
},
},
async () => {
@@ -849,9 +877,9 @@ async function storePdf(
"crawlerWorker.storePdf",
{
attributes: {
- jobId,
- userId,
- size: pdf?.byteLength ?? 0,
+ "job.id": jobId,
+ "user.id": userId,
+ "asset.size": pdf?.byteLength ?? 0,
},
},
async () => {
@@ -902,7 +930,15 @@ async function downloadAndStoreFile(
return await withSpan(
tracer,
"crawlerWorker.downloadAndStoreFile",
- { attributes: { url, jobId, userId, fileType } },
+ {
+ attributes: {
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ "user.id": userId,
+ "asset.type": fileType,
+ },
+ },
async () => {
let assetPath: string | undefined;
try {
@@ -1018,7 +1054,14 @@ async function archiveWebpage(
return await withSpan(
tracer,
"crawlerWorker.archiveWebpage",
- { attributes: { url, jobId, userId } },
+ {
+ attributes: {
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ "user.id": userId,
+ },
+ },
async () => {
logger.info(`[Crawler][${jobId}] Will attempt to archive page ...`);
const assetId = newAssetId();
@@ -1103,7 +1146,13 @@ async function getContentType(
return await withSpan(
tracer,
"crawlerWorker.getContentType",
- { attributes: { url, jobId } },
+ {
+ attributes: {
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ },
+ },
async () => {
try {
logger.info(
@@ -1113,8 +1162,14 @@ async function getContentType(
method: "GET",
signal: AbortSignal.any([AbortSignal.timeout(5000), abortSignal]),
});
+ setSpanAttributes({
+ "crawler.getContentType.statusCode": response.status,
+ });
const rawContentType = response.headers.get("content-type");
const contentType = normalizeContentType(rawContentType);
+ setSpanAttributes({
+ "crawler.contentType": contentType ?? undefined,
+ });
logger.info(
`[Crawler][${jobId}] Content-type for the url ${url} is "${contentType}"`,
);
@@ -1148,7 +1203,16 @@ async function handleAsAssetBookmark(
return await withSpan(
tracer,
"crawlerWorker.handleAsAssetBookmark",
- { attributes: { url, jobId, userId, bookmarkId, assetType } },
+ {
+ attributes: {
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ "user.id": userId,
+ "bookmark.id": bookmarkId,
+ "asset.type": assetType,
+ },
+ },
async () => {
const downloaded = await downloadAndStoreFile(
url,
@@ -1218,9 +1282,11 @@ async function storeHtmlContent(
"crawlerWorker.storeHtmlContent",
{
attributes: {
- jobId,
- userId,
- contentSize: htmlContent ? Buffer.byteLength(htmlContent, "utf8") : 0,
+ "job.id": jobId,
+ "user.id": userId,
+ "bookmark.content.size": htmlContent
+ ? Buffer.byteLength(htmlContent, "utf8")
+ : 0,
},
},
async () => {
@@ -1302,13 +1368,14 @@ async function crawlAndParseUrl(
"crawlerWorker.crawlAndParseUrl",
{
attributes: {
- url,
- jobId,
- userId,
- bookmarkId,
- archiveFullPage,
- forceStorePdf,
- hasPrecrawledArchive: !!precrawledArchiveAssetId,
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ "user.id": userId,
+ "bookmark.id": bookmarkId,
+ "crawler.archiveFullPage": archiveFullPage,
+ "crawler.forceStorePdf": forceStorePdf,
+ "crawler.hasPrecrawledArchive": !!precrawledArchiveAssetId,
},
},
async () => {
@@ -1357,6 +1424,9 @@ async function crawlAndParseUrl(
// Track status code in Prometheus
if (statusCode !== null) {
crawlerStatusCodeCounter.labels(statusCode.toString()).inc();
+ setSpanAttributes({
+ "crawler.statusCode": statusCode,
+ });
}
const meta = await Promise.race([
@@ -1576,7 +1646,13 @@ async function checkDomainRateLimit(url: string, jobId: string): Promise<void> {
return await withSpan(
tracer,
"crawlerWorker.checkDomainRateLimit",
- { attributes: { url, jobId } },
+ {
+ attributes: {
+ "bookmark.url": url,
+ "bookmark.domain": getBookmarkDomain(url),
+ "job.id": jobId,
+ },
+ },
async () => {
const crawlerDomainRateLimitConfig =
serverConfig.crawler.domainRatelimiting;
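
Beyond the new attributes, the crawler spans also migrate flat keys (jobId, userId, size) to lowercase dot-namespaced ones (job.id, user.id, asset.size), in line with OpenTelemetry's attribute-naming guidance. Namespacing groups related keys under one prefix, e.g. everything crawler-specific under crawler.* (values below are illustrative):

    setSpanAttributes({
      "crawler.statusCode": 200,
      "crawler.contentType": "text/html",
      "crawler.archiveFullPage": true,
    });
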
diff --git a/apps/workers/workers/inference/summarize.ts b/apps/workers/workers/inference/summarize.ts
index 094c46ca..922eb5b7 100644
--- a/apps/workers/workers/inference/summarize.ts
+++ b/apps/workers/workers/inference/summarize.ts
@@ -1,8 +1,13 @@
import { and, eq } from "drizzle-orm";
+import { getBookmarkDomain } from "network";
import { db } from "@karakeep/db";
import { bookmarks, customPrompts, users } from "@karakeep/db/schema";
-import { triggerSearchReindex, ZOpenAIRequest } from "@karakeep/shared-server";
+import {
+ setSpanAttributes,
+ triggerSearchReindex,
+ ZOpenAIRequest,
+} from "@karakeep/shared-server";
import serverConfig from "@karakeep/shared/config";
import { InferenceClient } from "@karakeep/shared/inference";
import logger from "@karakeep/shared/logger";
@@ -22,6 +27,7 @@ async function fetchBookmarkDetailsForSummary(bookmarkId: string) {
description: true,
htmlContent: true,
contentAssetId: true,
+ crawlStatusCode: true,
publisher: true,
author: true,
url: true,
@@ -65,6 +71,17 @@ export async function runSummarization(
},
});
+ setSpanAttributes({
+ "user.id": bookmarkData.userId,
+ "bookmark.id": bookmarkData.id,
+ "bookmark.url": bookmarkData.link?.url,
+ "bookmark.domain": getBookmarkDomain(bookmarkData.link?.url),
+ "bookmark.content.type": bookmarkData.type,
+ "crawler.statusCode": bookmarkData.link?.crawlStatusCode ?? undefined,
+ "inference.type": "summarization",
+ "inference.model": serverConfig.inference.textModel,
+ });
+
if (userSettings?.autoSummarizationEnabled === false) {
logger.debug(
`[inference][${jobId}] Skipping summarization job for bookmark with id "${bookmarkId}" because user has disabled auto-summarization.`,
@@ -121,6 +138,10 @@ URL: ${link.url ?? ""}
},
});
+ setSpanAttributes({
+ "inference.prompt.customCount": prompts.length,
+ });
+
const summaryPrompt = await buildSummaryPrompt(
userSettings?.inferredTagLang ?? serverConfig.inference.inferredTagLang,
prompts.map((p) => p.text),
@@ -128,6 +149,10 @@ URL: ${link.url ?? ""}
serverConfig.inference.contextLength,
);
+ setSpanAttributes({
+ "inference.prompt.size": Buffer.byteLength(summaryPrompt, "utf8"),
+ });
+
const summaryResult = await inferenceClient.inferFromText(summaryPrompt, {
schema: null, // Summaries are typically free-form text
abortSignal: job.abortSignal,
@@ -139,6 +164,11 @@ URL: ${link.url ?? ""}
);
}
+ setSpanAttributes({
+ "inference.summary.size": Buffer.byteLength(summaryResult.response, "utf8"),
+ "inference.totalTokens": summaryResult.totalTokens,
+ });
+
logger.info(
`[inference][${jobId}] Generated summary for bookmark "${bookmarkId}" using ${summaryResult.totalTokens} tokens.`,
);
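
Note that the prompt and summary sizes above are recorded in UTF-8 bytes via Buffer.byteLength, not in characters, so multi-byte text counts by its encoded size:

    Buffer.byteLength("café", "utf8"); // 5 bytes for 4 characters
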
diff --git a/apps/workers/workers/inference/tagging.ts b/apps/workers/workers/inference/tagging.ts
index 376eab14..b3006193 100644
--- a/apps/workers/workers/inference/tagging.ts
+++ b/apps/workers/workers/inference/tagging.ts
@@ -1,4 +1,5 @@
import { and, eq, inArray } from "drizzle-orm";
+import { getBookmarkDomain } from "network";
import { buildImpersonatingTRPCClient } from "trpc";
import { z } from "zod";
@@ -17,6 +18,7 @@ import {
users,
} from "@karakeep/db/schema";
import {
+ setSpanAttributes,
triggerRuleEngineOnEvent,
triggerSearchReindex,
triggerWebhook,
@@ -150,6 +152,9 @@ async function inferTagsFromImage(
}
const base64 = asset.toString("base64");
+ setSpanAttributes({
+ "inference.model": serverConfig.inference.imageModel,
+ });
return inferenceClient.inferFromImage(
buildImagePrompt(
inferredTagLang,
@@ -176,6 +181,10 @@ async function fetchCustomPrompts(
},
});
+ setSpanAttributes({
+ "inference.prompt.customCount": prompts.length,
+ });
+
let promptTexts = prompts.map((p) => p.text);
if (containsTagsPlaceholder(prompts)) {
promptTexts = await replaceTagsPlaceholders(promptTexts, userId);
@@ -234,6 +243,12 @@ async function inferTagsFromPDF(
serverConfig.inference.contextLength,
tagStyle,
);
+ setSpanAttributes({
+ "inference.model": serverConfig.inference.textModel,
+ });
+ setSpanAttributes({
+ "inference.prompt.size": Buffer.byteLength(prompt, "utf8"),
+ });
return inferenceClient.inferFromText(prompt, {
schema: openAIResponseSchema,
abortSignal,
@@ -251,6 +266,12 @@ async function inferTagsFromText(
if (!prompt) {
return null;
}
+ setSpanAttributes({
+ "inference.model": serverConfig.inference.textModel,
+ });
+ setSpanAttributes({
+ "inference.prompt.size": Buffer.byteLength(prompt, "utf8"),
+ });
return await inferenceClient.inferFromText(prompt, {
schema: openAIResponseSchema,
abortSignal,
@@ -265,6 +286,18 @@ async function inferTags(
tagStyle: ZTagStyle,
inferredTagLang: string,
) {
+ setSpanAttributes({
+ "user.id": bookmark.userId,
+ "bookmark.id": bookmark.id,
+ "bookmark.url": bookmark.link?.url,
+ "bookmark.domain": getBookmarkDomain(bookmark.link?.url),
+ "bookmark.content.type": bookmark.type,
+ "crawler.statusCode": bookmark.link?.crawlStatusCode ?? undefined,
+ "inference.tagging.style": tagStyle,
+ "inference.lang": inferredTagLang,
+ "inference.type": "tagging",
+ });
+
let response: InferenceResponse | null;
if (bookmark.link || bookmark.text) {
response = await inferTagsFromText(
@@ -325,6 +358,10 @@ async function inferTags(
}
return tag.trim();
});
+ setSpanAttributes({
+ "inference.tagging.numGeneratedTags": tags.length,
+ "inference.totalTokens": response.totalTokens,
+ });
return tags;
} catch (e) {