From 1b8129a28191c7093818060e39e968fc16bf24b4 Mon Sep 17 00:00:00 2001
From: Mohamed Bassem
Date: Sun, 9 Nov 2025 09:02:28 +0000
Subject: feat: add failed_permanent metric for worker monitoring (#2107)

* feat: add last failure timestamp metric for worker monitoring

Add a Prometheus Gauge metric to track the timestamp of the last failure
for each worker. This complements the existing failed job counter by
providing visibility into when failures last occurred for monitoring and
alerting purposes.

Changes:
- Added workerLastFailureGauge metric in metrics.ts
- Updated all 9 workers to set the gauge on failure:
  - crawler, feed, webhook, assetPreProcessing
  - inference, adminMaintenance, ruleEngine
  - video, search

* refactor: track both all failures and permanent failures with counter

Remove the gauge metric and use the existing counter to track both:
- All failures (including retry attempts): status="failed"
- Permanent failures (retries exhausted): status="failed_permanent"

This provides better visibility into retry behavior and permanent vs
temporary failures without adding a separate metric.

Changes: - Removed workerLastFailureGauge from metrics.ts - Updated all 9 workers to track failed_permanent when numRetriesLeft == 0 - Maintained existing failed counter for all failure attempts * style: format worker files with prettier --------- Co-authored-by: Claude --- apps/workers/workers/adminMaintenanceWorker.ts | 8 ++++++++ apps/workers/workers/assetPreprocessingWorker.ts | 5 +++++ apps/workers/workers/crawlerWorker.ts | 3 +++ apps/workers/workers/feedWorker.ts | 3 +++ apps/workers/workers/inference/inferenceWorker.ts | 1 + apps/workers/workers/ruleEngineWorker.ts | 3 +++ apps/workers/workers/searchWorker.ts | 3 +++ apps/workers/workers/videoWorker.ts | 3 +++ apps/workers/workers/webhookWorker.ts | 3 +++ 9 files changed, 32 insertions(+) (limited to 'apps') diff --git a/apps/workers/workers/adminMaintenanceWorker.ts b/apps/workers/workers/adminMaintenanceWorker.ts index 03c3b964..e5312964 100644 --- a/apps/workers/workers/adminMaintenanceWorker.ts +++ b/apps/workers/workers/adminMaintenanceWorker.ts @@ -34,6 +34,14 @@ export class AdminMaintenanceWorker { workerStatsCounter .labels(`adminMaintenance:${job.data?.type}`, "failed") .inc(); + if (job.numRetriesLeft == 0) { + workerStatsCounter + .labels( + `adminMaintenance:${job.data?.type}`, + "failed_permanent", + ) + .inc(); + } logger.error( `[adminMaintenance:${job.data?.type}][${job.id}] Job failed: ${job.error}\n${job.error.stack}`, ); diff --git a/apps/workers/workers/assetPreprocessingWorker.ts b/apps/workers/workers/assetPreprocessingWorker.ts index 2dcec59b..42c0ff01 100644 --- a/apps/workers/workers/assetPreprocessingWorker.ts +++ b/apps/workers/workers/assetPreprocessingWorker.ts @@ -47,6 +47,11 @@ export class AssetPreprocessingWorker { }, onError: async (job) => { workerStatsCounter.labels("assetPreProcessing", "failed").inc(); + if (job.numRetriesLeft == 0) { + workerStatsCounter + .labels("assetPreProcessing", "failed_permanent") + .inc(); + } const jobId = job.id; logger.error( 
`[assetPreprocessing][${jobId}] Asset preprocessing failed: ${job.error}\n${job.error.stack}`, diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index 357ae976..5b49b23e 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -313,6 +313,9 @@ export class CrawlerWorker { }, onError: async (job) => { workerStatsCounter.labels("crawler", "failed").inc(); + if (job.numRetriesLeft == 0) { + workerStatsCounter.labels("crawler", "failed_permanent").inc(); + } const jobId = job.id; logger.error( `[Crawler][${jobId}] Crawling job failed: ${job.error}\n${job.error.stack}`, diff --git a/apps/workers/workers/feedWorker.ts b/apps/workers/workers/feedWorker.ts index 0dfc5399..57358880 100644 --- a/apps/workers/workers/feedWorker.ts +++ b/apps/workers/workers/feedWorker.ts @@ -67,6 +67,9 @@ export class FeedWorker { }, onError: async (job) => { workerStatsCounter.labels("feed", "failed").inc(); + if (job.numRetriesLeft == 0) { + workerStatsCounter.labels("feed", "failed_permanent").inc(); + } const jobId = job.id; logger.error( `[feed][${jobId}] Feed fetch job failed: ${job.error}\n${job.error.stack}`, diff --git a/apps/workers/workers/inference/inferenceWorker.ts b/apps/workers/workers/inference/inferenceWorker.ts index 065462b3..eefc1dd8 100644 --- a/apps/workers/workers/inference/inferenceWorker.ts +++ b/apps/workers/workers/inference/inferenceWorker.ts @@ -56,6 +56,7 @@ export class OpenAiWorker { `[inference][${jobId}] inference job failed: ${job.error}\n${job.error.stack}`, ); if (job.numRetriesLeft == 0) { + workerStatsCounter.labels("inference", "failed_permanent").inc(); await attemptMarkStatus(job?.data, "failure"); } }, diff --git a/apps/workers/workers/ruleEngineWorker.ts b/apps/workers/workers/ruleEngineWorker.ts index 37c7f595..98a9de74 100644 --- a/apps/workers/workers/ruleEngineWorker.ts +++ b/apps/workers/workers/ruleEngineWorker.ts @@ -29,6 +29,9 @@ export class RuleEngineWorker 
{ }, onError: (job) => { workerStatsCounter.labels("ruleEngine", "failed").inc(); + if (job.numRetriesLeft == 0) { + workerStatsCounter.labels("ruleEngine", "failed_permanent").inc(); + } const jobId = job.id; logger.error( `[ruleEngine][${jobId}] rule engine job failed: ${job.error}\n${job.error.stack}`, diff --git a/apps/workers/workers/searchWorker.ts b/apps/workers/workers/searchWorker.ts index 5824d963..fed30c9b 100644 --- a/apps/workers/workers/searchWorker.ts +++ b/apps/workers/workers/searchWorker.ts @@ -34,6 +34,9 @@ export class SearchIndexingWorker { }, onError: (job) => { workerStatsCounter.labels("search", "failed").inc(); + if (job.numRetriesLeft == 0) { + workerStatsCounter.labels("search", "failed_permanent").inc(); + } const jobId = job.id; logger.error( `[search][${jobId}] search job failed: ${job.error}\n${job.error.stack}`, diff --git a/apps/workers/workers/videoWorker.ts b/apps/workers/workers/videoWorker.ts index 8d3ac666..03525fdf 100644 --- a/apps/workers/workers/videoWorker.ts +++ b/apps/workers/workers/videoWorker.ts @@ -46,6 +46,9 @@ export class VideoWorker { }, onError: async (job) => { workerStatsCounter.labels("video", "failed").inc(); + if (job.numRetriesLeft == 0) { + workerStatsCounter.labels("video", "failed_permanent").inc(); + } const jobId = job.id; logger.error( `[VideoCrawler][${jobId}] Video Download job failed: ${job.error}`, diff --git a/apps/workers/workers/webhookWorker.ts b/apps/workers/workers/webhookWorker.ts index 472a27ed..0d661372 100644 --- a/apps/workers/workers/webhookWorker.ts +++ b/apps/workers/workers/webhookWorker.ts @@ -28,6 +28,9 @@ export class WebhookWorker { }, onError: async (job) => { workerStatsCounter.labels("webhook", "failed").inc(); + if (job.numRetriesLeft == 0) { + workerStatsCounter.labels("webhook", "failed_permanent").inc(); + } const jobId = job.id; logger.error( `[webhook][${jobId}] webhook job failed: ${job.error}\n${job.error.stack}`, -- cgit v1.2.3-70-g09d2