From 3f9601dc94dc5e599ee22e6f5a8ca63c9c385d35 Mon Sep 17 00:00:00 2001 From: "Dustin C. Hatch" Date: Fri, 1 Nov 2024 18:00:50 -0500 Subject: [PATCH] v-m/alerts: Improve Paperless-ngx Celery task alert The `flower_events_total` metric is a counter, so its value only ever increases (discounting restarts of the server process). As such, nonzero values do not necessarily indicate a _current_ problem, but rather that there was one at some point in the past. To identify current issues, we need to use the `increase` function, and then apply the `max_over_time` function so that the alert doesn't immediately reset itself. --- victoria-metrics/alerts.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/victoria-metrics/alerts.yml b/victoria-metrics/alerts.yml index d793350..807870b 100644 --- a/victoria-metrics/alerts.yml +++ b/victoria-metrics/alerts.yml @@ -172,9 +172,13 @@ groups: rules: - alert: Celery tasks failed expr: >- - flower_events_total{job="paperless-ngx", type="task-failed"} > 0 + max_over_time( + increase( + flower_events_total{job="paperless-ngx", type="task-failed"} + )[24h] + ) > 0 annotations: - summary: One or more Celery tasks have failed + summary: Paperless-ngx Celery task failed description: >- Failing Celery tasks may indicate a problem with the Paperless-ngx deployment and can result in data loss. Check the Paperless-ngx logs