From 8ecee4133f70696dd00d0dbabcc3e155568bb356 Mon Sep 17 00:00:00 2001
From: "Dustin C. Hatch"
Date: Sat, 2 Nov 2024 09:38:02 -0500
Subject: [PATCH] v-m/alerts: Rework free disk space alert

Fedora CoreOS fills `/boot` beyond the 75% alert threshold under normal
circumstances on aarch64 machines.  This is not a problem, because it
cleans up old files on its own, so we do not need to alert on it.

Unfortunately, the _DiskUsage_ alert is already quite complex, and
adding in exclusions for these devices would make it even worse.  To
simplify the logic, we can use a recording rule to precompute the
used/free space ratio.  By using `sum(...) without (type)` instead of
`sum(...) by (instance, df)`, we keep the other labels, which we can
then use to identify the metrics coming from machines we don't care to
monitor.

Instead of having different thresholds for different volumes encoded in
the same expression, we can use multiple alerts to alert on "low" vs
"very low" thresholds.  Since this will of course cause duplicate alerts
for most volumes, we can use AlertManager inhibition rules to disable
the "low" alert once the metric crosses the "very low" threshold.
---
Review notes (text between the "---" line and the first "diff --git" is
not part of the commit message):

* The alertname values in the inhibit rule are double-quoted because
  they contain spaces.  Alertmanager's UTF-8 matcher parser treats
  whitespace as a reserved character in unquoted values, so the unquoted
  form is only accepted through the classic-parser fallback.
* Line breaks and indentation in the hunks below were restored from a
  whitespace-collapsed copy of this patch.  The indentation of context
  lines is assumed (two-space, flush sequences) -- TODO confirm against
  the tree before applying.
* The "index" blob ID for the post-image of alertmanager.config.yml
  predates the quoting change.

 victoria-metrics/alertmanager.config.yml |  9 +++++++
 victoria-metrics/alerts.yml              | 31 +++++++++++++++++++++---
 victoria-metrics/kustomization.yaml      |  1 +
 victoria-metrics/recording.yml           |  8 ++++++
 4 files changed, 45 insertions(+), 4 deletions(-)
 create mode 100644 victoria-metrics/recording.yml

diff --git a/victoria-metrics/alertmanager.config.yml b/victoria-metrics/alertmanager.config.yml
index d31b9d0..ab391f3 100644
--- a/victoria-metrics/alertmanager.config.yml
+++ b/victoria-metrics/alertmanager.config.yml
@@ -31,3 +31,12 @@ route:
     - alertgroup=Frigate
     group_by:
     - alertname
+
+inhibit_rules:
+- source_matchers:
+  - alertname="Free disk space is very low"
+  target_matchers:
+  - alertname="Free disk space is low"
+  equal:
+  - instance
+  - df
diff --git a/victoria-metrics/alerts.yml b/victoria-metrics/alerts.yml
index d29d33a..0eb721a 100644
--- a/victoria-metrics/alerts.yml
+++ b/victoria-metrics/alerts.yml
@@ -1,12 +1,35 @@
 groups:
 - name: default alert
   rules:
-  - alert: DiskUsage
+  - alert: Free disk space is low
     expr: >-
-      sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df!="var-log", df!="var-lib-frigate"}) by (instance, df) > .75
-      or sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df="var-log"}) by (instance, df) > .95
-      or sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df="var-lib-frigate"}) by (instance, df) > .95
+      (
+        filesystem:usage:percent{
+          kubernetes_io_arch!="arm64",
+          df!="mmcblk0p3",
+          df!="var-lib-frigate",
+          df!="var-log",
+        }
+        or
+        filesystem:usage:percent{
+          kubernetes_io_arch="arm64",
+          df!="boot",
+        }
+        or
+        filesystem:usage:percent{
+          df="mmcblk0p3",
+          instance!="nut0.pyrocufflink.blue",
+        }
+      ) > .75
     for: 2h
+    annotations:
+      severity: minor
+  - alert: Free disk space is very low
+    expr: >-
+      filesystem:usage:percent > 0.9
+    for: 2h
+    annotations:
+      severity: minor
   - alert: TheWebsiteIsDown
     expr: >-
       probe_success{job="websites"} == 0
diff --git a/victoria-metrics/kustomization.yaml b/victoria-metrics/kustomization.yaml
index 9f5c132..6d52acc 100644
--- a/victoria-metrics/kustomization.yaml
+++ b/victoria-metrics/kustomization.yaml
@@ -38,6 +38,7 @@ configMapGenerator:
 - name: vmalert-rules
   files:
   - alerts.yml
+  - recording.yml
   options:
     disableNameSuffixHash: true
     labels:
diff --git a/victoria-metrics/recording.yml b/victoria-metrics/recording.yml
new file mode 100644
index 0000000..d2dbebc
--- /dev/null
+++ b/victoria-metrics/recording.yml
@@ -0,0 +1,8 @@
+groups:
+- name: collectd
+  rules:
+  - record: filesystem:usage:percent
+    expr: >-
+      sum without (type) (collectd_df_df_complex{type!="free"})
+      / sum without (type) (collectd_df_df_complex)
+