From 9f287d0f71899020a6206a2e0c8dd83eb02e9ce5 Mon Sep 17 00:00:00 2001
From: "Dustin C. Hatch"
Date: Mon, 4 Nov 2024 20:46:03 -0600
Subject: [PATCH] v-m/alerts: Add alerts for backup RAID array

Just like I did with the RAID-1 array in the old BURP server, I will
keep one member active and one in the fireproof safe, swapping them each
month. We can use the same metrics queries to alert on when the swap
should happen that we used with the BURP server.
---
Reviewer notes (not part of the commit; free text between the "---" line
and the diffstat is commentary only):

* mdraid missing disk: the exclusion changes from the regex match
  instance!~"burp.*" to an exact match on chromie.pyrocufflink.blue.
  The "mdraid failed disk" rule is unchanged and carries no exclusion.
* disks need swapped: subtracts from time() the timestamp of the last
  change of the type="active" series for chromie, looking back 90d, and
  fires when the difference exceeds 86400 * 30 seconds (30 days). The
  "or last_over_time(...)[1d]" branch supplies a value when the plain
  selector returns nothing -- presumably to bridge scrape gaps; confirm.
  NOTE(review): confirm what tlast_change_over_time returns when the
  value has not changed at all inside the 90d window; if it yields no
  sample, the alert would clear instead of staying active.
* disk needs archived: fires when the sum of the type="missing" and
  type="spare" series for chromie is below 1, i.e. neither series
  reports a disk.
* Neither new rule sets "for:". NOTE(review): confirm that firing on the
  first matching evaluation is intended.

 victoria-metrics/alerts.yml | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/victoria-metrics/alerts.yml b/victoria-metrics/alerts.yml
index ae21f58..fb7a334 100644
--- a/victoria-metrics/alerts.yml
+++ b/victoria-metrics/alerts.yml
@@ -60,10 +60,43 @@ groups:
 - name: mdraid
   rules:
   - alert: mdraid missing disk
-    expr: collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0
+    expr: collectd_md_md_disks{type="missing", instance!="chromie.pyrocufflink.blue"} != 0
   - alert: mdraid failed disk
     expr: collectd_md_md_disks{type="failed"} != 0
 
+- name: Backups
+  rules:
+  - alert: disks need swapped
+    expr:
+      time() - tlast_change_over_time(
+        (
+          collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type="active"}
+          or last_over_time(collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type="active"})[1d]
+        )[90d]
+      ) > 86400 * 30
+    annotations:
+      summary: The disks in the backup array need swapped
+      description: >-
+        The disks in the backup RAID-1 (mirror) array should be swapped
+        periodically. One disk should be online and mounted while the other
+        is stored in the fireproof safe. Switching them ensures that even if
+        something happens to the active disk, such as hardware failure, power
+        surge, fire, or accidental `rm -rf`, the offline disk is only out of
+        date by a few weeks.
+  - alert: disk needs archived
+    expr:
+      sum(
+        collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type=~"missing|spare"}
+      ) < 1
+    annotations:
+      summary: One of the disks in the backup array should be archived
+      description: >-
+        The disks in the backup RAID-1 (mirror) array should be swapped
+        periodically. One disk should be online and mounted while the other
+        is stored in the fireproof safe. All of the disks are currently
+        online; one needs to be disconnected and moved to the safe as soon as
+        possible.
+
 - name: certificates
   rules:
   - alert: certificate will expire soon