From dc2a05dc8fe24f258681c9b7545019932cab6690 Mon Sep 17 00:00:00 2001 From: "Dustin C. Hatch" Date: Tue, 11 Apr 2023 22:23:17 -0500 Subject: [PATCH] alerts: Add alert for BURP RAID array swap This alert counts how long it has been since the number of "active" disks in the RAID array on the BURP server has changed. The assumption is that the number will typically be `1`, but it will be `2` while the second disk is being synchronized before the swap occurs. The lookbehind window used to find the last change is kept longer than the 30-day threshold so that the measured age can actually exceed it. --- group_vars/metricspi/alerts.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/group_vars/metricspi/alerts.yml b/group_vars/metricspi/alerts.yml index cb59042..03b253c 100644 --- a/group_vars/metricspi/alerts.yml +++ b/group_vars/metricspi/alerts.yml @@ -46,3 +46,23 @@ vmalert_rules: expr: collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0 - alert: mdraid failed disk expr: collectd_md_md_disks{type="failed"} != 0 + + - name: BURP RAID + rules: + - alert: disks need swapped + expr: + time() - tlast_change_over_time( + ( + collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"} + or last_over_time(collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"})[1d] + )[90d] + ) > 86400 * 30 + annotations: + summary: The disks in the BURP array need swapped + description: >- + The disks in the BURP RAID-1 (mirror) array should be swapped + periodically. One disk should be online and mounted while the other + is stored in the fireproof safe. Switching them ensures that even if + something happens to the active disk, such as hardware failure, power + surge, fire, or accidental `rm -rf`, the offline disk is only out of + date by a few weeks.