diff --git a/group_vars/metricspi/alerts.yml b/group_vars/metricspi/alerts.yml
index cb59042..03b253c 100644
--- a/group_vars/metricspi/alerts.yml
+++ b/group_vars/metricspi/alerts.yml
@@ -46,3 +46,27 @@ vmalert_rules:
       expr: collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0
     - alert: mdraid failed disk
       expr: collectd_md_md_disks{type="failed"} != 0
+
+  - name: BURP RAID
+    rules:
+    - alert: disks need swapped
+      # Alert once collectd_md_md_disks{type="active"} on burp1 has not
+      # changed for more than 30 days.  The outer subquery window must be
+      # longer than that threshold; otherwise the age of the last change can
+      # never exceed it and the alert can never fire.
+      expr:
+        time() - tlast_change_over_time(
+          (
+            collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"}
+            or last_over_time(collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"})[1d]
+          )[60d]
+        ) > 86400 * 30
+      annotations:
+        summary: The disks in the BURP array need swapped
+        description: >-
+          The disks in the BURP RAID-1 (mirror) array should be swapped
+          periodically. One disk should be online and mounted while the other
+          is stored in the fireproof safe. Switching them ensures that even if
+          something happens to the active disk, such as hardware failure, power
+          surge, fire, or accidental `rm -rf`, the offline disk is only out of
+          date by a few weeks.