# Patch summary (explanatory preamble; text before the "diff --git" header
# is not part of any hunk).
#
# Target file: victoria-metrics/alerts.yml
#
# 1. Group "mdraid", rule "mdraid missing disk": the instance filter changes
#    from the regex exclusion instance!~"burp.*" to an exact-match exclusion
#    of instance "chromie.pyrocufflink.blue".  The "mdraid failed disk" rule
#    is unchanged context.
#
# 2. New group "Backups", inserted before the "certificates" group.  Both
#    rules select only instance="chromie.pyrocufflink.blue":
#
#    - "disks need swapped": fires when time() minus the timestamp returned
#      by tlast_change_over_time(...) over a 90d window exceeds 86400 * 30
#      seconds (30 days).  The inner expression is the type="active" series,
#      with `or last_over_time(...)[1d]` as a fallback when the plain
#      selector yields no sample (presumably to bridge gaps of up to one day
#      in the series -- confirm).
#
#    - "disk needs archived": fires when the sum of the series with
#      type=~"missing|spare" is below 1.  Per the rule's own description,
#      this is the state in which all disks of the array are online.
#
# NOTE(review): tlast_change_over_time and the colon-less `expr[1d]` form
#   look like MetricsQL extensions, so these rules presumably require
#   VictoriaMetrics/vmalert rather than stock Prometheus -- confirm.
# NOTE(review): the range is written as last_over_time(selector)[1d], not
#   last_over_time(selector[1d]); confirm the bracket placement is
#   intentional.
# NOTE(review): confirm what tlast_change_over_time returns when the value
#   did not change anywhere inside the 90d window, since "disks need
#   swapped" depends on it once the last swap is older than the window.
# NOTE(review): neither new rule has a `for:` clause in this hunk; confirm
#   that firing on the first matching evaluation is intended.
diff --git a/victoria-metrics/alerts.yml b/victoria-metrics/alerts.yml
index ae21f58..fb7a334 100644
--- a/victoria-metrics/alerts.yml
+++ b/victoria-metrics/alerts.yml
@@ -60,10 +60,43 @@ groups:
 - name: mdraid
   rules:
   - alert: mdraid missing disk
-    expr: collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0
+    expr: collectd_md_md_disks{type="missing", instance!="chromie.pyrocufflink.blue"} != 0
   - alert: mdraid failed disk
     expr: collectd_md_md_disks{type="failed"} != 0
 
+- name: Backups
+  rules:
+  - alert: disks need swapped
+    expr:
+      time() - tlast_change_over_time(
+        (
+          collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type="active"}
+          or last_over_time(collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type="active"})[1d]
+        )[90d]
+      ) > 86400 * 30
+    annotations:
+      summary: The disks in the backup array need swapped
+      description: >-
+        The disks in the backup RAID-1 (mirror) array should be swapped
+        periodically. One disk should be online and mounted while the other
+        is stored in the fireproof safe. Switching them ensures that even if
+        something happens to the active disk, such as hardware failure, power
+        surge, fire, or accidental `rm -rf`, the offline disk is only out of
+        date by a few weeks.
+  - alert: disk needs archived
+    expr:
+      sum(
+        collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type=~"missing|spare"}
+      ) < 1
+    annotations:
+      summary: One of the disks in the backup array should be archived
+      description: >-
+        The disks in the backup RAID-1 (mirror) array should be swapped
+        periodically. One disk should be online and mounted while the other
+        is stored in the fireproof safe. All of the disks are currently
+        online; one needs to be disconnected and moved to the safe as soon as
+        possible.
+
 - name: certificates
   rules:
   - alert: certificate will expire soon