configpolicy/group_vars/metricspi/alerts.yml

124 lines
4.7 KiB
YAML

vmalert_rules:
groups:
- name: default alert
rules:
- alert: DiskUsage
expr: >-
sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df!="var-log"}) by (instance, df) > .75
or sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df="var-log"}) by (instance, df) > .95
for: 2h
- alert: TheWebsiteIsDown
expr: >-
probe_success{job="websites"} == 0
for: 10m
- alert: Missing Metrics
expr: >-
up{instance!~"vmhost.*"} == 0
for: 10m
- alert: NUT is offline
expr: >-
absent(collectd_nut_percent)
- name: Bitwarden
rules:
- alert: vaultwarden is not running
expr: >-
collectd_processes_ps_count_processes{processes="vaultwarden"} < 1
for: 5m
- name: Active Directory
rules:
- alert: samba is not running
expr: >-
collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"} < 1
for: 5m
- name: Graylog
rules:
- alert: unprocessed messages
expr: >-
org_graylog2_journal_entries_uncommitted > 100
for: 1h
- name: mdraid
rules:
- alert: mdraid missing disk
expr: collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0
- alert: mdraid failed disk
expr: collectd_md_md_disks{type="failed"} != 0
- name: BURP
rules:
- alert: no recent backups
expr: absent(burp_client_last_backup_timestamp)
for: 8h
annotations:
summary: No clients have been backed up recently
description: >-
This alert indicates that NO clients have been backed up within the
last day. There is likely a problem with the BURP server.
- alert: missed client backup
expr:
time() - (burp_client_last_backup_timestamp > now() - 86400 * 90) > 86400 * 2
for: 3h
annotations:
summary: A client has not backed up today
description: >-
A client has not been backed up for more than a day. This may be
because the client is offline, or because the backup process has
failed. Clients that have not been backed up for more than 90 days
will not trigger this alert.
- alert: disks need swapped
expr:
time() - tlast_change_over_time(
(
collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"}
or last_over_time(collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"})[1d]
)[90d]
) > 86400 * 30
annotations:
summary: The disks in the BURP array need swapped
description: >-
The disks in the BURP RAID-1 (mirror) array should be swapped
periodically. One disk should be online and mounted while the other
is stored in the fireproof safe. Switching them ensures that even if
something happens to the active disk, such as hardware failure, power
surge, fire, or accidental `rm -rf`, the offline disk is only out of
date by a few weeks.
- alert: disk needs archived
expr:
sum(
collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type=~"missing|spare"}
) < 1
annotations:
summary: One of the disks in the BURP array should be archived
description: >-
The disks in the BURP RAID-1 (mirror) array should be swapped
periodically. One disk should be online and mounted while the other
is stored in the fireproof safe. All of the disks are currently
online; one needs to be disconnected and moved to the safe as soon as
possible.
- name: certificates
rules:
- alert: certificate will expire soon
expr:
probe_ssl_last_chain_expiry_timestamp_seconds - time() < 29 * 86400
annotations:
summary: A certificate will expire in less than 29 days
description: >-
Generally, certificates are renewed automatically, approximately 30
days before their expiration (NotAfter) date. There may be a problem
with the certificate renewal process that prevented this certificate
from being renewed.
- alert: certificate will expire very soon
expr:
probe_ssl_last_chain_expiry_timestamp_seconds - time() < 14 * 86400
annotations:
summary: A certificate will expire in less than 14 days
description: >-
Generally, certificates are renewed automatically, approximately 30
days before their expiration (NotAfter) date. There is most likely a
problem with the certificate renewal process that prevented this
certificate from being renewed.