configpolicy/group_vars/metricspi/alerts.yml

49 lines
1.4 KiB
YAML

vmalert_rules:
groups:
- name: default alert
rules:
- alert: DiskUsage
expr: >-
sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df!="var-log"}) by (instance, df) > .75
or sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df="var-log"}) by (instance, df) > .95
for: 2h
- alert: TheWebsiteIsDown
expr: >-
probe_success{job="websites"} == 0
for: 10m
- alert: Missing Metrics
expr: >-
up{instance!~"vmhost.*"} == 0
for: 10m
- alert: NUT is offline
expr: >-
absent(collectd_nut_percent)
- name: Bitwarden
rules:
- alert: vaultwarden is not running
expr: >-
collectd_processes_ps_count_processes{processes="vaultwarden"} < 1
for: 5m
- name: Active Directory
rules:
- alert: samba is not running
expr: >-
collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"} < 1
for: 5m
- name: Graylog
rules:
- alert: unprocessed messages
expr: >-
org_graylog2_journal_entries_uncommitted > 100
for: 1h
- name: mdraid
rules:
- alert: mdraid missing disk
expr: collectd_md_md_disks{type="missing"} != 0
- alert: mdraid failed disk
expr: collectd_md_md_disks{type="failed"} != 0