# Alerting rules (YAML; 138 lines, 5.0 KiB)
groups:
# Catch-all group: disk usage, website probes, scrape targets and the NUT
# metrics.
- name: default alert
  rules:
  # Ratio of the non-"free" collectd_df_df_complex series to the sum of all
  # series, per (instance, df).  Every filesystem except var-log and
  # var-lib-frigate alerts above 75%; those two alert above 95% instead.
  # The condition has to hold for two hours.
  - alert: DiskUsage
    expr: >-
      sum(collectd_df_df_complex{type!="free"}) by (instance, df)
      / sum(collectd_df_df_complex{df!="var-log", df!="var-lib-frigate"}) by (instance, df)
      > .75
      or sum(collectd_df_df_complex{type!="free"}) by (instance, df)
      / sum(collectd_df_df_complex{df="var-log"}) by (instance, df)
      > .95
      or sum(collectd_df_df_complex{type!="free"}) by (instance, df)
      / sum(collectd_df_df_complex{df="var-lib-frigate"}) by (instance, df)
      > .95
    for: 2h
  # Any probe in the "websites" job reporting probe_success == 0 for ten
  # minutes.
  - alert: TheWebsiteIsDown
    expr: >-
      probe_success{job="websites"} == 0
    for: 10m
  # Any target whose `up` series is 0 for ten minutes; instances matching
  # vmhost.* are excluded.
  - alert: Missing Metrics
    expr: >-
      up{instance!~"vmhost.*"} == 0
    for: 10m
  # No collectd_nut_percent series exists at all.  There is no `for:`
  # clause on this rule.
  - alert: NUT is offline
    expr: >-
      absent(collectd_nut_percent)

# Watches the collectd process count for vaultwarden.
- name: Bitwarden
  rules:
  # The "vaultwarden" process count has been below 1 for five minutes.
  - alert: vaultwarden is not running
    expr: >-
      collectd_processes_ps_count_processes{processes="vaultwarden"} < 1
    for: 5m

# Watches the collectd process counts for the Samba-related daemons.
- name: Active Directory
  rules:
  # Fires separately for each of samba, smbd, winbindd and krb5kdc whose
  # process count has been below 1 for five minutes.
  - alert: samba is not running
    expr: >-
      collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"} < 1
    for: 5m

# Software RAID health from the collectd md metrics.  Neither rule has a
# `for:` clause.
- name: mdraid
  rules:
  # Non-zero count of "missing" disks.  Instances matching burp.* are
  # excluded here.
  - alert: mdraid missing disk
    expr: >-
      collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0
  # Non-zero count of "failed" disks on any instance.
  - alert: mdraid failed disk
    expr: >-
      collectd_md_md_disks{type="failed"} != 0

# Backup monitoring: freshness of BURP client backups and rotation of the
# disks in the array on burp1.pyrocufflink.blue.
- name: BURP
  rules:
  # No burp_client_last_backup_timestamp series has existed for eight
  # hours.
  - alert: no recent backups
    expr: >-
      absent(burp_client_last_backup_timestamp)
    for: 8h
    annotations:
      summary: No clients have been backed up recently
      description: >-
        This alert indicates that NO clients have been backed up within the
        last day. There is likely a problem with the BURP server.
  # A client's newest backup is more than 86400 * 2 seconds (two days) old.
  # The inner comparison drops clients whose newest backup is older than
  # 90 days, so they stop alerting.  time() is used on both sides so that
  # the cutoff follows the rule's evaluation timestamp rather than the
  # wall clock.
  # NOTE(review): the annotations say "today" / "more than a day" while the
  # threshold is two days -- confirm which one is intended.
  - alert: missed client backup
    expr: >-
      time() - (burp_client_last_backup_timestamp > time() - 86400 * 90)
      > 86400 * 2
    for: 3h
    annotations:
      summary: A client has not backed up today
      description: >-
        A client has not been backed up for more than a day. This may be
        because the client is offline, or because the backup process has
        failed. Clients that have not been backed up for more than 90 days
        will not trigger this alert.
  # More than 30 days have passed since the "active" disk count on burp1
  # last changed, looking back over 90 days.  The `or last_over_time(...)`
  # branch presumably bridges gaps of up to one day in the series -- TODO
  # confirm.  This is a plain multi-line scalar: YAML folds the lines
  # together, so the nesting below is purely cosmetic.
  - alert: disks need swapped
    expr:
      time() - tlast_change_over_time(
        (
          collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"}
          or last_over_time(collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"})[1d]
        )[90d]
      ) > 86400 * 30
    annotations:
      summary: The disks in the BURP array need swapped
      description: >-
        The disks in the BURP RAID-1 (mirror) array should be swapped
        periodically. One disk should be online and mounted while the other
        is stored in the fireproof safe. Switching them ensures that even if
        something happens to the active disk, such as hardware failure, power
        surge, fire, or accidental `rm -rf`, the offline disk is only out of
        date by a few weeks.
  # The combined count of "missing" and "spare" disks on burp1 is below 1,
  # i.e. no disk is reported as missing or spare.
  - alert: disk needs archived
    expr:
      sum(
        collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type=~"missing|spare"}
      ) < 1
    annotations:
      summary: One of the disks in the BURP array should be archived
      description: >-
        The disks in the BURP RAID-1 (mirror) array should be swapped
        periodically. One disk should be online and mounted while the other
        is stored in the fireproof safe. All of the disks are currently
        online; one needs to be disconnected and moved to the safe as soon as
        possible.

# TLS certificate expiry, based on the probed chain expiry timestamp.
# Neither rule has a `for:` clause.
- name: certificates
  rules:
  # Less than 29 days remain.  There is no lower bound, so this rule keeps
  # firing once the 14-day rule below starts firing as well.
  - alert: certificate will expire soon
    expr: >-
      probe_ssl_last_chain_expiry_timestamp_seconds - time() < 29 * 86400
    annotations:
      summary: A certificate will expire in less than 29 days
      description: >-
        Generally, certificates are renewed automatically, approximately 30
        days before their expiration (NotAfter) date. There may be a problem
        with the certificate renewal process that prevented this certificate
        from being renewed.
  # Less than 14 days remain.
  - alert: certificate will expire very soon
    expr: >-
      probe_ssl_last_chain_expiry_timestamp_seconds - time() < 14 * 86400
    annotations:
      summary: A certificate will expire in less than 14 days
      description: >-
        Generally, certificates are renewed automatically, approximately 30
        days before their expiration (NotAfter) date. There is most likely a
        problem with the certificate renewal process that prevented this
        certificate from being renewed.

# Availability of the Frigate entities exported via the homeassistant_*
# metrics.
- name: Frigate
  rules:
  # homeassistant_entity_available is anything other than 1 for an entity
  # whose name ends in frigate_server or frigate_status.  There is no
  # `for:` clause on this rule.
  - alert: Frigate is Unavailable
    expr: >-
      homeassistant_entity_available{entity=~".*frigate_(server|status)"} != 1

# Battery levels exported via the homeassistant_* metrics.
- name: Sensors
  rules:
  # Battery percentage below 10.  Entities starting with sensor.pixel_ or
  # sensor.sm_p610 are excluded.  The backslash is doubled because the
  # regex sits inside a double-quoted query string; the block scalar does
  # no escape processing, so both characters reach the query unchanged.
  - alert: Battery Low
    expr: >-
      homeassistant_sensor_battery_percent{entity!~"sensor\\.(pixel_|sm_p610).*"} < 10

# Replication lag from the Patroni metrics.
- name: PostgreSQL
  rules:
  # patroni_xlog_location minus patroni_xlog_replayed_location, matched on
  # every label except `instance`, with `scope` carried over from the
  # left-hand side.  Zero-valued samples are filtered out on both sides
  # before subtracting.  Fires when the difference stays above 10240
  # (presumably bytes -- TODO confirm) for ten minutes.
  - alert: Replica lag too high
    expr: >-
      (patroni_xlog_location != 0)
      - ignoring (instance) group_right (scope) (patroni_xlog_replayed_location != 0)
      > 10240
    for: 10m