---
# Prometheus alerting rules, organised into one rule group per subsystem.
# Every rule fires when its `expr` returns at least one series; where a `for`
# duration is given, the condition must hold for that long before firing.
groups:
  # Host-level checks: disk usage, website probes, scrape targets, UPS metrics.
  - name: default alert
    rules:
      # Ratio of non-free space to total space, per (instance, df).
      # Threshold is 0.75 for every filesystem except var-log and
      # var-lib-frigate, which are only reported above 0.95.
      - alert: DiskUsage
        expr: >-
          sum(collectd_df_df_complex{type!="free"}) by (instance, df)
          / sum(collectd_df_df_complex{df!="var-log", df!="var-lib-frigate"}) by (instance, df)
          > .75
          or
          sum(collectd_df_df_complex{type!="free"}) by (instance, df)
          / sum(collectd_df_df_complex{df="var-log"}) by (instance, df)
          > .95
          or
          sum(collectd_df_df_complex{type!="free"}) by (instance, df)
          / sum(collectd_df_df_complex{df="var-lib-frigate"}) by (instance, df)
          > .95
        for: 2h

      # A probe in the "websites" job has reported probe_success == 0.
      - alert: TheWebsiteIsDown
        expr: 'probe_success{job="websites"} == 0'
        for: 10m

      # A scrape target reports up == 0; instances matching vmhost.* are
      # excluded.
      - alert: Missing Metrics
        expr: 'up{instance!~"vmhost.*"} == 0'
        for: 10m

      # No collectd_nut_percent series exists at all.
      - alert: NUT is offline
        expr: 'absent(collectd_nut_percent)'
        for: 10m

  # Process count check for vaultwarden.
  - name: Bitwarden
    rules:
      - alert: vaultwarden is not running
        expr: 'collectd_processes_ps_count_processes{processes="vaultwarden"} < 1'
        for: 5m

  # Process count check for each of samba, smbd, winbindd and krb5kdc.
  - name: Active Directory
    rules:
      - alert: samba is not running
        expr: 'collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"} < 1'
        for: 5m

  # Software RAID member state. Neither rule has a `for` clause.
  - name: mdraid
    rules:
      # Non-zero "missing" disk count; instances matching burp.* are excluded.
      - alert: mdraid missing disk
        expr: 'collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0'

      # Non-zero "failed" disk count on any instance.
      - alert: mdraid failed disk
        expr: 'collectd_md_md_disks{type="failed"} != 0'

  # TLS certificate expiry: seconds remaining (expiry timestamp minus now)
  # compared against a number of days expressed as N * 86400 seconds.
  - name: certificates
    rules:
      - alert: certificate will expire soon
        expr: 'probe_ssl_last_chain_expiry_timestamp_seconds - time() < 29 * 86400'
        annotations:
          summary: 'A certificate will expire in less than 29 days'
          description: >-
            Generally, certificates are renewed automatically,
            approximately 30 days before their expiration (NotAfter) date.
            There may be a problem with the certificate renewal process
            that prevented this certificate from being renewed.

      # Tighter 14-day threshold; this condition also satisfies the 29-day
      # rule above, so both rules match once a certificate is this close.
      - alert: certificate will expire very soon
        expr: 'probe_ssl_last_chain_expiry_timestamp_seconds - time() < 14 * 86400'
        annotations:
          summary: 'A certificate will expire in less than 14 days'
          description: >-
            Generally, certificates are renewed automatically,
            approximately 30 days before their expiration (NotAfter) date.
            There is most likely a problem with the certificate renewal
            process that prevented this certificate from being renewed.

  # Availability of Frigate-related Home Assistant entities.
  - name: Frigate
    rules:
      # Entities whose id ends in frigate_server or frigate_status.
      - alert: Frigate is Unavailable
        expr: 'homeassistant_entity_available{entity=~".*frigate_(server|status)"} != 1'
        for: 10m

      # Any entity in the "camera" domain.
      - alert: Camera unavailable
        expr: 'homeassistant_entity_available{domain="camera"} != 1'
        for: 10m

  # Battery level reported by Home Assistant sensors.
  - name: Sensors
    rules:
      # Below 10; entities matching sensor.pixel_* and sensor.sm_p610* are
      # excluded. The doubled backslash is literal YAML text and is
      # unescaped by PromQL into the regex `\.`.
      - alert: Battery Low
        expr: 'homeassistant_sensor_battery_percent{entity!~"sensor\\.(pixel_|sm_p610).*"} < 10'

  # Patroni replication lag.
  - name: PostgreSQL
    rules:
      # Difference between the non-zero xlog location and each non-zero
      # replayed location, matched on all labels except `instance`.
      # NOTE(review): threshold 10240 is presumably bytes -- confirm.
      - alert: Replica lag too high
        expr: >-
          (patroni_xlog_location != 0)
          - ignoring (instance) group_right (scope)
          (patroni_xlog_replayed_location != 0)
          > 10240
        for: 10m

  # Any collectd_*_temperature metric above 80; sensors matching i350bb.*
  # are excluded.
  - name: Temperature
    rules:
      - alert: High Temperature
        expr: '{__name__=~"collectd_.*_temperature", sensors!~"i350bb.*"} > 80'
        for: 10m

  # Longhorn volume health, keyed on the longhorn_volume_robustness value.
  - name: Longhorn
    rules:
      # At least one volume reports robustness == 2.
      - alert: Degraded Volumes
        expr: 'count(longhorn_volume_robustness==2) > 0'
        for: 1h

      # At least one volume reports robustness == 3.
      - alert: Faulted Volumes
        expr: 'count(longhorn_volume_robustness==3) > 0'
        for: 5m