1
0
Fork 0
kubernetes/victoria-metrics/alerts.yml

180 lines
5.8 KiB
YAML

# Alerting rule groups.
groups:
# General disk, website-probe, scrape-target and NUT metric checks.
- name: default alert
  rules:
  # Per (instance, df): share of space whose type is not "free". The limit is
  # 75% for every filesystem except var-log and var-lib-frigate, which are
  # allowed up to 95%. The condition must hold for 2 hours.
  - alert: DiskUsage
    expr: >-
      sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df!="var-log", df!="var-lib-frigate"}) by (instance, df) > .75
      or sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df="var-log"}) by (instance, df) > .95
      or sum(collectd_df_df_complex{type!="free"}) by (instance, df) / sum(collectd_df_df_complex{df="var-lib-frigate"}) by (instance, df) > .95
    for: 2h
  # A probe in the "websites" job has reported failure for 10 minutes.
  - alert: TheWebsiteIsDown
    expr: >-
      probe_success{job="websites"} == 0
    for: 10m
  # A scrape target has reported up == 0 for 10 minutes. Instances whose
  # name matches vmhost.* are excluded.
  - alert: Missing Metrics
    expr: >-
      up{instance!~"vmhost.*"} == 0
    for: 10m
  # No collectd_nut_percent series has existed at all for 10 minutes.
  - alert: NUT is offline
    expr: >-
      absent(collectd_nut_percent)
    for: 10m
# Process check for the vaultwarden service.
- name: Bitwarden
  rules:
  # The collectd process count for "vaultwarden" has been below 1 for
  # 5 minutes.
  - alert: vaultwarden is not running
    expr: >-
      collectd_processes_ps_count_processes{processes="vaultwarden"} < 1
    for: 5m
# Process checks for the Samba-related daemons.
- name: Active Directory
  rules:
  # Any one of the samba, smbd, winbindd or krb5kdc process counts has been
  # below 1 for 5 minutes (each matching series is compared on its own).
  - alert: samba is not running
    expr: >-
      collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"} < 1
    for: 5m
# Software RAID (md) member-disk checks. Neither rule has a `for:` clause.
- name: mdraid
  rules:
  # An array reports a non-zero count of "missing" disks. Instances whose
  # name matches burp.* are excluded from this check.
  - alert: mdraid missing disk
    expr: >-
      collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0
  # An array reports a non-zero count of "failed" disks (no exclusions).
  - alert: mdraid failed disk
    expr: >-
      collectd_md_md_disks{type="failed"} != 0
# TLS certificate expiry checks based on the probed chain expiry timestamp.
# Both rules use the same metric, so a certificate under the 14-day
# threshold also satisfies the 29-day rule.
- name: certificates
  rules:
  # Remaining lifetime is below 29 days (29 * 86400 seconds).
  - alert: certificate will expire soon
    expr: >-
      probe_ssl_last_chain_expiry_timestamp_seconds - time() < 29 * 86400
    annotations:
      summary: A certificate will expire in less than 29 days
      description: >-
        Generally, certificates are renewed automatically, approximately 30
        days before their expiration (NotAfter) date. There may be a problem
        with the certificate renewal process that prevented this certificate
        from being renewed.
  # Remaining lifetime is below 14 days (14 * 86400 seconds).
  - alert: certificate will expire very soon
    expr: >-
      probe_ssl_last_chain_expiry_timestamp_seconds - time() < 14 * 86400
    annotations:
      summary: A certificate will expire in less than 14 days
      description: >-
        Generally, certificates are renewed automatically, approximately 30
        days before their expiration (NotAfter) date. There is most likely a
        problem with the certificate renewal process that prevented this
        certificate from being renewed.
# Availability checks driven by Home Assistant entity state.
- name: Frigate
  rules:
  # An entity whose id ends in frigate_server or frigate_status has not
  # been reported as available (value != 1) for 10 minutes.
  - alert: Frigate is Unavailable
    expr: >-
      homeassistant_entity_available{entity=~".*frigate_(server|status)"} != 1
    for: 10m
  # Any entity in the "camera" domain has not been reported as available
  # (value != 1) for 10 minutes.
  - alert: Camera unavailable
    expr: >-
      homeassistant_entity_available{domain="camera"} != 1
    for: 10m
# Battery level checks for Home Assistant sensors.
- name: Sensors
  rules:
  # A sensor reports a battery level below 10. Entities matching
  # sensor.pixel_* and sensor.sm_p610* are excluded.
  # The doubled backslash is deliberate: the query-language string literal
  # unescapes it to a single backslash, giving the regex `sensor\.`. Keep
  # this value in a YAML scalar style that does not process escapes.
  - alert: Battery Low
    expr: >-
      homeassistant_sensor_battery_percent{entity!~"sensor\\.(pixel_|sm_p610).*"} < 10
# PostgreSQL replication and WAL archiving checks.
- name: PostgreSQL
  rules:
  # Difference between patroni_xlog_location and
  # patroni_xlog_replayed_location, matched across instances (the instance
  # label is ignored), has exceeded 10240 for 10 minutes. The `!= 0`
  # filters drop zero-valued series from both sides before subtracting.
  # NOTE(review): the threshold is presumably in bytes - confirm.
  - alert: Replica lag too high
    expr: >-
      (patroni_xlog_location != 0)
      - ignoring (instance) group_right (scope) (patroni_xlog_replayed_location != 0)
      > 10240
    for: 10m
  # The archiver's failed count is above zero.
  # NOTE(review): if this metric is a cumulative count, the alert stays
  # active after a single failure until the count resets - confirm that
  # this is intended.
  - alert: WAL archive process failed
    expr: >-
      pg_stat_archiver_failed_count > 0
    annotations:
      summary: The archiver process failed for one or more WAL segments
      description: >-
        Check the WAL segment archiver configuration and confirm that WAL
        segments are being backed up correctly.
  # The last successful archive is more than 3600 seconds (1 hour) old.
  - alert: No recent WAL archives
    expr: >-
      pg_stat_archiver_last_archive_age > 3600
    annotations:
      summary: The last successful WAL segment backup was over 1h ago
      description: >-
        The WAL archiver process has not run successfully for over an hour.
        Ensure the WAL backup process is configured correctly and the backup
        target is online and healthy.
# Hardware temperature checks across all collectd temperature metrics.
- name: Temperature
  rules:
  # Any metric named collectd_*_temperature has read above 80 for
  # 10 minutes. Sensors matching i350bb.* are excluded.
  # The expression starts with "{", so it must stay in a block (or quoted)
  # scalar; as a plain scalar YAML would parse it as a flow mapping.
  - alert: High Temperature
    expr: >-
      {__name__=~"collectd_.*_temperature", sensors!~"i350bb.*"} > 80
    for: 10m
# Longhorn volume health checks keyed on the robustness gauge value.
- name: Longhorn
  rules:
  # At least one volume has had robustness == 2 for 1 hour (the alert name
  # treats this value as "degraded").
  - alert: Degraded Volumes
    expr: >-
      count(longhorn_volume_robustness==2) > 0
    for: 1h
  # At least one volume has had robustness == 3 for 5 minutes (the alert
  # name treats this value as "faulted").
  - alert: Faulted Volumes
    expr: >-
      count(longhorn_volume_robustness==3) > 0
    for: 5m
# Restic backup repository and client freshness checks.
- name: Restic
  rules:
  # The lowest restic_check_success value within a job is below 1, i.e. at
  # least one check in that job did not succeed.
  - alert: Repository Check Failed
    expr: >-
      min(restic_check_success) by (job) < 1
    annotations:
      summary: Errors found in restic repository data
      description: >-
        The Restic repository has one or more problems that may result in data
        loss. Check the restic-exporter log for more information and correct
        the issue as soon as possible.
  # The newest backup timestamp is more than 604800 seconds (7 days) old.
  - alert: Last Backup Age
    expr: >-
      time() - restic_backup_timestamp > 604800
    annotations:
      summary: A Restic client has not backed up recently
      description: >-
        Clients are scheduled to back up every day, but at least one has not
        been backed up in at least 7 days. Check the Restic configuration on
        that system to ensure backups are running properly.
  # Per (client_hostname, client_username): the largest absolute change in
  # total backup size seen over the last 7 days is exactly zero. The hosts
  # pxe0 and web0 are excluded.
  # NOTE(review): delta() without a lookbehind window and a subquery
  # without an explicit step look like MetricsQL extensions - confirm
  # before evaluating this rule with stock Prometheus.
  - alert: No File Changes
    expr: >-
      max_over_time(
        abs(
          delta(
            sum(restic_backup_size_total{
              client_hostname!="pxe0.pyrocufflink.blue",
              client_hostname!="web0.pyrocufflink.blue",
            })
            by (client_hostname, client_username)
          )
        )[7d]
      ) == 0
    annotations:
      summary: The size of the Restic backup has not changed
      description: >-
        The size of the Restic backup for a particular client has not changed
        in at least 7 days. This may indicate that the backup configuration
        is incorrect.
# Paperless-ngx background task checks, based on Flower event metrics.
- name: Paperless-ngx
  rules:
  # The task-failed event total for the paperless-ngx job is above zero.
  # NOTE(review): the metric name ends in _total, so it is presumably a
  # cumulative counter; if so, the alert stays active after one failure
  # until the counter resets - confirm that this is intended.
  - alert: Celery tasks failed
    expr: >-
      flower_events_total{job="paperless-ngx", type="task-failed"} > 0
    annotations:
      summary: One or more Celery tasks have failed
      description: >-
        Failing Celery tasks may indicate a problem with the Paperless-ngx
        deployment and can result in data loss. Check the Paperless-ngx logs
        for details about the task failures.