# kubernetes/victoria-metrics/alerts.yml

groups:
# Catch-all group: disk usage, probe and scrape health, NUT metrics and
# Internet reachability.
- name: default alert
  rules:
  # `filesystem:usage:percent` above 0.75 for two hours. The selector is the
  # union of three cases: series whose kubernetes_io_arch is not arm64 (minus
  # the mmcblk0p3, var-lib-frigate and var-log filesystems), arm64 series
  # (minus boot), and mmcblk0p3 on every instance except
  # nut0.pyrocufflink.blue.
  - alert: Free disk space is low
    expr: >-
      (
        filesystem:usage:percent{
          kubernetes_io_arch!="arm64",
          df!="mmcblk0p3",
          df!="var-lib-frigate",
          df!="var-log",
        }
        or
        filesystem:usage:percent{
          kubernetes_io_arch="arm64",
          df!="boot",
        }
        or
        filesystem:usage:percent{
          df="mmcblk0p3",
          instance!="nut0.pyrocufflink.blue",
        }
      ) > .75
    for: 2h
    annotations:
      severity: minor

  # Same metric above 0.9 for two hours; no label exclusions are applied here.
  - alert: Free disk space is very low
    expr: >-
      filesystem:usage:percent > 0.9
    for: 2h
    annotations:
      severity: minor

  # A probe in the "websites" job has reported failure for ten minutes.
  - alert: TheWebsiteIsDown
    expr: probe_success{job="websites"} == 0
    for: 10m

  # A scrape target has been down for ten minutes; instances matching
  # "vmhost.*" are not considered.
  - alert: Missing Metrics
    expr: up{instance!~"vmhost.*"} == 0
    for: 10m

  # No `collectd_nut_percent` series has existed at all for ten minutes.
  - alert: NUT is offline
    expr: absent(collectd_nut_percent)
    for: 10m

  # A probe in the "blackbox" job has reported failure for five minutes.
  - alert: Internet is down
    expr: probe_success{job="blackbox"} == 0
    for: 5m
    annotations:
      severity: critical
      summary: The connection to the Internet is down.
      description: >-
        The Internet connection is down. Try rebooting the ONT, or call
        Everfast Fiber.
# Bitwarden: watches the process count recorded for "vaultwarden".
- name: Bitwarden
  rules:
  # Fewer than one vaultwarden process counted for five minutes.
  - alert: vaultwarden is not running
    expr: collectd_processes_ps_count_processes{processes="vaultwarden"} < 1
    for: 5m
# Active Directory: watches the process counts of the domain controller
# daemons named in the selector below.
- name: Active Directory
  rules:
  # The comparison is evaluated per series, so any one of samba, smbd,
  # winbindd or krb5kdc staying below one process for five minutes fires.
  - alert: samba is not running
    expr: >-
      collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"} < 1
    for: 5m
# mdraid: software RAID member state. Neither rule has a `for` clause.
- name: mdraid
  rules:
  # Non-zero count of "missing" member disks. chromie.pyrocufflink.blue is
  # excluded here; the Backups group has its own rules for that instance.
  - alert: mdraid missing disk
    expr: >-
      collectd_md_md_disks{type="missing", instance!="chromie.pyrocufflink.blue"} != 0

  # Non-zero count of "failed" member disks on any instance.
  - alert: mdraid failed disk
    expr: >-
      collectd_md_md_disks{type="failed"} != 0
# Backups: rotation reminders for the md array on chromie.pyrocufflink.blue.
- name: Backups
  rules:
  # Seconds since the "active" disk count on chromie last changed, searched
  # over a 90d window; fires once that exceeds 86400 * 30 (30 days).
  # NOTE(review): the `or last_over_time(...)[1d]` branch presumably bridges
  # gaps in the series -- confirm the intent before touching this expression.
  - alert: disks need swapped
    expr:
      time() - tlast_change_over_time(
        (
          collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type="active"}
          or last_over_time(collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type="active"})[1d]
        )[90d]
      ) > 86400 * 30
    annotations:
      summary: The disks in the backup array need swapped
      description: >-
        The disks in the backup RAID-1 (mirror) array should be swapped
        periodically. One disk should be online and mounted while the other
        is stored in the fireproof safe. Switching them ensures that even if
        something happens to the active disk, such as hardware failure, power
        surge, fire, or accidental `rm -rf`, the offline disk is only out of
        date by a few weeks.

  # Combined "missing" and "spare" disk count on chromie is below one.
  - alert: disk needs archived
    expr:
      sum(
        collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type=~"missing|spare"}
      ) < 1
    annotations:
      summary: One of the disks in the backup array should be archived
      description: >-
        The disks in the backup RAID-1 (mirror) array should be swapped
        periodically. One disk should be online and mounted while the other
        is stored in the fireproof safe. All of the disks are currently
        online; one needs to be disconnected and moved to the safe as soon as
        possible.
# certificates: two-stage expiry warning driven by the probed chain expiry
# timestamp. Thresholds are day counts multiplied by 86400 seconds.
- name: certificates
  rules:
  # Less than 29 days of validity remain.
  - alert: certificate will expire soon
    expr: probe_ssl_last_chain_expiry_timestamp_seconds - time() < 29 * 86400
    annotations:
      summary: A certificate will expire in less than 29 days
      description: >-
        Generally, certificates are renewed automatically, approximately 30
        days before their expiration (NotAfter) date. There may be a problem
        with the certificate renewal process that prevented this certificate
        from being renewed.

  # Less than 14 days of validity remain.
  - alert: certificate will expire very soon
    expr: probe_ssl_last_chain_expiry_timestamp_seconds - time() < 14 * 86400
    annotations:
      summary: A certificate will expire in less than 14 days
      description: >-
        Generally, certificates are renewed automatically, approximately 30
        days before their expiration (NotAfter) date. There is most likely a
        problem with the certificate renewal process that prevented this
        certificate from being renewed.
# Frigate: service liveness and per-camera health.
- name: Frigate
  rules:
  # Fires after ten minutes when the info series is absent, or when either
  # the last-updated timestamp or the uptime counter has an instantaneous
  # rate below 1.
  - alert: Frigate is Unavailable
    expr: >-
      absent(frigate_service_info)
      or irate(frigate_service_last_updated_timestamp) < 1
      or irate(frigate_service_uptime_seconds) < 1
    for: 10m

  # A Home Assistant entity in the camera domain has reported a value other
  # than 1 for ten minutes.
  - alert: Camera unavailable
    expr: homeassistant_entity_available{domain="camera"} != 1
    for: 10m

  # A camera FPS sensor reads zero or less. There is no `for` clause on this
  # rule.
  - alert: No camera video frames
    expr: homeassistant_sensor_unit_fps{entity=~"sensor.*_camera_fps"} <= 0
    annotations:
      summary: No video received from camera
      description: >-
        Frigate is not receiving video from the camera. The camera may be
        offline.
# Home Assistant: battery levels and smart-home radio network health.
- name: Home Assistant
  rules:
  # Battery sensor below 10. Entities whose id matches
  # sensor.pixel_* or sensor.sm_p610* are excluded by the regex.
  - alert: Battery Low
    expr: >-
      homeassistant_sensor_battery_percent{entity!~"sensor\\.(pixel_|sm_p610).*"} < 10
    annotations:
      # Block scalar is required: the text contains ": ".
      summary: >-
        Low battery: {{ $labels.friendly_name }}
      severity: minor

  # Availability of sensor.usb_controller_status, summed with the
  # friendly_name label dropped, is below one.
  - alert: Z-Wave Network is Offline
    expr:
      sum(
        homeassistant_entity_available{entity="sensor.usb_controller_status"}
      ) without (
        friendly_name
      ) < 1
    annotations:
      summary: The Z-Wave network controller is offline
      description: >-
        Home Assistant is not able to communicate with ZWaveJS, or ZWaveJS is
        not able to connect to the Z-Wave USB controller. Z-Wave devices like
        light switches, door sensors, and smart plugs will not work until the
        Z-Wave network is operational again.

  # The Zigbee2MQTT bridge connection-state binary sensor reads 0.
  - alert: Zigbee Network is Offline
    expr: >-
      homeassistant_binary_sensor_state{entity="binary_sensor.zigbee2mqtt_bridge_connection_state"} == 0
    annotations:
      summary: The Zigbee network bridge is offline
      # The text previously named the "Z-Wave USB controller" here, carried
      # over from the rule above; Zigbee2MQTT talks to the Zigbee controller.
      description: >-
        Home Assistant is not able to communicate with Zigbee2MQTT, or
        Zigbee2MQTT is not able to connect to the Zigbee USB controller.
        Zigbee devices like smart bulbs and buttons will not work until the
        Zigbee network is operational again.
# PostgreSQL: replication lag and WAL archiving health.
- name: PostgreSQL
  rules:
  # Difference between the non-zero xlog location and each non-zero replayed
  # location, matched while ignoring `instance`, has exceeded 10240 for ten
  # minutes. Kept in a block scalar so the line starting with "- " cannot be
  # mistaken for a sequence entry.
  - alert: Replica lag too high
    expr: >-
      (patroni_xlog_location != 0)
      - ignoring (instance) group_right (scope) (patroni_xlog_replayed_location != 0)
      > 10240
    for: 10m

  # The archiver failure counter increased at some point in the last 20m.
  - alert: WAL archive process failed
    expr: >-
      max_over_time(
        increase(pg_stat_archiver_failed_count)[20m]
      )> 0
    annotations:
      summary: The archiver process failed for one or more WAL segments
      description: >-
        Check the WAL segment archiver configuration and confirm that WAL
        segments are being backed up correctly.

  # Age of the last archived segment is above 3600 seconds.
  - alert: No recent WAL archives
    expr: pg_stat_archiver_last_archive_age > 3600
    annotations:
      summary: The last successful WAL segment backup was over 1h ago
      description: >-
        The WAL archiver process has not run successfully for over an hour.
        Ensure the WAL backup process is configured correctly and the backup
        target is online and healthy.
# Temperature: generic threshold across every collectd temperature metric.
- name: Temperature
  rules:
  # Any metric named collectd_*_temperature above 80 for ten minutes, except
  # series whose `sensors` label matches "i350bb.*". Quoted because the
  # expression begins with "{".
  - alert: High Temperature
    expr: '{__name__=~"collectd_.*_temperature", sensors!~"i350bb.*"} > 80'
    for: 10m
# Longhorn: volume robustness states.
- name: Longhorn
  rules:
  # At least one volume has reported robustness == 2 for one hour.
  - alert: Degraded Volumes
    expr: count(longhorn_volume_robustness==2) > 0
    for: 1h

  # At least one volume has reported robustness == 3 for five minutes.
  - alert: Faulted Volumes
    expr: count(longhorn_volume_robustness==3) > 0
    for: 5m
# Restic: repository integrity and backup freshness.
- name: Restic
  rules:
  # The lowest check result within a job is below one.
  - alert: Repository Check Failed
    expr: min(restic_check_success) by (job) < 1
    annotations:
      summary: Errors found in restic repository data
      description: >-
        The Restic repository has one or more problems that may result in data
        loss. Check the restic-exporter log for more information and correct
        the issue as soon as possible.

  # Last backup timestamp is more than 604800 seconds (7 days) old. The four
  # client hostnames listed in the selector are not checked.
  - alert: Last Backup Age
    expr: >-
      time() - restic_backup_timestamp{
        client_hostname!="bw0.pyrocufflink.blue",
        client_hostname!="luma.pyrocufflink.blue",
        client_hostname!="purplepi.hatch",
        client_hostname!="toad.pyrocufflink.blue",
      }> 604800
    annotations:
      summary: A Restic client has not backed up recently
      description: >-
        Clients are scheduled to back up every day, but at least one has not
        been backed up in at least 7 days. Check the Restic configuration on
        that system to ensure backups are running properly.
# Paperless-ngx: scrape availability and Celery task health.
- name: Paperless-ngx
  rules:
  # The scrape target is down, or no `up` series exists for the job at all.
  - alert: Paperless-ngx is down
    expr: up{job="paperless-ngx"} == 0 or absent(up{job="paperless-ngx"})
    annotations:
      summary: Paperless-ngx is down
      description: >-
        Paperless-ngx is offline.

  # The task-failed event count increased at some point in the last 24h.
  # Failures of documents.tasks.consume_file are not counted.
  - alert: Celery tasks failed
    expr: >-
      max_over_time(
        increase(
          flower_events_total{
            job="paperless-ngx",
            type="task-failed",
            task!="documents.tasks.consume_file",
          }
        )[24h]
      ) > 0
    annotations:
      summary: Paperless-ngx Celery task failed
      description: >-
        Failing Celery tasks may indicate a problem with the Paperless-ngx
        deployment and can result in data loss. Check the Paperless-ngx logs
        for details about the task failures.

  # No task-started sample for the mail task exists in the last 12h.
  # NOTE(review): `flower_events_total` looks like a counter; if the series
  # keeps being exported while the task is idle, absent_over_time will not
  # fire -- confirm this catches a stalled schedule.
  - alert: Paperless email task not running
    expr: >-
      absent_over_time(
        flower_events_total{
          type="task-started",
          task="paperless_mail.tasks.process_mail_accounts"
        }[12h]
      )
    annotations:
      summary: Paperless task to process mail accounts has not run recently
      description: >-
        Paperless-ngx uses a scheduled Celery task to periodically poll email
        mailboxes for new messages. If this task does not start, new email
        messages will not be downloaded and imported into the document library.
# Firefly III: probe-based availability check.
- name: Firefly III
  rules:
  # The probe result is anything other than 1. No `for` clause is set.
  - alert: Firefly III is down
    expr: probe_success{job="firefly-iii"} != 1
# phpipam: probe-based availability check.
- name: phpipam
  rules:
  # The probe result is anything other than 1. No `for` clause is set.
  - alert: phpipam is down
    expr: probe_success{job="phpipam"} != 1