# kubernetes/victoria-metrics/alerts.yml

groups:
# Catch-all group: filesystem usage, probe failures, missing scrape targets,
# NUT metrics, and Internet connectivity.
- name: default alert
  rules:
  # Fires when filesystem:usage:percent stays above .75 for 2 hours
  # (presumably a 0-1 ratio, i.e. 75% -- confirm against the recording rule).
  # Three selectors joined with `or` choose which filesystems are watched:
  #   1. series whose kubernetes_io_arch is not "arm64" (this also matches
  #      series without that label), excluding df values mmcblk0p3,
  #      var-lib-frigate and var-log;
  #   2. series with kubernetes_io_arch="arm64", excluding only df="boot";
  #   3. df="mmcblk0p3" on every instance except nut0.pyrocufflink.blue,
  #      which re-adds the filesystem that selector 1 filtered out.
  - alert: Free disk space is low
    expr: >-
      (
      filesystem:usage:percent{
        kubernetes_io_arch!="arm64",
        df!="mmcblk0p3",
        df!="var-lib-frigate",
        df!="var-log",
      }
      or
      filesystem:usage:percent{
        kubernetes_io_arch="arm64",
        df!="boot",
      }
      or
      filesystem:usage:percent{
        df="mmcblk0p3",
        instance!="nut0.pyrocufflink.blue",
      }
      ) > .75
    for: 2h
    annotations:
      severity: minor
  # Fires when any filesystem:usage:percent series stays above 0.9 for
  # 2 hours.  Unlike the alert above, no filesystems are excluded.
  # NOTE(review): severity is "minor", the same as the lower-threshold alert
  # above -- confirm whether a higher severity was intended.
  - alert: Free disk space is very low
    expr: >-
      filesystem:usage:percent > 0.9
    for: 2h
    annotations:
      severity: minor
  # Fires when a probe in the "websites" job has reported failure
  # (probe_success == 0) for 10 minutes.
  - alert: TheWebsiteIsDown
    expr: >-
      probe_success{job="websites"} == 0
    for: 10m
  # Fires when a scrape target has been down (up == 0) for 10 minutes.
  # Instances matching the regex vmhost.* are not considered.
  - alert: Missing Metrics
    expr: >-
      up{instance!~"vmhost.*"} == 0
    for: 10m
  # Fires when no collectd_nut_percent series exists at all for 10 minutes.
  - alert: NUT is offline
    expr: >-
      absent(collectd_nut_percent)
    for: 10m
  # Fires when a probe in the "blackbox" job has failed for 5 minutes.
  - alert: Internet is down
    expr: >-
      probe_success{job="blackbox"} == 0
    for: 5m
    annotations:
      severity: critical
      summary: The connection to the Internet is down.
      description: >-
        The Internet connection is down.  Try rebooting the ONT, or call
        Everfast Fiber.

# Process-level monitoring for vaultwarden.
- name: Bitwarden
  rules:
  # Fires when the process count reported for "vaultwarden" has been below 1
  # for 5 minutes.
  - alert: vaultwarden is not running
    expr: collectd_processes_ps_count_processes{processes="vaultwarden"} < 1
    for: 5m

# Process-level monitoring for the Samba daemons.
- name: Active Directory
  rules:
  # Each of the samba, smbd, winbindd and krb5kdc process-count series is
  # evaluated separately; the alert fires for any one that stays below 1 for
  # 5 minutes.
  - alert: samba is not running
    expr: collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"} < 1
    for: 5m

# Software RAID (md) array health.  Neither rule has a `for:` clause.
- name: mdraid
  rules:
  # Fires when an array reports a non-zero number of missing disks.
  # chromie.pyrocufflink.blue is excluded here; the "Backups" group in this
  # file has its own rules for that host's array.
  - alert: mdraid missing disk
    expr: >-
      collectd_md_md_disks{type="missing", instance!="chromie.pyrocufflink.blue"} != 0
  # Fires when any array, on any instance, reports a non-zero number of
  # failed disks.
  - alert: mdraid failed disk
    expr: >-
      collectd_md_md_disks{type="failed"} != 0

# Reminders for rotating the disks of the backup array on
# chromie.pyrocufflink.blue.
- name: Backups
  rules:
  # Fires when the number of active disks in chromie's md array has not
  # changed for more than 30 days (86400 * 30 seconds).
  # tlast_change_over_time looks back over a 90-day window for the most
  # recent change; the `or last_over_time(...)[1d]` branch appears to fill
  # gaps of up to one day in the series so they do not hide that change.
  - alert: disks need swapped
    expr:
      time() - tlast_change_over_time(
        (
          collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type="active"}
          or last_over_time(collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type="active"})[1d]
        )[90d]
      ) > 86400 * 30
    annotations:
      summary: The disks in the backup array need swapped
      description: >-
        The disks in the backup RAID-1 (mirror) array should be swapped
        periodically. One disk should be online and mounted while the other
        is stored in the fireproof safe.  Switching them ensures that even if
        something happens to the active disk, such as hardware failure, power
        surge, fire, or accidental `rm -rf`, the offline disk is only out of
        date by a few weeks.
  # Fires when the combined count of "missing" and "spare" disks on chromie is
  # below 1, i.e. every disk of the array is currently attached and active.
  - alert: disk needs archived
    expr:
      sum(
        collectd_md_md_disks{instance="chromie.pyrocufflink.blue", type=~"missing|spare"}
      ) < 1
    annotations:
      summary: One of the disks in the backup array should be archived
      description: >-
        The disks in the backup RAID-1 (mirror) array should be swapped
        periodically.  One disk should be online and mounted while the other
        is stored in the fireproof safe.  All of the disks are currently
        online; one needs to be disconnected and moved to the safe as soon as
        possible.

# TLS certificate expiry, in two tiers.  Both rules compare the remaining
# lifetime of the probed certificate chain, in seconds, against a threshold.
- name: certificates
  rules:
  # Remaining lifetime below 29 days (29 * 86400 seconds).
  - alert: certificate will expire soon
    expr: >-
      probe_ssl_last_chain_expiry_timestamp_seconds - time() < 29 * 86400
    annotations:
      summary: A certificate will expire in less than 29 days
      description: >-
        Generally, certificates are renewed automatically, approximately 30
        days before their expiration (NotAfter) date.  There may be a problem
        with the certificate renewal process that prevented this certificate
        from being renewed.
  # Remaining lifetime below 14 days.  A certificate in this state also
  # satisfies the 29-day rule above, so both alerts fire together.
  - alert: certificate will expire very soon
    expr: >-
      probe_ssl_last_chain_expiry_timestamp_seconds - time() < 14 * 86400
    annotations:
      summary: A certificate will expire in less than 14 days
      description: >-
        Generally, certificates are renewed automatically, approximately 30
        days before their expiration (NotAfter) date.  There is most likely a
        problem with the certificate renewal process that prevented this
        certificate from being renewed.

# Frigate service and camera feed health.
- name: Frigate
  rules:
  # Fires after 10 minutes when any of these hold:
  #   - no frigate_service_info series exists;
  #   - the last-updated timestamp advances at a rate below 1;
  #   - the uptime counter advances at a rate below 1.
  - alert: Frigate is Unavailable
    expr: >-
      absent(frigate_service_info)
      or irate(frigate_service_last_updated_timestamp) < 1
      or irate(frigate_service_uptime_seconds) < 1
    for: 10m
  # Fires when a camera-domain entity has reported an availability value
  # other than 1 for 10 minutes.
  - alert: Camera unavailable
    expr: >-
      homeassistant_entity_available{domain="camera"} != 1
    for: 10m
  # Fires as soon as a camera FPS sensor reports zero or less; there is no
  # `for:` delay on this rule.
  - alert: No camera video frames
    expr: >-
      homeassistant_sensor_unit_fps{entity=~"sensor.*_camera_fps"} <= 0
    annotations:
      summary: No video received from camera
      description: >-
        Frigate is not receiving video from the camera. The camera may be
        offline.

# Home Assistant device batteries and radio-network (Z-Wave / Zigbee) health.
- name: Home Assistant
  rules:
  # Fires for any battery sensor reporting a value below 10.  Entities whose
  # ID matches sensor.pixel_* or sensor.sm_p610* are excluded by the regex.
  # The doubled backslash is literal in this plain YAML scalar and becomes a
  # single escaped dot inside the quoted query string.
  - alert: Battery Low
    expr:
      homeassistant_sensor_battery_percent{entity!~"sensor\\.(pixel_|sm_p610).*"} < 10
    annotations:
      summary: >-
        Low battery: {{ $labels.friendly_name }}
      severity: minor
  # Fires when the availability of the sensor.usb_controller_status entity,
  # summed with the friendly_name label aggregated away, is below 1.
  - alert: Z-Wave Network is Offline
    expr:
      sum(
        homeassistant_entity_available{entity="sensor.usb_controller_status"}
      ) without (
        friendly_name
      ) < 1
    annotations:
      summary: The Z-Wave network controller is offline
      description: >-
        Home Assistant is not able to communicate with ZWaveJS, or ZWaveJS is
        not able to connect to the Z-Wave USB controller.  Z-Wave devices like
        light switches, door sensors, and smart plugs will not work until the
        Z-Wave network is operational again.
  # Fires when the Zigbee2MQTT bridge connection-state binary sensor is 0.
  # The description originally named the "Z-Wave USB controller", a copy-paste
  # slip from the alert above; it now refers to the Zigbee controller.
  - alert: Zigbee Network is Offline
    expr:
      homeassistant_binary_sensor_state{entity="binary_sensor.zigbee2mqtt_bridge_connection_state"} == 0
    annotations:
      summary: The Zigbee network bridge is offline
      description: >-
        Home Assistant is not able to communicate with Zigbee2MQTT, or
        Zigbee2MQTT is not able to connect to the Zigbee controller.
        Zigbee devices like smart bulbs and buttons will not work until the
        Zigbee network is operational again.

# PostgreSQL replication (Patroni metrics) and WAL archiving health.
- name: PostgreSQL
  rules:
  # Fires when the difference between patroni_xlog_location and
  # patroni_xlog_replayed_location exceeds 10240 for 10 minutes (presumably
  # bytes -- confirm against the exporter).  Zero-valued samples are dropped
  # from both sides, and the two sides are matched while ignoring the
  # `instance` label, with the replayed-location series on the "many" side.
  - alert: Replica lag too high
    expr:
      (patroni_xlog_location != 0)
      - ignoring (instance) group_right (scope) (patroni_xlog_replayed_location != 0)
      > 10240
    for: 10m
  # Fires when the archiver failure counter increased at any point within
  # the last 20 minutes.
  - alert: WAL archive process failed
    expr: >-
      max_over_time(
        increase(pg_stat_archiver_failed_count)[20m]
      )> 0
    annotations:
      summary: The archiver process failed for one or more WAL segments
      description: >-
        Check the WAL segment archiver configuration and confirm that WAL
        segments are being backed up correctly.
  # Fires when the age of the last successful archive exceeds 3600 (one hour,
  # per the summary below).
  - alert: No recent WAL archives
    expr: >-
      pg_stat_archiver_last_archive_age > 3600
    annotations:
      summary: The last successful WAL segment backup was over 1h ago
      description: >-
        The WAL archiver process has not run successfully for over an hour.
        Ensure the WAL backup process is configured correctly and the backup
        target is online and healthy.


# Hardware temperature sensors.
- name: Temperature
  rules:
  # Matches every metric whose name fits collectd_.*_temperature, skipping
  # series whose `sensors` label matches i350bb.*, and fires when a reading
  # stays above 80 for 10 minutes.  The expression is quoted because it
  # begins with `{`, which YAML would otherwise read as a flow mapping.
  - alert: High Temperature
    expr: '{__name__=~"collectd_.*_temperature", sensors!~"i350bb.*"} > 80'
    for: 10m

# Longhorn volume robustness.  Each rule counts the volumes that report a
# particular robustness value and fires when that count is non-zero.
- name: Longhorn
  rules:
  # Robustness value 2; must persist for 1 hour.
  - alert: Degraded Volumes
    expr: count(longhorn_volume_robustness==2) > 0
    for: 1h
  # Robustness value 3; must persist for 5 minutes.
  - alert: Faulted Volumes
    expr: count(longhorn_volume_robustness==3) > 0
    for: 5m

# Restic repository integrity and backup freshness.
- name: Restic
  rules:
  # Fires when the lowest restic_check_success value within a job is below 1,
  # i.e. at least one series in that job reports an unsuccessful check.
  - alert: Repository Check Failed
    expr: >-
      min(restic_check_success) by (job) < 1
    annotations:
      summary: Errors found in restic repository data
      description: >-
        The Restic repository has one or more problems that may result in data
        loss.  Check the restic-exporter log for more information and correct
        the issue as soon as possible.
  # Fires when a client's most recent backup timestamp is more than 604800
  # seconds (7 days) old.  The four client hostnames listed in the selector
  # are exempt from this check.
  - alert: Last Backup Age
    expr: >-
      time() - restic_backup_timestamp{
        client_hostname!="bw0.pyrocufflink.blue",
        client_hostname!="luma.pyrocufflink.blue",
        client_hostname!="purplepi.hatch",
        client_hostname!="toad.pyrocufflink.blue",
      }> 604800
    annotations:
      summary: A Restic client has not backed up recently
      description: >-
        Clients are scheduled to back up every day, but at least one has not
        been backed up in at least 7 days.  Check the Restic configuration on
        that system to ensure backups are running properly.

# Paperless-ngx availability and Celery task health.
- name: Paperless-ngx
  rules:
  # Fires when the paperless-ngx scrape target is down, or when no `up`
  # series exists for that job at all.
  - alert: Paperless-ngx is down
    expr: >-
      up{job="paperless-ngx"} == 0 or absent(up{job="paperless-ngx"})
    annotations:
      summary: Paperless-ngx is down
      description: >-
        Paperless-ngx is offline.
  # Fires when the task-failed event counter increased at any point within
  # the last 24 hours.  Failures of documents.tasks.consume_file are not
  # counted.
  - alert: Celery tasks failed
    expr: >-
      max_over_time(
        increase(
          flower_events_total{
            job="paperless-ngx",
            type="task-failed",
            task!="documents.tasks.consume_file",
          }
        )[24h]
      ) > 0
    annotations:
      summary: Paperless-ngx Celery task failed
      description: >-
        Failing Celery tasks may indicate a problem with the Paperless-ngx
        deployment and can result in data loss.  Check the Paperless-ngx logs
        for details about the task failures.
  # Fires when no task-started sample for the mail-processing task has been
  # recorded during the last 12 hours.
  # NOTE(review): unlike the rule above, this selector has no job filter --
  # confirm whether job="paperless-ngx" should be added.
  - alert: Paperless email task not running
    expr: >-
      absent_over_time(
        flower_events_total{
          type="task-started",
          task="paperless_mail.tasks.process_mail_accounts"
        }[12h]
      )
    annotations:
      summary: Paperless task to process mail accounts has not run recently
      description: >-
        Paperless-ngx uses a scheduled Celery task to periodically poll email
        mailboxes for new messages.  If this task does not start, new email
        messages will not be downloaded and imported into the document library.

# Firefly III availability.
- name: Firefly III
  rules:
  # Fires immediately (no `for:` delay) when a probe in the "firefly-iii" job
  # reports anything other than success.
  - alert: Firefly III is down
    expr: probe_success{job="firefly-iii"} != 1

# phpipam availability.
- name: phpipam
  rules:
  # Fires immediately (no `for:` delay) when a probe in the "phpipam" job
  # reports anything other than success.
  - alert: phpipam is down
    expr: probe_success{job="phpipam"} != 1