# kubernetes/victoria-metrics/alerts.yml

groups:
- name: default alert
  rules:
  # Per-(instance, df) usage ratio: the sum of every series whose type is not
  # "free", divided by the sum across all types.  var-log and var-lib-frigate
  # use a 95% threshold; every other filesystem uses 75%.  The condition must
  # hold for two hours before the alert fires.
  - alert: DiskUsage
    expr: >-
      sum(collectd_df_df_complex{type!="free"}) by (instance, df)
      / sum(collectd_df_df_complex{df!="var-log", df!="var-lib-frigate"}) by (instance, df)
      > .75
      or
      sum(collectd_df_df_complex{type!="free"}) by (instance, df)
      / sum(collectd_df_df_complex{df="var-log"}) by (instance, df)
      > .95
      or
      sum(collectd_df_df_complex{type!="free"}) by (instance, df)
      / sum(collectd_df_df_complex{df="var-lib-frigate"}) by (instance, df)
      > .95
    for: 2h
  # A probe in the "websites" job has reported probe_success == 0 for 10m.
  - alert: TheWebsiteIsDown
    expr: 'probe_success{job="websites"} == 0'
    for: 10m
  # A target's `up` series has been 0 for 10m.  Instances whose name matches
  # vmhost.* are excluded.
  - alert: Missing Metrics
    expr: 'up{instance!~"vmhost.*"} == 0'
    for: 10m
  # No collectd_nut_percent series has existed at all for 10m.
  - alert: NUT is offline
    expr: 'absent(collectd_nut_percent)'
    for: 10m

- name: Bitwarden
  rules:
  # The collectd process count for "vaultwarden" has been below 1 for 5m.
  - alert: vaultwarden is not running
    expr: >-
      collectd_processes_ps_count_processes{processes="vaultwarden"}
      < 1
    for: 5m

- name: Active Directory
  rules:
  # Fires when any one of the samba, smbd, winbindd or krb5kdc process-count
  # series has been below 1 for 5m; the alert name covers all four.
  - alert: samba is not running
    expr: >-
      collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"}
      < 1
    for: 5m

- name: mdraid
  rules:
  # Neither rule has a `for:` clause, so each fires on the first evaluation
  # where its expression returns a result.
  #
  # Non-zero "missing" disk count; instances matching burp.* are excluded.
  - alert: mdraid missing disk
    expr: >-
      collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0
  # Non-zero "failed" disk count on any instance.
  - alert: mdraid failed disk
    expr: >-
      collectd_md_md_disks{type="failed"} != 0

- name: certificates
  rules:
  # Two tiers on the same metric: under 29 days remaining, then under 14 days.
  # The thresholds are written as days * 86400 seconds.  The first rule has no
  # lower bound, so both rules match once fewer than 14 days remain.
  - alert: certificate will expire soon
    expr: 'probe_ssl_last_chain_expiry_timestamp_seconds - time() < 29 * 86400'
    annotations:
      summary: 'A certificate will expire in less than 29 days'
      description: >-
        Generally, certificates are renewed automatically, approximately
        30 days before their expiration (NotAfter) date.  There may be a
        problem with the certificate renewal process that prevented this
        certificate from being renewed.
  - alert: certificate will expire very soon
    expr: 'probe_ssl_last_chain_expiry_timestamp_seconds - time() < 14 * 86400'
    annotations:
      summary: 'A certificate will expire in less than 14 days'
      description: >-
        Generally, certificates are renewed automatically, approximately
        30 days before their expiration (NotAfter) date.  There is most
        likely a problem with the certificate renewal process that
        prevented this certificate from being renewed.

- name: Frigate
  rules:
  # An entity whose name ends in frigate_server or frigate_status has reported
  # an availability value other than 1 for 10m.
  - alert: Frigate is Unavailable
    expr: 'homeassistant_entity_available{entity=~".*frigate_(server|status)"} != 1'
    for: 10m
  # Same availability check, applied to every entity in the "camera" domain.
  - alert: Camera unavailable
    expr: 'homeassistant_entity_available{domain="camera"} != 1'
    for: 10m

- name: Sensors
  rules:
  # Battery level below 10 on any sensor entity, except entities matching
  # sensor.pixel_* or sensor.sm_p610*.  A block scalar is used so the doubled
  # backslash reaches the query engine unchanged, where it is read as one
  # escaped "\" in front of the dot.
  - alert: Battery Low
    expr: >-
      homeassistant_sensor_battery_percent{entity!~"sensor\\.(pixel_|sm_p610).*"}
      < 10

- name: PostgreSQL
  rules:
  # Difference between the non-zero patroni_xlog_location value and each
  # non-zero patroni_xlog_replayed_location value.  Series are matched on all
  # labels except `instance`; group_right allows several right-hand series per
  # left-hand series and copies the `scope` label from the left.  Fires when
  # the difference has exceeded 10240 for 10m.
  - alert: Replica lag too high
    expr: >-
      (patroni_xlog_location != 0)
      - ignoring (instance) group_right (scope) (patroni_xlog_replayed_location != 0)
      > 10240
    for: 10m
  # NOTE(review): this is an absolute threshold on pg_stat_archiver_failed_count.
  # If that metric is cumulative, the alert stays active after the first
  # failure until the value is reset -- confirm this is intended.
  - alert: WAL archive process failed
    expr: 'pg_stat_archiver_failed_count > 0'
    annotations:
      summary: 'The archiver process failed for one or more WAL segments'
      description: >-
        Check the WAL segment archiver configuration and confirm that
        WAL segments are being backed up correctly.
  # 3600 seconds = 1 hour, matching the summary below.
  - alert: No recent WAL archives
    expr: 'pg_stat_archiver_last_archive_age > 3600'
    annotations:
      summary: 'The last successful WAL segment backup was over 1h ago'
      description: >-
        The WAL archiver process has not run successfully for over an
        hour. Ensure the WAL backup process is configured correctly and
        the backup target is online and healthy.


- name: Temperature
  rules:
  # Selects by metric name: every collectd_*_temperature series, except those
  # whose `sensors` label matches i350bb.*.  Fires when a value has stayed
  # above 80 for 10m.  The expression starts with "{", so it must be quoted.
  - alert: High Temperature
    expr: '{__name__=~"collectd_.*_temperature", sensors!~"i350bb.*"} > 80'
    for: 10m

- name: Longhorn
  rules:
  # Both rules count the volumes at a given longhorn_volume_robustness value.
  # Going by the alert names, 2 is treated as "degraded" and 3 as "faulted".
  # count() collapses the per-volume series into a single result, so each rule
  # produces one alert however many volumes are affected.
  - alert: Degraded Volumes
    expr: 'count(longhorn_volume_robustness==2) > 0'
    for: 1h
  - alert: Faulted Volumes
    expr: 'count(longhorn_volume_robustness==3) > 0'
    for: 5m

- name: Restic
  rules:
  # Fires when the lowest restic_check_success value within a job is below 1,
  # i.e. at least one series for that job is reporting a failed check.
  - alert: Repository Check Failed
    expr: >-
      min(restic_check_success) by (job) < 1
    annotations:
      summary: Errors found in restic repository data
      description: >-
        The Restic repository has one or more problems that may result in data
        loss.  Check the restic-exporter log for more information and correct
        the issue as soon as possible.
  # 604800 seconds = 7 days: fires when restic_backup_timestamp is more than
  # a week behind the current evaluation time.
  - alert: Last Backup Age
    expr: >-
      time() - restic_backup_timestamp > 604800
    annotations:
      summary: A Restic client has not backed up recently
      description: >-
        Clients are scheduled to back up every day, but at least one has not
        been backed up in at least 7 days.  Check the Restic configuration on
        that system to ensure backups are running properly.
  # For each (client_hostname, client_username) pair, takes the largest
  # absolute change in total backup size seen over the last 7 days and fires
  # when that maximum is exactly 0.  pxe0 and web0 are excluded.
  # NOTE(review): delta() with no explicit window and a `[7d]` subquery with
  # no step look like MetricsQL (VictoriaMetrics) extensions -- confirm before
  # evaluating this rule with stock Prometheus.
  # The indented lines inside the folded scalar keep their line breaks, so
  # avoid re-wrapping this expression.
  - alert: No File Changes
    expr: >-
      max_over_time(
        abs(
          delta(
            sum(restic_backup_size_total{
              client_hostname!="pxe0.pyrocufflink.blue",
              client_hostname!="web0.pyrocufflink.blue",
            })
              by (client_hostname, client_username)
          )
        )[7d]
      ) == 0
    annotations:
      summary: The size of the Restic backup has not changed
      description: >-
        The size of the Restic backup for a particular client has not changed
        in at least 7 days.  This may indicate that the backup configuration
        is incorrect.

- name: Paperless-ngx
  rules:
  # Fires as soon as the "task-failed" event series for the paperless-ngx job
  # is above zero; there is no `for:` delay.
  # NOTE(review): if flower_events_total is a cumulative counter, as the
  # _total suffix suggests, this alert will not clear on its own after the
  # first failure -- confirm that is the desired behaviour.
  - alert: Celery tasks failed
    expr: 'flower_events_total{job="paperless-ngx", type="task-failed"} > 0'
    annotations:
      summary: 'One or more Celery tasks have failed'
      description: >-
        Failing Celery tasks may indicate a problem with the
        Paperless-ngx deployment and can result in data loss.  Check the
        Paperless-ngx logs for details about the task failures.