# configpolicy/group_vars/metricspi/alerts.yml

# Alerting rule groups.  Presumably rendered into the vmalert rules file by
# whatever consumes this variable -- TODO confirm against the consumer.
vmalert_rules:
  groups:
  - name: default alert
    rules:
    # Per (instance, df): ratio of the series whose type is not "free" to the
    # sum of all types.  The df labelled "var-log" is allowed to reach 95%;
    # every other df alerts above 75%.  Must hold for 2 hours.
    - alert: DiskUsage
      expr: >-
        sum(collectd_df_df_complex{type!="free"}) by (instance, df)
        / sum(collectd_df_df_complex{df!="var-log"}) by (instance, df) > .75
        or
        sum(collectd_df_df_complex{type!="free"}) by (instance, df)
        / sum(collectd_df_df_complex{df="var-log"}) by (instance, df) > .95
      for: 2h
    # A probe in the "websites" job has reported failure for 10 minutes.
    - alert: TheWebsiteIsDown
      expr: 'probe_success{job="websites"} == 0'
      for: 10m
    # A scrape target has been down for 10 minutes.  Instances whose name
    # matches vmhost.* are excluded.
    - alert: Missing Metrics
      expr: 'up{instance!~"vmhost.*"} == 0'
      for: 10m
    # No collectd_nut_percent series exist at all.  There is no `for`, so no
    # hold-off period is configured for this rule.
    - alert: NUT is offline
      expr: 'absent(collectd_nut_percent)'

  # Process watchdog for the vaultwarden service.
  - name: Bitwarden
    rules:
    # The process-count series for "vaultwarden" has been below 1 for
    # 5 minutes.
    - alert: vaultwarden is not running
      expr: 'collectd_processes_ps_count_processes{processes="vaultwarden"} < 1'
      for: 5m

  # Process watchdog for the Samba domain controller daemons.
  - name: Active Directory
    rules:
    # Evaluated per series: any one of samba, smbd, winbindd or krb5kdc
    # reporting a process count below 1 for 5 minutes raises the alert.
    - alert: samba is not running
      expr: 'collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"} < 1'
      for: 5m

  # Graylog journal backlog.
  - name: Graylog
    rules:
    # More than 100 uncommitted journal entries, sustained for one hour.
    - alert: unprocessed messages
      expr: 'org_graylog2_journal_entries_uncommitted > 100'
      for: 1h

  # Linux software RAID (md) health.  Neither rule has a `for`, so no hold-off
  # period is configured.
  - name: mdraid
    rules:
    # Non-zero type="missing" value on any instance except those matching
    # burp.* -- presumably because the BURP mirror is deliberately run with one
    # disk removed (see the BURP group); confirm before changing.
    - alert: mdraid missing disk
      expr: 'collectd_md_md_disks{type="missing", instance!~"burp.*"} != 0'
    # Non-zero type="failed" value on any instance.
    - alert: mdraid failed disk
      expr: 'collectd_md_md_disks{type="failed"} != 0'

  # Backup server (BURP) monitoring: backup freshness and disk rotation.
  - name: BURP
    rules:
    # absent() only yields a result when there are no
    # burp_client_last_backup_timestamp series at all, so this covers the
    # "nothing is reporting" case rather than any single client.
    - alert: no recent backups
      expr: 'absent(burp_client_last_backup_timestamp)'
      for: 8h
      annotations:
        summary: No clients have been backed up recently
        description: >-
          This alert indicates that NO clients have been backed up within the
          last day.  There is likely a problem with the BURP server.
    # Per-client staleness check.
    #   * Inner comparison: keeps only clients whose last backup timestamp is
    #     newer than 90 days ago, so older clients never match.
    #   * Outer comparison: of those, fires when the last backup is more than
    #     two days (86400 * 2 seconds) old, sustained for 3 hours.
    # time() is used for both reference points so the whole expression is
    # evaluated relative to the rule's evaluation timestamp (now() is the
    # wall clock, which diverges when rules are replayed over past data).
    - alert: missed client backup
      expr: >-
        time() - (burp_client_last_backup_timestamp > time() - 86400 * 90)
        > 86400 * 2
      for: 3h
      annotations:
        summary: A client has not backed up in more than two days
        description: >-
          A client has not been backed up for more than two days.  This may be
          because the client is offline, or because the backup process has
          failed.  Clients that have not been backed up for more than 90 days
          will not trigger this alert.
    # Fires when the type="active" md_disks series for burp1 has not changed
    # value for more than 30 days (86400 * 30 seconds).
    #   * tlast_change_over_time(...[90d]) gives the timestamp of the most
    #     recent value change within a 90-day lookbehind window.
    #   * The `or last_over_time(...)[1d]` branch presumably bridges gaps of
    #     up to a day in the series so they are not seen as changes -- TODO
    #     confirm the intent.
    # No `for` is set, so no hold-off period is configured.
    - alert: disks need swapped
      expr:
        time() - tlast_change_over_time(
          (
            collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"}
            or last_over_time(collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type="active"})[1d]
          )[90d]
        ) > 86400 * 30
      annotations:
        summary: The disks in the BURP array need swapped
        description: >-
          The disks in the BURP RAID-1 (mirror) array should be swapped
          periodically. One disk should be online and mounted while the other
          is stored in the fireproof safe.  Switching them ensures that even if
          something happens to the active disk, such as hardware failure, power
          surge, fire, or accidental `rm -rf`, the offline disk is only out of
          date by a few weeks.
    # Sum of the "missing" and "spare" md_disks values on burp1 is below 1,
    # i.e. the array reports no missing or spare member.  No `for` is set.
    - alert: disk needs archived
      expr: >-
        sum(
        collectd_md_md_disks{instance="burp1.pyrocufflink.blue", type=~"missing|spare"}
        ) < 1
      annotations:
        summary: One of the disks in the BURP array should be archived
        description: >-
          The disks in the BURP RAID-1 (mirror) array should be swapped
          periodically.  One disk should be online and mounted while the other
          is stored in the fireproof safe.  All of the disks are currently
          online; one needs to be disconnected and moved to the safe as soon as
          possible.

  # TLS certificate expiry, based on the probed chain expiry timestamp.
  # Neither rule sets `for`.  Anything under 14 days is also under 29 days, so
  # both rules are active at the same time once the second threshold is hit.
  - name: certificates
    rules:
    # Less than 29 days (29 * 86400 seconds) until expiry.
    - alert: certificate will expire soon
      expr: 'probe_ssl_last_chain_expiry_timestamp_seconds - time() < 29 * 86400'
      annotations:
        summary: A certificate will expire in less than 29 days
        description: >-
          Generally, certificates are renewed automatically, approximately 30
          days before their expiration (NotAfter) date.  There may be a problem
          with the certificate renewal process that prevented this certificate
          from being renewed.
    # Less than 14 days (14 * 86400 seconds) until expiry.
    - alert: certificate will expire very soon
      expr: 'probe_ssl_last_chain_expiry_timestamp_seconds - time() < 14 * 86400'
      annotations:
        summary: A certificate will expire in less than 14 days
        description: >-
          Generally, certificates are renewed automatically, approximately 30
          days before their expiration (NotAfter) date.  There is most likely a
          problem with the certificate renewal process that prevented this
          certificate from being renewed.