Files
kubernetes/victoria-metrics/kustomization.yaml
Dustin C. Hatch 8ecee4133f v-m/alerts: Rework free disk space alert
Fedora CoreOS fills `/boot` beyond the 75% alert threshold under normal
circumstances on aarch64 machines.  This is not a problem, because it
cleans up old files on its own, so we do not need to alert on it.
Unfortunately, the _DiskUsage_ alert is already quite complex, and
adding in exclusions for these devices would make it even worse.

To simplify the logic, we can use a recording rule to precompute the
used/free space ratio.  By using `sum(...) without (type)` instead of
`sum(...) on (df, instance)`, we keep the other labels, which we can
then use to identify the metrics coming from machines we don't care to
monitor.

Instead of having different thresholds for different volumes
encoded in the same expression, we can use multiple alerts to alert on
"low" vs "very low" thresholds.  Since this will of course cause
duplicate alerts for most volumes, we can use AlertManager inhibition
rules to disable the "low" alert once the metric crosses the "very low"
threshold.
2024-11-02 09:38:02 -05:00

231 lines
5.4 KiB
YAML

# Kustomization for the victoria-metrics stack.  The namespace set here is
# applied to the resources this kustomization builds.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: victoria-metrics
# Labels added to every resource.  The instance label is also injected into
# selectors (includeSelectors: true); the part-of label is added to metadata
# only (includeSelectors: false).
labels:
- pairs:
    app.kubernetes.io/instance: victoria-metrics
  includeSelectors: true
- pairs:
    app.kubernetes.io/part-of: victoria-metrics
  includeSelectors: false
# Manifests (and one sibling kustomization directory) that make up the stack.
resources:
- namespace.yaml
- secrets.yaml
- vmstorage-iscsi.yaml
- vmstorage.yaml
- vmselect.yaml
- vminsert.yaml
- vmagent.yaml
- vmalert.yaml
- alertmanager.yaml
- alertmanager-ntfy.yaml
- blackbox-exporter.yaml
- ingress.yaml
# Directory reference rather than a single manifest; presumably provides the
# dch-root-ca ConfigMap mounted by the vmagent patch -- TODO confirm.
- ../dch-root-ca
# ConfigMaps generated from the configuration files next to this
# kustomization.  Entries that set disableNameSuffixHash keep a fixed name
# instead of receiving a content-hash suffix.
configMapGenerator:
- name: vmagent
  files:
  - scrape.yml
  options:
    disableNameSuffixHash: true
    labels:
      app.kubernetes.io/component: vmagent
- name: vmalert-rules
  files:
  - alerts.yml
  - recording.yml
  options:
    disableNameSuffixHash: true
    labels:
      app.kubernetes.io/component: vmalert
- name: alertmanager
  files:
  # key=source: stored under the key alertmanager.yml, read from
  # alertmanager.config.yml.
  - alertmanager.yml=alertmanager.config.yml
  options:
    disableNameSuffixHash: true
    labels:
      app.kubernetes.io/component: alertmanager
- name: alertmanager-ntfy
  files:
  - config.yml=alertmanager-ntfy.config.yml
  # NOTE(review): unlike the other generators, this one does not set
  # disableNameSuffixHash, so the generated ConfigMap name carries a content
  # hash -- confirm this is intentional.
  options:
    labels:
      app.kubernetes.io/component: alertmanager-ntfy
- name: blackbox
  files:
  - blackbox.yml
  options:
    disableNameSuffixHash: true
    labels:
      app.kubernetes.io/component: blackbox
# Replica counts for each workload, overriding whatever the manifests declare.
replicas:
# When changing the number of vmstorage replicas, be sure to update
# the storageNode value for vmselect and vminsert.  Also, the
# replicationFactor setting may need to be adjusted.
- name: vmstorage
  count: 4
- name: vmselect
  count: 2
- name: vminsert
  count: 2
- name: vmagent
  count: 2
- name: vmalert
  count: 2
# When changing the number of alertmanager replicas, be sure to update
# the notifier URL value for vmalert and the peer addresses provided to
# Alertmanager itself.
- name: alertmanager
  count: 2
# Per-workload overrides applied on top of the manifests listed in resources.
patches:
# vmstorage: set the deduplication interval and the retention period through
# container environment variables.
- patch: |
    apiVersion: apps/v1
    kind: StatefulSet
    metadata:
      name: vmstorage
    spec:
      template:
        spec:
          containers:
          - name: vmstorage
            env:
            - name: vmstorage_dedup_minScrapeInterval
              value: 1m
            - name: vmstorage_retentionPeriod
              value: 5y
# vmselect: list the vmstorage nodes to query and set the replication factor.
# The storageNode list has one entry per vmstorage replica and must be kept in
# sync with the vmstorage replica count.
- patch: |
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: vmselect
    spec:
      template:
        spec:
          containers:
          - name: vmselect
            env:
            - name: vmselect_storageNode
              value: vmstorage-0.vmstorage,vmstorage-1.vmstorage,vmstorage-2.vmstorage,vmstorage-3.vmstorage
            - name: vmselect_replicationFactor
              value: '2'
# vminsert: list the vmstorage nodes to write to, and set the deduplication
# interval and replication factor.  The storageNode list has one entry per
# vmstorage replica and must be kept in sync with the vmstorage replica count.
- patch: |
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: vminsert
    spec:
      template:
        spec:
          containers:
          - name: vminsert
            env:
            - name: vminsert_storageNode
              value: vmstorage-0.vmstorage,vmstorage-1.vmstorage,vmstorage-2.vmstorage,vmstorage-3.vmstorage
            - name: vminsert_dedup_minScrapeInterval
              value: 1m
            - name: vminsert_replicationFactor
              value: '2'
# vmagent: inject the Graylog token from the vmagent Secret and mount the CA
# bundle, the vmagent Secret and the collectd scrape configuration read-only.
# The token key and both ConfigMap volumes are optional; the Secret volume
# is not.
- patch: |
    apiVersion: apps/v1
    kind: StatefulSet
    metadata:
      name: vmagent
    spec:
      template:
        spec:
          containers:
          - name: vmagent
            env:
            - name: SCRAPE_GRAYLOG_TOKEN
              valueFrom:
                secretKeyRef:
                  name: vmagent
                  key: graylog.token
                  optional: true
            volumeMounts:
            - mountPath: /run/dch-ca
              name: dch-ca
              readOnly: true
            - mountPath: /run/secrets/vmagent
              name: secrets
              readOnly: true
            - mountPath: /scrape/collectd
              name: scrape-collectd
              readOnly: true
          volumes:
          - name: scrape-collectd
            configMap:
              name: scrape-collectd
              optional: true
          - name: secrets
            secret:
              secretName: vmagent
          - name: dch-ca
            configMap:
              name: dch-root-ca
              optional: true
# vmalert: serve under the /vmalert path prefix, point the startup and
# readiness probes at the prefixed health endpoint, and notify both
# Alertmanager replicas.  The notifier URL list must be kept in sync with the
# alertmanager replica count.
- patch: |
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: vmalert
    spec:
      template:
        spec:
          containers:
          - name: vmalert
            env:
            - name: vmalert_http_pathPrefix
              value: /vmalert
            - name: vmalert_notifier_url
              value: http://alertmanager-0.alertmanager:9093,http://alertmanager-1.alertmanager:9093
            startupProbe:
              httpGet:
                path: /vmalert/health
            readinessProbe:
              httpGet:
                path: /vmalert/health
# alertmanager: set the command-line arguments, including one --cluster.peer
# per Alertmanager replica.  The peer list must be kept in sync with the
# alertmanager replica count.
- patch: |
    apiVersion: apps/v1
    kind: StatefulSet
    metadata:
      name: alertmanager
    spec:
      template:
        spec:
          containers:
          - name: alertmanager
            args:
            - --config.file=/etc/alertmanager/alertmanager.yml
            - --storage.path=/alertmanager
            - --cluster.peer=alertmanager-0.alertmanager:9094
            - --cluster.peer=alertmanager-1.alertmanager:9094
# vmstorage volume claim: a JSON patch (list of operations) applied to the
# StatefulSet selected by target.  It sets storageClassName on the first
# volumeClaimTemplate to the empty string and sets its storage request to 30G.
- patch: |
    - op: add
      path: /spec/volumeClaimTemplates/0/spec/storageClassName
      value: ''
    - op: replace
      path: /spec/volumeClaimTemplates/0/spec/resources/requests/storage
      value: 30G
  target:
    group: apps
    version: v1
    kind: StatefulSet
    name: vmstorage