Fedora CoreOS fills `/boot` beyond the 75% alert threshold under normal circumstances on aarch64 machines. This is not a problem, because it cleans up old files on its own, so we do not need to alert on it. Unfortunately, the _DiskUsage_ alert is already quite complex, and adding in exclusions for these devices would make it even worse. To simplify the logic, we can use a recording rule to precompute the used/free space ratio. By using `sum(...) without (type)` instead of `sum(...) on (df, instance)`, we keep the other labels, which we can then use to identify the metrics coming from machines we don't care to monitor. Instead of having different thresholds for different volumes encoded in the same expression, we can use multiple alerts to alert on "low" vs "very low" thresholds. Since this will of course cause duplicate alerts for most volumes, we can use Alertmanager inhibition rules to disable the "low" alert once the metric crosses the "very low" threshold.
231 lines
5.4 KiB
YAML
231 lines
5.4 KiB
YAML
# Kustomization entry point for the victoria-metrics deployment.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Namespace set on the resources built by this kustomization.
namespace: victoria-metrics
# Two label sets are applied:
#   * app.kubernetes.io/instance is also added to selectors
#     (includeSelectors: true);
#   * app.kubernetes.io/part-of is added to metadata only
#     (includeSelectors: false).
labels:
- pairs:
    app.kubernetes.io/instance: victoria-metrics
  includeSelectors: true
- pairs:
    app.kubernetes.io/part-of: victoria-metrics
  includeSelectors: false
# Manifests (and one sibling kustomization directory) that make up the stack.
resources:
# Namespace and secrets
- namespace.yaml
- secrets.yaml
# VictoriaMetrics cluster components
- vmstorage-iscsi.yaml
- vmstorage.yaml
- vmselect.yaml
- vminsert.yaml
- vmagent.yaml
- vmalert.yaml
# Alerting and probing
- alertmanager.yaml
- alertmanager-ntfy.yaml
- blackbox-exporter.yaml
# Ingress
- ingress.yaml
# Sibling directory pulled in as a resource
- ../dch-root-ca
# ConfigMaps generated from files next to this kustomization. Each entry is
# tagged with the app.kubernetes.io/component label of the workload it
# belongs to.
configMapGenerator:
# Scrape configuration for vmagent.
- name: vmagent
  files:
  - scrape.yml
  options:
    disableNameSuffixHash: true
    labels:
      app.kubernetes.io/component: vmagent

# Alerting and recording rule files for vmalert.
- name: vmalert-rules
  files:
  - alerts.yml
  - recording.yml
  options:
    disableNameSuffixHash: true
    labels:
      app.kubernetes.io/component: vmalert

# Alertmanager configuration; the local file alertmanager.config.yml is
# stored under the key alertmanager.yml.
- name: alertmanager
  files:
  - alertmanager.yml=alertmanager.config.yml
  options:
    disableNameSuffixHash: true
    labels:
      app.kubernetes.io/component: alertmanager

# alertmanager-ntfy configuration; the local file
# alertmanager-ntfy.config.yml is stored under the key config.yml.
# NOTE(review): unlike the other entries, this one does not set
# disableNameSuffixHash -- confirm that is intentional.
- name: alertmanager-ntfy
  files:
  - config.yml=alertmanager-ntfy.config.yml
  options:
    labels:
      app.kubernetes.io/component: alertmanager-ntfy

# Blackbox exporter configuration.
- name: blackbox
  files:
  - blackbox.yml
  options:
    disableNameSuffixHash: true
    labels:
      app.kubernetes.io/component: blackbox
# Replica counts per workload.
replicas:
# Changing the vmstorage count requires updating the storageNode value
# for vmselect and vminsert to match. The replicationFactor setting may
# need to be adjusted as well.
- name: vmstorage
  count: 4
- name: vmselect
  count: 2
- name: vminsert
  count: 2
- name: vmagent
  count: 2
- name: vmalert
  count: 2
# Changing the alertmanager count requires updating the notifier URL
# value for vmalert and the peer addresses passed to Alertmanager itself.
- name: alertmanager
  count: 2
# Per-workload overrides applied on top of the base manifests.
patches:
# vmstorage StatefulSet: sets the vmstorage_dedup_minScrapeInterval and
# vmstorage_retentionPeriod environment variables on the vmstorage container.
- patch: |
    apiVersion: apps/v1
    kind: StatefulSet
    metadata:
      name: vmstorage
    spec:
      template:
        spec:
          containers:
          - name: vmstorage
            env:
            - name: vmstorage_dedup_minScrapeInterval
              value: 1m
            - name: vmstorage_retentionPeriod
              value: 5y
# vmselect Deployment: points vmselect_storageNode at vmstorage-0 through
# vmstorage-3 and sets vmselect_replicationFactor. The node list has to be
# kept in step with the vmstorage entry under `replicas`.
- patch: |
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: vmselect
    spec:
      template:
        spec:
          containers:
          - name: vmselect
            env:
            - name: vmselect_storageNode
              value: vmstorage-0.vmstorage,vmstorage-1.vmstorage,vmstorage-2.vmstorage,vmstorage-3.vmstorage
            - name: vmselect_replicationFactor
              value: '2'
# vminsert Deployment: points vminsert_storageNode at vmstorage-0 through
# vmstorage-3 and sets vminsert_dedup_minScrapeInterval and
# vminsert_replicationFactor. The node list has to be kept in step with the
# vmstorage entry under `replicas`.
- patch: |
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: vminsert
    spec:
      template:
        spec:
          containers:
          - name: vminsert
            env:
            - name: vminsert_storageNode
              value: vmstorage-0.vmstorage,vmstorage-1.vmstorage,vmstorage-2.vmstorage,vmstorage-3.vmstorage
            - name: vminsert_dedup_minScrapeInterval
              value: 1m
            - name: vminsert_replicationFactor
              value: '2'
# vmagent StatefulSet:
#   * SCRAPE_GRAYLOG_TOKEN is read from key graylog.token of the vmagent
#     Secret (optional);
#   * the dch-root-ca ConfigMap is mounted read-only at /run/dch-ca;
#   * the vmagent Secret is mounted read-only at /run/secrets/vmagent;
#   * the scrape-collectd ConfigMap is mounted read-only at /scrape/collectd.
# Both ConfigMap volumes are marked optional.
- patch: |
    apiVersion: apps/v1
    kind: StatefulSet
    metadata:
      name: vmagent
    spec:
      template:
        spec:
          containers:
          - name: vmagent
            env:
            - name: SCRAPE_GRAYLOG_TOKEN
              valueFrom:
                secretKeyRef:
                  name: vmagent
                  key: graylog.token
                  optional: true
            volumeMounts:
            - mountPath: /run/dch-ca
              name: dch-ca
              readOnly: true
            - mountPath: /run/secrets/vmagent
              name: secrets
              readOnly: true
            - mountPath: /scrape/collectd
              name: scrape-collectd
              readOnly: true
          volumes:
          - name: scrape-collectd
            configMap:
              name: scrape-collectd
              optional: true
          - name: secrets
            secret:
              secretName: vmagent
          - name: dch-ca
            configMap:
              name: dch-root-ca
              optional: true
# vmalert Deployment: sets vmalert_http_pathPrefix to /vmalert, points the
# startup and readiness probes at /vmalert/health, and lists both
# Alertmanager replicas in vmalert_notifier_url. The notifier URL list has
# to be kept in step with the alertmanager entry under `replicas`.
- patch: |
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: vmalert
    spec:
      template:
        spec:
          containers:
          - name: vmalert
            env:
            - name: vmalert_http_pathPrefix
              value: /vmalert
            - name: vmalert_notifier_url
              value: http://alertmanager-0.alertmanager:9093,http://alertmanager-1.alertmanager:9093
            startupProbe:
              httpGet:
                path: /vmalert/health
            readinessProbe:
              httpGet:
                path: /vmalert/health
# alertmanager StatefulSet: sets the container arguments (config file,
# storage path, and one --cluster.peer per Alertmanager replica). The peer
# list has to be kept in step with the alertmanager entry under `replicas`.
- patch: |
    apiVersion: apps/v1
    kind: StatefulSet
    metadata:
      name: alertmanager
    spec:
      template:
        spec:
          containers:
          - name: alertmanager
            args:
            - --config.file=/etc/alertmanager/alertmanager.yml
            - --storage.path=/alertmanager
            - --cluster.peer=alertmanager-0.alertmanager:9094
            - --cluster.peer=alertmanager-1.alertmanager:9094
# JSON patch against the vmstorage StatefulSet (selected by `target`):
# adds an empty storageClassName to the first volumeClaimTemplate and
# replaces its storage request with 30G.
- patch: |
    - op: add
      path: /spec/volumeClaimTemplates/0/spec/storageClassName
      value: ''
    - op: replace
      path: /spec/volumeClaimTemplates/0/spec/resources/requests/storage
      value: 30G
  target:
    group: apps
    version: v1
    kind: StatefulSet
    name: vmstorage