infrastructure/ansible/plays/services/mimir/rules/cadvisor.yaml
Tobias Manske d8a7d47993
All checks were successful
continuous-integration/drone/push Build is passing
Adjust prometheus rules to leave time for backups
2024-01-13 00:09:45 +01:00

55 lines
2.3 KiB
YAML

# {% raw %}
groups:
- name: GoogleCadvisor
rules:
# - alert: ContainerKilled
# expr: 'time() - container_last_seen > 60'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Container killed (instance {{ $labels.instance }})
# description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: ContainerAbsent
# expr: 'absent(container_last_seen)'
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: Container absent (instance {{ $labels.instance }})
# description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerCpuUsage
expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: Container CPU usage (instance {{ $labels.instance }})
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerMemoryUsage
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: Container Memory usage (instance {{ $labels.instance }})
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerVolumeUsage
expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
for: 2m
labels:
severity: warning
annotations:
summary: Container Volume usage (instance {{ $labels.instance }})
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighThrottleRate
expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
for: 2m
labels:
severity: warning
annotations:
summary: Container high throttle rate (instance {{ $labels.instance }})
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# {% endraw %}