From 450b6c160db9f802c35df2c153fd008f8a3924fe Mon Sep 17 00:00:00 2001
From: Tobias Manske
Date: Sat, 13 Jan 2024 00:09:45 +0100
Subject: [PATCH] Adjust prometheus rules to leave time for backups

---
Notes (this section is dropped by git am and is not part of the commit):

* Line structure was restored from a copy whose newlines and runs of
  whitespace had been collapsed. The leading indentation of every hunk
  line, and any repeated spaces inside a line (e.g. after "\n" in the
  description strings), are reconstructed assumptions. Verify them
  against the target files; `git apply --ignore-whitespace` may be
  needed if they differ.
* PromQL hour() is evaluated in UTC, so `hour()>=0 <=3` suppresses the
  two node alerts from 00:00 to 03:59 UTC. TODO confirm that this
  matches the backup schedule (the commit itself is stamped +0100).
* The cadvisor.yaml hunk re-enables the previously commented-out
  ContainerVolumeUsage alert, which the subject line does not mention.
  Its expression filters the numerator with {name!=""} but not the
  denominator. NOTE(review): confirm both are intended.

 ansible/plays/services/mimir/rules/cadvisor.yaml | 16 ++++++++--------
 ansible/plays/services/mimir/rules/node.yaml     |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/ansible/plays/services/mimir/rules/cadvisor.yaml b/ansible/plays/services/mimir/rules/cadvisor.yaml
index 4e4512c..6525097 100644
--- a/ansible/plays/services/mimir/rules/cadvisor.yaml
+++ b/ansible/plays/services/mimir/rules/cadvisor.yaml
@@ -35,14 +35,14 @@ groups:
         annotations:
           summary: Container Memory usage (instance {{ $labels.instance }})
           description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      # - alert: ContainerVolumeUsage
-      #   expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
-      #   for: 2m
-      #   labels:
-      #     severity: warning
-      #   annotations:
-      #     summary: Container Volume usage (instance {{ $labels.instance }})
-      #     description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: ContainerVolumeUsage
+        expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Container Volume usage (instance {{ $labels.instance }})
+          description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
       - alert: ContainerHighThrottleRate
         expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
         for: 2m
diff --git a/ansible/plays/services/mimir/rules/node.yaml b/ansible/plays/services/mimir/rules/node.yaml
index b2822d6..677648b 100644
--- a/ansible/plays/services/mimir/rules/node.yaml
+++ b/ansible/plays/services/mimir/rules/node.yaml
@@ -12,7 +12,7 @@ groups:
           summary: Host out of memory (instance {{ $labels.instance }})
           description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
       - alert: HostMemoryUnderMemoryPressure
-        expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
+        expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000 unless on() hour()>=0 <=3'
         for: 2m
         labels:
           severity: warning
@@ -108,7 +108,7 @@ groups:
           summary: Host unusual disk write latency (instance {{ $labels.instance }})
           description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
       - alert: HostHighCpuLoad
-        expr: '(100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)) > 80'
+        expr: '(100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)) > 80 unless on() hour()>=0 <=3'
         for: 2m
         labels:
           severity: warning