Adjust Prometheus rules to leave time for backups (mute the host CPU-load and memory-pressure alerts from 00:00 to 03:59 UTC)
This commit is contained in:
parent
ac0335c9b5
commit
450b6c160d
@ -35,14 +35,14 @@ groups:
|
|||||||
annotations:
|
annotations:
|
||||||
summary: Container Memory usage (instance {{ $labels.instance }})
|
summary: Container Memory usage (instance {{ $labels.instance }})
|
||||||
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
# - alert: ContainerVolumeUsage
|
- alert: ContainerVolumeUsage
|
||||||
# expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
|
expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
|
||||||
# for: 2m
|
for: 2m
|
||||||
# labels:
|
labels:
|
||||||
# severity: warning
|
severity: warning
|
||||||
# annotations:
|
annotations:
|
||||||
# summary: Container Volume usage (instance {{ $labels.instance }})
|
summary: Container Volume usage (instance {{ $labels.instance }})
|
||||||
# description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
- alert: ContainerHighThrottleRate
|
- alert: ContainerHighThrottleRate
|
||||||
expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
|
expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
|
||||||
for: 2m
|
for: 2m
|
||||||
|
@ -12,7 +12,7 @@ groups:
|
|||||||
summary: Host out of memory (instance {{ $labels.instance }})
|
summary: Host out of memory (instance {{ $labels.instance }})
|
||||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
- alert: HostMemoryUnderMemoryPressure
|
- alert: HostMemoryUnderMemoryPressure
|
||||||
expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
|
expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000 unless on() hour()>=0 <=3'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -108,7 +108,7 @@ groups:
|
|||||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
- alert: HostHighCpuLoad
|
- alert: HostHighCpuLoad
|
||||||
expr: '(100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)) > 80'
|
expr: '(100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)) > 80 unless on() hour()>=0 <=3'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
Loading…
Reference in New Issue
Block a user