Adjust Prometheus rules to leave time for backups
This commit is contained in:
		@@ -35,14 +35,14 @@ groups:
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Container Memory usage (instance {{ $labels.instance }})
 | 
			
		||||
        description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
    # - alert: ContainerVolumeUsage
 | 
			
		||||
    #   expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
 | 
			
		||||
    #   for: 2m
 | 
			
		||||
    #   labels:
 | 
			
		||||
    #     severity: warning
 | 
			
		||||
    #   annotations:
 | 
			
		||||
    #     summary: Container Volume usage (instance {{ $labels.instance }})
 | 
			
		||||
    #     description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
    # Warns when more than 80% of the filesystem inodes summed per instance
    # are in use, sustained for 2 minutes.
    # Both sides of the ratio use the same name!="" selector so the free and
    # total inode sums cover the same set of series; filtering only the
    # numerator would divide a subset's free inodes by every series' total and
    # overstate usage.
    # NOTE(review): the metrics are inode counts, not bytes; the "Volume usage"
    # wording is kept unchanged in case routing or dashboards match on it.
    - alert: ContainerVolumeUsage
      expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Container Volume usage (instance {{ $labels.instance }})
        description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
    - alert: ContainerHighThrottleRate
 | 
			
		||||
      expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
 | 
			
		||||
      for: 2m
 | 
			
		||||
 
 | 
			
		||||
@@ -12,7 +12,7 @@ groups:
 | 
			
		||||
        summary: Host out of memory (instance {{ $labels.instance }})
 | 
			
		||||
        description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
    - alert: HostMemoryUnderMemoryPressure
 | 
			
		||||
      expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
 | 
			
		||||
      expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000 unless on() hour()>=0 <=3'
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
@@ -108,7 +108,7 @@ groups:
 | 
			
		||||
        summary: Host unusual disk write latency (instance {{ $labels.instance }})
 | 
			
		||||
        description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
    - alert: HostHighCpuLoad
 | 
			
		||||
      expr: '(100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)) > 80'
 | 
			
		||||
      expr: '(100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)) > 80 unless on() hour()>=0 <=3'
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user