# summary: Host Memory is under utilized (instance {{ $labels.instance }})
# description: "Node memory is < 20% for 1 week. Consider reducing memory space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Inbound network throughput: per-instance sum of the 2m receive byte rate,
# divided by 1024 twice (MiB/s), must stay above 100 for 5 minutes to fire.
- alert: HostUnusualNetworkThroughputIn
  expr: 'sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Host unusual network throughput in (instance {{ $labels.instance }})
    description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Outbound network throughput: per-instance sum of the 2m transmit byte rate,
# divided by 1024 twice (MiB/s), must stay above 100 for 5 minutes to fire.
- alert: HostUnusualNetworkThroughputOut
  expr: 'sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Host unusual network throughput out (instance {{ $labels.instance }})
    description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Disk read rate: per-instance sum of the 2m read byte rate, divided by 1024
# twice (MiB/s), must stay above 50 for 5 minutes to fire.
- alert: HostUnusualDiskReadRate
  expr: 'sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Host unusual disk read rate (instance {{ $labels.instance }})
    description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Disk write rate: per-instance sum of the 2m written byte rate, divided by
# 1024 twice (MiB/s), must stay above 50 for 2 minutes to fire.
- alert: HostUnusualDiskWriteRate
  expr: 'sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host unusual disk write rate (instance {{ $labels.instance }})
    description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Low free space: available bytes as a percentage of filesystem size is below
# 10, restricted (matching on instance, device, mountpoint) to filesystems
# whose node_filesystem_readonly is 0. Must hold for 2 minutes.
- alert: HostOutOfDiskSpace
  expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host out of disk space (instance {{ $labels.instance }})
    description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Predicted disk exhaustion: free space is already below 10% AND a linear
# extrapolation of the last 1h of available bytes (non-tmpfs series only) goes
# below zero 24 * 3600 seconds ahead AND node_filesystem_readonly is 0.
# Must hold for 2 minutes.
- alert: HostDiskWillFillIn24Hours
  expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
    description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Low free inodes: free inodes as a percentage of total inodes is below 10,
# restricted (matching on instance, device, mountpoint) to filesystems whose
# node_filesystem_readonly is 0. Must hold for 2 minutes.
- alert: HostOutOfInodes
  expr: 'node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host out of inodes (instance {{ $labels.instance }})
    description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Predicted inode exhaustion: free inodes are already below 10% AND a linear
# extrapolation of the last 1h of free inodes goes below zero 24 * 3600
# seconds ahead AND node_filesystem_readonly is 0. Must hold for 2 minutes.
# The first `and` has no ON clause, so it matches on the full label set.
- alert: HostInodesWillFillIn24Hours
  expr: 'node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
    description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Read latency: read time per completed read (ratio of the two 1m rates) is
# above 0.1 seconds. The trailing `and ... > 0` keeps only devices that
# actually completed reads in the window. Must hold for 2 minutes.
- alert: HostUnusualDiskReadLatency
  expr: 'rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host unusual disk read latency (instance {{ $labels.instance }})
    description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Write latency: write time per completed write (ratio of the two 1m rates) is
# above 0.1 seconds. The trailing `and ... > 0` keeps only devices that
# actually completed writes in the window. Must hold for 2 minutes.
- alert: HostUnusualDiskWriteLatency
  expr: 'rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host unusual disk write latency (instance {{ $labels.instance }})
    description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary:Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description:"CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description:"RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# RAID member failure: any node_md_disks series with state="failed" reports a
# value above 0 for 2 minutes.
# The description names the array via the `device` label, the same label the
# other RAID description in this file uses for the array name.
- alert: HostRaidDiskFailure
  expr: 'node_md_disks{state="failed"} > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host RAID disk failure (instance {{ $labels.instance }})
    description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Kernel version drift: label_replace copies the leading "N.N.N"-shaped prefix
# of node_uname_info's `release` label into a `kernel` label, the series are
# grouped by `kernel`, and the alert fires when more than one distinct group
# exists for 6 hours.
# NOTE(review): the dots in the regex are unescaped, so they match any
# character, not only a literal "." — confirm whether that is intended.
# NOTE(review): the outer count() has no `by` clause, so the result carries no
# labels; {{ $labels.instance }} in the summary presumably renders empty.
- alert: HostKernelVersionDeviations
  expr: 'count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1'
  for: 6h
  labels:
    severity: warning
  annotations:
    summary: Host kernel version deviations (instance {{ $labels.instance }})
    description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description:"Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description:"Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description:"The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert:HostClockSkew
expr:'(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)'
description:"Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Clock not synchronising: node_timex_sync_status has been 0 at its minimum
# over the last 1m AND node_timex_maxerror_seconds is at least 16.
# Must hold for 2 minutes.
- alert: HostClockNotSynchronising
  expr: 'min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host clock not synchronising (instance {{ $labels.instance }})
    description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"