Refactor: Dir structure

2023-09-14 06:48:05 +02:00
parent c361625230
commit af3e66f901
157 changed files with 3 additions and 3 deletions


@@ -0,0 +1 @@
COMPOSE_PROJECT_NAME=prometheus


@@ -0,0 +1,50 @@
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5m
repeat_interval: 1h
receiver: 'matrix-monitoring'
routes:
- receiver: 'hcio'
repeat_interval: 1h
matchers:
- alertname="PrometheusAlertmanagerE2eDeadManSwitch"
- receiver: 'email'
group_interval: 1m
matchers:
- job="matrix_synapse_1"
- receiver: 'matrix-monitoring'
group_wait: 30s
group_interval: 1h
matchers:
- alertname="PrometheusAllTargetsMissing"
- receiver: 'matrix-monitoring'
group_wait: 30s
group_interval: 1h
matchers:
- alertname="PrometheusTargetMissing"
receivers:
- name: 'email'
email_configs:
- to: '{{ prometheus.alertmanager.smtp.target }}'
from: '"Alertmanager" <{{ prometheus.alertmanager.smtp.username }}>'
smarthost: 'mxe8cf.netcup.net:587'
auth_username: '{{ prometheus.alertmanager.smtp.username }}'
auth_identity: '{{ prometheus.alertmanager.smtp.username }}'
auth_password: '{{ prometheus.alertmanager.smtp.password }}'
- name: 'hcio'
email_configs:
- to: '{{ prometheus.alertmanager.hcio.mail }}'
from: '"Alertmanager" <{{ prometheus.alertmanager.smtp.username }}>'
smarthost: 'mxe8cf.netcup.net:587'
auth_username: '{{ prometheus.alertmanager.smtp.username }}'
auth_identity: '{{ prometheus.alertmanager.smtp.username }}'
auth_password: '{{ prometheus.alertmanager.smtp.password }}'
- name: 'matrix-monitoring'
webhook_configs:
- url: 'http://alertmanager-matrix:3000/alerts?secret={{ prometheus.alertmanager.matrix.alertmanager_token }}'
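For context: the matrix-monitoring receiver above posts firing and resolved alerts to the matrix-alertmanager bridge using Alertmanager's standard webhook format (the secret in the URL is checked by the bridge). A minimal sketch of such a payload — field names follow the Alertmanager webhook spec, while the concrete alert, label, and timestamp values are illustrative only:

{
  "version": "4",
  "status": "firing",
  "receiver": "matrix-monitoring",
  "groupKey": "{}:{alertname=\"PrometheusTargetMissing\"}",
  "groupLabels": { "alertname": "PrometheusTargetMissing" },
  "commonLabels": { "alertname": "PrometheusTargetMissing", "severity": "critical" },
  "commonAnnotations": { "summary": "Prometheus target missing (instance node-exporter:9100)" },
  "externalURL": "https://alertmanager.tobiasmanske.de",
  "alerts": [
    {
      "status": "firing",
      "labels": { "alertname": "PrometheusTargetMissing", "instance": "node-exporter:9100", "severity": "critical" },
      "annotations": { "summary": "Prometheus target missing (instance node-exporter:9100)" },
      "startsAt": "2023-09-14T04:00:00Z",
      "endsAt": "0001-01-01T00:00:00Z",
      "generatorURL": "https://prometheus.tobiasmanske.de/graph?g0.expr=up+%3D%3D+0"
    }
  ]
}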


@@ -0,0 +1,223 @@
version: "3.4"
services:
prometheus:
image: prom/prometheus:latest
restart: unless-stopped
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--web.external-url=https://prometheus.tobiasmanske.de'
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro,Z
- prom_data:/prometheus
- label_discovery:/label_discovery:ro
- ./rules:/rules:ro,Z
labels:
- "traefik.enable=true"
- "traefik.http.routers.prometheus.rule=Host(`prometheus.tobiasmanske.de`)"
- "traefik.http.routers.prometheus.entryPoints=websecure"
- "traefik.http.services.prometheus.loadbalancer.server.port=9090"
- "traefik.http.routers.prometheus.middlewares=oauth@file"
depends_on:
- prometheus-docker-sd
- cadvisor
- node-exporter
networks:
- backend
- alertmanager
- metrics
prometheus-docker-sd:
image: registry.tobiasmanske.de/prometheus-docker-sd:latest
restart: unless-stopped
privileged: true
networks:
- backend
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro,Z
- label_discovery:/prometheus-docker-sd:rw
logging: # this service generates a HUGE amount of logs.
driver: "none"
alertmanager:
image: prom/alertmanager:latest
labels:
- "traefik.enable=true"
- "traefik.http.routers.alertmanager.rule=Host(`alertmanager.tobiasmanske.de`)"
- "traefik.http.routers.alertmanager.entryPoints=websecure"
- "traefik.http.services.alertmanager.loadbalancer.server.port=9093"
- "traefik.http.routers.alertmanager.middlewares=oauth@file"
volumes:
- ./alertmanager.yml:/etc/alertmanager/config.yml:ro,Z
- alertmanager_data:/data
networks:
- alertmanager
restart: unless-stopped
command:
- '--config.file=/etc/alertmanager/config.yml'
- '--web.external-url=https://alertmanager.tobiasmanske.de'
- '--storage.path=/data'
alertmanager-matrix:
image: jaywink/matrix-alertmanager:latest
restart: unless-stopped
labels:
- "traefik.enable=true"
- "traefik.http.routers.alertmanager-matrix.rule=Host(`alertmanager.tobiasmanske.de`) && PathPrefix(`/matrix/`)"
- "traefik.http.routers.alertmanager-matrix.middlewares=matrix-strip"
- "traefik.http.middlewares.matrix-strip.stripprefix.prefixes=/matrix"
- "traefik.http.middlewares.matrix-strip.stripprefix.forceslash=false"
- "traefik.http.routers.alertmanager-matrix.entryPoints=websecure"
- "traefik.http.services.alertmanager-matrix.loadbalancer.server.port=3000"
environment:
- APP_PORT=3000
- APP_ALERTMANAGER_SECRET={{ prometheus.alertmanager.matrix.alertmanager_token }}
- MATRIX_HOMESERVER_URL=http://pantalaimon:8008
- MATRIX_ROOMS={{ prometheus.alertmanager.matrix.rooms | join('|') }}
- MATRIX_TOKEN={{ prometheus.alertmanager.matrix.matrix_token }}
- MATRIX_USER=@alertmanager:{{ matrix.baseurl }}
- MENTION_ROOM=1
networks:
- alertmanager
- pantalaimon
grafana:
image: grafana/grafana:latest
restart: unless-stopped
labels:
- "traefik.enable=true"
- "traefik.http.routers.grafana.rule=Host(`grafana.tobiasmanske.de`)"
- "traefik.http.routers.grafana.entryPoints=websecure"
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
networks:
- backend
environment:
- "GF_SERVER_ROOT_URL=https://grafana.tobiasmanske.de"
- "GF_SECURITY_ADMIN_USER={{ grafana.admin.user }}"
- "GF_SECURITY_ADMIN_PASSWORD={{ grafana.admin.password }}"
- "GF_AUTH_GENERIC_OAUTH_NAME=Keycloak"
- "GF_AUTH_GENERIC_OAUTH_ENABLED=true"
- "GF_AUTH_GENERIC_OAUTH_ALLOW_SIGN_UP=true"
- "GF_AUTH_GENERIC_OAUTH_CLIENT_ID={{ grafana.oidc.client_id }}"
- "GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET={{ grafana.oidc.client_secret }}"
- "GF_AUTH_GENERIC_OAUTH_SCOPES=openid email profile offline_access roles"
- "GF_AUTH_GENERIC_OAUTH_GROUP_ATTRIBUTE_PATH=groups"
- "GF_AUTH_GENERIC_OAUTH_EMAIL_ATTRIBUTE_PATH=email"
- "GF_AUTH_GENERIC_OAUTH_LOGIN_ATTRIBUTE_PATH=preferred_username"
- "GF_AUTH_GENERIC_OAUTH_NAME_ATTRIBUTE_PATH=full_name"
- "GF_AUTH_GENERIC_OAUTH_AUTH_URL=https://{{ grafana.oidc.url }}/realms/{{ grafana.oidc.realm_name }}/protocol/openid-connect/auth"
- "GF_AUTH_GENERIC_OAUTH_TOKEN_URL=https://{{ grafana.oidc.url }}/realms/{{ grafana.oidc.realm_name }}/protocol/openid-connect/token"
- "GF_AUTH_GENERIC_OAUTH_API_URL=https://{{ grafana.oidc.url }}/realms/{{ grafana.oidc.realm_name }}/protocol/openid-connect/userinfo"
- "GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH=contains(resource_access.grafana.roles[*], 'serveradmin') && 'GrafanaAdmin' || contains(resource_access.grafana.roles[*], 'admin') && 'Admin' || contains(resource_access.grafana.roles[*], 'editor') && 'Editor' || 'Viewer'"
- "GF_AUTH_GENERIC_OAUTH_ALLOW_ASSIGN_GRAFANA_ADMIN=true"
volumes:
- grafana_data:/var/lib/grafana
- ./grafana-ds.yml:/etc/grafana/provisioning/datasources/datasource.yml:ro,Z
- ./grafana-db.yml:/etc/grafana/provisioning/dashboards/datasource.yml:ro,Z
- ./grafana-dashboards:/var/lib/grafana/dashboards:ro,Z
node-exporter:
image: quay.io/prometheus/node-exporter:latest
container_name: host-nc-chaoswg-org-node-exporter
privileged: true
labels:
- "prometheus-scrape.enabled=true"
- "prometheus-scrape.port=9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
- /:/host:ro,rslave
- /run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro
command:
- '--path.rootfs=/host'
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.ignored-mount-points'
- "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
- '--collector.systemd'
networks:
- metrics
restart: unless-stopped
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
privileged: true
labels:
- "prometheus-scrape.enabled=true"
- "prometheus-scrape.port=8080"
command:
- "-docker_only=true"
- "-housekeeping_interval=10s"
volumes:
- /:/rootfs:ro
- /var/run:/var/run:rw
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
networks:
- metrics
restart: unless-stopped
loki:
image: grafana/loki:latest
restart: unless-stopped
command: -config.file=/etc/loki/loki.yaml
volumes:
- ./loki.yml:/etc/loki/loki.yaml:ro,Z
- loki_data:/loki
labels:
- "prometheus-scrape.enabled=true"
- "prometheus-scrape.port=3100"
networks:
- backend
promtail:
image: grafana/promtail:latest
security_opt:
- label:disable
restart: unless-stopped
volumes:
- ./promtail.yml:/etc/promtail/config.yml:ro
- /var/log:/var/log:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /var/run/docker.sock:/var/run/docker.sock
command: -config.file=/etc/promtail/config.yml
labels:
- "prometheus-scrape.enabled=true"
- "prometheus-scrape.port=8080"
networks:
- backend
- metrics
mimir:
image: grafana/mimir:latest
restart: unless-stopped
volumes:
- mimir_data:/mimir
- ./mimir.yml:/etc/mimir-config/mimir.yaml:ro,Z
entrypoint:
- /bin/mimir
- -config.file=/etc/mimir-config/mimir.yaml
- -validation.max-label-names-per-series=60
labels:
- "prometheus-scrape.enabled=true"
- "prometheus-scrape.port=8080"
networks:
- backend
- metrics
volumes:
prom_data:
grafana_data:
loki_data:
label_discovery:
alertmanager_data:
mimir_data:
networks:
pantalaimon:
external: true
backend:
internal: true
alertmanager:
metrics:
external: true
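The node-exporter and cadvisor services above opt into scraping via the prometheus-scrape.* labels, which prometheus-docker-sd turns into file-based service-discovery targets. Because the metrics network is external, services from other compose stacks can join it and use the same convention. A minimal sketch, assuming this convention — the service name, image, and port below are hypothetical:

services:
  blackbox-exporter:            # hypothetical service in another stack
    image: prom/blackbox-exporter:latest
    restart: unless-stopped
    labels:
      - "prometheus-scrape.enabled=true"   # picked up by prometheus-docker-sd
      - "prometheus-scrape.port=9115"      # port Prometheus should scrape
    networks:
      - metrics
networks:
  metrics:
    external: true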

File diff suppressed because it is too large


@@ -0,0 +1,602 @@
{% raw %}
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "10.0.3"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "Dashboard for Drone CI",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 16720,
"graphTooltip": 2,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 0,
"y": 0
},
"id": 2,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.0.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "sum(drone_build_count) by (application_name)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Total Builds",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 4,
"y": 0
},
"id": 4,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.0.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "sum(drone_repo_count) by (application_name)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Activated Repos",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 8,
"y": 0
},
"id": 7,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.0.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(drone_user_count) by (application_name)",
"format": "time_series",
"intervalFactor": 1,
"range": true,
"refId": "A"
}
],
"title": "Total Users",
"type": "stat"
},
{
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 4
},
"id": 10,
"title": "Metrics",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"decimals": 0,
"links": [],
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 5
},
"id": 6,
"links": [],
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.0.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(drone_running_builds) by (application_name)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "running builds",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(drone_pending_builds) by (application_name)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "pending builds",
"range": true,
"refId": "B"
}
],
"title": "Builds",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"decimals": 0,
"links": [],
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 5
},
"id": 8,
"links": [],
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.0.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(drone_running_jobs) by (application_name)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "running jobs",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(drone_pending_jobs) by (application_name)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "pending jobs",
"range": true,
"refId": "B"
}
],
"title": "Jobs",
"type": "timeseries"
}
],
"refresh": "1m",
"schemaVersion": 38,
"style": "dark",
"tags": [
"drone",
"drone-ci",
"ci/cd"
],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "Prometheus",
"value": "Prometheus"
},
"hide": 0,
"includeAll": false,
"label": "datasource",
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
}
]
},
"time": {
"from": "now-12h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Drone CI",
"uid": "IT4-bnNik",
"version": 2,
"weekStart": ""
}
{% endraw %}

File diff suppressed because it is too large


@@ -0,0 +1,440 @@
{% raw %}
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "10.0.3"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "A dashboard to show the data from the excellent Uptime Kuma project!",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 14847,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"0": {
"color": "red",
"index": 0,
"text": "DOWN"
},
"1": {
"color": "green",
"index": 1,
"text": "UP"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 17,
"w": 24,
"x": 0,
"y": 0
},
"id": 4,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "10.0.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "monitor_status ",
"interval": "",
"legendFormat": "{{ monitor_name }}",
"refId": "A"
}
],
"title": "Site Status",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "#EAB839",
"value": 30
},
{
"color": "green",
"value": 60
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 13,
"x": 0,
"y": 17
},
"id": 6,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "10.0.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "monitor_cert_days_remaining",
"interval": "",
"legendFormat": "{{ monitor_name }}",
"refId": "A"
}
],
"title": "TLS Certificate Remaining Days",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"0": {
"color": "red",
"index": 0,
"text": "EXPIRED"
},
"1": {
"color": "green",
"index": 1,
"text": "VALID"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 11,
"x": 13,
"y": 17
},
"id": 5,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "10.0.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "monitor_cert_is_valid",
"interval": "",
"legendFormat": "{{ monitor_name }}",
"refId": "A"
}
],
"title": "TLS Certificate Status",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 26
},
"id": 2,
"options": {
"legend": {
"calcs": [
"max",
"min",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "sum(monitor_response_time{}) by (monitor_name)",
"interval": "",
"legendFormat": "{{ monitor_name }}",
"refId": "A"
}
],
"title": "Response Times",
"type": "timeseries"
}
],
"refresh": "30s",
"revision": 1,
"schemaVersion": 38,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Prometheus",
"value": "Prometheus"
},
"hide": 0,
"includeAll": false,
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Uptime Kuma",
"uid": "CN8E-vZ7k",
"version": 4,
"weekStart": ""
}
{% endraw %}

Five file diffs suppressed because they are too large


@@ -0,0 +1,12 @@
apiVersion: 1
providers:
- name: "Dashboard provider"
orgId: 1
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: true


@@ -0,0 +1,28 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
url: http://prometheus:9090
isDefault: true
access: proxy
editable: true
- name: Mimir Netcup
type: prometheus
jsonData:
httpHeaderName1: "X-Scope-OrgID"
secureJsonData:
httpHeaderValue1: "host-nc-chaoswg-org"
url: http://mimir:8080/prometheus
isDefault: false
access: proxy
editable: true
- name: Loki
type: loki
access: proxy
orgId: 1
url: http://loki:3100
basicAuth: false
isDefault: false
version: 1
editable: true


@@ -0,0 +1,51 @@
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
query_range:
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 100
schema_config:
configs:
- from: 2020-10-24
store: boltdb-shipper
object_store: aws
schema: v11
index:
prefix: index_
period: 24h
common:
path_prefix: /loki
storage:
s3:
endpoint: s3.tobiasmanske.de
bucketnames: loki-data
access_key_id: "{{ loki.s3.access_key }}"
secret_access_key: "{{ loki.s3.secret_key }}"
s3forcepathstyle: true
replication_factor: 1
ring:
kvstore:
store: inmemory
compactor:
working_directory: /loki/compactor
shared_store: s3
storage_config:
boltdb_shipper:
active_index_directory: /loki/active
cache_location: /loki/cache
cache_ttl: 24h
resync_interval: 5s
shared_store: s3
aws:
s3: "s3://{{ loki.s3.access_key }}:{{ loki.s3.secret_key }}@s3.tobiasmanske.de.:443/loki-data"
s3forcepathstyle: true


@@ -0,0 +1,47 @@
# Do not use this configuration in production.
# It is for demonstration purposes only.
# Run Mimir in single process mode, with all components running in 1 process.
target: all
# ,alertmanager,overrides-exporter
# Configure Mimir to use Minio as object storage backend.
common:
storage:
backend: s3
s3:
endpoint: s3.tobiasmanske.de
access_key_id: "{{ mimir.s3.access_key }}"
secret_access_key: "{{ mimir.s3.secret_key }}"
bucket_name: mimir
# Blocks storage requires a prefix when using a common object storage bucket.
blocks_storage:
s3:
bucket_name: mimir-blocks
tsdb:
dir: /mimir/tsdb
flush_blocks_on_shutdown: true
ingester:
ring:
replication_factor: 1
store_gateway:
sharding_ring:
replication_factor: 1
# ruler:
# rule_path: /data/ruler
# alertmanager_url: http://127.0.0.1:8080/alertmanager
# ring:
# # Quickly detect unhealthy rulers to speed up the tutorial.
# heartbeat_period: 2s
# heartbeat_timeout: 10s
#
# alertmanager:
# data_dir: /data/alertmanager
# fallback_config_file: /etc/alertmanager-fallback-config.yaml
# external_url: http://localhost:9009/alertmanager
server:
log_level: warn


@@ -0,0 +1,58 @@
global:
scrape_interval: 15s
scrape_timeout: 10s
evaluation_interval: 15s
alerting:
alertmanagers:
- scheme: http
static_configs:
- targets: [ 'alertmanager:9093' ]
- static_configs:
- targets: []
scheme: http
timeout: 10s
api_version: v1
rule_files:
- "/rules/*.yaml"
scrape_configs:
- job_name: prometheus
honor_timestamps: true
scrape_interval: 15s
scrape_timeout: 10s
metrics_path: /metrics
scheme: http
static_configs:
- targets:
- localhost:9090
- job_name: 'service_discovery'
metric_relabel_configs:
- source_labels:
- "container_name"
target_label: "instance"
action: replace
file_sd_configs:
- files:
- /label_discovery/docker-targets.json
- job_name: minio-job
bearer_token: "{{ prometheus.scrape.s3.bearer_token }}"
metrics_path: /minio/v2/metrics/cluster
scheme: https
static_configs:
- targets: [s3.tobiasmanske.de]
- job_name: drone-job
bearer_token: "{{ prometheus.scrape.drone.bearer_token }}"
scheme: https
static_configs:
- targets: [drone.tobiasmanske.de]
- job_name: 'uptime-kuma-job'
scrape_interval: 30s
scheme: https
static_configs:
- targets: [status.tobiasmanske.de]
basic_auth:
username: "{{ prometheus.scrape.kuma.user }}"
password: "{{ prometheus.scrape.kuma.password }}"
remote_write:
- url: http://mimir:8080/api/v1/push
headers:
X-Scope-OrgID: host-nc-chaoswg-org
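The service_discovery job reads /label_discovery/docker-targets.json, which prometheus-docker-sd generates from containers carrying the prometheus-scrape.* labels in the compose file. Prometheus' file_sd format only fixes the targets/labels structure; the exact labels emitted by prometheus-docker-sd (including the container_name label used by the relabeling above) are an assumption, so this sketch is purely illustrative:

[
  {
    "targets": ["cadvisor:8080"],
    "labels": { "job": "cadvisor", "container_name": "prometheus_cadvisor_1" }
  },
  {
    "targets": ["host-nc-chaoswg-org-node-exporter:9100"],
    "labels": { "job": "node-exporter", "container_name": "host-nc-chaoswg-org-node-exporter" }
  }
]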


@@ -0,0 +1,24 @@
positions:
filename: /positions.yaml
server:
http_listen_port: 8080
clients:
- url: http://loki:3100/loki/api/v1/push
scrape_configs:
- job_name: flog_scrape
docker_sd_configs:
- host: unix:///var/run/docker.sock
refresh_interval: 5s
# filters:
# - name: label
# values: ["logging=promtail"]
relabel_configs:
- source_labels: ['__meta_docker_container_name']
regex: '/(.*)'
target_label: 'container'
- source_labels: ['__meta_docker_container_log_stream']
target_label: 'logstream'
- source_labels: ['__meta_docker_container_label_logging_jobname']
target_label: 'job'
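The relabel rules above map Docker service-discovery metadata onto Loki stream labels: the leading slash is stripped from the container name, the log stream (stdout/stderr) becomes logstream, and an optional logging_jobname container label becomes job. A sketch of the result for one discovered container — the metadata values here are assumed:

# discovered metadata (assumed values)
#   __meta_docker_container_name:                  /prometheus_grafana_1
#   __meta_docker_container_log_stream:            stdout
#   __meta_docker_container_label_logging_jobname: grafana
# resulting Loki stream labels
#   {container="prometheus_grafana_1", logstream="stdout", job="grafana"}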


@@ -0,0 +1,54 @@
# {% raw %}
groups:
- name: GoogleCadvisor
rules:
# - alert: ContainerKilled
# expr: 'time() - container_last_seen > 60'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Container killed (instance {{ $labels.instance }})
# description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: ContainerAbsent
# expr: 'absent(container_last_seen)'
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: Container absent (instance {{ $labels.instance }})
# description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerCpuUsage
expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: Container CPU usage (instance {{ $labels.instance }})
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerMemoryUsage
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: Container Memory usage (instance {{ $labels.instance }})
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: ContainerVolumeUsage
# expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Container Volume usage (instance {{ $labels.instance }})
# description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighThrottleRate
expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
for: 2m
labels:
severity: warning
annotations:
summary: Container high throttle rate (instance {{ $labels.instance }})
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# {% endraw %}


@@ -0,0 +1,303 @@
# {% raw %}
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostMemoryIsUnderUtilized
# expr: '100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host Memory is under utilized (instance {{ $labels.instance }})
# description: "Node memory is < 20% for 1 week. Consider reducing memory space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: 'sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: 'sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: 'sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: 'sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: 'node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: 'node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: 'rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: 'rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '(100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostCpuIsUnderUtilized
# expr: '100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is under utilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 15'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 15%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: 'rate(node_disk_io_time_seconds_total[1m]) > 0.5'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostContextSwitching
# expr: '(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host context switching (instance {{ $labels.instance }})
# description: "Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: '(1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80'
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: 'node_systemd_unit_state{state="failed"} == 1'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: 'node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: 'node_hwmon_temp_crit_alarm_celsius == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: 'node_md_state{state="inactive"} > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: 'node_md_disks{state="failed"} > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: 'count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1'
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: 'increase(node_edac_correctable_errors_total[1m]) > 0'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: 'node_edac_uncorrectable_errors_total > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: 'rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: 'rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '(rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '(node_bonding_active - node_bonding_slaves) != 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: 'node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: 'min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: 'node_reboot_required > 0'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# {% endraw %}


@@ -0,0 +1,231 @@
# {% raw %}
groups:
- name: EmbeddedExporter
rules:
- alert: PrometheusJobMissing
expr: 'absent(up{job="prometheus"})'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus job missing (instance {{ $labels.instance }})
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetMissing
expr: 'up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target missing (instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAllTargetsMissing
expr: 'sum by (job) (up) == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus all targets missing (instance {{ $labels.instance }})
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetMissingWithWarmupTime
expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusConfigurationReloadFailure
expr: 'prometheus_config_last_reload_successful != 1'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTooManyRestarts
expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus too many restarts (instance {{ $labels.instance }})
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PrometheusAlertmanagerJobMissing
# expr: 'absent(up{job="alertmanager"})'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
# description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: 'alertmanager_config_last_reload_successful != 1'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerConfigNotSynced
expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerE2eDeadManSwitch
expr: 'vector(1)'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: 'prometheus_notifications_alertmanagers_discovered < 1'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTemplateTextExpansionFailures
expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusRuleEvaluationSlow
expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PrometheusTargetEmpty
# expr: 'prometheus_sd_discovered_targets == 0'
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: Prometheus target empty (instance {{ $labels.instance }})
# description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusLargeScrape
expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus large scrape (instance {{ $labels.instance }})
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetScrapeDuplicate
expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbCheckpointDeletionFailures
expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbCompactionsFailed
expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbHeadTruncationsFailed
expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbReloadFailures
expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbWalTruncationsFailed
expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTimeserieCardinality
expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus time series cardinality (instance {{ $labels.instance }})
description: "The \"{{ $labels.name }}\" time series cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# {% endraw %}