The first step is to define alert rules for Prometheus. The rules live in a separate alert.rules file, which is referenced from the main Prometheus configuration (prometheus.yml):
# Load alerting rules from this file (path is relative to prometheus.yml)
rule_files:
  - "alert.rules"
Then create the alert.rules file and fill it with the following content:
sudo nano /etc/prometheus/alert.rules
# Prometheus alerting rules for hosts scraped via node_exporter.
# NOTE: indentation restored — in the original paste every line was at
# column 0, which makes `rules:` a top-level key and the file invalid
# as a Prometheus rule file.
groups:
  - name: NodeExporter
    rules:
      # Fires when less than 10% of memory is available for 2 minutes.
      - alert: VMOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "VM out of memory (instance {{ $labels.instance }})"
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # CPU usage = 100 - idle%. In PromQL `*` binds tighter than `-`,
      # so 100 - avg(...) * 100 is 100 - (idle fraction * 100).
      # Fires when average CPU usage sits in (70%, 75%] for 2 minutes.
      - alert: CPUMediumUsage70to75
        expr: (100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 70 and (100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) <= 75
        for: 2m
        labels:
          severity: medium
        annotations:
          summary: "CPU usage between 70% and 75% (instance {{ $labels.instance }})"
          description: "CPU usage is between 70% and 75%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when average CPU usage sits in (75%, 80%] for 2 minutes.
      - alert: CPUMediumUsage75to80
        expr: (100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75 and (100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) <= 80
        for: 2m
        labels:
          severity: medium
        annotations:
          summary: "CPU usage between 75% and 80% (instance {{ $labels.instance }})"
          description: "CPU usage is between 75% and 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when average CPU usage exceeds 80% for 2 minutes.
      - alert: CPUCriticalUsage
        expr: (100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CPU usage above 80% (instance {{ $labels.instance }})"
          description: "CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
Now restart the Prometheus service:
sudo systemctl restart prometheus
You can then check the alerts at http(s)://<prometheus-url>:9090/alerts
alerts.yml
of Prometheus), you'd need to create three different alerts. Not 100% sure, but I think same is true for Grafana.