git.openstreetmap.org Git - chef.git/blobdiff - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Add some additional alert rules
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
index 5a9a70f5d57b7ab22ecf792351e3ce8e78c4f57c..d2e076281dcdcc6f70f433a4d233414a5cb014e7 100644 (file)
@@ -1,6 +1,13 @@
 # DO NOT EDIT - This file is being maintained by Chef
 
 groups:
+  - name: alertmanager  # rule group holding the scrape-target health alert
+    rules:
+      - alert: prometheus target missing
+        expr: up == 0  # selects every series whose `up` sample is 0
+        for: 5m  # the match must persist for 5 minutes before the alert fires
+        labels:
+          alertgroup: 'prometheus'  # fixed group label, not derived from the instance
   - name: hwmon
     rules:
       - alert: hwmon fan alarm
@@ -74,3 +81,26 @@ groups:
           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+  - name: memory  # memory availability, major-page-fault rate and OOM-kill alerts
+    rules:
+      - alert: low memory
+        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10  # available memory as a percentage of total, below 10
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          memory_free: "{{ $value | humanize }}%"  # humanize shortens the raw float, matching the other annotations in this file
+      - alert: memory pressure
+        expr: rate(node_vmstat_pgmajfault[1m]) > 1000  # NOTE(review): a [1m] window must contain at least two scrapes - confirm the scrape interval
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          major_page_faults: "{{ $value | humanize }} faults/s"
+      - alert: oom kill detected
+        expr: increase(node_vmstat_oom_kill[1m]) > 0  # NOTE(review): a [1m] window must contain at least two scrapes - confirm the scrape interval
+        for: 0m  # no hold period: fires on the first evaluation that matches
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          new_oom_kills: "{{ $value | humanize }}"