]> git.openstreetmap.org Git - chef.git/blobdiff - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Merge remote-tracking branch 'github/pull/465'
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
index 1b3afcba59276b20205a54007162da3899da0956..0469226dbff15ace6f9ad00c7707be166d316718 100644 (file)
@@ -119,6 +119,16 @@ groups:
           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+      - alert: mdadm array degraded
+        expr: sum (node_md_disks{state="active"}) without (state) < node_md_disks_required
+        for: 0m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
       - alert: mdadm disk failed
         expr: node_md_disks{state="failed"} > 0
         for: 0m
@@ -262,6 +272,13 @@ groups:
         for: 0m
         labels:
           alertgroup: ssl
+  - name: systemd
+    rules:
+      - alert: systemd failed service
+        expr: node_systemd_unit_state{state="failed"} == 1
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
   - name: tile
     rules:
       - alert: renderd replication delay
@@ -292,3 +309,12 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }} {{ . | humanizeDuration }}{{ end }}"
+  - name: web
+    rules:
+      - alert: web error rate
+        expr: sum(rate(api_call_count_total{status=~"5.*"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002
+        for: 5m
+        labels:
+          alertgroup: web
+        annotations:
+          error_rate: "{{ $value | humanizePercentage }}"