]> git.openstreetmap.org Git - chef.git/blobdiff - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Increase alert threshold for interface transmit/receive alerts
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
index 440d2ac8b00d673502e4b1b5803ff77a146d8b56..4e00763e61583edf5b051214ddea2f1fb0ed6168 100644 (file)
@@ -11,7 +11,7 @@ groups:
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site power
-        expr: sum(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 100) > 3
+        expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 3
         for: 6m
         labels:
           alertgroup: "amsterdam"
@@ -124,7 +124,7 @@ groups:
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site power
-        expr: sum(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"} / 100) > 4
+        expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 4
         for: 6m
         labels:
           alertgroup: "dublin"
@@ -363,14 +363,14 @@ groups:
   - name: network
     rules:
       - alert: interface transmit rate
-        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
+        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.99
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           bandwidth_used: "{{ $value | humanizePercentage }}"
       - alert: interface receive rate
-        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
+        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.99
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
@@ -525,6 +525,13 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           queries: "{{ $value }}"
+      - alert: postgresql idle transactions
+        expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="300"}) by (instance, server)
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          queries: "{{ $value }}"
   - name: prometheus
     rules:
       - alert: prometheus configuration error
@@ -650,6 +657,29 @@ groups:
         for: 0m
         labels:
           alertgroup: "{{ $labels.instance }}"
+  - name: taginfo
+    rules:
+      - alert: taginfo planet age
+        expr: time() - taginfo_data_from_seconds > 129600
+        for: 0m
+        labels:
+          alertgroup: taginfo
+        annotations:
+          age: "{{ $value | humanizeDuration }}"
+      - alert: taginfo database age
+        expr: time() - taginfo_database_update_finish_seconds > 129600
+        for: 0m
+        labels:
+          alertgroup: taginfo
+        annotations:
+          age: "{{ $value | humanizeDuration }}"
+      - alert: taginfo database size
+        expr: abs(delta(taginfo_database_size_bytes[30m])) / taginfo_database_size_bytes > 0.1
+        for: 30m
+        labels:
+          alertgroup: taginfo
+        annotations:
+          size_change: "{{ $value | humanizePercentage }}"
   - name: tile
     rules:
       - alert: renderd replication delay