git.openstreetmap.org Git - chef.git/blobdiff - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Add alerts for high numbers of active queries on the main database
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
index d8671699c2e6a1aa7088cc8d025ea70dab50f419..f64b7fc08f17f59d075c231baf474e2c44c7dba4 100644 (file)
@@ -112,6 +112,20 @@ groups:
           alertgroup: database
         annotations:
           delay: "{{ $value | humanizeDuration }}"
+      - alert: active rails queries  # more than 50 active "rails" queries on the db-master
+        expr: sum by (instance) (pg_stat_activity_count{instance="snap-01",datname="openstreetmap",usename="rails",state="active"}) > 50 and on (instance) chef_role{name="db-master"}
+        for: 5m  # condition must hold continuously for 5 minutes before firing
+        labels:
+          alertgroup: database
+        annotations:
+          queries: "{{ $value }}"  # summed active query count; "by (instance)" keeps the label the role join matches on
+      - alert: active cgimap queries  # more than 30 active "cgimap" queries on the db-master
+        expr: sum by (instance) (pg_stat_activity_count{instance="snap-01",datname="openstreetmap",usename="cgimap",state="active"}) > 30 and on (instance) chef_role{name="db-master"}
+        for: 5m  # condition must hold continuously for 5 minutes before firing
+        labels:
+          alertgroup: database
+        annotations:
+          queries: "{{ $value }}"  # value is a query count, not a delay; key matches the rails rule
   - name: discourse
     rules:
       - alert: discourse job failure rate
@@ -579,13 +593,6 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           new_deadlocks: "{{ $value }}"
-      - alert: postgresql slow queries
-        expr: pg_slow_queries > 0
-        for: 5m
-        labels:
-          alertgroup: "{{ $labels.instance }}"
-        annotations:
-          queries: "{{ $value }}"
       - alert: postgresql idle transactions
         expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="300"}) by (instance, server)
         for: 5m