X-Git-Url: https://git.openstreetmap.org/chef.git/blobdiff_plain/1e91c84f1d288714b490e894ca4036eae6fe380f..97751762159405edfcd074565c79c67c4f09b726:/cookbooks/prometheus/templates/default/alert_rules.yml.erb diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb index 9990483b8..02c41ce52 100644 --- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb +++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb @@ -4,21 +4,14 @@ groups: - name: amsterdam rules: - alert: pdu current draw - expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 10 - for: 6m - labels: - alertgroup: "amsterdam" - annotations: - current: "{{ $value | humanize }}A" - - alert: site current draw - expr: sum(rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10) > 13 + expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 28 for: 6m labels: alertgroup: "amsterdam" annotations: current: "{{ $value | humanize }}A" - alert: site power - expr: sum(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 100) > 3 + expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 3 for: 6m labels: alertgroup: "amsterdam" @@ -124,21 +117,14 @@ groups: - name: dublin rules: - alert: pdu current draw - expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 13 - for: 6m - labels: - alertgroup: "dublin" - annotations: - current: "{{ $value | humanize }}A" - - alert: site current draw - expr: sum(rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10) > 17 + expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 28 for: 6m labels: alertgroup: "dublin" annotations: current: "{{ $value | humanize }}A" - alert: site power - expr: sum(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"} / 100) > 4 + expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 4 for: 6m labels: alertgroup: "dublin" @@ -377,14 +363,14 @@ groups: - name: network rules: - alert: interface transmit rate - expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98 + expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.99 for: 5m labels: alertgroup: "{{ $labels.instance }}" annotations: bandwidth_used: "{{ $value | humanizePercentage }}" - alert: interface receive rate - expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98 + expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.99 for: 5m labels: alertgroup: "{{ $labels.instance }}" @@ -539,6 +525,13 @@ groups: alertgroup: "{{ $labels.instance }}" annotations: queries: "{{ $value }}" + - alert: postgresql idle transactions + expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="300"}) by (instance, server) + for: 5m + labels: + alertgroup: "{{ $labels.instance }}" + annotations: + queries: "{{ $value }}" - name: prometheus rules: - alert: prometheus configuration error @@ -664,6 +657,29 @@ groups: for: 0m labels: alertgroup: "{{ $labels.instance }}" + - name: taginfo + rules: + - alert: taginfo planet age + expr: time() - taginfo_data_from_seconds > 129600 + for: 0m + labels: + alertgroup: taginfo + annotations: + age: "{{ $value | humanizeDuration }}" + - alert: taginfo database age + expr: time() - taginfo_database_update_finish_seconds > 129600 + for: 0m + labels: + alertgroup: taginfo + annotations: + age: "{{ $value | humanizeDuration }}" + - alert: taginfo database size + expr: abs(delta(taginfo_database_size_bytes[30m])) / taginfo_database_size_bytes > 0.1 + for: 30m + labels: + alertgroup: taginfo + annotations: + size_change: "{{ $value | humanizePercentage }}" - name: tile rules: - alert: renderd replication delay @@ -711,8 +727,8 @@ groups: annotations: error_rate: "{{ $value | humanizePercentage }}" - alert: job processing rate - expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[5m]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[5m]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1 - for: 15m + expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[1h]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[1h]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1 + for: 1h labels: alertgroup: web annotations: