X-Git-Url: https://git.openstreetmap.org/chef.git/blobdiff_plain/84625fc5630ce46c37f93be62aa61c67cbfc4ef0..0d12afe2ac536a6e7696800798b2e553f36e1a44:/cookbooks/prometheus/templates/default/alert_rules.yml.erb diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb index 92c38d5c5..0aa0b64cb 100644 --- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb +++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb @@ -22,10 +22,19 @@ groups: alertgroup: "{{ $labels.instance }}" annotations: busy_workers: "{{ $value | humanizePercentage }}" + - name: chef + rules: + - alert: chef client not running + expr: time() - node_systemd_timer_last_trigger_seconds{name="chef-client.timer"} > 3600 + for: 12h + labels: + alertgroup: "{{ $labels.instance }}" + annotations: + down_time: "{{ $value | humanizeDuration }}" - name: cpu rules: - alert: cpu pressure - expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.3 + expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.6 for: 15m labels: alertgroup: "{{ $labels.instance }}" @@ -367,10 +376,15 @@ groups: - name: systemd rules: - alert: systemd failed service - expr: node_systemd_unit_state{state="failed"} == 1 + expr: node_systemd_unit_state{state="failed",name!="chef-client.service"} == 1 for: 5m labels: alertgroup: "{{ $labels.instance }}" + - alert: systemd failed service + expr: node_systemd_unit_state{state="failed",name="chef-client.service"} == 1 + for: 6h + labels: + alertgroup: "{{ $labels.instance }}" - name: tile rules: - alert: renderd replication delay @@ -410,3 +424,10 @@ groups: alertgroup: web annotations: error_rate: "{{ $value | humanizePercentage }}" + - alert: job processing rate + expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[5m]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[5m]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1 + for: 5m + labels: + alertgroup: web + annotations: + job_processing_rate: "{{ $value | humanizePercentage }}"