From: Tom Hughes
Date: Thu, 19 May 2022 20:26:51 +0000 (+0100)
Subject: Only alert for failed chef-client services if they persist
X-Git-Url: https://git.openstreetmap.org/chef.git/commitdiff_plain/47f9e50f95540dd380b3593fd3ae25668658eea9

Only alert for failed chef-client services if they persist
---

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index be6a89b43..2aa46d1f0 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -367,10 +367,15 @@ groups:
   - name: systemd
     rules:
       - alert: systemd failed service
-        expr: node_systemd_unit_state{state="failed"} == 1
+        expr: node_systemd_unit_state{state="failed",name!="chef-client.service"} == 1
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
+      - alert: systemd failed service
+        expr: node_systemd_unit_state{state="failed",name="chef-client.service"} == 1
+        for: 6h
+        labels:
+          alertgroup: "{{ $labels.instance }}"
   - name: tile
     rules:
       - alert: renderd replication delay