From 47f9e50f95540dd380b3593fd3ae25668658eea9 Mon Sep 17 00:00:00 2001
From: Tom Hughes
Date: Thu, 19 May 2022 21:26:51 +0100
Subject: [PATCH] Only alert for failed chef-client services if they persist

---
# Review notes (this area between the separator and the diff is not applied):
#
# - The existing "systemd failed service" rule gains a name!="chef-client.service"
#   matcher, so it keeps its 5m hold time for every unit except chef-client.
# - A second rule with the same alert name is added that matches only
#   name="chef-client.service" and uses a 6h hold time instead, so that (per
#   the subject) a failed chef-client only alerts if the failure persists.
# - Both rules set the same alertgroup label, "{{ $labels.instance }}".
# - The two matchers are complementary (name!= / name=), so a given failed
#   unit is selected by exactly one of the two rules.
# - Hunk arithmetic: 9 context + 1 removed = 10 old lines; 9 context + 6 added
#   = 15 new lines, matching "@@ -367,10 +367,15 @@" and the diffstat below.
# - NOTE(review): the leading whitespace of the hunk lines was reconstructed
#   from the YAML nesting of a flattened copy of this patch -- confirm against
#   the original commit before applying.
 cookbooks/prometheus/templates/default/alert_rules.yml.erb | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index be6a89b43..2aa46d1f0 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -367,10 +367,15 @@ groups:
   - name: systemd
     rules:
       - alert: systemd failed service
-        expr: node_systemd_unit_state{state="failed"} == 1
+        expr: node_systemd_unit_state{state="failed",name!="chef-client.service"} == 1
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
+      - alert: systemd failed service
+        expr: node_systemd_unit_state{state="failed",name="chef-client.service"} == 1
+        for: 6h
+        labels:
+          alertgroup: "{{ $labels.instance }}"
   - name: tile
     rules:
       - alert: renderd replication delay
-- 
2.39.5