From 8ba2e5010812ff169ada588834ef5606d166d526 Mon Sep 17 00:00:00 2001
From: Tom Hughes
Date: Thu, 19 May 2022 21:34:23 +0100
Subject: [PATCH] Add an alert for chef not running for an extended time

---
Reviewer note (text in this area is not part of the commit):
the added "chef" rule group defines one alert whose expression is
time() minus node_systemd_timer_last_trigger_seconds for the
"chef-client.timer" series, compared against 3600, with "for: 12h".
The computed value is exposed as the "down_time" annotation via
humanizeDuration, and the alert is grouped by the instance label.

 .../prometheus/templates/default/alert_rules.yml.erb | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index 2aa46d1f0..1f0ae128e 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -22,6 +22,15 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           busy_workers: "{{ $value | humanizePercentage }}"
+  - name: chef
+    rules:
+      - alert: chef client not running
+        expr: time() - node_systemd_timer_last_trigger_seconds{name="chef-client.timer"} > 3600
+        for: 12h
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          down_time: "{{ $value | humanizeDuration }}"
   - name: cpu
     rules:
       - alert: cpu pressure
-- 
2.45.2