]> git.openstreetmap.org Git - chef.git/commitdiff
Add some alert rules for prometheus
authorTom Hughes <tom@compton.nu>
Sun, 24 Jan 2021 12:03:49 +0000 (12:03 +0000)
committerTom Hughes <tom@compton.nu>
Sun, 24 Jan 2021 12:17:39 +0000 (12:17 +0000)
cookbooks/prometheus/recipes/server.rb
cookbooks/prometheus/templates/default/alert_rules.yml.erb [new file with mode: 0644]

index d069cd8e7b9f166c28b9f3d8deaf7e8d89167ae2..2ebf5a8230d04af5983baf5e575494760f798759 100644 (file)
@@ -184,10 +184,18 @@ template "/etc/prometheus/prometheus.yml" do
   variables :jobs => jobs
 end
 
+template "/etc/prometheus/alert_rules.yml" do
+  source "alert_rules.yml.erb"
+  owner "root"
+  group "root"
+  mode "644"
+end
+
 service "prometheus" do
   action [:enable, :start]
   subscribes :restart, "template[/etc/default/prometheus]"
   subscribes :reload, "template[/etc/prometheus/prometheus.yml]"
+  subscribes :reload, "template[/etc/prometheus/alert_rules.yml]"
 end
 
 template "/etc/default/prometheus-alertmanager" do
diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
new file mode 100644 (file)
index 0000000..5a9a70f
--- /dev/null
@@ -0,0 +1,76 @@
+# DO NOT EDIT - This file is being maintained by Chef
+
+groups:
+  - name: hwmon
+    rules:
+      - alert: hwmon fan alarm
+        expr: node_hwmon_fan_alarm == 1
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
+          fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
+      - alert: hwmon temperature alarm
+        expr: node_hwmon_temp_alarm == 1
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
+          temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
+          temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
+      - alert: hwmon voltage alarm
+        expr: node_hwmon_in_alarm == 1
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
+          in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
+          in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
+  - name: ipmi
+    rules:
+      - alert: ipmi fan alarm
+        expr: ipmi_fan_speed_state > 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
+      - alert: ipmi temperature alarm
+        expr: ipmi_temperature_state > 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
+      - alert: ipmi voltage alarm
+        expr: ipmi_voltage_state > 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
+  - name: mdadm
+    rules:
+      - alert: mdadm array inactive
+        expr: node_md_state{state="inactive"} > 0
+        for: 0m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+      - alert: mdadm disk failed
+        expr: node_md_disks{state="failed"} > 0
+        for: 0m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
+          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"