]> git.openstreetmap.org Git - chef.git/blob - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Add some alert rules for prometheus
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
1 # DO NOT EDIT - This file is being maintained by Chef
2
3 groups:
4   - name: hwmon
5     rules:
6       - alert: hwmon fan alarm
7         expr: node_hwmon_fan_alarm == 1
8         for: 5m
9         labels:
10           alertgroup: "{{ $labels.instance }}"
11         annotations:
12           fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
13           fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
14       - alert: hwmon temperature alarm
15         expr: node_hwmon_temp_alarm == 1
16         for: 5m
17         labels:
18           alertgroup: "{{ $labels.instance }}"
19         annotations:
20           temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
21           temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
22           temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
23       - alert: hwmon voltage alarm
24         expr: node_hwmon_in_alarm == 1
25         for: 5m
26         labels:
27           alertgroup: "{{ $labels.instance }}"
28         annotations:
29           in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
30           in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
31           in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
32   - name: ipmi
33     rules:
34       - alert: ipmi fan alarm
35         expr: ipmi_fan_speed_state > 0
36         for: 5m
37         labels:
38           alertgroup: "{{ $labels.instance }}"
39         annotations:
40           fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
41       - alert: ipmi temperature alarm
42         expr: ipmi_temperature_state > 0
43         for: 5m
44         labels:
45           alertgroup: "{{ $labels.instance }}"
46         annotations:
47           temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
48       - alert: ipmi voltage alarm
49         expr: ipmi_voltage_state > 0
50         for: 5m
51         labels:
52           alertgroup: "{{ $labels.instance }}"
53         annotations:
54           voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
55   - name: mdadm
56     rules:
57       - alert: mdadm array inactive
58         expr: node_md_state{state="inactive"} > 0
59         for: 0m
60         labels:
61           alertgroup: "{{ $labels.instance }}"
62         annotations:
63           required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
64           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
65           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
66           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
67       - alert: mdadm disk failed
68         expr: node_md_disks{state="failed"} > 0
69         for: 0m
70         labels:
71           alertgroup: "{{ $labels.instance }}"
72         annotations:
73           required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
74           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
75           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
76           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"