git.openstreetmap.org Git - chef.git/blob - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Add some more prometheus alerts
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
1 # DO NOT EDIT - This file is being maintained by Chef
2
3 groups:
4   - name: alertmanager
5     rules:
6       - alert: prometheus target missing
7         expr: up == 0
8         for: 5m
9         labels:
10           alertgroup: "prometheus"
11   - name: database
12     rules:
13       - alert: postgres replication delay
14         expr: pg_replication_lag_seconds > 5
15         for: 5m
16         labels:
17           alertgroup: database
18         annotations:
19           delay: "{{ $value | humanizeDuration }}"
20   - name: hwmon
21     rules:
22       - alert: hwmon fan alarm
23         expr: node_hwmon_fan_alarm == 1
24         for: 5m
25         labels:
26           alertgroup: "{{ $labels.instance }}"
27         annotations:
28           fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
29           fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
30       - alert: hwmon temperature alarm
31         expr: node_hwmon_temp_alarm == 1
32         for: 5m
33         labels:
34           alertgroup: "{{ $labels.instance }}"
35         annotations:
36           temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
37           temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
38           temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
39       - alert: hwmon voltage alarm
40         expr: node_hwmon_in_alarm == 1
41         for: 5m
42         labels:
43           alertgroup: "{{ $labels.instance }}"
44         annotations:
45           in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
46           in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
47           in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
48   - name: ipmi
49     rules:
50       - alert: ipmi fan alarm
51         expr: ipmi_fan_speed_state > 0
52         for: 5m
53         labels:
54           alertgroup: "{{ $labels.instance }}"
55         annotations:
56           fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
57       - alert: ipmi temperature alarm
58         expr: ipmi_temperature_state > 0
59         for: 5m
60         labels:
61           alertgroup: "{{ $labels.instance }}"
62         annotations:
63           temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
64       - alert: ipmi voltage alarm
65         expr: ipmi_voltage_state > 0
66         for: 5m
67         labels:
68           alertgroup: "{{ $labels.instance }}"
69         annotations:
70           voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
71   - name: mdadm
72     rules:
73       - alert: mdadm array inactive
74         expr: node_md_state{state="inactive"} > 0
75         for: 0m
76         labels:
77           alertgroup: "{{ $labels.instance }}"
78         annotations:
79           required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
80           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
81           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
82           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
83       - alert: mdadm disk failed
84         expr: node_md_disks{state="failed"} > 0
85         for: 0m
86         labels:
87           alertgroup: "{{ $labels.instance }}"
88         annotations:
89           required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
90           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
91           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
92           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
93   - name: memory
94     rules:
95       - alert: low memory
96         expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1
97         for: 5m
98         labels:
99           alertgroup: "{{ $labels.instance }}"
100         annotations:
101           memory_free: "{{ $value | humanizePercentage }}"
102       - alert: memory pressure
103         expr: rate(node_vmstat_pgmajfault[1m]) > 1000
104         for: 5m
105         labels:
106           alertgroup: "{{ $labels.instance }}"
107         annotations:
108           major_page_faults: "{{ $value }} faults/s"
109       - alert: oom kill detected
110         expr: increase(node_vmstat_oom_kill[1m]) > 0
111         for: 0m
112         labels:
113           alertgroup: "{{ $labels.instance }}"
114         annotations:
115           new_oom_kills: "{{ $value }}"
116   - name: tile
117     rules:
118       - alert: renderd replication delay
119         expr: renderd_replication_delay > 120
120         for: 5m
121         labels:
122           alertgroup: tile
123         annotations:
124           delay: "{{ $value | humanizeDuration }}"
125       - alert: missed tile rate
126         expr: sum(rate(modtile_http_response_total{code="404"}[5m])) by (instance) / sum(rate(modtile_http_response_total[5m])) by (instance) > 0.05
127         for: 5m
128         labels:
129           alertgroup: tile
130         annotations:
131           miss_rate: "{{ $value | humanizePercentage }}"
132   - name: time
133     rules:
134       - alert: clock not synchronising
135         expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
136         for: 5m
137         labels:
138           alertgroup: "{{ $labels.instance }}"
139       - alert: clock skew detected
140         expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
141         for: 5m
142         labels:
143           alertgroup: "{{ $labels.instance }}"
144         annotations:
145           skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }} {{ . | humanizeDuration }}{{ end }}"