]> git.openstreetmap.org Git - chef.git/blob - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Add alert rules for degraded mdadm arrays and API error rate
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
1 # DO NOT EDIT - This file is being maintained by Chef
2
3 groups:
4   - name: alertmanager
5     rules:
6       - alert: prometheus target missing
7         expr: up == 0
8         for: 5m
9         labels:
10           alertgroup: "prometheus"
11   - name: apache
12     rules:
13       - alert: apache down
14         expr: apache_up == 0
15         for: 5m
16         labels:
17           alertgroup: "{{ $labels.instance }}"
18       - alert: apache workers busy
19         expr: sum(apache_workers{state="busy"}) by (instance) / sum(apache_scoreboard) by (instance) > 0.8
20         for: 5m
21         labels:
22           alertgroup: "{{ $labels.instance }}"
23         annotations:
24           busy_workers: "{{ $value | humanizePercentage }}"
25   - name: database
26     rules:
27       - alert: postgres replication delay
28         expr: pg_replication_lag_seconds > 5
29         for: 5m
30         labels:
31           alertgroup: database
32         annotations:
33           delay: "{{ $value | humanizeDuration }}"
34   - name: filesystem
35     rules:
36       - alert: filesystem low on space
37         expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.05
38         for: 5m
39         labels:
40           alertgroup: "{{ $labels.instance }}"
41         annotations:
42           percentage_free: "{{ $value | humanizePercentage }}"
43           free_bytes: "{{ with printf \"node_filesystem_avail_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
44           total_bytes: "{{ with printf \"node_filesystem_size_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
45       - alert: filesystem low on inodes
46         expr: node_filesystem_files_free / node_filesystem_files < 0.1
47         for: 5m
48         labels:
49           alertgroup: "{{ $labels.instance }}"
50         annotations:
51           percentage_free: "{{ $value | humanizePercentage }}"
52           free_inodes: "{{ with printf \"node_filesystem_files_free{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
53           total_inodes: "{{ with printf \"node_filesystem_files{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
54   - name: hwmon
55     rules:
56       - alert: hwmon fan alarm
57         expr: node_hwmon_fan_alarm == 1
58         for: 5m
59         labels:
60           alertgroup: "{{ $labels.instance }}"
61         annotations:
62           fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
63           fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
64       - alert: hwmon temperature alarm
65         expr: node_hwmon_temp_alarm == 1
66         for: 5m
67         labels:
68           alertgroup: "{{ $labels.instance }}"
69         annotations:
70           temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
71           temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
72           temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
73       - alert: hwmon voltage alarm
74         expr: node_hwmon_in_alarm == 1
75         for: 5m
76         labels:
77           alertgroup: "{{ $labels.instance }}"
78         annotations:
79           in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
80           in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
81           in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
82   - name: ipmi
83     rules:
84       - alert: ipmi fan alarm
85         expr: ipmi_fan_speed_state > 0
86         for: 5m
87         labels:
88           alertgroup: "{{ $labels.instance }}"
89         annotations:
90           fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
91       - alert: ipmi temperature alarm
92         expr: ipmi_temperature_state > 0
93         for: 5m
94         labels:
95           alertgroup: "{{ $labels.instance }}"
96         annotations:
97           temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
98       - alert: ipmi voltage alarm
99         expr: ipmi_voltage_state > 0
100         for: 5m
101         labels:
102           alertgroup: "{{ $labels.instance }}"
103         annotations:
104           voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
105       - alert: ipmi power alarm
106         expr: ipmi_power_state > 0 or ipmi_sensor_state{type=~"Power .*"} > 0
107         for: 5m
108         labels:
109           alertgroup: "{{ $labels.instance }}"
110   - name: mdadm
111     rules:
112       - alert: mdadm array inactive
113         expr: node_md_state{state="inactive"} > 0
114         for: 0m
115         labels:
116           alertgroup: "{{ $labels.instance }}"
117         annotations:
118           required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
119           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
120           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
121           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
122       - alert: mdadm array degraded
123         expr: sum (node_md_disks{state="active"}) without (state) < node_md_disks_required
124         for: 0m
125         labels:
126           alertgroup: "{{ $labels.instance }}"
127         annotations:
128           required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
129           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
130           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
131           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
132       - alert: mdadm disk failed
133         expr: node_md_disks{state="failed"} > 0
134         for: 0m
135         labels:
136           alertgroup: "{{ $labels.instance }}"
137         annotations:
138           required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
139           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
140           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
141           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
142   - name: memory
143     rules:
144       - alert: low memory
145         expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1
146         for: 5m
147         labels:
148           alertgroup: "{{ $labels.instance }}"
149         annotations:
150           memory_free: "{{ $value | humanizePercentage }}"
151       - alert: memory pressure
152         expr: rate(node_vmstat_pgmajfault[1m]) > 1000
153         for: 5m
154         labels:
155           alertgroup: "{{ $labels.instance }}"
156         annotations:
157           major_page_faults: "{{ $value }} faults/s"
158       - alert: oom kill detected
159         expr: increase(node_vmstat_oom_kill[1m]) > 0
160         for: 0m
161         labels:
162           alertgroup: "{{ $labels.instance }}"
163         annotations:
164           new_oom_kills: "{{ $value }}"
165   - name: network
166     rules:
167       - alert: interface transmit rate
168         expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
169         for: 5m
170         labels:
171           alertgroup: "{{ $labels.instance }}"
172         annotations:
173           bandwidth_used: "{{ $value | humanizePercentage }}"
174       - alert: interface receive rate
175         expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
176         for: 5m
177         labels:
178           alertgroup: "{{ $labels.instance }}"
179         annotations:
180           bandwidth_used: "{{ $value | humanizePercentage }}"
181       - alert: interface transmit errors
182         expr: rate(node_network_transmit_errs_total[1m]) / rate(node_network_transmit_packets_total[1m]) > 0.01
183         for: 5m
184         labels:
185           alertgroup: "{{ $labels.instance }}"
186         annotations:
187           error_rate: "{{ $value | humanizePercentage }}"
188       - alert: interface receive errors
189         expr: rate(node_network_receive_errs_total[1m]) / rate(node_network_receive_packets_total[1m]) > 0.01
190         for: 5m
191         labels:
192           alertgroup: "{{ $labels.instance }}"
193         annotations:
194           error_rate: "{{ $value | humanizePercentage }}"
195       - alert: conntrack entries
196         expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
197         for: 5m
198         labels:
199           alertgroup: "{{ $labels.instance }}"
200         annotations:
201           entries_used: "{{ $value | humanizePercentage }}"
202   - name: postgresql
203     rules:
204       - alert: postgresql down
205         expr: pg_up == 0
206         for: 1m
207         labels:
208           alertgroup: "{{ $labels.instance }}"
209       - alert: postgresql replication delay
210         expr: pg_replication_lag_seconds > 5
211         for: 1m
212         labels:
213           alertgroup: "{{ $labels.instance }}"
214         annotations:
215           delay: "{{ $value | humanizeDuration }}"
216       - alert: postgresql connection limit
217         expr: sum (pg_stat_activity_count) by (instance, server) / sum (pg_settings_max_connections) by (instance, server) > 0.8
218         for: 1m
219         labels:
220           alertgroup: "{{ $labels.instance }}"
221         annotations:
222           connections_used: "{{ $value | humanizePercentage }}"
223       - alert: postgresql deadlocks
224         expr: increase(pg_stat_database_deadlocks[1m]) > 5
225         for: 0m
226         labels:
227           alertgroup: "{{ $labels.instance }}"
228         annotations:
229           new_deadlocks: "{{ $value }}"
230       - alert: postgresql slow queries
231         expr: pg_slow_queries > 0
232         for: 5m
233         labels:
234           alertgroup: "{{ $labels.instance }}"
235         annotations:
236           queries: "{{ $value }}"
237   - name: smart
238     rules:
239       - alert: smart failure
240         expr: smart_health_status == 0
241         for: 60m
242         labels:
243           alertgroup: "{{ $labels.instance }}"
244       - alert: smart ssd wearout approaching
245         expr: smart_percentage_used >= 90
246         for: 60m
247         labels:
248           alertgroup: "{{ $labels.instance }}"
249         annotations:
250           percentage_used: "{{ $value | humanizePercentage }}"
251   - name: ssl
252     rules:
253       - alert: ssl certificate probe failed
254         expr: ssl_probe_success == 0
255         for: 60m
256         labels:
257           alertgroup: ssl
258       - alert: ssl certificate expiry
259         expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 14
260         for: 0m
261         labels:
262           alertgroup: ssl
263         annotations:
264           expires_in: "{{ $value | humanizeDuration }}"
265       - alert: ssl certificate revoked
266         expr: ssl_ocsp_response_status == 1
267         for: 0m
268         labels:
269           alertgroup: ssl
270       - alert: ocsp status unknown
271         expr: ssl_ocsp_response_status == 2
272         for: 0m
273         labels:
274           alertgroup: ssl
275   - name: systemd
276     rules:
277       - alert: systemd failed service
278         expr: node_systemd_unit_state{state="failed"} == 1
279         for: 5m
280         labels:
281           alertgroup: "{{ $labels.instance }}"
282   - name: tile
283     rules:
284       - alert: renderd replication delay
285         expr: renderd_replication_delay > 120
286         for: 5m
287         labels:
288           alertgroup: tile
289         annotations:
290           delay: "{{ $value | humanizeDuration }}"
291       - alert: missed tile rate
292         expr: sum(rate(modtile_http_response_total{code="404"}[5m])) by (instance) / sum(rate(modtile_http_response_total[5m])) by (instance) > 0.05
293         for: 5m
294         labels:
295           alertgroup: tile
296         annotations:
297           miss_rate: "{{ $value | humanizePercentage }}"
298   - name: time
299     rules:
300       - alert: clock not synchronising
301         expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
302         for: 5m
303         labels:
304           alertgroup: "{{ $labels.instance }}"
305       - alert: clock skew detected
306         expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
307         for: 5m
308         labels:
309           alertgroup: "{{ $labels.instance }}"
310         annotations:
311           skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }} {{ . | first | value | humanizeDuration }}{{ end }}"
312   - name: web
313     rules:
314       - alert: web error rate
315         expr: sum(rate(api_call_count_total{status=~"5.*"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002
316         for: 5m
317         labels:
318           alertgroup: web
319         annotations:
320           error_rate: "{{ $value | humanizePercentage }}"