]> git.openstreetmap.org Git - chef.git/blob - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Add alerts for filesystems low on space
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
1 # DO NOT EDIT - This file is being maintained by Chef
2
3 groups:
4   - name: alertmanager
5     rules:
6       - alert: prometheus target missing
7         expr: up == 0
8         for: 5m
9         labels:
10           alertgroup: "prometheus"
11   - name: apache
12     rules:
13       - alert: apache down
14         expr: apache_up == 0
15         for: 5m
16         labels:
17           alertgroup: "{{ $labels.instance }}"
18       - alert: apache workers busy
19         expr: sum(apache_workers{state="busy"}) by (instance) / sum(apache_scoreboard) by (instance) > 0.8
20         for: 5m
21         labels:
22           alertgroup: "{{ $labels.instance }}"
23         annotations:
24           busy_workers: "{{ $value | humanizePercentage }}"
25   - name: database
26     rules:
27       - alert: postgres replication delay
28         expr: pg_replication_lag_seconds > 5
29         for: 5m
30         labels:
31           alertgroup: database
32         annotations:
33           delay: "{{ $value | humanizeDuration }}"
34   - name: filesystem
35     rules:
36       - alert: filesystem low on space
37         expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.05
38         for: 5m
39         labels:
40           alertgroup: "{{ $labels.instance }}"
41         annotations:
42           percentage_free: "{{ $value | humanizePercentage }}"
43           free_bytes: "{{ with printf \"node_filesystem_avail_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
44           total_bytes: "{{ with printf \"node_filesystem_size_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
45       - alert: filesystem low on inodes
46         expr: node_filesystem_files_free / node_filesystem_files < 0.1
47         for: 5m
48         labels:
49           alertgroup: "{{ $labels.instance }}"
50         annotations:
51           percentage_free: "{{ $value | humanizePercentage }}"
52           free_inodes: "{{ with printf \"node_filesystem_files_free{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
53           total_inodes: "{{ with printf \"node_filesystem_files{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
54   - name: hwmon
55     rules:
56       - alert: hwmon fan alarm
57         expr: node_hwmon_fan_alarm == 1
58         for: 5m
59         labels:
60           alertgroup: "{{ $labels.instance }}"
61         annotations:
62           fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
63           fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
64       - alert: hwmon temperature alarm
65         expr: node_hwmon_temp_alarm == 1
66         for: 5m
67         labels:
68           alertgroup: "{{ $labels.instance }}"
69         annotations:
70           temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
71           temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
72           temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
73       - alert: hwmon voltage alarm
74         expr: node_hwmon_in_alarm == 1
75         for: 5m
76         labels:
77           alertgroup: "{{ $labels.instance }}"
78         annotations:
79           in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
80           in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
81           in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
82   - name: ipmi
83     rules:
84       - alert: ipmi fan alarm
85         expr: ipmi_fan_speed_state > 0
86         for: 5m
87         labels:
88           alertgroup: "{{ $labels.instance }}"
89         annotations:
90           fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
91       - alert: ipmi temperature alarm
92         expr: ipmi_temperature_state > 0
93         for: 5m
94         labels:
95           alertgroup: "{{ $labels.instance }}"
96         annotations:
97           temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
98       - alert: ipmi voltage alarm
99         expr: ipmi_voltage_state > 0
100         for: 5m
101         labels:
102           alertgroup: "{{ $labels.instance }}"
103         annotations:
104           voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
105       - alert: ipmi power alarm
106         expr: ipmi_power_state > 0 or ipmi_sensor_state{type=~"Power .*"} > 0
107         for: 5m
108         labels:
109           alertgroup: "{{ $labels.instance }}"
110   - name: mdadm
111     rules:
112       - alert: mdadm array inactive
113         expr: node_md_state{state="inactive"} > 0
114         for: 0m
115         labels:
116           alertgroup: "{{ $labels.instance }}"
117         annotations:
118           required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
119           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
120           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
121           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
122       - alert: mdadm disk failed
123         expr: node_md_disks{state="failed"} > 0
124         for: 0m
125         labels:
126           alertgroup: "{{ $labels.instance }}"
127         annotations:
128           required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
129           active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
130           failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
131           spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
132   - name: memory
133     rules:
134       - alert: low memory
135         expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1
136         for: 5m
137         labels:
138           alertgroup: "{{ $labels.instance }}"
139         annotations:
140           memory_free: "{{ $value | humanizePercentage }}"
141       - alert: memory pressure
142         expr: rate(node_vmstat_pgmajfault[1m]) > 1000
143         for: 5m
144         labels:
145           alertgroup: "{{ $labels.instance }}"
146         annotations:
147           major_page_faults: "{{ $value }} faults/s"
148       - alert: oom kill detected
149         expr: increase(node_vmstat_oom_kill[1m]) > 0
150         for: 0m
151         labels:
152           alertgroup: "{{ $labels.instance }}"
153         annotations:
154           new_oom_kills: "{{ $value }}"
155   - name: network
156     rules:
157       - alert: interface transmit rate
158         expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
159         for: 5m
160         labels:
161           alertgroup: "{{ $labels.instance }}"
162         annotations:
163           bandwidth_used: "{{ $value | humanizePercentage }}"
164       - alert: interface receive rate
165         expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
166         for: 5m
167         labels:
168           alertgroup: "{{ $labels.instance }}"
169         annotations:
170           bandwidth_used: "{{ $value | humanizePercentage }}"
171       - alert: interface transmit errors
172         expr: rate(node_network_transmit_errs_total[1m]) / rate(node_network_transmit_packets_total[1m]) > 0.01
173         for: 5m
174         labels:
175           alertgroup: "{{ $labels.instance }}"
176         annotations:
177           error_rate: "{{ $value | humanizePercentage }}"
178       - alert: interface receive errors
179         expr: rate(node_network_receive_errs_total[1m]) / rate(node_network_receive_packets_total[1m]) > 0.01
180         for: 5m
181         labels:
182           alertgroup: "{{ $labels.instance }}"
183         annotations:
184           error_rate: "{{ $value | humanizePercentage }}"
185       - alert: conntrack entries
186         expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
187         for: 5m
188         labels:
189           alertgroup: "{{ $labels.instance }}"
190         annotations:
191           entries_used: "{{ $value | humanizePercentage }}"
192   - name: postgresql
193     rules:
194       - alert: postgresql down
195         expr: pg_up == 0
196         for: 1m
197         labels:
198           alertgroup: "{{ $labels.instance }}"
199       - alert: postgresql replication delay
200         expr: pg_replication_lag_seconds > 5
201         for: 1m
202         labels:
203           alertgroup: "{{ $labels.instance }}"
204         annotations:
205           delay: "{{ $value | humanizeDuration }}"
206       - alert: postgresql connection limit
207         expr: sum (pg_stat_activity_count) by (instance, server) / sum (pg_settings_max_connections) by (instance, server) > 0.8
208         for: 1m
209         labels:
210           alertgroup: "{{ $labels.instance }}"
211         annotations:
212           connections_used: "{{ $value | humanizePercentage }}"
213       - alert: postgresql deadlocks
214         expr: increase(pg_stat_database_deadlocks[1m]) > 5
215         for: 0m
216         labels:
217           alertgroup: "{{ $labels.instance }}"
218         annotations:
219           new_deadlocks: "{{ $value }}"
220       - alert: postgresql slow queries
221         expr: pg_slow_queries > 0
222         for: 5m
223         labels:
224           alertgroup: "{{ $labels.instance }}"
225         annotations:
226           queries: "{{ $value }}"
227   - name: smart
228     rules:
229       - alert: smart failure
230         expr: smart_health_status == 0
231         for: 60m
232         labels:
233           alertgroup: "{{ $labels.instance }}"
234       - alert: smart ssd wearout approaching
235         expr: smart_percentage_used >= 90
236         for: 60m
237         labels:
238           alertgroup: "{{ $labels.instance }}"
239         annotations:
240           percentage_used: "{{ $value | humanizePercentage }}"
241   - name: ssl
242     rules:
243       - alert: ssl certificate probe failed
244         expr: ssl_probe_success == 0
245         for: 60m
246         labels:
247           alertgroup: ssl
248       - alert: ssl certificate expiry
249         expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 14
250         for: 0m
251         labels:
252           alertgroup: ssl
253         annotations:
254           expires_in: "{{ $value | humanizeDuration }}"
255       - alert: ssl certificate revoked
256         expr: ssl_ocsp_response_status == 1
257         for: 0m
258         labels:
259           alertgroup: ssl
260       - alert: ocsp status unknown
261         expr: ssl_ocsp_response_status == 2
262         for: 0m
263         labels:
264           alertgroup: ssl
265   - name: tile
266     rules:
267       - alert: renderd replication delay
268         expr: renderd_replication_delay > 120
269         for: 5m
270         labels:
271           alertgroup: tile
272         annotations:
273           delay: "{{ $value | humanizeDuration }}"
274       - alert: missed tile rate
275         expr: sum(rate(modtile_http_response_total{code="404"}[5m])) by (instance) / sum(rate(modtile_http_response_total[5m])) by (instance) > 0.05
276         for: 5m
277         labels:
278           alertgroup: tile
279         annotations:
280           miss_rate: "{{ $value | humanizePercentage }}"
281   - name: time
282     rules:
283       - alert: clock not synchronising
284         expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
285         for: 5m
286         labels:
287           alertgroup: "{{ $labels.instance }}"
288       - alert: clock skew detected
289         expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
290         for: 5m
291         labels:
292           alertgroup: "{{ $labels.instance }}"
293         annotations:
294           skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }} {{ . | first | value | humanizeDuration }}{{ end }}"