# DO NOT EDIT - This file is being maintained by Chef

groups:
  - name: alertmanager
    rules:
      - alert: prometheus target missing
        expr: up == 0
        for: 5m
        labels:
          alertgroup: "prometheus"
  - name: apache
    rules:
      - alert: apache down
        expr: apache_up == 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
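      # Busy worker slots as a share of all scoreboard slots; fires when more
      # than 80% of the Apache worker pool stays busy for five minutes.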
      - alert: apache workers busy
        expr: sum(apache_workers{state="busy"}) by (instance) / sum(apache_scoreboard) by (instance) > 0.8
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          busy_workers: "{{ $value | humanizePercentage }}"
  - name: chef
    rules:
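      # chef-client runs from a systemd timer; alert when the timer last fired
      # more than an hour ago and the condition persists for 12 hours.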
      - alert: chef client not running
        expr: time() - node_systemd_timer_last_trigger_seconds{name="chef-client.timer"} > 3600
        for: 12h
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          down_time: "{{ $value | humanizeDuration }}"
  - name: cpu
    rules:
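      # node_pressure_*_waiting_seconds_total are kernel PSI counters, so the
      # 5m rate approximates the fraction of time tasks were stalled waiting
      # for the resource (0.6 is roughly 60% of the window).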
      - alert: cpu pressure
        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.6
        for: 15m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          pressure: "{{ $value | humanizePercentage }}"
  - name: database
    rules:
      - alert: postgres replication delay
        expr: pg_replication_lag_seconds > 5
        for: 5m
        labels:
          alertgroup: database
        annotations:
          delay: "{{ $value | humanizeDuration }}"
  - name: fastly
    rules:
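      # Responses in the 5xx status group as a share of all responses, broken
      # down per Fastly service and datacenter.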
      - alert: error rate
        expr: sum(rate(fastly_rt_status_group_total{status_group="5xx"}[5m])) by (service_name, datacenter) / sum(rate(fastly_rt_status_group_total[5m])) by (service_name, datacenter) > 0.005
        for: 15m
        labels:
          alertgroup: fastly
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
  - name: filesystem
    rules:
      - alert: readonly filesystem
        expr: node_filesystem_readonly == 1
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
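      # The printf/query annotations below run an instant query when the alert
      # template is expanded, taking the first matching sample and humanizing
      # its value.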
      - alert: filesystem low on space
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.05
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_free: "{{ $value | humanizePercentage }}"
          free_bytes: "{{ with printf \"node_filesystem_avail_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
          total_bytes: "{{ with printf \"node_filesystem_size_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
      - alert: filesystem low on inodes
        expr: node_filesystem_files_free / node_filesystem_files < 0.1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_free: "{{ $value | humanizePercentage }}"
          free_inodes: "{{ with printf \"node_filesystem_files_free{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
          total_inodes: "{{ with printf \"node_filesystem_files{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
  - name: hwmon
    rules:
      - alert: hwmon fan alarm
        expr: node_hwmon_fan_alarm == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
          fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
      - alert: hwmon temperature alarm
        expr: node_hwmon_temp_alarm == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
          temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
          temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
      - alert: hwmon voltage alarm
        expr: node_hwmon_in_alarm == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
          in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
          in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
  - name: io
    rules:
      - alert: io pressure
        expr: rate(node_pressure_io_waiting_seconds_total[5m]) > 0.6
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          pressure: "{{ $value | humanizePercentage }}"
  - name: ipmi
    rules:
      - alert: ipmi fan alarm
        expr: ipmi_fan_speed_state > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
      - alert: ipmi temperature alarm
        expr: ipmi_temperature_state > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
      - alert: ipmi voltage alarm
        expr: ipmi_voltage_state > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
      - alert: ipmi power alarm
        expr: ipmi_power_state > 0 or ipmi_sensor_state{type=~"Power .*"} > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: mail
    rules:
      - alert: exim queue length
        expr: exim_queue > exim_queue_limit
        for: 60m
        labels:
          alertgroup: mail
        annotations:
          queue_length: "{{ $value }}"
      - alert: mailman queue length
        expr: mailman_queue_length > 200
        for: 60m
        labels:
          alertgroup: mail
        annotations:
          queue_length: "{{ $value }}"
  - name: mdadm
    rules:
      - alert: mdadm array inactive
        expr: node_md_state{state="inactive"} > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
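      # Degraded: the number of active member disks, summed with the state
      # label dropped, has fallen below what the array requires.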
      - alert: mdadm array degraded
        expr: sum (node_md_disks{state="active"}) without (state) < node_md_disks_required
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
      - alert: mdadm disk failed
        expr: node_md_disks{state="failed"} > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
  - name: memory
    rules:
      - alert: low memory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1
        for: 15m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          memory_free: "{{ $value | humanizePercentage }}"
      - alert: memory pressure
        expr: rate(node_pressure_memory_waiting_seconds_total[5m]) > 0.6
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          pressure: "{{ $value | humanizePercentage }}"
      - alert: oom kill detected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_oom_kills: "{{ $value }}"
  - name: network
    rules:
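      # Throughput as a fraction of the advertised link speed; above 98% for
      # five minutes the interface is effectively saturated.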
      - alert: interface transmit rate
        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          bandwidth_used: "{{ $value | humanizePercentage }}"
      - alert: interface receive rate
        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          bandwidth_used: "{{ $value | humanizePercentage }}"
      - alert: interface transmit errors
        expr: rate(node_network_transmit_errs_total[1m]) / rate(node_network_transmit_packets_total[1m]) > 0.01
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: interface receive errors
        expr: rate(node_network_receive_errs_total[1m]) / rate(node_network_receive_packets_total[1m]) > 0.01
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: conntrack entries
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          entries_used: "{{ $value | humanizePercentage }}"
  - name: planet
    rules:
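      # Age of the dump and replication files on disk; the
      # "and ignoring (...) chef_role{name='planetdump'} == 1" clause limits
      # each alert to hosts carrying the planetdump Chef role.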
      - alert: planet dump overdue
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/(pbf|planet)/.*"} > 7 * 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 24h
        labels:
          alertgroup: planet
        annotations:
          overdue_by: "{{ $value | humanizeDuration }}"
      - alert: notes dump overdue
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/notes/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 6h
        labels:
          alertgroup: planet
        annotations:
          overdue_by: "{{ $value | humanizeDuration }}"
      - alert: daily replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/day/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 3h
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: hourly replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/hour/.*"} > 3600 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 30m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: minutely replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/minute/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 5m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: changeset replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/changesets/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 5m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
  - name: postgresql
    rules:
      - alert: postgresql down
        expr: pg_up == 0
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: postgresql replication delay
        expr: pg_replication_lag_seconds > 5
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          delay: "{{ $value | humanizeDuration }}"
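      # Connections in use as a fraction of max_connections, aggregated per
      # instance and per server.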
      - alert: postgresql connection limit
        expr: sum (pg_stat_activity_count) by (instance, server) / sum (pg_settings_max_connections) by (instance, server) > 0.8
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          connections_used: "{{ $value | humanizePercentage }}"
      - alert: postgresql deadlocks
        expr: increase(pg_stat_database_deadlocks[1m]) > 5
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_deadlocks: "{{ $value }}"
      - alert: postgresql slow queries
        expr: pg_slow_queries > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          queries: "{{ $value }}"
  - name: smart
    rules:
      - alert: smart failure
        expr: smart_health_status == 0
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: smart ssd wearout approaching
        expr: smart_percentage_used / 100 >= 0.9
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_used: "{{ $value | humanizePercentage }}"
  - name: ssl
    rules:
      - alert: ssl certificate probe failed
        expr: ssl_probe_success == 0
        for: 60m
        labels:
          alertgroup: ssl
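      # Fires once the probed certificate is within 14 days (86400 * 14
      # seconds) of its expiry time.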
      - alert: ssl certificate expiry
        expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 14
        for: 0m
        labels:
          alertgroup: ssl
        annotations:
          expires_in: "{{ $value | humanizeDuration }}"
      - alert: ssl certificate revoked
        expr: ssl_ocsp_response_status == 1
        for: 0m
        labels:
          alertgroup: ssl
      - alert: ocsp status unknown
        expr: ssl_ocsp_response_status == 2
        for: 0m
        labels:
          alertgroup: ssl
  - name: systemd
    rules:
      - alert: systemd failed service
        expr: node_systemd_unit_state{state="failed",name!="chef-client.service"} == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: systemd failed service
        expr: node_systemd_unit_state{state="failed",name="chef-client.service"} == 1
        for: 6h
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: tile
    rules:
      - alert: renderd replication delay
        expr: renderd_replication_delay > 120
        for: 5m
        labels:
          alertgroup: tile
        annotations:
          delay: "{{ $value | humanizeDuration }}"
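      # Fraction of mod_tile responses that were 404s, i.e. tiles that were
      # requested but could not be served.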
      - alert: missed tile rate
        expr: sum(rate(modtile_http_response_total{code="404"}[5m])) by (instance) / sum(rate(modtile_http_response_total[5m])) by (instance) > 0.05
        for: 5m
        labels:
          alertgroup: tile
        annotations:
          miss_rate: "{{ $value | humanizePercentage }}"
  - name: time
    rules:
      - alert: clock not synchronising
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
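      # Offset beyond ±50ms that is not converging back towards zero (the
      # derivative has the same sign as the offset).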
      - alert: clock skew detected
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }}{{ . | first | value | humanizeDuration }}{{ end }}"
  - name: web
    rules:
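      # 5xx share of API calls; the status regex matches 500-508 and 510-599.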
      - alert: web error rate
        expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002
        for: 5m
        labels:
          alertgroup: web
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
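      # Rows deleted from delayed_jobs (jobs completed) versus rows inserted
      # (jobs queued) on the db-master; only alert if the ratio stays below
      # 0.9 for an extended period.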
      - alert: job processing rate
        expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[5m]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[5m]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1
        for: 15m
        labels:
          alertgroup: web
        annotations:
          job_processing_rate: "{{ $value | humanizePercentage }}"