# DO NOT EDIT - This file is being maintained by Chef

groups:
  - name: amsterdam
    rules:
      - alert: he uplink
        expr: junos_interface_up{site="amsterdam",name=~"ge-[01]/2/2"} != 1
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "amsterdam"
        annotations:
          status: "{{ $value }}"
      - alert: equinix uplink
        expr: junos_interface_up{site="amsterdam",name=~"xe-[01]/2/0"} != 1
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "amsterdam"
        annotations:
          status: "{{ $value }}"
      - alert: pdu current draw
        expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 28
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "amsterdam"
        annotations:
          current: "{{ $value | humanize }}A"
      - alert: site power
        expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 3.5
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "amsterdam"
        annotations:
          current: "{{ $value | humanize }}kVA"
      - alert: site temperature
        expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 15 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 32
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "amsterdam"
        annotations:
          temperature: "{{ $value | humanize }}C"
      - alert: site humidity
        expr: max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 < 0.08 or max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 > 0.8
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "amsterdam"
        annotations:
          humidity: "{{ $value | humanizePercentage }}"
  - name: apache
    rules:
      - alert: apache down
        expr: apache_up == 0
        for: 5m
        keep_firing_for: 3m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: apache workers busy
        expr: sum(apache_workers{state="busy"}) by (instance) / sum(apache_scoreboard) by (instance) > 0.8
        for: 5m
        keep_firing_for: 3m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          busy_workers: "{{ $value | humanizePercentage }}"
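      # Ratio of open (non-closing) connections to an estimated mpm_event
      # capacity: ServerLimit * (ThreadsPerChild + AsyncRequestWorkerFactor *
      # idle workers / processes). Fires above 80% of that estimate.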
      - alert: apache connection limit
        expr: (apache_connections{state="total"} - on (instance) apache_connections{state="closing"}) / on (instance) (apache_server_limit * on (instance) (apache_threads_per_child + on (instance) (apache_async_request_worker_factor * on (instance) apache_workers{state="idle"} / on(instance) apache_processes{state="all"}))) > 0.8
        for: 5m
        keep_firing_for: 3m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          connections: "{{ $value | humanizePercentage }}"
  - name: chef
    rules:
      - alert: chef client not running
        expr: time() - node_systemd_timer_last_trigger_seconds{name="chef-client.timer"} > 3600
        for: 12h
        keep_firing_for: 10m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          down_time: "{{ $value | humanizeDuration }}"
  - name: cisco
    rules:
      - alert: cisco fan alarm
        expr: rlPhdUnitEnvParamFan1Status{rlPhdUnitEnvParamFan1Status!="normal"} > 0 or rlPhdUnitEnvParamFan2Status{rlPhdUnitEnvParamFan2Status!="normal"} > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.site }}"
        annotations:
          fan_rpm: "{{ with printf \"rlPhdUnitEnvParamFan1Speed{site='%s',instance='%s',rlPhdUnitEnvParamStackUnit='%s'}\" $labels.site $labels.instance $labels.rlPhdUnitEnvParamStackUnit | query }}{{ . | first | value | humanize }}rpm{{end}}"
      - alert: cisco temperature alarm
        expr: rlPhdUnitEnvParamTempSensorStatus{rlPhdUnitEnvParamTempSensorStatus!="ok"} > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.site }}"
        annotations:
          temp_celsius: "{{ with printf \"rlPhdUnitEnvParamTempSensorValue{site='%s',instance='%s',rlPhdUnitEnvParamStackUnit='%s'}\" $labels.site $labels.instance $labels.rlPhdUnitEnvParamStackUnit | query }}{{ . | first | value | humanize }}C{{end}}"
      - alert: cisco main power alarm
        expr: rlPhdUnitEnvParamMainPSStatus{rlPhdUnitEnvParamMainPSStatus!="normal"} > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.site }}"
      - alert: cisco redundant power alarm
        expr: rlPhdUnitEnvParamRedundantPSStatus{rlPhdUnitEnvParamRedundantPSStatus!="normal"} > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.site }}"
  - name: cpu
    rules:
      - alert: cpu pressure
        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.75
        for: 60m
        keep_firing_for: 10m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          pressure: "{{ $value | humanizePercentage }}"
  - name: database
    rules:
      - alert: active rails queries
        expr: sum(pg_stat_activity_count{datname="openstreetmap",usename="rails",state="active"}) by (instance) > 50 and on (instance) chef_role{name="db-master"}
        for: 5m
        keep_firing_for: 3m
        labels:
          alertgroup: database
        annotations:
          queries: "{{ $value }}"
      - alert: active cgimap queries
        expr: sum(pg_stat_activity_count{datname="openstreetmap",usename="cgimap",state="active"}) by (instance) > 30 and on (instance) chef_role{name="db-master"}
        for: 5m
        keep_firing_for: 3m
        labels:
          alertgroup: database
        annotations:
          queries: "{{ $value }}"
  - name: discourse
    rules:
      - alert: discourse job failure rate
        expr: rate(discourse_job_failures[5m]) > 0
        for: 5m
        keep_firing_for: 3m
        labels:
          alertgroup: discourse
        annotations:
          failure_rate: "{{ $value }} jobs/s"
  - name: dublin
    rules:
      - alert: he uplink
        expr: junos_interface_up{site="dublin",name=~"ge-[01]/2/2"} != 1
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "dublin"
        annotations:
          status: "{{ $value }}"
      - alert: equinix uplink
        expr: junos_interface_up{site="dublin",name=~"xe-[01]/2/0"} != 1
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "dublin"
        annotations:
          status: "{{ $value }}"
      - alert: pdu current draw
        expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 28
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "dublin"
        annotations:
          current: "{{ $value | humanize }}A"
      - alert: site power
        expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 4
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "dublin"
        annotations:
          current: "{{ $value | humanize }}kVA"
      - alert: site temperature
        expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "dublin"
        annotations:
          temperature: "{{ $value | humanize }}C"
      - alert: site humidity
        expr: max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="dublin"}) / 100 < 0.25 or max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="dublin"}) / 100 > 0.65
        for: 6m
        keep_firing_for: 3m
        labels:
          alertgroup: "dublin"
        annotations:
          humidity: "{{ $value | humanizePercentage }}"
  - name: fastly
    rules:
      - alert: fastly error rate
        expr: sum(rate(fastly_rt_status_group_total{status_group="5xx"}[5m])) by (service_name, datacenter) / sum(rate(fastly_rt_status_group_total[5m])) by (service_name, datacenter) > 0.005
        for: 15m
        keep_firing_for: 450s
        labels:
          alertgroup: fastly
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: fastly frontend healthcheck warning
        expr: count(fastly_healthcheck_status == 0) by (service, datacenter) > 2
        for: 15m
        keep_firing_for: 450s
        labels:
          alertgroup: fastly
      - alert: fastly frontend healthcheck critical
        expr: count(fastly_healthcheck_status == 0) by (service, datacenter) == count(fastly_healthcheck_status) by (service, datacenter)
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: fastly
      - alert: fastly backend healthcheck warning
        expr: count(fastly_healthcheck_status == 0) by (service, backend) > 10
        for: 15m
        keep_firing_for: 450s
        labels:
          alertgroup: fastly
      - alert: fastly backend healthcheck critical
        expr: count(fastly_healthcheck_status == 0) by (service, backend) == count(fastly_healthcheck_status) by (service, backend)
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: fastly
  - name: filesystem
    rules:
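      # Fires only when a filesystem has just become read-only: the current
      # readonly flag is compared with its minimum over the last 7 days, so
      # mounts that are always read-only do not alert.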
      - alert: readonly filesystem
        expr: node_filesystem_readonly > min_over_time(node_filesystem_readonly[7d])
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: filesystem low on space
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.05
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_free: "{{ $value | humanizePercentage }}"
          free_bytes: "{{ with printf \"node_filesystem_avail_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
          total_bytes: "{{ with printf \"node_filesystem_size_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
      - alert: filesystem low on inodes
        expr: node_filesystem_files_free / node_filesystem_files < 0.1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_free: "{{ $value | humanizePercentage }}"
          free_inodes: "{{ with printf \"node_filesystem_files_free{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
          total_inodes: "{{ with printf \"node_filesystem_files{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
  - name: hwmon
    rules:
      - alert: hwmon fan alarm
        expr: node_hwmon_fan_alarm == 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
          fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
      - alert: hwmon temperature alarm
        expr: node_hwmon_temp_alarm == 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
          temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
          temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
      - alert: hwmon voltage alarm
        expr: node_hwmon_in_alarm == 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
          in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
          in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
  - name: io
    rules:
      - alert: io pressure
        expr: rate(node_pressure_io_waiting_seconds_total[5m]) > 0.6
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          pressure: "{{ $value | humanizePercentage }}"
  - name: ipmi
    rules:
      - alert: ipmi fan alarm
        expr: ipmi_fan_speed_state > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
      - alert: ipmi temperature alarm
        expr: ipmi_temperature_state > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
      - alert: ipmi voltage alarm
        expr: ipmi_voltage_state > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
      - alert: ipmi power alarm
        expr: ipmi_power_state > 0 or ipmi_sensor_state{type=~"Power .*"} > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: juniper
    rules:
      - alert: juniper red alarms
        expr: juniper_alarms_red_count > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.site }}"
        annotations:
          alarm_count: "{{ $value }} alarms"
      - alert: juniper yellow alarms
        expr: juniper_alarms_yellow_count > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.site }}"
        annotations:
          alarm_count: "{{ $value }} alarms"
      - alert: juniper cpu alarm
        expr: junos_route_engine_load_average_five / 2 > 0.5
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.site }}"
        annotations:
          load_average: "{{ $value | humanizePercentage }}"
      - alert: juniper fan alarm
        expr: junos_environment_fan_up != 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.site }}"
      - alert: juniper power alarm
        expr: junos_environment_power_up != 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.site }}"
      - alert: juniper laser receive power
        expr: junos_interface_diagnostics_laser_rx_dbm < -12 and on (site, instance, name) junos_interface_admin_up == 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.site }}"
        annotations:
          power: "{{ $value }} dBm"
      - alert: juniper laser transmit power
        expr: junos_interface_diagnostics_laser_output_dbm < -8 and on (site, instance, name) junos_interface_admin_up == 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.site }}"
        annotations:
          power: "{{ $value }} dBm"
  - name: load
    rules:
      - alert: load average
        expr: sum(node_load5) by (instance) / count(node_cpu_frequency_max_hertz) by (instance) > 2
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          load: "{{ $value | humanizePercentage }}"
  - name: mail
    rules:
      - alert: exim down
        expr: exim_up == 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: exim queue length
        expr: exim_queue > ignoring(job) exim_queue_limit
        for: 60m
        keep_firing_for: 10m
        labels:
          alertgroup: mail
        annotations:
          queue_length: "{{ $value }}"
      - alert: mailman queue length
        expr: mailman_queue_length > 200
        for: 60m
        keep_firing_for: 10m
        labels:
          alertgroup: mail
        annotations:
          queue_length: "{{ $value }}"
  - name: mdadm
    rules:
      - alert: mdadm array inactive
        expr: node_md_state{state="inactive"} > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
      - alert: mdadm array degraded
        expr: sum (node_md_disks{state="active"}) without (state) < node_md_disks_required
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
      - alert: mdadm disk failed
        expr: node_md_disks{state="failed"} > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
  - name: memory
    rules:
      - alert: low memory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1
        for: 15m
        keep_firing_for: 450s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          memory_free: "{{ $value | humanizePercentage }}"
      - alert: memory pressure
        expr: rate(node_pressure_memory_waiting_seconds_total[5m]) > 0.6
        for: 60m
        keep_firing_for: 10m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          pressure: "{{ $value | humanizePercentage }}"
      - alert: oom kill detected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_oom_kills: "{{ $value }}"
  - name: mysql
    rules:
      - alert: mysql down
        expr: mysql_up == 0
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: mysql connection limit
        expr: mysql_global_status_max_used_connections / mysql_global_variables_max_connections > 0.8
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          connections_used: "{{ $value | humanizePercentage }}"
      - alert: mysql connection errors
        expr: increase(mysql_global_status_connection_errors_total[1m]) > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_count: "{{ $value }}"
  - name: network
    rules:
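      # node_bonding_active is joined (via label_replace) against the Chef
      # interface data for bonds configured in 802.3ad (LACP) mode; the alert
      # fires when such a bond has fewer than two active links.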
      - alert: interface redundancy lost
        expr: node_bonding_active < 2 and on (instance, master) label_replace(chef_network_interface{bond_mode="802.3ad"}, "master", "$1", "name", "(.*)")
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          link_count: "{{ $value }}"
      - alert: interface transmit rate
        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.99
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          bandwidth_used: "{{ $value | humanizePercentage }}"
      - alert: interface receive rate
        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.99
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          bandwidth_used: "{{ $value | humanizePercentage }}"
      - alert: interface transmit errors
        expr: rate(node_network_transmit_errs_total{device!~"wg.*"}[1m]) / rate(node_network_transmit_packets_total{device!~"wg.*"}[1m]) > 0.01
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: wireguard interface transmit errors
        expr: rate(node_network_transmit_errs_total{device=~"wg.*"}[1m]) / rate(node_network_transmit_packets_total{device=~"wg.*"}[1m]) > 0.05
        for: 1h
        keep_firing_for: 20m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: interface receive errors
        expr: rate(node_network_receive_errs_total[1m]) / rate(node_network_receive_packets_total[1m]) > 0.01
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: conntrack entries
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          entries_used: "{{ $value | humanizePercentage }}"
  - name: nominatim
    rules:
      - alert: nominatim replication delay
        expr: nominatim_replication_delay > 10800
        for: 1h
        keep_firing_for: 30m
        labels:
          alertgroup: nominatim
        annotations:
          delay: "{{ $value | humanizeDuration }}"
      - alert: nominatim connections
        expr: sum(nginx_connections_writing and on (instance) chef_role{name="nominatim"}) > 2500
        for: 15m
        keep_firing_for: 450s
        labels:
          alertgroup: nominatim
  - name: overpass
    rules:
      - alert: overpass osm database age
        expr: overpass_database_age_seconds{database="osm"} > 3600
        for: 1h
        keep_firing_for: 10m
        labels:
          alertgroup: overpass
        annotations:
          age: "{{ $value | humanizeDuration }}"
      - alert: overpass area database age
        expr: overpass_database_age_seconds{database="area"} > 86400
        for: 1h
        keep_firing_for: 10m
        labels:
          alertgroup: overpass
        annotations:
          age: "{{ $value | humanizeDuration }}"
  - name: passenger
    rules:
      - alert: passenger down
        expr: passenger_up == 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: passenger queuing
        expr: passenger_top_level_request_queue > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: passenger application queuing
        expr: passenger_app_request_queue > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: planet
    rules:
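      # Each rule compares the modification time of the published files with
      # the expected publication interval, restricted to hosts holding the
      # planetdump role.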
      - alert: planet dump overdue
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/(pbf|planet)/.*"} > 7 * 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 24h
        labels:
          alertgroup: planet
        annotations:
          overdue_by: "{{ $value | humanizeDuration }}"
      - alert: notes dump overdue
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/notes/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 6h
        labels:
          alertgroup: planet
        annotations:
          overdue_by: "{{ $value | humanizeDuration }}"
      - alert: daily replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/day/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 3h
        keep_firing_for: 10m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: hourly replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/hour/.*"} > 3600 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 30m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: minutely replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/minute/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: changeset replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/changesets/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
  - name: postgresql
    rules:
      - alert: postgresql down
        expr: pg_up == 0
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: postgresql replication delay
        expr: pg_replication_lag_seconds > 30
        for: 15m
        keep_firing_for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          delay: "{{ $value | humanizeDuration }}"
      - alert: postgresql connection limit
        expr: sum (pg_stat_activity_count) by (instance, server) / sum (pg_settings_max_connections) by (instance, server) > 0.8
        for: 1m
        keep_firing_for: 30s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          connections_used: "{{ $value | humanizePercentage }}"
      - alert: postgresql deadlocks
        expr: increase(pg_stat_database_deadlocks{datname!="nominatim"}[1m]) > 5
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_deadlocks: "{{ $value }}"
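      # Compares the total number of "idle in transaction" backends with the
      # number idle for 300s or less, so this fires when any backend has been
      # idle in a transaction for more than five minutes.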
      - alert: postgresql idle transactions
        expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="300"}) by (instance, server)
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          queries: "{{ $value }}"
  - name: prometheus
    rules:
      - alert: prometheus configuration error
        expr: prometheus_config_last_reload_successful == 0
        for: 10m
        keep_firing_for: 5m
        labels:
          alertgroup: "prometheus"
      - alert: prometheus target missing
        expr: up == 0
        for: 10m
        keep_firing_for: 5m
        labels:
          alertgroup: "prometheus"
      - alert: node exporter text file scrape error
        expr: node_textfile_scrape_error > 0
        for: 10m
        keep_firing_for: 5m
        labels:
          alertgroup: "prometheus"
  - name: raid
    rules:
      - alert: raid controller battery failed
        expr: ohai_controller_info{battery_status="failed"} > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: raid controller battery recharging
        expr: ohai_controller_info{battery_status="recharging"} > 0
        for: 4h
        keep_firing_for: 30m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: raid array degraded
        expr: ohai_array_info{status="degraded"} > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: raid disk failed
        expr: ohai_disk_info{status="failed"} > 0
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: rasdaemon
    rules:
      - alert: memory controller errors
        expr: increase(rasdaemon_mc_events_total[1m]) > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_errors: "{{ $value }}"
      - alert: pcie aer errors
        expr: increase(rasdaemon_aer_events_total[1m]) > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_errors: "{{ $value }}"
  - name: resolved
    rules:
      - alert: dnssec validation failures
        expr: rate(resolved_dnssec_verdicts_total{result="bogus"}[1m]) > 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: smart
    rules:
      - alert: smart failure
        expr: smart_health_status == 0
        for: 60m
        keep_firing_for: 10m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: smart ssd wearout approaching
        expr: smart_percentage_used / 100 >= 0.8
        for: 60m
        keep_firing_for: 10m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_used: "{{ $value | humanizePercentage }}"
  - name: smokeping
    rules:
      - alert: packet loss
        expr: 1 - (rate(smokeping_response_duration_seconds_count[5m]) / rate(smokeping_requests_total[5m])) > 0.02
        for: 10m
        keep_firing_for: 10m
        labels:
          alertgroup: smokeping
        annotations:
          loss_rate: "{{ $value | humanizePercentage }}"
  - name: snmp
    rules:
      - alert: snmp pdus missing
        expr: max_over_time(snmp_scrape_pdus_returned[1d]) - snmp_scrape_pdus_returned > 0
        for: 15m
        labels:
          alertgroup: snmp
        annotations:
          missing_pdus: "{{ $value }}"
  - name: ssl
    rules:
      - alert: ssl certificate probe failed
        expr: ssl_probe_success == 0
        for: 60m
        labels:
          alertgroup: ssl
      - alert: ssl certificate expiry
        expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 14
        for: 0m
        labels:
          alertgroup: ssl
        annotations:
          expires_in: "{{ $value | humanizeDuration }}"
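      # ssl_ocsp_response_status encodes the stapled OCSP certificate status:
      # 0 = good, 1 = revoked, 2 = unknown.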
      - alert: ssl certificate revoked
        expr: ssl_ocsp_response_status == 1
        for: 0m
        labels:
          alertgroup: ssl
      - alert: ocsp status unknown
        expr: ssl_ocsp_response_status == 2
        for: 0m
        labels:
          alertgroup: ssl
  - name: statuscake
    rules:
      - alert: statuscake uptime check failing
        expr: statuscake_paused == 0 and statuscake_up == 0
        for: 10m
        labels:
          alertgroup: statuscake
  - name: systemd
    rules:
      - alert: systemd failed service
        expr: node_systemd_unit_state{state="failed",name!="chef-client.service"} == 1
        for: 5m
        keep_firing_for: 150s
        labels:
          alertgroup: "{{ $labels.instance }}"
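      # Fires when chef-client.service has not been in the inactive state at
      # any point in the last six hours, i.e. a run is stuck or persistently
      # failing.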
      - alert: systemd failed chef client service
        expr: sum_over_time(node_systemd_unit_state{state="inactive",name="chef-client.service"}[6h]) == 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: taginfo
    rules:
      - alert: taginfo planet age
        expr: time() - taginfo_data_from_seconds > 129600 and on (instance) chef_role{name="taginfo"}
        for: 0m
        labels:
          alertgroup: taginfo
        annotations:
          age: "{{ $value | humanizeDuration }}"
      - alert: taginfo database age
        expr: time() - taginfo_database_update_finish_seconds > 129600 and on (instance) chef_role{name="taginfo"}
        for: 0m
        labels:
          alertgroup: taginfo
        annotations:
          age: "{{ $value | humanizeDuration }}"
      - alert: taginfo database size
        expr: abs(delta(taginfo_database_size_bytes[30m])) / taginfo_database_size_bytes > 0.1
        for: 30m
        keep_firing_for: 10m
        labels:
          alertgroup: taginfo
        annotations:
          size_change: "{{ $value | humanizePercentage }}"
  - name: tile
    rules:
      - alert: renderd replication delay
        expr: renderd_replication_delay > 120
        for: 15m
        keep_firing_for: 10m
        labels:
          alertgroup: tile
        annotations:
          delay: "{{ $value | humanizeDuration }}"
      - alert: missed tile rate
        expr: sum(rate(modtile_http_response_total{code="404"}[5m])) by (instance) / sum(rate(modtile_http_response_total[5m])) by (instance) > 0.05
        for: 5m
        keep_firing_for: 3m
        labels:
          alertgroup: tile
        annotations:
          miss_rate: "{{ $value | humanizePercentage }}"
      - alert: tile render rate
        expr: sum(rate(renderd_zoom_metatiles_total[5m])) by (instance) == 0
        for: 15m
        keep_firing_for: 10m
        labels:
          alertgroup: tile
        annotations:
          render_rate: "{{ $value }} tiles/s"
  - name: time
    rules:
      - alert: clock not synchronising
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 5m
        keep_firing_for: 2m
        labels:
          alertgroup: "{{ $labels.instance }}"
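      # Fires when the clock offset exceeds 50ms in either direction and the
      # offset is still drifting away from zero rather than being corrected.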
      - alert: clock skew detected
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 5m
        keep_firing_for: 2m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }} {{ . | humanizeDuration }}{{ end }}"
  - name: web
    rules:
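      # Share of API calls returning a 5xx status (excluding 509), which must
      # exceed 0.2% of all calls and an absolute rate of 0.05 errors/s before
      # the alert fires.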
      - alert: web error rate
        expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002 and sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) > 0.05
        for: 5m
        keep_firing_for: 3m
        labels:
          alertgroup: web
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
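      # Ratio of rows deleted from delayed_jobs to rows inserted over the last
      # hour on the db-master: below 0.9 means jobs are being queued faster
      # than they are processed.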
      - alert: job processing rate
        expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[1h]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[1h]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1
        for: 1h
        keep_firing_for: 10m
        labels:
          alertgroup: web
        annotations:
          job_processing_rate: "{{ $value | humanizePercentage }}"
  - name: aws
    rules:
      - alert: aws s3 replication lag
        expr: aws_s3_replication_latency_maximum > 7200
        for: 1h
        keep_firing_for: 30m
        labels:
          alertgroup: aws
        annotations:
          s3_object_replication_lag: "{{ $value | humanizeDuration }}"
      - alert: aws s3 replication failures
        expr: aws_s3_operations_failed_replication_sum > 0
        for: 1h
        keep_firing_for: 30m
        labels:
          alertgroup: aws
        annotations:
          s3_object_replication_failures: "{{ $value }} objects"
      - alert: aws s3 replication pending
        expr: aws_s3_operations_pending_replication_maximum > 1000
        for: 1h
        keep_firing_for: 30m
        labels:
          alertgroup: aws
        annotations:
          s3_object_replication_pending: "{{ $value }} objects"