X-Git-Url: https://git.openstreetmap.org/chef.git/blobdiff_plain/faeb3b7259889dc1b789ea14c223b41a0b78116b..090c2f22122d4f99a07198f59c55285e0250b78b:/cookbooks/prometheus/templates/default/alert_rules.yml.erb

Summary of this patch to the Prometheus alert rules template:
  * drops the "alertmanager" group and re-homes its "prometheus target missing"
    rule in a new "prometheus" group, next to a new "prometheus configuration
    error" rule;
  * replaces the invalid PromQL comparison `fastly_healthcheck_status = 0` with
    two count()-based rules (any failure for 15m, or more than 4 failures for 5m);
  * adds a "raid" group (degraded array, failed disk);
  * lowers the SSD wearout threshold from 90 to 80.

NOTE(review): the published diff rewrote the first line of the file as
"c# DO NOT EDIT ...". That stray leading "c" turns the header comment into a
YAML scalar and breaks the rule file, so the header is kept here as unchanged
context instead. The post-image blob id on the "index" line still names the
originally published blob and will not match the result of this patch.
NOTE(review): leading whitespace inside the hunks was reconstructed from the
standard Prometheus rule-file nesting (groups > rules > alert) - confirm it
against the repository before applying.

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index c108451ab..5809b570b 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -1,13 +1,6 @@
 # DO NOT EDIT - This file is being maintained by Chef
 
 groups:
-  - name: alertmanager
-    rules:
-      - alert: prometheus target missing
-        expr: up == 0
-        for: 10m
-        labels:
-          alertgroup: "prometheus"
   - name: amsterdam
     rules:
       - alert: pdu current draw
@@ -122,7 +115,12 @@ groups:
         annotations:
           error_rate: "{{ $value | humanizePercentage }}"
       - alert: fastly healthcheck failing
-        expr: fastly_healthcheck_status = 0
+        expr: count(fastly_healthcheck_status == 0) > 0
+        for: 15m
+        labels:
+          alertgroup: fastly
+      - alert: fastly healthcheck failing
+        expr: count(fastly_healthcheck_status == 0) > 4
         for: 5m
         labels:
           alertgroup: fastly
@@ -422,6 +420,30 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           queries: "{{ $value }}"
+  - name: prometheus
+    rules:
+      - alert: prometheus configuration error
+        expr: prometheus_config_last_reload_successful == 0
+        for: 10m
+        labels:
+          alertgroup: "prometheus"
+      - alert: prometheus target missing
+        expr: up == 0
+        for: 10m
+        labels:
+          alertgroup: "prometheus"
+  - name: raid
+    rules:
+      - alert: raid array degraded
+        expr: ohai_array_info{status="degraded"} > 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      - alert: raid disk failed
+        expr: ohai_disk_info{status="failed"} > 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
   - name: smart
     rules:
       - alert: smart failure
@@ -430,7 +452,7 @@ groups:
         labels:
           alertgroup: "{{ $labels.instance }}"
       - alert: smart ssd wearout approaching
-        expr: smart_percentage_used >= 90
+        expr: smart_percentage_used >= 80
         for: 60m
         labels:
           alertgroup: "{{ $labels.instance }}"