From eacee20bf2ee23e2c271848adc005afd6d838e23 Mon Sep 17 00:00:00 2001
From: Grant Slater
Date: Fri, 3 Oct 2025 13:46:22 +0100
Subject: [PATCH] prometheus: Add alertmanager rules for S3 replication lag and failures

---
# Review notes (free-form area after the "---" separator; not part of the
# commit message or of the diff below).
#
# Purpose: appends one new rule group, `aws`, to alert_rules.yml.erb directly
# after the last rule that carries `alertgroup: web`. The group holds three
# alerts:
#   * aws s3 replication lag      - aws_s3_replication_latency_maximum > 7200
#   * aws s3 replication failures - aws_s3_operations_failed_replication_sum > 0
#   * aws s3 replication pending  - aws_s3_operations_pending_replication_maximum > 1000
# Every alert uses `for: 1h`, `keep_firing_for: 30m` and the label
# `alertgroup: aws`, and exposes the triggering value through one annotation.
#
# NOTE(review): the lag annotation pipes $value through humanizeDuration, which
# formats a number of seconds; 7200 is therefore presumably a 2 hour threshold.
# Confirm the unit the exporter reports for this metric.
# NOTE(review): with `for: 1h`, the failures expression must stay above 0 on
# every evaluation for a full hour before the alert fires, so short bursts of
# failed replications will not alert. Confirm this is intended.
# NOTE(review): the subject says "alertmanager rules", but the diff adds entries
# under `groups:` / `rules:` with `alert:` and `expr:` keys, which looks like a
# Prometheus alerting-rule file. Confirm the wording.
# NOTE(review): the copy under review had lost its line breaks and leading
# whitespace. The indentation below assumes 2-space nesting under `groups:`;
# verify against the target file before applying.
 .../templates/default/alert_rules.yml.erb     | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index b770dba49..2d58610e6 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -927,3 +927,29 @@ groups:
           alertgroup: web
         annotations:
           job_processing_rate: "{{ $value | humanizePercentage }}"
+  - name: aws
+    rules:
+      - alert: aws s3 replication lag
+        expr: aws_s3_replication_latency_maximum > 7200
+        for: 1h
+        keep_firing_for: 30m
+        labels:
+          alertgroup: aws
+        annotations:
+          s3_object_replication_lag: "{{ $value | humanizeDuration }}"
+      - alert: aws s3 replication failures
+        expr: aws_s3_operations_failed_replication_sum > 0
+        for: 1h
+        keep_firing_for: 30m
+        labels:
+          alertgroup: aws
+        annotations:
+          s3_object_replication_failures: "{{ $value }} objects"
+      - alert: aws s3 replication pending
+        expr: aws_s3_operations_pending_replication_maximum > 1000
+        for: 1h
+        keep_firing_for: 30m
+        labels:
+          alertgroup: aws
+        annotations:
+          s3_object_replication_pending: "{{ $value }} objects"
-- 
2.39.5