git.openstreetmap.org Git - chef.git/commitdiff
Update postgresql exporter configuration
author Tom Hughes <tom@compton.nu>
Mon, 3 Jul 2023 16:16:55 +0000 (17:16 +0100)
committer Tom Hughes <tom@compton.nu>
Mon, 3 Jul 2023 16:16:55 +0000 (17:16 +0100)
cookbooks/postgresql/attributes/default.rb
cookbooks/postgresql/recipes/default.rb
cookbooks/postgresql/templates/default/postgres_queries.yml.erb
cookbooks/prometheus/templates/default/alert_rules.yml.erb

index 54a224345121ac15af47adaac4d686ecdc1a3d62..038fadeff959899ae134b85eb7b63d18829abeee 100644 (file)
@@ -1,6 +1,5 @@
 default[:postgresql][:versions] = []
 default[:postgresql][:clusters] = {}
-default[:postgresql][:monitor_tables] = true
 default[:postgresql][:settings][:defaults][:port] = "5432"
 default[:postgresql][:settings][:defaults][:max_connections] = "100"
 default[:postgresql][:settings][:defaults][:ssl] = "true"
index a3ef7d23d9f4bb2ce687531510cc7c7cc184bc62..f5bab62318e1b8ecdc6c2c2876b8404dd67aedef 100644 (file)
@@ -166,12 +166,14 @@ template "/etc/prometheus/exporters/postgres_queries.yml" do
   mode "644"
 end
 
+# lag / lag_seconds
+# process_idle missing state
 prometheus_exporter "postgres" do
   port 9187
   scrape_interval "1m"
   scrape_timeout "1m"
   user "postgres"
-  options "--extend.query-path=/etc/prometheus/exporters/postgres_queries.yml"
+  options "--no-collector.process_idle --extend.query-path=/etc/prometheus/exporters/postgres_queries.yml"
   environment "DATA_SOURCE_URI" => uris.sort.uniq.first,
               "PG_EXPORTER_AUTO_DISCOVER_DATABASES" => "true",
               "PG_EXPORTER_EXCLUDE_DATABASES" => "postgres,template0,template1"
index 1c33e0c1ef8b464cff66556b6686910842e69dbe..a92382edc0ec0e2a2118165eacbcfcfae94542ac 100644 (file)
@@ -1,128 +1,3 @@
-pg_replication:
-  query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag_seconds"
-  master: true
-  metrics:
-    - lag_seconds:
-        usage: "GAUGE"
-        description: "Replication lag behind master in seconds"
-
-pg_postmaster:
-  query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
-  master: true
-  metrics:
-    - start_time_seconds:
-        usage: "GAUGE"
-        description: "Time at which postmaster started"
-<% if node[:postgresql][:monitor_tables] -%>
-
-pg_stat_user_tables:
-  query: "SELECT current_database() datname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables"
-  metrics:
-    - datname:
-        usage: "LABEL"
-        description: "Name of current database"
-    - schemaname:
-        usage: "LABEL"
-        description: "Name of the schema that this table is in"
-    - relname:
-        usage: "LABEL"
-        description: "Name of this table"
-    - seq_scan:
-        usage: "COUNTER"
-        description: "Number of sequential scans initiated on this table"
-    - seq_tup_read:
-        usage: "COUNTER"
-        description: "Number of live rows fetched by sequential scans"
-    - idx_scan:
-        usage: "COUNTER"
-        description: "Number of index scans initiated on this table"
-    - idx_tup_fetch:
-        usage: "COUNTER"
-        description: "Number of live rows fetched by index scans"
-    - n_tup_ins:
-        usage: "COUNTER"
-        description: "Number of rows inserted"
-    - n_tup_upd:
-        usage: "COUNTER"
-        description: "Number of rows updated"
-    - n_tup_del:
-        usage: "COUNTER"
-        description: "Number of rows deleted"
-    - n_tup_hot_upd:
-        usage: "COUNTER"
-        description: "Number of rows HOT updated (i.e., with no separate index update required)"
-    - n_live_tup:
-        usage: "GAUGE"
-        description: "Estimated number of live rows"
-    - n_dead_tup:
-        usage: "GAUGE"
-        description: "Estimated number of dead rows"
-    - n_mod_since_analyze:
-        usage: "GAUGE"
-        description: "Estimated number of rows changed since last analyze"
-    - last_vacuum:
-        usage: "GAUGE"
-        description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
-    - last_autovacuum:
-        usage: "GAUGE"
-        description: "Last time at which this table was vacuumed by the autovacuum daemon"
-    - last_analyze:
-        usage: "GAUGE"
-        description: "Last time at which this table was manually analyzed"
-    - last_autoanalyze:
-        usage: "GAUGE"
-        description: "Last time at which this table was analyzed by the autovacuum daemon"
-    - vacuum_count:
-        usage: "COUNTER"
-        description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
-    - autovacuum_count:
-        usage: "COUNTER"
-        description: "Number of times this table has been vacuumed by the autovacuum daemon"
-    - analyze_count:
-        usage: "COUNTER"
-        description: "Number of times this table has been manually analyzed"
-    - autoanalyze_count:
-        usage: "COUNTER"
-        description: "Number of times this table has been analyzed by the autovacuum daemon"
-
-pg_statio_user_tables:
-  query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
-  metrics:
-    - datname:
-        usage: "LABEL"
-        description: "Name of current database"
-    - schemaname:
-        usage: "LABEL"
-        description: "Name of the schema that this table is in"
-    - relname:
-        usage: "LABEL"
-        description: "Name of this table"
-    - heap_blks_read:
-        usage: "COUNTER"
-        description: "Number of disk blocks read from this table"
-    - heap_blks_hit:
-        usage: "COUNTER"
-        description: "Number of buffer hits in this table"
-    - idx_blks_read:
-        usage: "COUNTER"
-        description: "Number of disk blocks read from all indexes on this table"
-    - idx_blks_hit:
-        usage: "COUNTER"
-        description: "Number of buffer hits in all indexes on this table"
-    - toast_blks_read:
-        usage: "COUNTER"
-        description: "Number of disk blocks read from this table's TOAST table (if any)"
-    - toast_blks_hit:
-        usage: "COUNTER"
-        description: "Number of buffer hits in this table's TOAST table (if any)"
-    - tidx_blks_read:
-        usage: "COUNTER"
-        description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
-    - tidx_blks_hit:
-        usage: "COUNTER"
-        description: "Number of buffer hits in this table's TOAST table indexes (if any)"
-<% end -%>
-
 pg_process_idle:
   query: |
     WITH
@@ -174,19 +49,6 @@ pg_process_idle:
         usage: "HISTOGRAM"
         description: "Idle time of server processes"
 
-pg_unfrozen_ids:
-  query: "SELECT current_database() AS datname, max(age(relfrozenxid)) AS xid_age, max(mxid_age(relminmxid)) AS mxid_age FROM pg_class WHERE relkind IN ('r', 'm')"
-  metrics:
-    - datname:
-        usage: "LABEL"
-        description: "Name of the database"
-    - xid_age:
-        usage: "GAUGE"
-        description: "Age of the oldest unfrozen transaction ID in this database"
-    - mxid_age:
-        usage: "GAUGE"
-        description: "Age of the oldest unfrozen multixact ID in this database"
-
 pg_wal:
   query: "SELECT count(*) AS segment_count FROM pg_ls_waldir() WHERE name ~ '^[0-9A-Z]{24}$'"
   master: true
index 396de8de43001ed12dbce7aeca8a26f1fc55b0df..3c448cc2a3a515e00090d7bf9eab34c9b1d01c1c 100644 (file)
@@ -99,7 +99,7 @@ groups:
   - name: database
     rules:
       - alert: postgres replication delay
-        expr: pg_replication_lag_seconds > 30
+        expr: pg_replication_lag > 30
         for: 15m
         labels:
           alertgroup: database
@@ -507,7 +507,7 @@ groups:
         labels:
           alertgroup: "{{ $labels.instance }}"
       - alert: postgresql replication delay
-        expr: pg_replication_lag_seconds > 30
+        expr: pg_replication_lag > 30
         for: 15m
         labels:
           alertgroup: "{{ $labels.instance }}"