Skip to content

Commit 7a7a944

Browse files
committed
Export last replay age in replication collector
The exported replication lag does not handle all failure modes: it can report 0 for replicas that are out of sync and incapable of recovery. A proper replacement for that metric would require a different approach (see e.g. #1007), but for many users, simply exporting the age of the last replay provides a fairly strong signal that something is amiss.

I think this solution might be preferable to #977, though the lag metric still needs to be fixed or abandoned eventually.

Signed-off-by: Conrad Hoffmann <[email protected]>
1 parent f9c7457 commit 7a7a944

File tree

1 file changed

+17
-2
lines changed

1 file changed

+17
-2
lines changed

Diff for: collector/pg_replication.go

+17-2
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,15 @@ var (
5151
"Indicates if the server is a replica",
5252
[]string{}, nil,
5353
)
54+
pgReplicationLastReplay = prometheus.NewDesc(
55+
prometheus.BuildFQName(
56+
namespace,
57+
replicationSubsystem,
58+
"last_replay_seconds",
59+
),
60+
"Age of last replay in seconds",
61+
[]string{}, nil,
62+
)
5463

5564
pgReplicationQuery = `SELECT
5665
CASE
@@ -61,7 +70,8 @@ var (
6170
CASE
6271
WHEN pg_is_in_recovery() THEN 1
6372
ELSE 0
64-
END as is_replica`
73+
END as is_replica,
74+
GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) as last_replay`
6575
)
6676

6777
func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error {
@@ -72,7 +82,8 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
7282

7383
var lag float64
7484
var isReplica int64
75-
err := row.Scan(&lag, &isReplica)
85+
var replayAge float64
86+
err := row.Scan(&lag, &isReplica, &replayAge)
7687
if err != nil {
7788
return err
7889
}
@@ -84,5 +95,9 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
8495
pgReplicationIsReplica,
8596
prometheus.GaugeValue, float64(isReplica),
8697
)
98+
ch <- prometheus.MustNewConstMetric(
99+
pgReplicationLastReplay,
100+
prometheus.GaugeValue, replayAge,
101+
)
87102
return nil
88103
}

0 commit comments

Comments
 (0)