Skip to content

Commit c3885e8

Browse files
authored
Export last replay age in replication collector (#1085)
The exported replication lag does not handle all failure modes, and can report 0 for replicas that are out of sync and incapable of recovery.

A proper replacement for that metric would require a different approach (see e.g. #1007), but for a lot of folks, simply exporting the age of the last replay can provide a pretty strong signal for something being amiss.

I think this solution might be preferable to #977, though the lag metric needs to be fixed or abandoned eventually.

Signed-off-by: Conrad Hoffmann <[email protected]>
1 parent 2ee2a8f commit c3885e8

File tree

2 files changed

+20
-4
lines changed

2 files changed

+20
-4
lines changed

Diff for: collector/pg_replication.go

+17-2
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,15 @@ var (
5151
"Indicates if the server is a replica",
5252
[]string{}, nil,
5353
)
54+
pgReplicationLastReplay = prometheus.NewDesc(
55+
prometheus.BuildFQName(
56+
namespace,
57+
replicationSubsystem,
58+
"last_replay_seconds",
59+
),
60+
"Age of last replay in seconds",
61+
[]string{}, nil,
62+
)
5463

5564
pgReplicationQuery = `SELECT
5665
CASE
@@ -61,7 +70,8 @@ var (
6170
CASE
6271
WHEN pg_is_in_recovery() THEN 1
6372
ELSE 0
64-
END as is_replica`
73+
END as is_replica,
74+
GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) as last_replay`
6575
)
6676

6777
func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error {
@@ -72,7 +82,8 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
7282

7383
var lag float64
7484
var isReplica int64
75-
err := row.Scan(&lag, &isReplica)
85+
var replayAge float64
86+
err := row.Scan(&lag, &isReplica, &replayAge)
7687
if err != nil {
7788
return err
7889
}
@@ -84,5 +95,9 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
8495
pgReplicationIsReplica,
8596
prometheus.GaugeValue, float64(isReplica),
8697
)
98+
ch <- prometheus.MustNewConstMetric(
99+
pgReplicationLastReplay,
100+
prometheus.GaugeValue, replayAge,
101+
)
87102
return nil
88103
}

Diff for: collector/pg_replication_test.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ func TestPgReplicationCollector(t *testing.T) {
3131

3232
inst := &instance{db: db}
3333

34-
columns := []string{"lag", "is_replica"}
34+
columns := []string{"lag", "is_replica", "last_replay"}
3535
rows := sqlmock.NewRows(columns).
36-
AddRow(1000, 1)
36+
AddRow(1000, 1, 3)
3737
mock.ExpectQuery(sanitizeQuery(pgReplicationQuery)).WillReturnRows(rows)
3838

3939
ch := make(chan prometheus.Metric)
@@ -49,6 +49,7 @@ func TestPgReplicationCollector(t *testing.T) {
4949
expected := []MetricResult{
5050
{labels: labelMap{}, value: 1000, metricType: dto.MetricType_GAUGE},
5151
{labels: labelMap{}, value: 1, metricType: dto.MetricType_GAUGE},
52+
{labels: labelMap{}, value: 3, metricType: dto.MetricType_GAUGE},
5253
}
5354

5455
convey.Convey("Metrics comparison", t, func() {

0 commit comments

Comments
 (0)