diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index be732e78..d48b1dc5 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -39,7 +39,8 @@ type PodMetricsClientImpl struct { MetricMapping *MetricMapping } -// FetchMetrics fetches metrics from a given pod. +// FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an +// updated one. func (p *PodMetricsClientImpl) FetchMetrics( ctx context.Context, pod *Pod, diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index 01db14be..b7f20e9b 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -116,16 +116,21 @@ func (pm *podMetrics) refreshMetrics() error { updated, err := pm.pmc.FetchMetrics(ctx, pm.GetPod(), pm.GetMetrics(), pool.Spec.TargetPortNumber) if err != nil { pm.logger.V(logutil.TRACE).Info("Failed to refreshed metrics:", "err", err) - // As refresher is running in the background, it's possible that the pod is deleted but - // the refresh goroutine doesn't read the done channel yet. In this case, we just return nil. - // The refresher will be stopped after this interval. - return nil } - updated.UpdateTime = time.Now() - - pm.logger.V(logutil.TRACE).Info("Refreshed metrics", "updated", updated) + // Optimistically update metrics even if there was an error. + // FetchMetrics can return an error for the following reasons: + // 1. As the refresher is running in the background, it's possible that the pod has been deleted but + // the refresh goroutine hasn't read the done channel yet. In this case, the updated + // metrics object will be nil, and the refresher will soon be stopped. + // 2. The FetchMetrics call can partially fail, for example because one metric is missing. In + // this case, the updated metrics object will have partial updates. A partial update is + // considered better than no updates.
+ if updated != nil { + updated.UpdateTime = time.Now() + pm.logger.V(logutil.TRACE).Info("Refreshed metrics", "updated", updated) + atomic.StorePointer(&pm.metrics, unsafe.Pointer(updated)) + } - atomic.StorePointer(&pm.metrics, unsafe.Pointer(updated)) return nil }