@@ -116,16 +116,21 @@ func (pm *podMetrics) refreshMetrics() error {
116
116
updated , err := pm .pmc .FetchMetrics (ctx , pm .GetPod (), pm .GetMetrics (), pool .Spec .TargetPortNumber )
117
117
if err != nil {
118
118
pm .logger .V (logutil .TRACE ).Info ("Failed to refreshed metrics:" , "err" , err )
119
- // As refresher is running in the background, it's possible that the pod is deleted but
120
- // the refresh goroutine doesn't read the done channel yet. In this case, we just return nil.
121
- // The refresher will be stopped after this interval.
122
- return nil
123
119
}
124
- updated .UpdateTime = time .Now ()
125
-
126
- pm .logger .V (logutil .TRACE ).Info ("Refreshed metrics" , "updated" , updated )
120
+ // Optimistically update metrics even if there was an error.
121
+ // The FetchMetrics can return an error for the following reasons:
122
+ // 1. As the refresher is running in the background, it's possible that the pod is deleted but
123
+ // the refresh goroutine hasn't read the done channel yet. In this case, the updated
124
+ // metrics object will be nil. And the refresher will soon be stopped.
125
+ // 2. The FetchMetrics call can partially fail. For example, due to one metric missing. In
126
+ // this case, the updated metrics object will have partial updates. A partial update is
127
+ // considered better than no updates.
128
+ if updated != nil {
129
+ updated .UpdateTime = time .Now ()
130
+ pm .logger .V (logutil .TRACE ).Info ("Refreshed metrics" , "updated" , updated )
131
+ atomic .StorePointer (& pm .metrics , unsafe .Pointer (updated ))
132
+ }
127
133
128
- atomic .StorePointer (& pm .metrics , unsafe .Pointer (updated ))
129
134
return nil
130
135
}
131
136
0 commit comments