9
9
"net/http"
10
10
"time"
11
11
12
+ "github.com/prometheus/client_golang/prometheus"
12
13
"github.com/rs/zerolog"
13
14
"golang.org/x/exp/rand"
14
15
)
@@ -21,6 +22,8 @@ type HttpRunner struct {
21
22
backoff time.Duration
22
23
// graceTime tells the HttpRunner how much time to add to the script timeout to form the request timeout.
23
24
graceTime time.Duration
25
+ // metrics stores metrics for the remote k6 runner.
26
+ metrics * HTTPMetrics
24
27
}
25
28
26
29
const (
@@ -81,6 +84,8 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error
81
84
Msg ("time until next execution is too close to script timeout, there might not be room for retries" )
82
85
}
83
86
87
+ // Retry logic is purely context (time) based, but we keep track of the number of attempts for reporting telemetry.
88
+ attempts := 1.0
84
89
wait := r .backoff
85
90
var response * RunResponse
86
91
for {
@@ -90,15 +95,21 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error
90
95
response , err = r .request (ctx , script )
91
96
if err == nil {
92
97
r .logger .Debug ().Bytes ("metrics" , response .Metrics ).Bytes ("logs" , response .Logs ).Msg ("script result" )
98
+ r .metrics .Requests .With (map [string ]string {metricLabelSuccess : "1" , metricLabelRetriable : "" }).Inc ()
99
+ r .metrics .RequestsPerRun .WithLabelValues ("1" ).Observe (attempts )
93
100
return response , nil
94
101
}
95
102
96
103
if ! errors .Is (err , errRetryable ) {
97
104
// TODO: Log the returned error in the Processor instead.
98
105
r .logger .Error ().Err (err ).Msg ("non-retryable error running k6" )
106
+ r .metrics .Requests .With (map [string ]string {metricLabelSuccess : "0" , metricLabelRetriable : "0" }).Inc ()
107
+ r .metrics .RequestsPerRun .WithLabelValues ("0" ).Observe (attempts )
99
108
return nil , err
100
109
}
101
110
111
+ r .metrics .Requests .With (map [string ]string {metricLabelSuccess : "0" , metricLabelRetriable : "1" }).Inc ()
112
+
102
113
// Wait, but subtract the amount of time we've already waited as part of the request timeout.
103
114
// We do this because these requests have huge timeouts, and by the nature of the system running these requests,
104
115
// we expect the most common error to be a timeout, so we avoid waiting even more on top of an already large
@@ -112,6 +123,7 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error
112
123
waitTimer .Stop ()
113
124
// TODO: Log the returned error in the Processor instead.
114
125
r .logger .Error ().Err (err ).Msg ("retries exhausted" )
126
+ r .metrics .RequestsPerRun .WithLabelValues ("0" ).Observe (attempts )
115
127
return nil , fmt .Errorf ("cannot retry further: %w" , errors .Join (err , ctx .Err ()))
116
128
case <- waitTimer .C :
117
129
}
@@ -206,3 +218,47 @@ func (r HttpRunner) request(ctx context.Context, script Script) (*RunResponse, e
206
218
207
219
return & response , nil
208
220
}
221
+
222
// HTTPMetrics holds the Prometheus collectors tracking requests made by the
// HTTP-based remote k6 runner.
type HTTPMetrics struct {
	// Requests counts every individual request made to remote k6 runners
	// (possibly several per run), labeled by per-request success and, for
	// failures, whether the error was retriable.
	Requests *prometheus.CounterVec
	// RequestsPerRun observes how many request attempts each run operation
	// needed, labeled by whether the run ultimately succeeded.
	RequestsPerRun *prometheus.HistogramVec
}
226
+
227
// Label names shared by the HTTPMetrics collectors.
const (
	// metricLabelSuccess is "1" when the request (or the overall run) succeeded
	// and "0" otherwise.
	metricLabelSuccess = "success"
	// metricLabelRetriable is "1" when a failed request may be retried and "0"
	// when it may not; successful requests set it to the empty string.
	metricLabelRetriable = "retriable"
)
231
+
232
+ func NewHTTPMetrics (registerer prometheus.Registerer ) * HTTPMetrics {
233
+ m := & HTTPMetrics {}
234
+ m .Requests = prometheus .NewCounterVec (
235
+ prometheus.CounterOpts {
236
+ Namespace : "sm_agent" ,
237
+ Subsystem : "k6runner" ,
238
+ Name : "requests_total" ,
239
+ Help : "Total number of requests made to remote k6 runners, which may be more than one per run. " +
240
+ "The 'success' label is 1 if this single request succeeded, 0 otherwise. " +
241
+ "The 'retriable' label is 1 if the request failed with a retriable error, 0 otherwise. " +
242
+ "Successful requests do not have the 'retriable' label." ,
243
+ },
244
+ []string {metricLabelSuccess , metricLabelRetriable },
245
+ )
246
+ registerer .MustRegister (m .Requests )
247
+
248
+ m .RequestsPerRun = prometheus .NewHistogramVec (
249
+ prometheus.HistogramOpts {
250
+ Namespace : "sm_agent" ,
251
+ Subsystem : "k6runner" ,
252
+ Name : "requests_per_run" ,
253
+ Help : "Number of requests attempted per run operation. " +
254
+ "The 'success' label is 1 if one ultimately succeeded potentially including retries, 0 otherwise." ,
255
+ // Generally we expect request to be retries a handful of times, so we create high resolution buckets up to
256
+ // 5. Form 5 onwards something off is going on and we do not care that much about resolution.
257
+ Buckets : []float64 {1 , 2 , 3 , 4 , 5 , 10 , 20 , 50 },
258
+ },
259
+ []string {metricLabelSuccess },
260
+ )
261
+ registerer .MustRegister (m .RequestsPerRun )
262
+
263
+ return m
264
+ }
0 commit comments