
Commit 8f93e87

feat: report metrics about requests made to remote runners
1 parent 2ff3090 commit 8f93e87

3 files changed, +63 -0 lines changed

cmd/synthetic-monitoring-agent/main.go (+1)

@@ -273,6 +273,7 @@ func run(args []string, stdout io.Writer) error {
 		k6Runner = k6runner.New(k6runner.RunnerOpts{
 			Uri:           config.K6URI,
 			BlacklistedIP: config.K6BlacklistedIP,
+			Registerer:    promRegisterer,
 		})
 	}
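
The added Registerer field relies on a promRegisterer value that main.go already holds; this commit does not show where it comes from. As a rough, hypothetical sketch (not part of the commit), an agent would typically build such a registerer from a dedicated *prometheus.Registry and expose it over an HTTP /metrics endpoint via promhttp; the listen address below is a placeholder:

package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Dedicated registry; the k6 runner metrics (and any other agent collectors) register against it.
	promRegisterer := prometheus.NewRegistry()

	// Serve everything registered on it at /metrics.
	http.Handle("/metrics", promhttp.HandlerFor(promRegisterer, promhttp.HandlerOpts{}))

	// ... construct the runner with k6runner.New(k6runner.RunnerOpts{Registerer: promRegisterer, ...})
	// and start the rest of the agent here.

	_ = http.ListenAndServe(":4050", nil) // listen address is a placeholder
}

Anything the k6 runner registers through RunnerOpts.Registerer then shows up in that scrape output automatically.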

internal/k6runner/http.go (+56)

@@ -9,6 +9,7 @@ import (
 	"net/http"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/rs/zerolog"
 	"golang.org/x/exp/rand"
 )
@@ -21,6 +22,8 @@ type HttpRunner struct {
 	backoff time.Duration
 	// graceTime tells the HttpRunner how much time to add to the script timeout to form the request timeout.
 	graceTime time.Duration
+	// metrics stores metrics for the remote k6 runner.
+	metrics *HTTPMetrics
 }
 
 const (
@@ -81,6 +84,8 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error
 			Msg("time until next execution is too close to script timeout, there might not be room for retries")
 	}
 
+	// Retry logic is purely context (time) based, but we keep track of the number of attempts for reporting telemetry.
+	attempts := 1.0
 	wait := r.backoff
 	var response *RunResponse
 	for {
@@ -90,15 +95,21 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error
 		response, err = r.request(ctx, script)
 		if err == nil {
 			r.logger.Debug().Bytes("metrics", response.Metrics).Bytes("logs", response.Logs).Msg("script result")
+			r.metrics.Requests.With(map[string]string{metricLabelSuccess: "1", metricLabelRetriable: ""}).Inc()
+			r.metrics.RequestsPerRun.WithLabelValues("1").Observe(attempts)
 			return response, nil
 		}
 
 		if !errors.Is(err, errRetryable) {
 			// TODO: Log the returned error in the Processor instead.
 			r.logger.Error().Err(err).Msg("non-retryable error running k6")
+			r.metrics.Requests.With(map[string]string{metricLabelSuccess: "0", metricLabelRetriable: "0"}).Inc()
+			r.metrics.RequestsPerRun.WithLabelValues("0").Observe(attempts)
 			return nil, err
 		}
 
+		r.metrics.Requests.With(map[string]string{metricLabelSuccess: "0", metricLabelRetriable: "1"}).Inc()
+
 		// Wait, but subtract the amount of time we've already waited as part of the request timeout.
 		// We do this because these requests have huge timeouts, and by the nature of the system running these requests,
 		// we expect the most common error to be a timeout, so we avoid waiting even more on top of an already large
@@ -112,6 +123,7 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error
 			waitTimer.Stop()
 			// TODO: Log the returned error in the Processor instead.
 			r.logger.Error().Err(err).Msg("retries exhausted")
+			r.metrics.RequestsPerRun.WithLabelValues("0").Observe(attempts)
 			return nil, fmt.Errorf("cannot retry further: %w", errors.Join(err, ctx.Err()))
 		case <-waitTimer.C:
 		}
@@ -206,3 +218,47 @@ func (r HttpRunner) request(ctx context.Context, script Script) (*RunResponse, e
 
 	return &response, nil
 }
+
+type HTTPMetrics struct {
+	Requests       *prometheus.CounterVec
+	RequestsPerRun *prometheus.HistogramVec
+}
+
+const (
+	metricLabelSuccess   = "success"
+	metricLabelRetriable = "retriable"
+)
+
+func NewHTTPMetrics(registerer prometheus.Registerer) *HTTPMetrics {
+	m := &HTTPMetrics{}
+	m.Requests = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "sm_agent",
+			Subsystem: "k6runner",
+			Name:      "requests_total",
+			Help: "Total number of requests made to remote k6 runners, which may be more than one per run. " +
+				"The 'success' label is 1 if this single request succeeded, 0 otherwise. " +
+				"The 'retriable' label is 1 if the request failed with a retriable error, 0 otherwise. " +
+				"Successful requests do not have the 'retriable' label.",
+		},
+		[]string{metricLabelSuccess, metricLabelRetriable},
+	)
+	registerer.MustRegister(m.Requests)
+
+	m.RequestsPerRun = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: "sm_agent",
+			Subsystem: "k6runner",
+			Name:      "requests_per_run",
+			Help: "Number of requests attempted per run operation. " +
+				"The 'success' label is 1 if one ultimately succeeded, potentially including retries, 0 otherwise.",
+			// Generally we expect requests to be retried a handful of times, so we create high-resolution buckets up
+			// to 5. From 5 onwards something odd is going on and we do not care that much about resolution.
+			Buckets: []float64{1, 2, 3, 4, 5, 10, 20, 50},
+		},
+		[]string{metricLabelSuccess},
+	)
+	registerer.MustRegister(m.RequestsPerRun)
+
+	return m
+}
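
Since the new HTTPMetrics collectors are only wired into HttpRunner, a quick way to sanity-check them in isolation is through prometheus/testutil. The test below is not part of this commit; it is a sketch that drives the collectors the same way Run does for a first-attempt success, and it assumes it lives in the k6runner package so it can reach the unexported label constants:

package k6runner

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestHTTPMetricsFirstAttemptSuccess(t *testing.T) {
	registry := prometheus.NewRegistry()
	m := NewHTTPMetrics(registry)

	// Mirror what Run reports when the first request succeeds.
	m.Requests.With(map[string]string{metricLabelSuccess: "1", metricLabelRetriable: ""}).Inc()
	m.RequestsPerRun.WithLabelValues("1").Observe(1)

	// Namespace, subsystem and name combine into sm_agent_k6runner_requests_total.
	got := testutil.ToFloat64(m.Requests.With(map[string]string{metricLabelSuccess: "1", metricLabelRetriable: ""}))
	if got != 1 {
		t.Fatalf("expected requests_total to be 1, got %v", got)
	}
	if n := testutil.CollectAndCount(m.RequestsPerRun, "sm_agent_k6runner_requests_per_run"); n != 1 {
		t.Fatalf("expected one requests_per_run series, got %d", n)
	}
}

Scraped through the registry, these become the sm_agent_k6runner_requests_total counter and the sm_agent_k6runner_requests_per_run _bucket/_sum/_count histogram series.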

internal/k6runner/k6runner.go (+6)

@@ -98,18 +98,24 @@ type Runner interface {
 type RunnerOpts struct {
 	Uri           string
 	BlacklistedIP string
+	Registerer    prometheus.Registerer
 }
 
 func New(opts RunnerOpts) Runner {
 	var r Runner
 	logger := zerolog.Nop()
+	var registerer prometheus.Registerer = prometheus.NewRegistry() // NOOP registry.
+	if opts.Registerer != nil {
+		registerer = opts.Registerer
+	}
 
 	if strings.HasPrefix(opts.Uri, "http") {
 		r = &HttpRunner{
 			url:       opts.Uri,
 			logger:    &logger,
 			graceTime: defaultGraceTime,
 			backoff:   defaultBackoff,
+			metrics:   NewHTTPMetrics(registerer),
 		}
 	} else {
 		r = Local{
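
From the caller's side the change is opt-in: passing a Registerer exposes the new series, while leaving it nil preserves the old behaviour, with metrics recorded on a throwaway registry that nothing scrapes. A hypothetical usage sketch follows; the module import path and runner URL are assumptions, not taken from this commit:

package main

import (
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/synthetic-monitoring-agent/internal/k6runner"
)

func main() {
	registry := prometheus.NewRegistry()

	// HTTP runner reporting request metrics to the caller's registry.
	withMetrics := k6runner.New(k6runner.RunnerOpts{
		Uri:        "http://k6-runner.example.internal:6565", // placeholder remote runner URL
		Registerer: registry,
	})

	// Registerer omitted: metrics are still recorded, but on an internal registry that is never exposed.
	silent := k6runner.New(k6runner.RunnerOpts{
		Uri: "http://k6-runner.example.internal:6565",
	})

	_, _ = withMetrics, silent
}

Because the Uri starts with "http", both calls build an HttpRunner; any other value falls through to the Local runner, which this commit leaves untouched.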
