9
9
"net/http"
10
10
"time"
11
11
12
+ "github.com/prometheus/client_golang/prometheus"
12
13
"github.com/rs/zerolog"
13
14
"golang.org/x/exp/rand"
14
15
)
@@ -21,6 +22,8 @@ type HttpRunner struct {
21
22
backoff time.Duration
22
23
// graceTime tells the HttpRunner how much time to add to the script timeout to form the request timeout.
23
24
graceTime time.Duration
25
+ // metrics stores metrics for the remote k6 runner.
26
+ metrics * HTTPMetrics
24
27
}
25
28
26
29
const (
@@ -81,6 +84,8 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error
81
84
Msg ("time until next execution is too close to script timeout, there might not be room for retries" )
82
85
}
83
86
87
+ // Retry logic is purely context (time) based, but we keep track of the number of attempts for reporting telemetry.
88
+ attempts := 1.0
84
89
wait := r .backoff
85
90
var response * RunResponse
86
91
for {
@@ -90,15 +95,21 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error
90
95
response , err = r .request (ctx , script )
91
96
if err == nil {
92
97
r .logger .Debug ().Bytes ("metrics" , response .Metrics ).Bytes ("logs" , response .Logs ).Msg ("script result" )
98
+ r .metrics .Requests .With (map [string ]string {metricLabelSuccess : "1" , metricLabelRetriable : "" }).Inc ()
99
+ r .metrics .RequestsPerRun .WithLabelValues ("1" ).Observe (attempts )
93
100
return response , nil
94
101
}
95
102
96
103
if ! errors .Is (err , errRetryable ) {
97
104
// TODO: Log the returned error in the Processor instead.
98
105
r .logger .Error ().Err (err ).Msg ("non-retryable error running k6" )
106
+ r .metrics .Requests .With (map [string ]string {metricLabelSuccess : "0" , metricLabelRetriable : "0" }).Inc ()
107
+ r .metrics .RequestsPerRun .WithLabelValues ("0" ).Observe (attempts )
99
108
return nil , err
100
109
}
101
110
111
+ r .metrics .Requests .With (map [string ]string {metricLabelSuccess : "0" , metricLabelRetriable : "1" }).Inc ()
112
+
102
113
// Wait, but subtract the amount of time we've already waited as part of the request timeout.
103
114
// We do this because these requests have huge timeouts, and by the nature of the system running these requests,
104
115
// we expect the most common error to be a timeout, so we avoid waiting even more on top of an already large
@@ -112,6 +123,7 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error
112
123
waitTimer .Stop ()
113
124
// TODO: Log the returned error in the Processor instead.
114
125
r .logger .Error ().Err (err ).Msg ("retries exhausted" )
126
+ r .metrics .RequestsPerRun .WithLabelValues ("0" ).Observe (attempts )
115
127
return nil , fmt .Errorf ("cannot retry further: %w" , errors .Join (err , ctx .Err ()))
116
128
case <- waitTimer .C :
117
129
}
@@ -206,3 +218,47 @@ func (r HttpRunner) request(ctx context.Context, script Script) (*RunResponse, e
206
218
207
219
return & response , nil
208
220
}
221
+
222
// HTTPMetrics holds the Prometheus collectors tracking requests made by the
// HTTP-based remote k6 runner.
type HTTPMetrics struct {
	// Requests counts every individual request made to remote k6 runners
	// (possibly several per run), labeled by per-request success and, for
	// failures, whether the error was retriable.
	Requests *prometheus.CounterVec
	// RequestsPerRun observes how many request attempts each run operation
	// needed, labeled by whether the run ultimately succeeded.
	RequestsPerRun *prometheus.HistogramVec
}
226
+
227
// Label names shared by the HTTPMetrics collectors.
const (
	// metricLabelSuccess is "1" when the request (or the overall run) succeeded
	// and "0" otherwise.
	metricLabelSuccess = "success"
	// metricLabelRetriable is "1" when a failed request may be retried and "0"
	// when it may not; successful requests set it to the empty string.
	metricLabelRetriable = "retriable"
)
231
+
232
+ func NewHTTPMetrics (registerer prometheus.Registerer ) * HTTPMetrics {
233
+ m := & HTTPMetrics {}
234
+ m .Requests = prometheus .NewCounterVec (
235
+ prometheus.CounterOpts {
236
+ Namespace : "sm_agent" ,
237
+ Subsystem : "k6runner" ,
238
+ Name : "requests_total" ,
239
+ Help : "Total number of requests made to remote k6 runners, which may be more than one per run. " +
240
+ "The 'success' label is 1 if this single request succeeded, 0 otherwise. " +
241
+ "The 'retriable' label is 1 if the request failed with a retriable error, 0 otherwise. " +
242
+ "Successful requests do not have the 'retriable' label." ,
243
+ },
244
+ []string {metricLabelSuccess , metricLabelRetriable },
245
+ )
246
+ registerer .MustRegister (m .Requests )
247
+
248
+ m .RequestsPerRun = prometheus .NewHistogramVec (
249
+ prometheus.HistogramOpts {
250
+ Namespace : "sm_agent" ,
251
+ Subsystem : "k6runner" ,
252
+ Name : "requests_per_run" ,
253
+ Help : "Number of requests attempted per run operation. " +
254
+ "The 'success' label is 1 if one ultimately succeeded potentially including retries, 0 otherwise." ,
255
+ // Generally we expect request to be retries a handful of times, so we create high resolution buckets up to
256
+ // 5. Form 5 onwards something off is going on and we do not care that much about resolution.
257
+ Buckets : []float64 {1 , 2 , 3 , 4 , 5 , 10 , 20 , 50 },
258
+ },
259
+ []string {metricLabelSuccess },
260
+ )
261
+ registerer .MustRegister (m .RequestsPerRun )
262
+
263
+ return m
264
+ }
0 commit comments