diff --git a/doc/admin/observability/alerts.md b/doc/admin/observability/alerts.md index 7ec8c026fe1c..9b20d5f96309 100644 --- a/doc/admin/observability/alerts.md +++ b/doc/admin/observability/alerts.md @@ -5931,7 +5931,7 @@ with your code hosts connections or networking issues affecting communication wi
Technical details -Custom alert query: `last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))[5h:]) / (last_over_time(sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))[5h:]) + last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))[5h:])) * 100` +Custom alert query: `last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}[5m]))[5h:]) / (last_over_time(sum(increase(src_executor_processor_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}[5m]))[5h:]) + last_over_time(sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}[5m]))[5h:])) * 100`
diff --git a/doc/admin/observability/dashboards.md b/doc/admin/observability/dashboards.md index 059cb73ef4a5..de457329debe 100644 --- a/doc/admin/observability/dashboards.md +++ b/doc/admin/observability/dashboards.md @@ -17470,7 +17470,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100100`
Technical details -Query: `sum(src_executor_processor_handlers{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"})` +Query: `sum(src_executor_processor_handlers{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"})`
@@ -17489,7 +17489,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100110`
Technical details -Query: `sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_executor_processor_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17508,7 +17508,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100111`
Technical details -Query: `sum by (le)(rate(src_executor_processor_duration_seconds_bucket{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (le)(rate(src_executor_processor_duration_seconds_bucket{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17527,7 +17527,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100112`
Technical details -Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17546,7 +17546,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100113`
Technical details -Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_executor_processor_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",job=~"^sourcegraph-executors.*"}[5m]))) * 100` +Query: `sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_executor_processor_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_executor_processor_errors_total{queue=~"${queue:regex}",sg_job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17569,7 +17569,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100200`
Technical details -Query: `sum(increase(src_executor_run_lock_wait_total{sg_jobs=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_executor_run_lock_wait_total{sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17590,7 +17590,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100201`
Technical details -Query: `sum(increase(src_executor_run_lock_held_total{sg_jobs=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_executor_run_lock_held_total{sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17611,7 +17611,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100300`
Technical details -Query: `sum(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_apiworker_apiclient_total{sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17630,7 +17630,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100301`
Technical details -Query: `sum by (le)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (le)(rate(src_apiworker_apiclient_duration_seconds_bucket{sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17649,7 +17649,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100302`
Technical details -Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_apiworker_apiclient_errors_total{sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17668,7 +17668,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100303`
Technical details -Query: `sum(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))) * 100` +Query: `sum(increase(src_apiworker_apiclient_errors_total{sg_job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_apiclient_total{sg_job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_apiclient_errors_total{sg_job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17687,7 +17687,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100310`
Technical details -Query: `sum by (op)(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_apiclient_total{sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17706,7 +17706,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100311`
Technical details -Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_apiclient_duration_seconds_bucket{job=~"^sourcegraph-executors.*"}[5m])))` +Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_apiclient_duration_seconds_bucket{sg_job=~"^sourcegraph-executors.*"}[5m])))`
@@ -17725,7 +17725,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100312`
Technical details -Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17744,7 +17744,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100313`
Technical details -Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_apiclient_total{job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_apiclient_errors_total{job=~"^sourcegraph-executors.*"}[5m]))) * 100` +Query: `sum by (op)(increase(src_apiworker_apiclient_errors_total{sg_job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_apiclient_total{sg_job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_apiclient_errors_total{sg_job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17765,7 +17765,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100400`
Technical details -Query: `sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_apiworker_command_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17784,7 +17784,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100401`
Technical details -Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17803,7 +17803,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100402`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17822,7 +17822,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100403`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17841,7 +17841,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100410`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17860,7 +17860,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100411`
Technical details -Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])))` +Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m])))`
@@ -17879,7 +17879,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100412`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17898,7 +17898,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100413`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"setup.*",sg_job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17919,7 +17919,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100500`
Technical details -Query: `sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_apiworker_command_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17938,7 +17938,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100501`
Technical details -Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17957,7 +17957,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100502`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -17976,7 +17976,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100503`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -17995,7 +17995,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100510`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18014,7 +18014,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100511`
Technical details -Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])))` +Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m])))`
@@ -18033,7 +18033,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100512`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18052,7 +18052,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100513`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"exec.*",sg_job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -18073,7 +18073,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100600`
Technical details -Query: `sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_apiworker_command_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18092,7 +18092,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100601`
Technical details -Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (le)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18111,7 +18111,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100602`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18130,7 +18130,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100603`
Technical details -Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100` +Query: `sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m])) / (sum(increase(src_apiworker_command_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m])) + sum(increase(src_apiworker_command_errors_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m]))) * 100`
@@ -18149,7 +18149,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100610`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18168,7 +18168,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100611`
Technical details -Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])))` +Query: `histogram_quantile(0.99, sum by (le,op)(rate(src_apiworker_command_duration_seconds_bucket{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m])))`
@@ -18187,7 +18187,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100612`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m]))`
@@ -18206,7 +18206,7 @@ To see this panel, visit `/-/debug/grafana/d/executor/executor?viewPanel=100613`
Technical details -Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",job=~"^sourcegraph-executors.*"}[5m]))) * 100` +Query: `sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m])) / (sum by (op)(increase(src_apiworker_command_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m])) + sum by (op)(increase(src_apiworker_command_errors_total{op=~"teardown.*",sg_job=~"^sourcegraph-executors.*"}[5m]))) * 100`
diff --git a/enterprise/cmd/executor/internal/metrics/metrics.go b/enterprise/cmd/executor/internal/metrics/metrics.go index 7d78f3092f8f..a6240f86890d 100644 --- a/enterprise/cmd/executor/internal/metrics/metrics.go +++ b/enterprise/cmd/executor/internal/metrics/metrics.go @@ -43,20 +43,23 @@ func MakeExecutorMetricsGatherer( // nodeExporterEndpoint is the URL of the local node_exporter endpoint, without // the /metrics path. Disabled, when an empty string. nodeExporterEndpoint string, - // dockerRegsitryEndpoint is the URL of the intermediary caching docker registry, + // dockerRegistryEndpoint is the URL of the intermediary caching docker registry, // for scraping and forwarding metrics. Disabled, when an empty string. - dockerRegistryNodeExporterEndpoint string, + dockerRegistryEndpoint string, ) prometheus.GathererFunc { nodeMetrics := newMetricsSyncPoint() registryMetrics := newMetricsSyncPoint() + registryNodeMetrics := newMetricsSyncPoint() - go backgroundCollectNodeExporterMetrics(nodeExporterEndpoint, nodeMetrics) - go backgroundCollectNodeExporterMetrics(dockerRegistryNodeExporterEndpoint, registryMetrics) + go backgroundCollectMetrics(nodeExporterEndpoint+"/metrics", nodeMetrics) + go backgroundCollectMetrics(dockerRegistryEndpoint+"/proxy?module=registry", registryMetrics) + go backgroundCollectMetrics(dockerRegistryEndpoint+"/proxy?module=node", registryNodeMetrics) return func() (mfs []*dto.MetricFamily, err error) { // notify to start a scrape nodeMetrics.notify.Signal() registryMetrics.notify.Signal() + registryNodeMetrics.notify.Signal() mfs, err = gatherer.Gather() if err != nil { @@ -64,12 +67,12 @@ func MakeExecutorMetricsGatherer( } if nodeExporterEndpoint != "" { - result := <-registryMetrics.result + result := <-nodeMetrics.result if result.err != nil { logger.Warn("failed to get metrics for node exporter", log.Error(result.err)) } for key, mf := range result.metrics { - if strings.HasPrefix(key, "go_") || strings.HasPrefix(key, "promhttp_") || strings.HasPrefix(key, "process_") { + if filterMetric(key) { continue } @@ -77,24 +80,46 @@ func MakeExecutorMetricsGatherer( } } - if dockerRegistryNodeExporterEndpoint != "" { - result := <-registryMetrics.result - if result.err != nil { - logger.Warn("failed to get metrics for docker registry", log.Error(result.err)) + if dockerRegistryEndpoint != "" { + { + result := <-registryMetrics.result + if result.err != nil { + logger.Warn("failed to get metrics for docker registry", log.Error(result.err)) + } + for key, mf := range result.metrics { + if filterMetric(key) { + continue + } + + // should only be 1 registry, so we give it a set instance value + metricLabelInstance := "sg_instance" + instanceName := "docker-registry" + for _, m := range mf.Metric { + m.Label = append(m.Label, &dto.LabelPair{Name: &metricLabelInstance, Value: &instanceName}) + } + + mfs = append(mfs, mf) + } } - for key, mf := range result.metrics { - if strings.HasPrefix(key, "go_") || strings.HasPrefix(key, "promhttp_") || strings.HasPrefix(key, "process_") { - continue + { + result := <-registryNodeMetrics.result + if result.err != nil { + logger.Warn("failed to get metrics for docker registry", log.Error(result.err)) } - - // should only be 1 registry, so we give it a set instance value - metricLabelInstance := "sg_instance" - instanceName := "docker-regsitry" - for _, m := range mf.Metric { - m.Label = append(m.Label, &dto.LabelPair{Name: &metricLabelInstance, Value: &instanceName}) + for key, mf := range result.metrics { + if filterMetric(key) { + continue + } + + // should only be 1 registry, so we give it a set instance value + metricLabelInstance := "sg_instance" + instanceName := "docker-registry" + for _, m := range mf.Metric { + m.Label = append(m.Label, &dto.LabelPair{Name: &metricLabelInstance, Value: &instanceName}) + } + + mfs = append(mfs, mf) } - - mfs = append(mfs, mf) } } @@ -102,10 +127,14 @@ func MakeExecutorMetricsGatherer( } } +func filterMetric(key string) bool { + return strings.HasPrefix(key, "go_") || strings.HasPrefix(key, "promhttp_") || strings.HasPrefix(key, "process_") +} + // On notify, scrapes the specified endpoint for prometheus metrics and sends them down the // associated channel. If the endpoint is "", then the channel is closed so that subsequent // reads return an empty value instead of blocking indefinitely. -func backgroundCollectNodeExporterMetrics(endpoint string, syncPoint metricsSyncPoint) { +func backgroundCollectMetrics(endpoint string, syncPoint metricsSyncPoint) { if endpoint == "" { close(syncPoint.result) return @@ -114,7 +143,7 @@ func backgroundCollectNodeExporterMetrics(endpoint string, syncPoint metricsSync collect := func() (map[string]*dto.MetricFamily, error) { resp, err := (&http.Client{ Timeout: 2 * time.Second, - }).Get(endpoint + "/metrics") + }).Get(endpoint) if err != nil { return nil, err } @@ -127,7 +156,7 @@ func backgroundCollectNodeExporterMetrics(endpoint string, syncPoint metricsSync var parser expfmt.TextParser mfMap, err := parser.TextToMetricFamilies(bytes.NewReader(b)) - return mfMap, errors.Wrapf(err, "parsing node_exporter metrics, response: %s", string(b)) + return mfMap, errors.Wrapf(err, "parsing metrics, response: %s", string(b)) } for { diff --git a/monitoring/definitions/shared/codeintel.go b/monitoring/definitions/shared/codeintel.go index efddf0f6e65d..c95d4871972f 100644 --- a/monitoring/definitions/shared/codeintel.go +++ b/monitoring/definitions/shared/codeintel.go @@ -287,6 +287,7 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori constructorOptions := ObservableConstructorOptions{ MetricNameRoot: "executor", + JobLabel: "sg_job", MetricDescriptionRoot: "handler", Filters: filters, } @@ -323,7 +324,7 @@ func (codeIntelligence) NewExecutorProcessorGroup(containerName string) monitori // src_executor_run_lock_held_total func (codeIntelligence) NewExecutorExecutionRunLockContentionGroup(containerName string) monitoring.Group { constructor := func(metricNameRoot, legend string) Observable { - filters := makeFilters("sg_jobs", containerName) + filters := makeFilters("sg_job", containerName) return Observable{ Name: metricNameRoot + "_total", Description: fmt.Sprintf("milliseconds %s every 5m", legend), @@ -361,6 +362,7 @@ func (codeIntelligence) NewExecutorSetupCommandGroup(containerName string) monit ObservableConstructorOptions: ObservableConstructorOptions{ MetricNameRoot: "apiworker_command", + JobLabel: "sg_job", MetricDescriptionRoot: "command", Filters: []string{`op=~"setup.*"`}, By: []string{"op"}, @@ -394,6 +396,7 @@ func (codeIntelligence) NewExecutorExecutionCommandGroup(containerName string) m ObservableConstructorOptions: ObservableConstructorOptions{ MetricNameRoot: "apiworker_command", + JobLabel: "sg_job", MetricDescriptionRoot: "command", Filters: []string{`op=~"exec.*"`}, By: []string{"op"}, @@ -427,6 +430,7 @@ func (codeIntelligence) NewExecutorTeardownCommandGroup(containerName string) mo ObservableConstructorOptions: ObservableConstructorOptions{ MetricNameRoot: "apiworker_command", + JobLabel: "sg_job", MetricDescriptionRoot: "command", Filters: []string{`op=~"teardown.*"`}, By: []string{"op"}, @@ -460,6 +464,7 @@ func (codeIntelligence) NewExecutorAPIClientGroup(containerName string) monitori ObservableConstructorOptions: ObservableConstructorOptions{ MetricNameRoot: "apiworker_apiclient", + JobLabel: "sg_job", MetricDescriptionRoot: "client", Filters: nil, By: []string{"op"},