Skip to content

Commit bd1fa50

Browse files
Expose prometheus metrics for the router by default
Add new settings to the router that allow metrics to be captured via /metrics on a configurable listen address. Metrics and pprof data are only available when stats username and password are set, and require auth. The metrics implementation uses a variant of prometheus/haproxy_exporter and attempts to minimize the number of metrics returned for the router. It also rate limits the frequency at which the haproxy stats endpoint is scraped relative to the number of servers defined (a rough rule of thumb from this data is that for every 1000 servers the minimum interval increases by 5 seconds). The metrics returned are not transformed to be service based, but could be. This currently generates ~1.8M per 1k routes. Shorter server identifiers would have an impact. This generates a ~2% CPU cost on the router endpoint when run every 2s, which should scale reasonably to infrastructure.
1 parent 31e9bd0 commit bd1fa50

File tree

5 files changed

+691
-0
lines changed

5 files changed

+691
-0
lines changed

Diff for: pkg/cmd/admin/router/router.go

+10
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,16 @@ func RunCmdRouter(f *clientcmd.Factory, cmd *cobra.Command, out, errout io.Write
678678
}
679679
env["ROUTER_CANONICAL_HOSTNAME"] = cfg.RouterCanonicalHostname
680680
}
681+
// automatically start the internal metrics agent if the type has metrics
682+
if cfg.Type == "haproxy-router" {
683+
env["ROUTER_LISTEN_ADDR"] = fmt.Sprintf("0.0.0.0:%d", defaultStatsPort-1)
684+
env["ROUTER_METRICS_TYPE"] = "haproxy"
685+
ports = append(ports, kapi.ContainerPort{
686+
Name: "router-stats",
687+
ContainerPort: int32(defaultStatsPort - 1),
688+
Protocol: kapi.ProtocolTCP,
689+
})
690+
}
681691
env.Add(secretEnv)
682692
if len(defaultCert) > 0 {
683693
if cfg.SecretsAsEnv {

Diff for: pkg/cmd/infra/router/router.go

+3
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ type RouterSelection struct {
5656
DisableNamespaceOwnershipCheck bool
5757

5858
EnableIngress bool
59+
60+
ListenAddr string
5961
}
6062

6163
// Bind sets the appropriate labels
@@ -73,6 +75,7 @@ func (o *RouterSelection) Bind(flag *pflag.FlagSet) {
7375
flag.BoolVar(&o.AllowWildcardRoutes, "allow-wildcard-routes", cmdutil.Env("ROUTER_ALLOW_WILDCARD_ROUTES", "") == "true", "Allow wildcard host names for routes")
7476
flag.BoolVar(&o.DisableNamespaceOwnershipCheck, "disable-namespace-ownership-check", cmdutil.Env("ROUTER_DISABLE_NAMESPACE_OWNERSHIP_CHECK", "") == "true", "Disables the namespace ownership checks for a route host with different paths or for overlapping host names in the case of wildcard routes. Please be aware that if namespace ownership checks are disabled, routes in a different namespace can use this mechanism to 'steal' sub-paths for existing domains. This is only safe if route creation privileges are restricted, or if all the users can be trusted.")
7577
flag.BoolVar(&o.EnableIngress, "enable-ingress", cmdutil.Env("ROUTER_ENABLE_INGRESS", "") == "true", "Enable configuration via ingress resources")
78+
flag.StringVar(&o.ListenAddr, "listen-addr", cmdutil.Env("ROUTER_LISTEN_ADDR", ""), "The name of an interface to listen on to expose metrics and health checking. If not specified, will not listen.")
7679
}
7780

7881
// RouteSelectionFunc returns a func that identifies the host for a route.

Diff for: pkg/cmd/infra/router/template.go

+48
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"os"
77
"strconv"
8+
"strings"
89
"time"
910

1011
"github.com/golang/glog"
@@ -21,6 +22,8 @@ import (
2122
"github.com/openshift/origin/pkg/cmd/util/clientcmd"
2223
"github.com/openshift/origin/pkg/router"
2324
"github.com/openshift/origin/pkg/router/controller"
25+
"github.com/openshift/origin/pkg/router/metrics"
26+
"github.com/openshift/origin/pkg/router/metrics/haproxy"
2427
templateplugin "github.com/openshift/origin/pkg/router/template"
2528
"github.com/openshift/origin/pkg/util/proc"
2629
)
@@ -66,6 +69,7 @@ type TemplateRouter struct {
6669
RouterService *ktypes.NamespacedName
6770
BindPortsAfterSync bool
6871
MaxConnections string
72+
MetricsType string
6973
}
7074

7175
// reloadInterval returns how often to run the router reloads. The interval
@@ -93,6 +97,7 @@ func (o *TemplateRouter) Bind(flag *pflag.FlagSet) {
9397
flag.BoolVar(&o.ExtendedValidation, "extended-validation", util.Env("EXTENDED_VALIDATION", "true") == "true", "If set, then an additional extended validation step is performed on all routes admitted in by this router. Defaults to true and enables the extended validation checks.")
9498
flag.BoolVar(&o.BindPortsAfterSync, "bind-ports-after-sync", util.Env("ROUTER_BIND_PORTS_AFTER_SYNC", "") == "true", "Bind ports only after route state has been synchronized")
9599
flag.StringVar(&o.MaxConnections, "max-connections", util.Env("ROUTER_MAX_CONNECTIONS", ""), "Specifies the maximum number of concurrent connections.")
100+
flag.StringVar(&o.MetricsType, "metrics-type", util.Env("ROUTER_METRICS_TYPE", ""), "Specifies the type of metrics to gather. Supports 'haproxy'.")
96101
}
97102

98103
type RouterStats struct {
@@ -178,6 +183,9 @@ func (o *TemplateRouterOptions) Complete() error {
178183
}
179184

180185
func (o *TemplateRouterOptions) Validate() error {
186+
if len(o.MetricsType) > 0 && o.MetricsType != "haproxy" {
187+
return errors.New("supported metrics types are: 'haproxy'")
188+
}
181189
if len(o.RouterName) == 0 {
182190
return errors.New("router must have a name to identify itself in route status")
183191
}
@@ -193,6 +201,46 @@ func (o *TemplateRouterOptions) Validate() error {
193201

194202
// Run launches a template router using the provided options. It never exits.
195203
func (o *TemplateRouterOptions) Run() error {
204+
switch {
205+
case o.MetricsType == "haproxy" && len(o.ListenAddr) > 0:
206+
var timeout time.Duration
207+
if t := util.Env("ROUTER_METRICS_HAPROXY_TIMEOUT", ""); len(t) > 0 {
208+
d, err := time.ParseDuration(t)
209+
if err != nil {
210+
return fmt.Errorf("ROUTER_METRICS_HAPROXY_TIMEOUT is not a valid duration: %v", err)
211+
}
212+
timeout = d
213+
}
214+
var baseScrapeInterval time.Duration
215+
if t := util.Env("ROUTER_METRICS_HAPROXY_BASE_SCRAPE_INTERVAL", ""); len(t) > 0 {
216+
d, err := time.ParseDuration(t)
217+
if err != nil {
218+
return fmt.Errorf("ROUTER_METRICS_HAPROXY_BASE_SCRAPE_INTERVAL is not a valid duration: %v", err)
219+
}
220+
baseScrapeInterval = d
221+
}
222+
var exported []int
223+
if t := util.Env("ROUTER_METRICS_HAPROXY_EXPORTED", ""); len(t) > 0 {
224+
for _, s := range strings.Split(t, ",") {
225+
i, err := strconv.Atoi(s)
226+
if err != nil {
227+
return errors.New("ROUTER_METRICS_HAPROXY_EXPORTED must be a comma delimited list of non-negative integers")
228+
}
229+
exported = append(exported, i)
230+
}
231+
}
232+
haproxy.NewPrometheusCollector(haproxy.PrometheusOptions{
233+
ScrapeURI: util.Env("ROUTER_METRICS_HAPROXY_SCRAPE_URI", ""),
234+
PidFile: util.Env("ROUTER_METRICS_HAPROXY_PID_FILE", ""),
235+
ServerThreshold: serverThreshold,
236+
BaseScrapeInterval: baseScrapeInterval,
237+
ExportedMetrics: exported,
238+
})
239+
}
240+
if len(o.ListenAddr) > 0 {
241+
metrics.Listen(o.ListenAddr, o.StatsUsername, o.StatsPassword)
242+
}
243+
196244
pluginCfg := templateplugin.TemplatePluginConfig{
197245
WorkingDir: o.WorkingDir,
198246
TemplatePath: o.TemplateFile,

0 commit comments

Comments
 (0)