From 1378778dad7ad9d198e213a341dbd895424346f3 Mon Sep 17 00:00:00 2001 From: LorcanMcVeigh Date: Mon, 29 Jun 2020 15:47:11 +0100 Subject: [PATCH 1/2] Add prometheus worker process metrics --- cmd/nginx-ingress/main.go | 6 ++ docs-web/logging-and-monitoring/prometheus.md | 1 + internal/metrics/collectors/processes.go | 91 +++++++++++++++++++ internal/nginx/manager.go | 5 +- 4 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 internal/metrics/collectors/processes.go diff --git a/cmd/nginx-ingress/main.go b/cmd/nginx-ingress/main.go index 0731ba8e89..8ddb1f6fd3 100644 --- a/cmd/nginx-ingress/main.go +++ b/cmd/nginx-ingress/main.go @@ -307,6 +307,7 @@ func main() { registry = prometheus.NewRegistry() managerCollector = collectors.NewLocalManagerMetricsCollector(constLabels) controllerCollector = collectors.NewControllerMetricsCollector(*enableCustomResources, constLabels) + processCollector := collectors.NewNginxProcessesMetricsCollector(constLabels) err = managerCollector.Register(registry) if err != nil { @@ -317,6 +318,11 @@ func main() { if err != nil { glog.Errorf("Error registering Controller Prometheus metrics: %v", err) } + + err = processCollector.Register(registry) + if err != nil { + glog.Errorf("Error registering NginxProcess Prometheus metrics: %v", err) + } } useFakeNginxManager := *proxyURL != "" diff --git a/docs-web/logging-and-monitoring/prometheus.md b/docs-web/logging-and-monitoring/prometheus.md index 0b0b1a80c3..2b115c303f 100644 --- a/docs-web/logging-and-monitoring/prometheus.md +++ b/docs-web/logging-and-monitoring/prometheus.md @@ -30,6 +30,7 @@ The Ingress Controller exports the following metrics: * `controller_nginx_reload_errors_total`. Number of unsuccessful NGINX reloads. * `controller_nginx_last_reload_status`. Status of the last NGINX reload, 0 meaning down and 1 up. * `controller_nginx_last_reload_milliseconds`. Duration in milliseconds of the last NGINX reload. + * `controller_nginx_worker_processes_total`. 
Number of NGINX worker processes. This metric includes the constant label `generation` with two possible values `old`(the shutting down processes of the old genrations) or `current`(the processes of the current generation). * `controller_ingress_resources_total`. Number of handled Ingress resources. This metric includes the label type, that groups the Ingress resources by their type (regular, [minion or master](/nginx-ingress-controller/configuration/ingress-resources/cross-namespace-configuration)). **Note**: The metric doesn't count minions without a master. * `controller_virtualserver_resources_total`. Number of handled VirtualServer resources. * `controller_virtualserverroute_resources_total`. Number of handled VirtualServerRoute resources. **Note**: The metric counts only VirtualServerRoutes that have a reference from a VirtualServer. diff --git a/internal/metrics/collectors/processes.go b/internal/metrics/collectors/processes.go new file mode 100644 index 0000000000..313fdbbe7e --- /dev/null +++ b/internal/metrics/collectors/processes.go @@ -0,0 +1,91 @@ +package collectors + +import ( + "bytes" + "fmt" + "io/ioutil" + "strconv" + + "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" +) + +// NginxProcessesMetricsCollector implements the prometheus.Collector interface (Collect and Describe) for NGINX worker process metrics +type NginxProcessesMetricsCollector struct { + // Metrics + workerProcessTotal *prometheus.GaugeVec +} + +// NewNginxProcessesMetricsCollector creates a new NginxProcessesMetricsCollector +func NewNginxProcessesMetricsCollector(constLabels map[string]string) *NginxProcessesMetricsCollector { + pc := &NginxProcessesMetricsCollector{ + workerProcessTotal: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "nginx_worker_processes_total", + Namespace: metricsNamespace, + Help: "Number of NGINX worker processes", + ConstLabels: constLabels, + }, + []string{"generation"}, + ), + } + return pc +} + +// updateWorkerProcessCount sets the 
number of NGINX worker processes +func (pc *NginxProcessesMetricsCollector) updateWorkerProcessCount() { + currWorkerProcesses, prevWorkerProcesses, err := getWorkerProcesses() + if err != nil { + glog.Errorf("unable to collect process metrics : %v", err) + return + } + pc.workerProcessTotal.WithLabelValues("current").Set(float64(currWorkerProcesses)) + pc.workerProcessTotal.WithLabelValues("old").Set(float64(prevWorkerProcesses)) +} + +func getWorkerProcesses() (int, int, error) { + var workerProcesses int + var prevWorkerProcesses int + + procFolders, err := ioutil.ReadDir("/proc") + if err != nil { + return 0, 0, fmt.Errorf("unable to read directory /proc : %v", err) + } + + for _, folder := range procFolders { + _, err := strconv.Atoi(folder.Name()) + if err != nil { + continue + } + + cmdlineFile := fmt.Sprintf("/proc/%v/cmdline", folder.Name()) + content, err := ioutil.ReadFile(cmdlineFile) + if err != nil { + continue // the process exited after ReadDir listed it; skip it instead of failing the whole scrape + } + + text := string(bytes.TrimRight(content, "\x00")) + if text == "nginx: worker process" { + workerProcesses++ + } else if text == "nginx: worker process is shutting down" { + prevWorkerProcesses++ + } + } + return workerProcesses, prevWorkerProcesses, nil +} + +// Collect implements the prometheus.Collector interface Collect method +func (pc *NginxProcessesMetricsCollector) Collect(ch chan<- prometheus.Metric) { + pc.updateWorkerProcessCount() + pc.workerProcessTotal.Collect(ch) +} + +// Describe implements prometheus.Collector interface Describe method +func (pc *NginxProcessesMetricsCollector) Describe(ch chan<- *prometheus.Desc) { + pc.workerProcessTotal.Describe(ch) +} + +// Register registers all the metrics of the collector +func (pc *NginxProcessesMetricsCollector) Register(registry *prometheus.Registry) error { + return registry.Register(pc) +} diff --git a/internal/nginx/manager.go b/internal/nginx/manager.go index 28fa9769dd..90c34e362e 100644 --- 
a/internal/nginx/manager.go +++ b/internal/nginx/manager.go @@ -30,7 +30,6 @@ const appPluginParams = "tmm_count 4 proc_cpuinfo_cpu_mhz 2000000 total_xml_memo const appProtectPluginStartCmd = "/usr/share/ts/bin/bd-socket-plugin" const appProtectAgentStartCmd = "/opt/app_protect/bin/bd_agent" - // ServerConfig holds the config data for an upstream server in NGINX Plus. type ServerConfig struct { MaxFails int @@ -233,7 +232,7 @@ func (lm *LocalManager) CreateDHParam(content string) (string, error) { } // CreateAppProtectResourceFile writes contents of An App Protect resource to a file -func (lm *LocalManager) CreateAppProtectResourceFile(name string, content []byte){ +func (lm *LocalManager) CreateAppProtectResourceFile(name string, content []byte) { glog.V(3).Infof("Writing App Protect Resource to %v", name) err := createFileAndWrite(name, content) if err != nil { @@ -433,7 +432,7 @@ func (lm *LocalManager) AppProtectAgentStart(apaDone chan error, debug bool) { err = createFileAndWrite(appProtectLogConfigFileName, []byte(appProtectDebugLogConfigFileContent)) if err != nil { glog.Fatalf("Failed Writing App Protect Log configuration file") - } + } } glog.V(3).Info("Starting AppProtect Agent") From 723f4334cc67fc6a951435eb3ead57bcef358331 Mon Sep 17 00:00:00 2001 From: Lorcan McVeigh Date: Wed, 29 Jul 2020 11:01:38 +0100 Subject: [PATCH 2/2] Update docs-web Co-authored-by: Luca Comellini --- docs-web/logging-and-monitoring/prometheus.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs-web/logging-and-monitoring/prometheus.md b/docs-web/logging-and-monitoring/prometheus.md index 2b115c303f..3301c67ed9 100644 --- a/docs-web/logging-and-monitoring/prometheus.md +++ b/docs-web/logging-and-monitoring/prometheus.md @@ -30,7 +30,7 @@ The Ingress Controller exports the following metrics: * `controller_nginx_reload_errors_total`. Number of unsuccessful NGINX reloads. * `controller_nginx_last_reload_status`. 
Status of the last NGINX reload, 0 meaning down and 1 up. * `controller_nginx_last_reload_milliseconds`. Duration in milliseconds of the last NGINX reload. - * `controller_nginx_worker_processes_total`. Number of NGINX worker processes. This metric includes the constant label `generation` with two possible values `old`(the shutting down processes of the old genrations) or `current`(the processes of the current generation). + * `controller_nginx_worker_processes_total`. Number of NGINX worker processes. This metric includes the label `generation` with two possible values `old` (the shutting down processes of the old generations) or `current` (the processes of the current generation). * `controller_ingress_resources_total`. Number of handled Ingress resources. This metric includes the label type, that groups the Ingress resources by their type (regular, [minion or master](/nginx-ingress-controller/configuration/ingress-resources/cross-namespace-configuration)). **Note**: The metric doesn't count minions without a master. * `controller_virtualserver_resources_total`. Number of handled VirtualServer resources. * `controller_virtualserverroute_resources_total`. Number of handled VirtualServerRoute resources. **Note**: The metric counts only VirtualServerRoutes that have a reference from a VirtualServer.