Skip to content

Commit 06e71f5

Browse files
tjungbludgrisonnet
authored andcommitted
add NLB usage metrics for network edge (openshift#1509)
1 parent 460a098 commit 06e71f5

File tree

7 files changed

+22
-2
lines changed

7 files changed

+22
-2
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## 4.10
44

5+
- [#1509](https://github.com/openshift/cluster-monitoring-operator/pull/1509) add NLB usage metrics for network edge
56
- [#1299](https://github.com/openshift/cluster-monitoring-operator/pull/1299) Expose expose /api/v1/labels endpoint for Thanos query.
67
- [#1402](https://github.com/openshift/cluster-monitoring-operator/pull/1402) Drop pod-centric cAdvisor metrics that are available at slice level.
78
- [#1399](https://github.com/openshift/cluster-monitoring-operator/pull/1399) Rename ThanosSidecarUnhealthy to ThanosSidecarNoConnectionToStartedPrometheus and make it resilient to WAL replays.

Documentation/data-collection.md

+6
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,12 @@ data:
390390
# at least max of one instance with k8s.v1.cni.cncf.io/networks annotation, labelled by networks (any or sriov).
391391
- '{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}'
392392
#
393+
# owners: (@openshift/network-edge)
394+
#
395+
# cluster:ingress_controller_aws_nlb_active:sum informs how many NLBs are active in AWS.
396+
# Zero would indicate ELB (legacy). This metric is only emitted on AWS.
397+
- '{__name__="cluster:ingress_controller_aws_nlb_active:sum"}'
398+
#
393399
# owners: (https://github.com/openshift/insights-operator/blob/master/OWNERS)
394400
#
395401
# insightsclient_request_send tracks the number of metrics sends.

Documentation/sample-metrics.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ return the full set of metrics that the Telemeter client captures:
1313

1414
[embedmd]:# (telemetry/telemeter_query txt)
1515
```txt
16-
{__name__=~"cluster:usage:.*|count:up0|count:up1|cluster_version|cluster_version_available_updates|cluster_operator_up|cluster_operator_conditions|cluster_version_payload|cluster_installer|cluster_infrastructure_provider|cluster_feature_set|instance:etcd_object_counts:sum|ALERTS|code:apiserver_request_total:rate:sum|cluster:capacity_cpu_cores:sum|cluster:capacity_memory_bytes:sum|cluster:cpu_usage_cores:sum|cluster:memory_usage_bytes:sum|openshift:cpu_usage_cores:sum|openshift:memory_usage_bytes:sum|workload:cpu_usage_cores:sum|workload:memory_usage_bytes:sum|cluster:virt_platform_nodes:sum|cluster:node_instance_type_count:sum|cnv:vmi_status_running:count|cluster:vmi_request_cpu_cores:sum|node_role_os_version_machine:cpu_capacity_cores:sum|node_role_os_version_machine:cpu_capacity_sockets:sum|subscription_sync_total|olm_resolution_duration_seconds|csv_succeeded|csv_abnormal|cluster:kube_persistentvolumeclaim_resource_requests_storage_bytes:provisioner:sum|cluster:kubelet_volume_stats_used_bytes:provisioner:sum|ceph_cluster_total_bytes|ceph_cluster_total_used_raw_bytes|ceph_health_status|job:ceph_osd_metadata:count|job:kube_pv:count|job:ceph_pools_iops:total|job:ceph_pools_iops_bytes:total|job:ceph_versions_running:count|job:noobaa_total_unhealthy_buckets:sum|job:noobaa_bucket_count:sum|job:noobaa_total_object_count:sum|noobaa_accounts_num|noobaa_total_usage|console_url|cluster:network_attachment_definition_instances:max|cluster:network_attachment_definition_enabled_instance_up:max|insightsclient_request_send_total|cam_app_workload_migrations|cluster:apiserver_current_inflight_requests:sum:max_over_time:2m|cluster:alertmanager_integrations:max|cluster:telemetry_selected_series:count|openshift:prometheus_tsdb_head_series:sum|openshift:prometheus_tsdb_head_samples_appended_total:sum|monitoring:container_memory_working_set_bytes:sum|namespace_job:scrape_series_added:topk3_sum1h|namespace_job:scrape_samples_post_metric_relabeling:topk3|monitoring:haproxy_server_http_responses_total:sum|rhmi_status|cluster_legacy_scheduler_policy|cluster_master_schedulable|che_workspace_status|che_workspace_started_total|che_workspace_failure_total|che_workspace_start_time_seconds_sum|che_workspace_start_time_seconds_count|cco_credentials_mode|cluster:kube_persistentvolume_plugin_type_counts:sum|visual_web_terminal_sessions_total|acm_managed_cluster_info|cluster:vsphere_vcenter_info:sum|cluster:vsphere_esxi_version_total:sum|cluster:vsphere_node_hw_version_total:sum|openshift:build_by_strategy:sum|rhods_aggregate_availability|rhods_total_users|instance:etcd_disk_wal_fsync_duration_seconds:histogram_quantile|instance:etcd_mvcc_db_total_size_in_bytes:sum|instance:etcd_network_peer_round_trip_time_seconds:histogram_quantile|instance:etcd_mvcc_db_total_size_in_use_in_bytes:sum|instance:etcd_disk_backend_commit_duration_seconds:histogram_quantile|jaeger_operator_instances_storage_types|jaeger_operator_instances_strategies|jaeger_operator_instances_agent_strategies|appsvcs:cores_by_product:sum|nto_custom_profiles:count",alertstate=~"firing|",quantile=~"0.99|0.99|0.99|"}
16+
{__name__=~"cluster:usage:.*|count:up0|count:up1|cluster_version|cluster_version_available_updates|cluster_operator_up|cluster_operator_conditions|cluster_version_payload|cluster_installer|cluster_infrastructure_provider|cluster_feature_set|instance:etcd_object_counts:sum|ALERTS|code:apiserver_request_total:rate:sum|cluster:capacity_cpu_cores:sum|cluster:capacity_memory_bytes:sum|cluster:cpu_usage_cores:sum|cluster:memory_usage_bytes:sum|openshift:cpu_usage_cores:sum|openshift:memory_usage_bytes:sum|workload:cpu_usage_cores:sum|workload:memory_usage_bytes:sum|cluster:virt_platform_nodes:sum|cluster:node_instance_type_count:sum|cnv:vmi_status_running:count|cluster:vmi_request_cpu_cores:sum|node_role_os_version_machine:cpu_capacity_cores:sum|node_role_os_version_machine:cpu_capacity_sockets:sum|subscription_sync_total|olm_resolution_duration_seconds|csv_succeeded|csv_abnormal|cluster:kube_persistentvolumeclaim_resource_requests_storage_bytes:provisioner:sum|cluster:kubelet_volume_stats_used_bytes:provisioner:sum|ceph_cluster_total_bytes|ceph_cluster_total_used_raw_bytes|ceph_health_status|job:ceph_osd_metadata:count|job:kube_pv:count|job:ceph_pools_iops:total|job:ceph_pools_iops_bytes:total|job:ceph_versions_running:count|job:noobaa_total_unhealthy_buckets:sum|job:noobaa_bucket_count:sum|job:noobaa_total_object_count:sum|noobaa_accounts_num|noobaa_total_usage|console_url|cluster:network_attachment_definition_instances:max|cluster:network_attachment_definition_enabled_instance_up:max|cluster:ingress_controller_aws_nlb_active:sum|insightsclient_request_send_total|cam_app_workload_migrations|cluster:apiserver_current_inflight_requests:sum:max_over_time:2m|cluster:alertmanager_integrations:max|cluster:telemetry_selected_series:count|openshift:prometheus_tsdb_head_series:sum|openshift:prometheus_tsdb_head_samples_appended_total:sum|monitoring:container_memory_working_set_bytes:sum|namespace_job:scrape_series_added:topk3_sum1h|namespace_job:scrape_samples_post_metric_relabeling:topk3|monitoring:haproxy_server_http_responses_total:sum|rhmi_status|cluster_legacy_scheduler_policy|cluster_master_schedulable|che_workspace_status|che_workspace_started_total|che_workspace_failure_total|che_workspace_start_time_seconds_sum|che_workspace_start_time_seconds_count|cco_credentials_mode|cluster:kube_persistentvolume_plugin_type_counts:sum|visual_web_terminal_sessions_total|acm_managed_cluster_info|cluster:vsphere_vcenter_info:sum|cluster:vsphere_esxi_version_total:sum|cluster:vsphere_node_hw_version_total:sum|openshift:build_by_strategy:sum|rhods_aggregate_availability|rhods_total_users|instance:etcd_disk_wal_fsync_duration_seconds:histogram_quantile|instance:etcd_mvcc_db_total_size_in_bytes:sum|instance:etcd_network_peer_round_trip_time_seconds:histogram_quantile|instance:etcd_mvcc_db_total_size_in_use_in_bytes:sum|instance:etcd_disk_backend_commit_duration_seconds:histogram_quantile|jaeger_operator_instances_storage_types|jaeger_operator_instances_strategies|jaeger_operator_instances_agent_strategies|appsvcs:cores_by_product:sum|nto_custom_profiles:count",alertstate=~"firing|",quantile=~"0.99|0.99|0.99|"}
1717
```
1818

1919
For reference, here is an example response produced by a running OpenShift cluster:
+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{__name__=~"cluster:usage:.*|count:up0|count:up1|cluster_version|cluster_version_available_updates|cluster_operator_up|cluster_operator_conditions|cluster_version_payload|cluster_installer|cluster_infrastructure_provider|cluster_feature_set|instance:etcd_object_counts:sum|ALERTS|code:apiserver_request_total:rate:sum|cluster:capacity_cpu_cores:sum|cluster:capacity_memory_bytes:sum|cluster:cpu_usage_cores:sum|cluster:memory_usage_bytes:sum|openshift:cpu_usage_cores:sum|openshift:memory_usage_bytes:sum|workload:cpu_usage_cores:sum|workload:memory_usage_bytes:sum|cluster:virt_platform_nodes:sum|cluster:node_instance_type_count:sum|cnv:vmi_status_running:count|cluster:vmi_request_cpu_cores:sum|node_role_os_version_machine:cpu_capacity_cores:sum|node_role_os_version_machine:cpu_capacity_sockets:sum|subscription_sync_total|olm_resolution_duration_seconds|csv_succeeded|csv_abnormal|cluster:kube_persistentvolumeclaim_resource_requests_storage_bytes:provisioner:sum|cluster:kubelet_volume_stats_used_bytes:provisioner:sum|ceph_cluster_total_bytes|ceph_cluster_total_used_raw_bytes|ceph_health_status|job:ceph_osd_metadata:count|job:kube_pv:count|job:ceph_pools_iops:total|job:ceph_pools_iops_bytes:total|job:ceph_versions_running:count|job:noobaa_total_unhealthy_buckets:sum|job:noobaa_bucket_count:sum|job:noobaa_total_object_count:sum|noobaa_accounts_num|noobaa_total_usage|console_url|cluster:network_attachment_definition_instances:max|cluster:network_attachment_definition_enabled_instance_up:max|insightsclient_request_send_total|cam_app_workload_migrations|cluster:apiserver_current_inflight_requests:sum:max_over_time:2m|cluster:alertmanager_integrations:max|cluster:telemetry_selected_series:count|openshift:prometheus_tsdb_head_series:sum|openshift:prometheus_tsdb_head_samples_appended_total:sum|monitoring:container_memory_working_set_bytes:sum|namespace_job:scrape_series_added:topk3_sum1h|namespace_job:scrape_samples_post_metric_relabeling:topk3|monitoring:haproxy_server_http_responses_total:sum|rhmi_status|cluster_legacy_scheduler_policy|cluster_master_schedulable|che_workspace_status|che_workspace_started_total|che_workspace_failure_total|che_workspace_start_time_seconds_sum|che_workspace_start_time_seconds_count|cco_credentials_mode|cluster:kube_persistentvolume_plugin_type_counts:sum|visual_web_terminal_sessions_total|acm_managed_cluster_info|cluster:vsphere_vcenter_info:sum|cluster:vsphere_esxi_version_total:sum|cluster:vsphere_node_hw_version_total:sum|openshift:build_by_strategy:sum|rhods_aggregate_availability|rhods_total_users|instance:etcd_disk_wal_fsync_duration_seconds:histogram_quantile|instance:etcd_mvcc_db_total_size_in_bytes:sum|instance:etcd_network_peer_round_trip_time_seconds:histogram_quantile|instance:etcd_mvcc_db_total_size_in_use_in_bytes:sum|instance:etcd_disk_backend_commit_duration_seconds:histogram_quantile|jaeger_operator_instances_storage_types|jaeger_operator_instances_strategies|jaeger_operator_instances_agent_strategies|appsvcs:cores_by_product:sum|nto_custom_profiles:count",alertstate=~"firing|",quantile=~"0.99|0.99|0.99|"}
1+
{__name__=~"cluster:usage:.*|count:up0|count:up1|cluster_version|cluster_version_available_updates|cluster_operator_up|cluster_operator_conditions|cluster_version_payload|cluster_installer|cluster_infrastructure_provider|cluster_feature_set|instance:etcd_object_counts:sum|ALERTS|code:apiserver_request_total:rate:sum|cluster:capacity_cpu_cores:sum|cluster:capacity_memory_bytes:sum|cluster:cpu_usage_cores:sum|cluster:memory_usage_bytes:sum|openshift:cpu_usage_cores:sum|openshift:memory_usage_bytes:sum|workload:cpu_usage_cores:sum|workload:memory_usage_bytes:sum|cluster:virt_platform_nodes:sum|cluster:node_instance_type_count:sum|cnv:vmi_status_running:count|cluster:vmi_request_cpu_cores:sum|node_role_os_version_machine:cpu_capacity_cores:sum|node_role_os_version_machine:cpu_capacity_sockets:sum|subscription_sync_total|olm_resolution_duration_seconds|csv_succeeded|csv_abnormal|cluster:kube_persistentvolumeclaim_resource_requests_storage_bytes:provisioner:sum|cluster:kubelet_volume_stats_used_bytes:provisioner:sum|ceph_cluster_total_bytes|ceph_cluster_total_used_raw_bytes|ceph_health_status|job:ceph_osd_metadata:count|job:kube_pv:count|job:ceph_pools_iops:total|job:ceph_pools_iops_bytes:total|job:ceph_versions_running:count|job:noobaa_total_unhealthy_buckets:sum|job:noobaa_bucket_count:sum|job:noobaa_total_object_count:sum|noobaa_accounts_num|noobaa_total_usage|console_url|cluster:network_attachment_definition_instances:max|cluster:network_attachment_definition_enabled_instance_up:max|cluster:ingress_controller_aws_nlb_active:sum|insightsclient_request_send_total|cam_app_workload_migrations|cluster:apiserver_current_inflight_requests:sum:max_over_time:2m|cluster:alertmanager_integrations:max|cluster:telemetry_selected_series:count|openshift:prometheus_tsdb_head_series:sum|openshift:prometheus_tsdb_head_samples_appended_total:sum|monitoring:container_memory_working_set_bytes:sum|namespace_job:scrape_series_added:topk3_sum1h|namespace_job:scrape_samples_post_metric_relabeling:topk3|monitoring:haproxy_server_http_responses_total:sum|rhmi_status|cluster_legacy_scheduler_policy|cluster_master_schedulable|che_workspace_status|che_workspace_started_total|che_workspace_failure_total|che_workspace_start_time_seconds_sum|che_workspace_start_time_seconds_count|cco_credentials_mode|cluster:kube_persistentvolume_plugin_type_counts:sum|visual_web_terminal_sessions_total|acm_managed_cluster_info|cluster:vsphere_vcenter_info:sum|cluster:vsphere_esxi_version_total:sum|cluster:vsphere_node_hw_version_total:sum|openshift:build_by_strategy:sum|rhods_aggregate_availability|rhods_total_users|instance:etcd_disk_wal_fsync_duration_seconds:histogram_quantile|instance:etcd_mvcc_db_total_size_in_bytes:sum|instance:etcd_network_peer_round_trip_time_seconds:histogram_quantile|instance:etcd_mvcc_db_total_size_in_use_in_bytes:sum|instance:etcd_disk_backend_commit_duration_seconds:histogram_quantile|jaeger_operator_instances_storage_types|jaeger_operator_instances_strategies|jaeger_operator_instances_agent_strategies|appsvcs:cores_by_product:sum|nto_custom_profiles:count",alertstate=~"firing|",quantile=~"0.99|0.99|0.99|"}

assets/cluster-monitoring-operator/prometheus-rule.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,8 @@ spec:
384384
- expr: sum (max without(service,endpoint,container,pod,job,namespace) (irate(haproxy_server_http_responses_total{exported_namespace=~"openshift-.*"}[5m])))
385385
or absent(__does_not_exist__)*0
386386
record: cluster:usage:openshift:ingress_request_total:irate5m
387+
- expr: sum(ingress_controller_aws_nlb_active) or vector(0)
388+
record: cluster:ingress_controller_aws_nlb_active:sum
387389
- name: openshift-build.rules
388390
rules:
389391
- expr: sum by (strategy) (openshift_build_status_phase_total)

jsonnet/rules.libsonnet

+5
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,11 @@ function(params) {
513513
record: 'cluster:usage:openshift:ingress_request_total:irate5m',
514514
// The instantaneous rate of openshift requests per second arriving at the ingress controllers
515515
},
516+
{
517+
expr: 'sum(ingress_controller_aws_nlb_active) or vector(0)',
518+
record: 'cluster:ingress_controller_aws_nlb_active:sum',
519+
// Informs how many NLBs are active in AWS.
520+
},
516521
],
517522
},
518523
{

manifests/0000_50_cluster-monitoring-operator_04-config.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,12 @@ data:
382382
# at least max of one instance with k8s.v1.cni.cncf.io/networks annotation, labelled by networks (any or sriov).
383383
- '{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}'
384384
#
385+
# owners: (@openshift/network-edge)
386+
#
387+
# cluster:ingress_controller_aws_nlb_active:sum informs how many NLBs are active in AWS.
388+
# Zero would indicate ELB (legacy). This metric is only emitted on AWS.
389+
- '{__name__="cluster:ingress_controller_aws_nlb_active:sum"}'
390+
#
385391
# owners: (https://github.com/openshift/insights-operator/blob/master/OWNERS)
386392
#
387393
# insightsclient_request_send tracks the number of metrics sends.

0 commit comments

Comments
 (0)