NO-JIRA: remove firing alerts from the config/metrics file #954

Merged 1 commit on Jul 22, 2024
docs/gathered-data.md (2 changes: 1 addition & 1 deletion)
@@ -1220,7 +1220,6 @@ Gathered metrics:
 - `console_helm_uninstalls_total`
 - `etcd_server_slow_apply_total`
 - `etcd_server_slow_read_indexes_total`
-- followed by at most 1000 lines of `ALERTS` metric
 
 ### API Reference
 None
@@ -1255,6 +1254,7 @@ None
 - `etcd_server_slow_apply_total` introduced in version 4.16+
 - `etcd_server_slow_read_indexes_total` introduced in version 4.16+
 - `haproxy_exporter_server_threshold` introduced in version 4.17+
+- `ALERTS` removed in version 4.17+
 
 
 ## MutatingWebhookConfigurations
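For context, the docs above describe metrics that the operator pulls from the monitoring stack's Prometheus federation endpoint. The sketch below shows, under stated assumptions, what such a `/federate` query looks like: the endpoint URL and metric list are illustrative placeholders, and the operator itself issues the query through a Kubernetes REST client (visible in the Go diff further down) rather than plain `net/http`.

```go
package main

// Illustrative sketch of a Prometheus /federate query. The service URL
// below is a hypothetical in-cluster address; a real query against
// openshift-monitoring would also need TLS and a bearer token.
import (
	"fmt"
	"io"
	"net/http"
	"net/url"
)

func main() {
	params := url.Values{}
	// One match[] selector per federated metric family.
	for _, m := range []string{
		"etcd_server_slow_apply_total",
		"etcd_server_slow_read_indexes_total",
	} {
		params.Add("match[]", m)
	}
	resp, err := http.Get("http://prometheus-k8s.openshift-monitoring.svc:9090/federate?" + params.Encode())
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // samples in Prometheus exposition format
}
```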
docs/insights-archive-sample/config/metrics (18 changes: 1 addition & 17 deletions)
@@ -134,20 +134,4 @@ virt_platform{container="kube-rbac-proxy",endpoint="https",instance="ci-ln-k19wb
 virt_platform{container="kube-rbac-proxy",endpoint="https",instance="ci-ln-k19wbxk-f76d1-6qdmf-worker-b-94cjz",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-dmnrm",service="node-exporter",type="gcp",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-0"} 1 1620977245743
 virt_platform{container="kube-rbac-proxy",endpoint="https",instance="ci-ln-k19wbxk-f76d1-6qdmf-worker-b-94cjz",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-dmnrm",service="node-exporter",type="kvm",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-0"} 1 1620977245743
 virt_platform{container="kube-rbac-proxy",endpoint="https",instance="ci-ln-k19wbxk-f76d1-6qdmf-master-0",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-2dsf2",service="node-exporter",type="gcp",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-0"} 1 1620977246807
-virt_platform{container="kube-rbac-proxy",endpoint="https",instance="ci-ln-k19wbxk-f76d1-6qdmf-worker-c-44ttf",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-56fn6",service="node-exporter",type="gcp",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-0"} 1 1620977240435
-# ALERTS 16/1000
-# TYPE ALERTS untyped
-ALERTS{alertname="Watchdog",alertstate="firing",severity="none",instance="",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793310163
-ALERTS{alertname="KubeMemoryOvercommit",alertstate="firing",severity="warning",instance="",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793311534
-ALERTS{alertname="AlertmanagerReceiversNotConfigured",alertstate="firing",severity="warning",instance="",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793316662
-ALERTS{alertname="KubePodNotReady",alertstate="firing",namespace="openshift-etcd",pod="etcd-quorum-guard-587fd6c776-xg5zw",severity="warning",instance="",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793320590
-ALERTS{alertname="KubePodNotReady",alertstate="firing",namespace="openshift-etcd",pod="etcd-quorum-guard-587fd6c776-czv8b",severity="warning",instance="",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793320590
-ALERTS{alertname="KubePodNotReady",alertstate="firing",namespace="openshift-ingress",pod="router-default-6f59db78db-gh48w",severity="warning",instance="",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793320590
-ALERTS{alertname="KubeDeploymentReplicasMismatch",alertstate="firing",container="kube-rbac-proxy-main",deployment="router-default",endpoint="https-main",instance="10.129.0.12:8443",job="kube-state-metrics",namespace="openshift-ingress",pod="kube-state-metrics-664f855c7f-9vbzh",service="kube-state-metrics",severity="warning",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793320590
-ALERTS{alertname="KubeDeploymentReplicasMismatch",alertstate="firing",container="kube-rbac-proxy-main",deployment="etcd-quorum-guard",endpoint="https-main",instance="10.129.0.12:8443",job="kube-state-metrics",namespace="openshift-etcd",pod="kube-state-metrics-664f855c7f-9vbzh",service="kube-state-metrics",severity="warning",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793320590
-ALERTS{alertname="etcdHighCommitDurations",alertstate="firing",endpoint="etcd-metrics",instance="10.10.94.191:9979",job="etcd",namespace="openshift-etcd",pod="etcd-master-0.tremes.lab.rdu2.cee.redhat.com",service="etcd",severity="warning",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793295353
-ALERTS{alertname="ClusterOperatorDown",alertstate="firing",endpoint="metrics",instance="10.10.94.191:9099",job="cluster-version-operator",name="ingress",namespace="openshift-cluster-version",pod="cluster-version-operator-644d79c75d-xl7z5",service="cluster-version-operator",severity="critical",version="4.6.15",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793309213
-ALERTS{alertname="ClusterOperatorDegraded",alertstate="firing",condition="Degraded",endpoint="metrics",instance="10.10.94.191:9099",job="cluster-version-operator",name="ingress",namespace="openshift-cluster-version",pod="cluster-version-operator-644d79c75d-xl7z5",reason="IngressControllersDegraded",service="cluster-version-operator",severity="critical",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793309213
-ALERTS{alertname="KubeAPIErrorBudgetBurn",alertstate="pending",long="3d",severity="warning",short="6h",instance="",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793298512
-ALERTS{alertname="ClusterNotUpgradeable",alertstate="firing",condition="Upgradeable",endpoint="metrics",name="version",severity="warning",instance="",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"} 1 1612793309213
-ALERTS{alertname="KubePodNotReady",alertstate="pending",namespace="openshift-insights",pod="insights-operator-f7df674b4-x9qtw",severity="warning",instance="",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-1"}
+virt_platform{container="kube-rbac-proxy",endpoint="https",instance="ci-ln-k19wbxk-f76d1-6qdmf-worker-c-44ttf",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-56fn6",service="node-exporter",type="gcp",prometheus="openshift-monitoring/k8s",prometheus_replica="prometheus-k8s-0"} 1 1620977240435
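The deleted tail of this sample file carried the synthetic `# ALERTS <total>/<limit>` header (`# ALERTS 16/1000` here), which recorded how many alert lines existed against the 1000-line cap. A minimal sketch, assuming only that header format (this is not operator code), of how an archive consumer might have detected truncation:

```go
package main

// Minimal sketch: parse the removed "# ALERTS <total>/<limit>"
// header from a config/metrics file and flag truncation.
import (
	"bufio"
	"fmt"
	"strings"
)

func main() {
	sample := "# ALERTS 16/1000\n# TYPE ALERTS untyped\n"
	sc := bufio.NewScanner(strings.NewReader(sample))
	for sc.Scan() {
		var total, limit int
		if _, err := fmt.Sscanf(sc.Text(), "# ALERTS %d/%d", &total, &limit); err == nil {
			// total counts every line of the block, including the
			// "# TYPE ALERTS untyped" header and the trailing blank line.
			fmt.Printf("alert block lines: %d, truncated: %v\n", total, total > limit)
		}
	}
}
```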
pkg/gatherers/clusterconfig/gather_most_recent_metrics.go (39 changes: 1 addition & 38 deletions)
@@ -2,25 +2,14 @@ package clusterconfig

 import (
 	"context"
-	"fmt"
-	"io"
 
 	"k8s.io/client-go/rest"
 	"k8s.io/klog/v2"
 
 	"github.com/openshift/insights-operator/pkg/record"
-	"github.com/openshift/insights-operator/pkg/utils"
 	"github.com/openshift/insights-operator/pkg/utils/marshal"
 )
 
-const (
-	// metricsAlertsLinesLimit is the maximum number of lines read from the monitoring Prometheus.
-	// The limit is 500 KiB of alerts; one alert line is typically ~450 bytes => ~1137 lines.
-	// This number has been rounded down to 1000 for simplicity.
-	// Formerly, the `500 * 1024 / 450` expression was used instead.
-	metricsAlertsLinesLimit = 1000
-)
-
 // GatherMostRecentMetrics collects cluster Federated Monitoring metrics.
 //
 // The GET REST query to URL /federate
@@ -34,7 +23,6 @@ const (
 // - `console_helm_uninstalls_total`
 // - `etcd_server_slow_apply_total`
 // - `etcd_server_slow_read_indexes_total`
-// - followed by at most 1000 lines of `ALERTS` metric
 //
 // ### API Reference
 // None
@@ -69,6 +57,7 @@ const (
 // - `etcd_server_slow_apply_total` introduced in version 4.16+
 // - `etcd_server_slow_read_indexes_total` introduced in version 4.16+
 // - `haproxy_exporter_server_threshold` introduced in version 4.17+
+// - `ALERTS` removed in version 4.17+
 func (g *Gatherer) GatherMostRecentMetrics(ctx context.Context) ([]record.Record, []error) {
 	metricsRESTClient, err := rest.RESTClientFor(g.metricsGatherKubeConfig)
 	if err != nil {
@@ -99,32 +88,6 @@ func gatherMostRecentMetrics(ctx context.Context, metricsClient rest.Interface)
 		return nil, []error{err}
 	}
 
-	rsp, err := metricsClient.Get().AbsPath("federate").
-		Param("match[]", "ALERTS").
-		Stream(ctx)
-	if err != nil {
-		klog.Errorf("Unable to retrieve most recent alerts from metrics: %v", err)
-		return nil, []error{err}
-	}
-	r := utils.NewLineLimitReader(rsp, metricsAlertsLinesLimit)
-	alerts, err := io.ReadAll(r)
-	if err != nil && err != io.EOF {
-		klog.Errorf("Unable to read most recent alerts from metrics: %v", err)
-		return nil, []error{err}
-	}
-
-	remainingAlertLines, err := utils.CountLines(rsp)
-	if err != nil && err != io.EOF {
-		klog.Errorf("Unable to count truncated lines of alerts metric: %v", err)
-		return nil, []error{err}
-	}
-	totalAlertCount := r.GetTotalLinesRead() + remainingAlertLines
-
-	// # ALERTS <Total Alerts Lines>/<Alerts Line Limit>
-	// The total is typically greater than the true number of alerts by 2,
-	// because the `# TYPE ALERTS untyped` header and the final empty line are counted in.
-	data = append(data, []byte(fmt.Sprintf("# ALERTS %d/%d\n", totalAlertCount, metricsAlertsLinesLimit))...)
-	data = append(data, alerts...)
 	records := []record.Record{
 		{Name: "config/metrics", Item: marshal.RawByte(data), AlwaysStored: true},
 	}
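The removed body above leaned on two helpers from `pkg/utils`: `NewLineLimitReader`, which caps how many lines are read from the ALERTS stream at `metricsAlertsLinesLimit`, and `CountLines`, which counts what was left behind so the `# ALERTS <total>/<limit>` header could report the full size. Below is a rough, self-contained sketch of that line-limiting idea; it is an assumption-level reconstruction, and the real `pkg/utils` implementation may differ in buffering and error handling.

```go
package main

import (
	"fmt"
	"io"
	"strings"
)

// lineLimitReader wraps an io.Reader, stops after `limit` newlines,
// and remembers how many lines were actually consumed.
type lineLimitReader struct {
	r     io.Reader
	limit int
	lines int
}

func (l *lineLimitReader) Read(p []byte) (int, error) {
	if l.lines >= l.limit {
		return 0, io.EOF
	}
	n, err := l.r.Read(p)
	for i := 0; i < n; i++ {
		if p[i] == '\n' {
			l.lines++
			if l.lines >= l.limit {
				// Truncate right after the limiting newline.
				return i + 1, io.EOF
			}
		}
	}
	return n, err
}

func main() {
	src := strings.NewReader("alert1\nalert2\nalert3\n")
	r := &lineLimitReader{r: src, limit: 2}
	kept, _ := io.ReadAll(r)
	fmt.Printf("kept %q after %d lines\n", kept, r.lines)
}
```

Running the sketch prints `kept "alert1\nalert2\n" after 2 lines`, mirroring how the deleted gatherer kept at most 1000 alert lines and noted the remainder only in the `# ALERTS` header.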
@@ -54,10 +54,9 @@ func Test_gatherMostRecentMetrics(t *testing.T) {
 			metricsClient: &mockMostRecentMetricsClient{data: []byte(`test`)},
 			wantRecords: []record.Record{
 				{
-					Name: "config/metrics",
-					Item: marshal.RawByte(`test# ALERTS 1/1000
-test`),
+					Name:         "config/metrics",
+					AlwaysStored: true,
+					Item:         marshal.RawByte(`test`),
 				},
 			},
 			wantErrors: nil,