Skip to content

Commit 2980ff3

Browse files
Merge pull request #29366 from openshift-cherrypick-robot/cherry-pick-29357-to-release-4.18
[release-4.18] OCPBUGS-46079: managed services: allow KubeDaemonSetMisScheduled alert
2 parents 6f4cc03 + 63b45e9 commit 2980ff3

File tree

1 file changed

+15
-5
lines changed

1 file changed

+15
-5
lines changed

test/extended/prometheus/prometheus.go

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ import (
99
"strings"
1010
"time"
1111

12-
allowedalerts2 "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
12+
"github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
1313
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
1414

1515
g "github.com/onsi/ginkgo/v2"
@@ -724,12 +724,12 @@ var _ = g.Describe("[sig-instrumentation] Prometheus [apigroup:image.openshift.i
724724

725725
g.It("shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early][apigroup:config.openshift.io]", func() {
726726
// Copy so we can expand:
727-
allowedAlertNames := make([]string, len(allowedalerts2.AllowedAlertNames))
728-
copy(allowedAlertNames, allowedalerts2.AllowedAlertNames)
727+
allowedAlertNames := make([]string, len(allowedalerts.AllowedAlertNames))
728+
copy(allowedAlertNames, allowedalerts.AllowedAlertNames)
729729

730730
// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
731731
// we exclude alerts that have their own separate tests.
732-
for _, alertTest := range allowedalerts2.AllAlertTests(&platformidentification.JobType{}, nil, allowedalerts2.DefaultAllowances) {
732+
for _, alertTest := range allowedalerts.AllAlertTests(&platformidentification.JobType{}, nil, allowedalerts.DefaultAllowances) {
733733
allowedAlertNames = append(allowedAlertNames, alertTest.AlertName())
734734
}
735735

@@ -740,11 +740,21 @@ var _ = g.Describe("[sig-instrumentation] Prometheus [apigroup:image.openshift.i
740740
allowedAlertNames = append(allowedAlertNames, "TechPreviewNoUpgrade", "ClusterNotUpgradeable")
741741
}
742742

743+
// OSD-26887: managed services taints several nodes as infrastructure. This taint appears to be applied
744+
// after some of the platform DS are scheduled there, causing this alert to fire. Managed services
745+
// rebalances the DS after the taint is added, and the alert clears, but origin fails this test. Allowing
746+
// this alert to fire while we investigate why the taint is not added at node birth.
747+
isManagedService, err := exutil.IsManagedServiceCluster(ctx, oc.AdminKubeClient())
748+
o.Expect(err).NotTo(o.HaveOccurred())
749+
if isManagedService {
750+
allowedAlertNames = append(allowedAlertNames, "KubeDaemonSetMisScheduled")
751+
}
752+
743753
tests := map[string]bool{
744754
// openshift-e2e-loki alerts should never fail this test, we've seen this happen on daemon set rollout stuck when CI loki was down.
745755
fmt.Sprintf(`ALERTS{alertname!~"%s",alertstate="firing",severity!="info",namespace!="openshift-e2e-loki"} >= 1`, strings.Join(allowedAlertNames, "|")): false,
746756
}
747-
err := helper.RunQueries(context.TODO(), oc.NewPrometheusClient(context.TODO()), tests, oc)
757+
err = helper.RunQueries(context.TODO(), oc.NewPrometheusClient(context.TODO()), tests, oc)
748758
o.Expect(err).NotTo(o.HaveOccurred())
749759
})
750760

0 commit comments

Comments
 (0)