9
9
"strings"
10
10
"time"
11
11
12
- allowedalerts2 "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
12
+ "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
13
13
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
14
14
15
15
g "github.com/onsi/ginkgo/v2"
@@ -724,12 +724,12 @@ var _ = g.Describe("[sig-instrumentation] Prometheus [apigroup:image.openshift.i
724
724
725
725
g .It ("shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early][apigroup:config.openshift.io]" , func () {
726
726
// Copy so we can expand:
727
- allowedAlertNames := make ([]string , len (allowedalerts2 .AllowedAlertNames ))
728
- copy (allowedAlertNames , allowedalerts2 .AllowedAlertNames )
727
+ allowedAlertNames := make ([]string , len (allowedalerts .AllowedAlertNames ))
728
+ copy (allowedAlertNames , allowedalerts .AllowedAlertNames )
729
729
730
730
// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
731
731
// we exclude alerts that have their own separate tests.
732
- for _ , alertTest := range allowedalerts2 .AllAlertTests (& platformidentification.JobType {}, nil , allowedalerts2 .DefaultAllowances ) {
732
+ for _ , alertTest := range allowedalerts .AllAlertTests (& platformidentification.JobType {}, nil , allowedalerts .DefaultAllowances ) {
733
733
allowedAlertNames = append (allowedAlertNames , alertTest .AlertName ())
734
734
}
735
735
@@ -740,11 +740,21 @@ var _ = g.Describe("[sig-instrumentation] Prometheus [apigroup:image.openshift.i
740
740
allowedAlertNames = append (allowedAlertNames , "TechPreviewNoUpgrade" , "ClusterNotUpgradeable" )
741
741
}
742
742
743
+ // OSD-26887: managed services taints several nodes as infrastructure. This taint appears to be applied
744
+ // after some of the platform DaemonSets are scheduled there, causing KubeDaemonSetMisScheduled to fire.
745
+ // Managed services rebalances the DaemonSets after the taint is added and the alert clears, but origin still
746
+ // fails this test. We allow this alert to fire while we investigate why the taint is not added at node creation.
747
+ isManagedService , err := exutil .IsManagedServiceCluster (ctx , oc .AdminKubeClient ())
748
+ o .Expect (err ).NotTo (o .HaveOccurred ())
749
+ if isManagedService {
750
+ allowedAlertNames = append (allowedAlertNames , "KubeDaemonSetMisScheduled" )
751
+ }
752
+
743
753
tests := map [string ]bool {
744
754
// openshift-e2e-loki alerts should never fail this test, we've seen this happen on daemon set rollout stuck when CI loki was down.
745
755
fmt .Sprintf (`ALERTS{alertname!~"%s",alertstate="firing",severity!="info",namespace!="openshift-e2e-loki"} >= 1` , strings .Join (allowedAlertNames , "|" )): false ,
746
756
}
747
- err : = helper .RunQueries (context .TODO (), oc .NewPrometheusClient (context .TODO ()), tests , oc )
757
+ err = helper .RunQueries (context .TODO (), oc .NewPrometheusClient (context .TODO ()), tests , oc )
748
758
o .Expect (err ).NotTo (o .HaveOccurred ())
749
759
})
750
760
0 commit comments