Skip to content

Commit 7ee08ec

Browse files
committed
Detect panics in e2e tests
Signed-off-by: Stefan Büringer [email protected]
1 parent defa62d commit 7ee08ec

File tree

3 files changed

+156
-3
lines changed

3 files changed

+156
-3
lines changed

test/framework/deployment_helpers.go

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package framework
1818

1919
import (
2020
"bufio"
21+
"bytes"
2122
"context"
2223
"encoding/json"
2324
"fmt"
@@ -31,12 +32,15 @@ import (
3132

3233
. "github.com/onsi/ginkgo/v2"
3334
. "github.com/onsi/gomega"
35+
"github.com/pkg/errors"
36+
"github.com/prometheus/common/expfmt"
3437
appsv1 "k8s.io/api/apps/v1"
3538
corev1 "k8s.io/api/core/v1"
3639
policyv1 "k8s.io/api/policy/v1"
3740
apierrors "k8s.io/apimachinery/pkg/api/errors"
3841
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3942
"k8s.io/apimachinery/pkg/labels"
43+
kerrors "k8s.io/apimachinery/pkg/util/errors"
4044
"k8s.io/apimachinery/pkg/util/intstr"
4145
utilversion "k8s.io/apimachinery/pkg/util/version"
4246
"k8s.io/apimachinery/pkg/util/wait"
@@ -348,8 +352,8 @@ type WatchPodMetricsInput struct {
348352

349353
// WatchPodMetrics captures metrics from all pods every 10s. It expects to find port 8080 open on the controller.
350354
func WatchPodMetrics(ctx context.Context, input WatchPodMetricsInput) {
351-
// Dump machine metrics every 5 seconds
352-
ticker := time.NewTicker(time.Second * 5)
355+
// Dump metrics periodically.
356+
ticker := time.NewTicker(time.Second * 10)
353357
Expect(ctx).NotTo(BeNil(), "ctx is required for dumpContainerMetrics")
354358
Expect(input.ClientSet).NotTo(BeNil(), "input.ClientSet is required for dumpContainerMetrics")
355359
Expect(input.Deployment).NotTo(BeNil(), "input.Deployment is required for dumpContainerMetrics")
@@ -397,8 +401,10 @@ func dumpPodMetrics(ctx context.Context, client *kubernetes.Clientset, metricsPa
397401
Do(ctx)
398402
data, err := res.Raw()
399403

404+
var errorRetrievingMetrics bool
400405
if err != nil {
401406
// Failing to dump metrics should not cause the test to fail
407+
errorRetrievingMetrics = true
402408
data = []byte(fmt.Sprintf("Error retrieving metrics for pod %s: %v\n%s", klog.KRef(pod.Namespace, pod.Name), err, string(data)))
403409
metricsFile = path.Join(metricsDir, "metrics-error.txt")
404410
}
@@ -407,7 +413,50 @@ func dumpPodMetrics(ctx context.Context, client *kubernetes.Clientset, metricsPa
407413
// Failing to dump metrics should not cause the test to fail
408414
log.Logf("Error writing metrics for pod %s: %v", klog.KRef(pod.Namespace, pod.Name), err)
409415
}
416+
417+
if !errorRetrievingMetrics {
418+
Expect(verifyMetrics(data)).To(Succeed())
419+
}
420+
}
421+
}
422+
423+
func verifyMetrics(data []byte) error {
424+
var parser expfmt.TextParser
425+
mf, err := parser.TextToMetricFamilies(bytes.NewReader(data))
426+
if err != nil {
427+
return errors.Wrapf(err, "failed to parse data to metrics families")
428+
}
429+
430+
var errs []error
431+
for metric, metricFamily := range mf {
432+
if metric == "controller_runtime_reconcile_panics_total" {
433+
for _, controllerPanicMetric := range metricFamily.Metric {
434+
if controllerPanicMetric.Counter != nil && controllerPanicMetric.Counter.Value != nil && *controllerPanicMetric.Counter.Value > 0 {
435+
controllerName := "unknown"
436+
for _, label := range controllerPanicMetric.Label {
437+
if *label.Name == "controller" {
438+
controllerName = *label.Value
439+
}
440+
}
441+
errs = append(errs, fmt.Errorf("panic occurred in %q controller", controllerName))
442+
}
443+
}
444+
}
445+
446+
if metric == "controller_runtime_webhook_panics_total" {
447+
for _, webhookPanicMetric := range metricFamily.Metric {
448+
if webhookPanicMetric.Counter != nil && webhookPanicMetric.Counter.Value != nil && *webhookPanicMetric.Counter.Value > 0 {
449+
errs = append(errs, fmt.Errorf("panic occurred in webhook"))
450+
}
451+
}
452+
}
453+
}
454+
455+
if len(errs) > 0 {
456+
return kerrors.NewAggregate(errs)
410457
}
458+
459+
return nil
411460
}
412461

413462
// WaitForDNSUpgradeInput is the input for WaitForDNSUpgrade.
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package framework
18+
19+
import (
20+
"testing"
21+
22+
. "github.com/onsi/gomega"
23+
)
24+
25+
func Test_verifyMetrics(t *testing.T) {
26+
tests := []struct {
27+
name string
28+
data []byte
29+
wantErr string
30+
}{
31+
{
32+
name: "no panic metric exists",
33+
data: []byte(`
34+
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
35+
# TYPE controller_runtime_max_concurrent_reconciles gauge
36+
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
37+
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
38+
`),
39+
},
40+
{
41+
name: "no panic occurred",
42+
data: []byte(`
43+
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
44+
# TYPE controller_runtime_max_concurrent_reconciles gauge
45+
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
46+
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
47+
# HELP controller_runtime_reconcile_panics_total Total number of reconciliation panics per controller
48+
# TYPE controller_runtime_reconcile_panics_total counter
49+
controller_runtime_reconcile_panics_total{controller="cluster"} 0
50+
controller_runtime_reconcile_panics_total{controller="clusterclass"} 0
51+
# HELP controller_runtime_webhook_panics_total Total number of webhook panics
52+
# TYPE controller_runtime_webhook_panics_total counter
53+
controller_runtime_webhook_panics_total 0
54+
`),
55+
},
56+
{
57+
name: "panic occurred in controller",
58+
data: []byte(`
59+
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
60+
# TYPE controller_runtime_max_concurrent_reconciles gauge
61+
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
62+
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
63+
# HELP controller_runtime_reconcile_panics_total Total number of reconciliation panics per controller
64+
# TYPE controller_runtime_reconcile_panics_total counter
65+
controller_runtime_reconcile_panics_total{controller="cluster"} 1
66+
controller_runtime_reconcile_panics_total{controller="clusterclass"} 0
67+
# HELP controller_runtime_webhook_panics_total Total number of webhook panics
68+
# TYPE controller_runtime_webhook_panics_total counter
69+
controller_runtime_webhook_panics_total 0
70+
`),
71+
wantErr: "panic occurred in \"cluster\" controller",
72+
},
73+
{
74+
name: "panic occurred in webhook",
75+
data: []byte(`
76+
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
77+
# TYPE controller_runtime_max_concurrent_reconciles gauge
78+
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
79+
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
80+
# HELP controller_runtime_reconcile_panics_total Total number of reconciliation panics per controller
81+
# TYPE controller_runtime_reconcile_panics_total counter
82+
controller_runtime_reconcile_panics_total{controller="cluster"} 0
83+
controller_runtime_reconcile_panics_total{controller="clusterclass"} 0
84+
# HELP controller_runtime_webhook_panics_total Total number of webhook panics
85+
# TYPE controller_runtime_webhook_panics_total counter
86+
controller_runtime_webhook_panics_total 1
87+
`),
88+
wantErr: "panic occurred in webhook",
89+
},
90+
}
91+
for _, tt := range tests {
92+
t.Run(tt.name, func(t *testing.T) {
93+
g := NewWithT(t)
94+
95+
err := verifyMetrics(tt.data)
96+
if tt.wantErr == "" {
97+
g.Expect(err).ToNot(HaveOccurred())
98+
} else {
99+
g.Expect(err).To(HaveOccurred())
100+
g.Expect(err.Error()).To(Equal(tt.wantErr))
101+
}
102+
})
103+
}
104+
}

test/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ require (
1717
github.com/onsi/gomega v1.34.1
1818
github.com/pkg/errors v0.9.1
1919
github.com/prometheus/client_golang v1.19.1
20+
github.com/prometheus/common v0.55.0
2021
github.com/spf13/pflag v1.0.5
2122
github.com/vincent-petithory/dataurl v1.0.0
2223
go.etcd.io/etcd/api/v3 v3.5.15
@@ -122,7 +123,6 @@ require (
122123
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
123124
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
124125
github.com/prometheus/client_model v0.6.1 // indirect
125-
github.com/prometheus/common v0.55.0 // indirect
126126
github.com/prometheus/procfs v0.15.1 // indirect
127127
github.com/russross/blackfriday/v2 v2.1.0 // indirect
128128
github.com/sagikazarmark/locafero v0.4.0 // indirect

0 commit comments

Comments
 (0)