@@ -18,6 +18,7 @@ package framework
18
18
19
19
import (
20
20
"bufio"
21
+ "bytes"
21
22
"context"
22
23
"encoding/json"
23
24
"fmt"
@@ -31,12 +32,15 @@ import (
31
32
32
33
. "github.com/onsi/ginkgo/v2"
33
34
. "github.com/onsi/gomega"
35
+ "github.com/pkg/errors"
36
+ "github.com/prometheus/common/expfmt"
34
37
appsv1 "k8s.io/api/apps/v1"
35
38
corev1 "k8s.io/api/core/v1"
36
39
policyv1 "k8s.io/api/policy/v1"
37
40
apierrors "k8s.io/apimachinery/pkg/api/errors"
38
41
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
39
42
"k8s.io/apimachinery/pkg/labels"
43
+ kerrors "k8s.io/apimachinery/pkg/util/errors"
40
44
"k8s.io/apimachinery/pkg/util/intstr"
41
45
utilversion "k8s.io/apimachinery/pkg/util/version"
42
46
"k8s.io/apimachinery/pkg/util/wait"
@@ -348,8 +352,8 @@ type WatchPodMetricsInput struct {
348
352
349
353
// WatchPodMetrics captures metrics from all pods every 10s. It expects to find port 8080 open on the controller.
350
354
func WatchPodMetrics (ctx context.Context , input WatchPodMetricsInput ) {
351
- // Dump machine metrics every 5 seconds
352
- ticker := time .NewTicker (time .Second * 5 )
355
+ // Dump metrics periodically.
356
+ ticker := time .NewTicker (time .Second * 10 )
353
357
Expect (ctx ).NotTo (BeNil (), "ctx is required for dumpContainerMetrics" )
354
358
Expect (input .ClientSet ).NotTo (BeNil (), "input.ClientSet is required for dumpContainerMetrics" )
355
359
Expect (input .Deployment ).NotTo (BeNil (), "input.Deployment is required for dumpContainerMetrics" )
@@ -397,8 +401,10 @@ func dumpPodMetrics(ctx context.Context, client *kubernetes.Clientset, metricsPa
397
401
Do (ctx )
398
402
data , err := res .Raw ()
399
403
404
+ var errorRetrievingMetrics bool
400
405
if err != nil {
401
406
// Failing to dump metrics should not cause the test to fail
407
+ errorRetrievingMetrics = true
402
408
data = []byte (fmt .Sprintf ("Error retrieving metrics for pod %s: %v\n %s" , klog .KRef (pod .Namespace , pod .Name ), err , string (data )))
403
409
metricsFile = path .Join (metricsDir , "metrics-error.txt" )
404
410
}
@@ -407,7 +413,50 @@ func dumpPodMetrics(ctx context.Context, client *kubernetes.Clientset, metricsPa
407
413
// Failing to dump metrics should not cause the test to fail
408
414
log .Logf ("Error writing metrics for pod %s: %v" , klog .KRef (pod .Namespace , pod .Name ), err )
409
415
}
416
+
417
+ if ! errorRetrievingMetrics {
418
+ Expect (verifyMetrics (data )).To (Succeed ())
419
+ }
420
+ }
421
+ }
422
+
423
+ func verifyMetrics (data []byte ) error {
424
+ var parser expfmt.TextParser
425
+ mf , err := parser .TextToMetricFamilies (bytes .NewReader (data ))
426
+ if err != nil {
427
+ return errors .Wrapf (err , "failed to parse data to metrics families" )
428
+ }
429
+
430
+ var errs []error
431
+ for metric , metricFamily := range mf {
432
+ if metric == "controller_runtime_reconcile_panics_total" {
433
+ for _ , controllerPanicMetric := range metricFamily .Metric {
434
+ if controllerPanicMetric .Counter != nil && controllerPanicMetric .Counter .Value != nil && * controllerPanicMetric .Counter .Value > 0 {
435
+ controllerName := "unknown"
436
+ for _ , label := range controllerPanicMetric .Label {
437
+ if * label .Name == "controller" {
438
+ controllerName = * label .Value
439
+ }
440
+ }
441
+ errs = append (errs , fmt .Errorf ("panic occurred in %q controller" , controllerName ))
442
+ }
443
+ }
444
+ }
445
+
446
+ if metric == "controller_runtime_webhook_panics_total" {
447
+ for _ , webhookPanicMetric := range metricFamily .Metric {
448
+ if webhookPanicMetric .Counter != nil && webhookPanicMetric .Counter .Value != nil && * webhookPanicMetric .Counter .Value > 0 {
449
+ errs = append (errs , fmt .Errorf ("panic occurred in webhook" ))
450
+ }
451
+ }
452
+ }
453
+ }
454
+
455
+ if len (errs ) > 0 {
456
+ return kerrors .NewAggregate (errs )
410
457
}
458
+
459
+ return nil
411
460
}
412
461
413
462
// WaitForDNSUpgradeInput is the input for WaitForDNSUpgrade.
0 commit comments