Commit 9bedff1

Merge pull request #112948 from mimowo/112873-fix-job-finished-metric
Fix the job finished metric issue due to the final job status update occasionally failing
2 parents 0207f7a + b64e5b2
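Before this change, enactJobFinished both marked the Job finished and recorded the job_finished_total metric, so a failed final status update (which re-enqueues the Job and repeats the sync) could increment the counter more than once for the same Job. The commit splits the metric and event recording into recordJobFinished and calls it only after updateStatusHandler succeeds. The sketch below illustrates that ordering only; syncOnce, persistStatus, and the map-based counter are hypothetical stand-ins, not the controller's code.

// Illustrative sketch (not part of this commit): the ordering the fix enforces,
// with hypothetical stand-ins for the controller pieces.
package main

import (
    "errors"
    "fmt"
)

// jobFinishedTotal stands in for the job_finished_total counter.
var jobFinishedTotal = map[string]int{}

// syncOnce mimics one reconcile attempt: decide the job is finished, persist
// the status, and only then record the metric, so a failed update that gets
// retried cannot bump the counter twice for the same Job.
func syncOnce(completionMode string, persistStatus func() error) error {
    jobFinished := true // stand-in for enactJobFinished reporting the job as finished
    if err := persistStatus(); err != nil {
        return err // the sync is retried; nothing has been recorded yet
    }
    if jobFinished {
        jobFinishedTotal[completionMode+"/succeeded"]++ // stand-in for recordJobFinished
    }
    return nil
}

func main() {
    // First attempt fails to persist, the retry succeeds; the counter ends at 1.
    _ = syncOnce("NonIndexed", func() error { return errors.New("update failed") })
    _ = syncOnce("NonIndexed", func() error { return nil })
    fmt.Println(jobFinishedTotal) // map[NonIndexed/succeeded:1]
}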

2 files changed, +162 -11 lines changed

pkg/controller/job/job_controller.go

+27-11
@@ -699,10 +699,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rErr error) {
         return false, nil
     }
 
-    completionMode := string(batch.NonIndexedCompletion)
-    if isIndexedJob(&job) {
-        completionMode = string(batch.IndexedCompletion)
-    }
+    completionMode := getCompletionMode(&job)
     action := metrics.JobSyncActionReconciling
 
     defer func() {
@@ -906,11 +903,14 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rErr error) {
             job.Status.CompletedIndexes = succeededIndexes.String()
         }
         job.Status.UncountedTerminatedPods = nil
-        jm.enactJobFinished(&job, finishedCondition)
+        jobFinished := jm.enactJobFinished(&job, finishedCondition)
 
         if _, err := jm.updateStatusHandler(ctx, &job); err != nil {
             return forget, err
         }
+        if jobFinished {
+            jm.recordJobFinished(&job, finishedCondition)
+        }
 
         if jobHasNewFailure && !IsJobFinished(&job) {
             // returning an error will re-enqueue Job after the backoff period
@@ -1105,13 +1105,17 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
     if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil {
         return err
     }
-    if jm.enactJobFinished(job, finishedCond) {
+    jobFinished := jm.enactJobFinished(job, finishedCond)
+    if jobFinished {
         needsFlush = true
     }
     if needsFlush {
         if _, err := jm.updateStatusHandler(ctx, job); err != nil {
             return fmt.Errorf("removing uncounted pods from status: %w", err)
         }
+        if jobFinished {
+            jm.recordJobFinished(job, finishedCond)
+        }
         recordJobPodFinished(job, oldCounters)
     }
     return nil
@@ -1244,16 +1248,20 @@ func (jm *Controller) enactJobFinished(job *batch.Job, finishedCond *batch.JobCondition) bool {
             return false
         }
     }
-    completionMode := string(batch.NonIndexedCompletion)
-    if isIndexedJob(job) {
-        completionMode = string(*job.Spec.CompletionMode)
-    }
     job.Status.Conditions, _ = ensureJobConditionStatus(job.Status.Conditions, finishedCond.Type, finishedCond.Status, finishedCond.Reason, finishedCond.Message)
+    if finishedCond.Type == batch.JobComplete {
+        job.Status.CompletionTime = &finishedCond.LastTransitionTime
+    }
+    return true
+}
+
+// recordJobFinished records events and the job_finished_total metric for a finished job.
+func (jm *Controller) recordJobFinished(job *batch.Job, finishedCond *batch.JobCondition) bool {
+    completionMode := getCompletionMode(job)
     if finishedCond.Type == batch.JobComplete {
         if job.Spec.Completions != nil && job.Status.Succeeded > *job.Spec.Completions {
             jm.recorder.Event(job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
         }
-        job.Status.CompletionTime = &finishedCond.LastTransitionTime
         jm.recorder.Event(job, v1.EventTypeNormal, "Completed", "Job completed")
         metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
     } else {
@@ -1613,6 +1621,14 @@ func countValidPodsWithFilter(job *batch.Job, pods []*v1.Pod, uncounted sets.Str
     return result
 }
 
+// getCompletionMode returns string representation of the completion mode. Used as a label value for metrics.
+func getCompletionMode(job *batch.Job) string {
+    if isIndexedJob(job) {
+        return string(batch.IndexedCompletion)
+    }
+    return string(batch.NonIndexedCompletion)
+}
+
 func trackingUncountedPods(job *batch.Job) bool {
     return feature.DefaultFeatureGate.Enabled(features.JobTrackingWithFinalizers) && hasJobTrackingAnnotation(job)
 }
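The new getCompletionMode helper reduces the Job's completion mode to the two label values "Indexed" and "NonIndexed" used by the controller's metrics. As a rough illustration of how such a value becomes a counter label, the standalone sketch below uses the upstream Prometheus client directly (an assumption made for brevity; the controller registers its metrics through k8s.io/component-base/metrics), and the metric name and labels in it are invented:

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

// demoJobFinished is an invented counter for illustration; the controller's
// real metric is defined in pkg/controller/job/metrics.
var demoJobFinished = prometheus.NewCounterVec(
    prometheus.CounterOpts{
        Name: "demo_job_finished_total",
        Help: "Finished demo jobs keyed by completion mode and result.",
    },
    []string{"completion_mode", "result"},
)

// completionModeLabel mirrors the shape of getCompletionMode: indexed Jobs map
// to "Indexed", everything else to "NonIndexed".
func completionModeLabel(indexed bool) string {
    if indexed {
        return "Indexed"
    }
    return "NonIndexed"
}

func main() {
    prometheus.MustRegister(demoJobFinished)
    demoJobFinished.WithLabelValues(completionModeLabel(true), "succeeded").Inc()
    demoJobFinished.WithLabelValues(completionModeLabel(false), "failed").Inc()
    fmt.Println("recorded one Indexed/succeeded and one NonIndexed/failed sample")
}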

test/integration/job/job_test.go

+135
@@ -47,19 +47,154 @@ import (
     "k8s.io/client-go/restmapper"
     "k8s.io/client-go/util/retry"
     featuregatetesting "k8s.io/component-base/featuregate/testing"
+    basemetrics "k8s.io/component-base/metrics"
+    "k8s.io/component-base/metrics/testutil"
     "k8s.io/controller-manager/pkg/informerfactory"
     "k8s.io/klog/v2"
     kubeapiservertesting "k8s.io/kubernetes/cmd/kube-apiserver/app/testing"
     podutil "k8s.io/kubernetes/pkg/api/v1/pod"
     "k8s.io/kubernetes/pkg/controller/garbagecollector"
     jobcontroller "k8s.io/kubernetes/pkg/controller/job"
+    "k8s.io/kubernetes/pkg/controller/job/metrics"
     "k8s.io/kubernetes/pkg/features"
     "k8s.io/kubernetes/test/integration/framework"
     "k8s.io/utils/pointer"
 )
 
 const waitInterval = time.Second
 
+type metricLabelsWithValue struct {
+    Labels []string
+    Value  int
+}
+
+func TestMetrics(t *testing.T) {
+    nonIndexedCompletion := batchv1.NonIndexedCompletion
+    indexedCompletion := batchv1.IndexedCompletion
+    wFinalizers := true
+    defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
+
+    // setup the job controller
+    closeFn, restConfig, clientSet, ns := setup(t, "simple")
+    defer closeFn()
+    ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
+    defer func() {
+        cancel()
+    }()
+
+    testCases := map[string]struct {
+        job                            *batchv1.Job
+        wantJobFinishedNumMetricDelta  metricLabelsWithValue
+        wantJobPodsFinishedMetricDelta metricLabelsWithValue
+    }{
+        "non-indexed job": {
+            job: &batchv1.Job{
+                Spec: batchv1.JobSpec{
+                    Completions:    pointer.Int32(2),
+                    Parallelism:    pointer.Int32(2),
+                    CompletionMode: &nonIndexedCompletion,
+                },
+            },
+            wantJobFinishedNumMetricDelta: metricLabelsWithValue{
+                Labels: []string{"NonIndexed", "succeeded"},
+                Value:  1,
+            },
+            wantJobPodsFinishedMetricDelta: metricLabelsWithValue{
+                Labels: []string{"NonIndexed", "succeeded"},
+                Value:  2,
+            },
+        },
+        "indexed job": {
+            job: &batchv1.Job{
+                Spec: batchv1.JobSpec{
+                    Completions:    pointer.Int32(2),
+                    Parallelism:    pointer.Int32(2),
+                    CompletionMode: &indexedCompletion,
+                },
+            },
+            wantJobFinishedNumMetricDelta: metricLabelsWithValue{
+                Labels: []string{"Indexed", "succeeded"},
+                Value:  1,
+            },
+            wantJobPodsFinishedMetricDelta: metricLabelsWithValue{
+                Labels: []string{"Indexed", "succeeded"},
+                Value:  2,
+            },
+        },
+    }
+    job_index := 0 // job index to avoid collisions between job names created by different test cases
+    for name, tc := range testCases {
+        t.Run(name, func(t *testing.T) {
+
+            // record the metrics after the job is created
+            jobFinishedNumBefore, err := getCounterMetricValueForLabels(metrics.JobFinishedNum, tc.wantJobFinishedNumMetricDelta.Labels)
+            if err != nil {
+                t.Fatalf("Failed to collect the JobFinishedNum metric before creating the job: %q", err)
+            }
+            jobPodsFinishedBefore, err := getCounterMetricValueForLabels(metrics.JobPodsFinished, tc.wantJobPodsFinishedMetricDelta.Labels)
+            if err != nil {
+                t.Fatalf("Failed to collect the JobPodsFinished metric before creating the job: %q", err)
+            }
+
+            // create a single job and wait for its completion
+            job := tc.job.DeepCopy()
+            job.Name = fmt.Sprintf("job-%v", job_index)
+            job_index++
+            jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, job)
+            if err != nil {
+                t.Fatalf("Failed to create Job: %v", err)
+            }
+            validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
+                Active: int(*jobObj.Spec.Parallelism),
+                Ready:  pointer.Int32(0),
+            }, wFinalizers)
+            if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, int(*jobObj.Spec.Parallelism)); err != nil {
+                t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
+            }
+            validateJobSucceeded(ctx, t, clientSet, jobObj)
+
+            // verify metric values after the job is finished
+            validateMetricValueDeltas(t, metrics.JobFinishedNum, tc.wantJobFinishedNumMetricDelta, jobFinishedNumBefore)
+            validateMetricValueDeltas(t, metrics.JobPodsFinished, tc.wantJobPodsFinishedMetricDelta, jobPodsFinishedBefore)
+        })
+    }
+}
+
+func validateMetricValueDeltas(t *testing.T, counterVer *basemetrics.CounterVec, wantMetricDelta metricLabelsWithValue, metricValuesBefore metricLabelsWithValue) {
+    t.Helper()
+    var cmpErr error
+    err := wait.PollImmediate(10*time.Millisecond, 10*time.Second, func() (bool, error) {
+        cmpErr = nil
+        metricValuesAfter, err := getCounterMetricValueForLabels(counterVer, wantMetricDelta.Labels)
+        if err != nil {
+            return true, fmt.Errorf("Failed to collect the %q metric after the job is finished: %q", counterVer.Name, err)
+        }
+        wantDelta := wantMetricDelta.Value
+        gotDelta := metricValuesAfter.Value - metricValuesBefore.Value
+        if wantDelta != gotDelta {
+            cmpErr = fmt.Errorf("Unexepected metric delta for %q metric with labels %q. want: %v, got: %v", counterVer.Name, wantMetricDelta.Labels, wantDelta, gotDelta)
+            return false, nil
+        }
+        return true, nil
+    })
+    if err != nil {
+        t.Errorf("Failed waiting for expected metric delta: %q", err)
+    }
+    if cmpErr != nil {
+        t.Error(cmpErr)
+    }
+}
+
+func getCounterMetricValueForLabels(counterVec *basemetrics.CounterVec, labels []string) (metricLabelsWithValue, error) {
+    var result metricLabelsWithValue = metricLabelsWithValue{Labels: labels}
+    value, err := testutil.GetCounterMetricValue(counterVec.WithLabelValues(labels...))
+    if err != nil {
+        return result, err
+    }
+    result.Value = int(value)
+    return result, nil
+}
+
 // TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart verifies that the job is properly marked as Failed
 // in a scenario when the job controller crashes between removing pod finalizers and marking the job as Failed (based on
 // the pod failure policy). After the finalizer for the failed pod is removed we remove the failed pod. This step is