@@ -47,6 +47,7 @@ import (
 	"k8s.io/component-base/metrics/prometheus/ratelimiter"
 	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/pkg/controller"
+	"k8s.io/kubernetes/pkg/controller/job/metrics"
 	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/utils/integer"
 )
@@ -60,7 +61,8 @@
 	// DefaultJobBackOff is the default backoff period, exported for the e2e test
 	DefaultJobBackOff = 10 * time.Second
 	// MaxJobBackOff is the max backoff period, exported for the e2e test
-	MaxJobBackOff = 360 * time.Second
+	MaxJobBackOff             = 360 * time.Second
+	maxPodCreateDeletePerSync = 500
 )
 
 // Controller ensures that all Job objects have corresponding pods to
@@ -139,6 +141,8 @@ func NewController(podInformer coreinformers.PodInformer, jobInformer batchinfor
 	jm.updateHandler = jm.updateJobStatus
 	jm.syncHandler = jm.syncJob
 
+	metrics.Register()
+
 	return jm
 }
 
@@ -440,7 +444,7 @@ func (jm *Controller) getPodsForJob(j *batch.Job) ([]*v1.Pod, error) {
 // syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
 // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
 // concurrently with the same key.
-func (jm *Controller) syncJob(key string) (bool, error) {
+func (jm *Controller) syncJob(key string) (forget bool, rErr error) {
 	startTime := time.Now()
 	defer func() {
 		klog.V(4).Infof("Finished syncing job %q (%v)", key, time.Since(startTime))
@@ -480,6 +484,21 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		return false, nil
 	}
 
+	completionMode := string(batch.NonIndexedCompletion)
+	if isIndexedJob(&job) {
+		completionMode = string(batch.IndexedCompletion)
+	}
+
+	defer func() {
+		result := "success"
+		if rErr != nil {
+			result = "error"
+		}
+
+		metrics.JobSyncDurationSeconds.WithLabelValues(completionMode, result).Observe(time.Since(startTime).Seconds())
+		metrics.JobSyncNum.WithLabelValues(completionMode, result).Inc()
+	}()
+
 	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
 	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
 	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
@@ -546,6 +565,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, v1.ConditionTrue, failureReason, failureMessage))
 		jobConditionsChanged = true
 		jm.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage)
+		metrics.JobFinishedNum.WithLabelValues(completionMode, "failed").Inc()
 	} else {
 		if jobNeedsSync && job.DeletionTimestamp == nil {
 			active, manageJobErr = jm.manageJob(&job, activePods, succeeded, pods)
@@ -581,6 +601,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 			now := metav1.Now()
 			job.Status.CompletionTime = &now
 			jm.recorder.Event(&job, v1.EventTypeNormal, "Completed", "Job completed")
+			metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
 		} else if utilfeature.DefaultFeatureGate.Enabled(features.SuspendJob) && manageJobCalled {
 			// Update the conditions / emit events only if manageJob was called in
 			// this syncJob. Otherwise wait for the right syncJob call to make
@@ -613,7 +634,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		}
 	}
 
-	forget := false
+	forget = false
 	// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
 	// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
 	// improve the Job backoff policy when parallelism > 1 and few Jobs failed but others succeed.
@@ -783,6 +804,9 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
 		rmAtLeast = 0
 	}
 	podsToDelete := activePodsForRemoval(job, activePods, int(rmAtLeast))
+	if len(podsToDelete) > maxPodCreateDeletePerSync {
+		podsToDelete = podsToDelete[:maxPodCreateDeletePerSync]
+	}
 	if len(podsToDelete) > 0 {
 		jm.expectations.ExpectDeletions(jobKey, len(podsToDelete))
 		klog.V(4).InfoS("Too many pods running for job", "job", klog.KObj(job), "deleted", len(podsToDelete), "target", parallelism)
@@ -803,6 +827,10 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
 		return active, nil
 	}
 
+	if diff > int32(maxPodCreateDeletePerSync) {
+		diff = int32(maxPodCreateDeletePerSync)
+	}
+
 	jm.expectations.ExpectCreations(jobKey, int(diff))
 	errCh := make(chan error, diff)
 	klog.V(4).Infof("Too few pods running job %q, need %d, creating %d", jobKey, wantActive, diff)
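For context, below is a minimal sketch of what the package imported above as "k8s.io/kubernetes/pkg/controller/job/metrics" could look like. Only the exported identifiers (JobSyncDurationSeconds, JobSyncNum, JobFinishedNum, Register) and the label values used by the diff ("success"/"error", "succeeded"/"failed", the completion mode) come from the diff itself; the metric names, subsystem, buckets, label keys, the registerOnce guard, and the use of plain prometheus/client_golang instead of k8s.io/component-base/metrics with legacyregistry are assumptions for illustration, not the actual Kubernetes implementation.

// Hypothetical sketch of the job controller metrics package referenced by the
// diff. Names marked "assumed" are illustrative, not taken from the source.
package metrics

import (
	"sync"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	// JobSyncDurationSeconds tracks how long each syncJob call took,
	// labeled by completion mode and result ("success" or "error").
	JobSyncDurationSeconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: "job_controller",            // assumed
			Name:      "job_sync_duration_seconds", // assumed
			Help:      "Duration of a Job controller sync in seconds.",
			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 15), // assumed
		},
		[]string{"completion_mode", "result"}, // label keys assumed
	)

	// JobSyncNum counts syncJob invocations with the same labels.
	JobSyncNum = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: "job_controller", // assumed
			Name:      "job_sync_total", // assumed
			Help:      "Number of Job controller syncs.",
		},
		[]string{"completion_mode", "result"},
	)

	// JobFinishedNum counts Jobs that reached a terminal state, labeled by
	// completion mode and "succeeded" or "failed".
	JobFinishedNum = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: "job_controller",     // assumed
			Name:      "job_finished_total", // assumed
			Help:      "Number of finished Jobs.",
		},
		[]string{"completion_mode", "result"},
	)

	registerOnce sync.Once
)

// Register registers the collectors exactly once, which is why the diff can
// call metrics.Register() unconditionally from NewController.
func Register() {
	registerOnce.Do(func() {
		prometheus.MustRegister(JobSyncDurationSeconds, JobSyncNum, JobFinishedNum)
	})
}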