kubernetes
diff --git a/‎api/openapi-spec/swagger.json
+1-1 b/‎api/openapi-spec/swagger.json
+1-1
diff --git a/‎pkg/apis/batch/types.go
+4-2 b/‎pkg/apis/batch/types.go
+4-2
diff --git a/‎pkg/apis/batch/validation/validation.go
+10 b/‎pkg/apis/batch/validation/validation.go
+10
diff --git a/‎pkg/controller/job/indexed_job_utils.go
+3-3 b/‎pkg/controller/job/indexed_job_utils.go
+3-3
diff --git a/‎pkg/controller/job/indexed_job_utils_test.go
+2-2 b/‎pkg/controller/job/indexed_job_utils_test.go
+2-2
diff --git a/‎pkg/controller/job/job_controller.go
+31-3 b/‎pkg/controller/job/job_controller.go
+31-3
diff --git a/‎pkg/controller/job/job_controller_test.go
+2-2 b/‎pkg/controller/job/job_controller_test.go
+2-2
diff --git a/‎pkg/controller/job/metrics/metrics.go
+75 b/‎pkg/controller/job/metrics/metrics.go
+75
diff --git a/‎pkg/features/kube_features.go
+2-1 b/‎pkg/features/kube_features.go
+2-1
diff --git a/‎staging/src/k8s.io/api/batch/v1/generated.proto
+4-2 b/‎staging/src/k8s.io/api/batch/v1/generated.proto
+4-2
diff --git a/‎staging/src/k8s.io/api/batch/v1/types.go
+5-3 b/‎staging/src/k8s.io/api/batch/v1/types.go
+5-3
@@ -183,9 +183,11 @@ type JobSpec struct {
 	// for each index.
 	// When value is `Indexed`, .spec.completions must be specified and
 	// `.spec.parallelism` must be less than or equal to 10^5.
+	// In addition, The Pod name takes the form
+	// `$(job-name)-$(index)-$(random-string)`,
+	// the Pod hostname takes the form `$(job-name)-$(index)`.
 	//
-	// This field is alpha-level and is only honored by servers that enable the
-	// IndexedJob feature gate. More completion modes can be added in the future.
+	// This field is beta-level. More completion modes can be added in the future.
 	// If the Job controller observes a mode that it doesn't recognize, the
 	// controller skips updates for the Job.
 	// +optional
 
@@ -88,6 +88,16 @@ func ValidateJob(job *batch.Job, opts apivalidation.PodValidationOptions) field.
 	allErrs := apivalidation.ValidateObjectMeta(&job.ObjectMeta, true, apivalidation.ValidateReplicationControllerName, field.NewPath("metadata"))
 	allErrs = append(allErrs, ValidateGeneratedSelector(job)...)
 	allErrs = append(allErrs, ValidateJobSpec(&job.Spec, field.NewPath("spec"), opts)...)
+	if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion && job.Spec.Completions != nil && *job.Spec.Completions > 0 {
+		// For indexed job, the job controller appends a suffix (`-$INDEX`)
+		// to the pod hostname when indexed job create pods.
+		// The index could be maximum `.spec.completions-1`
+		// If we don't validate this here, the indexed job will fail to create pods later.
+		maximumPodHostname := fmt.Sprintf("%s-%d", job.ObjectMeta.Name, *job.Spec.Completions-1)
+		if errs := apimachineryvalidation.IsDNS1123Label(maximumPodHostname); len(errs) > 0 {
+			allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.ObjectMeta.Name, fmt.Sprintf("will not able to create pod with invalid DNS label: %s", maximumPodHostname)))
+		}
+	}
 	return allErrs
 }
 
 
@@ -170,7 +170,7 @@ func getCompletionIndex(annotations map[string]string) int {
 	if annotations == nil {
 		return unknownCompletionIndex
 	}
-	v, ok := annotations[batch.JobCompletionIndexAnnotationAlpha]
+	v, ok := annotations[batch.JobCompletionIndexAnnotation]
 	if !ok {
 		return unknownCompletionIndex
 	}
@@ -203,7 +203,7 @@ func addCompletionIndexEnvVariable(container *v1.Container) {
 		Name: completionIndexEnvName,
 		ValueFrom: &v1.EnvVarSource{
 			FieldRef: &v1.ObjectFieldSelector{
-				FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotationAlpha),
+				FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation),
 			},
 		},
 	})
@@ -213,7 +213,7 @@ func addCompletionIndexAnnotation(template *v1.PodTemplateSpec, index int) {
 	if template.Annotations == nil {
 		template.Annotations = make(map[string]string, 1)
 	}
-	template.Annotations[batch.JobCompletionIndexAnnotationAlpha] = strconv.Itoa(index)
+	template.Annotations[batch.JobCompletionIndexAnnotation] = strconv.Itoa(index)
 }
 
 type byCompletionIndex []*v1.Pod
 
@@ -279,7 +279,7 @@ func hollowPodsWithIndexPhase(descs []indexPhase) []*v1.Pod {
 		}
 		if desc.Index != noIndex {
 			p.Annotations = map[string]string{
-				batch.JobCompletionIndexAnnotationAlpha: desc.Index,
+				batch.JobCompletionIndexAnnotation: desc.Index,
 			}
 		}
 		pods = append(pods, p)
@@ -297,7 +297,7 @@ func toIndexPhases(pods []*v1.Pod) []indexPhase {
 	for i, p := range pods {
 		index := noIndex
 		if p.Annotations != nil {
-			index = p.Annotations[batch.JobCompletionIndexAnnotationAlpha]
+			index = p.Annotations[batch.JobCompletionIndexAnnotation]
 		}
 		result[i] = indexPhase{index, p.Status.Phase}
 	}
 
@@ -47,6 +47,7 @@ import (
 	"k8s.io/component-base/metrics/prometheus/ratelimiter"
 	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/pkg/controller"
+	"k8s.io/kubernetes/pkg/controller/job/metrics"
 	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/utils/integer"
 )
@@ -60,7 +61,8 @@ var (
 	// DefaultJobBackOff is the default backoff period, exported for the e2e test
 	DefaultJobBackOff = 10 * time.Second
 	// MaxJobBackOff is the max backoff period, exported for the e2e test
-	MaxJobBackOff = 360 * time.Second
+	MaxJobBackOff             = 360 * time.Second
+	maxPodCreateDeletePerSync = 500
 )
 
 // Controller ensures that all Job objects have corresponding pods to
@@ -139,6 +141,8 @@ func NewController(podInformer coreinformers.PodInformer, jobInformer batchinfor
 	jm.updateHandler = jm.updateJobStatus
 	jm.syncHandler = jm.syncJob
 
+	metrics.Register()
+
 	return jm
 }
 
@@ -440,7 +444,7 @@ func (jm *Controller) getPodsForJob(j *batch.Job) ([]*v1.Pod, error) {
 // syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
 // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
 // concurrently with the same key.
-func (jm *Controller) syncJob(key string) (bool, error) {
+func (jm *Controller) syncJob(key string) (forget bool, rErr error) {
 	startTime := time.Now()
 	defer func() {
 		klog.V(4).Infof("Finished syncing job %q (%v)", key, time.Since(startTime))
@@ -480,6 +484,21 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		return false, nil
 	}
 
+	completionMode := string(batch.NonIndexedCompletion)
+	if isIndexedJob(&job) {
+		completionMode = string(batch.IndexedCompletion)
+	}
+
+	defer func() {
+		result := "success"
+		if rErr != nil {
+			result = "error"
+		}
+
+		metrics.JobSyncDurationSeconds.WithLabelValues(completionMode, result).Observe(time.Since(startTime).Seconds())
+		metrics.JobSyncNum.WithLabelValues(completionMode, result).Inc()
+	}()
+
 	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
 	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
 	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
@@ -546,6 +565,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, v1.ConditionTrue, failureReason, failureMessage))
 		jobConditionsChanged = true
 		jm.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage)
+		metrics.JobFinishedNum.WithLabelValues(completionMode, "failed").Inc()
 	} else {
 		if jobNeedsSync && job.DeletionTimestamp == nil {
 			active, manageJobErr = jm.manageJob(&job, activePods, succeeded, pods)
@@ -581,6 +601,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 			now := metav1.Now()
 			job.Status.CompletionTime = &now
 			jm.recorder.Event(&job, v1.EventTypeNormal, "Completed", "Job completed")
+			metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
 		} else if utilfeature.DefaultFeatureGate.Enabled(features.SuspendJob) && manageJobCalled {
 			// Update the conditions / emit events only if manageJob was called in
 			// this syncJob. Otherwise wait for the right syncJob call to make
@@ -613,7 +634,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		}
 	}
 
-	forget := false
+	forget = false
 	// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
 	// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
 	// improve the Job backoff policy when parallelism > 1 and few Jobs failed but others succeed.
@@ -783,6 +804,9 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
 		rmAtLeast = 0
 	}
 	podsToDelete := activePodsForRemoval(job, activePods, int(rmAtLeast))
+	if len(podsToDelete) > maxPodCreateDeletePerSync {
+		podsToDelete = podsToDelete[:maxPodCreateDeletePerSync]
+	}
 	if len(podsToDelete) > 0 {
 		jm.expectations.ExpectDeletions(jobKey, len(podsToDelete))
 		klog.V(4).InfoS("Too many pods running for job", "job", klog.KObj(job), "deleted", len(podsToDelete), "target", parallelism)
@@ -803,6 +827,10 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
 			return active, nil
 		}
 
+		if diff > int32(maxPodCreateDeletePerSync) {
+			diff = int32(maxPodCreateDeletePerSync)
+		}
+
 		jm.expectations.ExpectCreations(jobKey, int(diff))
 		errCh := make(chan error, diff)
 		klog.V(4).Infof("Too few pods running job %q, need %d, creating %d", jobKey, wantActive, diff)
 
@@ -149,7 +149,7 @@ func setPodsStatusesWithIndexes(podIndexer cache.Indexer, job *batch.Job, status
 		p.Status = v1.PodStatus{Phase: s.Phase}
 		if s.Index != noIndex {
 			p.Annotations = map[string]string{
-				batch.JobCompletionIndexAnnotationAlpha: s.Index,
+				batch.JobCompletionIndexAnnotation: s.Index,
 			}
 		}
 		podIndexer.Add(p)
@@ -2176,7 +2176,7 @@ func checkJobCompletionEnvVariable(t *testing.T, spec *v1.PodSpec) {
 			Name: "JOB_COMPLETION_INDEX",
 			ValueFrom: &v1.EnvVarSource{
 				FieldRef: &v1.ObjectFieldSelector{
-					FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotationAlpha),
+					FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation),
 				},
 			},
 		},
 
@@ -0,0 +1,75 @@
+/*
+Copyright 2021 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"sync"
+
+	"k8s.io/component-base/metrics"
+	"k8s.io/component-base/metrics/legacyregistry"
+)
+
+// JobControllerSubsystem - subsystem name used for this controller.
+const JobControllerSubsystem = "job_controller"
+
+var (
+	// JobSyncDurationSeconds tracks the latency of job syncs as
+	// completion_mode = Indexed / NonIndexed and result = success / error.
+	JobSyncDurationSeconds = metrics.NewHistogramVec(
+		&metrics.HistogramOpts{
+			Subsystem:      JobControllerSubsystem,
+			Name:           "job_sync_duration_seconds",
+			Help:           "The time it took to sync a job",
+			StabilityLevel: metrics.ALPHA,
+			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
+		},
+		[]string{"completion_mode", "result"},
+	)
+	// JobSyncNum tracks the number of job syncs as
+	// completion_mode = Indexed / NonIndexed and result = success / error.
+	JobSyncNum = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      JobControllerSubsystem,
+			Name:           "job_sync_total",
+			Help:           "The number of job syncs",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"completion_mode", "result"},
+	)
+	// JobFinishedNum tracks the number of jobs that finish as
+	// completion_mode = Indexed / NonIndexed and result = failed / succeeded.
+	JobFinishedNum = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      JobControllerSubsystem,
+			Name:           "job_finished_total",
+			Help:           "The number of finished job",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"completion_mode", "result"},
+	)
+)
+
+var registerMetrics sync.Once
+
+// Register registers Job controller metrics.
+func Register() {
+	registerMetrics.Do(func() {
+		legacyregistry.MustRegister(JobSyncDurationSeconds)
+		legacyregistry.MustRegister(JobSyncNum)
+		legacyregistry.MustRegister(JobFinishedNum)
+	})
+}
@@ -293,6 +293,7 @@ const (
 
 	// owner: @alculquicondor
 	// alpha: v1.21
+	// beta: v1.22
 	//
 	// Allows Job controller to manage Pod completions per completion index.
 	IndexedJob featuregate.Feature = "IndexedJob"
@@ -779,7 +780,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
 	NetworkPolicyEndPort:                           {Default: false, PreRelease: featuregate.Alpha},
 	ProcMountType:                                  {Default: false, PreRelease: featuregate.Alpha},
 	TTLAfterFinished:                               {Default: true, PreRelease: featuregate.Beta},
-	IndexedJob:                                     {Default: false, PreRelease: featuregate.Alpha},
+	IndexedJob:                                     {Default: true, PreRelease: featuregate.Beta},
 	KubeletPodResources:                            {Default: true, PreRelease: featuregate.Beta},
 	LocalStorageCapacityIsolationFSQuotaMonitoring: {Default: false, PreRelease: featuregate.Alpha},
 	NonPreemptingPriority:                          {Default: true, PreRelease: featuregate.Beta},
 
@@ -21,7 +21,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-const JobCompletionIndexAnnotationAlpha = "batch.kubernetes.io/job-completion-index"
+const JobCompletionIndexAnnotation = "batch.kubernetes.io/job-completion-index"
 
 // +genclient
 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
@@ -162,9 +162,11 @@ type JobSpec struct {
 	// for each index.
 	// When value is `Indexed`, .spec.completions must be specified and
 	// `.spec.parallelism` must be less than or equal to 10^5.
+	// In addition, The Pod name takes the form
+	// `$(job-name)-$(index)-$(random-string)`,
+	// the Pod hostname takes the form `$(job-name)-$(index)`.
 	//
-	// This field is alpha-level and is only honored by servers that enable the
-	// IndexedJob feature gate. More completion modes can be added in the future.
+	// This field is beta-level. More completion modes can be added in the future.
 	// If the Job controller observes a mode that it doesn't recognize, the
 	// controller skips updates for the Job.
 	// +optional
Original file line number	Diff line number	Diff line change
`@@ -170,7 +170,7 @@ func getCompletionIndex(annotations map[string]string) int {`
`170`	`170`	`if annotations == nil {`
`171`	`171`	`return unknownCompletionIndex`
`172`	`172`	`}`
`173`		`- v, ok := annotations[batch.JobCompletionIndexAnnotationAlpha]`
	`173`	`+ v, ok := annotations[batch.JobCompletionIndexAnnotation]`
`174`	`174`	`if !ok {`
`175`	`175`	`return unknownCompletionIndex`
`176`	`176`	`}`
`@@ -203,7 +203,7 @@ func addCompletionIndexEnvVariable(container *v1.Container) {`
`203`	`203`	`Name: completionIndexEnvName,`
`204`	`204`	`ValueFrom: &v1.EnvVarSource{`
`205`	`205`	`FieldRef: &v1.ObjectFieldSelector{`
`206`		`- FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotationAlpha),`
	`206`	`+ FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation),`
`207`	`207`	`},`
`208`	`208`	`},`
`209`	`209`	`})`
`@@ -213,7 +213,7 @@ func addCompletionIndexAnnotation(template *v1.PodTemplateSpec, index int) {`
`213`	`213`	`if template.Annotations == nil {`
`214`	`214`	`template.Annotations = make(map[string]string, 1)`
`215`	`215`	`}`
`216`		`- template.Annotations[batch.JobCompletionIndexAnnotationAlpha] = strconv.Itoa(index)`
	`216`	`+ template.Annotations[batch.JobCompletionIndexAnnotation] = strconv.Itoa(index)`
`217`	`217`	`}`
`218`	`218`
`219`	`219`	`type byCompletionIndex []*v1.Pod`
Original file line number	Diff line number	Diff line change
`@@ -279,7 +279,7 @@ func hollowPodsWithIndexPhase(descs []indexPhase) []*v1.Pod {`
`279`	`279`	`}`
`280`	`280`	`if desc.Index != noIndex {`
`281`	`281`	`p.Annotations = map[string]string{`
`282`		`- batch.JobCompletionIndexAnnotationAlpha: desc.Index,`
	`282`	`+ batch.JobCompletionIndexAnnotation: desc.Index,`
`283`	`283`	`}`
`284`	`284`	`}`
`285`	`285`	`pods = append(pods, p)`
`@@ -297,7 +297,7 @@ func toIndexPhases(pods []*v1.Pod) []indexPhase {`
`297`	`297`	`for i, p := range pods {`
`298`	`298`	`index := noIndex`
`299`	`299`	`if p.Annotations != nil {`
`300`		`- index = p.Annotations[batch.JobCompletionIndexAnnotationAlpha]`
	`300`	`+ index = p.Annotations[batch.JobCompletionIndexAnnotation]`
`301`	`301`	`}`
`302`	`302`	`result[i] = indexPhase{index, p.Status.Phase}`
`303`	`303`	`}`