Skip to content

Commit ee3c571

Browse files
authored
Enhanced TAS support (#319)
1. Add Annotations field to AppWrapperPodSet
2. Record Kind-specific TAS information via annotations
3. Fix replica inference for JobSet
4. Adjust sample replica counts to make TAS more interesting
1 parent 15b0463 commit ee3c571

File tree

8 files changed

+88
-29
lines changed

8 files changed

+88
-29
lines changed

Diff for: api/v1beta2/appwrapper_types.go

+5
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ type AppWrapperPodSet struct {
6464

6565
// Path is the path Component.Template to the PodTemplateSpec for this PodSet
6666
Path string `json:"path"`
67+
68+
// Annotations is an unstructured key value map that may be used to store and retrieve
69+
// arbitrary metadata about the PodSet to customize its treatment by the AppWrapper controller.
70+
//+optional
71+
Annotations map[string]string `json:"annotations,omitempty"`
6772
}
6873

6974
// AppWrapperPodSetInfo contains the data that Kueue wants to inject into an admitted PodSpecTemplate

Diff for: api/v1beta2/zz_generated.deepcopy.go

+7
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: config/crd/bases/workload.codeflare.dev_appwrappers.yaml

+14
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,13 @@ spec:
155155
description: AppWrapperPodSet describes an homogeneous set
156156
of pods
157157
properties:
158+
annotations:
159+
additionalProperties:
160+
type: string
161+
description: |-
162+
Annotations is an unstructured key value map that may be used to store and retrieve
163+
arbitrary metadata about the PodSet to customize its treatment by the AppWrapper controller.
164+
type: object
158165
path:
159166
description: Path is the path Component.Template to the
160167
PodTemplateSpec for this PodSet
@@ -280,6 +287,13 @@ spec:
280287
description: AppWrapperPodSet describes an homogeneous set
281288
of pods
282289
properties:
290+
annotations:
291+
additionalProperties:
292+
type: string
293+
description: |-
294+
Annotations is an unstructured key value map that may be used to store and retrieve
295+
arbitrary metadata about the PodSet to customize its treatment by the AppWrapper controller.
296+
type: object
283297
path:
284298
description: Path is the path Component.Template to the
285299
PodTemplateSpec for this PodSet

Diff for: go.mod

+6-6
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ go 1.23.0
55
require (
66
github.com/distribution/reference v0.5.0
77
github.com/go-logr/logr v1.4.2
8-
github.com/golangci/golangci-lint v1.60.1
8+
github.com/golangci/golangci-lint v1.63.4
9+
github.com/kubeflow/training-operator v1.8.1
910
github.com/onsi/ginkgo/v2 v2.22.0
1011
github.com/onsi/gomega v1.36.1
1112
github.com/open-policy-agent/cert-controller v0.12.0
@@ -17,6 +18,7 @@ require (
1718
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
1819
sigs.k8s.io/controller-runtime v0.19.3
1920
sigs.k8s.io/controller-tools v0.16.5
21+
sigs.k8s.io/jobset v0.7.1
2022
sigs.k8s.io/kueue v0.10.1
2123
sigs.k8s.io/kustomize/kustomize/v5 v5.5.0
2224
sigs.k8s.io/yaml v1.4.0
@@ -61,7 +63,6 @@ require (
6163
github.com/json-iterator/go v1.1.12 // indirect
6264
github.com/klauspost/compress v1.17.9 // indirect
6365
github.com/kubeflow/mpi-operator v0.6.0 // indirect
64-
github.com/kubeflow/training-operator v1.8.1 // indirect
6566
github.com/mailru/easyjson v0.7.7 // indirect
6667
github.com/mattn/go-colorable v0.1.13 // indirect
6768
github.com/mattn/go-isatty v0.0.20 // indirect
@@ -90,16 +91,16 @@ require (
9091
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
9192
go.uber.org/atomic v1.11.0 // indirect
9293
go.uber.org/multierr v1.11.0 // indirect
93-
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
94-
golang.org/x/mod v0.21.0 // indirect
94+
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect
95+
golang.org/x/mod v0.22.0 // indirect
9596
golang.org/x/net v0.33.0 // indirect
9697
golang.org/x/oauth2 v0.21.0 // indirect
9798
golang.org/x/sync v0.10.0 // indirect
9899
golang.org/x/sys v0.28.0 // indirect
99100
golang.org/x/term v0.27.0 // indirect
100101
golang.org/x/text v0.21.0 // indirect
101102
golang.org/x/time v0.6.0 // indirect
102-
golang.org/x/tools v0.26.0 // indirect
103+
golang.org/x/tools v0.28.0 // indirect
103104
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
104105
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect
105106
google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect
@@ -116,7 +117,6 @@ require (
116117
k8s.io/klog/v2 v2.130.1 // indirect
117118
k8s.io/kube-openapi v0.0.0-20240812233141-91dab695df6f // indirect
118119
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 // indirect
119-
sigs.k8s.io/jobset v0.7.1 // indirect
120120
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
121121
sigs.k8s.io/kustomize/api v0.18.0 // indirect
122122
sigs.k8s.io/kustomize/kyaml v0.18.1 // indirect

Diff for: go.sum

+12-12
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l
5656
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
5757
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
5858
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
59-
github.com/golangci/golangci-lint v1.60.1 h1:DRKNqNTQRLBJZ1il5u4fvgLQCjQc7QFs0DbhksJtVJE=
60-
github.com/golangci/golangci-lint v1.60.1/go.mod h1:jDIPN1rYaIA+ijp9OZcUmUCoQOtZ76pOlFbi15FlLJY=
59+
github.com/golangci/golangci-lint v1.63.4 h1:bJQFQ3hSfUto597dkL7ipDzOxsGEpiWdLiZ359OWOBI=
60+
github.com/golangci/golangci-lint v1.63.4/go.mod h1:Hx0B7Lg5/NXbaOHem8+KU+ZUIzMI6zNj/7tFwdnn10I=
6161
github.com/google/cel-go v0.20.1 h1:nDx9r8S3L4pE61eDdt8igGj8rf5kjYR3ILxWIpWNi84=
6262
github.com/google/cel-go v0.20.1/go.mod h1:kWcIzTsPX0zmQ+H3TirHstLLf9ep5QTsZBN9u4dOYLg=
6363
github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I=
@@ -143,8 +143,8 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg
143143
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
144144
github.com/ray-project/kuberay/ray-operator v1.2.2 h1:wj4qe9SmJfD1ubgEaVPuAsnU/WFDvremzR8j3JslBdk=
145145
github.com/ray-project/kuberay/ray-operator v1.2.2/go.mod h1:osTiIyaDoWi5IN1f0tOOtZ4TzVf+5kJXZor8VFvcEiI=
146-
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
147-
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
146+
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
147+
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
148148
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
149149
github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ=
150150
github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
@@ -166,8 +166,8 @@ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
166166
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
167167
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
168168
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
169-
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
170-
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
169+
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
170+
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
171171
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
172172
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
173173
github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ=
@@ -201,12 +201,12 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
201201
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
202202
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
203203
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
204-
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
205-
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
204+
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk=
205+
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY=
206206
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
207207
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
208-
golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0=
209-
golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
208+
golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4=
209+
golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
210210
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
211211
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
212212
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
@@ -240,8 +240,8 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm
240240
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
241241
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
242242
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
243-
golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ=
244-
golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0=
243+
golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8=
244+
golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw=
245245
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
246246
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
247247
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

Diff for: pkg/utils/utils.go

+40-8
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,14 @@ import (
2929

3030
dockerref "github.com/distribution/reference"
3131

32+
kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
33+
batchv1 "k8s.io/api/batch/v1"
3234
v1 "k8s.io/api/core/v1"
3335
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
3436
"k8s.io/apimachinery/pkg/runtime"
3537
"k8s.io/apimachinery/pkg/runtime/schema"
3638
"k8s.io/utils/ptr"
39+
jobsetapi "sigs.k8s.io/jobset/api/jobset/v1alpha2"
3740

3841
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
3942
"sigs.k8s.io/kueue/pkg/podset"
@@ -43,6 +46,12 @@ import (
4346

4447
const templateString = "template"
4548

49+
const (
50+
PodSetAnnotationTASPodIndexLabel = "workload.codeflare.dev.appwrapper/tas-pod-index-label"
51+
PodSetAnnotationTASSubGroupIndexLabel = "workload.codeflare.dev.appwrapper/tas-sub-group-index-label"
52+
PodSetAnnotationTASSubGroupCount = "workload.codeflare.dev.appwrapper/tas-sub-group-count"
53+
)
54+
4655
// GetPodTemplateSpec extracts a Kueue-compatible PodTemplateSpec at the given path within obj
4756
func GetPodTemplateSpec(obj *unstructured.Unstructured, path string) (*v1.PodTemplateSpec, error) {
4857
candidatePTS, err := GetRawTemplate(obj.UnstructuredContent(), path)
@@ -490,7 +499,13 @@ func InferPodSets(obj *unstructured.Unstructured) ([]workloadv1beta2.AppWrapperP
490499
if completions, err := GetReplicas(obj, "template.spec.completions"); err == nil && completions < replicas {
491500
replicas = completions
492501
}
493-
podSets = append(podSets, workloadv1beta2.AppWrapperPodSet{Replicas: ptr.To(replicas), Path: "template.spec.template"})
502+
podSets = append(podSets, workloadv1beta2.AppWrapperPodSet{
503+
Replicas: ptr.To(replicas),
504+
Path: "template.spec.template",
505+
Annotations: map[string]string{
506+
PodSetAnnotationTASPodIndexLabel: batchv1.JobCompletionIndexAnnotation,
507+
},
508+
})
494509

495510
case schema.GroupVersionKind{Group: "jobset.x-k8s.io", Version: "v1alpha2", Kind: "JobSet"}:
496511
if jobs, err := getValueAtPath(obj.UnstructuredContent(), "template.spec.replicatedJobs"); err == nil {
@@ -499,15 +514,26 @@ func InferPodSets(obj *unstructured.Unstructured) ([]workloadv1beta2.AppWrapperP
499514
jobSpecPrefix := fmt.Sprintf("template.spec.replicatedJobs[%v].", i)
500515
// validate path to replica template
501516
if _, err := getValueAtPath(obj.UnstructuredContent(), jobSpecPrefix+"template"); err == nil {
502-
var replicas int32 = 1
517+
var podCount int32 = 1
503518
if parallelism, err := GetReplicas(obj, jobSpecPrefix+"template.spec.parallelism"); err == nil {
504-
replicas = parallelism
519+
podCount = parallelism
505520
}
506-
if completions, err := GetReplicas(obj, jobSpecPrefix+"template.spec.completions"); err == nil && completions < replicas {
507-
replicas = completions
521+
if completions, err := GetReplicas(obj, jobSpecPrefix+"template.spec.completions"); err == nil && completions < podCount {
522+
podCount = completions
523+
}
524+
var replicas int32 = 1
525+
if r, err := GetReplicas(obj, jobSpecPrefix+"replicas"); err == nil {
526+
replicas = r
508527
}
509-
// infer replica count
510-
podSets = append(podSets, workloadv1beta2.AppWrapperPodSet{Replicas: ptr.To(replicas), Path: jobSpecPrefix + "template.spec.template"})
528+
podSets = append(podSets, workloadv1beta2.AppWrapperPodSet{
529+
Replicas: ptr.To(replicas * podCount),
530+
Path: jobSpecPrefix + "template.spec.template",
531+
Annotations: map[string]string{
532+
PodSetAnnotationTASPodIndexLabel: batchv1.JobCompletionIndexAnnotation,
533+
PodSetAnnotationTASSubGroupIndexLabel: jobsetapi.JobIndexKey,
534+
PodSetAnnotationTASSubGroupCount: strconv.Itoa(int(replicas)),
535+
},
536+
})
511537
}
512538
}
513539
}
@@ -523,7 +549,13 @@ func InferPodSets(obj *unstructured.Unstructured) ([]workloadv1beta2.AppWrapperP
523549
if err != nil {
524550
return nil, err
525551
}
526-
podSets = append(podSets, workloadv1beta2.AppWrapperPodSet{Replicas: ptr.To(replicas), Path: prefix + templateString})
552+
podSets = append(podSets, workloadv1beta2.AppWrapperPodSet{
553+
Replicas: ptr.To(replicas),
554+
Path: prefix + templateString,
555+
Annotations: map[string]string{
556+
PodSetAnnotationTASPodIndexLabel: kftraining.ReplicaIndexLabel,
557+
},
558+
})
527559
}
528560
}
529561

Diff for: samples/wrapped-jobset.yaml

+3-2
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,11 @@ spec:
1414
spec:
1515
replicatedJobs:
1616
- name: workers
17+
replicas: 2
1718
template:
1819
spec:
19-
parallelism: 4
20-
completions: 4
20+
parallelism: 2
21+
completions: 2
2122
backoffLimit: 0
2223
template:
2324
spec:

Diff for: samples/wrapped-pytorch-job.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ spec:
2929
requests:
3030
cpu: 1
3131
Worker:
32-
replicas: 1
32+
replicas: 2
3333
restartPolicy: OnFailure
3434
template:
3535
spec:

0 commit comments

Comments
 (0)