@@ -54,9 +54,7 @@ import (
54
54
55
55
v1 "k8s.io/api/core/v1"
56
56
57
- "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources"
58
57
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources/genericresource"
59
- respod "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources/pod"
60
58
"k8s.io/apimachinery/pkg/labels"
61
59
62
60
arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
@@ -79,9 +77,9 @@ type XController struct {
79
77
80
78
appwrapperInformer arbinformers.AppWrapperInformer
81
79
// resources registered for the AppWrapper
82
- qjobRegisteredResources queuejobresources.RegisteredResources
80
+ // qjobRegisteredResources queuejobresources.RegisteredResources
83
81
// controllers for these resources
84
- qjobResControls map [arbv1.ResourceType ]queuejobresources.Interface
82
+ // qjobResControls map[arbv1.ResourceType]queuejobresources.Interface
85
83
86
84
// Captures all available resources in the cluster
87
85
genericresources * genericresource.GenericResources
@@ -140,9 +138,9 @@ type JobAndClusterAgent struct {
140
138
}
141
139
142
140
// RegisterAllQueueJobResourceTypes - registers all resources
143
- func RegisterAllQueueJobResourceTypes (regs * queuejobresources.RegisteredResources ) {
144
- respod .Register (regs )
145
- }
141
+ // func RegisterAllQueueJobResourceTypes(regs *queuejobresources.RegisteredResources) {
142
+ // respod.Register(regs)
143
+ // }
146
144
147
145
func GetQueueJobKey (obj interface {}) (string , error ) {
148
146
qj , ok := obj .(* arbv1.AppWrapper )
@@ -153,6 +151,47 @@ func GetQueueJobKey(obj interface{}) (string, error) {
153
151
return fmt .Sprintf ("%s/%s" , qj .Namespace , qj .Name ), nil
154
152
}
155
153
154
+ //UpdateQueueJobStatus was part of pod informer, this is now a method of queuejob_controller file.
155
+ //This change is done in an effort to simplify the controller and enable to move to controller runtime.
156
+ func (qjm * XController ) UpdateQueueJobStatus (queuejob * arbv1.AppWrapper ) error {
157
+
158
+ labelSelector := fmt .Sprintf ("%s=%s" , "appwrapper.mcad.ibm.com" , queuejob .Name )
159
+ pods , errt := qjm .clients .CoreV1 ().Pods ("" ).List (context .TODO (), metav1.ListOptions {LabelSelector : labelSelector })
160
+ if errt != nil {
161
+ return errt
162
+ }
163
+
164
+ running := int32 (FilterPods (pods .Items , v1 .PodRunning ))
165
+ podPhases := []v1.PodPhase {v1 .PodRunning , v1 .PodSucceeded }
166
+ totalResourcesConsumedForPodPhases := clusterstateapi .EmptyResource ()
167
+ for _ , phase := range podPhases {
168
+ totalResourcesConsumedForPodPhases .Add (GetPodResourcesByPhase (phase , pods .Items ))
169
+ }
170
+ pending := int32 (FilterPods (pods .Items , v1 .PodPending ))
171
+ succeeded := int32 (FilterPods (pods .Items , v1 .PodSucceeded ))
172
+ failed := int32 (FilterPods (pods .Items , v1 .PodFailed ))
173
+ podsConditionMap := PendingPodsFailedSchd (pods .Items )
174
+ klog .V (10 ).Infof ("[UpdateQueueJobStatus] There are %d pods of AppWrapper %s: pending %d, running %d, succeeded %d, failed %d, pendingpodsfailedschd %d, total resource consumed %v" ,
175
+ len (pods .Items ), queuejob .Name , pending , running , succeeded , failed , len (podsConditionMap ), totalResourcesConsumedForPodPhases )
176
+
177
+ queuejob .Status .Pending = pending
178
+ queuejob .Status .Running = running
179
+ queuejob .Status .Succeeded = succeeded
180
+ queuejob .Status .Failed = failed
181
+ // Total resources by all running pods
182
+ queuejob .Status .TotalGPU = int32 (totalResourcesConsumedForPodPhases .GPU )
183
+ queuejob .Status .TotalCPU = int32 (totalResourcesConsumedForPodPhases .MilliCPU )
184
+ queuejob .Status .TotalMemory = int32 (totalResourcesConsumedForPodPhases .Memory )
185
+
186
+ queuejob .Status .PendingPodConditions = nil
187
+ for podName , cond := range podsConditionMap {
188
+ podCond := GeneratePodFailedCondition (podName , cond )
189
+ queuejob .Status .PendingPodConditions = append (queuejob .Status .PendingPodConditions , podCond )
190
+ }
191
+
192
+ return nil
193
+ }
194
+
156
195
//allocatableCapacity calculates the capacity available on each node by subtracting resources
157
196
//consumed by existing pods.
158
197
//For a large cluster with thousands of nodes and hundreds of thousands of pods this
@@ -217,20 +256,20 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) *
217
256
218
257
cc .genericresources = genericresource .NewAppWrapperGenericResource (config )
219
258
220
- cc .qjobResControls = map [arbv1.ResourceType ]queuejobresources.Interface {}
221
- RegisterAllQueueJobResourceTypes (& cc .qjobRegisteredResources )
259
+ // cc.qjobResControls = map[arbv1.ResourceType]queuejobresources.Interface{}
260
+ // RegisterAllQueueJobResourceTypes(&cc.qjobRegisteredResources)
222
261
223
262
// initialize pod sub-resource control
224
- resControlPod , found , err := cc .qjobRegisteredResources .InitQueueJobResource (arbv1 .ResourceTypePod , config )
225
- if err != nil {
226
- klog .Errorf ("fail to create queuejob resource control" )
227
- return nil
228
- }
229
- if ! found {
230
- klog .Errorf ("queuejob resource type Pod not found" )
231
- return nil
232
- }
233
- cc .qjobResControls [arbv1 .ResourceTypePod ] = resControlPod
263
+ // resControlPod, found, err := cc.qjobRegisteredResources.InitQueueJobResource(arbv1.ResourceTypePod, config)
264
+ // if err != nil {
265
+ // klog.Errorf("fail to create queuejob resource control")
266
+ // return nil
267
+ // }
268
+ // if !found {
269
+ // klog.Errorf("queuejob resource type Pod not found")
270
+ // return nil
271
+ // }
272
+ // cc.qjobResControls[arbv1.ResourceTypePod] = resControlPod
234
273
235
274
appWrapperClient , err := clientset .NewForConfig (cc .config )
236
275
if err != nil {
@@ -816,7 +855,7 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust
816
855
817
856
}
818
857
819
- err := qjm .qjobResControls [ arbv1 . ResourceTypePod ]. UpdateQueueJobStatus (value )
858
+ err := qjm .UpdateQueueJobStatus (value )
820
859
if err != nil {
821
860
klog .Warningf ("[getAggAvaiResPri] Error updating pod status counts for AppWrapper job: %s, err=%+v" , value .Name , err )
822
861
}
@@ -843,7 +882,7 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust
843
882
klog .V (10 ).Infof ("[getAggAvaiResPri] Subtract all resources %+v in genericItem=%T for job %s which can-run is set to: %v but state is still pending." , qjv , genericItem , value .Name , value .Status .CanRun )
844
883
}
845
884
846
- err := qjm .qjobResControls [ arbv1 . ResourceTypePod ]. UpdateQueueJobStatus (value )
885
+ err := qjm .UpdateQueueJobStatus (value )
847
886
if err != nil {
848
887
klog .Warningf ("[getAggAvaiResPri] Error updating pod status counts for AppWrapper job: %s, err=%+v" , value .Name , err )
849
888
}
@@ -1458,7 +1497,7 @@ func (qjm *XController) backoff(ctx context.Context, q *arbv1.AppWrapper, reason
1458
1497
func (cc * XController ) Run (stopCh <- chan struct {}) {
1459
1498
go cc .appwrapperInformer .Informer ().Run (stopCh )
1460
1499
1461
- go cc .qjobResControls [arbv1 .ResourceTypePod ].Run (stopCh )
1500
+ // go cc.qjobResControls[arbv1.ResourceTypePod].Run(stopCh)
1462
1501
1463
1502
cache .WaitForCacheSync (stopCh , cc .appWrapperSynced )
1464
1503
@@ -1508,7 +1547,7 @@ func (qjm *XController) UpdateQueueJobs() {
1508
1547
}
1509
1548
}
1510
1549
if (newjob .Status .State == arbv1 .AppWrapperStateActive || newjob .Status .State == arbv1 .AppWrapperStateRunningHoldCompletion ) && containsCompletionStatus {
1511
- err := qjm .qjobResControls [ arbv1 . ResourceTypePod ]. UpdateQueueJobStatus (newjob )
1550
+ err := qjm .UpdateQueueJobStatus (newjob )
1512
1551
if err != nil {
1513
1552
klog .Errorf ("[UpdateQueueJobs] Error updating pod status counts for AppWrapper job: %s, err=%+v" , newjob .Name , err )
1514
1553
continue
@@ -1911,7 +1950,7 @@ func (cc *XController) syncQueueJob(ctx context.Context, qj *arbv1.AppWrapper) e
1911
1950
awNew := qj .DeepCopy ()
1912
1951
// we call sync to update pods running, pending,...
1913
1952
if qj .Status .State == arbv1 .AppWrapperStateActive {
1914
- err := cc .qjobResControls [ arbv1 . ResourceTypePod ]. UpdateQueueJobStatus (awNew )
1953
+ err := cc .UpdateQueueJobStatus (awNew )
1915
1954
if err != nil {
1916
1955
klog .Errorf ("[syncQueueJob] Error updating pod status counts for AppWrapper job: %s, err=%+v" , qj .Name , err )
1917
1956
return err
0 commit comments