diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index 408404ebe..4349ff1cc 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -42,8 +42,8 @@ const AppWrapperPlural string = "appwrappers" // which AppWrapper it belongs to. const AppWrapperAnnotationKey = "appwrapper.mcad.ibm.com/appwrapper-name" -//+kubebuilder:object:root=true -//+kubebuilder:subresource:status +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status // Definition of AppWrapper class // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -54,7 +54,7 @@ type AppWrapper struct { Status AppWrapperStatus `json:"status,omitempty"` } -//+kubebuilder:object:root=true +// +kubebuilder:object:root=true // AppWrapperList is a collection of AppWrappers. // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -89,8 +89,6 @@ type AppWrapperResourceList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata"` - // +optional - Items []AppWrapperResource `json:"Items"` // +optional GenericItems []AppWrapperGenericResource `json:"GenericItems"` } @@ -100,42 +98,7 @@ type AppWrapperService struct { Spec v1.ServiceSpec `json:"spec"` } -// AppWrapperResource is App Wrapper aggregation resource -// todo: To be depricated -type AppWrapperResource struct { - metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata"` - // Replicas is the number of desired replicas - Replicas int32 `json:"replicas,omitempty" protobuf:"bytes,2,opt,name=replicas"` - - // The minimal available pods to run for this AppWrapper; the default value is nil - MinAvailable *int32 `json:"minavailable,omitempty" protobuf:"bytes,3,opt,name=minavailable"` - - // The number of allocated replicas from this resource type - // +optional - AllocatedReplicas int32 `json:"allocatedreplicas"` - - // +kubebuilder:validation:Type=number - // +kubebuilder:validation:Format=float - // +optional - Priority float64 `json:"priority,omitempty"` - - // The increasing rate of priority value for this resource - // +kubebuilder:validation:Type=number - // +kubebuilder:validation:Format=float - PrioritySlope float64 `json:"priorityslope"` - - //The type of the resource (is the resource a Pod, a ReplicaSet, a ... ?) - // +optional - Type ResourceType `json:"type"` - - //The template for the resource; it is now a raw text because we don't know for what resource - //it should be instantiated - // +kubebuilder:pruning:PreserveUnknownFields - Template runtime.RawExtension `json:"template"` -} - -// AppWrapperResource is App Wrapper aggregation resource +// AppWrapperGenericResource is App Wrapper aggregation resource type AppWrapperGenericResource struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata"` @@ -161,25 +124,25 @@ type AppWrapperGenericResource struct { // +kubebuilder:validation:Format=float PrioritySlope float64 `json:"priorityslope"` - //The template for the resource; it is now a raw text because we don't know for what resource - //it should be instantiated + // The template for the resource; it is now a raw text because we don't know for what resource + // it should be instantiated // +optional // +kubebuilder:pruning:PreserveUnknownFields // +kubebuilder:validation:EmbeddedResource GenericTemplate runtime.RawExtension `json:"generictemplate"` - //Optional section that specifies resource requirements for non-standard k8s resources, follows same format as that + // Optional section that specifies resource requirements for non-standard k8s resources, follows same format as that // of standard k8s resources CustomPodResources []CustomPodResourceTemplate `json:"custompodresources,omitempty"` - //Optional field for users to determine completion status of item + // Optional field for users to determine completion status of item CompletionStatus string `json:"completionstatus,omitempty"` } type CustomPodResourceTemplate struct { Replicas int `json:"replicas"` - //todo: replace with - //Containers []Container Contain v1.ResourceRequirements + // todo: replace with + // Containers []Container Contain v1.ResourceRequirements Requests v1.ResourceList `json:"requests"` // +optional @@ -190,17 +153,7 @@ type CustomPodResourceTemplate struct { type ResourceType string const ( - ResourceTypePod ResourceType = "Pod" - ResourceTypeService ResourceType = "Service" - ResourceTypeSecret ResourceType = "Secret" - ResourceTypeStatefulSet ResourceType = "StatefulSet" - ResourceTypeDeployment ResourceType = "Deployment" - ResourceTypeReplicaSet ResourceType = "ReplicaSet" - ResourceTypePersistentVolume ResourceType = "PersistentVolume" - ResourceTypePersistentVolumeClaim ResourceType = "PersistentVolumeClaim" - ResourceTypeNamespace ResourceType = "Namespace" - ResourceTypeConfigMap ResourceType = "ConfigMap" - ResourceTypeNetworkPolicy ResourceType = "NetworkPolicy" + ResourceTypePod ResourceType = "Pod" ) // AppWrapperStatus represents the current state of a AppWrapper @@ -224,13 +177,13 @@ type AppWrapperStatus struct { // +optional MinAvailable int32 `json:"template,omitempty" protobuf:"bytes,4,opt,name=template"` - //Can run? + // Can run? CanRun bool `json:"canrun,omitempty" protobuf:"bytes,1,opt,name=canrun"` - //Is Dispatched? + // Is Dispatched? IsDispatched bool `json:"isdispatched,omitempty" protobuf:"bytes,1,opt,name=isdispatched"` - //State - Pending, Running, Failed, Deleted + // State - Pending, Running, Failed, Deleted State AppWrapperState `json:"state,omitempty"` Message string `json:"message,omitempty"` diff --git a/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go b/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go index 7af3a4dbf..056c6e604 100644 --- a/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go @@ -152,42 +152,11 @@ func (in *AppWrapperList) DeepCopyObject() runtime.Object { return nil } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AppWrapperResource) DeepCopyInto(out *AppWrapperResource) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - if in.MinAvailable != nil { - in, out := &in.MinAvailable, &out.MinAvailable - *out = new(int32) - **out = **in - } - in.Template.DeepCopyInto(&out.Template) - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapperResource. -func (in *AppWrapperResource) DeepCopy() *AppWrapperResource { - if in == nil { - return nil - } - out := new(AppWrapperResource) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AppWrapperResourceList) DeepCopyInto(out *AppWrapperResourceList) { *out = *in out.TypeMeta = in.TypeMeta in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]AppWrapperResource, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } if in.GenericItems != nil { in, out := &in.GenericItems, &out.GenericItems *out = make([]AppWrapperGenericResource, len(*in)) diff --git a/pkg/controller/queuejob/queuejob_controller.go b/pkg/controller/queuejob/queuejob_controller.go deleted file mode 100644 index 10a8ada63..000000000 --- a/pkg/controller/queuejob/queuejob_controller.go +++ /dev/null @@ -1,430 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package queuejob - -import ( - "context" - "fmt" - "sync" - "time" - - v1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/informers" - coreinformers "k8s.io/client-go/informers/core/v1" - "k8s.io/client-go/kubernetes" - corelisters "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" - "k8s.io/klog/v2" - - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/utils" - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned/clients" - arbinformers "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/informers/controller-externalversion" - informersv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/informers/controller-externalversion/v1" - listersv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/listers/controller/v1" -) - -const ( - // QueueJobLabel label string for queuejob name - QueueJobLabel string = "queuejob.kube-mcad.ibm.com" -) - -// Controller the QueueJob Controller type -type Controller struct { - config *rest.Config - queueJobInformer informersv1.QueueJobInformer - podInformer coreinformers.PodInformer - clients *kubernetes.Clientset - arbclients *clientset.Clientset - - // A store of jobs - queueJobLister listersv1.QueueJobLister - queueJobSynced func() bool - - // A store of pods, populated by the podController - podListr corelisters.PodLister - podSynced func() bool - - // eventQueue that need to sync up - eventQueue *cache.FIFO -} - -// NewQueueJobController create new QueueJob Controller -func NewQueueJobController(config *rest.Config) *Controller { - cc := &Controller{ - config: config, - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - eventQueue: cache.NewFIFO(eventKey), - } - - queueJobClient, _, err := clients.NewClient(cc.config) - if err != nil { - panic(err) - } - - cc.queueJobInformer = arbinformers.NewSharedInformerFactory(queueJobClient, 0).QueueJob().QueueJobs() - cc.queueJobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: cc.addQueueJob, - UpdateFunc: cc.updateQueueJob, - DeleteFunc: cc.deleteQueueJob, - }) - cc.queueJobLister = cc.queueJobInformer.Lister() - cc.queueJobSynced = cc.queueJobInformer.Informer().HasSynced - - cc.podInformer = informers.NewSharedInformerFactory(cc.clients, 0).Core().V1().Pods() - cc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: cc.addPod, - UpdateFunc: cc.updatePod, - DeleteFunc: cc.deletePod, - }) - - cc.podListr = cc.podInformer.Lister() - cc.podSynced = cc.podInformer.Informer().HasSynced - - return cc -} - -// Run start QueueJob Controller -func (cc *Controller) Run(stopCh chan struct{}) { - // initialized - createQueueJobKind(cc.config) - - go cc.queueJobInformer.Informer().Run(stopCh) - go cc.podInformer.Informer().Run(stopCh) - - cache.WaitForCacheSync(stopCh, cc.queueJobSynced, cc.podSynced) - - go wait.Until(cc.worker, time.Second, stopCh) -} - -func (cc *Controller) addQueueJob(obj interface{}) { - qj, ok := obj.(*arbv1.QueueJob) - if !ok { - klog.Errorf("obj is not QueueJob") - return - } - - cc.enqueue(qj) -} - -func (cc *Controller) updateQueueJob(oldObj, newObj interface{}) { - newQJ, ok := newObj.(*arbv1.QueueJob) - if !ok { - klog.Errorf("newObj is not QueueJob") - return - } - - cc.enqueue(newQJ) -} - -func (cc *Controller) deleteQueueJob(obj interface{}) { - qj, ok := obj.(*arbv1.QueueJob) - if !ok { - klog.Errorf("obj is not QueueJob") - return - } - - cc.enqueue(qj) -} - -func (cc *Controller) addPod(obj interface{}) { - pod, ok := obj.(*v1.Pod) - if !ok { - klog.Error("Failed to convert %v to v1.Pod", obj) - return - } - - cc.enqueue(pod) -} - -func (cc *Controller) updatePod(oldObj, newObj interface{}) { - pod, ok := newObj.(*v1.Pod) - if !ok { - klog.Error("Failed to convert %v to v1.Pod", newObj) - return - } - - cc.enqueue(pod) -} - -func (cc *Controller) deletePod(obj interface{}) { - var pod *v1.Pod - switch t := obj.(type) { - case *v1.Pod: - pod = t - case cache.DeletedFinalStateUnknown: - var ok bool - pod, ok = t.Obj.(*v1.Pod) - if !ok { - klog.Errorf("Cannot convert to *v1.Pod: %v", t.Obj) - return - } - default: - klog.Errorf("Cannot convert to *v1.Pod: %v", t) - return - } - - queuejobs, err := cc.queueJobLister.List(labels.Everything()) - if err != nil { - klog.Errorf("Failed to list QueueJobs for Pod %v/%v", pod.Namespace, pod.Name) - } - - ctl := utils.GetController(pod) - for _, qj := range queuejobs { - if qj.UID == ctl { - cc.enqueue(qj) - break - } - } -} - -func (cc *Controller) enqueue(obj interface{}) { - err := cc.eventQueue.Add(obj) - if err != nil { - klog.Errorf("Fail to enqueue QueueJob to updateQueue, err %#v", err) - } -} - -func (cc *Controller) worker() { - if _, err := cc.eventQueue.Pop(func(obj interface{}) error { - var queuejob *arbv1.QueueJob - switch v := obj.(type) { - case *arbv1.QueueJob: - queuejob = v - case *v1.Pod: - queuejobs, err := cc.queueJobLister.List(labels.Everything()) - if err != nil { - klog.Errorf("Failed to list QueueJobs for Pod %v/%v", v.Namespace, v.Name) - } - - ctl := utils.GetController(v) - for _, qj := range queuejobs { - if qj.UID == ctl { - queuejob = qj - break - } - } - - default: - klog.Errorf("Un-supported type of %v", obj) - return nil - } - - if queuejob == nil { - if acc, err := meta.Accessor(obj); err != nil { - klog.Warningf("Failed to get QueueJob for %v/%v", acc.GetNamespace(), acc.GetName()) - } - - return nil - } - - // sync Pods for a QueueJob - if err := cc.syncQueueJob(queuejob); err != nil { - klog.Errorf("Failed to sync QueueJob %s, err %#v", queuejob.Name, err) - // If any error, requeue it. - return err - } - - return nil - }); err != nil { - klog.Errorf("Fail to pop item from updateQueue, err %#v", err) - return - } -} - -// filterActivePods returns pods that have not terminated. -func filterActivePods(pods []*v1.Pod) []*v1.Pod { - var result []*v1.Pod - for _, p := range pods { - if isPodActive(p) { - result = append(result, p) - } else { - klog.V(4).Infof("Ignoring inactive pod %v/%v in state %v, deletion time %v", - p.Namespace, p.Name, p.Status.Phase, p.DeletionTimestamp) - } - } - return result -} - -func isPodActive(p *v1.Pod) bool { - return v1.PodSucceeded != p.Status.Phase && - v1.PodFailed != p.Status.Phase && - p.DeletionTimestamp == nil -} - -func (cc *Controller) syncQueueJob(qj *arbv1.QueueJob) error { - queueJob, err := cc.queueJobLister.QueueJobs(qj.Namespace).Get(qj.Name) - if err != nil { - if apierrors.IsNotFound(err) { - klog.V(3).Infof("Job has been deleted: %v", qj.Name) - return nil - } - return err - } - - pods, err := cc.getPodsForQueueJob(queueJob) - if err != nil { - return err - } - - return cc.manageQueueJob(queueJob, pods) -} - -func (cc *Controller) getPodsForQueueJob(qj *arbv1.QueueJob) (map[string][]*v1.Pod, error) { - pods := map[string][]*v1.Pod{} - - for _, ts := range qj.Spec.TaskSpecs { - selector, err := metav1.LabelSelectorAsSelector(ts.Selector) - if err != nil { - return nil, fmt.Errorf("couldn't convert QueueJob selector: %v", err) - } - - // List all pods under QueueJob - ps, err := cc.podListr.Pods(qj.Namespace).List(selector) - if err != nil { - return nil, err - } - - // TODO (k82cn): optimic by cache - for _, pod := range ps { - if !metav1.IsControlledBy(pod, qj) { - continue - } - // Hash by TaskSpec.Template.Name - pods[ts.Template.Name] = append(pods[ts.Template.Name], pod) - } - } - - return pods, nil -} - -// manageQueueJob is the core method responsible for managing the number of running -// pods according to what is specified in the job.Spec. -func (cc *Controller) manageQueueJob(qj *arbv1.QueueJob, pods map[string][]*v1.Pod) error { - var err error - - runningSum := int32(0) - pendingSum := int32(0) - succeededSum := int32(0) - failedSum := int32(0) - - ss, err := cc.arbclients.ArbV1().SchedulingSpecs(qj.Namespace).List(metav1.ListOptions{ - FieldSelector: fmt.Sprintf("metadata.name=%s", qj.Name), - }) - - if len(ss.Items) == 0 { - schedSpc := createQueueJobSchedulingSpec(qj) - _, err := cc.arbclients.ArbV1().SchedulingSpecs(qj.Namespace).Create(schedSpc) - if err != nil { - klog.Errorf("Failed to create SchedulingSpec for QueueJob %v/%v: %v", - qj.Namespace, qj.Name, err) - } - } else { - klog.V(3).Infof("There's %v SchedulingSpec for QueueJob %v/%v", - len(ss.Items), qj.Namespace, qj.Name) - } - - for _, ts := range qj.Spec.TaskSpecs { - replicas := ts.Replicas - name := ts.Template.Name - - running := int32(filterPods(pods[name], v1.PodRunning)) - pending := int32(filterPods(pods[name], v1.PodPending)) - succeeded := int32(filterPods(pods[name], v1.PodSucceeded)) - failed := int32(filterPods(pods[name], v1.PodFailed)) - - runningSum += running - pendingSum += pending - succeededSum += succeeded - failedSum += failed - - klog.V(3).Infof("There are %d pods of QueueJob %s (%s): replicas %d, pending %d, running %d, succeeded %d, failed %d", - len(pods), qj.Name, name, replicas, pending, running, succeeded, failed) - - // Create pod if necessary - if diff := replicas - pending - running - succeeded; diff > 0 { - klog.V(3).Infof("Try to create %v Pods for QueueJob %v/%v", diff, qj.Namespace, qj.Name) - - var errs []error - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := int32(0); i < diff; i++ { - go func(ix int32) { - defer wait.Done() - newPod := createQueueJobPod(qj, &ts.Template, ix) - _, err := cc.clients.CoreV1().Pods(newPod.Namespace).Create(context.Background(), newPod, metav1.CreateOptions{ - TypeMeta: metav1.TypeMeta{}, - DryRun: []string{}, - FieldManager: "", - }) - if err != nil { - // Failed to create Pod, wait a moment and then create it again - // This is to ensure all pods under the same QueueJob created - // So gang-scheduling could schedule the QueueJob successfully - klog.Errorf("Failed to create pod %s for QueueJob %s, err %#v", - newPod.Name, qj.Name, err) - errs = append(errs, err) - } - }(i) - } - wait.Wait() - - if len(errs) != 0 { - return fmt.Errorf("failed to create %d pods of %d", len(errs), diff) - } - } - } - - qj.Status = arbv1.QueueJobStatus{ - Pending: pendingSum, - Running: runningSum, - Succeeded: succeededSum, - Failed: failedSum, - MinAvailable: int32(qj.Spec.SchedSpec.MinAvailable), - } - - // TODO(k82cn): replaced it with `UpdateStatus` - if _, err := cc.arbclients.ArbV1().QueueJobs(qj.Namespace).Update(qj); err != nil { - klog.Errorf("Failed to update status of QueueJob %v/%v: %v", - qj.Namespace, qj.Name, err) - return err - } - - return err -} diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 66462d604..dbe86710d 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -1,19 +1,4 @@ /* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021, 2022 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,9 +13,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package queuejob import ( + jsons "encoding/json" "errors" "fmt" "math" @@ -38,7 +25,6 @@ import ( "reflect" "runtime/debug" "sort" - "strconv" "strings" "sync" "time" @@ -68,10 +54,6 @@ import ( v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - runtimeJson "k8s.io/apimachinery/pkg/runtime/serializer/json" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources/genericresource" respod "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources/pod" @@ -86,20 +68,10 @@ import ( "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobdispatch" - jsons "encoding/json" - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" clusterstatecache "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/cache" ) -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - // XController the AppWrapper Controller type type XController struct { config *rest.Config @@ -131,15 +103,12 @@ type XController struct { // eventQueue that need to sync up eventQueue *cache.FIFO - //QJ queue that needs to be allocated + // QJ queue that needs to be allocated qjqueue SchedulingQueue // our own local cache, used for computing total amount of resources cache clusterstatecache.Cache - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager - // is dispatcher or deployer? isDispatcher bool @@ -169,26 +138,11 @@ type JobAndClusterAgent struct { queueJobAgentKey string } -func NewJobAndClusterAgent(qjKey string, qaKey string) *JobAndClusterAgent { - return &JobAndClusterAgent{ - queueJobKey: qjKey, - queueJobAgentKey: qaKey, - } -} - // RegisterAllQueueJobResourceTypes - registers all resources func RegisterAllQueueJobResourceTypes(regs *queuejobresources.RegisteredResources) { respod.Register(regs) } -func GetQueueJobAgentKey(obj interface{}) (string, error) { - qa, ok := obj.(*queuejobdispatch.JobClusterAgent) - if !ok { - return "", fmt.Errorf("not a AppWrapperAgent") - } - return fmt.Sprintf("%s;%s", qa.AgentId, qa.DeploymentName), nil -} - func GetQueueJobKey(obj interface{}) (string, error) { qj, ok := obj.(*arbv1.AppWrapper) if !ok { @@ -220,7 +174,7 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * cc.qjobResControls = map[arbv1.ResourceType]queuejobresources.Interface{} RegisterAllQueueJobResourceTypes(&cc.qjobRegisteredResources) - //initialize pod sub-resource control + // initialize pod sub-resource control resControlPod, found, err := cc.qjobRegisteredResources.InitQueueJobResource(arbv1.ResourceTypePod, config) if err != nil { klog.Errorf("fail to create queuejob resource control") @@ -244,10 +198,10 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * case *arbv1.AppWrapper: klog.V(10).Infof("[Informer] Filter Name=%s Version=%s Local=%t FilterIgnore=%t Sender=%s &qj=%p qj=%+v", t.Name, t.ResourceVersion, t.Status.Local, t.Status.FilterIgnore, t.Status.Sender, t, t) // todo: This is a current workaround for duplicate message bug. - //if t.Status.Local == true { // ignore duplicate message from cache + // if t.Status.Local == true { // ignore duplicate message from cache // return false - //} - //t.Status.Local = true // another copy of this will be recognized as duplicate + // } + // t.Status.Local = true // another copy of this will be recognized as duplicate return true // return !t.Status.FilterIgnore // ignore update messages default: @@ -263,9 +217,6 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * cc.queueJobLister = cc.queueJobInformer.Lister() cc.queueJobSynced = cc.queueJobInformer.Informer().HasSynced - //create sub-resource reference manager - cc.refManager = queuejobresources.NewLabelRefManager() - // Setup Quota if serverOption.QuotaEnabled { dispatchedAWDemands, dispatchedAWs := cc.getDispatchedAppWrappers() @@ -287,7 +238,7 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * klog.Infof("[Controller] Agent mode") } - //create agents and agentMap + // create agents and agentMap cc.agentMap = map[string]*queuejobdispatch.JobClusterAgent{} cc.agentList = []string{} for _, agentconfig := range strings.Split(serverOption.AgentConfigs, ",") { @@ -301,7 +252,7 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * return nil } - //create (empty) dispatchMap + // create (empty) dispatchMap cc.dispatchMap = map[string]string{} return cc @@ -324,7 +275,7 @@ func (qjm *XController) PreemptQueueJobs() { newjob.Status.CanRun = false newjob.Status.FilterIgnore = true // update QueueJobState only cleanAppWrapper := false - //If dispatch deadline is exceeded no matter what the state of AW, kill the job and set status as Failed. + // If dispatch deadline is exceeded no matter what the state of AW, kill the job and set status as Failed. if (aw.Status.State == arbv1.AppWrapperStateActive) && (aw.Spec.SchedSpec.DispatchDuration.Limit > 0) { if aw.Spec.SchedSpec.DispatchDuration.Overrun { index := getIndexOfMatchedCondition(aw, arbv1.AppWrapperCondPreemptCandidate, "DispatchDeadlineExceeded") @@ -336,7 +287,7 @@ func (qjm *XController) PreemptQueueJobs() { cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondPreemptCandidate, v1.ConditionTrue, "DispatchDeadlineExceeded", "") newjob.Status.Conditions[index] = *cond.DeepCopy() } - //should the AW state be set in this method?? + // should the AW state be set in this method?? newjob.Status.State = arbv1.AppWrapperStateFailed newjob.Status.QueueJobState = arbv1.AppWrapperCondFailed newjob.Status.Running = 0 @@ -347,10 +298,10 @@ func (qjm *XController) PreemptQueueJobs() { klog.Warningf("[PreemptQueueJobs] status update CanRun: false -- DispatchDeadlineExceeded for '%s/%s' failed", aw.Namespace, aw.Name) continue } - //cannot use cleanup AW, since it puts AW back in running state + // cannot use cleanup AW, since it puts AW back in running state go qjm.qjqueue.AddUnschedulableIfNotPresent(updateNewJob) - //Move to next AW + // Move to next AW continue } } @@ -394,11 +345,11 @@ func (qjm *XController) PreemptQueueJobs() { updateNewJob = newjob.DeepCopy() } else { - //If pods failed scheduling generate new preempt condition + // If pods failed scheduling generate new preempt condition message = fmt.Sprintf("Pods failed scheduling failed=%v, running=%v.", len(aw.Status.PendingPodConditions), aw.Status.Running) index := getIndexOfMatchedCondition(newjob, arbv1.AppWrapperCondPreemptCandidate, "PodsFailedScheduling") - //ignore co-scheduler failed scheduling events. This is a temp - //work around until co-scheduler version 0.22.X perf issues are resolved. + // ignore co-scheduler failed scheduling events. This is a temp + // work-around until co-scheduler version 0.22.X perf issues are resolved. if index < 0 { cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondPreemptCandidate, v1.ConditionTrue, "PodsFailedScheduling", message) newjob.Status.Conditions = append(newjob.Status.Conditions, cond) @@ -420,7 +371,7 @@ func (qjm *XController) PreemptQueueJobs() { klog.V(4).Infof("[PreemptQueueJobs] Deleting AppWrapper %s/%s due to maximum number of re-queueing(s) exceeded.", aw.Name, aw.Namespace) go qjm.Cleanup(updateNewJob) } else { - //Only back-off AWs that are in state running and not in state Failed + // Only back-off AWs that are in state running and not in state Failed if updateNewJob.Status.State != arbv1.AppWrapperStateFailed { klog.Infof("[PreemptQueueJobs] Adding preempted AppWrapper %s/%s to back off queue.", aw.Name, aw.Namespace) go qjm.backoff(updateNewJob, "PreemptionTriggered", string(message)) @@ -484,7 +435,7 @@ func (qjm *XController) GetQueueJobsEligibleForPreemption() []*arbv1.AppWrapper klog.V(8).Infof("Appwrapper Dispatch limit exceeded, currentTime %v, dispatchTimeInSeconds %v", currentTime, dispatchDuration) value.Spec.SchedSpec.DispatchDuration.Overrun = true qjobs = append(qjobs, value) - //Got AW which exceeded dispatch runtime limit, move to next AW + // Got AW which exceeded dispatch runtime limit, move to next AW continue } } @@ -551,28 +502,6 @@ func (qjm *XController) GetQueueJobsEligibleForPreemption() []*arbv1.AppWrapper return qjobs } -func GetPodTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.PodTemplateSpec, error) { - rtScheme := runtime.NewScheme() - v1.AddToScheme(rtScheme) - - jsonSerializer := runtimeJson.NewYAMLSerializer(runtimeJson.DefaultMetaFactory, rtScheme, rtScheme) - - podGVK := schema.GroupVersion{Group: v1.GroupName, Version: "v1"}.WithKind("PodTemplate") - - obj, _, err := jsonSerializer.Decode(qjobRes.Template.Raw, &podGVK, nil) - if err != nil { - return nil, err - } - - template, ok := obj.(*v1.PodTemplate) - if !ok { - return nil, fmt.Errorf("resource template not define as a PodTemplate") - } - - return &template.Template, nil - -} - func (qjm *XController) GetAggregatedResourcesPerGenericItem(cqj *arbv1.AppWrapper) []*clusterstateapi.Resource { var retVal []*clusterstateapi.Resource @@ -601,7 +530,7 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) arb if err := jsons.Unmarshal(objectName.Raw, &blob); err != nil { klog.Errorf("[getAppWrapperCompletionStatus] Error unmarshalling, err=%#v", err) } - unstruct.Object = blob.(map[string]interface{}) //set object to the content of the blob after Unmarshalling + unstruct.Object = blob.(map[string]interface{}) // set object to the content of the blob after Unmarshalling name := "" if md, ok := unstruct.Object["metadata"]; ok { metadata := md.(map[string]interface{}) @@ -616,18 +545,18 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) arb status := qjm.genericresources.IsItemCompleted(&genericItem, caw.Namespace, caw.Name, name) if !status { klog.V(4).Infof("[getAppWrapperCompletionStatus] Item %d named %s not completed for appwrapper: '%s/%s'", i+1, name, caw.Namespace, caw.Name) - //early termination because a required item is not completed + // early termination because a required item is not completed return caw.Status.State } - //only consider count completion required for valid items + // only consider count completion required for valid items countCompletionRequired = countCompletionRequired + 1 } } klog.V(4).Infof("[getAppWrapperCompletionStatus] App wrapper '%s/%s' countCompletionRequired %d, podsRunning %d, podsPending %d", caw.Namespace, caw.Name, countCompletionRequired, caw.Status.Running, caw.Status.Pending) - //Set new status only when completion required flag is present in genericitems array + // Set new status only when completion required flag is present in genericitems array if countCompletionRequired > 0 { if caw.Status.Running == 0 && caw.Status.Pending == 0 { return arbv1.AppWrapperStateCompleted @@ -637,17 +566,12 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) arb return arbv1.AppWrapperStateRunningHoldCompletion } } - //return previous condition + // return previous condition return caw.Status.State } func (qjm *XController) GetAggregatedResources(cqj *arbv1.AppWrapper) *clusterstateapi.Resource { - //todo: deprecate resource controllers allocated := clusterstateapi.EmptyResource() - for _, resctrl := range qjm.qjobResControls { - qjv := resctrl.GetAggregatedResources(cqj) - allocated = allocated.Add(qjv) - } for _, genericItem := range cqj.Spec.AggrResources.GenericItems { qjv, err := genericresource.GetResources(&genericItem) @@ -682,7 +606,7 @@ func (qjm *XController) getProposedPreemptions(requestingJob *arbv1.AppWrapper, klog.V(10).Infof("[getProposedPreemptions] Processing %v candidate jobs for preemption.", len(preemptableAWs)) } - //Sort keys of map + // Sort keys of map priorityKeyValues := make([]float64, len(preemptableAWs)) i := 0 for key := range preemptableAWs { @@ -857,11 +781,6 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust continue } else if value.Status.CanRun { qjv := clusterstateapi.EmptyResource() - for _, resctrl := range qjm.qjobResControls { - res := resctrl.GetAggregatedResources(value) - qjv.Add(res) - klog.V(10).Infof("[getAggAvaiResPri] Subtract all resources %+v in resctrlType=%T for job %s which can-run is set to: %v but state is still pending.", qjv, resctrl, value.Name, value.Status.CanRun) - } for _, genericItem := range value.Spec.AggrResources.GenericItems { res, _ := genericresource.GetResources(&genericItem) qjv.Add(res) @@ -917,7 +836,7 @@ func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string { if qjAggrResources.LessEqual(resources) { klog.V(2).Infof("[chooseAgent] Agent %s has enough resources\n", agentId) - //Now evaluate quota + // Now evaluate quota if qjm.serverOption.QuotaEnabled { if qjm.quotaManager != nil { if fits, preemptAWs, _ := qjm.quotaManager.Fits(qj, qjAggrResources, proposedPreemptions); fits { @@ -931,7 +850,7 @@ func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string { klog.Errorf("[chooseAgent] Quota evaluation is enable but not initialize. AppWrapper %s/%s does not have enough quota\n", qj.Name, qj.Namespace) } } else { - //Quota is not enabled to return selected agent + // Quota is not enabled to return selected agent return agentId } } else { @@ -1006,7 +925,7 @@ func (qjm *XController) ScheduleNext() { scheduleNextRetrier := retrier.New(retrier.ExponentialBackoff(10, 100*time.Millisecond), &EtcdErrorClassifier{}) scheduleNextRetrier.SetJitter(0.05) - //Retry the execution + // Retry the execution err = scheduleNextRetrier.Run(func() error { klog.Infof("[ScheduleNext] activeQ.Pop %s *Delay=%.6f seconds RemainingLength=%d &qj=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qjm.qjqueue.Length(), qj, qj.ResourceVersion, qj.Status) @@ -1256,7 +1175,7 @@ func (qjm *XController) ScheduleNext() { fits = quotaFits } else { fits = false - //Quota manager not initialized + // Quota manager not initialized dispatchFailedMessage = "Quota evaluation is enabled but not initialized. Insufficient quota to dispatch AppWrapper." klog.Errorf("[ScheduleNext] [Agent Mode] Quota evaluation is enabled but not initialized. AppWrapper '%s/%s' does not have enough quota", qj.Namespace, qj.Name) } @@ -1276,18 +1195,12 @@ func (qjm *XController) ScheduleNext() { klog.Errorf("[ScheduleNext] [Agent Mode] Failed to get fresh copy of the app wrapper '%s/%s' to update status, err = %v", qj.Namespace, qj.Name, err) return retryErr } - desired := int32(0) - for i, ar := range tempAW.Spec.AggrResources.Items { - desired += ar.Replicas - tempAW.Spec.AggrResources.Items[i].AllocatedReplicas = ar.Replicas - } - tempAW.Status.CanRun = true tempAW.Status.FilterIgnore = true // update CanRun & Spec. no need to trigger event retryErr = qjm.updateStatusInEtcd(tempAW, "ScheduleNext - setCanRun") if retryErr != nil { if qjm.quotaManager != nil && quotaFits { - //Quota was allocated for this appwrapper, release it. + // Quota was allocated for this appwrapper, release it. qjm.quotaManager.Release(qj) } if apierrors.IsNotFound(retryErr) { @@ -1312,7 +1225,7 @@ func (qjm *XController) ScheduleNext() { klog.Infof("[ScheduleNext] [Agent Mode] Successfully dispatched app wrapper '%s/%s' activeQ=%t, Unsched=%t &aw=%p Version=%s Status=%+v", qj.Namespace, qj.Name, qjm.qjqueue.IfExistActiveQ(qj), qjm.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status) } - } //fits + } // fits } else { // Not enough free resources to dispatch HOL dispatchFailedMessage = "Insufficient resources to dispatch AppWrapper." klog.Infof("[ScheduleNext] [Agent Mode] Failed to dispatch app wrapper '%s/%s' due to insuficient resources, activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", @@ -1327,13 +1240,13 @@ func (qjm *XController) ScheduleNext() { // stop trying to dispatch after HeadOfLineHoldingTime // release quota if allocated if qjm.quotaManager != nil && quotaFits { - //Quota was allocated for this appwrapper, release it. + // Quota was allocated for this appwrapper, release it. qjm.quotaManager.Release(qj) } break } else { // Try to dispatch again after one second if qjm.quotaManager != nil && quotaFits { - //release any quota as the qj will be tried again and the quota might have been allocated. + // release any quota as the qj will be tried again and the quota might have been allocated. qjm.quotaManager.Release(qj) } time.Sleep(time.Second * 1) @@ -1396,6 +1309,7 @@ func (cc *XController) updateStatusInEtcd(currentAppwrapper *arbv1.AppWrapper, c klog.V(4).Infof("[updateStatusInEtcd] update success '%s/%s' called by '%s'", currentAppwrapper.Namespace, currentAppwrapper.Name, caller) return nil } + func (cc *XController) updateStatusInEtcdWithRetry(source *arbv1.AppWrapper, caller string) error { klog.V(4).Infof("[updateStatusInEtcdWithMergeFunction] trying to update '%s/%s' version '%s' called by '%s'", source.Namespace, source.Name, source.ResourceVersion, caller) source.Status.Sender = "before " + caller // set Sender string to indicate code location @@ -1556,7 +1470,7 @@ func (qjm *XController) UpdateQueueJobs() { klog.V(6).Infof("[UpdateQueueJobs] Found new appwraper '%s/%s' 0Delay=%.6f seconds CreationTimestamp=%s ControllerFirstTimestamp=%s", newjob.Namespace, newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob.CreationTimestamp, newjob.Status.ControllerFirstTimestamp) } - //only set if appwrapper is running and dispatch time is not set previously + // only set if appwrapper is running and dispatch time is not set previously if newjob.Status.QueueJobState == "Running" && newjob.Status.ControllerFirstDispatchTimestamp.String() == "0001-01-01 00:00:00 +0000 UTC" { newjob.Status.ControllerFirstDispatchTimestamp = firstTime } @@ -1805,7 +1719,7 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { if err00 := cc.Cleanup(qj); err00 != nil { klog.Warningf("Failed to cleanup resources for app wrapper '%s/%s', err = %v", qj.Namespace, qj.Name, err00) } - //empty finalizers and delete the queuejob again + // empty finalizers and delete the queuejob again if accessor, err00 := meta.Accessor(qj); err00 == nil { accessor.SetFinalizers(nil) } @@ -1841,7 +1755,7 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { // If it is Agent (not a dispatcher), update pod information podPhaseChanges := false if !cc.isDispatcher { // agent mode - //Make a copy first to not update cache object and to use for comparing + // Make a copy first to not update cache object and to use for comparing awNew := qj.DeepCopy() // we call sync to update pods running, pending,... if qj.Status.State == arbv1.AppWrapperStateActive { @@ -1867,7 +1781,7 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { return nil } - //For debugging? + // For debugging? if !reflect.DeepEqual(awNew.Status, qj.Status) { podPhaseChanges = true // Using DeepCopy before DeepCopyInto as it seems that DeepCopyInto does not alloc a new memory object @@ -1889,7 +1803,7 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool) error { if !cc.isDispatcher { // Agent Mode - //Job is Complete only update pods if needed. + // Job is Complete only update pods if needed. if qj.Status.State == arbv1.AppWrapperStateCompleted || qj.Status.State == arbv1.AppWrapperStateRunningHoldCompletion { if podPhaseChanges { // Only update etcd if AW status has changed. This can happen for periodic @@ -1970,39 +1884,14 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool if qj.Status.CanRun && qj.Status.State != arbv1.AppWrapperStateActive && qj.Status.State != arbv1.AppWrapperStateCompleted && qj.Status.State != arbv1.AppWrapperStateRunningHoldCompletion { - //keep conditions until the appwrapper is re-dispatched + // keep conditions until the appwrapper is re-dispatched qj.Status.PendingPodConditions = nil qj.Status.State = arbv1.AppWrapperStateActive - if qj.Spec.AggrResources.Items != nil { - for i := range qj.Spec.AggrResources.Items { - err00 := cc.refManager.AddTag(&qj.Spec.AggrResources.Items[i], func() string { - return strconv.Itoa(i) - }) - if err00 != nil { - klog.Warningf("Failed to add tag to aggregate resource item %s of app apprapper '%s/%s', err = %v", qj.Spec.AggrResources.Items[i].Name, qj.Namespace, qj.Name, err00) - } - } - } klog.V(4).Infof("[manageQueueJob] App wrapper '%s/%s' BeforeDispatchingToEtcd Version=%s Status=%+v", qj.Namespace, qj.Name, qj.ResourceVersion, qj.Status) dispatched := true dispatchFailureReason := "ItemCreationFailure." dispatchFailureMessage := "" - for _, ar := range qj.Spec.AggrResources.Items { - klog.V(10).Infof("[manageQueueJob] before dispatch [%v].SyncQueueJob %s Version=%s Status.CanRun=%t, Status.State=%s", ar.Type, qj.Name, qj.ResourceVersion, qj.Status.CanRun, qj.Status.State) - // Call Resource Controller of ar.Type to issue REST call to Etcd for resource creation - err00 := cc.qjobResControls[ar.Type].SyncQueueJob(qj, &ar) - if err00 != nil { - if apierrors.IsInvalid(err00) { - klog.Warningf("[manageQueueJob] Invalid item sent for dispatching by app wrapper='%s/%s' type=%v", qj.Namespace, qj.Name, ar.Type) - } else { - klog.Errorf("[manageQueueJob] Error dispatching item for app wrapper='%s/%s' type=%v err=%v", qj.Namespace, qj.Name, ar.Type, err00) - } - dispatchFailureMessage = fmt.Sprintf("%s/%s creation failure: %+v", qj.Namespace, qj.Name, err00) - dispatched = false - break - } - } if dispatched { // Handle generic resources for _, ar := range qj.Spec.AggrResources.GenericItems { @@ -2062,14 +1951,14 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool klog.Infof("[manageQueueJob] Getting completion status for app wrapper '%s/%s' Version=%s Status.CanRun=%t Status.State=%s, pod counts [Pending: %d, Running: %d, Succeded: %d, Failed %d]", qj.Namespace, qj.Name, qj.ResourceVersion, qj.Status.CanRun, qj.Status.State, qj.Status.Pending, qj.Status.Running, qj.Status.Succeeded, qj.Status.Failed) - //set appwrapper status to Complete or RunningHoldCompletion + // set appwrapper status to Complete or RunningHoldCompletion derivedAwStatus := cc.getAppWrapperCompletionStatus(qj) klog.Infof("[manageQueueJob] Got completion status '%s' for app wrapper '%s/%s' Version=%s Status.CanRun=%t Status.State=%s, pod counts [Pending: %d, Running: %d, Succeded: %d, Failed %d]", derivedAwStatus, qj.Namespace, qj.Name, qj.ResourceVersion, qj.Status.CanRun, qj.Status.State, qj.Status.Pending, qj.Status.Running, qj.Status.Succeeded, qj.Status.Failed) - //Set Appwrapper state to complete if all items in Appwrapper - //are completed + // Set Appwrapper state to complete if all items in Appwrapper + // are completed if derivedAwStatus == arbv1.AppWrapperStateRunningHoldCompletion { qj.Status.State = derivedAwStatus var updateQj *arbv1.AppWrapper @@ -2091,7 +1980,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool return err } } - //Set appwrapper status to complete + // Set appwrapper status to complete if derivedAwStatus == arbv1.AppWrapperStateCompleted { qj.Status.State = derivedAwStatus qj.Status.CanRun = false @@ -2134,7 +2023,6 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool } return nil } else { // Dispatcher Mode - if !qj.Status.CanRun && (qj.Status.State != arbv1.AppWrapperStateEnqueued && qj.Status.State != arbv1.AppWrapperStateDeleted) { // if there are running resources for this job then delete them because the job was put in // pending state... @@ -2207,20 +2095,6 @@ func (cc *XController) Cleanup(appwrapper *arbv1.AppWrapper) error { klog.V(3).Infof("[Cleanup] begin AppWrapper '%s/%s' Version=%s", appwrapper.Namespace, appwrapper.Name, appwrapper.ResourceVersion) var err *multierror.Error if !cc.isDispatcher { - if appwrapper.Spec.AggrResources.Items != nil { - // we call clean-up for each controller - for _, ar := range appwrapper.Spec.AggrResources.Items { - err00 := cc.qjobResControls[ar.Type].Cleanup(appwrapper, &ar) - if err00 != nil && !CanIgnoreAPIError(err00) && !IsJsonSyntaxError(err00) { - klog.Errorf("[Cleanup] Error deleting item %s from app wrapper='%s/%s' err=%v.", - ar.Type, appwrapper.Namespace, appwrapper.Name, err00) - err = multierror.Append(err, err00) - continue - } - klog.V(3).Infof("[Cleanup] Deleted item from app wrapper='%s/%s'", - appwrapper.Namespace, appwrapper.Name) - } - } if appwrapper.Spec.AggrResources.GenericItems != nil { for _, ar := range appwrapper.Spec.AggrResources.GenericItems { genericResourceName, gvk, err00 := cc.genericresources.Cleanup(appwrapper, &ar) @@ -2239,7 +2113,6 @@ func (cc *XController) Cleanup(appwrapper *arbv1.AppWrapper) error { } } } - } else { if appwrapper.Status.IsDispatched { queuejobKey, _ := GetQueueJobKey(appwrapper) diff --git a/pkg/controller/queuejobresources/configmap/configmap.go b/pkg/controller/queuejobresources/configmap/configmap.go deleted file mode 100644 index b2dc1fb8d..000000000 --- a/pkg/controller/queuejobresources/configmap/configmap.go +++ /dev/null @@ -1,324 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package configmap - -import ( - "context" - "fmt" - - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" - - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/klog/v2" - - "sync" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/informers" - corev1informer "k8s.io/client-go/informers/core/v1" - "k8s.io/client-go/kubernetes" - corelisters "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" -) - -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") -var queueJobName = "appwrapper.mcad.ibm.com" - -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - -//QueueJobResService contains service info -type QueueJobResConfigMap struct { - clients *kubernetes.Clientset - arbclients *clientset.Clientset - // A store of services, populated by the serviceController - configmapStore corelisters.ConfigMapLister - configmapInformer corev1informer.ConfigMapInformer - rtScheme *runtime.Scheme - jsonSerializer *json.Serializer - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager -} - -//Register registers a queue job resource type -func Register(regs *queuejobresources.RegisteredResources) { - regs.Register(arbv1.ResourceTypeConfigMap, func(config *rest.Config) queuejobresources.Interface { - return NewQueueJobResConfigMap(config) - }) -} - -//NewQueueJobResService creates a service controller -func NewQueueJobResConfigMap(config *rest.Config) queuejobresources.Interface { - qjrConfigMap := &QueueJobResConfigMap{ - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - } - - qjrConfigMap.configmapInformer = informers.NewSharedInformerFactory(qjrConfigMap.clients, 0).Core().V1().ConfigMaps() - qjrConfigMap.configmapInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch obj.(type) { - case *v1.ConfigMap: - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: qjrConfigMap.addConfigMap, - UpdateFunc: qjrConfigMap.updateConfigMap, - DeleteFunc: qjrConfigMap.deleteConfigMap, - }, - }) - - qjrConfigMap.rtScheme = runtime.NewScheme() - v1.AddToScheme(qjrConfigMap.rtScheme) - - qjrConfigMap.jsonSerializer = json.NewYAMLSerializer(json.DefaultMetaFactory, qjrConfigMap.rtScheme, qjrConfigMap.rtScheme) - - qjrConfigMap.refManager = queuejobresources.NewLabelRefManager() - - return qjrConfigMap -} - -// Run the main goroutine responsible for watching and services. -func (qjrConfigMap *QueueJobResConfigMap) Run(stopCh <-chan struct{}) { - - qjrConfigMap.configmapInformer.Informer().Run(stopCh) -} - -func (qjrConfigMap *QueueJobResConfigMap) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { - return clusterstateapi.EmptyResource() -} - -func (qjrConfigMap *QueueJobResConfigMap) addConfigMap(obj interface{}) { - - return -} - -func (qjrConfigMap *QueueJobResConfigMap) updateConfigMap(old, cur interface{}) { - - return -} - -func (qjrConfigMap *QueueJobResConfigMap) deleteConfigMap(obj interface{}) { - - return -} - -func (qjrConfigMap *QueueJobResConfigMap) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - return total -} - -// Parse queue job api object to get Service template -func (qjrConfigMap *QueueJobResConfigMap) getConfigMapTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.ConfigMap, error) { - - configmapGVK := schema.GroupVersion{Group: v1.GroupName, Version: "v1"}.WithKind("ConfigMap") - - obj, _, err := qjrConfigMap.jsonSerializer.Decode(qjobRes.Template.Raw, &configmapGVK, nil) - if err != nil { - return nil, err - } - - configmap, ok := obj.(*v1.ConfigMap) - if !ok { - return nil, fmt.Errorf("Queuejob resource not defined as a ConfigMap") - } - - return configmap, nil - -} - -func (qjrConfigMap *QueueJobResConfigMap) createConfigMapWithControllerRef(namespace string, configmap *v1.ConfigMap, controllerRef *metav1.OwnerReference) error { - if controllerRef != nil { - configmap.OwnerReferences = append(configmap.OwnerReferences, *controllerRef) - } - - if _, err := qjrConfigMap.clients.CoreV1().ConfigMaps(namespace).Create(context.Background(), configmap, metav1.CreateOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrConfigMap *QueueJobResConfigMap) delConfigMap(namespace string, name string) error { - if err := qjrConfigMap.clients.CoreV1().ConfigMaps(namespace).Delete(context.Background(), name, metav1.DeleteOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrConfigMap *QueueJobResConfigMap) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { - return nil -} - -func (qjrConfigMap *QueueJobResConfigMap) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - startTime := time.Now() - - defer func() { - klog.V(4).Infof("Finished syncing queue job resource %s (%v)", queuejob.Name, time.Now().Sub(startTime)) - }() - - _namespace, configMapInQjr, configMapsInEtcd, err := qjrConfigMap.getConfigMapForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - configMapLen := len(configMapsInEtcd) - replicas := qjobRes.Replicas - - diff := int(replicas) - int(configMapLen) - - klog.V(4).Infof("QJob: %s had %d configMaps and %d desired configMaps", queuejob.Name, configMapLen, replicas) - - if diff > 0 { - //TODO: need set reference after Service has been really added - tmpConfigMap := v1.ConfigMap{} - err = qjrConfigMap.refManager.AddReference(qjobRes, &tmpConfigMap) - if err != nil { - klog.Errorf("Cannot add reference to configmap resource %+v", err) - return err - } - if configMapInQjr.Labels == nil { - configMapInQjr.Labels = map[string]string{} - } - for k, v := range tmpConfigMap.Labels { - configMapInQjr.Labels[k] = v - } - configMapInQjr.Labels[queueJobName] = queuejob.Name - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := 0; i < diff; i++ { - go func() { - defer wait.Done() - - err := qjrConfigMap.createConfigMapWithControllerRef(*_namespace, configMapInQjr, metav1.NewControllerRef(queuejob, queueJobKind)) - - if err != nil && errors.IsTimeout(err) { - return - } - if err != nil { - defer utilruntime.HandleError(err) - } - }() - } - wait.Wait() - } - - return nil -} - -func (qjrConfigMap *QueueJobResConfigMap) getConfigMapForQueueJobRes(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) (*string, *v1.ConfigMap, []*v1.ConfigMap, error) { - - // Get "a" ConfigMap from AppWrapper Resource - configMapInQjr, err := qjrConfigMap.getConfigMapTemplate(qjobRes) - if err != nil { - klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) - return nil, nil, nil, err - } - - // Get ConfigMap"s" in Etcd Server - var _namespace *string - if configMapInQjr.Namespace != "" { - _namespace = &configMapInQjr.Namespace - } else { - _namespace = &queuejob.Namespace - } - configMapList, err := qjrConfigMap.clients.CoreV1().ConfigMaps(*_namespace).List(context.Background(), metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", queueJobName, queuejob.Name)}) - // configMapList, err := qjrConfigMap.clients.CoreV1().ConfigMaps(*_namespace).List(metav1.ListOptions{}) - if err != nil { - return nil, nil, nil, err - } - - configMapsInEtcd := []*v1.ConfigMap{} - for i, _ := range configMapList.Items { - configMapsInEtcd = append(configMapsInEtcd, &configMapList.Items[i]) - } - - myConfigMapsInEtcd := []*v1.ConfigMap{} - for i, configMap := range configMapsInEtcd { - if qjrConfigMap.refManager.BelongTo(qjobRes, configMap) { - myConfigMapsInEtcd = append(myConfigMapsInEtcd, configMapsInEtcd[i]) - } - } - - return _namespace, configMapInQjr, myConfigMapsInEtcd, nil -} - -func (qjrConfigMap *QueueJobResConfigMap) deleteQueueJobResConfigMaps(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - _namespace, _, activeConfigMaps, err := qjrConfigMap.getConfigMapForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - active := int32(len(activeConfigMaps)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrConfigMap.delConfigMap(*_namespace, activeConfigMaps[ix].Name); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, application wrapper %q/%q deadline exceeded", activeConfigMaps[ix].Name, *_namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -//Cleanup deletes all services -func (qjrConfigMap *QueueJobResConfigMap) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - return qjrConfigMap.deleteQueueJobResConfigMaps(qjobRes, queuejob) -} diff --git a/pkg/controller/queuejobresources/deployment/deployment.go b/pkg/controller/queuejobresources/deployment/deployment.go deleted file mode 100644 index e7c3af108..000000000 --- a/pkg/controller/queuejobresources/deployment/deployment.go +++ /dev/null @@ -1,385 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package deployment - -import ( - "context" - "fmt" - "sync" - "time" - - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - apps "k8s.io/api/apps/v1" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/informers" - extinformer "k8s.io/client-go/informers/apps/v1" - "k8s.io/client-go/kubernetes" - extlister "k8s.io/client-go/listers/apps/v1" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" - "k8s.io/klog/v2" -) - -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") -var queueJobName = "appwrapper.mcad.ibm.com" - -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - -//QueueJobResDeployment contains the resources of this queuejob -type QueueJobResDeployment struct { - clients *kubernetes.Clientset - arbclients *clientset.Clientset - // A store of deployments, populated by the deploymentController - deploymentStore extlister.DeploymentLister - deployInformer extinformer.DeploymentInformer - rtScheme *runtime.Scheme - jsonSerializer *json.Serializer - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager -} - -//Register registers a queue job resource type -func Register(regs *queuejobresources.RegisteredResources) { - regs.Register(arbv1.ResourceTypeDeployment, func(config *rest.Config) queuejobresources.Interface { - return NewQueueJobResDeployment(config) - }) -} - -//NewQueueJobResDeployment returns a new deployment controller -func NewQueueJobResDeployment(config *rest.Config) queuejobresources.Interface { - qjrDeployment := &QueueJobResDeployment{ - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - } - - qjrDeployment.deployInformer = informers.NewSharedInformerFactory(qjrDeployment.clients, 0).Apps().V1().Deployments() - qjrDeployment.deployInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch obj.(type) { - case *apps.Deployment: - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: qjrDeployment.addDeployment, - UpdateFunc: qjrDeployment.updateDeployment, - DeleteFunc: qjrDeployment.deleteDeployment, - }, - }) - - qjrDeployment.rtScheme = runtime.NewScheme() - v1.AddToScheme(qjrDeployment.rtScheme) - apps.AddToScheme(qjrDeployment.rtScheme) - qjrDeployment.jsonSerializer = json.NewYAMLSerializer(json.DefaultMetaFactory, qjrDeployment.rtScheme, qjrDeployment.rtScheme) - - qjrDeployment.refManager = queuejobresources.NewLabelRefManager() - - return qjrDeployment -} - -func (qjrDeployment *QueueJobResDeployment) GetPodTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.PodTemplateSpec, int32, error) { - res, err := qjrDeployment.getDeploymentTemplate(qjobRes) - if err != nil { - return nil, -1, err - } - - // Validate template field - if res.Spec.Replicas == nil { - return nil, 0, fmt.Errorf("spec.replicas field not defined in resource object: %#v", qjobRes) - } - return &res.Spec.Template, *res.Spec.Replicas, nil -} - -func (qjrDeployment *QueueJobResDeployment) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - if job.Spec.AggrResources.Items != nil { - //calculate scaling - for _, ar := range job.Spec.AggrResources.Items { - if ar.Type == arbv1.ResourceTypeDeployment { - template, replicas, err := qjrDeployment.GetPodTemplate(&ar) - if err != nil { - klog.Errorf("Pod Template not found in item: %#v error: %#v. Aggregated resources set to 0.", ar, err) - } else { - myres := queuejobresources.GetPodResources(template) - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory - myres.GPU = int64(replicas) * myres.GPU - total = total.Add(myres) - } - } - } - } - return total -} - -func (qjrDeployment *QueueJobResDeployment) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - if job.Spec.AggrResources.Items != nil { - //calculate scaling - for _, ar := range job.Spec.AggrResources.Items { - if ar.Priority < priority { - continue - } - if ar.Type == arbv1.ResourceTypeDeployment { - template, replicas, _ := qjrDeployment.GetPodTemplate(&ar) - myres := queuejobresources.GetPodResources(template) - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory - myres.GPU = int64(replicas) * myres.GPU - total = total.Add(myres) - } - } - } - return total -} - -//func (qjrDeployment *QueueJobResDeployment) GetAggregatedResourcesByPhase(phase v1.PodPhase, job *arbv1.AppWrapper) *clusterstateapi.Resource { -// total := clusterstateapi.EmptyResource() -// if job.Spec.AggrResources.Items != nil { -// //calculate scaling -// for _, ar := range job.Spec.AggrResources.Items { -// if ar.Type == arbv1.ResourceTypeDeployment { -// template, replicas, _ := qjrDeployment.GetPodTemplate(&ar) -// myres := queuejobresources.GetPodResourcesByPhase(phase, template) -// myres.MilliCPU = float64(replicas) * myres.MilliCPU -// myres.Memory = float64(replicas) * myres.Memory -// myres.GPU = int64(replicas) * myres.GPU -// total = total.Add(myres) -// } -// } -// } -// return total -//} - -//Run the main goroutine responsible for watching and deployments. -func (qjrDeployment *QueueJobResDeployment) Run(stopCh <-chan struct{}) { - qjrDeployment.deployInformer.Informer().Run(stopCh) -} - -func (qjrDeployment *QueueJobResDeployment) addDeployment(obj interface{}) { - - return -} - -func (qjrDeployment *QueueJobResDeployment) updateDeployment(old, cur interface{}) { - - return -} - -func (qjrDeployment *QueueJobResDeployment) deleteDeployment(obj interface{}) { - - return -} - -// Parse queue job api object to get Service template -func (qjrDeployment *QueueJobResDeployment) getDeploymentTemplate(qjobRes *arbv1.AppWrapperResource) (*apps.Deployment, error) { - deploymentGVK := schema.GroupVersion{Group: apps.GroupName, Version: "v1"}.WithKind("Deployment") - obj, _, err := qjrDeployment.jsonSerializer.Decode(qjobRes.Template.Raw, &deploymentGVK, nil) - if err != nil { - return nil, err - } - - deployment, ok := obj.(*apps.Deployment) - if !ok { - return nil, fmt.Errorf("Queuejob resource not defined as a Deployment") - } - - return deployment, nil - -} - -func (qjrDeployment *QueueJobResDeployment) createDeploymentWithControllerRef(namespace string, deployment *apps.Deployment, controllerRef *metav1.OwnerReference) error { - if controllerRef != nil { - deployment.OwnerReferences = append(deployment.OwnerReferences, *controllerRef) - } - - if _, err := qjrDeployment.clients.AppsV1().Deployments(namespace).Create(context.Background(), deployment, metav1.CreateOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrDeployment *QueueJobResDeployment) delDeployment(namespace string, name string) error { - if err := qjrDeployment.clients.AppsV1().Deployments(namespace).Delete(context.Background(), name, metav1.DeleteOptions{}); err != nil { - return err - } - return nil -} - -func (qjrDeployment *QueueJobResDeployment) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { - return nil -} - -func (qjrDeployment *QueueJobResDeployment) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - startTime := time.Now() - - defer func() { - klog.V(4).Infof("Finished syncing queue job resource %s (%v)", queuejob.Name, time.Now().Sub(startTime)) - }() - - _namespace, deploymentInQjr, deploymentsInEtcd, err := qjrDeployment.getDeploymentForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - deploymentLen := len(deploymentsInEtcd) - replicas := qjobRes.Replicas - - diff := int(replicas) - int(deploymentLen) - - klog.V(4).Infof("QJob: %s had %d Deployments and %d desired Deployments", queuejob.Name, deploymentLen, replicas) - - if diff > 0 { - //TODO: need set reference after Service has been really added - tmpDeployment := apps.Deployment{} - err = qjrDeployment.refManager.AddReference(qjobRes, &tmpDeployment) - if err != nil { - klog.Errorf("Cannot add reference to configmap resource %+v", err) - return err - } - if deploymentInQjr.Labels == nil { - deploymentInQjr.Labels = map[string]string{} - } - for k, v := range tmpDeployment.Labels { - deploymentInQjr.Labels[k] = v - } - deploymentInQjr.Labels[queueJobName] = queuejob.Name - if deploymentInQjr.Spec.Template.Labels == nil { - deploymentInQjr.Labels = map[string]string{} - } - deploymentInQjr.Spec.Template.Labels[queueJobName] = queuejob.Name - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := 0; i < diff; i++ { - go func() { - defer wait.Done() - - err := qjrDeployment.createDeploymentWithControllerRef(*_namespace, deploymentInQjr, metav1.NewControllerRef(queuejob, queueJobKind)) - - if err != nil && errors.IsTimeout(err) { - return - } - if err != nil { - defer utilruntime.HandleError(err) - } - }() - } - wait.Wait() - } - - return nil -} - -func (qjrDeployment *QueueJobResDeployment) getDeploymentForQueueJobRes(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) (*string, *apps.Deployment, []*apps.Deployment, error) { - - // Get "a" Deployment from AppWrapper Resource - deploymentInQjr, err := qjrDeployment.getDeploymentTemplate(qjobRes) - if err != nil { - klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) - return nil, nil, nil, err - } - - // Get Deployment"s" in Etcd Server - var _namespace *string - if deploymentInQjr.Namespace != "" { - _namespace = &deploymentInQjr.Namespace - } else { - _namespace = &queuejob.Namespace - } - - deploymentList, err := qjrDeployment.clients.AppsV1().Deployments(*_namespace).List(context.Background(), metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", queueJobName, queuejob.Name)}) - if err != nil { - return nil, nil, nil, err - } - deploymentsInEtcd := []*apps.Deployment{} - for i, _ := range deploymentList.Items { - deploymentsInEtcd = append(deploymentsInEtcd, &deploymentList.Items[i]) - } - - myDeploymentsInEtcd := []*apps.Deployment{} - for i, deployment := range deploymentsInEtcd { - if qjrDeployment.refManager.BelongTo(qjobRes, deployment) { - myDeploymentsInEtcd = append(myDeploymentsInEtcd, deploymentsInEtcd[i]) - } - } - - return _namespace, deploymentInQjr, myDeploymentsInEtcd, nil -} - -func (qjrDeployment *QueueJobResDeployment) deleteQueueJobResDeployments(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - _namespace, _, activeDeployments, err := qjrDeployment.getDeploymentForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - active := int32(len(activeDeployments)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrDeployment.delDeployment(*_namespace, activeDeployments[ix].Name); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, queue job %q/%q deadline exceeded", activeDeployments[ix].Name, *_namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -//Cleanup deletes all services -func (qjrDeployment *QueueJobResDeployment) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - return qjrDeployment.deleteQueueJobResDeployments(qjobRes, queuejob) -} diff --git a/pkg/controller/queuejobresources/interfaces.go b/pkg/controller/queuejobresources/interfaces.go index d442a2df4..fbb46a0d4 100644 --- a/pkg/controller/queuejobresources/interfaces.go +++ b/pkg/controller/queuejobresources/interfaces.go @@ -1,19 +1,4 @@ /* -Copyright 2014 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,21 +13,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package queuejobresources import ( qjobv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" ) // Interface is an abstract interface for queue job resource management. type Interface interface { - SyncQueueJob(queuejob *qjobv1.AppWrapper, qjobRes *qjobv1.AppWrapperResource) error UpdateQueueJobStatus(queuejob *qjobv1.AppWrapper) error - GetAggregatedResources(queuejob *qjobv1.AppWrapper) *clusterstateapi.Resource - GetAggregatedResourcesByPriority(priority float64, queuejob *qjobv1.AppWrapper) *clusterstateapi.Resource - //TODO: Add to calculate more accurate partial deployments while job is being realized -// GetAggregatedResourcesByPhase(phase v1.PodPhase, queuejob *qjobv1.AppWrapper) *clusterstateapi.Resource - Cleanup(queuejob *qjobv1.AppWrapper, qjobRes *qjobv1.AppWrapperResource) error + // TODO: Add to calculate more accurate partial deployments while job is being realized + // GetAggregatedResourcesByPhase(phase v1.PodPhase, queuejob *qjobv1.AppWrapper) *clusterstateapi.Resource Run(stopCh <-chan struct{}) } diff --git a/pkg/controller/queuejobresources/namespace/namespace.go b/pkg/controller/queuejobresources/namespace/namespace.go deleted file mode 100644 index b58f74e9b..000000000 --- a/pkg/controller/queuejobresources/namespace/namespace.go +++ /dev/null @@ -1,327 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package namespace - -import ( - "context" - "fmt" - - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - - "sync" - "time" - - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/klog/v2" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/informers" - corev1informer "k8s.io/client-go/informers/core/v1" - "k8s.io/client-go/kubernetes" - corelisters "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" -) - -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") -var queueJobName = "appwrapper.mcad.ibm.com" - -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - -//QueueJobResService contains service info -type QueueJobResNamespace struct { - clients *kubernetes.Clientset - arbclients *clientset.Clientset - // A store of services, populated by the serviceController - namespaceStore corelisters.NamespaceLister - namespaceInformer corev1informer.NamespaceInformer - rtScheme *runtime.Scheme - jsonSerializer *json.Serializer - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager -} - -//Register registers a queue job resource type -func Register(regs *queuejobresources.RegisteredResources) { - regs.Register(arbv1.ResourceTypeNamespace, func(config *rest.Config) queuejobresources.Interface { - return NewQueueJobResNamespace(config) - }) -} - -//NewQueueJobResService creates a service controller -func NewQueueJobResNamespace(config *rest.Config) queuejobresources.Interface { - qjrNamespace := &QueueJobResNamespace{ - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - } - - qjrNamespace.namespaceInformer = informers.NewSharedInformerFactory(qjrNamespace.clients, 0).Core().V1().Namespaces() - qjrNamespace.namespaceInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch obj.(type) { - case *v1.Namespace: - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: qjrNamespace.addNamespace, - UpdateFunc: qjrNamespace.updateNamespace, - DeleteFunc: qjrNamespace.deleteNamespace, - }, - }) - - qjrNamespace.rtScheme = runtime.NewScheme() - v1.AddToScheme(qjrNamespace.rtScheme) - - qjrNamespace.jsonSerializer = json.NewYAMLSerializer(json.DefaultMetaFactory, qjrNamespace.rtScheme, qjrNamespace.rtScheme) - - qjrNamespace.refManager = queuejobresources.NewLabelRefManager() - - return qjrNamespace -} - -// Run the main goroutine responsible for watching and services. -func (qjrNamespace *QueueJobResNamespace) Run(stopCh <-chan struct{}) { - - qjrNamespace.namespaceInformer.Informer().Run(stopCh) -} - -func (qjrNamespace *QueueJobResNamespace) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { - return clusterstateapi.EmptyResource() -} - -func (qjrNamespace *QueueJobResNamespace) addNamespace(obj interface{}) { - - return -} - -func (qjrNamespace *QueueJobResNamespace) updateNamespace(old, cur interface{}) { - - return -} - -func (qjrNamespace *QueueJobResNamespace) deleteNamespace(obj interface{}) { - - return -} - -func (qjrNamespace *QueueJobResNamespace) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - return total -} - -// Parse queue job api object to get Service template -func (qjrNamespace *QueueJobResNamespace) getNamespaceTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.Namespace, error) { - - namespaceGVK := schema.GroupVersion{Group: v1.GroupName, Version: "v1"}.WithKind("Namespace") - - obj, _, err := qjrNamespace.jsonSerializer.Decode(qjobRes.Template.Raw, &namespaceGVK, nil) - if err != nil { - return nil, err - } - - namespace, ok := obj.(*v1.Namespace) - if !ok { - return nil, fmt.Errorf("Queuejob resource not defined as a Namespace") - } - - return namespace, nil - -} - -func (qjrNamespace *QueueJobResNamespace) createNamespaceWithControllerRef(namespace *v1.Namespace, controllerRef *metav1.OwnerReference) error { - - // klog.V(4).Infof("==========create Namespace: %+v \n", namespace) - if controllerRef != nil { - namespace.OwnerReferences = append(namespace.OwnerReferences, *controllerRef) - } - - if _, err := qjrNamespace.clients.CoreV1().Namespaces().Create(context.Background(), namespace, metav1.CreateOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrNamespace *QueueJobResNamespace) delNamespace(name string) error { - - klog.V(4).Infof("==========delete namespace: %s \n", name) - if err := qjrNamespace.clients.CoreV1().Namespaces().Delete(context.Background(), name, metav1.DeleteOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrNamespace *QueueJobResNamespace) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { - return nil -} - -//SyncQueueJob syncs the services -func (qjrNamespace *QueueJobResNamespace) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - startTime := time.Now() - defer func() { - klog.V(4).Infof("Finished syncing queue job resource %s (%v)", queuejob.Name, time.Now().Sub(startTime)) - }() - - namespaces, err := qjrNamespace.getNamespaceForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - namespaceLen := len(namespaces) - replicas := qjobRes.Replicas - - diff := int(replicas) - int(namespaceLen) - - klog.V(4).Infof("QJob: %s had %d namespaces and %d desired namespaces", queuejob.Name, namespaceLen, replicas) - - if diff > 0 { - template, err := qjrNamespace.getNamespaceTemplate(qjobRes) - if err != nil { - klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) - return err - } - //TODO: need set reference after Service has been really added - tmpNamespace := v1.Namespace{} - err = qjrNamespace.refManager.AddReference(qjobRes, &tmpNamespace) - if err != nil { - klog.Errorf("Cannot add reference to namespace resource %+v", err) - return err - } - - if template.Labels == nil { - template.Labels = map[string]string{} - } - for k, v := range tmpNamespace.Labels { - template.Labels[k] = v - } - template.Labels[queueJobName] = queuejob.Name - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := 0; i < diff; i++ { - go func() { - defer wait.Done() - err := qjrNamespace.createNamespaceWithControllerRef(template, metav1.NewControllerRef(queuejob, queueJobKind)) - if err != nil && errors.IsTimeout(err) { - return - } - if err != nil { - defer utilruntime.HandleError(err) - } - }() - } - wait.Wait() - } - - return nil -} - -func (qjrNamespace *QueueJobResNamespace) getNamespaceForQueueJob(j *arbv1.AppWrapper) ([]*v1.Namespace, error) { - namespacelist, err := qjrNamespace.clients.CoreV1().Namespaces().List(context.Background(), metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", queueJobName, j.Name)}) - if err != nil { - return nil, err - } - - namespaces := []*v1.Namespace{} - for i, _ := range namespacelist.Items { - namespaces = append(namespaces, &namespacelist.Items[i]) - } - - return namespaces, nil - -} - -func (qjrNamespace *QueueJobResNamespace) getNamespaceForQueueJobRes(qjobRes *arbv1.AppWrapperResource, j *arbv1.AppWrapper) ([]*v1.Namespace, error) { - - namespaces, err := qjrNamespace.getNamespaceForQueueJob(j) - if err != nil { - return nil, err - } - - myNamespaces := []*v1.Namespace{} - for i, namespace := range namespaces { - if qjrNamespace.refManager.BelongTo(qjobRes, namespace) { - myNamespaces = append(myNamespaces, namespaces[i]) - } - } - - return myNamespaces, nil - -} - -func (qjrNamespace *QueueJobResNamespace) deleteQueueJobResNamespaces(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - activeNamespaces, err := qjrNamespace.getNamespaceForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - active := int32(len(activeNamespaces)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrNamespace.delNamespace(activeNamespaces[ix].Name); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, queue job %q/%q deadline exceeded", activeNamespaces[ix].Name, job.Namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -//Cleanup deletes all services -func (qjrNamespace *QueueJobResNamespace) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - return qjrNamespace.deleteQueueJobResNamespaces(qjobRes, queuejob) -} diff --git a/pkg/controller/queuejobresources/networkpolicy/networkpolicy.go b/pkg/controller/queuejobresources/networkpolicy/networkpolicy.go deleted file mode 100644 index 56240abbc..000000000 --- a/pkg/controller/queuejobresources/networkpolicy/networkpolicy.go +++ /dev/null @@ -1,325 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package networkpolicy - -import ( - "context" - "fmt" - - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/informers" - "k8s.io/client-go/kubernetes" - "k8s.io/klog/v2" - - "sync" - "time" - - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" - - networkingv1 "k8s.io/api/networking/v1" - networkingv1informer "k8s.io/client-go/informers/networking/v1" - networkingv1lister "k8s.io/client-go/listers/networking/v1" -) - -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") -var queueJobName = "appwrapper.mcad.ibm.com" - -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - -//QueueJobResService contains service info -type QueueJobResNetworkPolicy struct { - clients *kubernetes.Clientset - arbclients *clientset.Clientset - // A store of services, populated by the serviceController - networkpolicyStore networkingv1lister.NetworkPolicyLister - networkpolicyInformer networkingv1informer.NetworkPolicyInformer - rtScheme *runtime.Scheme - jsonSerializer *json.Serializer - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager -} - -//Register registers a queue job resource type -func Register(regs *queuejobresources.RegisteredResources) { - regs.Register(arbv1.ResourceTypeNetworkPolicy, func(config *rest.Config) queuejobresources.Interface { - return NewQueueJobResNetworkPolicy(config) - }) -} - -//NewQueueJobResService creates a service controller -func NewQueueJobResNetworkPolicy(config *rest.Config) queuejobresources.Interface { - qjrNetworkPolicy := &QueueJobResNetworkPolicy{ - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - } - - qjrNetworkPolicy.networkpolicyInformer = informers.NewSharedInformerFactory(qjrNetworkPolicy.clients, 0).Networking().V1().NetworkPolicies() - qjrNetworkPolicy.networkpolicyInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch obj.(type) { - case *networkingv1.NetworkPolicy: - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: qjrNetworkPolicy.addNetworkPolicy, - UpdateFunc: qjrNetworkPolicy.updateNetworkPolicy, - DeleteFunc: qjrNetworkPolicy.deleteNetworkPolicy, - }, - }) - - qjrNetworkPolicy.rtScheme = runtime.NewScheme() - // v1.AddToScheme(qjrNetworkPolicy.rtScheme) - networkingv1.AddToScheme(qjrNetworkPolicy.rtScheme) // Tonghoon - - qjrNetworkPolicy.jsonSerializer = json.NewYAMLSerializer(json.DefaultMetaFactory, qjrNetworkPolicy.rtScheme, qjrNetworkPolicy.rtScheme) - - qjrNetworkPolicy.refManager = queuejobresources.NewLabelRefManager() - - return qjrNetworkPolicy -} - -// Run the main goroutine responsible for watching and services. -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) Run(stopCh <-chan struct{}) { - - qjrNetworkPolicy.networkpolicyInformer.Informer().Run(stopCh) -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { - return clusterstateapi.EmptyResource() -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) addNetworkPolicy(obj interface{}) { - - return -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) updateNetworkPolicy(old, cur interface{}) { - - return -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) deleteNetworkPolicy(obj interface{}) { - - return -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - return total -} - -// Parse queue job api object to get Service template -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) getNetworkPolicyTemplate(qjobRes *arbv1.AppWrapperResource) (*networkingv1.NetworkPolicy, error) { - - networkpolicyGVK := schema.GroupVersion{Group: networkingv1.GroupName, Version: "v1"}.WithKind("NetworkPolicy") - obj, _, err := qjrNetworkPolicy.jsonSerializer.Decode(qjobRes.Template.Raw, &networkpolicyGVK, nil) - if err != nil { - return nil, err - } - - networkpolicy, ok := obj.(*networkingv1.NetworkPolicy) - if !ok { - return nil, fmt.Errorf("Queuejob resource not defined as a NetworkPolicy") - } - - return networkpolicy, nil - -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) createNetworkPolicyWithControllerRef(namespace string, networkpolicy *networkingv1.NetworkPolicy, controllerRef *metav1.OwnerReference) error { - - if controllerRef != nil { - networkpolicy.OwnerReferences = append(networkpolicy.OwnerReferences, *controllerRef) - } - - if _, err := qjrNetworkPolicy.clients.NetworkingV1().NetworkPolicies(namespace).Create(context.Background(), networkpolicy, metav1.CreateOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) delNetworkPolicy(namespace string, name string) error { - - klog.V(4).Infof("==========delete networkpolicy: %s \n", name) - if err := qjrNetworkPolicy.clients.NetworkingV1().NetworkPolicies(namespace).Delete(context.Background(), name, metav1.DeleteOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { - return nil -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - startTime := time.Now() - - defer func() { - klog.V(4).Infof("Finished syncing queue job resource %s (%v)", queuejob.Name, time.Now().Sub(startTime)) - }() - - _namespace, networkPolicyInQjr, networkPoliciesInEtcd, err := qjrNetworkPolicy.getNetworkPolicyForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - networkPolicyLen := len(networkPoliciesInEtcd) - replicas := qjobRes.Replicas - - diff := int(replicas) - int(networkPolicyLen) - - klog.V(4).Infof("QJob: %s had %d NetworkPolicies and %d desired NetworkPolicies", queuejob.Name, networkPolicyLen, replicas) - - if diff > 0 { - //TODO: need set reference after Service has been really added - tmpNetworkPolicy := networkingv1.NetworkPolicy{} - err = qjrNetworkPolicy.refManager.AddReference(qjobRes, &tmpNetworkPolicy) - if err != nil { - klog.Errorf("Cannot add reference to configmap resource %+v", err) - return err - } - - if networkPolicyInQjr.Labels == nil { - networkPolicyInQjr.Labels = map[string]string{} - } - for k, v := range tmpNetworkPolicy.Labels { - networkPolicyInQjr.Labels[k] = v - } - networkPolicyInQjr.Labels[queueJobName] = queuejob.Name - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := 0; i < diff; i++ { - go func() { - defer wait.Done() - - err := qjrNetworkPolicy.createNetworkPolicyWithControllerRef(*_namespace, networkPolicyInQjr, metav1.NewControllerRef(queuejob, queueJobKind)) - - if err != nil && errors.IsTimeout(err) { - return - } - if err != nil { - defer utilruntime.HandleError(err) - } - }() - } - wait.Wait() - } - - return nil -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) getNetworkPolicyForQueueJobRes(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) (*string, *networkingv1.NetworkPolicy, []*networkingv1.NetworkPolicy, error) { - - // Get "a" NetworkPolicy from AppWrapper Resource - networkPolicyInQjr, err := qjrNetworkPolicy.getNetworkPolicyTemplate(qjobRes) - if err != nil { - klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) - return nil, nil, nil, err - } - - // Get NetworkPolicy"s" in Etcd Server - var _namespace *string - if networkPolicyInQjr.Namespace != "" { - _namespace = &networkPolicyInQjr.Namespace - } else { - _namespace = &queuejob.Namespace - } - networkPolicyList, err := qjrNetworkPolicy.clients.NetworkingV1().NetworkPolicies(*_namespace).List(context.Background(), metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", queueJobName, queuejob.Name)}) - if err != nil { - return nil, nil, nil, err - } - networkPoliciesInEtcd := []*networkingv1.NetworkPolicy{} - for i, _ := range networkPolicyList.Items { - networkPoliciesInEtcd = append(networkPoliciesInEtcd, &networkPolicyList.Items[i]) - } - myNetworkPoliciesInEtcd := []*networkingv1.NetworkPolicy{} - for i, networkPolicy := range networkPoliciesInEtcd { - if qjrNetworkPolicy.refManager.BelongTo(qjobRes, networkPolicy) { - myNetworkPoliciesInEtcd = append(myNetworkPoliciesInEtcd, networkPoliciesInEtcd[i]) - } - } - - return _namespace, networkPolicyInQjr, myNetworkPoliciesInEtcd, nil -} - -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) deleteQueueJobResNetworkPolicies(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - _namespace, _, activeNetworkPolicies, err := qjrNetworkPolicy.getNetworkPolicyForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - active := int32(len(activeNetworkPolicies)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrNetworkPolicy.delNetworkPolicy(*_namespace, activeNetworkPolicies[ix].Name); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, queue job %q/%q deadline exceeded", activeNetworkPolicies[ix].Name, *_namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -//Cleanup deletes all services -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - return qjrNetworkPolicy.deleteQueueJobResNetworkPolicies(qjobRes, queuejob) -} diff --git a/pkg/controller/queuejobresources/persistentvolume/persistentvolume.go b/pkg/controller/queuejobresources/persistentvolume/persistentvolume.go deleted file mode 100644 index 02248b376..000000000 --- a/pkg/controller/queuejobresources/persistentvolume/persistentvolume.go +++ /dev/null @@ -1,339 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package persistentvolume - -import ( - "context" - "fmt" - - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/klog/v2" - - "sync" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/informers" - corev1informer "k8s.io/client-go/informers/core/v1" - "k8s.io/client-go/kubernetes" - corelisters "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" -) - -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") -var queueJobName = "appwrapper.mcad.ibm.com" - -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - -//QueueJobResService contains service info -type QueueJobResPersistentvolume struct { - clients *kubernetes.Clientset - arbclients *clientset.Clientset - // A store of services, populated by the serviceController - persistentvolumeStore corelisters.PersistentVolumeLister - persistentvolumeInformer corev1informer.PersistentVolumeInformer - rtScheme *runtime.Scheme - jsonSerializer *json.Serializer - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager -} - -//Register registers a queue job resource type -func Register(regs *queuejobresources.RegisteredResources) { - regs.Register(arbv1.ResourceTypePersistentVolume, func(config *rest.Config) queuejobresources.Interface { - return NewQueueJobResPersistentvolume(config) - }) -} - -//NewQueueJobResService creates a service controller -func NewQueueJobResPersistentvolume(config *rest.Config) queuejobresources.Interface { - qjrPersistentvolume := &QueueJobResPersistentvolume{ - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - } - - qjrPersistentvolume.persistentvolumeInformer = informers.NewSharedInformerFactory(qjrPersistentvolume.clients, 0).Core().V1().PersistentVolumes() - qjrPersistentvolume.persistentvolumeInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch obj.(type) { - case *v1.PersistentVolume: - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: qjrPersistentvolume.addPersistentVolume, - UpdateFunc: qjrPersistentvolume.updatePersistentVolume, - DeleteFunc: qjrPersistentvolume.deletePersistentVolume, - }, - }) - - qjrPersistentvolume.rtScheme = runtime.NewScheme() - v1.AddToScheme(qjrPersistentvolume.rtScheme) - - qjrPersistentvolume.jsonSerializer = json.NewYAMLSerializer(json.DefaultMetaFactory, qjrPersistentvolume.rtScheme, qjrPersistentvolume.rtScheme) - - qjrPersistentvolume.refManager = queuejobresources.NewLabelRefManager() - - return qjrPersistentvolume -} - -// Run the main goroutine responsible for watching and services. -func (qjrPersistentvolume *QueueJobResPersistentvolume) Run(stopCh <-chan struct{}) { - - qjrPersistentvolume.persistentvolumeInformer.Informer().Run(stopCh) -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { - return clusterstateapi.EmptyResource() -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) addPersistentVolume(obj interface{}) { - - return -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) updatePersistentVolume(old, cur interface{}) { - - return -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) deletePersistentVolume(obj interface{}) { - - return -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - return total -} - -// Parse queue job api object to get Service template -func (qjrPersistentvolume *QueueJobResPersistentvolume) getPersistentVolumeTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.PersistentVolume, error) { - - persistentvolumeGVK := schema.GroupVersion{Group: v1.GroupName, Version: "v1"}.WithKind("PersistentVolume") - - obj, _, err := qjrPersistentvolume.jsonSerializer.Decode(qjobRes.Template.Raw, &persistentvolumeGVK, nil) - if err != nil { - return nil, err - } - - persistentvolume, ok := obj.(*v1.PersistentVolume) - if !ok { - return nil, fmt.Errorf("Queuejob resource not defined as a PersistentVolume") - } - - return persistentvolume, nil - -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) createPersistentVolumeWithControllerRef(persistentvolume *v1.PersistentVolume, controllerRef *metav1.OwnerReference) error { - - if controllerRef != nil { - persistentvolume.OwnerReferences = append(persistentvolume.OwnerReferences, *controllerRef) - } - - if _, err := qjrPersistentvolume.clients.CoreV1().PersistentVolumes().Create(context.Background(), persistentvolume, metav1.CreateOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) delPersistentVolume(name string) error { - - klog.V(4).Infof("==========delete persistentvolume: %s \n", name) - if err := qjrPersistentvolume.clients.CoreV1().PersistentVolumes().Delete(context.Background(), name, metav1.DeleteOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { - return nil -} - -//SyncQueueJob syncs the services -func (qjrPersistentvolume *QueueJobResPersistentvolume) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - startTime := time.Now() - defer func() { - klog.V(4).Infof("Finished syncing queue job resource %s (%v)", queuejob.Name, time.Now().Sub(startTime)) - }() - - persistentvolumes, err := qjrPersistentvolume.getPersistentVolumeForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - persistentvolumeLen := len(persistentvolumes) - replicas := qjobRes.Replicas - - diff := int(replicas) - int(persistentvolumeLen) - - klog.V(4).Infof("QJob: %s had %d persistentvolumes and %d desired persistentvolumes", queuejob.Name, persistentvolumeLen, replicas) - - if diff > 0 { - template, err := qjrPersistentvolume.getPersistentVolumeTemplate(qjobRes) - if err != nil { - klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) - return err - } - //TODO: need set reference after Service has been really added - tmpPersistentVolume := v1.PersistentVolume{} - err = qjrPersistentvolume.refManager.AddReference(qjobRes, &tmpPersistentVolume) - if err != nil { - klog.Errorf("Cannot add reference to persistentvolume resource %+v", err) - return err - } - - if template.Labels == nil { - template.Labels = map[string]string{} - } - for k, v := range tmpPersistentVolume.Labels { - template.Labels[k] = v - } - template.Labels[queueJobName] = queuejob.Name - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := 0; i < diff; i++ { - go func() { - defer wait.Done() - err := qjrPersistentvolume.createPersistentVolumeWithControllerRef(template, metav1.NewControllerRef(queuejob, queueJobKind)) - if err != nil && errors.IsTimeout(err) { - return - } - if err != nil { - defer utilruntime.HandleError(err) - } - }() - } - wait.Wait() - } - - return nil -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) getPersistentVolumeForQueueJob(j *arbv1.AppWrapper) ([]*v1.PersistentVolume, error) { - persistentvolumelist, err := qjrPersistentvolume.clients.CoreV1().PersistentVolumes().List(context.Background(), metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", queueJobName, j.Name)}) - if err != nil { - return nil, err - } - - persistentvolumes := []*v1.PersistentVolume{} - for i, _ := range persistentvolumelist.Items { - persistentvolumes = append(persistentvolumes, &persistentvolumelist.Items[i]) - } - - // for i, persistentvolume := range persistentvolumelist.Items { - // metaPersistentVolume, err := meta.Accessor(&persistentvolume) - // if err != nil { - // return nil, err - // } - // - // controllerRef := metav1.GetControllerOf(metaPersistentVolume) - // if controllerRef != nil { - // if controllerRef.UID == j.UID { - // persistentvolumes = append(persistentvolumes, &persistentvolumelist.Items[i]) - // } - // } - // } - return persistentvolumes, nil - -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) getPersistentVolumeForQueueJobRes(qjobRes *arbv1.AppWrapperResource, j *arbv1.AppWrapper) ([]*v1.PersistentVolume, error) { - - persistentvolumes, err := qjrPersistentvolume.getPersistentVolumeForQueueJob(j) - if err != nil { - return nil, err - } - - myPersistentVolumes := []*v1.PersistentVolume{} - for i, persistentvolume := range persistentvolumes { - if qjrPersistentvolume.refManager.BelongTo(qjobRes, persistentvolume) { - myPersistentVolumes = append(myPersistentVolumes, persistentvolumes[i]) - } - } - - return myPersistentVolumes, nil - -} - -func (qjrPersistentvolume *QueueJobResPersistentvolume) deleteQueueJobResPersistentVolumes(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - activePersistentVolumes, err := qjrPersistentvolume.getPersistentVolumeForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - active := int32(len(activePersistentVolumes)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrPersistentvolume.delPersistentVolume(activePersistentVolumes[ix].Name); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, queue job %q/%q deadline exceeded", activePersistentVolumes[ix].Name, job.Namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -//Cleanup deletes all services -func (qjrPersistentvolume *QueueJobResPersistentvolume) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - return qjrPersistentvolume.deleteQueueJobResPersistentVolumes(qjobRes, queuejob) -} diff --git a/pkg/controller/queuejobresources/persistentvolumeclaim/persistentvolumeclaim.go b/pkg/controller/queuejobresources/persistentvolumeclaim/persistentvolumeclaim.go deleted file mode 100644 index 710e24e8f..000000000 --- a/pkg/controller/queuejobresources/persistentvolumeclaim/persistentvolumeclaim.go +++ /dev/null @@ -1,324 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package persistentvolumeclaim - -import ( - "context" - "fmt" - - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - - "sync" - "time" - - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/informers" - corev1informer "k8s.io/client-go/informers/core/v1" - "k8s.io/client-go/kubernetes" - corelisters "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" - "k8s.io/klog/v2" -) - -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") -var queueJobName = "appwrapper.mcad.ibm.com" - -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - -//QueueJobResService contains service info -type QueueJobResPersistentVolumeClaim struct { - clients *kubernetes.Clientset - arbclients *clientset.Clientset - // A store of services, populated by the serviceController - persistentvolumeclaimStore corelisters.PersistentVolumeClaimLister - persistentvolumeclaimInformer corev1informer.PersistentVolumeClaimInformer - rtScheme *runtime.Scheme - jsonSerializer *json.Serializer - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager -} - -//Register registers a queue job resource type -func Register(regs *queuejobresources.RegisteredResources) { - regs.Register(arbv1.ResourceTypePersistentVolumeClaim, func(config *rest.Config) queuejobresources.Interface { - return NewQueueJobResPersistentVolumeClaim(config) - }) -} - -//NewQueueJobResService creates a service controller -func NewQueueJobResPersistentVolumeClaim(config *rest.Config) queuejobresources.Interface { - qjrPersistentVolumeClaim := &QueueJobResPersistentVolumeClaim{ - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - } - - qjrPersistentVolumeClaim.persistentvolumeclaimInformer = informers.NewSharedInformerFactory(qjrPersistentVolumeClaim.clients, 0).Core().V1().PersistentVolumeClaims() - qjrPersistentVolumeClaim.persistentvolumeclaimInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch obj.(type) { - case *v1.PersistentVolumeClaim: - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: qjrPersistentVolumeClaim.addPersistentVolumeClaim, - UpdateFunc: qjrPersistentVolumeClaim.updatePersistentVolumeClaim, - DeleteFunc: qjrPersistentVolumeClaim.deletePersistentVolumeClaim, - }, - }) - - qjrPersistentVolumeClaim.rtScheme = runtime.NewScheme() - v1.AddToScheme(qjrPersistentVolumeClaim.rtScheme) - - qjrPersistentVolumeClaim.jsonSerializer = json.NewYAMLSerializer(json.DefaultMetaFactory, qjrPersistentVolumeClaim.rtScheme, qjrPersistentVolumeClaim.rtScheme) - - qjrPersistentVolumeClaim.refManager = queuejobresources.NewLabelRefManager() - - return qjrPersistentVolumeClaim -} - -// Run the main goroutine responsible for watching and services. -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) Run(stopCh <-chan struct{}) { - - qjrPersistentVolumeClaim.persistentvolumeclaimInformer.Informer().Run(stopCh) -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { - return clusterstateapi.EmptyResource() -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) addPersistentVolumeClaim(obj interface{}) { - - return -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) updatePersistentVolumeClaim(old, cur interface{}) { - - return -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) deletePersistentVolumeClaim(obj interface{}) { - - return -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - return total -} - -// Parse queue job api object to get Service template -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) getPersistentVolumeClaimTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.PersistentVolumeClaim, error) { - - persistentvolumeclaimGVK := schema.GroupVersion{Group: v1.GroupName, Version: "v1"}.WithKind("PersistentVolumeClaim") - - obj, _, err := qjrPersistentVolumeClaim.jsonSerializer.Decode(qjobRes.Template.Raw, &persistentvolumeclaimGVK, nil) - if err != nil { - return nil, err - } - - persistentvolumeclaim, ok := obj.(*v1.PersistentVolumeClaim) - if !ok { - return nil, fmt.Errorf("Queuejob resource not defined as a PersistentVolumeClaim") - } - - return persistentvolumeclaim, nil - -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) createPersistentVolumeClaimWithControllerRef(namespace string, persistentvolumeclaim *v1.PersistentVolumeClaim, controllerRef *metav1.OwnerReference) error { - - if controllerRef != nil { - persistentvolumeclaim.OwnerReferences = append(persistentvolumeclaim.OwnerReferences, *controllerRef) - } - - if _, err := qjrPersistentVolumeClaim.clients.CoreV1().PersistentVolumeClaims(namespace).Create(context.Background(), persistentvolumeclaim, metav1.CreateOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) delPersistentVolumeClaim(namespace string, name string) error { - - klog.V(4).Infof("==========delete persistentvolumeclaim: %s \n", name) - if err := qjrPersistentVolumeClaim.clients.CoreV1().PersistentVolumeClaims(namespace).Delete(context.Background(), name, metav1.DeleteOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { - return nil -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - startTime := time.Now() - - defer func() { - klog.V(4).Infof("Finished syncing queue job resource %s (%v)", queuejob.Name, time.Now().Sub(startTime)) - }() - - _namespace, persistentVolumeClaimInQjr, persistentVolumeClaimsInEtcd, err := qjrPersistentVolumeClaim.getPersistentVolumeClaimForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - persistentVolumeClaimLen := len(persistentVolumeClaimsInEtcd) - replicas := qjobRes.Replicas - - diff := int(replicas) - int(persistentVolumeClaimLen) - - klog.V(4).Infof("QJob: %s had %d PersistVolumeClaims and %d desired PersistVolumeClaims", queuejob.Name, persistentVolumeClaimLen, replicas) - - if diff > 0 { - //TODO: need set reference after Service has been really added - tmpPersistentVolumeClaim := v1.PersistentVolumeClaim{} - err = qjrPersistentVolumeClaim.refManager.AddReference(qjobRes, &tmpPersistentVolumeClaim) - if err != nil { - klog.Errorf("Cannot add reference to configmap resource %+v", err) - return err - } - - if persistentVolumeClaimInQjr.Labels == nil { - persistentVolumeClaimInQjr.Labels = map[string]string{} - } - for k, v := range tmpPersistentVolumeClaim.Labels { - persistentVolumeClaimInQjr.Labels[k] = v - } - persistentVolumeClaimInQjr.Labels[queueJobName] = queuejob.Name - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := 0; i < diff; i++ { - go func() { - defer wait.Done() - - err := qjrPersistentVolumeClaim.createPersistentVolumeClaimWithControllerRef(*_namespace, persistentVolumeClaimInQjr, metav1.NewControllerRef(queuejob, queueJobKind)) - - if err != nil && errors.IsTimeout(err) { - return - } - if err != nil { - defer utilruntime.HandleError(err) - } - }() - } - wait.Wait() - } - - return nil -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) getPersistentVolumeClaimForQueueJobRes(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) (*string, *v1.PersistentVolumeClaim, []*v1.PersistentVolumeClaim, error) { - - // Get "a" PersistentVolumeClaim from AppWrapper Resource - persistentVolumeClaimInQjr, err := qjrPersistentVolumeClaim.getPersistentVolumeClaimTemplate(qjobRes) - if err != nil { - klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) - return nil, nil, nil, err - } - - // Get PersistentVolumeClaim"s" in Etcd Server - var _namespace *string - if persistentVolumeClaimInQjr.Namespace != "" { - _namespace = &persistentVolumeClaimInQjr.Namespace - } else { - _namespace = &queuejob.Namespace - } - persistentVolumeClaimList, err := qjrPersistentVolumeClaim.clients.CoreV1().PersistentVolumeClaims(*_namespace).List(context.Background(), metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", queueJobName, queuejob.Name)}) - if err != nil { - return nil, nil, nil, err - } - persistentVolumeClaimsInEtcd := []*v1.PersistentVolumeClaim{} - for i, _ := range persistentVolumeClaimList.Items { - persistentVolumeClaimsInEtcd = append(persistentVolumeClaimsInEtcd, &persistentVolumeClaimList.Items[i]) - } - - myPersistentVolumeClaimsInEtcd := []*v1.PersistentVolumeClaim{} - for i, persistentVolumeClaim := range persistentVolumeClaimsInEtcd { - if qjrPersistentVolumeClaim.refManager.BelongTo(qjobRes, persistentVolumeClaim) { - myPersistentVolumeClaimsInEtcd = append(myPersistentVolumeClaimsInEtcd, persistentVolumeClaimsInEtcd[i]) - } - } - - return _namespace, persistentVolumeClaimInQjr, myPersistentVolumeClaimsInEtcd, nil -} - -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) deleteQueueJobResPersistentVolumeClaims(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - _namespace, _, activePersistentVolumeClaims, err := qjrPersistentVolumeClaim.getPersistentVolumeClaimForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - active := int32(len(activePersistentVolumeClaims)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrPersistentVolumeClaim.delPersistentVolumeClaim(*_namespace, activePersistentVolumeClaims[ix].Name); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, queue job %q/%q deadline exceeded", activePersistentVolumeClaims[ix].Name, *_namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -//Cleanup deletes all services -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - return qjrPersistentVolumeClaim.deleteQueueJobResPersistentVolumeClaims(qjobRes, queuejob) -} diff --git a/pkg/controller/queuejobresources/pod/pod.go b/pkg/controller/queuejobresources/pod/pod.go index c9f033f8a..62abd4783 100644 --- a/pkg/controller/queuejobresources/pod/pod.go +++ b/pkg/controller/queuejobresources/pod/pod.go @@ -1,19 +1,4 @@ /* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,28 +13,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package pod import ( - "context" "fmt" - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/maputils" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - - "sync" - "time" - - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/client-go/informers" corev1informer "k8s.io/client-go/informers/core/v1" "k8s.io/client-go/kubernetes" @@ -57,20 +30,22 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" + + arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" + clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" + "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/maputils" + "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" ) -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") var queueJobName = "appwrapper.mcad.ibm.com" const ( // QueueJobNameLabel label string for queuejob name QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" ) -//QueueJobResPod Controller for QueueJob pods +// QueueJobResPod Controller for QueueJob pods type QueueJobResPod struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -91,7 +66,7 @@ type QueueJobResPod struct { jsonSerializer *json.Serializer // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager + // refManager queuejobresources.RefManager // A counter that store the current terminating pods no of QueueJob // this is used to avoid to re-create the pods of a QueueJob before // all the old pods are terminated @@ -105,7 +80,7 @@ func Register(regs *queuejobresources.RegisteredResources) { }) } -//NewQueueJobResPod Creates a new controller for QueueJob pods +// NewQueueJobResPod Creates a new controller for QueueJob pods func NewQueueJobResPod(config *rest.Config) queuejobresources.Interface { // create k8s clientset @@ -143,14 +118,11 @@ func NewQueueJobResPod(config *rest.Config) queuejobresources.Interface { qjrPod.podStore = qjrPod.podInformer.Lister() qjrPod.podSynced = qjrPod.podInformer.Informer().HasSynced - qjrPod.refManager = queuejobresources.NewLabelRefManager() - return qjrPod } // Run the main goroutine responsible for watching and pods. func (qjrPod *QueueJobResPod) Run(stopCh <-chan struct{}) { - qjrPod.podInformer.Informer().Run(stopCh) } @@ -205,24 +177,6 @@ func isPodActive(p *v1.Pod) bool { p.DeletionTimestamp == nil } -//SyncQueueJob : method to sync the resources of this job -func (qjrPod *QueueJobResPod) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - // check if there are still terminating pods for this QueueJob - //counter, ok := qjrPod.deletedPodsCounter.Get(fmt.Sprintf("%s/%s", queuejob.Namespace, queuejob.Name)) - //if ok && counter >= 0 { - // return fmt.Errorf("There are still terminating pods for QueueJob %s/%s, can not sync it now", queuejob.Namespace, queuejob.Name) - //} - - pods, err := qjrPod.getPodsForQueueJob(queuejob) - if err != nil { - return err - } - - err = qjrPod.manageQueueJob(queuejob, pods, qjobRes) - - return err -} - func (qjrPod *QueueJobResPod) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { sel := &metav1.LabelSelector{ @@ -259,7 +213,7 @@ func (qjrPod *QueueJobResPod) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) e queuejob.Status.Running = running queuejob.Status.Succeeded = succeeded queuejob.Status.Failed = failed - //Total resources by all running pods + // Total resources by all running pods queuejob.Status.TotalGPU = totalResourcesConsumedForPodPhases.GPU queuejob.Status.TotalCPU = totalResourcesConsumedForPodPhases.MilliCPU queuejob.Status.TotalMemory = totalResourcesConsumedForPodPhases.Memory @@ -273,387 +227,7 @@ func (qjrPod *QueueJobResPod) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) e return nil } -// manageQueueJob is the core method responsible for managing the number of running -// pods according to what is specified in the job.Spec. -// Does NOT modify . -func (qjrPod *QueueJobResPod) manageQueueJob(qj *arbv1.AppWrapper, pods []*v1.Pod, ar *arbv1.AppWrapperResource) error { - var err error - replicas := 0 - if qj.Spec.AggrResources.Items != nil { - // we call clean-up for each controller - for _, ar := range qj.Spec.AggrResources.Items { - if ar.Type == arbv1.ResourceTypePod { - replicas = int(ar.Replicas) - } - } - } - running := int32(queuejobresources.FilterPods(pods, v1.PodRunning)) - pending := int32(queuejobresources.FilterPods(pods, v1.PodPending)) - succeeded := int32(queuejobresources.FilterPods(pods, v1.PodSucceeded)) - failed := int32(queuejobresources.FilterPods(pods, v1.PodFailed)) - - klog.Infof("[manageQueueJob] There are %d pods of QueueJob %s: replicas: %d pending %d, running %d, succeeded %d, failed %d", - len(pods), qj.Name, replicas, pending, running, succeeded, failed) - - ss, err := qjrPod.arbclients.ArbV1().SchedulingSpecs(qj.Namespace).List(metav1.ListOptions{ - FieldSelector: fmt.Sprintf("metadata.name=%s", qj.Name), - }) - - if len(ss.Items) == 0 { - schedSpc := createQueueJobSchedulingSpec(qj) - _, err := qjrPod.arbclients.ArbV1().SchedulingSpecs(qj.Namespace).Create(schedSpc) - if err != nil { - klog.Errorf("Failed to create SchedulingSpec for QueueJob %v/%v: %v", - qj.Namespace, qj.Name, err) - } - } else { - klog.V(3).Infof("There's %v SchedulingSpec for QueueJob %v/%v", - len(ss.Items), qj.Namespace, qj.Name) - } - - // Create pod if necessary - if diff := int32(replicas) - pending - running - succeeded; diff > 0 { - klog.V(3).Infof("[manageQueueJob] Try to create %v Pods for QueueJob %v/%v", diff, qj.Namespace, qj.Name) - var errs []error - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := int32(0); i < diff; i++ { - go func(ix int32) { - defer wait.Done() - newPod := qjrPod.createQueueJobPod(qj, ix, ar) - - if newPod == nil { - err := fmt.Errorf("Job resource template item not define as a PodTemplate") - klog.Errorf("Failed to create a pod for Job %s, error: %#v.", qj.Name, err) - errs = append(errs, err) - } else { - _, err := qjrPod.clients.CoreV1().Pods(newPod.Namespace).Create(context.Background(), newPod, metav1.CreateOptions{}) - if err != nil { - // Failed to create Pod, wait a moment and then create it again - // This is to ensure all pods under the same QueueJob created - // So gang-scheduling could schedule the QueueJob successfully - klog.Errorf("Failed to create pod %s for QueueJob %s, err %#v", - newPod.Name, qj.Name, err) - errs = append(errs, err) - } - } - }(i) - } - wait.Wait() - - if len(errs) != 0 { - return fmt.Errorf("failed to create %d pods of %d", len(errs), diff) - } - } - - qj.Status.Pending = pending - qj.Status.Running = running - qj.Status.Succeeded = succeeded - qj.Status.Failed = failed - return err -} - -func (qjrPod *QueueJobResPod) getPodsForQueueJob(qj *arbv1.AppWrapper) ([]*v1.Pod, error) { - sel := &metav1.LabelSelector{ - MatchLabels: map[string]string{ - queueJobName: qj.Name, - }, - } - selector, err := metav1.LabelSelectorAsSelector(sel) - if err != nil { - return nil, fmt.Errorf("couldn't convert QueueJob selector: %v", err) - } - - // List all pods under QueueJob - pods, errt := qjrPod.podStore.Pods(qj.Namespace).List(selector) - if errt != nil { - return nil, errt - } - - return pods, nil -} - -// manageQueueJobPods is the core method responsible for managing the number of running -// pods according to what is specified in the job.Spec. This is a controller for all pods specified in the QJ template -// Does NOT modify . -func (qjrPod *QueueJobResPod) manageQueueJobPods(activePods []*v1.Pod, succeeded int32, qj *arbv1.AppWrapper, ar *arbv1.AppWrapperResource) (bool, error) { - jobDone := false - var err error - active := int32(len(activePods)) - - replicas := 0 - if qj.Spec.AggrResources.Items != nil { - // we call clean-up for each controller - for _, ar := range qj.Spec.AggrResources.Items { - if ar.Type == arbv1.ResourceTypePod { - replicas = replicas + 1 - } - } - } - - if active+succeeded > int32(replicas) { - // the QueueJob replicas is reduce by user, terminated all pods for gang scheduling - // and re-create pods for the queuejob in next loop - jobDone = false - // TODO(jinzhejz): need make sure manage this QueueJob after all old pods are terminated - err = qjrPod.terminatePodsForQueueJob(qj) - } else if active+succeeded == int32(replicas) { - // pod number match QueueJob replicas perfectly - if succeeded == int32(replicas) { - // all pods exit successfully - jobDone = true - } else { - // some pods are still running - jobDone = false - } - } else if active+succeeded < int32(replicas) { - if active+succeeded == 0 { - // it is a new QueueJob, create pods for it - diff := int32(replicas) - active - succeeded - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := int32(0); i < diff; i++ { - go func(ix int32) { - defer wait.Done() - newPod := qjrPod.createQueueJobPod(qj, ix, ar) - if newPod == nil { - err = fmt.Errorf("Job resource template item not define as a PodTemplate") - klog.Errorf("Failed to create pod %s for Job %s, err %#v", - newPod.Name, qj.Name, err) - } else { - for { - _, err := qjrPod.clients.CoreV1().Pods(newPod.Namespace).Create(context.Background(), newPod, metav1.CreateOptions{}) - if err == nil { - // Create Pod successfully - break - } else { - // Failed to create Pod, wait a moment and then create it again - // This is to ensure all pods under the same QueueJob created - // So gang-scheduling could schedule the QueueJob successfully - klog.Warningf("Failed to create pod %s for Job %s, err %#v, wait 2 seconds and re-create it", newPod.Name, qj.Name, err) - time.Sleep(2 * time.Second) - } - } - } - }(i) - } - wait.Wait() - jobDone = false - } else if active+succeeded > 0 { - // the QueueJob replicas is reduce by user, terminated all pods for gang scheduling - // and re-create pods for the queuejob in next loop - jobDone = false - // TODO(jinzhejz): need make sure manage this QueueJob after all old pods are terminated - err = qjrPod.terminatePodsForQueueJob(qj) - } - } - - return jobDone, err -} - -func (qjrPod *QueueJobResPod) terminatePodsForQueueJob(qj *arbv1.AppWrapper) error { - - pods, err := qjrPod.getPodsForQueueJob(qj) - if len(pods) == 0 || err != nil { - return err - } - - qjrPod.deletedPodsCounter.Set(fmt.Sprintf("%s/%s", qj.Namespace, qj.Name), len(pods)) - - wait := sync.WaitGroup{} - wait.Add(len(pods)) - for _, pod := range pods { - go func(p *v1.Pod) { - defer wait.Done() - err := qjrPod.clients.CoreV1().Pods(p.Namespace).Delete(context.Background(), p.Name, metav1.DeleteOptions{}) - if err != nil { - klog.Warning("Fail to delete pod %s for QueueJob %s/%s", p.Name, qj.Namespace, qj.Name) - qjrPod.deletedPodsCounter.DecreaseCounter(fmt.Sprintf("%s/%s", qj.Namespace, qj.Name)) - } - }(pod) - } - wait.Wait() - - return nil -} - -func (qjrPod *QueueJobResPod) getPodsForQueueJobRes(qjobRes *arbv1.AppWrapperResource, j *arbv1.AppWrapper) ([]*v1.Pod, error) { - - pods, err := qjrPod.getPodsForQueueJob(j) - if err != nil { - return nil, err - } - - myPods := []*v1.Pod{} - for i, pod := range pods { - if qjrPod.refManager.BelongTo(qjobRes, pod) { - myPods = append(myPods, pods[i]) - } - } - - return myPods, nil - -} - -func generateUUID() string { - id := uuid.NewUUID() - - return fmt.Sprintf("%s", id) -} - -func (qjrPod *QueueJobResPod) deleteQueueJobResPods(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - pods, err := qjrPod.getPodsForQueueJob(queuejob) - if err != nil { - return err - } - - klog.Infof("I have found pods for QueueJob: %v \n", len(pods)) - - activePods := filterActivePods(pods) - active := int32(len(activePods)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrPod.clients.CoreV1().Pods(queuejob.Namespace).Delete(context.Background(), activePods[ix].Name, metav1.DeleteOptions{}); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, queue job %q/%q deadline exceeded", activePods[ix].Name, job.Namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -func createQueueJobSchedulingSpec(qj *arbv1.AppWrapper) *arbv1.SchedulingSpec { - return &arbv1.SchedulingSpec{ - ObjectMeta: metav1.ObjectMeta{ - Name: qj.Name, - Namespace: qj.Namespace, - OwnerReferences: []metav1.OwnerReference{ - *metav1.NewControllerRef(qj, queueJobKind), - }, - }, - Spec: qj.Spec.SchedSpec, - } -} - -//GetPodTemplate Parse queue job api object to get Pod template -func (qjrPod *QueueJobResPod) GetPodTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.PodTemplateSpec, error) { - - podGVK := schema.GroupVersion{Group: v1.GroupName, Version: "v1"}.WithKind("PodTemplate") - - obj, _, err := qjrPod.jsonSerializer.Decode(qjobRes.Template.Raw, &podGVK, nil) - if err != nil { - return nil, err - } - - template, ok := obj.(*v1.PodTemplate) - if !ok { - return nil, fmt.Errorf("Job resource template item not define as a PodTemplate") - } - - return &template.Template, nil - -} - -func (qjrPod *QueueJobResPod) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - if job.Spec.AggrResources.Items != nil { - //calculate scaling - for _, ar := range job.Spec.AggrResources.Items { - if ar.Type == arbv1.ResourceTypePod { - template, err := qjrPod.GetPodTemplate(&ar) - if err != nil { - klog.Errorf("Can not parse pod template in item: %+v error: %+v. Aggregated resources set to 0.", ar, err) - } else { - replicas := ar.Replicas - myres := queuejobresources.GetPodResources(template) - - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory - myres.GPU = int64(replicas) * myres.GPU - total = total.Add(myres) - } - } - } - } - return total -} - -func (qjrPod *QueueJobResPod) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - if job.Spec.AggrResources.Items != nil { - //calculate scaling - for _, ar := range job.Spec.AggrResources.Items { - if ar.Priority < priority { - continue - } - - if ar.Type == arbv1.ResourceTypePod { - template, err := qjrPod.GetPodTemplate(&ar) - if err != nil { - klog.Errorf("Cannot parse pod template in item: %+v error: %+v. Aggregated resources set to 0.", ar, err) - } else { - total = total.Add(queuejobresources.GetPodResources(template)) - } - } - } - } - return total -} - -func (qjrPod *QueueJobResPod) createQueueJobPod(qj *arbv1.AppWrapper, ix int32, qjobRes *arbv1.AppWrapperResource) *v1.Pod { - templateCopy, err := qjrPod.GetPodTemplate(qjobRes) - - if err != nil { - klog.Errorf("[createQueueJobPod] Cannot parse PodTemplate in job: %s, namespace: %s, item: %+v error: %+v.", - qj.Name, qj.Namespace, qjobRes, err) - return nil - } - podName := fmt.Sprintf("%s-%d-%s", qj.Name, ix, generateUUID()) - - klog.Infof("Template copy for the pod %+v", templateCopy) - - // Set additional appwrapper label - tmpl := templateCopy.Labels - if tmpl == nil { - tmpl = make(map[string]string) - } - - tmpl[queueJobName] = qj.Name - - // Include pre-defined metadata info, e.g. annotations - templateObjMetadata := templateCopy.ObjectMeta - - // Set new field values - templateObjMetadata.SetName(podName) - templateObjMetadata.SetNamespace(qj.Namespace) - templateObjMetadata.SetOwnerReferences([]metav1.OwnerReference{ - *metav1.NewControllerRef(qj, queueJobKind), - }) - templateObjMetadata.SetLabels(tmpl) - - return &v1.Pod{ - ObjectMeta: templateObjMetadata, - Spec: templateCopy.Spec, - } -} - -// Cleanup : deletes all resources from the queuejob -func (qjrPod *QueueJobResPod) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - return qjrPod.terminatePodsForQueueJob(queuejob) -} - -// AppWrapperCondition returns condition of a AppWrapper condition. +// GeneratePodFailedCondition returns condition of a AppWrapper condition. func GeneratePodFailedCondition(podName string, podCondition []v1.PodCondition) arbv1.PendingPodSpec { return arbv1.PendingPodSpec{ PodName: podName, diff --git a/pkg/controller/queuejobresources/queuejobresource_ref_mananger.go b/pkg/controller/queuejobresources/queuejobresource_ref_mananger.go deleted file mode 100644 index 07783bb56..000000000 --- a/pkg/controller/queuejobresources/queuejobresource_ref_mananger.go +++ /dev/null @@ -1,132 +0,0 @@ -/* -Copyright 2014 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package queuejobresources - -import ( - "fmt" - - qjobv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - "k8s.io/apimachinery/pkg/api/meta" - "k8s.io/apimachinery/pkg/runtime" -) - -//RefManager : Define reference manager commont interface -type RefManager interface { - - //Tag the owner - AddTag(owner *qjobv1.AppWrapperResource, getTag func() string) error - - //check whether ownee is a member of owner - BelongTo(owner *qjobv1.AppWrapperResource, ownee runtime.Object) bool - - //mark the ownee to be a member of owner - AddReference(owner *qjobv1.AppWrapperResource, ownee runtime.Object) error -} - -//RefByLabel : A reference manager by resource vector index -type RefByLabel struct { - qjobResLabel string -} - -//NewLabelRefManager : new ref manager -func NewLabelRefManager() RefManager { - return &RefByLabel{qjobResLabel: "qjobResLabel"} -} - -//AddTag : add tag -func (rm *RefByLabel) AddTag(owner *qjobv1.AppWrapperResource, getTag func() string) error { - - accessor, err := meta.Accessor(owner) - if err != nil { - return fmt.Errorf("Cannot get object meta") - } - - labels := accessor.GetLabels() - if labels == nil { - labels = map[string]string{} - } - labels[rm.qjobResLabel] = getTag() - - accessor.SetLabels(labels) - - return nil -} - -//BelongTo : belong to QJ -func (rm *RefByLabel) BelongTo(owner *qjobv1.AppWrapperResource, ownee runtime.Object) bool { - - accessor, err := meta.Accessor(ownee) - if err != nil { - return false - } - - accessorOwner, err := meta.Accessor(owner) - if err != nil { - return false - } - - labels := accessor.GetLabels() - labelsOwner := accessorOwner.GetLabels() - - return labels != nil && labelsOwner != nil && labels[rm.qjobResLabel] == labelsOwner[rm.qjobResLabel] -} - -//AddReference : add ref -func (rm *RefByLabel) AddReference(owner *qjobv1.AppWrapperResource, ownee runtime.Object) error { - - accessor, err := meta.Accessor(ownee) - if err != nil { - return fmt.Errorf("Cannot get object meta") - } - - accessorOwner, err := meta.Accessor(owner) - if err != nil { - return fmt.Errorf("Cannot get object meta") - } - - labelsOwner := accessorOwner.GetLabels() - - tag, found := labelsOwner[rm.qjobResLabel] - if !found { - return fmt.Errorf("The identification label not found") - } - - labels := accessor.GetLabels() - if labels == nil { - labels = map[string]string{} - } - labels[rm.qjobResLabel] = tag - - accessor.SetLabels(labels) - - return nil - -} diff --git a/pkg/controller/queuejobresources/queuejobresources.go b/pkg/controller/queuejobresources/queuejobresources.go index d81ce651b..7707e0bd3 100644 --- a/pkg/controller/queuejobresources/queuejobresources.go +++ b/pkg/controller/queuejobresources/queuejobresources.go @@ -1,19 +1,4 @@ /* -Copyright 2014 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,20 +13,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package queuejobresources import ( "sync" - qjobv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" "k8s.io/client-go/rest" "k8s.io/klog/v2" + + qjobv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" ) // Factory is a function that returns an Interface for queue job resources. type Factory func(config *rest.Config) Interface -//RegisteredResources : registered resources +// RegisteredResources : registered resources type RegisteredResources struct { lock sync.Mutex registry map[qjobv1.ResourceType]Factory @@ -77,8 +64,8 @@ func (rres *RegisteredResources) Register(t qjobv1.ResourceType, factory Factory } -// InitQueueJobResource creates an instance of the type queue job resource. It returns -//`false` if the type is not known. +// InitQueueJobResource creates an instance of the type queue job resource. +// It returns `false` if the type is not known. func (rres *RegisteredResources) InitQueueJobResource(t qjobv1.ResourceType, config *rest.Config) (Interface, bool, error) { rres.lock.Lock() diff --git a/pkg/controller/queuejobresources/secret/secret.go b/pkg/controller/queuejobresources/secret/secret.go deleted file mode 100644 index 938380b00..000000000 --- a/pkg/controller/queuejobresources/secret/secret.go +++ /dev/null @@ -1,337 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package secret - -import ( - "context" - "fmt" - - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/klog/v2" - - "sync" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/informers" - corev1informer "k8s.io/client-go/informers/core/v1" - "k8s.io/client-go/kubernetes" - corelisters "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" -) - -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") -var queueJobName = "appwrapper.mcad.ibm.com" - -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - -//QueueJobResService contains service info -type QueueJobResSecret struct { - clients *kubernetes.Clientset - arbclients *clientset.Clientset - // A store of services, populated by the serviceController - secretStore corelisters.SecretLister - secretInformer corev1informer.SecretInformer - rtScheme *runtime.Scheme - jsonSerializer *json.Serializer - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager -} - -//Register registers a queue job resource type -func Register(regs *queuejobresources.RegisteredResources) { - regs.Register(arbv1.ResourceTypeSecret, func(config *rest.Config) queuejobresources.Interface { - return NewQueueJobResSecret(config) - }) -} - -//NewQueueJobResService creates a service controller -func NewQueueJobResSecret(config *rest.Config) queuejobresources.Interface { - qjrSecret := &QueueJobResSecret{ - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - } - - qjrSecret.secretInformer = informers.NewSharedInformerFactory(qjrSecret.clients, 0).Core().V1().Secrets() - qjrSecret.secretInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch obj.(type) { - case *v1.Secret: - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: qjrSecret.addSecret, - UpdateFunc: qjrSecret.updateSecret, - DeleteFunc: qjrSecret.deleteSecret, - }, - }) - - qjrSecret.rtScheme = runtime.NewScheme() - v1.AddToScheme(qjrSecret.rtScheme) - - qjrSecret.jsonSerializer = json.NewYAMLSerializer(json.DefaultMetaFactory, qjrSecret.rtScheme, qjrSecret.rtScheme) - - qjrSecret.refManager = queuejobresources.NewLabelRefManager() - - return qjrSecret -} - -// Run the main goroutine responsible for watching and services. -func (qjrSecret *QueueJobResSecret) Run(stopCh <-chan struct{}) { - - qjrSecret.secretInformer.Informer().Run(stopCh) -} - -func (qjrSecret *QueueJobResSecret) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { - return clusterstateapi.EmptyResource() -} - -func (qjrSecret *QueueJobResSecret) addSecret(obj interface{}) { - - return -} - -func (qjrSecret *QueueJobResSecret) updateSecret(old, cur interface{}) { - - return -} - -func (qjrSecret *QueueJobResSecret) deleteSecret(obj interface{}) { - - return -} - -func (qjrSecret *QueueJobResSecret) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - return total -} - -// Parse queue job api object to get Service template -func (qjrSecret *QueueJobResSecret) getSecretTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.Secret, error) { - - secretGVK := schema.GroupVersion{Group: v1.GroupName, Version: "v1"}.WithKind("Secret") - - obj, _, err := qjrSecret.jsonSerializer.Decode(qjobRes.Template.Raw, &secretGVK, nil) - if err != nil { - return nil, err - } - - secret, ok := obj.(*v1.Secret) - if !ok { - return nil, fmt.Errorf("Queuejob resource not defined as a Secret") - } - - return secret, nil - -} - -func (qjrSecret *QueueJobResSecret) createSecretWithControllerRef(namespace string, secret *v1.Secret, controllerRef *metav1.OwnerReference) error { - - if controllerRef != nil { - secret.OwnerReferences = append(secret.OwnerReferences, *controllerRef) - } - - if _, err := qjrSecret.clients.CoreV1().Secrets(namespace).Create(context.Background(), secret, metav1.CreateOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrSecret *QueueJobResSecret) delSecret(namespace string, name string) error { - - klog.V(4).Infof("==========delete secret: %s \n", name) - if err := qjrSecret.clients.CoreV1().Secrets(namespace).Delete(context.Background(), name, metav1.DeleteOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrSecret *QueueJobResSecret) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { - return nil -} - -func (qjrSecret *QueueJobResSecret) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - startTime := time.Now() - - defer func() { - klog.V(4).Infof("Finished syncing queue job resource %s (%v)", queuejob.Name, time.Now().Sub(startTime)) - }() - - _namespace, secretInQjr, secretsInEtcd, err := qjrSecret.getSecretForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - secretLen := len(secretsInEtcd) - replicas := qjobRes.Replicas - - diff := int(replicas) - int(secretLen) - - klog.V(4).Infof("QJob: %s had %d Secrets and %d desired Secrets", queuejob.Name, secretLen, replicas) - - if diff > 0 { - //TODO: need set reference after Service has been really added - tmpSecret := v1.Secret{} - err = qjrSecret.refManager.AddReference(qjobRes, &tmpSecret) - if err != nil { - klog.Errorf("Cannot add reference to configmap resource %+v", err) - return err - } - - if secretInQjr.Labels == nil { - secretInQjr.Labels = map[string]string{} - } - for k, v := range tmpSecret.Labels { - secretInQjr.Labels[k] = v - } - secretInQjr.Labels[queueJobName] = queuejob.Name - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := 0; i < diff; i++ { - go func() { - defer wait.Done() - - err := qjrSecret.createSecretWithControllerRef(*_namespace, secretInQjr, metav1.NewControllerRef(queuejob, queueJobKind)) - - if err != nil && errors.IsTimeout(err) { - return - } - if err != nil { - defer utilruntime.HandleError(err) - } - }() - } - wait.Wait() - } - - return nil -} - -func (qjrSecret *QueueJobResSecret) getSecretForQueueJobRes(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) (*string, *v1.Secret, []*v1.Secret, error) { - - // Get "a" Secret from AppWrapper Resource - secretInQjr, err := qjrSecret.getSecretTemplate(qjobRes) - if err != nil { - klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) - return nil, nil, nil, err - } - - // Get Secret"s" in Etcd Server - var _namespace *string - if secretInQjr.Namespace != "" { - _namespace = &secretInQjr.Namespace - } else { - _namespace = &queuejob.Namespace - } - secretList, err := qjrSecret.clients.CoreV1().Secrets(*_namespace).List(context.Background(), metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", queueJobName, queuejob.Name)}) - if err != nil { - return nil, nil, nil, err - } - secretsInEtcd := []*v1.Secret{} - for i, _ := range secretList.Items { - secretsInEtcd = append(secretsInEtcd, &secretList.Items[i]) - } - - // for i, secret := range secretList.Items { - // metaSecret, err := meta.Accessor(&secret) - // if err != nil { - // return nil, nil, nil, err - // } - // controllerRef := metav1.GetControllerOf(metaSecret) - // if controllerRef != nil { - // if controllerRef.UID == queuejob.UID { - // secretsInEtcd = append(secretsInEtcd, &secretList.Items[i]) - // } - // } - // } - mySecretsInEtcd := []*v1.Secret{} - for i, secret := range secretsInEtcd { - if qjrSecret.refManager.BelongTo(qjobRes, secret) { - mySecretsInEtcd = append(mySecretsInEtcd, secretsInEtcd[i]) - } - } - - return _namespace, secretInQjr, mySecretsInEtcd, nil -} - -func (qjrSecret *QueueJobResSecret) deleteQueueJobResSecrets(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - _namespace, _, activeSecrets, err := qjrSecret.getSecretForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - active := int32(len(activeSecrets)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrSecret.delSecret(*_namespace, activeSecrets[ix].Name); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, queue job %q/%q deadline exceeded", activeSecrets[ix].Name, *_namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -//Cleanup deletes all services -func (qjrSecret *QueueJobResSecret) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - return qjrSecret.deleteQueueJobResSecrets(qjobRes, queuejob) -} diff --git a/pkg/controller/queuejobresources/service/service.go b/pkg/controller/queuejobresources/service/service.go deleted file mode 100644 index 9592ecd0e..000000000 --- a/pkg/controller/queuejobresources/service/service.go +++ /dev/null @@ -1,337 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package service - -import ( - "context" - "fmt" - - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/klog/v2" - - "sync" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/informers" - corev1informer "k8s.io/client-go/informers/core/v1" - "k8s.io/client-go/kubernetes" - corelisters "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" -) - -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") -var queueJobName = "appwrapper.mcad.ibm.com" - -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - -//QueueJobResService contains service info -type QueueJobResService struct { - clients *kubernetes.Clientset - arbclients *clientset.Clientset - // A store of services, populated by the serviceController - serviceStore corelisters.ServiceLister - serviceInformer corev1informer.ServiceInformer - rtScheme *runtime.Scheme - jsonSerializer *json.Serializer - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager -} - -//Register registers a queue job resource type -func Register(regs *queuejobresources.RegisteredResources) { - regs.Register(arbv1.ResourceTypeService, func(config *rest.Config) queuejobresources.Interface { - return NewQueueJobResService(config) - }) -} - -//NewQueueJobResService creates a service controller -func NewQueueJobResService(config *rest.Config) queuejobresources.Interface { - qjrService := &QueueJobResService{ - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - } - - qjrService.serviceInformer = informers.NewSharedInformerFactory(qjrService.clients, 0).Core().V1().Services() - qjrService.serviceInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch obj.(type) { - case *v1.Service: - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: qjrService.addService, - UpdateFunc: qjrService.updateService, - DeleteFunc: qjrService.deleteService, - }, - }) - - qjrService.rtScheme = runtime.NewScheme() - v1.AddToScheme(qjrService.rtScheme) - - qjrService.jsonSerializer = json.NewYAMLSerializer(json.DefaultMetaFactory, qjrService.rtScheme, qjrService.rtScheme) - - qjrService.refManager = queuejobresources.NewLabelRefManager() - - return qjrService -} - -// Run the main goroutine responsible for watching and services. -func (qjrService *QueueJobResService) Run(stopCh <-chan struct{}) { - - qjrService.serviceInformer.Informer().Run(stopCh) -} - -func (qjrService *QueueJobResService) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { - return clusterstateapi.EmptyResource() -} - -func (qjrService *QueueJobResService) addService(obj interface{}) { - - return -} - -func (qjrService *QueueJobResService) updateService(old, cur interface{}) { - - return -} - -func (qjrService *QueueJobResService) deleteService(obj interface{}) { - - return -} - -func (qjrService *QueueJobResService) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - return total -} - -// Parse queue job api object to get Service template -func (qjrService *QueueJobResService) getServiceTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.Service, error) { - - serviceGVK := schema.GroupVersion{Group: v1.GroupName, Version: "v1"}.WithKind("Service") - - obj, _, err := qjrService.jsonSerializer.Decode(qjobRes.Template.Raw, &serviceGVK, nil) - if err != nil { - return nil, err - } - - service, ok := obj.(*v1.Service) - if !ok { - return nil, fmt.Errorf("Queuejob resource not defined as a Service") - } - - return service, nil - -} - -func (qjrService *QueueJobResService) createServiceWithControllerRef(namespace string, service *v1.Service, controllerRef *metav1.OwnerReference) error { - - if controllerRef != nil { - service.OwnerReferences = append(service.OwnerReferences, *controllerRef) - } - - if _, err := qjrService.clients.CoreV1().Services(namespace).Create(context.Background(), service, metav1.CreateOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrService *QueueJobResService) delService(namespace string, name string) error { - - klog.V(4).Infof("==========delete service: %s, %s \n", namespace, name) - if err := qjrService.clients.CoreV1().Services(namespace).Delete(context.Background(), name, metav1.DeleteOptions{}); err != nil { - return err - } - - return nil -} - -func (qjrService *QueueJobResService) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { - return nil -} - -func (qjrService *QueueJobResService) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - startTime := time.Now() - - defer func() { - klog.V(4).Infof("Finished syncing queue job resource %s (%v)", queuejob.Name, time.Now().Sub(startTime)) - }() - - _namespace, serviceInQjr, servicesInEtcd, err := qjrService.getServiceForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - serviceLen := len(servicesInEtcd) - replicas := qjobRes.Replicas - - diff := int(replicas) - int(serviceLen) - - klog.V(4).Infof("QJob: %s had %d Services and %d desired Services", queuejob.Name, serviceLen, replicas) - - if diff > 0 { - //TODO: need set reference after Service has been really added - tmpService := v1.Service{} - err = qjrService.refManager.AddReference(qjobRes, &tmpService) - if err != nil { - klog.Errorf("Cannot add reference to configmap resource %+v", err) - return err - } - - if serviceInQjr.Labels == nil { - serviceInQjr.Labels = map[string]string{} - } - for k, v := range tmpService.Labels { - serviceInQjr.Labels[k] = v - } - serviceInQjr.Labels[queueJobName] = queuejob.Name - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := 0; i < diff; i++ { - go func() { - defer wait.Done() - - err := qjrService.createServiceWithControllerRef(*_namespace, serviceInQjr, metav1.NewControllerRef(queuejob, queueJobKind)) - - if err != nil && errors.IsTimeout(err) { - return - } - if err != nil { - defer utilruntime.HandleError(err) - } - }() - } - wait.Wait() - } - - return nil -} - -func (qjrService *QueueJobResService) getServiceForQueueJobRes(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) (*string, *v1.Service, []*v1.Service, error) { - - // Get "a" Service from AppWrapper Resource - serviceInQjr, err := qjrService.getServiceTemplate(qjobRes) - if err != nil { - klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) - return nil, nil, nil, err - } - - // Get Service"s" in Etcd Server - var _namespace *string - if serviceInQjr.Namespace != "" { - _namespace = &serviceInQjr.Namespace - } else { - _namespace = &queuejob.Namespace - } - serviceList, err := qjrService.clients.CoreV1().Services(*_namespace).List(context.Background(), metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", queueJobName, queuejob.Name)}) - if err != nil { - return nil, nil, nil, err - } - servicesInEtcd := []*v1.Service{} - for i, _ := range serviceList.Items { - servicesInEtcd = append(servicesInEtcd, &serviceList.Items[i]) - } - - // for i, service := range serviceList.Items { - // metaService, err := meta.Accessor(&service) - // if err != nil { - // return nil, nil, nil, err - // } - // controllerRef := metav1.GetControllerOf(metaService) - // if controllerRef != nil { - // if controllerRef.UID == queuejob.UID { - // servicesInEtcd = append(servicesInEtcd, &serviceList.Items[i]) - // } - // } - // } - myServicesInEtcd := []*v1.Service{} - for i, service := range servicesInEtcd { - if qjrService.refManager.BelongTo(qjobRes, service) { - myServicesInEtcd = append(myServicesInEtcd, servicesInEtcd[i]) - } - } - - return _namespace, serviceInQjr, myServicesInEtcd, nil -} - -func (qjrService *QueueJobResService) deleteQueueJobResServices(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - _namespace, _, activeServices, err := qjrService.getServiceForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - active := int32(len(activeServices)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrService.delService(*_namespace, activeServices[ix].Name); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, queue job %q/%q deadline exceeded", activeServices[ix].Name, *_namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -//Cleanup deletes all services -func (qjrService *QueueJobResService) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - return qjrService.deleteQueueJobResServices(qjobRes, queuejob) -} diff --git a/pkg/controller/queuejobresources/statefulset/statefulset.go b/pkg/controller/queuejobresources/statefulset/statefulset.go deleted file mode 100644 index a47c604a9..000000000 --- a/pkg/controller/queuejobresources/statefulset/statefulset.go +++ /dev/null @@ -1,371 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* -Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package statefulset - -import ( - "context" - "fmt" - - arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - - "sync" - "time" - - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" - apps "k8s.io/api/apps/v1" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/runtime/serializer/json" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/informers" - "k8s.io/client-go/kubernetes" - "k8s.io/klog/v2" - - ssinformer "k8s.io/client-go/informers/apps/v1" - sslister "k8s.io/client-go/listers/apps/v1" - - clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" -) - -var queueJobKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper") -var queueJobName = "appwrapper.mcad.ibm.com" - -const ( - // QueueJobNameLabel label string for queuejob name - QueueJobNameLabel string = "appwrapper-name" - - // ControllerUIDLabel label string for queuejob controller uid - ControllerUIDLabel string = "controller-uid" -) - -//QueueJobResStatefulSet - stateful sets -type QueueJobResStatefulSet struct { - clients *kubernetes.Clientset - arbclients *clientset.Clientset - // A store of services, populated by the serviceController - statefulSetStore sslister.StatefulSetLister - deployInformer ssinformer.StatefulSetInformer - rtScheme *runtime.Scheme - jsonSerializer *json.Serializer - // Reference manager to manage membership of queuejob resource and its members - refManager queuejobresources.RefManager -} - -// Register registers a queue job resource type -func Register(regs *queuejobresources.RegisteredResources) { - regs.Register(arbv1.ResourceTypeStatefulSet, func(config *rest.Config) queuejobresources.Interface { - return NewQueueJobResStatefulSet(config) - }) -} - -//NewQueueJobResStatefulSet - creates a controller for SS -func NewQueueJobResStatefulSet(config *rest.Config) queuejobresources.Interface { - qjrd := &QueueJobResStatefulSet{ - clients: kubernetes.NewForConfigOrDie(config), - arbclients: clientset.NewForConfigOrDie(config), - } - - qjrd.deployInformer = informers.NewSharedInformerFactory(qjrd.clients, 0).Apps().V1().StatefulSets() - qjrd.deployInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch obj.(type) { - case *apps.StatefulSet: - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: qjrd.addStatefulSet, - UpdateFunc: qjrd.updateStatefulSet, - DeleteFunc: qjrd.deleteStatefulSet, - }, - }) - - qjrd.rtScheme = runtime.NewScheme() - v1.AddToScheme(qjrd.rtScheme) - apps.AddToScheme(qjrd.rtScheme) - qjrd.jsonSerializer = json.NewYAMLSerializer(json.DefaultMetaFactory, qjrd.rtScheme, qjrd.rtScheme) - - qjrd.refManager = queuejobresources.NewLabelRefManager() - - return qjrd -} - -// Run the main goroutine responsible for watching and services. -func (qjrStatefulSet *QueueJobResStatefulSet) Run(stopCh <-chan struct{}) { - qjrStatefulSet.deployInformer.Informer().Run(stopCh) -} - -//GetPodTemplate Parse queue job api object to get Pod template -func (qjrStatefulSet *QueueJobResStatefulSet) GetPodTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.PodTemplateSpec, int32, error) { - res, err := qjrStatefulSet.getStatefulSetTemplate(qjobRes) - if err != nil { - return nil, -1, err - } - return &res.Spec.Template, *res.Spec.Replicas, nil -} - -func (qjrStatefulSet *QueueJobResStatefulSet) GetAggregatedResources(queueJob *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - if queueJob.Spec.AggrResources.Items != nil { - //calculate scaling - for _, ar := range queueJob.Spec.AggrResources.Items { - if ar.Type == arbv1.ResourceTypeStatefulSet { - podTemplate, replicas, _ := qjrStatefulSet.GetPodTemplate(&ar) - myres := queuejobresources.GetPodResources(podTemplate) - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory - myres.GPU = int64(replicas) * myres.GPU - total = total.Add(myres) - } - } - } - return total -} - -func (qjrStatefulSet *QueueJobResStatefulSet) GetAggregatedResourcesByPriority(priority float64, queueJob *arbv1.AppWrapper) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - if queueJob.Spec.AggrResources.Items != nil { - //calculate scaling - for _, ar := range queueJob.Spec.AggrResources.Items { - if ar.Priority < priority { - continue - } - if ar.Type == arbv1.ResourceTypeStatefulSet { - podTemplate, replicas, _ := qjrStatefulSet.GetPodTemplate(&ar) - myres := queuejobresources.GetPodResources(podTemplate) - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory - myres.GPU = int64(replicas) * myres.GPU - total = total.Add(myres) - } - } - } - return total -} - -func (qjrStatefulSet *QueueJobResStatefulSet) addStatefulSet(obj interface{}) { - - return -} - -func (qjrStatefulSet *QueueJobResStatefulSet) updateStatefulSet(old, cur interface{}) { - - return -} - -func (qjrStatefulSet *QueueJobResStatefulSet) deleteStatefulSet(obj interface{}) { - - return -} - -func (qjrStatefulSet *QueueJobResStatefulSet) getStatefulSetTemplate(qjobRes *arbv1.AppWrapperResource) (*apps.StatefulSet, error) { - statefulSetGVK := schema.GroupVersion{Group: "", Version: "v1"}.WithKind("StatefulSet") - obj, _, err := qjrStatefulSet.jsonSerializer.Decode(qjobRes.Template.Raw, &statefulSetGVK, nil) - if err != nil { - return nil, err - } - statefulSet, ok := obj.(*apps.StatefulSet) - if !ok { - return nil, fmt.Errorf("Queuejob resource not defined as a StatefulSet") - } - return statefulSet, nil -} - -func (qjrStatefulSet *QueueJobResStatefulSet) createStatefulSetWithControllerRef(namespace string, statefulSet *apps.StatefulSet, controllerRef *metav1.OwnerReference) error { - klog.V(4).Infof("==========create statefulSet: %s, %+v \n", namespace, statefulSet) - if controllerRef != nil { - statefulSet.OwnerReferences = append(statefulSet.OwnerReferences, *controllerRef) - } - if _, err := qjrStatefulSet.clients.AppsV1().StatefulSets(namespace).Create(context.Background(), statefulSet, metav1.CreateOptions{}); err != nil { - return err - } - return nil -} - -func (qjrStatefulSet *QueueJobResStatefulSet) delStatefulSet(namespace string, name string) error { - - klog.V(4).Infof("==========delete statefulSet: %s, %s \n", namespace, name) - if err := qjrStatefulSet.clients.AppsV1().StatefulSets(namespace).Delete(context.Background(), name, metav1.DeleteOptions{}); err != nil { - return err - } - return nil -} - -func (qjrStatefulSet *QueueJobResStatefulSet) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error { - return nil -} - -func (qjrStatefulSet *QueueJobResStatefulSet) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - - startTime := time.Now() - - defer func() { - klog.V(4).Infof("Finished syncing queue job resource %s (%v)", queuejob.Name, time.Now().Sub(startTime)) - }() - - _namespace, statefulSetInQjr, statefulSetsInEtcd, err := qjrStatefulSet.getStatefulSetForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - statefulSetLen := len(statefulSetsInEtcd) - replicas := qjobRes.Replicas - - diff := int(replicas) - int(statefulSetLen) - - klog.V(4).Infof("QJob: %s had %d StatefulSets and %d desired StatefulSets", queuejob.Name, statefulSetLen, replicas) - - if diff > 0 { - //TODO: need set reference after Service has been really added - tmpStatefulSet := apps.StatefulSet{} - err = qjrStatefulSet.refManager.AddReference(qjobRes, &tmpStatefulSet) - if err != nil { - klog.Errorf("Cannot add reference to configmap resource %+v", err) - return err - } - if statefulSetInQjr.Labels == nil { - statefulSetInQjr.Labels = map[string]string{} - } - for k, v := range tmpStatefulSet.Labels { - statefulSetInQjr.Labels[k] = v - } - statefulSetInQjr.Labels[queueJobName] = queuejob.Name - if statefulSetInQjr.Spec.Template.Labels == nil { - statefulSetInQjr.Labels = map[string]string{} - } - statefulSetInQjr.Spec.Template.Labels[queueJobName] = queuejob.Name - - wait := sync.WaitGroup{} - wait.Add(int(diff)) - for i := 0; i < diff; i++ { - go func() { - defer wait.Done() - - err := qjrStatefulSet.createStatefulSetWithControllerRef(*_namespace, statefulSetInQjr, metav1.NewControllerRef(queuejob, queueJobKind)) - - if err != nil && errors.IsTimeout(err) { - return - } - if err != nil { - defer utilruntime.HandleError(err) - } - }() - } - wait.Wait() - } - - return nil -} - -func (qjrStatefulSet *QueueJobResStatefulSet) getStatefulSetForQueueJobRes(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) (*string, *apps.StatefulSet, []*apps.StatefulSet, error) { - - // Get "a" StatefulSet from AppWrapper Resource - statefulSetInQjr, err := qjrStatefulSet.getStatefulSetTemplate(qjobRes) - if err != nil { - klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) - return nil, nil, nil, err - } - - // Get StatefulSet"s" in Etcd Server - var _namespace *string - if statefulSetInQjr.Namespace != "" { - _namespace = &statefulSetInQjr.Namespace - } else { - _namespace = &queuejob.Namespace - } - statefulSetList, err := qjrStatefulSet.clients.AppsV1().StatefulSets(*_namespace).List(context.Background(), metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", queueJobName, queuejob.Name)}) - if err != nil { - return nil, nil, nil, err - } - statefulSetsInEtcd := []*apps.StatefulSet{} - // for i, statefulSet := range statefulSetList.Items { - // metaStatefulSet, err := meta.Accessor(&statefulSet) - // if err != nil { - // return nil, nil, nil, err - // } - // controllerRef := metav1.GetControllerOf(metaStatefulSet) - // if controllerRef != nil { - // if controllerRef.UID == queuejob.UID { - // statefulSetsInEtcd = append(statefulSetsInEtcd, &statefulSetList.Items[i]) - // } - // } - // } - for i, _ := range statefulSetList.Items { - statefulSetsInEtcd = append(statefulSetsInEtcd, &statefulSetList.Items[i]) - } - - myStatefulSetsInEtcd := []*apps.StatefulSet{} - for i, statefulSet := range statefulSetsInEtcd { - if qjrStatefulSet.refManager.BelongTo(qjobRes, statefulSet) { - myStatefulSetsInEtcd = append(myStatefulSetsInEtcd, statefulSetsInEtcd[i]) - } - } - - return _namespace, statefulSetInQjr, myStatefulSetsInEtcd, nil -} - -func (qjrStatefulSet *QueueJobResStatefulSet) deleteQueueJobResStatefulSets(qjobRes *arbv1.AppWrapperResource, queuejob *arbv1.AppWrapper) error { - - job := *queuejob - - _namespace, _, activeStatefulSets, err := qjrStatefulSet.getStatefulSetForQueueJobRes(qjobRes, queuejob) - if err != nil { - return err - } - - active := int32(len(activeStatefulSets)) - - wait := sync.WaitGroup{} - wait.Add(int(active)) - for i := int32(0); i < active; i++ { - go func(ix int32) { - defer wait.Done() - if err := qjrStatefulSet.delStatefulSet(*_namespace, activeStatefulSets[ix].Name); err != nil { - defer utilruntime.HandleError(err) - klog.V(2).Infof("Failed to delete %v, queue job %q/%q deadline exceeded", activeStatefulSets[ix].Name, *_namespace, job.Name) - } - }(i) - } - wait.Wait() - - return nil -} - -//Cleanup deletes all services -func (qjrStatefulSet *QueueJobResStatefulSet) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { - return qjrStatefulSet.deleteQueueJobResStatefulSets(qjobRes, queuejob) -} diff --git a/pkg/controller/queuejobresources/utils.go b/pkg/controller/queuejobresources/utils.go index 7653388b0..5888fac27 100644 --- a/pkg/controller/queuejobresources/utils.go +++ b/pkg/controller/queuejobresources/utils.go @@ -1,19 +1,4 @@ /* -Copyright 2019 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package queuejobresources import ( @@ -35,7 +21,6 @@ import ( clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" v1 "k8s.io/api/core/v1" - "k8s.io/klog/v2" ) // filterPods returns pods based on their phase. @@ -49,7 +34,7 @@ func FilterPods(pods []*v1.Pod, phase v1.PodPhase) int { return result } -//check if pods pending are failed scheduling +// PendingPodsFailedSchd checks if pods pending have failed scheduling func PendingPodsFailedSchd(pods []*v1.Pod) map[string][]v1.PodCondition { var podCondition = make(map[string][]v1.PodCondition) for i := range pods { @@ -59,14 +44,14 @@ func PendingPodsFailedSchd(pods []*v1.Pod) map[string][]v1.PodCondition { // this exists until coscheduler performance issue is resolved. if cond.Type == v1.PodScheduled && cond.Status == v1.ConditionFalse && cond.Reason == v1.PodReasonUnschedulable { if strings.Contains(cond.Message, "pgName") && strings.Contains(cond.Message, "last") && strings.Contains(cond.Message, "failed") && strings.Contains(cond.Message, "deny") { - //ignore co-scheduled pending pods for coscheduler version:0.22.6 + // ignore co-scheduled pending pods for coscheduler version:0.22.6 continue } else if strings.Contains(cond.Message, "optimistic") && strings.Contains(cond.Message, "rejection") && strings.Contains(cond.Message, "PostFilter") || strings.Contains(cond.Message, "cannot") && strings.Contains(cond.Message, "find") && strings.Contains(cond.Message, "enough") && strings.Contains(cond.Message, "sibling") { - //ignore co-scheduled pending pods for coscheduler version:0.23.10 + // ignore co-scheduled pending pods for coscheduler version:0.23.10 continue } else { - podName := string(pods[i].Name) + podName := pods[i].Name podCondition[podName] = append(podCondition[podName], *cond.DeepCopy()) } } @@ -76,7 +61,7 @@ func PendingPodsFailedSchd(pods []*v1.Pod) map[string][]v1.PodCondition { return podCondition } -// filterPods returns pods based on their phase. +// GetPodResourcesByPhase returns pods based on their phase. func GetPodResourcesByPhase(phase v1.PodPhase, pods []*v1.Pod) *clusterstateapi.Resource { req := clusterstateapi.EmptyResource() for i := range pods { @@ -88,31 +73,3 @@ func GetPodResourcesByPhase(phase v1.PodPhase, pods []*v1.Pod) *clusterstateapi. } return req } - -func GetPodResources(template *v1.PodTemplateSpec) *clusterstateapi.Resource { - total := clusterstateapi.EmptyResource() - req := clusterstateapi.EmptyResource() - limit := clusterstateapi.EmptyResource() - spec := template.Spec - - if &spec == nil { - klog.Errorf("Pod Spec not found in Pod Template: %+v. Aggregated resources set to 0.", template) - return total - } - - for _, c := range template.Spec.Containers { - req.Add(clusterstateapi.NewResource(c.Resources.Requests)) - limit.Add(clusterstateapi.NewResource(c.Resources.Limits)) - } - if req.MilliCPU < limit.MilliCPU { - req.MilliCPU = limit.MilliCPU - } - if req.Memory < limit.Memory { - req.Memory = limit.Memory - } - if req.GPU < limit.GPU { - req.GPU = limit.GPU - } - total = total.Add(req) - return total -} diff --git a/test/e2e/util.go b/test/e2e/util.go index f22e8c68e..62ace9a18 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -32,7 +32,6 @@ package e2e import ( gcontext "context" - "encoding/json" "fmt" "math/rand" "os" @@ -41,27 +40,19 @@ import ( "strings" "time" - "k8s.io/apimachinery/pkg/runtime" - . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" - appv1 "k8s.io/api/apps/v1" - batchv1 "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/strategicpatch" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" versioned "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" - csapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" ) var ninetySeconds = 90 * time.Second @@ -73,11 +64,6 @@ var oneCPU = v1.ResourceList{"cpu": resource.MustParse("1000m")} var twoCPU = v1.ResourceList{"cpu": resource.MustParse("2000m")} var threeCPU = v1.ResourceList{"cpu": resource.MustParse("3000m")} -const ( - workerPriority = "worker-pri" - masterPriority = "master-pri" -) - func homeDir() string { if h := os.Getenv("HOME"); h != "" { return h @@ -117,7 +103,7 @@ func initTestContext() *context { Name: cxt.namespace, }, }, metav1.CreateOptions{}) - //Expect(err).NotTo(HaveOccurred()) + // Expect(err).NotTo(HaveOccurred()) /* _, err = cxt.kubeclient.SchedulingV1beta1().PriorityClasses().Create(gcontext.Background(), &schedv1.PriorityClass{ ObjectMeta: metav1.ObjectMeta{ @@ -140,18 +126,8 @@ func initTestContext() *context { return cxt } -func namespaceNotExist(ctx *context) wait.ConditionFunc { - return func() (bool, error) { - _, err := ctx.kubeclient.CoreV1().Namespaces().Get(gcontext.Background(), ctx.namespace, metav1.GetOptions{}) - if !(err != nil && errors.IsNotFound(err)) { - return false, err - } - return true, nil - } -} - func cleanupTestContextExtendedTime(cxt *context, seconds time.Duration) { - //foreground := metav1.DeletePropagationForeground + // foreground := metav1.DeletePropagationForeground /* err := cxt.kubeclient.CoreV1().Namespaces().Delete(gcontext.Background(), cxt.namespace, metav1.DeleteOptions{ PropagationPolicy: &foreground, }) @@ -172,7 +148,7 @@ func cleanupTestContextExtendedTime(cxt *context, seconds time.Duration) { // if err != nil { // fmt.Fprintf(GinkgoWriter, "[cleanupTestContextExtendedTime] Failure check for namespace: %s.\n", cxt.namespace) // } - //Expect(err).NotTo(HaveOccurred()) + // Expect(err).NotTo(HaveOccurred()) } func cleanupTestContext(cxt *context) { @@ -295,108 +271,6 @@ func createGenericAWTimeoutWithStatus(context *context, name string) *arbv1.AppW return appwrapper } -func createJobEx(context *context, job *jobSpec) ([]*batchv1.Job, *arbv1.AppWrapper) { - var jobs []*batchv1.Job - var appwrapper *arbv1.AppWrapper - var min int32 - - ns := getNS(context, job) - - for i, task := range job.tasks { - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%d", job.name, i), - Namespace: ns, - }, - Spec: batchv1.JobSpec{ - Parallelism: &task.rep, - Completions: &task.rep, - Template: v1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: task.labels, - Annotations: map[string]string{arbv1.AppWrapperAnnotationKey: job.name}, - }, - Spec: v1.PodSpec{ - SchedulerName: "default", - RestartPolicy: v1.RestartPolicyNever, - Containers: createContainers(task.img, task.req, task.hostport), - Affinity: task.affinity, - }, - }, - }, - } - - job, err := context.kubeclient.BatchV1().Jobs(job.Namespace).Create(gcontext.Background(), job, metav1.CreateOptions{}) - Expect(err).NotTo(HaveOccurred()) - jobs = append(jobs, job) - - min = min + task.min - } - - rb := []byte(`{"kind": "Pod", "apiVersion": "v1", "metadata": { "name": "foo"}}`) - - var schedSpecMin int = 1 - - aw := &arbv1.AppWrapper{ - ObjectMeta: metav1.ObjectMeta{ - Name: job.name, - Namespace: ns, - }, - Spec: arbv1.AppWrapperSpec{ - SchedSpec: arbv1.SchedulingSpecTemplate{ - MinAvailable: schedSpecMin, - }, - AggrResources: arbv1.AppWrapperResourceList{ - GenericItems: []arbv1.AppWrapperGenericResource{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", job.name, "resource1"), - Namespace: ns, - }, - DesiredAvailable: 1, - GenericTemplate: runtime.RawExtension{ - Raw: rb, - }, - }, - }, - }, - }, - } - - appwrapper, err := context.karclient.ArbV1().AppWrappers(ns).Create(aw) - Expect(err).NotTo(HaveOccurred()) - - return jobs, appwrapper -} - -/* -func taskPhase(ctx *context, pg *arbv1.PodGroup, phase []v1.PodPhase, taskNum int) wait.ConditionFunc { - return func() (bool, error) { - pg, err := ctx.karclient.Scheduling().PodGroups(pg.Namespace).Get(pg.Name, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - - pods, err := ctx.kubeclient.CoreV1().Pods(pg.Namespace).List(metav1.ListOptions{}) - Expect(err).NotTo(HaveOccurred()) - - readyTaskNum := 0 - for _, pod := range pods.Items { - if gn, found := pod.Annotations[arbv1.GroupNameAnnotationKey]; !found || gn != pg.Name { - continue - } - - for _, p := range phase { - if pod.Status.Phase == p { - readyTaskNum++ - break - } - } - } - - return taskNum <= readyTaskNum, nil - } -} -*/ - func anyPodsExist(ctx *context, awNamespace string, awName string) wait.ConditionFunc { return func() (bool, error) { podList, err := ctx.kubeclient.CoreV1().Pods(awNamespace).List(gcontext.Background(), metav1.ListOptions{}) @@ -407,8 +281,8 @@ func anyPodsExist(ctx *context, awNamespace string, awName string) wait.Conditio // First find a pod from the list that is part of the AW if awn, found := podFromPodList.Labels["appwrapper.mcad.ibm.com"]; !found || awn != awName { - //DEBUG fmt.Fprintf(GinkgoWriter, "[anyPodsExist] Pod %s in phase: %s not part of AppWrapper: %s, labels: %#v\n", - //DEBUG podFromPodList.Name, podFromPodList.Status.Phase, awName, podFromPodList.Labels) + // DEBUG fmt.Fprintf(GinkgoWriter, "[anyPodsExist] Pod %s in phase: %s not part of AppWrapper: %s, labels: %#v\n", + // DEBUG podFromPodList.Name, podFromPodList.Status.Phase, awName, podFromPodList.Labels) continue } podExistsNum++ @@ -471,26 +345,6 @@ func podPhase(ctx *context, awNamespace string, awName string, pods []*v1.Pod, p } } -func awStatePhase(ctx *context, aw *arbv1.AppWrapper, phase []arbv1.AppWrapperState, taskNum int, quite bool) wait.ConditionFunc { - return func() (bool, error) { - aw, err := ctx.karclient.ArbV1().AppWrappers(aw.Namespace).Get(aw.Name, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - - phaseCount := 0 - if !quite { - fmt.Fprintf(GinkgoWriter, "[awStatePhase] AW %s found with state: %s.\n", aw.Name, aw.Status.State) - } - - for _, p := range phase { - if aw.Status.State == p { - phaseCount++ - break - } - } - return 1 <= phaseCount, nil - } -} - func cleanupTestObjectsPtr(context *context, appwrappersPtr *[]*arbv1.AppWrapper) { cleanupTestObjectsPtrVerbose(context, appwrappersPtr, true) } @@ -514,7 +368,7 @@ func cleanupTestObjectsVerbose(context *context, appwrappers []*arbv1.AppWrapper } for _, aw := range appwrappers { - //context.karclient.ArbV1().AppWrappers(context.namespace).Delete(aw.Name, &metav1.DeleteOptions{PropagationPolicy: &foreground}) + // context.karclient.ArbV1().AppWrappers(context.namespace).Delete(aw.Name, &metav1.DeleteOptions{PropagationPolicy: &foreground}) pods := getPodsOfAppWrapper(context, aw) awNamespace := aw.Namespace @@ -575,9 +429,9 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase []v1.PodPhase, taskNum for _, p := range phase { if pod.Status.Phase == p { - //DEBUGif quite { - //DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Found pod %s of AppWrapper: %s, phase: %v\n", pod.Name, aw.Name, p) - //DEBUG} + // DEBUGif quite { + // DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Found pod %s of AppWrapper: %s, phase: %v\n", pod.Name, aw.Name, p) + // DEBUG} readyTaskNum++ break } else { @@ -603,62 +457,14 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase []v1.PodPhase, taskNum } } - //DEBUGif taskNum <= readyTaskNum && quite { - //DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Successfully found %v podList of AppWrapper: %s, state: %s\n", readyTaskNum, aw.Name, aw.Status.State) - //DEBUG} + // DEBUGif taskNum <= readyTaskNum && quite { + // DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Successfully found %v podList of AppWrapper: %s, state: %s\n", readyTaskNum, aw.Name, aw.Status.State) + // DEBUG} return taskNum <= readyTaskNum, nil } } -/* -func podGroupUnschedulable(ctx *context, pg *arbv1.PodGroup, time time.Time) wait.ConditionFunc { - return func() (bool, error) { - pg, err := ctx.karclient.Scheduling().PodGroups(pg.Namespace).Get(pg.Name, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - - events, err := ctx.kubeclient.CoreV1().Events(pg.Namespace).List(metav1.ListOptions{}) - Expect(err).NotTo(HaveOccurred()) - - for _, event := range events.Items { - target := event.InvolvedObject - if target.Name == pg.Name && target.Namespace == pg.Namespace { - if event.Reason == string(arbv1.UnschedulableEvent) && event.LastTimestamp.After(time) { - return true, nil - } - } - } - - return false, nil - } -} -*/ -/* -func waitPodGroupReady(ctx *context, pg *arbv1.PodGroup) error { - return waitTasksReadyEx(ctx, pg, int(pg.Spec.MinMember)) -} - -func waitPodGroupPending(ctx *context, pg *arbv1.PodGroup) error { - return wait.Poll(100*time.Millisecond, ninetySeconds, taskPhase(ctx, pg, - []v1.PodPhase{v1.PodPending}, int(pg.Spec.MinMember))) -} - -func waitTasksReadyEx(ctx *context, pg *arbv1.PodGroup, taskNum int) error { - return wait.Poll(100*time.Millisecond, ninetySeconds, taskPhase(ctx, pg, - []v1.PodPhase{v1.PodRunning, v1.PodSucceeded}, taskNum)) -} - -func waitTasksPendingEx(ctx *context, pg *arbv1.PodGroup, taskNum int) error { - return wait.Poll(100*time.Millisecond, ninetySeconds, taskPhase(ctx, pg, - []v1.PodPhase{v1.PodPending}, taskNum)) -} - -func waitPodGroupUnschedulable(ctx *context, pg *arbv1.PodGroup) error { - now := time.Now() - return wait.Poll(10*time.Second, ninetySeconds, podGroupUnschedulable(ctx, pg, now)) -} -*/ - func waitAWNonComputeResourceActive(ctx *context, aw *arbv1.AppWrapper) error { return waitAWNamespaceActive(ctx, aw) } @@ -768,69 +574,6 @@ func waitAWPodsTerminatedExVerbose(ctx *context, namespace string, name string, []v1.PodPhase{v1.PodRunning, v1.PodSucceeded, v1.PodUnknown, v1.PodFailed, v1.PodPending}, taskNum)) } -func createContainers(img string, req v1.ResourceList, hostport int32) []v1.Container { - container := v1.Container{ - Image: img, - Name: img, - ImagePullPolicy: v1.PullIfNotPresent, - Resources: v1.ResourceRequirements{ - Requests: req, - }, - } - - if hostport > 0 { - container.Ports = []v1.ContainerPort{ - { - ContainerPort: hostport, - HostPort: hostport, - }, - } - } - - return []v1.Container{container} -} - -func createReplicaSet(context *context, name string, rep int32, img string, req v1.ResourceList) *appv1.ReplicaSet { - deploymentName := "deployment.k8s.io" - deployment := &appv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: context.namespace, - }, - Spec: appv1.ReplicaSetSpec{ - Replicas: &rep, - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - deploymentName: name, - }, - }, - Template: v1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{deploymentName: name}, - }, - Spec: v1.PodSpec{ - RestartPolicy: v1.RestartPolicyAlways, - Containers: []v1.Container{ - { - Image: img, - Name: name, - ImagePullPolicy: v1.PullIfNotPresent, - Resources: v1.ResourceRequirements{ - Requests: req, - }, - }, - }, - }, - }, - }, - } - - deployment, err := context.kubeclient.AppsV1().ReplicaSets(context.namespace).Create(gcontext.Background(), deployment, metav1.CreateOptions{}) - Expect(err).NotTo(HaveOccurred()) - - return deployment -} - func createJobAWWithInitContainer(context *context, name string, requeuingTimeInSeconds int, requeuingGrowthType string, requeuingMaxNumRequeuings int) *arbv1.AppWrapper { rb := []byte(`{"apiVersion": "batch/v1", "kind": "Job", @@ -865,346 +608,25 @@ func createJobAWWithInitContainer(context *context, name string, requeuingTimeIn "cpu": "500m" } } - } - ], - "containers": [ - { - "name": "job-container", - "image": "k8s.gcr.io/busybox:latest", - "command": ["sleep", "10"], - "resources": { - "requests": { - "cpu": "500m" - } - } - } - ] - } - } - }} `) - - var minAvailable int = 3 - - aw := &arbv1.AppWrapper{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: context.namespace, - }, - Spec: arbv1.AppWrapperSpec{ - SchedSpec: arbv1.SchedulingSpecTemplate{ - MinAvailable: minAvailable, - Requeuing: arbv1.RequeuingTemplate{ - TimeInSeconds: requeuingTimeInSeconds, - GrowthType: requeuingGrowthType, - MaxNumRequeuings: requeuingMaxNumRequeuings, - }, - }, - AggrResources: arbv1.AppWrapperResourceList{ - GenericItems: []arbv1.AppWrapperGenericResource{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: context.namespace, - }, - DesiredAvailable: 1, - GenericTemplate: runtime.RawExtension{ - Raw: rb, - }, - CompletionStatus: "Complete", - }, - }, - }, - }, - } - - appwrapper, err := context.karclient.ArbV1().AppWrappers(context.namespace).Create(aw) - Expect(err).NotTo(HaveOccurred()) - - return appwrapper -} - -func createDeploymentAW(context *context, name string) *arbv1.AppWrapper { - rb := []byte(`{"apiVersion": "apps/v1", - "kind": "Deployment", - "metadata": { - "name": "aw-deployment-3", - "namespace": "test", - "labels": { - "app": "aw-deployment-3" - } - }, - "spec": { - "replicas": 3, - "selector": { - "matchLabels": { - "app": "aw-deployment-3" - } - }, - "template": { - "metadata": { - "labels": { - "app": "aw-deployment-3" - }, - "annotations": { - "appwrapper.mcad.ibm.com/appwrapper-name": "aw-deployment-3" - } - }, - "spec": { - "containers": [ - { - "name": "aw-deployment-3", - "image": "kicbase/echo-server:1.0", - "ports": [ - { - "containerPort": 80 - } - ] - } - ] - } - } - }} `) - var schedSpecMin int = 3 - - aw := &arbv1.AppWrapper{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: context.namespace, - }, - Spec: arbv1.AppWrapperSpec{ - SchedSpec: arbv1.SchedulingSpecTemplate{ - MinAvailable: schedSpecMin, - }, - AggrResources: arbv1.AppWrapperResourceList{ - GenericItems: []arbv1.AppWrapperGenericResource{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item1"), - Namespace: context.namespace, - }, - DesiredAvailable: 1, - GenericTemplate: runtime.RawExtension{ - Raw: rb, - }, - }, - }, - }, - }, - } - - appwrapper, err := context.karclient.ArbV1().AppWrappers(context.namespace).Create(aw) - Expect(err).NotTo(HaveOccurred()) - - return appwrapper -} - -func createDeploymentAWwith900CPU(context *context, name string) *arbv1.AppWrapper { - rb := []byte(`{"apiVersion": "apps/v1", - "kind": "Deployment", - "metadata": { - "name": "aw-deployment-2-900cpu", - "namespace": "test", - "labels": { - "app": "aw-deployment-2-900cpu" - } - }, - "spec": { - "replicas": 2, - "selector": { - "matchLabels": { - "app": "aw-deployment-2-900cpu" - } - }, - "template": { - "metadata": { - "labels": { - "app": "aw-deployment-2-900cpu" - }, - "annotations": { - "appwrapper.mcad.ibm.com/appwrapper-name": "aw-deployment-2-900cpu" - } - }, - "spec": { - "containers": [ - { - "name": "aw-deployment-2-900cpu", - "image": "kicbase/echo-server:1.0", - "resources": { - "requests": { - "cpu": "900m" - } - }, - "ports": [ - { - "containerPort": 80 - } - ] - } - ] - } - } - }} `) - var schedSpecMin int = 2 - - aw := &arbv1.AppWrapper{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: context.namespace, - }, - Spec: arbv1.AppWrapperSpec{ - SchedSpec: arbv1.SchedulingSpecTemplate{ - MinAvailable: schedSpecMin, - }, - AggrResources: arbv1.AppWrapperResourceList{ - GenericItems: []arbv1.AppWrapperGenericResource{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item1"), - Namespace: context.namespace, - }, - DesiredAvailable: 1, - GenericTemplate: runtime.RawExtension{ - Raw: rb, - }, - }, - }, - }, - }, - } - - appwrapper, err := context.karclient.ArbV1().AppWrappers(context.namespace).Create(aw) - Expect(err).NotTo(HaveOccurred()) - - return appwrapper -} - -func createDeploymentAWwith550CPU(context *context, name string) *arbv1.AppWrapper { - rb := []byte(`{"apiVersion": "apps/v1", - "kind": "Deployment", - "metadata": { - "name": "` + name + `", - "namespace": "test", - "labels": { - "app": "` + name + `" - } - }, - "spec": { - "replicas": 2, - "selector": { - "matchLabels": { - "app": "` + name + `" - } - }, - "template": { - "metadata": { - "labels": { - "app": "` + name + `" - }, - "annotations": { - "appwrapper.mcad.ibm.com/appwrapper-name": "` + name + `" - } - }, - "spec": { - "containers": [ - { - "name": "` + name + `", - "image": "kicbase/echo-server:1.0", - "resources": { - "requests": { - "cpu": "550m" - } - }, - "ports": [ - { - "containerPort": 80 - } - ] - } - ] - } - } - }} `) - var schedSpecMin int = 2 - - aw := &arbv1.AppWrapper{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: context.namespace, - }, - Spec: arbv1.AppWrapperSpec{ - SchedSpec: arbv1.SchedulingSpecTemplate{ - MinAvailable: schedSpecMin, - }, - AggrResources: arbv1.AppWrapperResourceList{ - GenericItems: []arbv1.AppWrapperGenericResource{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item1"), - Namespace: context.namespace, - }, - DesiredAvailable: 1, - GenericTemplate: runtime.RawExtension{ - Raw: rb, - }, - }, - }, - }, - }, - } - - appwrapper, err := context.karclient.ArbV1().AppWrappers(context.namespace).Create(aw) - Expect(err).NotTo(HaveOccurred()) - - return appwrapper -} - -func createDeploymentAWwith125CPU(context *context, name string) *arbv1.AppWrapper { - rb := []byte(`{"apiVersion": "apps/v1", - "kind": "Deployment", - "metadata": { - "name": "aw-deployment-2-125cpu", - "namespace": "test", - "labels": { - "app": "aw-deployment-2-125cpu" - } - }, - "spec": { - "replicas": 2, - "selector": { - "matchLabels": { - "app": "aw-deployment-2-125cpu" - } - }, - "template": { - "metadata": { - "labels": { - "app": "aw-deployment-2-125cpu" - }, - "annotations": { - "appwrapper.mcad.ibm.com/appwrapper-name": "aw-deployment-2-125cpu" - } - }, - "spec": { + } + ], "containers": [ { - "name": "aw-deployment-2-125cpu", - "image": "kicbase/echo-server:1.0", + "name": "job-container", + "image": "k8s.gcr.io/busybox:latest", + "command": ["sleep", "10"], "resources": { "requests": { - "cpu": "125m" - } - }, - "ports": [ - { - "containerPort": 80 + "cpu": "500m" } - ] + } } ] } } }} `) - var schedSpecMin int = 2 + + var minAvailable int = 3 aw := &arbv1.AppWrapper{ ObjectMeta: metav1.ObjectMeta{ @@ -1213,19 +635,25 @@ func createDeploymentAWwith125CPU(context *context, name string) *arbv1.AppWrapp }, Spec: arbv1.AppWrapperSpec{ SchedSpec: arbv1.SchedulingSpecTemplate{ - MinAvailable: schedSpecMin, + MinAvailable: minAvailable, + Requeuing: arbv1.RequeuingTemplate{ + TimeInSeconds: requeuingTimeInSeconds, + GrowthType: requeuingGrowthType, + MaxNumRequeuings: requeuingMaxNumRequeuings, + }, }, AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item1"), + Name: name, Namespace: context.namespace, }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, }, + CompletionStatus: "Complete", }, }, }, @@ -1238,42 +666,37 @@ func createDeploymentAWwith125CPU(context *context, name string) *arbv1.AppWrapp return appwrapper } -func createDeploymentAWwith126CPU(context *context, name string) *arbv1.AppWrapper { +func createDeploymentAW(context *context, name string) *arbv1.AppWrapper { rb := []byte(`{"apiVersion": "apps/v1", "kind": "Deployment", "metadata": { - "name": "aw-deployment-2-126cpu", + "name": "aw-deployment-3", "namespace": "test", "labels": { - "app": "aw-deployment-2-126cpu" + "app": "aw-deployment-3" } }, "spec": { - "replicas": 2, + "replicas": 3, "selector": { "matchLabels": { - "app": "aw-deployment-2-126cpu" + "app": "aw-deployment-3" } }, "template": { "metadata": { "labels": { - "app": "aw-deployment-2-126cpu" + "app": "aw-deployment-3" }, "annotations": { - "appwrapper.mcad.ibm.com/appwrapper-name": "aw-deployment-2-126cpu" + "appwrapper.mcad.ibm.com/appwrapper-name": "aw-deployment-3" } }, "spec": { "containers": [ { - "name": "aw-deployment-2-126cpu", + "name": "aw-deployment-3", "image": "kicbase/echo-server:1.0", - "resources": { - "requests": { - "cpu": "126m" - } - }, "ports": [ { "containerPort": 80 @@ -1284,7 +707,7 @@ func createDeploymentAWwith126CPU(context *context, name string) *arbv1.AppWrapp } } }} `) - var schedSpecMin int = 2 + var schedSpecMin int = 3 aw := &arbv1.AppWrapper{ ObjectMeta: metav1.ObjectMeta{ @@ -1318,40 +741,40 @@ func createDeploymentAWwith126CPU(context *context, name string) *arbv1.AppWrapp return appwrapper } -func createDeploymentAWwith350CPU(context *context, name string) *arbv1.AppWrapper { +func createDeploymentAWwith550CPU(context *context, name string) *arbv1.AppWrapper { rb := []byte(`{"apiVersion": "apps/v1", "kind": "Deployment", "metadata": { - "name": "aw-deployment-2-350cpu", + "name": "` + name + `", "namespace": "test", "labels": { - "app": "aw-deployment-2-350cpu" + "app": "` + name + `" } }, "spec": { "replicas": 2, "selector": { "matchLabels": { - "app": "aw-deployment-2-350cpu" + "app": "` + name + `" } }, "template": { "metadata": { "labels": { - "app": "aw-deployment-2-350cpu" + "app": "` + name + `" }, "annotations": { - "appwrapper.mcad.ibm.com/appwrapper-name": "aw-deployment-2-350cpu" + "appwrapper.mcad.ibm.com/appwrapper-name": "` + name + `" } }, "spec": { "containers": [ { - "name": "aw-deployment-2-350cpu", + "name": "` + name + `", "image": "kicbase/echo-server:1.0", "resources": { "requests": { - "cpu": "350m" + "cpu": "550m" } }, "ports": [ @@ -1398,40 +821,40 @@ func createDeploymentAWwith350CPU(context *context, name string) *arbv1.AppWrapp return appwrapper } -func createDeploymentAWwith351CPU(context *context, name string) *arbv1.AppWrapper { +func createDeploymentAWwith350CPU(context *context, name string) *arbv1.AppWrapper { rb := []byte(`{"apiVersion": "apps/v1", "kind": "Deployment", "metadata": { - "name": "aw-deployment-2-351cpu", + "name": "aw-deployment-2-350cpu", "namespace": "test", "labels": { - "app": "aw-deployment-2-351cpu" + "app": "aw-deployment-2-350cpu" } }, "spec": { "replicas": 2, "selector": { "matchLabels": { - "app": "aw-deployment-2-351cpu" + "app": "aw-deployment-2-350cpu" } }, "template": { "metadata": { "labels": { - "app": "aw-deployment-2-351cpu" + "app": "aw-deployment-2-350cpu" }, "annotations": { - "appwrapper.mcad.ibm.com/appwrapper-name": "aw-deployment-2-351cpu" + "appwrapper.mcad.ibm.com/appwrapper-name": "aw-deployment-2-350cpu" } }, "spec": { "containers": [ { - "name": "aw-deployment-2-351cpu", + "name": "aw-deployment-2-350cpu", "image": "kicbase/echo-server:1.0", "resources": { "requests": { - "cpu": "351m" + "cpu": "350m" } }, "ports": [ @@ -1762,7 +1185,7 @@ func createGenericJobAWWithStatus(context *context, name string) *arbv1.AppWrapp } } }`) - //var schedSpecMin int = 1 + // var schedSpecMin int = 1 aw := &arbv1.AppWrapper{ ObjectMeta: metav1.ObjectMeta{ @@ -1771,7 +1194,7 @@ func createGenericJobAWWithStatus(context *context, name string) *arbv1.AppWrapp }, Spec: arbv1.AppWrapperSpec{ SchedSpec: arbv1.SchedulingSpecTemplate{ - //MinAvailable: schedSpecMin, + // MinAvailable: schedSpecMin, }, AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ @@ -2115,7 +1538,7 @@ func createGenericJobAWtWithLargeCompute(context *context, name string) *arbv1.A } } }`) - //var schedSpecMin int = 1 + // var schedSpecMin int = 1 aw := &arbv1.AppWrapper{ ObjectMeta: metav1.ObjectMeta{ @@ -2124,7 +1547,7 @@ func createGenericJobAWtWithLargeCompute(context *context, name string) *arbv1.A }, Spec: arbv1.AppWrapperSpec{ SchedSpec: arbv1.SchedulingSpecTemplate{ - //MinAvailable: schedSpecMin, + // MinAvailable: schedSpecMin, }, AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ @@ -2137,7 +1560,7 @@ func createGenericJobAWtWithLargeCompute(context *context, name string) *arbv1.A GenericTemplate: runtime.RawExtension{ Raw: rb, }, - //CompletionStatus: "Complete", + // CompletionStatus: "Complete", }, }, }, @@ -2190,7 +1613,7 @@ func createGenericServiceAWWithNoStatus(context *context, name string) *arbv1.Ap "type": "ClusterIP" } }`) - //var schedSpecMin int = 1 + // var schedSpecMin int = 1 aw := &arbv1.AppWrapper{ ObjectMeta: metav1.ObjectMeta{ @@ -2199,7 +1622,7 @@ func createGenericServiceAWWithNoStatus(context *context, name string) *arbv1.Ap }, Spec: arbv1.AppWrapperSpec{ SchedSpec: arbv1.SchedulingSpecTemplate{ - //MinAvailable: schedSpecMin, + // MinAvailable: schedSpecMin, }, AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ @@ -2365,128 +1788,6 @@ func createGenericDeploymentAWWithMultipleItems(context *context, name string) * return appwrapper } -func createGenericDeploymentAWWithService(context *context, name string) *arbv1.AppWrapper { - rb := []byte(`{"apiVersion": "apps/v1", - "kind": "Deployment", - "metadata": { - "name": "aw-deployment-3-status", - "namespace": "test", - "labels": { - "app": "aw-deployment-3-status" - } - }, - "spec": { - "replicas": 1, - "selector": { - "matchLabels": { - "app": "aw-deployment-3-status" - } - }, - "template": { - "metadata": { - "labels": { - "app": "aw-deployment-3-status" - }, - "annotations": { - "appwrapper.mcad.ibm.com/appwrapper-name": "aw-deployment-3-status" - } - }, - "spec": { - "containers": [ - { - "name": "aw-deployment-3-status", - "image": "kicbase/echo-server:1.0", - "ports": [ - { - "containerPort": 80 - } - ] - } - ] - } - } - }} `) - - rb1 := []byte(`{ - "apiVersion": "v1", - "kind": "Service", - "metadata": { - "name": "my-service", - "namespace": "test" - }, - "spec": { - "clusterIP": "10.96.76.247", - "clusterIPs": [ - "10.96.76.247" - ], - "ipFamilies": [ - "IPv4" - ], - "ipFamilyPolicy": "SingleStack", - "ports": [ - { - "port": 80, - "protocol": "TCP", - "targetPort": 9376 - } - ], - "selector": { - "app.kubernetes.io/name": "MyApp" - }, - "sessionAffinity": "None", - "type": "ClusterIP" - }, - "status": { - "loadBalancer": {} - } - }`) - - var schedSpecMin int = 1 - - aw := &arbv1.AppWrapper{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: "test", - }, - Spec: arbv1.AppWrapperSpec{ - SchedSpec: arbv1.SchedulingSpecTemplate{ - MinAvailable: schedSpecMin, - }, - AggrResources: arbv1.AppWrapperResourceList{ - GenericItems: []arbv1.AppWrapperGenericResource{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-deployment-3-status"), - Namespace: "test", - }, - DesiredAvailable: 1, - GenericTemplate: runtime.RawExtension{ - Raw: rb, - }, - CompletionStatus: "Progressing", - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "my-service"), - Namespace: "test", - }, - DesiredAvailable: 1, - GenericTemplate: runtime.RawExtension{ - Raw: rb1, - }, - CompletionStatus: "bogus", - }, - }, - }, - }, - } - - appwrapper, err := context.karclient.ArbV1().AppWrappers(context.namespace).Create(aw) - Expect(err).NotTo(HaveOccurred()) - - return appwrapper -} - func createGenericDeploymentWithCPUAW(context *context, name string, cpuDemand string, replicas int) *arbv1.AppWrapper { rb := []byte(fmt.Sprintf(`{ "apiVersion": "apps/v1", @@ -2890,7 +2191,7 @@ func createGenericStatefulSetAW(context *context, name string) *arbv1.AppWrapper // NOTE: // // Recommend this test not to be the last test in the test suite it may pass -// may pass the local test but may cause controller to fail which is not +// the local test but may cause controller to fail which is not // part of this test's validation. func createBadPodTemplateAW(context *context, name string) *arbv1.AppWrapper { rb := []byte(`{"apiVersion": "v1", @@ -3017,7 +2318,6 @@ func createPodTemplateAW(context *context, name string) *arbv1.AppWrapper { Name: fmt.Sprintf("%s-%s", name, "item"), Namespace: context.namespace, }, - DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, }, @@ -3027,7 +2327,6 @@ func createPodTemplateAW(context *context, name string) *arbv1.AppWrapper { Name: fmt.Sprintf("%s-%s", name, "item1"), Namespace: context.namespace, }, - DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb1, }, @@ -3044,7 +2343,8 @@ func createPodTemplateAW(context *context, name string) *arbv1.AppWrapper { } func createPodCheckFailedStatusAW(context *context, name string) *arbv1.AppWrapper { - rb := []byte(`{"apiVersion": "v1", + rb := []byte(`{ + "apiVersion": "v1", "kind": "Pod", "metadata": { "name": "aw-checkfailedstatus-1", @@ -3075,6 +2375,7 @@ func createPodCheckFailedStatusAW(context *context, name string) *arbv1.AppWrapp ] } } `) + var schedSpecMin int = 1 aw := &arbv1.AppWrapper{ @@ -3110,7 +2411,6 @@ func createPodCheckFailedStatusAW(context *context, name string) *arbv1.AppWrapp } func createGenericPodAWCustomDemand(context *context, name string, cpuDemand string) *arbv1.AppWrapper { - genericItems := fmt.Sprintf(`{ "apiVersion": "v1", "kind": "Pod", @@ -3186,7 +2486,8 @@ func createGenericPodAWCustomDemand(context *context, name string, cpuDemand str } func createGenericPodAW(context *context, name string) *arbv1.AppWrapper { - rb := []byte(`{"apiVersion": "v1", + rb := []byte(`{ + "apiVersion": "v1", "kind": "Pod", "metadata": { "name": "aw-generic-pod-1", @@ -3259,7 +2560,8 @@ func createGenericPodAW(context *context, name string) *arbv1.AppWrapper { } func createGenericPodTooBigAW(context *context, name string) *arbv1.AppWrapper { - rb := []byte(`{"apiVersion": "v1", + rb := []byte(`{ + "apiVersion": "v1", "kind": "Pod", "metadata": { "name": "aw-generic-big-pod-1", @@ -3334,7 +2636,8 @@ func createGenericPodTooBigAW(context *context, name string) *arbv1.AppWrapper { } func createBadGenericPodAW(context *context, name string) *arbv1.AppWrapper { - rb := []byte(`{"apiVersion": "v1", + rb := []byte(`{ + "apiVersion": "v1", "kind": "Pod", "metadata": { "labels": { @@ -3392,7 +2695,7 @@ func createBadGenericPodAW(context *context, name string) *arbv1.AppWrapper { } func createBadGenericItemAW(context *context, name string) *arbv1.AppWrapper { - //rb := []byte(`""`) + // rb := []byte(`""`) var schedSpecMin int = 1 aw := &arbv1.AppWrapper{ @@ -3427,7 +2730,7 @@ func createBadGenericItemAW(context *context, name string) *arbv1.AppWrapper { } func createBadGenericPodTemplateAW(context *context, name string) (*arbv1.AppWrapper, error) { - rb := []byte(`{"metadata": + rb := []byte(`{"metadata": { "name": "aw-generic-podtemplate-2", "namespace": "test", @@ -3491,13 +2794,6 @@ func createBadGenericPodTemplateAW(context *context, name string) (*arbv1.AppWra return appwrapper, err } -func deleteReplicaSet(ctx *context, name string) error { - foreground := metav1.DeletePropagationForeground - return ctx.kubeclient.AppsV1().ReplicaSets(ctx.namespace).Delete(gcontext.Background(), name, metav1.DeleteOptions{ - PropagationPolicy: &foreground, - }) -} - func deleteAppWrapper(ctx *context, name string) error { foreground := metav1.DeletePropagationForeground return ctx.karclient.ArbV1().AppWrappers(ctx.namespace).Delete(name, &metav1.DeleteOptions{ @@ -3505,90 +2801,6 @@ func deleteAppWrapper(ctx *context, name string) error { }) } -func replicaSetReady(ctx *context, name string) wait.ConditionFunc { - return func() (bool, error) { - deployment, err := ctx.kubeclient.ExtensionsV1beta1().ReplicaSets(ctx.namespace).Get(gcontext.Background(), name, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - - pods, err := ctx.kubeclient.CoreV1().Pods(ctx.namespace).List(gcontext.Background(), metav1.ListOptions{}) - Expect(err).NotTo(HaveOccurred()) - - labelSelector := labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels) - - readyTaskNum := 0 - for _, pod := range pods.Items { - if !labelSelector.Matches(labels.Set(pod.Labels)) { - continue - } - if pod.Status.Phase == v1.PodRunning || pod.Status.Phase == v1.PodSucceeded { - readyTaskNum++ - } - } - - return *(deployment.Spec.Replicas) == int32(readyTaskNum), nil - } -} - -func waitReplicaSetReady(ctx *context, name string) error { - return wait.Poll(100*time.Millisecond, ninetySeconds, replicaSetReady(ctx, name)) -} - -func clusterSize(ctx *context, req v1.ResourceList) int32 { - nodes, err := ctx.kubeclient.CoreV1().Nodes().List(gcontext.Background(), metav1.ListOptions{}) - Expect(err).NotTo(HaveOccurred()) - - pods, err := ctx.kubeclient.CoreV1().Pods(metav1.NamespaceAll).List(gcontext.Background(), metav1.ListOptions{}) - Expect(err).NotTo(HaveOccurred()) - - used := map[string]*csapi.Resource{} - - for _, pod := range pods.Items { - nodeName := pod.Spec.NodeName - if len(nodeName) == 0 || pod.DeletionTimestamp != nil { - continue - } - - if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed { - continue - } - - if _, found := used[nodeName]; !found { - used[nodeName] = csapi.EmptyResource() - } - - for _, c := range pod.Spec.Containers { - req := csapi.NewResource(c.Resources.Requests) - used[nodeName].Add(req) - } - } - - res := int32(0) - - for _, node := range nodes.Items { - // Skip node with taints - if len(node.Spec.Taints) != 0 { - continue - } - - alloc := csapi.NewResource(node.Status.Allocatable) - slot := csapi.NewResource(req) - - // Removed used resources. - if res, found := used[node.Name]; found { - _, err := alloc.Sub(res) - Expect(err).NotTo(HaveOccurred()) - } - - for slot.LessEqual(alloc) { - _, err := alloc.Sub(slot) - Expect(err).NotTo(HaveOccurred()) - res++ - } - } - - return res -} - func getPodsOfAppWrapper(ctx *context, aw *arbv1.AppWrapper) []*v1.Pod { aw, err := ctx.karclient.ArbV1().AppWrappers(aw.Namespace).Get(aw.Name, metav1.GetOptions{}) Expect(err).NotTo(HaveOccurred()) @@ -3611,59 +2823,6 @@ func getPodsOfAppWrapper(ctx *context, aw *arbv1.AppWrapper) []*v1.Pod { return awpods } -func taintAllNodes(ctx *context, taints []v1.Taint) error { - nodes, err := ctx.kubeclient.CoreV1().Nodes().List(gcontext.Background(), metav1.ListOptions{}) - Expect(err).NotTo(HaveOccurred()) - - for _, node := range nodes.Items { - newNode := node.DeepCopy() - - newTaints := newNode.Spec.Taints - for _, t := range taints { - found := false - for _, nt := range newTaints { - if nt.Key == t.Key { - found = true - break - } - } - - if !found { - newTaints = append(newTaints, t) - } - } - - newNode.Spec.Taints = newTaints - - patchBytes, err := preparePatchBytesforNode(node.Name, &node, newNode) - Expect(err).NotTo(HaveOccurred()) - - _, err = ctx.kubeclient.CoreV1().Nodes().Patch(gcontext.Background(), node.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}) - Expect(err).NotTo(HaveOccurred()) - } - - return nil -} - -func preparePatchBytesforNode(nodeName string, oldNode *v1.Node, newNode *v1.Node) ([]byte, error) { - oldData, err := json.Marshal(oldNode) - if err != nil { - return nil, fmt.Errorf("failed to Marshal oldData for node %q: %v", nodeName, err) - } - - newData, err := json.Marshal(newNode) - if err != nil { - return nil, fmt.Errorf("failed to Marshal newData for node %q: %v", nodeName, err) - } - - patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, v1.Node{}) - if err != nil { - return nil, fmt.Errorf("failed to CreateTwoWayMergePatch for node %q: %v", nodeName, err) - } - - return patchBytes, nil -} - const charset = "abcdefghijklmnopqrstuvwxyz0123456789" func appendRandomString(value string) string {