Skip to content

Commit 9df2f58

Browse files
authored
Quota check preceding resource check (#614)
* Quota release when AppWrapper is completed * Quota release testing. Fix in the Fits function to allocate consumer. * Aggregate all the resources names accross all children in the tree to create a single resource name group * Reverted changes from previous commits to current version of main * Added GPU as one of the default resources * Formatting fixes * Fixed bug in quota management library that didn't preempt jobs when borrowing at the root * Quota check preceeding resource check update. This version allows to dispatch AppWrappers that would have enough quota if borrowing AppWrappers are discounted from the resources (due to those being preempted). * Interface fix * Added some comments and TODOs * Kuttl quota borrowing test * Separated borrowing tests so that quotas are resetted * Added missing files for the borrowing test * Added commands to build quota tree when the test is run
1 parent 104c841 commit 9df2f58

21 files changed

+515
-152
lines changed

Diff for: hack/run-e2e-kind.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ export IMAGE_MCAD="${IMAGE_REPOSITORY_MCAD}:${IMAGE_TAG_MCAD}"
4545
CLUSTER_STARTED="false"
4646
export KUTTL_VERSION=0.15.0
4747
export KUTTL_OPTIONS=${TEST_KUTTL_OPTIONS}
48-
export KUTTL_TEST_SUITES=("${ROOT_DIR}/test/kuttl-test.yaml" "${ROOT_DIR}/test/kuttl-test-deployment-03.yaml" "${ROOT_DIR}/test/kuttl-test-deployment-02.yaml" "${ROOT_DIR}/test/kuttl-test-deployment-01.yaml")
48+
export KUTTL_TEST_SUITES=("${ROOT_DIR}/test/kuttl-test.yaml" "${ROOT_DIR}/test/kuttl-test-borrowing.yaml" "${ROOT_DIR}/test/kuttl-test-deployment-03.yaml" "${ROOT_DIR}/test/kuttl-test-deployment-02.yaml" "${ROOT_DIR}/test/kuttl-test-deployment-01.yaml")
4949
DUMP_LOGS="true"
5050

5151
function update_test_host {

Diff for: pkg/controller/clusterstate/api/node_info.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,10 @@ type NodeInfo struct {
4343

4444
// The releasing resource on that node
4545
Releasing *Resource
46+
4647
// The idle resource on that node
4748
Idle *Resource
49+
4850
// The used resource on that node, including running and terminating
4951
// pods
5052
Used *Resource
@@ -104,7 +106,6 @@ func (ni *NodeInfo) Clone() *NodeInfo {
104106
func (ni *NodeInfo) SetNode(node *v1.Node) {
105107
if ni.Node == nil {
106108
ni.Idle = NewResource(node.Status.Allocatable)
107-
108109
}
109110

110111
ni.Name = node.Name
@@ -121,5 +122,4 @@ func (ni NodeInfo) String() string {
121122

122123
return fmt.Sprintf("Node (%s): idle <%v>, used <%v>, releasing <%v>%s",
123124
ni.Name, ni.Idle, ni.Used, ni.Releasing, res)
124-
125125
}

Diff for: pkg/controller/queuejob/queuejob_controller_ex.go

+135-138
Large diffs are not rendered by default.

Diff for: pkg/controller/quota/quota_manager_interface.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import (
2121
)
2222

2323
type QuotaManagerInterface interface {
24-
Fits(aw *arbv1.AppWrapper, resources *clusterstateapi.Resource, proposedPremptions []*arbv1.AppWrapper) (bool, []*arbv1.AppWrapper, string)
24+
Fits(aw *arbv1.AppWrapper, requestedResources *clusterstateapi.Resource, clusterResources *clusterstateapi.Resource, proposedPremptions []*arbv1.AppWrapper) (bool, []*arbv1.AppWrapper, string)
2525
Release(aw *arbv1.AppWrapper) bool
2626
GetValidQuotaLabels() []string
2727
}

Diff for: pkg/controller/quota/quotaforestmanager/qm_lib_backend_with_quotasubt_mgr.go

+42-3
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
qmbackend "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota"
3333
qmbackendutils "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota/utils"
3434
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/util"
35+
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources/genericresource"
3536
"k8s.io/client-go/rest"
3637

3738
"math"
@@ -205,7 +206,7 @@ func (qm *QuotaManager) loadDispatchedAWs(dispatchedAWDemands map[string]*cluste
205206
}
206207
aw.SetLabels(newLabels)
207208

208-
doesFit, preemptionIds, errorMessage := qm.Fits(aw, v, nil)
209+
doesFit, preemptionIds, errorMessage := qm.Fits(aw, v, nil, nil)
209210
if !doesFit {
210211
klog.Errorf("[loadDispatchedAWs] Loading of AppWrapper %s/%s failed.",
211212
aw.Namespace, aw.Name)
@@ -509,7 +510,7 @@ func (qm *QuotaManager) buildRequest(aw *arbv1.AppWrapper,
509510

510511
return consumerInfo, err
511512
}
512-
func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi.Resource,
513+
func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi.Resource, clusterResources *clusterstateapi.Resource,
513514
proposedPreemptions []*arbv1.AppWrapper) (bool, []*arbv1.AppWrapper, string) {
514515

515516
// If a Quota Manager Backend instance does not exists then assume quota failed
@@ -555,7 +556,7 @@ func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi
555556

556557
consumerID := consumerInfo.GetID()
557558
klog.V(4).Infof("[Fits] Sending quota allocation request: %#v ", consumerInfo)
558-
allocResponse, err := qm.quotaManagerBackend.AllocateForest(QuotaManagerForestName, consumerID)
559+
allocResponse, err := qm.quotaManagerBackend.TryAllocateForest(QuotaManagerForestName, consumerID)
559560
if err != nil {
560561
qm.removeConsumer(consumerID)
561562
klog.Errorf("[Fits] Error allocating consumer: %s/%s, err=%#v.", aw.Namespace, aw.Name, err)
@@ -569,9 +570,47 @@ func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi
569570
return doesFit, preemptIds, strings.TrimSpace(allocResponse.GetMessage())
570571
}
571572
preemptIds = qm.getAppWrappers(allocResponse.GetPreemptedIds())
573+
574+
// Update cluster resources in the even that preemption happens
575+
// TODO: Potentially move this resource updated out to the calling function (would need to comeback again to undo the allocation
576+
// if the resources are not enough after preemption)
577+
if clusterResources != nil {
578+
updatedResources := clusterResources
579+
580+
for _, appWrapper := range preemptIds {
581+
updatedResources.Add(qm.getAggregatedResources(appWrapper))
582+
}
583+
584+
// Check if job fits with the update resources after preempted AppWrappers are removed
585+
if clusterResources != nil && !awResDemands.LessEqual(updatedResources) {
586+
qm.quotaManagerBackend.UndoAllocateForest(QuotaManagerForestName, consumerID)
587+
qm.removeConsumer(consumerID)
588+
return false, preemptIds, fmt.Sprintf("[Fits] AppWrapper '%s/%s' does not fit in the cluster, even after borrowed quota is freed", aw.Namespace, aw.Name)
589+
}
590+
}
591+
572592
return doesFit, preemptIds, strings.TrimSpace(allocResponse.GetMessage())
573593
}
574594

595+
func (qm *QuotaManager) getAggregatedResources(appWrapper *arbv1.AppWrapper) *clusterstateapi.Resource {
596+
// After quota evaluation, a set of AppWrappers is returned for preemption. Before deciding to delete them,
597+
// we need to make sure enough resources are free for the new AppWrapper after the preemptable list is deleted.
598+
// For this we need to add back the requests consumed by the preemptable AppWrappers to the available resources
599+
// in order to perform a correct resource check with updated values.
600+
allocated := clusterstateapi.EmptyResource()
601+
602+
for _, genericItem := range appWrapper.Spec.AggrResources.GenericItems {
603+
resources, err := genericresource.GetResources(&genericItem)
604+
if err != nil {
605+
klog.V(8).Infof("[GetAggregatedResources] Failure aggregating resources for %s/%s, err=%#v, genericItem=%#v",
606+
appWrapper.Namespace, appWrapper.Name, err, genericItem)
607+
}
608+
allocated = allocated.Add(resources)
609+
}
610+
611+
return allocated
612+
}
613+
575614
func (qm *QuotaManager) getAppWrappers(preemptIds []string) []*arbv1.AppWrapper {
576615
var aws []*arbv1.AppWrapper
577616
if len(preemptIds) <= 0 {

Diff for: pkg/quotaplugins/quota-forest/quota-manager/quota/core/quotanode.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ func (qn *QuotaNode) SlideDown() {
138138
func (qn *QuotaNode) SlideUp(c *Consumer, applyPriority bool, allocationRecovery *AllocationRecovery,
139139
preemptedConsumers *[]string) bool {
140140

141-
if qn.isHard {
141+
if qn.isHard && !qn.IsRoot() {
142142
return false
143143
}
144144

Diff for: pkg/quotaplugins/quota-forest/quota-manager/quota/quotamanager.go

+5-3
Original file line numberDiff line numberDiff line change
@@ -494,12 +494,14 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response
494494

495495
// TryAllocateForest : allocate a consumer on a forest
496496
func (m *Manager) TryAllocateForest(forestName string, consumerID string) (response *core.AllocationResponse, err error) {
497+
if m.mode != Normal {
498+
response, err = m.AllocateForest(forestName, consumerID)
499+
return response, err
500+
}
501+
497502
m.mutex.Lock()
498503
defer m.mutex.Unlock()
499504

500-
if m.mode != Normal {
501-
return nil, fmt.Errorf("manager is not in normal mode")
502-
}
503505
forestController, forestConsumer, err := m.preAllocateForest(forestName, consumerID)
504506
if err == nil && forestController.IsConsumerAllocated(consumerID) {
505507
err = fmt.Errorf("consumer %s already allocated on forest %s", consumerID, forestName)

Diff for: pkg/quotaplugins/quota-forest/quota-manager/quota/utils/defaults.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ var (
2121
DefaultTreeName string = "default"
2222

2323
// DefaultResourceNames : the default resource names
24-
DefaultResourceNames []string = []string{"cpu", "memory"}
24+
DefaultResourceNames []string = []string{"cpu", "memory", "nvidia.com/gpu"}
2525

2626
// DefaultTreeKind : the default kind attribute of the tree
2727
DefaultTreeKind string = "QuotaTree"

Diff for: pkg/quotaplugins/quota-simple-rest/quota_rest_manager.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ func (qm *QuotaManager) getQuotaDesignation(aw *arbv1.AppWrapper) []QuotaGroup {
257257
return groups
258258
}
259259

260-
func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi.Resource,
260+
func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi.Resource, clusterResources *clusterstateapi.Resource,
261261
proposedPreemptions []*arbv1.AppWrapper) (bool, []*arbv1.AppWrapper, string) {
262262

263263
// Handle uninitialized quota manager

Diff for: test/e2e-kuttl-borrowing/install-quota-subtree.yaml

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
---
2+
apiVersion: ibm.com/v1
3+
kind: QuotaSubtree
4+
metadata:
5+
name: context-root
6+
namespace: kube-system
7+
labels:
8+
tree: quota_context
9+
spec:
10+
children:
11+
- name: context-root
12+
quotas:
13+
requests:
14+
cpu: 1075m
15+
memory: 1045Mi
16+
nvidia.com/gpu: 16
17+
---
18+
apiVersion: ibm.com/v1
19+
kind: QuotaSubtree
20+
metadata:
21+
name: service-root
22+
namespace: kube-system
23+
labels:
24+
tree: quota_service
25+
spec:
26+
children:
27+
- name: service-root
28+
quotas:
29+
requests:
30+
cpu: 1075m
31+
memory: 1045Mi
32+
nvidia.com/gpu: 16
33+
---
34+
apiVersion: ibm.com/v1
35+
kind: QuotaSubtree
36+
metadata:
37+
name: context-root-children
38+
namespace: kube-system
39+
labels:
40+
tree: quota_context
41+
spec:
42+
parent: context-root
43+
children:
44+
- name: gold
45+
quotas:
46+
hardLimit: false
47+
requests:
48+
cpu: 1075m
49+
memory: 450Mi
50+
nvidia.com/gpu: 8
51+
- name: silver
52+
quotas:
53+
hardLimit: false
54+
requests:
55+
cpu: 1075m
56+
memory: 400Mi
57+
nvidia.com/gpu: 8
58+
- name: bronze
59+
quotas:
60+
hardLimit: true
61+
requests:
62+
cpu: 900m
63+
memory: 300Mi
64+
nvidia.com/gpu: 8
65+
- name: default
66+
quotas:
67+
hardLimit: false
68+
requests:
69+
cpu: 0m
70+
memory: 0Mi
71+
nvidia.com/gpu: 0
72+
---
73+
apiVersion: ibm.com/v1
74+
kind: QuotaSubtree
75+
metadata:
76+
name: service-root-children
77+
namespace: kube-system
78+
labels:
79+
tree: quota_service
80+
spec:
81+
parent: service-root
82+
children:
83+
- name: gold
84+
quotas:
85+
requests:
86+
cpu: 1075m
87+
memory: 1045Mi
88+
nvidia.com/gpu: 16
89+
- name: default
90+
quotas:
91+
hardLimit: false
92+
requests:
93+
cpu: 0m
94+
memory: 0Mi
95+
nvidia.com/gpu: 0

Diff for: test/e2e-kuttl-borrowing/steps/00-assert.yaml

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Verify CRDs existence
2+
apiVersion: apiextensions.k8s.io/v1
3+
kind: CustomResourceDefinition
4+
metadata:
5+
name: appwrappers.mcad.ibm.com
6+
status:
7+
acceptedNames:
8+
kind: AppWrapper
9+
listKind: AppWrapperList
10+
plural: appwrappers
11+
singular: appwrapper
12+
storedVersions:
13+
- v1beta1
14+
---
15+
apiVersion: apiextensions.k8s.io/v1
16+
kind: CustomResourceDefinition
17+
metadata:
18+
name: quotasubtrees.ibm.com
19+
status:
20+
acceptedNames:
21+
kind: QuotaSubtree
22+
singular: quotasubtree
23+
plural: quotasubtrees
24+
storedVersions:
25+
- v1

Diff for: test/e2e-kuttl-borrowing/steps/01-assert.yaml

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Verify subtree creations
2+
apiVersion: ibm.com/v1
3+
kind: QuotaSubtree
4+
metadata:
5+
name: context-root
6+
namespace: kube-system
7+
labels:
8+
tree: quota_context
9+
---
10+
apiVersion: ibm.com/v1
11+
kind: QuotaSubtree
12+
metadata:
13+
name: service-root
14+
namespace: kube-system
15+
labels:
16+
tree: quota_service
17+
---
18+
apiVersion: ibm.com/v1
19+
kind: QuotaSubtree
20+
metadata:
21+
name: context-root-children
22+
namespace: kube-system
23+
labels:
24+
tree: quota_context
25+
---
26+
apiVersion: ibm.com/v1
27+
kind: QuotaSubtree
28+
metadata:
29+
name: service-root-children
30+
namespace: kube-system
31+
labels:
32+
tree: quota_service

Diff for: test/e2e-kuttl-borrowing/steps/02-assert.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Verify test namespace existence
2+
apiVersion: v1
3+
kind: Namespace
4+
metadata:
5+
name: test

Diff for: test/e2e-kuttl-borrowing/steps/02-install.yaml

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
apiVersion: v1
2+
kind: Namespace
3+
metadata:
4+
name: test

Diff for: test/e2e-kuttl-borrowing/steps/03-assert.yaml

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Verify job is running
2+
---
3+
apiVersion: mcad.ibm.com/v1beta1
4+
kind: AppWrapper
5+
metadata:
6+
name: my-job-1
7+
namespace: test
8+
status:
9+
state: Running

0 commit comments

Comments
 (0)