
Commit d82353a

e2e test for heterogeneous cluster

1 parent 2e28f8a · commit d82353a

File tree

3 files changed: +149 −30 lines

  .github/workflows/e2e_tests.yaml
  tests/e2e/heterogeneous_clusters_test.py
  tests/e2e/support.py

Diff for: .github/workflows/e2e_tests.yaml (+4)

@@ -73,6 +73,8 @@ jobs:
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind
+        with:
+          worker-nodes: 1

       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator
@@ -111,6 +113,8 @@ jobs:
           kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
           kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
           kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
+          kubectl create clusterrole pod-creator --verb=get,list --resource=pods
+          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
           kubectl config use-context sdk-user

       - name: Run e2e tests
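The two changes above work together: worker-nodes: 1 gives the KinD cluster a second schedulable node (control plane plus one worker), so each resource flavor in the new test can land on a different node, and the pod-creator role (which, despite its name, only grants get/list on pods) lets sdk-user resolve pod-to-node placement. A minimal sketch of exercising that permission, assuming the sdk-user kubeconfig context from the workflow above and the official kubernetes Python client; the "default" namespace is an assumption:

# Sketch only, not part of the commit: confirm the sdk-user context can list
# pods and read their node assignment, which is all get_pod_node() below needs.
from kubernetes import client, config

config.load_kube_config(context="sdk-user")
v1 = client.CoreV1Api()

for pod in v1.list_namespaced_pod("default").items:
    print(pod.metadata.name, "->", pod.spec.node_name)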

Diff for: tests/e2e/heterogeneous_clusters_test.py (new file, +76)

@@ -0,0 +1,76 @@
+from time import sleep
+import time
+from codeflare_sdk import (
+    Cluster,
+    ClusterConfiguration,
+    TokenAuthentication,
+    generate_cert,
+)
+
+from codeflare_sdk.common.kueue.kueue import list_local_queues
+
+import pytest
+import ray
+import math
+
+from support import *
+
+
+@pytest.mark.kind
+class TestRayLocalInteractiveOauth:
+    def setup_method(self):
+        initialize_kubernetes_client(self)
+
+    def teardown_method(self):
+        delete_namespace(self)
+        delete_kueue_resources(self)
+
+    @pytest.mark.nvidia_gpu
+    def test_heterogeneous_clusters(self):
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_heterogeneous_clusters()
+
+    def run_heterogeneous_clusters(
+        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
+        cluster_name = "test-ray-cluster-li"
+
+        used_nodes = []
+
+        for flavor in self.resource_flavors:
+            queues = list_local_queues(namespace=self.namespace, flavors=[flavor])
+            queue_name = queues[0]["name"] if queues else None
+            print(f"Using flavor: {flavor}, Queue: {queue_name}")
+            cluster = Cluster(
+                ClusterConfiguration(
+                    name=f"{cluster_name}-{flavor}",
+                    namespace=self.namespace,
+                    num_workers=1,
+                    head_cpu_requests="500m",
+                    head_cpu_limits="500m",
+                    head_memory_requests=2,
+                    head_memory_limits=2,
+                    worker_cpu_requests="500m",
+                    worker_cpu_limits=1,
+                    worker_memory_requests=1,
+                    worker_memory_limits=4,
+                    worker_extended_resource_requests={
+                        gpu_resource_name: number_of_gpus
+                    },
+                    write_to_file=True,
+                    verify_tls=False,
+                    local_queue=queue_name,
+                )
+            )
+            cluster.up()
+            time.sleep(2)
+            pod_name = f"{cluster_name}-{flavor}"
+            node_name = get_pod_node(self, self.namespace, pod_name)
+            print(f"Cluster {cluster_name}-{flavor} is running on node: {node_name}")
+            time.sleep(2)
+            assert (
+                node_name not in used_nodes
+            ), f"Node {node_name} was already used by another flavor."
+            used_nodes.append(node_name)
+            cluster.down()
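The invariant this test enforces is that each ResourceFlavor steers its RayCluster onto a distinct node, tracked through used_nodes. A hedged sketch of invoking just this test via pytest's Python API (the kind and nvidia_gpu markers come from the decorators above; the invocation itself is an assumption about the repo's usual workflow, not part of this commit):

# Sketch: run only the heterogeneous-cluster test, equivalent to
#   pytest -s -m "kind and nvidia_gpu" tests/e2e/heterogeneous_clusters_test.py
import sys
import pytest

sys.exit(
    pytest.main(
        ["-s", "-m", "kind and nvidia_gpu", "tests/e2e/heterogeneous_clusters_test.py"]
    )
)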

Diff for: tests/e2e/support.py (+69 −30)

@@ -65,19 +65,30 @@ def create_namespace(self):
         return RuntimeError(e)


-def create_new_resource_flavor(self):
-    self.resource_flavor = f"test-resource-flavor-{random_choice()}"
-    create_resource_flavor(self, self.resource_flavor)
+def create_new_resource_flavor(self, num_flavors=2):
+    self.resource_flavors = []
+    for i in range(num_flavors):
+        default = i < 1
+        resource_flavor = f"test-resource-flavor-{random_choice()}"
+        create_resource_flavor(self, resource_flavor, default)
+        self.resource_flavors.append(resource_flavor)


-def create_new_cluster_queue(self):
-    self.cluster_queue = f"test-cluster-queue-{random_choice()}"
-    create_cluster_queue(self, self.cluster_queue, self.resource_flavor)
+def create_new_cluster_queue(self, num_queues=2):
+    self.cluster_queues = []
+    for i in range(num_queues):
+        cluster_queue_name = f"test-cluster-queue-{random_choice()}"
+        create_cluster_queue(self, cluster_queue_name, self.resource_flavors[i])
+        self.cluster_queues.append(cluster_queue_name)


-def create_new_local_queue(self):
-    self.local_queue = f"test-local-queue-{random_choice()}"
-    create_local_queue(self, self.cluster_queue, self.local_queue)
+def create_new_local_queue(self, num_queues=2):
+    self.local_queues = []
+    for i in range(num_queues):
+        is_default = i == 0
+        local_queue_name = f"test-local-queue-{random_choice()}"
+        create_local_queue(self, self.cluster_queues[i], local_queue_name, is_default)
+        self.local_queues.append(local_queue_name)


 def create_namespace_with_name(self, namespace_name):
@@ -132,7 +143,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
                         {"name": "memory", "nominalQuota": "36Gi"},
                         {"name": "nvidia.com/gpu", "nominalQuota": 1},
                     ],
-                }
+                },
             ],
         }
     ],
@@ -161,11 +172,21 @@ def create_cluster_queue(self, cluster_queue, flavor):
     self.cluster_queue = cluster_queue


-def create_resource_flavor(self, flavor):
+def create_resource_flavor(self, flavor, default=True):
     resource_flavor_json = {
         "apiVersion": "kueue.x-k8s.io/v1beta1",
         "kind": "ResourceFlavor",
         "metadata": {"name": flavor},
+        "spec": {
+            "nodeLabels": {"worker-1" if default else "ingress-ready": "true"},
+            "tolerations": [
+                {
+                    "key": "node-role.kubernetes.io/control-plane",
+                    "operator": "Exists",
+                    "effect": "NoSchedule",
+                }
+            ],
+        },
     }

     try:
@@ -190,14 +211,14 @@ def create_resource_flavor(self, flavor):
     self.resource_flavor = flavor


-def create_local_queue(self, cluster_queue, local_queue):
+def create_local_queue(self, cluster_queue, local_queue, is_default=True):
     local_queue_json = {
         "apiVersion": "kueue.x-k8s.io/v1beta1",
         "kind": "LocalQueue",
         "metadata": {
             "namespace": self.namespace,
             "name": local_queue,
-            "annotations": {"kueue.x-k8s.io/default-queue": "true"},
+            "annotations": {"kueue.x-k8s.io/default-queue": str(is_default).lower()},
         },
         "spec": {"clusterQueue": cluster_queue},
     }
@@ -235,25 +256,43 @@ def create_kueue_resources(self):

 def delete_kueue_resources(self):
     # Delete if given cluster-queue exists
-    try:
-        self.custom_api.delete_cluster_custom_object(
-            group="kueue.x-k8s.io",
-            plural="clusterqueues",
-            version="v1beta1",
-            name=self.cluster_queue,
-        )
-        print(f"\n'{self.cluster_queue}' cluster-queue deleted")
-    except Exception as e:
-        print(f"\nError deleting cluster-queue '{self.cluster_queue}' : {e}")
+    for cq in self.cluster_queues:
+        try:
+            self.custom_api.delete_cluster_custom_object(
+                group="kueue.x-k8s.io",
+                plural="clusterqueues",
+                version="v1beta1",
+                name=cq,
+            )
+            print(f"\n'{cq}' cluster-queue deleted")
+        except Exception as e:
+            print(f"\nError deleting cluster-queue '{cq}' : {e}")

     # Delete if given resource-flavor exists
+    for flavor in self.resource_flavors:
+        try:
+            self.custom_api.delete_cluster_custom_object(
+                group="kueue.x-k8s.io",
+                plural="resourceflavors",
+                version="v1beta1",
+                name=flavor,
+            )
+            print(f"'{flavor}' resource-flavor deleted")
+        except Exception as e:
+            print(f"\nError deleting resource-flavor '{flavor}': {e}")
+
+
+def get_pod_node(self, namespace, name):
+    label_selector = f"ray.io/cluster={name}"
     try:
-        self.custom_api.delete_cluster_custom_object(
-            group="kueue.x-k8s.io",
-            plural="resourceflavors",
-            version="v1beta1",
-            name=self.resource_flavor,
+        pods = self.api_instance.list_namespaced_pod(
+            namespace, label_selector=label_selector
         )
-        print(f"'{self.resource_flavor}' resource-flavor deleted")
+        if not pods.items:
+            raise ValueError(
+                f"No pods found with label 'ray.io/cluster={name}' in namespace '{namespace}'"
+            )
+        pod = pods.items[0]
+        return pod.spec.node_name
     except Exception as e:
-        print(f"\nError deleting resource-flavor '{self.resource_flavor}' : {e}")
+        print(f"\nError retrieving pod node: {e}")
