Skip to content

Commit 15e6f26

Browse files
authored
Cli submit delete raycluster (#257)
* add: create cluster from yaml function
* add: submit and delete functions
* change: cluster_name to name in submit raycluster
* add: load_auth in delete function
* update: make get_cluster function use new config
* test: unit tests for submit and delete raycluster commands
* change: format slightly on submit/delete commands
* Add: context for current namespace and .codeflare path
* fix: remove load_auth in functions so it doesn't run twice
* Add: help messages for submit and delete functions
* cleanup
* remove: remove get_namespace every function call
* fix: fix tests
* change: make namespace default to 'default' and change test slightly
* refactor: remove unused imports
1 parent 760c1e3 commit 15e6f26

File tree

6 files changed

+167
-20
lines changed

6 files changed

+167
-20
lines changed

Diff for: src/codeflare_sdk/cli/codeflare_cli.py

+13-12
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
import click
2-
import sys
32
import os
43

4+
from codeflare_sdk.cli.cli_utils import load_auth
5+
56
cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands"))
67

78

89
class CodeflareContext:
9-
def __init__(self, codeflare_path):
10-
self.codeflare_path = codeflare_path
10+
def __init__(self):
11+
self.codeflare_path = _initialize_codeflare_folder()
12+
13+
14+
def _initialize_codeflare_folder():
15+
codeflare_folder = os.path.expanduser("~/.codeflare")
16+
if not os.path.exists(codeflare_folder):
17+
os.makedirs(codeflare_folder)
18+
return codeflare_folder
1119

1220

1321
class CodeflareCLI(click.MultiCommand):
@@ -31,18 +39,11 @@ def get_command(self, ctx, name):
3139
return
3240

3341

34-
def initialize_cli(ctx):
35-
# Make .codeflare folder
36-
codeflare_folder = os.path.expanduser("~/.codeflare")
37-
if not os.path.exists(codeflare_folder):
38-
os.makedirs(codeflare_folder)
39-
ctx.obj = CodeflareContext(codeflare_folder)
40-
41-
4242
@click.command(cls=CodeflareCLI)
4343
@click.pass_context
4444
def cli(ctx):
45-
initialize_cli(ctx) # Ran on every command
45+
load_auth()
46+
ctx.obj = CodeflareContext() # Ran on every command
4647
pass
4748

4849

Diff for: src/codeflare_sdk/cli/commands/delete.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import click
2+
3+
from codeflare_sdk.cluster.cluster import get_cluster
4+
5+
6+
@click.group()
7+
def cli():
8+
"""
9+
Delete a specified resource from the Kubernetes cluster
10+
"""
11+
pass
12+
13+
14+
@cli.command()
15+
@click.argument("name", type=str)
16+
@click.option("--namespace", type=str, default="default")
17+
def raycluster(name, namespace):
18+
"""
19+
Delete a specified RayCluster from the Kubernetes cluster
20+
"""
21+
cluster = get_cluster(name, namespace)
22+
cluster.down()
23+
click.echo(f"Cluster deleted successfully")

Diff for: src/codeflare_sdk/cli/commands/submit.py

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import click
2+
3+
from codeflare_sdk.cluster.cluster import Cluster
4+
5+
6+
@click.group()
7+
def cli():
8+
"""
9+
Submit a defined resource to the Kubernetes cluster
10+
"""
11+
pass
12+
13+
14+
@cli.command()
15+
@click.argument("name", type=str)
16+
@click.option("--wait", is_flag=True)
17+
def raycluster(name, wait):
18+
"""
19+
Submit a defined RayCluster to the Kubernetes cluster
20+
"""
21+
cluster = Cluster.from_definition_yaml(name + ".yaml")
22+
if not cluster:
23+
click.echo(
24+
"Error submitting RayCluster. Make sure the RayCluster is defined before submitting it"
25+
)
26+
return
27+
if not wait:
28+
cluster.up()
29+
click.echo("Cluster submitted successfully")
30+
return
31+
cluster.up()
32+
cluster.wait_ready()

Diff for: src/codeflare_sdk/cluster/cluster.py

+54-1
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,8 @@ def torchx_config(
307307
def from_k8_cluster_object(rc):
308308
machine_types = (
309309
rc["metadata"]["labels"]["orderedinstance"].split("_")
310-
if "orderedinstance" in rc["metadata"]["labels"]
310+
if "labels" in rc["metadata"]
311+
and "orderedinstance" in rc["metadata"]["labels"]
311312
else []
312313
)
313314
local_interactive = (
@@ -347,6 +348,58 @@ def from_k8_cluster_object(rc):
347348
)
348349
return Cluster(cluster_config)
349350

351+
def from_definition_yaml(yaml_path):
352+
try:
353+
with open(yaml_path) as yaml_file:
354+
rc = yaml.load(yaml_file, Loader=yaml.FullLoader)
355+
machine_types = (
356+
rc["metadata"]["labels"]["orderedinstance"].split("_")
357+
if "labels" in rc["metadata"]
358+
and "orderedinstance" in rc["metadata"]["labels"]
359+
else []
360+
)
361+
worker_group_specs = rc["spec"]["resources"]["GenericItems"][0][
362+
"generictemplate"
363+
]["spec"]["workerGroupSpecs"][0]
364+
local_interactive = (
365+
"volumeMounts"
366+
in worker_group_specs["template"]["spec"]["containers"][0]
367+
)
368+
cluster_config = ClusterConfiguration(
369+
name=rc["metadata"]["name"],
370+
namespace=rc["metadata"]["namespace"],
371+
machine_types=machine_types,
372+
min_worker=worker_group_specs["minReplicas"],
373+
max_worker=worker_group_specs["maxReplicas"],
374+
min_cpus=worker_group_specs["template"]["spec"]["containers"][0][
375+
"resources"
376+
]["requests"]["cpu"],
377+
max_cpus=worker_group_specs["template"]["spec"]["containers"][0][
378+
"resources"
379+
]["limits"]["cpu"],
380+
min_memory=int(
381+
worker_group_specs["template"]["spec"]["containers"][0][
382+
"resources"
383+
]["requests"]["memory"][:-1]
384+
),
385+
max_memory=int(
386+
worker_group_specs["template"]["spec"]["containers"][0][
387+
"resources"
388+
]["limits"]["memory"][:-1]
389+
),
390+
gpu=worker_group_specs["template"]["spec"]["containers"][0][
391+
"resources"
392+
]["requests"]["nvidia.com/gpu"],
393+
instascale=True if machine_types else False,
394+
image=worker_group_specs["template"]["spec"]["containers"][0][
395+
"image"
396+
],
397+
local_interactive=local_interactive,
398+
)
399+
return Cluster(cluster_config)
400+
except IOError:
401+
return None
402+
350403
def local_client_url(self):
351404
if self.config.local_interactive == True:
352405
ingress_domain = _get_ingress_domain()

Diff for: tests/cli-test-case.yaml

+3-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ metadata:
44
labels:
55
orderedinstance: cpu.small_gpu.large
66
name: cli-test-cluster
7-
namespace: ns
7+
namespace: default
88
spec:
99
priority: 9
1010
resources:
@@ -36,7 +36,7 @@ spec:
3636
appwrapper.mcad.ibm.com: cli-test-cluster
3737
controller-tools.k8s.io: '1.0'
3838
name: cli-test-cluster
39-
namespace: ns
39+
namespace: default
4040
spec:
4141
autoscalerOptions:
4242
idleTimeoutSeconds: 60
@@ -184,7 +184,7 @@ spec:
184184
labels:
185185
odh-ray-cluster-service: cli-test-cluster-head-svc
186186
name: ray-dashboard-cli-test-cluster
187-
namespace: ns
187+
namespace: default
188188
spec:
189189
port:
190190
targetPort: dashboard

Diff for: tests/unit_test.py

+42-4
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,13 @@ def test_cli_working():
8787
assert result.exit_code == 0
8888

8989

90-
def test_cluster_definition_cli():
90+
def test_cluster_definition_cli(mocker):
91+
mocker.patch.object(client, "ApiClient")
9192
runner = CliRunner()
9293
define_cluster_command = """
9394
define raycluster
9495
--name=cli-test-cluster
95-
--namespace=ns
96+
--namespace=default
9697
--min_worker=1
9798
--max_worker=2
9899
--min_cpus=3
@@ -105,7 +106,10 @@ def test_cluster_definition_cli():
105106
--image_pull_secrets='["cli-test-pull-secret"]'
106107
"""
107108
result = runner.invoke(cli, define_cluster_command)
108-
assert result.output == "Written to: cli-test-cluster.yaml\n"
109+
assert (
110+
result.output
111+
== "No authentication found, trying default kubeconfig\nWritten to: cli-test-cluster.yaml\n"
112+
)
109113
assert filecmp.cmp(
110114
"cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=True
111115
)
@@ -120,7 +124,10 @@ def test_login_cli(mocker):
120124
--token=testtoken
121125
"""
122126
login_result = runner.invoke(cli, k8s_login_command)
123-
assert login_result.output == "Logged into 'testserver:6443'\n"
127+
assert (
128+
login_result.output
129+
== "No authentication found, trying default kubeconfig\nLogged into 'testserver:6443'\n"
130+
)
124131
try:
125132
auth_file_path = os.path.expanduser("~/.codeflare/auth")
126133
with open(auth_file_path, "rb") as file:
@@ -170,6 +177,37 @@ def test_load_auth():
170177
assert sdk_auth.api_client is not None
171178

172179

180+
def test_cluster_submission_cli(mocker):
181+
mocker.patch.object(client, "ApiClient")
182+
runner = CliRunner()
183+
submit_cluster_command = """
184+
submit raycluster
185+
cli-test-cluster
186+
"""
187+
result = runner.invoke(cli, submit_cluster_command)
188+
189+
assert result.exit_code == 0
190+
assert "Cluster submitted successfully" in result.output
191+
192+
193+
def test_cluster_deletion_cli(mocker):
194+
mocker.patch.object(client, "ApiClient")
195+
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
196+
mocker.patch(
197+
"kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
198+
side_effect=get_ray_obj,
199+
)
200+
runner = CliRunner()
201+
delete_cluster_command = """
202+
delete raycluster
203+
quicktest
204+
"""
205+
result = runner.invoke(cli, delete_cluster_command)
206+
207+
assert result.exit_code == 0
208+
assert "Cluster deleted successfully" in result.output
209+
210+
173211
# For mocking openshift client results
174212
fake_res = openshift.Result("fake")
175213

0 commit comments

Comments
 (0)