From 905074039ac29b7bbf05db85a513dd2eed2192d4 Mon Sep 17 00:00:00 2001 From: Carson Harrell <64709520+carsonmh@users.noreply.github.com> Date: Wed, 26 Jul 2023 08:38:11 -0700 Subject: [PATCH 01/47] CLI Layout and Create RayCluster function (#227) * Create: base and file layout for CLI * Add: Create raycluster command for CLI * Refactor: refactor CLI using pre-commit * Test: unit tests for create raycluster function in the CLI * Update: update egg-info with more paths * Change: change Framework Cluster to RayCluster * merge: rebase with main * Fix: unit tests * Change: create cluster to define cluster in unit tests * Add: error handling for invalid command * test: change tests so cli cluster definition has its own yaml file --- README.md | 1 + pyproject.toml | 8 + requirements.txt | 1 + src/codeflare_sdk.egg-info/SOURCES.txt | 4 + src/codeflare_sdk/cli/__init__.py | 0 src/codeflare_sdk/cli/cli_utils.py | 12 ++ src/codeflare_sdk/cli/codeflare_cli.py | 36 +++++ src/codeflare_sdk/cli/commands/define.py | 36 +++++ tests/cli-test-case.yaml | 195 +++++++++++++++++++++++ tests/unit_test.py | 34 ++++ 10 files changed, 327 insertions(+) create mode 100644 src/codeflare_sdk/cli/__init__.py create mode 100644 src/codeflare_sdk/cli/cli_utils.py create mode 100644 src/codeflare_sdk/cli/codeflare_cli.py create mode 100644 src/codeflare_sdk/cli/commands/define.py create mode 100644 tests/cli-test-case.yaml diff --git a/README.md b/README.md index fedd64c46..bd6fb2868 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ We use pre-commit to make sure the code is consistently formatted. To make sure - To run the unit tests, run `pytest -v tests/unit_test.py` - Any new test functions/scripts can be added into the `tests` folder - NOTE: Functional tests coming soon, will live in `tests/func_test.py` +- To test CLI, run `codeflare` followed by any command. 
To see list of commands, simply run `codeflare` #### Code Coverage diff --git a/pyproject.toml b/pyproject.toml index 6f8393ef2..77662e908 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ codeflare-torchx = "0.6.0.dev0" cryptography = "40.0.2" executing = "1.2.0" pydantic = "< 2" +click = "8.0.4" [tool.poetry.group.docs] optional = true @@ -40,3 +41,10 @@ pdoc3 = "0.10.0" pytest = "7.4.0" coverage = "7.2.7" pytest-mock = "3.11.1" + +[tool.poetry.scripts] +codeflare = "codeflare_sdk.cli.codeflare_cli:cli" + +[build-system] +requires = ["poetry_core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements.txt b/requirements.txt index 2a48812aa..c5d04bdc7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ codeflare-torchx==0.6.0.dev0 pydantic<2 # 2.0+ broke ray[default] see detail: https://github.com/ray-project/ray/pull/37000 cryptography==40.0.2 executing==1.2.0 +click==8.0.4 diff --git a/src/codeflare_sdk.egg-info/SOURCES.txt b/src/codeflare_sdk.egg-info/SOURCES.txt index cfea1dbff..acd40f211 100644 --- a/src/codeflare_sdk.egg-info/SOURCES.txt +++ b/src/codeflare_sdk.egg-info/SOURCES.txt @@ -19,3 +19,7 @@ src/codeflare_sdk/utils/generate_cert.py src/codeflare_sdk/utils/generate_yaml.py src/codeflare_sdk/utils/kube_api_helpers.py src/codeflare_sdk/utils/pretty_print.py +src/codeflare_sdk/cli/__init__.py +src/codeflare_sdk/cli/codeflare_cli.py +src/codeflare_sdk/cli/commands/create.py +src/codeflare_sdk/cli/cli_utils.py diff --git a/src/codeflare_sdk/cli/__init__.py b/src/codeflare_sdk/cli/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/codeflare_sdk/cli/cli_utils.py b/src/codeflare_sdk/cli/cli_utils.py new file mode 100644 index 000000000..7152cc397 --- /dev/null +++ b/src/codeflare_sdk/cli/cli_utils.py @@ -0,0 +1,12 @@ +import ast +import click + + +class PythonLiteralOption(click.Option): + def type_cast_value(self, ctx, value): + try: + if not value: + return None + return 
ast.literal_eval(value) + except: + raise click.BadParameter(value) diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py new file mode 100644 index 000000000..3083a40d0 --- /dev/null +++ b/src/codeflare_sdk/cli/codeflare_cli.py @@ -0,0 +1,36 @@ +import click +import sys +import os + +cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands")) + + +class CodeflareCLI(click.MultiCommand): + def list_commands(self, ctx): + rv = [] + for filename in os.listdir(cmd_folder): + if filename.endswith(".py") and filename != "__init__.py": + rv.append(filename[:-3]) + rv.sort() + return rv + + def get_command(self, ctx, name): + ns = {} + fn = os.path.join(cmd_folder, name + ".py") + try: + with open(fn) as f: + code = compile(f.read(), fn, "exec") + eval(code, ns, ns) + return ns["cli"] + except FileNotFoundError: + return + + +@click.command(cls=CodeflareCLI) +@click.pass_context +def cli(ctx): + pass + + +if __name__ == "__main__": + cli() diff --git a/src/codeflare_sdk/cli/commands/define.py b/src/codeflare_sdk/cli/commands/define.py new file mode 100644 index 000000000..16b6fa480 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/define.py @@ -0,0 +1,36 @@ +import click + +from codeflare_sdk.cluster.cluster import Cluster +from codeflare_sdk.cluster.config import ClusterConfiguration +from codeflare_sdk.cli.cli_utils import PythonLiteralOption + + +@click.group() +def cli(): + """Define a resource with parameter specifications""" + pass + + +@cli.command() +@click.option("--name", type=str, required=True) +@click.option("--namespace", "-n", type=str) +@click.option("--head_info", cls=PythonLiteralOption, type=list) +@click.option("--machine_types", cls=PythonLiteralOption, type=list) +@click.option("--min_cpus", type=int) +@click.option("--max_cpus", type=int) +@click.option("--min_worker", type=int) +@click.option("--max_worker", type=int) +@click.option("--min_memory", type=int) 
+@click.option("--max_memory", type=int) +@click.option("--gpu", type=int) +@click.option("--template", type=str) +@click.option("--instascale", type=bool) +@click.option("--envs", cls=PythonLiteralOption, type=dict) +@click.option("--image", type=str) +@click.option("--local_interactive", type=bool) +@click.option("--image_pull_secrets", cls=PythonLiteralOption, type=list) +def raycluster(**kwargs): + """Define a RayCluster with parameter specifications""" + filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} + clusterConfig = ClusterConfiguration(**filtered_kwargs) + Cluster(clusterConfig) # Creates yaml file diff --git a/tests/cli-test-case.yaml b/tests/cli-test-case.yaml new file mode 100644 index 000000000..0788996a7 --- /dev/null +++ b/tests/cli-test-case.yaml @@ -0,0 +1,195 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + labels: + orderedinstance: cpu.small_gpu.large + name: cli-test-cluster + namespace: ns +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 4 + memory: 6G + nvidia.com/gpu: 7 + replicas: 2 + requests: + cpu: 3 + memory: 5G + nvidia.com/gpu: 7 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: cli-test-cluster + controller-tools.k8s.io: '1.0' + name: cli-test-cluster + namespace: ns + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: 
cli-test-cluster + operator: In + values: + - cli-test-cluster + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + imagePullSecrets: + - name: cli-test-pull-secret + rayVersion: 2.1.0 + workerGroupSpecs: + - groupName: small-group-cli-test-cluster + maxReplicas: 2 + minReplicas: 2 + rayStartParams: + block: 'true' + num-gpus: '7' + replicas: 2 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cli-test-cluster + operator: In + values: + - cli-test-cluster + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 4 + memory: 6G + nvidia.com/gpu: 7 + requests: + cpu: 3 + memory: 5G + nvidia.com/gpu: 7 + imagePullSecrets: + - name: 
cli-test-pull-secret + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: cli-test-cluster-head-svc + name: ray-dashboard-cli-test-cluster + namespace: ns + spec: + port: + targetPort: dashboard + to: + kind: Service + name: cli-test-cluster-head-svc + replica: 1 + Items: [] diff --git a/tests/unit_test.py b/tests/unit_test.py index ac126016f..cccc2f478 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -17,6 +17,7 @@ import filecmp import os import re +from click.testing import CliRunner parent = Path(__file__).resolve().parents[1] sys.path.append(str(parent) + "/src") @@ -63,6 +64,7 @@ generate_tls_cert, export_env, ) +from codeflare_sdk.cli.codeflare_cli import cli import openshift from openshift.selector import Selector @@ -75,6 +77,37 @@ import yaml +# CLI testing +def test_cli_working(): + runner = CliRunner() + result = runner.invoke(cli) + assert result.exit_code == 0 + + +def test_cluster_definition_cli(): + runner = CliRunner() + define_cluster_command = """ + define raycluster + --name=cli-test-cluster + --namespace=ns + --min_worker=1 + --max_worker=2 + --min_cpus=3 + --max_cpus=4 + --min_memory=5 + --max_memory=6 + --gpu=7 + --instascale=True + --machine_types='["cpu.small", "gpu.large"]' + --image_pull_secrets='["cli-test-pull-secret"]' + """ + result = runner.invoke(cli, define_cluster_command) + assert result.output == "Written to: cli-test-cluster.yaml\n" + assert filecmp.cmp( + "cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=True + ) + + # For mocking openshift client results fake_res = openshift.Result("fake") @@ -2222,3 +2255,4 @@ def test_cleanup(): os.remove("tls-cluster-namespace/tls.crt") 
os.remove("tls-cluster-namespace/tls.key") os.rmdir("tls-cluster-namespace") + os.remove("cli-test-cluster.yaml") From 380f4d3dbdcaa73e726b6c85e1a3b4f2c041a3e2 Mon Sep 17 00:00:00 2001 From: Carson Harrell <64709520+carsonmh@users.noreply.github.com> Date: Mon, 31 Jul 2023 08:20:22 -0700 Subject: [PATCH 02/47] CLI Authentication (#252) * Add: create_api_client_config helper function for the SDK * Add: login function for CLI * Change: change options and help for login function * Create: logout function * Test: add unit tests for login and logout functions * add: additional error handling and change layout slightly * test: add unit test for load_auth * change: make tls skip false by default * add: make authentication go into .codeflare * test: add unit tests for checking validity of auth file and split login/logout tests --- src/codeflare_sdk/cli/cli_utils.py | 47 ++++++++++++++++++ src/codeflare_sdk/cli/codeflare_cli.py | 14 ++++++ src/codeflare_sdk/cli/commands/login.py | 46 ++++++++++++++++++ src/codeflare_sdk/cli/commands/logout.py | 19 ++++++++ src/codeflare_sdk/cluster/auth.py | 35 ++++++++----- tests/unit_test.py | 62 ++++++++++++++++++++++++ 6 files changed, 212 insertions(+), 11 deletions(-) create mode 100644 src/codeflare_sdk/cli/commands/login.py create mode 100644 src/codeflare_sdk/cli/commands/logout.py diff --git a/src/codeflare_sdk/cli/cli_utils.py b/src/codeflare_sdk/cli/cli_utils.py index 7152cc397..0c557a8ea 100644 --- a/src/codeflare_sdk/cli/cli_utils.py +++ b/src/codeflare_sdk/cli/cli_utils.py @@ -1,5 +1,12 @@ import ast import click +from kubernetes import client, config +import pickle +import os + +from codeflare_sdk.cluster.auth import _create_api_client_config +from codeflare_sdk.utils.kube_api_helpers import _kube_api_error_handling +import codeflare_sdk.cluster.auth as sdk_auth class PythonLiteralOption(click.Option): @@ -10,3 +17,43 @@ def type_cast_value(self, ctx, value): return ast.literal_eval(value) except: raise 
click.BadParameter(value) + + +class AuthenticationConfig: + """ + Authentication configuration that will be stored in a file once + the user logs in using `codeflare login` + """ + + def __init__( + self, + token: str, + server: str, + skip_tls: bool, + ca_cert_path: str, + ): + self.api_client_config = _create_api_client_config( + token, server, skip_tls, ca_cert_path + ) + self.server = server + self.token = token + + def create_client(self): + return client.ApiClient(self.api_client_config) + + +def load_auth(): + """ + Loads AuthenticationConfiguration and stores it in global variables + which can be used by the SDK for authentication + """ + try: + auth_file_path = os.path.expanduser("~/.codeflare/auth") + with open(auth_file_path, "rb") as file: + auth = pickle.load(file) + sdk_auth.api_client = auth.create_client() + return auth + except (IOError, EOFError): + click.echo("No authentication found, trying default kubeconfig") + except client.ApiException: + click.echo("Invalid authentication, trying default kubeconfig") diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py index 3083a40d0..f8a5cbab7 100644 --- a/src/codeflare_sdk/cli/codeflare_cli.py +++ b/src/codeflare_sdk/cli/codeflare_cli.py @@ -5,6 +5,11 @@ cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands")) +class CodeflareContext: + def __init__(self, codeflare_path): + self.codeflare_path = codeflare_path + + class CodeflareCLI(click.MultiCommand): def list_commands(self, ctx): rv = [] @@ -26,9 +31,18 @@ def get_command(self, ctx, name): return +def initialize_cli(ctx): + # Make .codeflare folder + codeflare_folder = os.path.expanduser("~/.codeflare") + if not os.path.exists(codeflare_folder): + os.makedirs(codeflare_folder) + ctx.obj = CodeflareContext(codeflare_folder) + + @click.command(cls=CodeflareCLI) @click.pass_context def cli(ctx): + initialize_cli(ctx) # Ran on every command pass diff --git 
a/src/codeflare_sdk/cli/commands/login.py b/src/codeflare_sdk/cli/commands/login.py new file mode 100644 index 000000000..288607a89 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/login.py @@ -0,0 +1,46 @@ +import click +import pickle +from kubernetes import client +import os + +from codeflare_sdk.cluster.auth import TokenAuthentication +from codeflare_sdk.cli.cli_utils import AuthenticationConfig +import codeflare_sdk.cluster.auth as sdk_auth + + +@click.command() +@click.pass_context +@click.option("--server", "-s", type=str, required=True, help="Cluster API address") +@click.option("--token", "-t", type=str, required=True, help="Authentication token") +@click.option( + "--insecure-skip-tls-verify", + type=bool, + help="If true, server's certificate won't be checked for validity", + default=False, +) +@click.option( + "--certificate-authority", + type=str, + help="Path to cert file for certificate authority", +) +def cli(ctx, server, token, insecure_skip_tls_verify, certificate_authority): + """ + Login to your Kubernetes cluster and save login for subsequent use + """ + auth = TokenAuthentication( + token, server, insecure_skip_tls_verify, certificate_authority + ) + auth.login() + if not sdk_auth.api_client: # TokenAuthentication failed + return + + auth_config = AuthenticationConfig( + token, + server, + insecure_skip_tls_verify, + certificate_authority, + ) + auth_file_path = ctx.obj.codeflare_path + "/auth" + with open(auth_file_path, "wb") as file: + pickle.dump(auth_config, file) + click.echo(f"Logged into '{server}'") diff --git a/src/codeflare_sdk/cli/commands/logout.py b/src/codeflare_sdk/cli/commands/logout.py new file mode 100644 index 000000000..0001b2331 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/logout.py @@ -0,0 +1,19 @@ +import click +import os +import pickle + + +@click.command() +@click.pass_context +def cli(ctx): + """ + Log out of current Kubernetes cluster + """ + try: + auth_file_path = ctx.obj.codeflare_path + "/auth" + with 
open(auth_file_path, "rb") as file: + auth = pickle.load(file) + os.remove(auth_file_path) + click.echo(f"Successfully logged out of '{auth.server}'") + except: + click.echo("Not logged in") diff --git a/src/codeflare_sdk/cluster/auth.py b/src/codeflare_sdk/cluster/auth.py index 85db3d61d..90c1f726a 100644 --- a/src/codeflare_sdk/cluster/auth.py +++ b/src/codeflare_sdk/cluster/auth.py @@ -97,17 +97,11 @@ def login(self) -> str: global config_path global api_client try: - configuration = client.Configuration() - configuration.api_key_prefix["authorization"] = "Bearer" - configuration.host = self.server - configuration.api_key["authorization"] = self.token - if self.skip_tls == False and self.ca_cert_path == None: - configuration.verify_ssl = True - elif self.skip_tls == False: - configuration.ssl_ca_cert = self.ca_cert_path - else: - configuration.verify_ssl = False - api_client = client.ApiClient(configuration) + api_client = client.ApiClient( + _create_api_client_config( + self.token, self.server, self.skip_tls, self.ca_cert_path + ) + ) client.AuthenticationApi(api_client).get_api_group() config_path = None return "Logged into %s" % self.server @@ -154,6 +148,25 @@ def load_kube_config(self): return response +def _create_api_client_config( + token: str, server: str, skip_tls: bool = False, ca_cert_path: str = None +): + """ + Creates Kubernetes client configuration given necessary parameters + """ + configuration = client.Configuration() + configuration.api_key_prefix["authorization"] = "Bearer" + configuration.host = server + configuration.api_key["authorization"] = token + if skip_tls == False and ca_cert_path == None: + configuration.verify_ssl = True + elif skip_tls == False: + configuration.ssl_ca_cert = ca_cert_path + else: + configuration.verify_ssl = False + return configuration + + def config_check() -> str: """ Function for loading the config file at the default config location ~/.kube/config if the user has not diff --git a/tests/unit_test.py 
b/tests/unit_test.py index cccc2f478..10103fef6 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -18,6 +18,7 @@ import os import re from click.testing import CliRunner +import pickle parent = Path(__file__).resolve().parents[1] sys.path.append(str(parent) + "/src") @@ -65,6 +66,8 @@ export_env, ) from codeflare_sdk.cli.codeflare_cli import cli +from codeflare_sdk.cli.cli_utils import load_auth +import codeflare_sdk.cluster.auth as sdk_auth import openshift from openshift.selector import Selector @@ -108,6 +111,65 @@ def test_cluster_definition_cli(): ) +def test_login_cli(mocker): + runner = CliRunner() + mocker.patch.object(client, "ApiClient") + k8s_login_command = """ + login + --server=testserver:6443 + --token=testtoken + """ + login_result = runner.invoke(cli, k8s_login_command) + assert login_result.output == "Logged into 'testserver:6443'\n" + try: + auth_file_path = os.path.expanduser("~/.codeflare/auth") + with open(auth_file_path, "rb") as file: + auth = pickle.load(file) + except: + assert 0 == 1 + assert auth.server == "testserver:6443" + assert auth.token == "testtoken" + assert auth.api_client_config.api_key["authorization"] == "testtoken" + assert auth.api_client_config.verify_ssl + assert auth.api_client_config.host == "testserver:6443" + + +def test_login_tls_cli(mocker): + runner = CliRunner() + mocker.patch.object(client, "ApiClient") + k8s_tls_login_command = """ + login + --server=testserver:6443 + --token=testtoken + --insecure-skip-tls-verify=False + """ + k8s_skip_tls_login_command = """ + login + --server=testserver:6443 + --token=testtoken + --insecure-skip-tls-verify=True + """ + tls_result = runner.invoke(cli, k8s_tls_login_command) + skip_tls_result = runner.invoke(cli, k8s_skip_tls_login_command) + assert ( + tls_result.output == skip_tls_result.output == "Logged into 'testserver:6443'\n" + ) + + +def test_logout_cli(mocker): + runner = CliRunner() + mocker.patch.object(client, "ApiClient") + k8s_logout_command = "logout" + 
logout_result = runner.invoke(cli, k8s_logout_command) + assert logout_result.output == "Successfully logged out of 'testserver:6443'\n" + assert not os.path.exists(os.path.expanduser("~/.codeflare/auth")) + + +def test_load_auth(): + load_auth() + assert sdk_auth.api_client is not None + + # For mocking openshift client results fake_res = openshift.Result("fake") From 5107aed811c56ed3918dd4742da74a0cb9ad311b Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 15:02:20 -0700 Subject: [PATCH 03/47] change: use updated auth on get_cluster --- src/codeflare_sdk/cluster/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index d698331e6..61005ee5b 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -411,8 +411,8 @@ def get_current_namespace(): # pragma: no cover def get_cluster(cluster_name: str, namespace: str = "default"): try: - config.load_kube_config() - api_instance = client.CustomObjectsApi() + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) rcs = api_instance.list_namespaced_custom_object( group="ray.io", version="v1alpha1", From 2c7f0c7757a52392f7d9570c11016dc254d84b49 Mon Sep 17 00:00:00 2001 From: Carson Harrell <64709520+carsonmh@users.noreply.github.com> Date: Wed, 2 Aug 2023 06:19:32 -0700 Subject: [PATCH 04/47] Cli submit delete raycluster (#257) * add: create cluster from yaml function * add: submit and delete functions * change: cluster_name to name in submit raycluster * add: load_auth in delete function * update: make get_cluster function use new config * test: unit tests for submit and delete raycluster commands * change: format slightly on submit/delete commands * Add: context for current namespace and .codeflare path * fix: remove load_auth in functions so it doesn't run twice * Add: help messages for submit and delete functions * cleanup * remove: remove 
get_namespace every function call * fix: fix tests * change: make namespace default to 'default' and change test slightly * refactor: remove unused imports --- src/codeflare_sdk/cli/codeflare_cli.py | 25 +++++------ src/codeflare_sdk/cli/commands/delete.py | 23 ++++++++++ src/codeflare_sdk/cli/commands/submit.py | 32 ++++++++++++++ src/codeflare_sdk/cluster/cluster.py | 55 +++++++++++++++++++++++- tests/cli-test-case.yaml | 6 +-- tests/unit_test.py | 46 ++++++++++++++++++-- 6 files changed, 167 insertions(+), 20 deletions(-) create mode 100644 src/codeflare_sdk/cli/commands/delete.py create mode 100644 src/codeflare_sdk/cli/commands/submit.py diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py index f8a5cbab7..78354695f 100644 --- a/src/codeflare_sdk/cli/codeflare_cli.py +++ b/src/codeflare_sdk/cli/codeflare_cli.py @@ -1,13 +1,21 @@ import click -import sys import os +from codeflare_sdk.cli.cli_utils import load_auth + cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands")) class CodeflareContext: - def __init__(self, codeflare_path): - self.codeflare_path = codeflare_path + def __init__(self): + self.codeflare_path = _initialize_codeflare_folder() + + +def _initialize_codeflare_folder(): + codeflare_folder = os.path.expanduser("~/.codeflare") + if not os.path.exists(codeflare_folder): + os.makedirs(codeflare_folder) + return codeflare_folder class CodeflareCLI(click.MultiCommand): @@ -31,18 +39,11 @@ def get_command(self, ctx, name): return -def initialize_cli(ctx): - # Make .codeflare folder - codeflare_folder = os.path.expanduser("~/.codeflare") - if not os.path.exists(codeflare_folder): - os.makedirs(codeflare_folder) - ctx.obj = CodeflareContext(codeflare_folder) - - @click.command(cls=CodeflareCLI) @click.pass_context def cli(ctx): - initialize_cli(ctx) # Ran on every command + load_auth() + ctx.obj = CodeflareContext() # Ran on every command pass diff --git 
a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py new file mode 100644 index 000000000..c1ec12451 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -0,0 +1,23 @@ +import click + +from codeflare_sdk.cluster.cluster import get_cluster + + +@click.group() +def cli(): + """ + Delete a specified resource from the Kubernetes cluster + """ + pass + + +@cli.command() +@click.argument("name", type=str) +@click.option("--namespace", type=str, default="default") +def raycluster(name, namespace): + """ + Delete a specified RayCluster from the Kubernetes cluster + """ + cluster = get_cluster(name, namespace) + cluster.down() + click.echo(f"Cluster deleted successfully") diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py new file mode 100644 index 000000000..8a476d602 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -0,0 +1,32 @@ +import click + +from codeflare_sdk.cluster.cluster import Cluster + + +@click.group() +def cli(): + """ + Submit a defined resource to the Kubernetes cluster + """ + pass + + +@cli.command() +@click.argument("name", type=str) +@click.option("--wait", is_flag=True) +def raycluster(name, wait): + """ + Submit a defined RayCluster to the Kubernetes cluster + """ + cluster = Cluster.from_definition_yaml(name + ".yaml") + if not cluster: + click.echo( + "Error submitting RayCluster. 
Make sure the RayCluster is defined before submitting it" + ) + return + if not wait: + cluster.up() + click.echo("Cluster submitted successfully") + return + cluster.up() + cluster.wait_ready() diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 61005ee5b..8d2b93648 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -313,7 +313,8 @@ def torchx_config( def from_k8_cluster_object(rc): machine_types = ( rc["metadata"]["labels"]["orderedinstance"].split("_") - if "orderedinstance" in rc["metadata"]["labels"] + if "labels" in rc["metadata"] + and "orderedinstance" in rc["metadata"]["labels"] else [] ) local_interactive = ( @@ -352,6 +353,58 @@ def from_k8_cluster_object(rc): ) return Cluster(cluster_config) + def from_definition_yaml(yaml_path): + try: + with open(yaml_path) as yaml_file: + rc = yaml.load(yaml_file, Loader=yaml.FullLoader) + machine_types = ( + rc["metadata"]["labels"]["orderedinstance"].split("_") + if "labels" in rc["metadata"] + and "orderedinstance" in rc["metadata"]["labels"] + else [] + ) + worker_group_specs = rc["spec"]["resources"]["GenericItems"][0][ + "generictemplate" + ]["spec"]["workerGroupSpecs"][0] + local_interactive = ( + "volumeMounts" + in worker_group_specs["template"]["spec"]["containers"][0] + ) + cluster_config = ClusterConfiguration( + name=rc["metadata"]["name"], + namespace=rc["metadata"]["namespace"], + machine_types=machine_types, + min_worker=worker_group_specs["minReplicas"], + max_worker=worker_group_specs["maxReplicas"], + min_cpus=worker_group_specs["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["cpu"], + max_cpus=worker_group_specs["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["cpu"], + min_memory=int( + worker_group_specs["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["memory"][:-1] + ), + max_memory=int( + worker_group_specs["template"]["spec"]["containers"][0][ + 
"resources" + ]["limits"]["memory"][:-1] + ), + gpu=worker_group_specs["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["nvidia.com/gpu"], + instascale=True if machine_types else False, + image=worker_group_specs["template"]["spec"]["containers"][0][ + "image" + ], + local_interactive=local_interactive, + ) + return Cluster(cluster_config) + except IOError: + return None + def local_client_url(self): if self.config.local_interactive == True: ingress_domain = _get_ingress_domain() diff --git a/tests/cli-test-case.yaml b/tests/cli-test-case.yaml index 0788996a7..c312abfaa 100644 --- a/tests/cli-test-case.yaml +++ b/tests/cli-test-case.yaml @@ -4,7 +4,7 @@ metadata: labels: orderedinstance: cpu.small_gpu.large name: cli-test-cluster - namespace: ns + namespace: default spec: priority: 9 resources: @@ -36,7 +36,7 @@ spec: appwrapper.mcad.ibm.com: cli-test-cluster controller-tools.k8s.io: '1.0' name: cli-test-cluster - namespace: ns + namespace: default spec: autoscalerOptions: idleTimeoutSeconds: 60 @@ -184,7 +184,7 @@ spec: labels: odh-ray-cluster-service: cli-test-cluster-head-svc name: ray-dashboard-cli-test-cluster - namespace: ns + namespace: default spec: port: targetPort: dashboard diff --git a/tests/unit_test.py b/tests/unit_test.py index 10103fef6..7b1a661f1 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -87,12 +87,13 @@ def test_cli_working(): assert result.exit_code == 0 -def test_cluster_definition_cli(): +def test_cluster_definition_cli(mocker): + mocker.patch.object(client, "ApiClient") runner = CliRunner() define_cluster_command = """ define raycluster --name=cli-test-cluster - --namespace=ns + --namespace=default --min_worker=1 --max_worker=2 --min_cpus=3 @@ -105,7 +106,10 @@ def test_cluster_definition_cli(): --image_pull_secrets='["cli-test-pull-secret"]' """ result = runner.invoke(cli, define_cluster_command) - assert result.output == "Written to: cli-test-cluster.yaml\n" + assert ( + result.output + == "No 
authentication found, trying default kubeconfig\nWritten to: cli-test-cluster.yaml\n" + ) assert filecmp.cmp( "cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=True ) @@ -120,7 +124,10 @@ def test_login_cli(mocker): --token=testtoken """ login_result = runner.invoke(cli, k8s_login_command) - assert login_result.output == "Logged into 'testserver:6443'\n" + assert ( + login_result.output + == "No authentication found, trying default kubeconfig\nLogged into 'testserver:6443'\n" + ) try: auth_file_path = os.path.expanduser("~/.codeflare/auth") with open(auth_file_path, "rb") as file: @@ -170,6 +177,37 @@ def test_load_auth(): assert sdk_auth.api_client is not None +def test_cluster_submission_cli(mocker): + mocker.patch.object(client, "ApiClient") + runner = CliRunner() + submit_cluster_command = """ + submit raycluster + cli-test-cluster + """ + result = runner.invoke(cli, submit_cluster_command) + + assert result.exit_code == 0 + assert "Cluster submitted successfully" in result.output + + +def test_cluster_deletion_cli(mocker): + mocker.patch.object(client, "ApiClient") + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + runner = CliRunner() + delete_cluster_command = """ + delete raycluster + quicktest + """ + result = runner.invoke(cli, delete_cluster_command) + + assert result.exit_code == 0 + assert "Cluster deleted successfully" in result.output + + # For mocking openshift client results fake_res = openshift.Result("fake") From 3bc9120b65527eb86f719440623fec84172deeb6 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 2 Aug 2023 21:59:20 -0700 Subject: [PATCH 05/47] add: design doc --- .../cli/CodeflareCLI_Design_Doc.md | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 src/codeflare_sdk/cli/CodeflareCLI_Design_Doc.md diff --git 
a/src/codeflare_sdk/cli/CodeflareCLI_Design_Doc.md b/src/codeflare_sdk/cli/CodeflareCLI_Design_Doc.md new file mode 100644 index 000000000..da3c2a510 --- /dev/null +++ b/src/codeflare_sdk/cli/CodeflareCLI_Design_Doc.md @@ -0,0 +1,179 @@ +# CodeFlare CLI Design + + +## Context and Scope + + +The primary purpose of the CLI is to serve as an interaction layer between a user and the CodeFlare stack (MCAD, InstaScale, KubeRay) from within the terminal. This addition is required due to the fact that a large set of our target users come from a high-performance computing background and are most familiar and comfortable submitting jobs to a cluster via a CLI. + + +The CLI will utilize the existing CodeFlare SDK. It will allow for similar operations that the SDK provides (such as Ray Cluster and job management) but in the terminal. The CLI adds some additional functions, allows for saved time, simpler workspaces, and automation of certain processes via bash scripts on top of the existing SDK. + + + + +## Goals + + +- Provide users the ability to request, monitor and stop the Kubernetes resources associated with the CodeFlare stack within the terminal. +- Serve as an interaction layer between the data scientist and CodeFlare stack (MCAD, InstaScale, KubeRay) +- Allow for a user-friendly workflow within the terminal +- Allow for automation and scripting of job/RayCluster management via bash scripts + + +## Non-Goals + + +- Do not want to re-make the functionality that is found in the existing CodeFlare SDK or any of the SDK’s clients for Ray, MCAD, or any other service + + +## Architecture and Design + + +The CodeFlare CLI is an extension to the CodeFlare SDK package that allows a user to create, monitor, and shut down framework clusters (RayClusters for now) and distributed training jobs on an authenticated Kubernetes cluster from the terminal. 
+ + +The user should have the ability to do the following from within the terminal: +- Create, view details, view status, submit, delete Ray Clusters via appwrappers +- Create, view logs, view status, submit, delete jobs +- List out all jobs +- List out all ray clusters +- Login to Kubernetes cluster +- Logout of Kubernetes cluster + + +To support these operations, additional functions to the SDK may include: +- Formatted listing ray clusters +- Formatted listing jobs +- Getting a job given the name + + +For the majority of functionality, the CLI will utilize the SDK’s already built functionality. + + +### CLI Framework: + + +[Click](https://click.palletsprojects.com/en/8.1.x/) is the chosen CLI framework for the following reasons +- Simple syntax/layout: Since the CLI commands are very complex, it is important that the CLI framework doesn’t add any unnecessary complexity +- Supports functional commands instead of objects: This is important because the SDK is designed with various functions, and the CLI being similar improves readability +- Comes with testing and help generation: Testing library and automatic help generation quickens development process +- Large community support/documentation: extensive documentation and large community leads to fewer errors and easier development. + + +### Framework Clusters: + + +When the user invokes the `define raycluster` command, a yaml file with default values is created and put in the user’s current working directory. Users can customize their clusters by adding parameters to the define command and these values will override the defaults when creating the AppWrapper yaml file. + + +Once the appwrapper is defined, the user can create the ray cluster via a create command. When the user invokes the `create raycluster`, they will specify the name of the cluster to submit. The CLI will first check to see whether or not the specified name is already present in the Kubernetes cluster. 
If it isn’t already present, then it will search the current working directory for a yaml file corresponding to cluster name and apply it to the K8S cluster. If the wait flag is specified, then the CLI will display a loading sign with status updates until the cluster is up. + + +We will try to find a good balance between exposing more parameters and simplifying the process by acting on feedback from CLI users. + + +For `delete raycluster`, the user will invoke the command, and the CLI will shut it down and delete it. + + +### Training Jobs + + +When the user invokes the `define job` command, a DDPJobDefinition object will be created and saved into a file. Users can customize their jobs using parameters to the define command. + + +Once the job is defined, the user can submit the job via a `job submit` command. When the user submits a job, the user will specify the job name. The CLI will then check to see if the job is already on the Kubernetes cluster and if not it will submit the job. The job submitted will be a DDPJob and it will be submitted onto a specified ray cluster. + + +When the user wants to delete a job, they just invoke the job delete command, and the CLI will stop the job and delete it. This can happen at any time assuming there is a job running. + + +### Authentication + + +Users will need to be authenticated into a Kubernetes cluster in order to be able to perform all operations. + + +If the user tries to perform any operation without being logged in, the CLI will prompt them to authenticate. A kubeconfig will have to be valid in the user’s environment in order to perform any operation. + + +The user will be able to login using a simple `login` command and will have the choice of logging in via server + token. The user can also choose whether or not they want tls-verification. If there is a kubeconfig, the CLI will update it, else it will create one for the user. 
+ + +Alternatively, the user can invoke the login command with their kubeconfig file path, and this will login the user using their kubeconfig file. + + +Users can logout of their cluster using the `logout` command. + + + + +### Listing Info + + +Users can list both ray cluster information and job information by invoking respective commands. CLI will list information for each raycluster/job such as requested resources, status, name, and namespace. + + +## Alternatives Considered + + +- Existing CodeFlare CLI + - Written in TypeScript and overcomplicated. Did not support +- Just using SDK + - Making a CLI saves a lot of time and is easier for the user in some cases +- Interactive CLI + - Interactive CLIs make it harder for automation via bash scripts +- Other CLI libraries + - **Cliff:** Ugly syntax, less readability, not much functionality. + - **Argparse:** Less functionality out of the box. More time spent on unnecessary reimplementation. + - **Cement:** Ugly syntax and low community support. + + +## Security Considerations + + +We will rely on Kubernetes default security, where users can not perform any operations on a cluster if they are not authenticated correctly. + + +## Testing and Validation +The CLI is found within the SDK, so it will be [tested](https://github.com/project-codeflare/codeflare-sdk/blob/main/CodeFlareSDK_Design_Doc.md#testing-and-validation) the same way. + + +## Deployment and Rollout +- The CLI will be deployed within the CodeFlare SDK so similar [considerations](https://github.com/project-codeflare/codeflare-sdk/blob/main/CodeFlareSDK_Design_Doc.md#deployment-and-rollout) will be taken into account. + + +## Command Usage Examples +Create ray cluster +- `codeflare create raycluster [options]` + + +Doing something to a ray cluster: +- `codeflare {operation} raycluster {cluster_name} [options e.g. 
--gpu=0]` + + +Create job +- `codeflare create job [options]` + + +Doing something to a job: +- `codeflare {operation} job {job_name} [options e.g. cluster-name="mycluster"]` +- Namespace and ray cluster name will be required as options + + +Listing out clusters +- `codeflare list raycluster -n {namespace} OR codeflare list raycluster --all` + + +Listing out jobs +- `codeflare list job -c {cluster_name} -n {namespace}` +- `codeflare list job -n {namespace}` +- `codeflare list job --all` + + +Login to kubernetes cluster +- `codeflare login [options e.g. --configpath={path/to/kubeconfig}]` (if configpath is left blank default value is used) + + +Logout of kubernetes cluster +- `codeflare logout` From a6753d397458d3c6447cbb4d459fc9d4a70cc008 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 12:29:59 -0700 Subject: [PATCH 06/47] add: cli status function --- src/codeflare_sdk/cli/commands/status.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 src/codeflare_sdk/cli/commands/status.py diff --git a/src/codeflare_sdk/cli/commands/status.py b/src/codeflare_sdk/cli/commands/status.py new file mode 100644 index 000000000..5d61fa933 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/status.py @@ -0,0 +1,20 @@ +import click + +from codeflare_sdk.cluster.cluster import get_cluster + + +@click.group() +def cli(): + """Get the status of a specified resource""" + pass + + +@cli.command() +@click.argument("name", type=str) +@click.option("--namespace", type=str) +@click.pass_context +def raycluster(ctx, name, namespace): + """Get the status of a specified RayCluster""" + namespace = namespace or "default" + cluster = get_cluster(name, namespace) + cluster.status() From 47fda05709c8916b6ad13c860abcb2aa7ce255c2 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 12:34:13 -0700 Subject: [PATCH 07/47] add: details cli function --- jobtest.yaml | 173 ++++++++++++++++++++++ src/codeflare_sdk/cli/commands/details.py | 20 +++ 2 files 
changed, 193 insertions(+) create mode 100644 jobtest.yaml create mode 100644 src/codeflare_sdk/cli/commands/details.py diff --git a/jobtest.yaml b/jobtest.yaml new file mode 100644 index 000000000..92cd39b4c --- /dev/null +++ b/jobtest.yaml @@ -0,0 +1,173 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: jobtest + namespace: default +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 1 + memory: 1G + nvidia.com/gpu: 0 + replicas: 2 + requests: + cpu: 1 + memory: 1G + nvidia.com/gpu: 0 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: jobtest + controller-tools.k8s.io: '1.0' + name: jobtest + namespace: default + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 
0 + imagePullSecrets: [] + rayVersion: 2.1.0 + workerGroupSpecs: + - groupName: small-group-jobtest + maxReplicas: 2 + minReplicas: 2 + rayStartParams: + block: 'true' + num-gpus: '0' + replicas: 2 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 1 + memory: 1G + nvidia.com/gpu: 0 + requests: + cpu: 1 + memory: 1G + nvidia.com/gpu: 0 + imagePullSecrets: [] + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: jobtest-head-svc + name: ray-dashboard-jobtest + namespace: default + spec: + port: + targetPort: dashboard + to: + kind: Service + name: jobtest-head-svc + replica: 1 + Items: [] diff --git a/src/codeflare_sdk/cli/commands/details.py b/src/codeflare_sdk/cli/commands/details.py new file mode 100644 index 000000000..b12edd8b6 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/details.py @@ -0,0 +1,20 @@ +import click + +from codeflare_sdk.cluster.cluster import get_cluster + + +@click.group() +def cli(): + """Get the details of a specified resource""" + pass + + +@cli.command() +@click.argument("name", type=str) +@click.option("--namespace", type=str) +@click.pass_context +def 
raycluster(ctx, name, namespace): + """Get the details of a specified RayCluster""" + namespace = namespace or "default" + cluster = get_cluster(name, namespace) + cluster.details() From ae451c1f6a3bf7e815e89740fd3376f9e529f03e Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 27 Jul 2023 13:24:05 -0700 Subject: [PATCH 08/47] create: function to list rayclusters in all namespaces --- src/codeflare_sdk/cluster/cluster.py | 31 ++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 8d2b93648..c04f9c49b 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -417,7 +417,17 @@ def list_all_clusters(namespace: str, print_to_console: bool = True): """ Returns (and prints by default) a list of all clusters in a given namespace. """ - clusters = _get_ray_clusters(namespace) + clusters = _get_ray_clusters_in_namespace(namespace) + if print_to_console: + pretty_print.print_clusters(clusters) + return clusters + + +def list_clusters_all_namespaces(print_to_console: bool = True): + """ + Returns (and prints by default) a list of all clusters in the Kubernetes cluster. 
+ """ + clusters = _get_all_ray_clusters() if print_to_console: pretty_print.print_clusters(clusters) return clusters @@ -534,7 +544,7 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: return None -def _get_ray_clusters(namespace="default") -> List[RayCluster]: +def _get_ray_clusters_in_namespace(namespace="default") -> List[RayCluster]: list_of_clusters = [] try: config_check() @@ -553,6 +563,23 @@ def _get_ray_clusters(namespace="default") -> List[RayCluster]: return list_of_clusters +def _get_all_ray_clusters() -> List[RayCluster]: + list_of_clusters = [] + try: + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + rcs = api_instance.list_cluster_custom_object( + group="ray.io", + version="v1alpha1", + plural="rayclusters", + ) + except Exception as e: + return _kube_api_error_handling(e) + for rc in rcs["items"]: + list_of_clusters.append(_map_to_ray_cluster(rc)) + return list_of_clusters + + def _get_app_wrappers( namespace="default", filter=List[AppWrapperStatus] ) -> List[AppWrapper]: From 9372d4cc5548874caf6b44285486300ee7440140 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 12:36:18 -0700 Subject: [PATCH 09/47] add: list raycluster function cli --- src/codeflare_sdk/cli/commands/list.py | 31 ++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 src/codeflare_sdk/cli/commands/list.py diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py new file mode 100644 index 000000000..4479ae327 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/list.py @@ -0,0 +1,31 @@ +import click +from kubernetes import client, config + +from codeflare_sdk.cluster.cluster import ( + list_clusters_all_namespaces, + list_all_clusters, + get_current_namespace, +) +from codeflare_sdk.cli.cli_utils import load_auth + + +@click.group() +def cli(): + """List a specified resource""" + pass + + +@cli.command() +@click.option("--namespace") 
+@click.option("--all", is_flag=True) +@click.pass_context +def rayclusters(ctx, namespace, all): + """List all rayclusters in a specified namespace""" + if all and namespace: + click.echo("--all and --namespace are mutually exclusive") + return + namespace = namespace or "default" + if not all: + list_all_clusters(namespace) + return + list_clusters_all_namespaces() From c213393fab7c3bcbe6b2e266526b894383b89190 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 13:46:21 -0700 Subject: [PATCH 10/47] test: add unit test for list_clusters_all_namespaces --- tests/unit_test.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/unit_test.py b/tests/unit_test.py index 7b1a661f1..031f019e8 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -34,6 +34,7 @@ get_cluster, _app_wrapper_status, _ray_cluster_status, + list_clusters_all_namespaces, ) from codeflare_sdk.cluster.auth import ( TokenAuthentication, @@ -206,6 +207,33 @@ def test_cluster_deletion_cli(mocker): assert result.exit_code == 0 assert "Cluster deleted successfully" in result.output +def test_list_clusters_all_namespaces(mocker, capsys): + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_cluster_custom_object", + side_effect=get_ray_obj_no_namespace, + ) + list_clusters_all_namespaces() + captured = capsys.readouterr() + assert captured.out == ( + " 🚀 CodeFlare Cluster Details 🚀 \n" + " \n" + " ╭──────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ quicktest Active ✅ │ \n" + " │ │ \n" + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " │ Cluster Resources │ \n" + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" + " │ │ │ │ │ │ \n" + " │ │ 1 1 │ │ 2G~2G 1 0 │ │ \n" + " │ │ │ │ │ │ \n" + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" + " 
╰──────────────────────────────────────────────────────────────╯ \n" + ) # For mocking openshift client results @@ -989,6 +1017,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): return api_obj +def get_ray_obj_no_namespace(group, version, plural, cls=None): + return get_ray_obj(group, version, "ns", plural, cls) + + def get_aw_obj(group, version, namespace, plural): api_obj1 = { "items": [ From e41dfabd2ae08dbb5d9ebab98c27e8faacee38e9 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 14:17:12 -0700 Subject: [PATCH 11/47] test: add unit tests for status, details, and list CLI commands --- tests/unit_test.py | 129 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/tests/unit_test.py b/tests/unit_test.py index 031f019e8..a9386ced3 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -236,6 +236,135 @@ def test_list_clusters_all_namespaces(mocker, capsys): ) +def test_raycluster_details_cli(mocker): + runner = CliRunner() + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.get_current_namespace", + return_value="ns", + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.status", + return_value=(False, CodeFlareClusterStatus.UNKNOWN), + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) + mocker.patch.object(client, "ApiClient") + raycluster_details_command = """ + details raycluster quicktest + """ + result = runner.invoke(cli, raycluster_details_command) + quicktest_details = ( + " ╭──────────────────────────────────────────────────────────────╮ \n" + + " │ Name │ \n" + + " │ quicktest Inactive ❌ │ \n" + + " │ │ \n" + + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" + + " │ │ \n" + + " │ Dashboard🔗 │ \n" + + " │ │ \n" + + " │ Cluster Resources │ \n" + + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) 
─────────╮ │ \n" + + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" + + " │ │ │ │ │ │ \n" + + " │ │ 1 1 │ │ 2~2 1 0 │ │ \n" + + " │ │ │ │ │ │ \n" + + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" + + " ╰──────────────────────────────────────────────────────────────╯ " + ) + assert quicktest_details in result.output + + +def test_raycluster_status_cli(mocker): + runner = CliRunner() + test_raycluster = RayCluster( + "quicktest", + RayClusterStatus.READY, + 1, + 1, + "1", + "1", + 1, + 1, + "default", + "dashboard-url", + ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.get_current_namespace", + return_value="ns", + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) + mocker.patch( + "codeflare_sdk.cluster.cluster._app_wrapper_status", + return_value=test_raycluster, + ) + mocker.patch( + "codeflare_sdk.cluster.cluster._ray_cluster_status", + return_value=test_raycluster, + ) + mocker.patch.object(client, "ApiClient") + raycluster_status_command = """ + status raycluster quicktest + """ + result = runner.invoke(cli, raycluster_status_command) + assert "Active" in result.output + + +def test_raycluster_list_cli(mocker): + runner = CliRunner() + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.get_current_namespace", + return_value="ns", + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.status", + return_value=(False, CodeFlareClusterStatus.UNKNOWN), + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) + mocker.patch.object(client, "ApiClient") + list_rayclusters_command = """ + list rayclusters --namespace=ns + """ + result = runner.invoke(cli, list_rayclusters_command) + assert ( + " 
╭──────────────────────────────────────────────────────────────╮ \n" + + " │ Name │ \n" + + " │ quicktest Active ✅ │ \n" + + " │ │ \n" + + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" + + " │ │ \n" + + " │ Dashboard🔗 │ \n" + + " │ │ \n" + + " │ Cluster Resources │ \n" + + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" + + " │ │ │ │ │ │ \n" + + " │ │ 1 1 │ │ 2G~2G 1 0 │ │ \n" + + " │ │ │ │ │ │ \n" + + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" + + " ╰──────────────────────────────────────────────────────────────╯ " + ) in result.output + + # For mocking openshift client results fake_res = openshift.Result("fake") From 819cf57af1943ff247aceaabbcf2aa0a4ed0ca44 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 14:21:50 -0700 Subject: [PATCH 12/47] cleanup --- jobtest.yaml | 173 --------------------------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 jobtest.yaml diff --git a/jobtest.yaml b/jobtest.yaml deleted file mode 100644 index 92cd39b4c..000000000 --- a/jobtest.yaml +++ /dev/null @@ -1,173 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: jobtest - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 1G - nvidia.com/gpu: 0 - replicas: 2 - requests: - cpu: 1 - memory: 1G - nvidia.com/gpu: 0 - generictemplate: - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: jobtest - controller-tools.k8s.io: '1.0' - name: jobtest - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - 
headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - imagePullSecrets: [] - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-jobtest - maxReplicas: 2 - minReplicas: 2 - rayStartParams: - block: 'true' - num-gpus: '0' - replicas: 2 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 1G - nvidia.com/gpu: 0 - requests: - cpu: 1 - memory: 1G - nvidia.com/gpu: 0 - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; - do echo waiting for 
myservice; sleep 2; done - image: busybox:1.28 - name: init-myservice - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: jobtest-head-svc - name: ray-dashboard-jobtest - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: jobtest-head-svc - replica: 1 - Items: [] From f548525cc00e364dbf78ba51dc6c6102e58f77a3 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 15:03:43 -0700 Subject: [PATCH 13/47] fix: unit tests --- tests/unit_test.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index a9386ced3..d0329766b 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -242,10 +242,6 @@ def test_raycluster_details_cli(mocker): "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_ray_obj, ) - mocker.patch( - "codeflare_sdk.cluster.cluster.get_current_namespace", - return_value="ns", - ) mocker.patch( "codeflare_sdk.cluster.cluster.Cluster.status", return_value=(False, CodeFlareClusterStatus.UNKNOWN), @@ -282,18 +278,6 @@ def test_raycluster_details_cli(mocker): def test_raycluster_status_cli(mocker): runner = CliRunner() - test_raycluster = RayCluster( - "quicktest", - RayClusterStatus.READY, - 1, - 1, - "1", - "1", - 1, - 1, - "default", - "dashboard-url", - ) mocker.patch( "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_ray_obj, @@ -306,6 +290,19 @@ def test_raycluster_status_cli(mocker): "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", return_value="", ) + mocker.patch.object(client, "ApiClient") + test_raycluster = RayCluster( + "quicktest", + RayClusterStatus.READY, + 1, + 1, + "1", + "1", + 1, + 1, + "default", + "dashboard-url", + ) mocker.patch( "codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=test_raycluster, @@ -314,7 +311,6 @@ def 
test_raycluster_status_cli(mocker): "codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=test_raycluster, ) - mocker.patch.object(client, "ApiClient") raycluster_status_command = """ status raycluster quicktest """ From 5937034dd8a0fcddc05e0f5d66243c6c9414300d Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 2 Aug 2023 13:40:58 -0700 Subject: [PATCH 14/47] change: make namespace required for functions --- src/codeflare_sdk/cli/commands/define.py | 2 +- src/codeflare_sdk/cli/commands/delete.py | 2 +- src/codeflare_sdk/cli/commands/details.py | 2 +- src/codeflare_sdk/cli/commands/list.py | 6 ++++-- src/codeflare_sdk/cli/commands/status.py | 2 +- tests/unit_test.py | 8 +++++--- 6 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/define.py b/src/codeflare_sdk/cli/commands/define.py index 16b6fa480..09cfd1f0e 100644 --- a/src/codeflare_sdk/cli/commands/define.py +++ b/src/codeflare_sdk/cli/commands/define.py @@ -13,7 +13,7 @@ def cli(): @cli.command() @click.option("--name", type=str, required=True) -@click.option("--namespace", "-n", type=str) +@click.option("--namespace", "-n", type=str, required=True) @click.option("--head_info", cls=PythonLiteralOption, type=list) @click.option("--machine_types", cls=PythonLiteralOption, type=list) @click.option("--min_cpus", type=int) diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index c1ec12451..f828457f9 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -13,7 +13,7 @@ def cli(): @cli.command() @click.argument("name", type=str) -@click.option("--namespace", type=str, default="default") +@click.option("--namespace", type=str, required=True) def raycluster(name, namespace): """ Delete a specified RayCluster from the Kubernetes cluster diff --git a/src/codeflare_sdk/cli/commands/details.py b/src/codeflare_sdk/cli/commands/details.py index b12edd8b6..3f749f7d6 100644 --- 
a/src/codeflare_sdk/cli/commands/details.py +++ b/src/codeflare_sdk/cli/commands/details.py @@ -11,7 +11,7 @@ def cli(): @cli.command() @click.argument("name", type=str) -@click.option("--namespace", type=str) +@click.option("--namespace", type=str, required=True) @click.pass_context def raycluster(ctx, name, namespace): """Get the details of a specified RayCluster""" diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index 4479ae327..dd3ad4e22 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -16,7 +16,7 @@ def cli(): @cli.command() -@click.option("--namespace") +@click.option("--namespace", type=str) @click.option("--all", is_flag=True) @click.pass_context def rayclusters(ctx, namespace, all): @@ -24,7 +24,9 @@ def rayclusters(ctx, namespace, all): if all and namespace: click.echo("--all and --namespace are mutually exclusive") return - namespace = namespace or "default" + if not all and not namespace: + click.echo("You must specify either --namespace or --all") + return if not all: list_all_clusters(namespace) return diff --git a/src/codeflare_sdk/cli/commands/status.py b/src/codeflare_sdk/cli/commands/status.py index 5d61fa933..be836348d 100644 --- a/src/codeflare_sdk/cli/commands/status.py +++ b/src/codeflare_sdk/cli/commands/status.py @@ -11,7 +11,7 @@ def cli(): @cli.command() @click.argument("name", type=str) -@click.option("--namespace", type=str) +@click.option("--namespace", type=str, required=True) @click.pass_context def raycluster(ctx, name, namespace): """Get the status of a specified RayCluster""" diff --git a/tests/unit_test.py b/tests/unit_test.py index d0329766b..7a4cb4a7d 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -201,12 +201,14 @@ def test_cluster_deletion_cli(mocker): runner = CliRunner() delete_cluster_command = """ delete raycluster - quicktest + quicktest --namespace=default """ result = runner.invoke(cli, delete_cluster_command) 
assert result.exit_code == 0 assert "Cluster deleted successfully" in result.output + + def test_list_clusters_all_namespaces(mocker, capsys): mocker.patch( "kubernetes.client.CustomObjectsApi.list_cluster_custom_object", @@ -252,7 +254,7 @@ def test_raycluster_details_cli(mocker): ) mocker.patch.object(client, "ApiClient") raycluster_details_command = """ - details raycluster quicktest + details raycluster quicktest --namespace=default """ result = runner.invoke(cli, raycluster_details_command) quicktest_details = ( @@ -312,7 +314,7 @@ def test_raycluster_status_cli(mocker): return_value=test_raycluster, ) raycluster_status_command = """ - status raycluster quicktest + status raycluster quicktest --namespace=default """ result = runner.invoke(cli, raycluster_status_command) assert "Active" in result.output From 7ccb625ba57db4dabec1a4d66f7cf2e3e6f91a9d Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 2 Aug 2023 13:44:23 -0700 Subject: [PATCH 15/47] add: error handling for cluster not found --- src/codeflare_sdk/cli/commands/delete.py | 6 +++++- src/codeflare_sdk/cli/commands/details.py | 7 +++++-- src/codeflare_sdk/cli/commands/status.py | 7 +++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index f828457f9..7ce9744bd 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -18,6 +18,10 @@ def raycluster(name, namespace): """ Delete a specified RayCluster from the Kubernetes cluster """ - cluster = get_cluster(name, namespace) + try: + cluster = get_cluster(name, namespace) + except FileNotFoundError: + click.echo(f"Cluster {name} not found in {namespace} namespace") + return cluster.down() click.echo(f"Cluster deleted successfully") diff --git a/src/codeflare_sdk/cli/commands/details.py b/src/codeflare_sdk/cli/commands/details.py index 3f749f7d6..b865caa47 100644 --- a/src/codeflare_sdk/cli/commands/details.py +++ 
b/src/codeflare_sdk/cli/commands/details.py @@ -15,6 +15,9 @@ def cli(): @click.pass_context def raycluster(ctx, name, namespace): """Get the details of a specified RayCluster""" - namespace = namespace or "default" - cluster = get_cluster(name, namespace) + try: + cluster = get_cluster(name, namespace) + except FileNotFoundError: + click.echo(f"Cluster {name} not found in {namespace} namespace") + return cluster.details() diff --git a/src/codeflare_sdk/cli/commands/status.py b/src/codeflare_sdk/cli/commands/status.py index be836348d..fc76ffc1d 100644 --- a/src/codeflare_sdk/cli/commands/status.py +++ b/src/codeflare_sdk/cli/commands/status.py @@ -15,6 +15,9 @@ def cli(): @click.pass_context def raycluster(ctx, name, namespace): """Get the status of a specified RayCluster""" - namespace = namespace or "default" - cluster = get_cluster(name, namespace) + try: + cluster = get_cluster(name, namespace) + except FileNotFoundError: + click.echo(f"Cluster {name} not found in {namespace} namespace") + return cluster.status() From f1907862c7359d534187e8a46bf6829f51d1d2e4 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 11:52:05 -0700 Subject: [PATCH 16/47] add: plural alias to list raycluster --- src/codeflare_sdk/cli/cli_utils.py | 16 ++++++++++++++++ src/codeflare_sdk/cli/commands/list.py | 7 +++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/codeflare_sdk/cli/cli_utils.py b/src/codeflare_sdk/cli/cli_utils.py index 0c557a8ea..c9d2c87a6 100644 --- a/src/codeflare_sdk/cli/cli_utils.py +++ b/src/codeflare_sdk/cli/cli_utils.py @@ -57,3 +57,19 @@ def load_auth(): click.echo("No authentication found, trying default kubeconfig") except client.ApiException: click.echo("Invalid authentication, trying default kubeconfig") + + +class PluralAlias(click.Group): + def get_command(self, ctx, cmd_name): + rv = click.Group.get_command(self, ctx, cmd_name) + if rv is not None: + return rv + for x in self.list_commands(ctx): + if x + "s" == cmd_name: + 
return click.Group.get_command(self, ctx, x) + return None + + def resolve_command(self, ctx, args): + # always return the full command name + _, cmd, args = super().resolve_command(ctx, args) + return cmd.name, cmd, args diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index dd3ad4e22..753982579 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -4,12 +4,11 @@ from codeflare_sdk.cluster.cluster import ( list_clusters_all_namespaces, list_all_clusters, - get_current_namespace, ) -from codeflare_sdk.cli.cli_utils import load_auth +from codeflare_sdk.cli.cli_utils import PluralAlias -@click.group() +@click.group(cls=PluralAlias) def cli(): """List a specified resource""" pass @@ -19,7 +18,7 @@ def cli(): @click.option("--namespace", type=str) @click.option("--all", is_flag=True) @click.pass_context -def rayclusters(ctx, namespace, all): +def raycluster(ctx, namespace, all): """List all rayclusters in a specified namespace""" if all and namespace: click.echo("--all and --namespace are mutually exclusive") From 7ee83fd4993d40225c5d8d8f9536d1ac160f76d8 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 12:38:47 -0700 Subject: [PATCH 17/47] change: use current namespace when not specified --- carson.yaml | 173 ++++++++++++++++++++++ src/codeflare_sdk/cli/codeflare_cli.py | 2 + src/codeflare_sdk/cli/commands/define.py | 7 +- src/codeflare_sdk/cli/commands/delete.py | 6 +- src/codeflare_sdk/cli/commands/details.py | 3 +- src/codeflare_sdk/cli/commands/list.py | 4 +- src/codeflare_sdk/cli/commands/status.py | 3 +- test-job.yaml | 173 ++++++++++++++++++++++ tests/unit_test.py | 9 +- 9 files changed, 369 insertions(+), 11 deletions(-) create mode 100644 carson.yaml create mode 100644 test-job.yaml diff --git a/carson.yaml b/carson.yaml new file mode 100644 index 000000000..79ff972cb --- /dev/null +++ b/carson.yaml @@ -0,0 +1,173 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: 
AppWrapper +metadata: + name: carson + namespace: default +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: carson + controller-tools.k8s.io: '1.0' + name: carson + namespace: default + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + imagePullSecrets: [] + rayVersion: 2.1.0 + workerGroupSpecs: + - groupName: small-group-carson + maxReplicas: 1 + minReplicas: 1 + rayStartParams: + block: 'true' + num-gpus: '0' + replicas: 1 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - env: 
+ - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + imagePullSecrets: [] + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: carson-head-svc + name: ray-dashboard-carson + namespace: default + spec: + port: + targetPort: dashboard + to: + kind: Service + name: carson-head-svc + replica: 1 + Items: [] diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py index 78354695f..2731ac0b7 100644 --- a/src/codeflare_sdk/cli/codeflare_cli.py +++ b/src/codeflare_sdk/cli/codeflare_cli.py @@ -2,6 +2,7 @@ import os from codeflare_sdk.cli.cli_utils import load_auth +from codeflare_sdk.cluster.cluster import get_current_namespace cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands")) @@ -9,6 +10,7 @@ class CodeflareContext: def __init__(self): self.codeflare_path = _initialize_codeflare_folder() + self.current_namespace = get_current_namespace() def _initialize_codeflare_folder(): diff --git a/src/codeflare_sdk/cli/commands/define.py b/src/codeflare_sdk/cli/commands/define.py index 09cfd1f0e..4db177f3b 100644 --- a/src/codeflare_sdk/cli/commands/define.py +++ 
b/src/codeflare_sdk/cli/commands/define.py @@ -12,8 +12,9 @@ def cli(): @cli.command() +@click.pass_context @click.option("--name", type=str, required=True) -@click.option("--namespace", "-n", type=str, required=True) +@click.option("--namespace", "-n", type=str) @click.option("--head_info", cls=PythonLiteralOption, type=list) @click.option("--machine_types", cls=PythonLiteralOption, type=list) @click.option("--min_cpus", type=int) @@ -29,8 +30,10 @@ def cli(): @click.option("--image", type=str) @click.option("--local_interactive", type=bool) @click.option("--image_pull_secrets", cls=PythonLiteralOption, type=list) -def raycluster(**kwargs): +def raycluster(ctx, **kwargs): """Define a RayCluster with parameter specifications""" filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} + if "namespace" not in filtered_kwargs.keys(): + filtered_kwargs["namespace"] = ctx.obj.current_namespace clusterConfig = ClusterConfiguration(**filtered_kwargs) Cluster(clusterConfig) # Creates yaml file diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index 7ce9744bd..c225d428a 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -12,12 +12,14 @@ def cli(): @cli.command() +@click.pass_context @click.argument("name", type=str) -@click.option("--namespace", type=str, required=True) -def raycluster(name, namespace): +@click.option("--namespace", type=str) +def raycluster(ctx, name, namespace): """ Delete a specified RayCluster from the Kubernetes cluster """ + namespace = namespace or ctx.obj.current_namespace try: cluster = get_cluster(name, namespace) except FileNotFoundError: diff --git a/src/codeflare_sdk/cli/commands/details.py b/src/codeflare_sdk/cli/commands/details.py index b865caa47..f6890e7d6 100644 --- a/src/codeflare_sdk/cli/commands/details.py +++ b/src/codeflare_sdk/cli/commands/details.py @@ -11,10 +11,11 @@ def cli(): @cli.command() @click.argument("name", 
type=str) -@click.option("--namespace", type=str, required=True) +@click.option("--namespace", type=str) @click.pass_context def raycluster(ctx, name, namespace): """Get the details of a specified RayCluster""" + namespace = namespace or ctx.obj.current_namespace try: cluster = get_cluster(name, namespace) except FileNotFoundError: diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index 753982579..533aaeda1 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -23,9 +23,7 @@ def raycluster(ctx, namespace, all): if all and namespace: click.echo("--all and --namespace are mutually exclusive") return - if not all and not namespace: - click.echo("You must specify either --namespace or --all") - return + namespace = namespace or ctx.obj.current_namespace if not all: list_all_clusters(namespace) return diff --git a/src/codeflare_sdk/cli/commands/status.py b/src/codeflare_sdk/cli/commands/status.py index fc76ffc1d..dbd92a555 100644 --- a/src/codeflare_sdk/cli/commands/status.py +++ b/src/codeflare_sdk/cli/commands/status.py @@ -11,10 +11,11 @@ def cli(): @cli.command() @click.argument("name", type=str) -@click.option("--namespace", type=str, required=True) +@click.option("--namespace", type=str) @click.pass_context def raycluster(ctx, name, namespace): """Get the status of a specified RayCluster""" + namespace = namespace or ctx.obj.current_namespace try: cluster = get_cluster(name, namespace) except FileNotFoundError: diff --git a/test-job.yaml b/test-job.yaml new file mode 100644 index 000000000..3a0827080 --- /dev/null +++ b/test-job.yaml @@ -0,0 +1,173 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: test-job + namespace: default +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 1 + memory: 2G 
+ nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: test-job + controller-tools.k8s.io: '1.0' + name: test-job + namespace: default + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + imagePullSecrets: [] + rayVersion: 2.1.0 + workerGroupSpecs: + - groupName: small-group-test-job + maxReplicas: 1 + minReplicas: 1 + rayStartParams: + block: 'true' + num-gpus: '0' + replicas: 1 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + 
value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + imagePullSecrets: [] + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: test-job-head-svc + name: ray-dashboard-test-job + namespace: default + spec: + port: + targetPort: dashboard + to: + kind: Service + name: test-job-head-svc + replica: 1 + Items: [] diff --git a/tests/unit_test.py b/tests/unit_test.py index 7a4cb4a7d..32a70b965 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -160,7 +160,8 @@ def test_login_tls_cli(mocker): tls_result = runner.invoke(cli, k8s_tls_login_command) skip_tls_result = runner.invoke(cli, k8s_skip_tls_login_command) assert ( - tls_result.output == skip_tls_result.output == "Logged into 'testserver:6443'\n" + "Logged into 'testserver:6443'\n" in tls_result.output + and "Logged into 'testserver:6443'\n" in skip_tls_result.output ) @@ -169,7 +170,7 @@ def test_logout_cli(mocker): mocker.patch.object(client, "ApiClient") k8s_logout_command = "logout" logout_result = runner.invoke(cli, k8s_logout_command) - assert logout_result.output == "Successfully logged out of 'testserver:6443'\n" + assert "Successfully logged out of 'testserver:6443'\n" in logout_result.output assert not os.path.exists(os.path.expanduser("~/.codeflare/auth")) @@ -198,6 +199,10 @@ def test_cluster_deletion_cli(mocker): "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_ray_obj, ) + 
mocker.patch( + "codeflare_sdk.cluster.cluster.get_current_namespace", + return_value="ns", + ) runner = CliRunner() delete_cluster_command = """ delete raycluster From ec3059e6d2951944c1860e86f19aee2f66c6a39f Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 12:44:37 -0700 Subject: [PATCH 18/47] refactor: make _get_all_rayclusters which handles namespaced and all raycluster listing --- src/codeflare_sdk/cluster/cluster.py | 40 +++++++++++----------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index c04f9c49b..7bb4b3579 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -417,7 +417,7 @@ def list_all_clusters(namespace: str, print_to_console: bool = True): """ Returns (and prints by default) a list of all clusters in a given namespace. """ - clusters = _get_ray_clusters_in_namespace(namespace) + clusters = _get_all_ray_clusters(namespace) if print_to_console: pretty_print.print_clusters(clusters) return clusters @@ -544,17 +544,24 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: return None -def _get_ray_clusters_in_namespace(namespace="default") -> List[RayCluster]: +def _get_all_ray_clusters(namespace: str = None) -> List[RayCluster]: list_of_clusters = [] try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) - rcs = api_instance.list_namespaced_custom_object( - group="ray.io", - version="v1alpha1", - namespace=namespace, - plural="rayclusters", - ) + if namespace: + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + else: + rcs = api_instance.list_cluster_custom_object( + group="ray.io", + version="v1alpha1", + plural="rayclusters", + ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) @@ -563,23 +570,6 @@ def 
_get_ray_clusters_in_namespace(namespace="default") -> List[RayCluster]: return list_of_clusters -def _get_all_ray_clusters() -> List[RayCluster]: - list_of_clusters = [] - try: - config_check() - api_instance = client.CustomObjectsApi(api_config_handler()) - rcs = api_instance.list_cluster_custom_object( - group="ray.io", - version="v1alpha1", - plural="rayclusters", - ) - except Exception as e: - return _kube_api_error_handling(e) - for rc in rcs["items"]: - list_of_clusters.append(_map_to_ray_cluster(rc)) - return list_of_clusters - - def _get_app_wrappers( namespace="default", filter=List[AppWrapperStatus] ) -> List[AppWrapper]: From 1a94b262598fb36cb86ce67ab5ed9775b5416ce5 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 12:48:59 -0700 Subject: [PATCH 19/47] cleanup --- carson.yaml | 173 -------------------------------------------------- test-job.yaml | 173 -------------------------------------------------- 2 files changed, 346 deletions(-) delete mode 100644 carson.yaml delete mode 100644 test-job.yaml diff --git a/carson.yaml b/carson.yaml deleted file mode 100644 index 79ff972cb..000000000 --- a/carson.yaml +++ /dev/null @@ -1,173 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: carson - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - generictemplate: - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: carson - controller-tools.k8s.io: '1.0' - name: carson - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - 
enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - imagePullSecrets: [] - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-carson - maxReplicas: 1 - minReplicas: 1 - rayStartParams: - block: 'true' - num-gpus: '0' - replicas: 1 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - until nslookup $RAY_IP.$(cat 
/var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; - do echo waiting for myservice; sleep 2; done - image: busybox:1.28 - name: init-myservice - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: carson-head-svc - name: ray-dashboard-carson - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: carson-head-svc - replica: 1 - Items: [] diff --git a/test-job.yaml b/test-job.yaml deleted file mode 100644 index 3a0827080..000000000 --- a/test-job.yaml +++ /dev/null @@ -1,173 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: test-job - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - generictemplate: - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: test-job - controller-tools.k8s.io: '1.0' - name: test-job - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: 
quay.io/project-codeflare/ray:2.5.0-py38-cu116 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - imagePullSecrets: [] - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-test-job - maxReplicas: 1 - minReplicas: 1 - rayStartParams: - block: 'true' - num-gpus: '0' - replicas: 1 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; - do echo waiting for myservice; sleep 2; done - image: busybox:1.28 - name: init-myservice - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: test-job-head-svc - name: ray-dashboard-test-job - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: test-job-head-svc - replica: 1 - Items: [] From 61e6723d51a55231aa38d945f1d7a6d300513278 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 2 Aug 
2023 21:50:19 -0700 Subject: [PATCH 20/47] create: CLI job define command --- src/codeflare_sdk/cli/commands/define.py | 54 +++++++++++++++++++----- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/define.py b/src/codeflare_sdk/cli/commands/define.py index 4db177f3b..004daf742 100644 --- a/src/codeflare_sdk/cli/commands/define.py +++ b/src/codeflare_sdk/cli/commands/define.py @@ -1,8 +1,10 @@ import click +import pickle from codeflare_sdk.cluster.cluster import Cluster from codeflare_sdk.cluster.config import ClusterConfiguration from codeflare_sdk.cli.cli_utils import PythonLiteralOption +from codeflare_sdk.job.jobs import DDPJobDefinition @click.group() @@ -15,21 +17,21 @@ def cli(): @click.pass_context @click.option("--name", type=str, required=True) @click.option("--namespace", "-n", type=str) -@click.option("--head_info", cls=PythonLiteralOption, type=list) -@click.option("--machine_types", cls=PythonLiteralOption, type=list) -@click.option("--min_cpus", type=int) -@click.option("--max_cpus", type=int) -@click.option("--min_worker", type=int) -@click.option("--max_worker", type=int) -@click.option("--min_memory", type=int) -@click.option("--max_memory", type=int) +@click.option("--head-info", cls=PythonLiteralOption, type=list) +@click.option("--machine-types", cls=PythonLiteralOption, type=list) +@click.option("--min-cpus", type=int) +@click.option("--max-cpus", type=int) +@click.option("--min-worker", type=int) +@click.option("--max-worker", type=int) +@click.option("--min-memory", type=int) +@click.option("--max-memory", type=int) @click.option("--gpu", type=int) @click.option("--template", type=str) @click.option("--instascale", type=bool) @click.option("--envs", cls=PythonLiteralOption, type=dict) @click.option("--image", type=str) -@click.option("--local_interactive", type=bool) -@click.option("--image_pull_secrets", cls=PythonLiteralOption, type=list) +@click.option("--local-interactive", type=bool) 
+@click.option("--image-pull-secrets", cls=PythonLiteralOption, type=list) def raycluster(ctx, **kwargs): """Define a RayCluster with parameter specifications""" filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} @@ -37,3 +39,35 @@ def raycluster(ctx, **kwargs): filtered_kwargs["namespace"] = ctx.obj.current_namespace clusterConfig = ClusterConfiguration(**filtered_kwargs) Cluster(clusterConfig) # Creates yaml file + + +@cli.command() +@click.pass_context +@click.option("--script", type=str) +@click.option("--m", type=str) +@click.option("--script-args", cls=PythonLiteralOption, type=list) +@click.option("--name", type=str) +@click.option("--cpu", type=int) +@click.option("--gpu", type=int) +@click.option("--memMB", type=int) +@click.option("--h", type=str) +@click.option("--j", type=str) +@click.option("--env", cls=PythonLiteralOption, type=dict) +@click.option("--max-retries", type=int) +@click.option("--mounts", cls=PythonLiteralOption, type=list) +@click.option("--rdzv-port", type=int) +@click.option("--rdzv-backend", type=str) +@click.option("--schedular-args", cls=PythonLiteralOption, type=dict) +@click.option("--image", type=str) +@click.option("--workspace", type=str) +def job(ctx, **kwargs): + """Define a job with specified resources""" + filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} + if "memmb" in filtered_kwargs: + filtered_kwargs["memMB"] = filtered_kwargs["memmb"] + del filtered_kwargs["memmb"] + job_def = DDPJobDefinition(**filtered_kwargs) + job_file_path = ctx.obj.codeflare_path + f"/{job_def.name}" + with open(job_file_path, "wb") as file: + pickle.dump(job_def, file) + click.echo("Job definition saved to " + job_file_path) From 26d00a1577607cd3d800f2b777e436842748acfa Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 11:34:29 -0700 Subject: [PATCH 21/47] create: submit job command cli --- src/codeflare_sdk/cli/commands/submit.py | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) 
diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py index 8a476d602..debb7e333 100644 --- a/src/codeflare_sdk/cli/commands/submit.py +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -1,6 +1,10 @@ import click from codeflare_sdk.cluster.cluster import Cluster +import pickle +from torchx.runner import get_runner + +from codeflare_sdk.cluster.cluster import get_cluster @click.group() @@ -30,3 +34,36 @@ def raycluster(name, wait): return cluster.up() cluster.wait_ready() + + +@cli.command() +@click.pass_context +@click.argument("name", type=str) +@click.option("--cluster-name", type=str) +@click.option("--namespace", type=str, required=True) +def job(ctx, name, cluster_name, namespace): + """ + Submit a defined job to the Kubernetes cluster or a RayCluster + """ + runner = get_runner() + try: + job_path = ctx.obj.codeflare_path + f"/{name}" + with open(job_path, "rb") as file: + job_def = pickle.load(file) + except Exception as e: + click.echo( + f"Error submitting job. 
Make sure the job is defined before submitting it" + ) + return + if not cluster_name: + job = job_def.submit() + submission_id = runner.describe(job._app_handle).name.split(":")[1] + click.echo(f"{submission_id} submitted successfully") + return + cluster = get_cluster(cluster_name, namespace) + job = job_def.submit(cluster) + full_name = runner.describe(job._app_handle).name + submission_id = full_name[full_name.rfind(name) :] + click.echo( + f"{submission_id} submitted onto {cluster_name} RayCluster successfully\nView dashboard: {cluster.cluster_dashboard_uri()}" + ) From 6e4bcac9e68ca2fb988f7a381b74760e4290dbfd Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 11:34:54 -0700 Subject: [PATCH 22/47] fix: login help message no longer has ellipsis --- src/codeflare_sdk/cli/commands/login.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cli/commands/login.py b/src/codeflare_sdk/cli/commands/login.py index 288607a89..56df911b2 100644 --- a/src/codeflare_sdk/cli/commands/login.py +++ b/src/codeflare_sdk/cli/commands/login.py @@ -25,7 +25,7 @@ ) def cli(ctx, server, token, insecure_skip_tls_verify, certificate_authority): """ - Login to your Kubernetes cluster and save login for subsequent use + Login to your Kubernetes cluster and save login for later use """ auth = TokenAuthentication( token, server, insecure_skip_tls_verify, certificate_authority From 7ad81271ef88ba930e8389c65cd5ad35c49655d2 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 15:40:29 -0700 Subject: [PATCH 23/47] test: unit tests for submit define job --- tests/unit_test.py | 92 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 75 insertions(+), 17 deletions(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index 32a70b965..d5113f063 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -95,16 +95,16 @@ def test_cluster_definition_cli(mocker): define raycluster --name=cli-test-cluster --namespace=default - 
--min_worker=1 - --max_worker=2 - --min_cpus=3 - --max_cpus=4 - --min_memory=5 - --max_memory=6 + --min-worker=1 + --max-worker=2 + --min-cpus=3 + --max-cpus=4 + --min-memory=5 + --max-memory=6 --gpu=7 --instascale=True - --machine_types='["cpu.small", "gpu.large"]' - --image_pull_secrets='["cli-test-pull-secret"]' + --machine-types='["cpu.small", "gpu.large"]' + --image-pull-secrets='["cli-test-pull-secret"]' """ result = runner.invoke(cli, define_cluster_command) assert ( @@ -165,15 +165,6 @@ def test_login_tls_cli(mocker): ) -def test_logout_cli(mocker): - runner = CliRunner() - mocker.patch.object(client, "ApiClient") - k8s_logout_command = "logout" - logout_result = runner.invoke(cli, k8s_logout_command) - assert "Successfully logged out of 'testserver:6443'\n" in logout_result.output - assert not os.path.exists(os.path.expanduser("~/.codeflare/auth")) - - def test_load_auth(): load_auth() assert sdk_auth.api_client is not None @@ -243,6 +234,63 @@ def test_list_clusters_all_namespaces(mocker, capsys): ) +def test_job_definition_cli(): + runner = CliRunner() + define_job_command = """ + define job + --script=test-script.py + --script-args='["arg1", "arg2"]' + --memMB=2 + --image=test-image + --name=test + """ + result = runner.invoke(cli, define_job_command) + file_path = os.path.expanduser("~") + "/.codeflare/test" + assert result.output == "Job definition saved to " + file_path + "\n" + try: + with open(file_path, "rb") as file: + job = pickle.load(file) + except Exception as e: + print("Error opening file: ", e) + assert 0 == 1 + assert job.script == "test-script.py" + assert job.script_args == ["arg1", "arg2"] + assert job.memMB == 2 + assert job.image == "test-image" + assert job.name == "test" + + +def test_job_submission_cli(mocker): + mocker.patch.object(client, "ApiClient") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + mocker.patch( + 
"codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="test-url.com", + ) + mocker.patch( + "codeflare_sdk.job.jobs.torchx_runner.schedule", + return_value="test-url.com", + ) + mocker.patch("torchx.runner.Runner.describe", return_value=AppDef(name="test-1234")) + runner = CliRunner() + submit_job_command = """ + submit job + test + --cluster-name=quicktest + --namespace=default + """ + result = runner.invoke(cli, submit_job_command) + assert ( + result.output + == "Written to: quicktest.yaml\n" + + "test-1234 submitted onto quicktest RayCluster successfully\n" + + "View dashboard: test-url.com\n" + ) + + def test_raycluster_details_cli(mocker): runner = CliRunner() mocker.patch( @@ -368,6 +416,16 @@ def test_raycluster_list_cli(mocker): ) in result.output +# Keep this test at the end of CLI tests +def test_logout_cli(mocker): + runner = CliRunner() + mocker.patch.object(client, "ApiClient") + k8s_logout_command = "logout" + logout_result = runner.invoke(cli, k8s_logout_command) + assert logout_result.output == "Successfully logged out of 'testserver:6443'\n" + assert not os.path.exists(os.path.expanduser("~/.codeflare/auth")) + + # For mocking openshift client results fake_res = openshift.Result("fake") From bc8113ffa3b5dd47125ef0574af4a5c585577512 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 4 Aug 2023 10:55:57 -0700 Subject: [PATCH 24/47] fix: typo --- src/codeflare_sdk/cli/commands/define.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cli/commands/define.py b/src/codeflare_sdk/cli/commands/define.py index 004daf742..51571463c 100644 --- a/src/codeflare_sdk/cli/commands/define.py +++ b/src/codeflare_sdk/cli/commands/define.py @@ -57,7 +57,7 @@ def raycluster(ctx, **kwargs): @click.option("--mounts", cls=PythonLiteralOption, type=list) @click.option("--rdzv-port", type=int) @click.option("--rdzv-backend", type=str) -@click.option("--schedular-args", cls=PythonLiteralOption, type=dict) 
+@click.option("--scheduler-args", cls=PythonLiteralOption, type=dict) @click.option("--image", type=str) @click.option("--workspace", type=str) def job(ctx, **kwargs): From 245dde1152fef7252789c8e78122852b9123379b Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 4 Aug 2023 11:00:40 -0700 Subject: [PATCH 25/47] change: make submit job use current namespace --- src/codeflare_sdk/cli/commands/submit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py index debb7e333..498c0b1d1 100644 --- a/src/codeflare_sdk/cli/commands/submit.py +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -40,7 +40,7 @@ def raycluster(name, wait): @click.pass_context @click.argument("name", type=str) @click.option("--cluster-name", type=str) -@click.option("--namespace", type=str, required=True) +@click.option("--namespace", type=str) def job(ctx, name, cluster_name, namespace): """ Submit a defined job to the Kubernetes cluster or a RayCluster @@ -60,7 +60,7 @@ def job(ctx, name, cluster_name, namespace): submission_id = runner.describe(job._app_handle).name.split(":")[1] click.echo(f"{submission_id} submitted successfully") return - cluster = get_cluster(cluster_name, namespace) + cluster = get_cluster(cluster_name, namespace or ctx.obj.current_namespace) job = job_def.submit(cluster) full_name = runner.describe(job._app_handle).name submission_id = full_name[full_name.rfind(name) :] From 25104f258417c8e6e96090704a7e7d83a7124631 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 4 Aug 2023 15:05:58 -0700 Subject: [PATCH 26/47] change: make load_auth only happen on login command --- src/codeflare_sdk/cli/codeflare_cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py index 2731ac0b7..143c2a352 100644 --- a/src/codeflare_sdk/cli/codeflare_cli.py +++ b/src/codeflare_sdk/cli/codeflare_cli.py 
@@ -44,7 +44,8 @@ def get_command(self, ctx, name): @click.command(cls=CodeflareCLI) @click.pass_context def cli(ctx): - load_auth() + if ctx.invoked_subcommand != "login": + load_auth() ctx.obj = CodeflareContext() # Ran on every command pass From 0c28afdacb3d7c8c246410e7e70e4f2bbc93a944 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 7 Aug 2023 15:40:33 -0700 Subject: [PATCH 27/47] add: raycluster not found error handling --- src/codeflare_sdk/cli/codeflare_cli.py | 2 +- src/codeflare_sdk/cli/commands/submit.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py index 143c2a352..c9f17a6dc 100644 --- a/src/codeflare_sdk/cli/codeflare_cli.py +++ b/src/codeflare_sdk/cli/codeflare_cli.py @@ -44,7 +44,7 @@ def get_command(self, ctx, name): @click.command(cls=CodeflareCLI) @click.pass_context def cli(ctx): - if ctx.invoked_subcommand != "login": + if ctx.invoked_subcommand != "login" and ctx.invoked_subcommand != "logout": load_auth() ctx.obj = CodeflareContext() # Ran on every command pass diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py index 498c0b1d1..d4ac9eafa 100644 --- a/src/codeflare_sdk/cli/commands/submit.py +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -58,12 +58,17 @@ def job(ctx, name, cluster_name, namespace): if not cluster_name: job = job_def.submit() submission_id = runner.describe(job._app_handle).name.split(":")[1] - click.echo(f"{submission_id} submitted successfully") + click.echo(f"Job {submission_id} submitted successfully") + return + namespace = namespace or ctx.obj.current_namespace + try: + cluster = get_cluster(cluster_name, namespace) + except FileNotFoundError: + click.echo(f"Cluster {name} not found in {namespace} namespace") return - cluster = get_cluster(cluster_name, namespace or ctx.obj.current_namespace) job = job_def.submit(cluster) full_name = 
runner.describe(job._app_handle).name submission_id = full_name[full_name.rfind(name) :] click.echo( - f"{submission_id} submitted onto {cluster_name} RayCluster successfully\nView dashboard: {cluster.cluster_dashboard_uri()}" + f"Job {submission_id} submitted onto {cluster_name} RayCluster successfully\nView dashboard: {cluster.cluster_dashboard_uri()}" ) From f4aff678c5a7167984747c05639d3c9db4544f49 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 9 Aug 2023 11:35:07 -0700 Subject: [PATCH 28/47] make define params required and refactor job submit --- src/codeflare_sdk/cli/commands/define.py | 4 ++-- src/codeflare_sdk/cli/commands/submit.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/define.py b/src/codeflare_sdk/cli/commands/define.py index 51571463c..7c8d4476f 100644 --- a/src/codeflare_sdk/cli/commands/define.py +++ b/src/codeflare_sdk/cli/commands/define.py @@ -43,10 +43,10 @@ def raycluster(ctx, **kwargs): @cli.command() @click.pass_context -@click.option("--script", type=str) +@click.option("--script", type=str, required=True) @click.option("--m", type=str) @click.option("--script-args", cls=PythonLiteralOption, type=list) -@click.option("--name", type=str) +@click.option("--name", type=str, required=True) @click.option("--cpu", type=int) @click.option("--gpu", type=int) @click.option("--memMB", type=int) diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py index d4ac9eafa..7c6760cae 100644 --- a/src/codeflare_sdk/cli/commands/submit.py +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -1,4 +1,5 @@ import click +import os from codeflare_sdk.cluster.cluster import Cluster import pickle @@ -46,15 +47,14 @@ def job(ctx, name, cluster_name, namespace): Submit a defined job to the Kubernetes cluster or a RayCluster """ runner = get_runner() - try: - job_path = ctx.obj.codeflare_path + f"/{name}" - with open(job_path, "rb") as file: - job_def = 
pickle.load(file) - except Exception as e: + job_path = ctx.obj.codeflare_path + f"/{name}" + if not os.path.isfile(job_path): click.echo( f"Error submitting job. Make sure the job is defined before submitting it" ) return + with open(job_path, "rb") as file: + job_def = pickle.load(file) if not cluster_name: job = job_def.submit() submission_id = runner.describe(job._app_handle).name.split(":")[1] From 64a8e4527e1af1abb45954d8bb3b8b0ce7069328 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 2 Aug 2023 21:47:43 -0700 Subject: [PATCH 29/47] create: list_jobs and get_job functions --- jobtest.yaml | 173 +++++++++++++++++++++++++++++ src/codeflare_sdk/cli/cli_utils.py | 90 +++++++++++++++ 2 files changed, 263 insertions(+) create mode 100644 jobtest.yaml diff --git a/jobtest.yaml b/jobtest.yaml new file mode 100644 index 000000000..92cd39b4c --- /dev/null +++ b/jobtest.yaml @@ -0,0 +1,173 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: jobtest + namespace: default +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 1 + memory: 1G + nvidia.com/gpu: 0 + replicas: 2 + requests: + cpu: 1 + memory: 1G + nvidia.com/gpu: 0 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: jobtest + controller-tools.k8s.io: '1.0' + name: jobtest + namespace: default + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS 
+ value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + imagePullSecrets: [] + rayVersion: 2.1.0 + workerGroupSpecs: + - groupName: small-group-jobtest + maxReplicas: 2 + minReplicas: 2 + rayStartParams: + block: 'true' + num-gpus: '0' + replicas: 2 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 1 + memory: 1G + nvidia.com/gpu: 0 + requests: + cpu: 1 + memory: 1G + nvidia.com/gpu: 0 + imagePullSecrets: [] + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: jobtest-head-svc + name: ray-dashboard-jobtest + 
namespace: default + spec: + port: + targetPort: dashboard + to: + kind: Service + name: jobtest-head-svc + replica: 1 + Items: [] diff --git a/src/codeflare_sdk/cli/cli_utils.py b/src/codeflare_sdk/cli/cli_utils.py index c9d2c87a6..8c8597a16 100644 --- a/src/codeflare_sdk/cli/cli_utils.py +++ b/src/codeflare_sdk/cli/cli_utils.py @@ -3,7 +3,15 @@ from kubernetes import client, config import pickle import os +from ray.job_submission import JobSubmissionClient +from torchx.runner import get_runner +from rich.table import Table +from rich import print +from codeflare_sdk.cluster.cluster import ( + list_clusters_all_namespaces, +) +from codeflare_sdk.cluster.model import RayCluster from codeflare_sdk.cluster.auth import _create_api_client_config from codeflare_sdk.utils.kube_api_helpers import _kube_api_error_handling import codeflare_sdk.cluster.auth as sdk_auth @@ -73,3 +81,85 @@ def resolve_command(self, ctx, args): # always return the full command name _, cmd, args = super().resolve_command(ctx, args) return cmd.name, cmd, args +def print_jobs(jobs): + headers = ["Submission ID", "Job ID", "RayCluster", "Namespace", "Status"] + table = Table(show_header=True) + for header in headers: + table.add_column(header) + for job in jobs: + table.add_row(*job.values()) + print(table) + + +def list_all_kubernetes_jobs(print_to_console=True): + k8s_jobs = [] + runner = get_runner() + jobs = runner.list(scheduler="kubernetes_mcad") + rayclusters = { + raycluster.name for raycluster in list_clusters_all_namespaces(False) + } + for job in jobs: + namespace, name = job.app_id.split(":") + status = job.state + if name in rayclusters: + continue + k8s_jobs.append( + { + "Submission ID": name, + "Job ID": "N/A", + "RayCluster": "N/A", + "Namespace": namespace, + "Status": str(status), + } + ) + if print_to_console: + print_jobs(k8s_jobs) + return k8s_jobs + + +def list_all_jobs(print_to_console=True): + k8s_jobs = list_all_kubernetes_jobs(False) + rc_jobs = 
list_all_raycluster_jobs(False) + all_jobs = rc_jobs + k8s_jobs + if print_to_console: + print_jobs(all_jobs) + return all_jobs + + +def list_raycluster_jobs(cluster: RayCluster, print_to_console=True): + rc_jobs = [] + client = JobSubmissionClient(cluster.dashboard) + jobs = client.list_jobs() + for job in jobs: + job_obj = { + "Submission ID": job.submission_id, + "Job ID": job.job_id, + "RayCluster": cluster.name, + "Namespace": cluster.namespace, + "Status": str(job.status), + } + rc_jobs.append(job_obj) + if print_to_console: + print_jobs(rc_jobs) + return rc_jobs + + +def list_all_raycluster_jobs(print_to_console=True): + rc_jobs = [] + clusters = list_clusters_all_namespaces(False) + for cluster in clusters: + cluster.dashboard = "http://" + cluster.dashboard + rc_jobs += list_raycluster_jobs(cluster, False) + if print_to_console: + print_jobs(rc_jobs) + return rc_jobs + + +def get_job(job_submission): + all_jobs = list_all_jobs(False) + for job in all_jobs: + if job["Submission ID"] == job_submission: + return job + raise ( + f"Job {job_submission} not found. 
Try using 'codeflare list --all' to see all jobs" + ) From 2e4fdca3aeb03922e55f9f2f50474163957d23a1 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 14:33:20 -0700 Subject: [PATCH 30/47] create: list jobs CLI command --- src/codeflare_sdk/cli/commands/list.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index 533aaeda1..0ae29d04d 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -1,11 +1,15 @@ import click -from kubernetes import client, config from codeflare_sdk.cluster.cluster import ( list_clusters_all_namespaces, list_all_clusters, ) from codeflare_sdk.cli.cli_utils import PluralAlias +from codeflare_sdk.cluster.cluster import get_cluster +from codeflare_sdk.cluster.cluster import _copy_to_ray +from codeflare_sdk.cli.cli_utils import list_all_jobs +from codeflare_sdk.cli.cli_utils import list_all_kubernetes_jobs +from codeflare_sdk.cli.cli_utils import list_raycluster_jobs @click.group(cls=PluralAlias) @@ -28,3 +32,19 @@ def raycluster(ctx, namespace, all): list_all_clusters(namespace) return list_clusters_all_namespaces() + + +@cli.command() +@click.option("--cluster-name", "-c", type=str) +@click.option("--namespace", type=str) +@click.option("--all", is_flag=True) +def job(cluster_name, namespace, all): + """List all jobs in a specified RayCluster or in K8S cluster""" + if all: + list_all_jobs(True) + return + if cluster_name: + cluster = get_cluster(cluster_name, namespace) + list_raycluster_jobs(_copy_to_ray(cluster), True) + return + list_all_kubernetes_jobs(True) From 2c8f81ae18adc63aaf00a5225213eb5af99c4de9 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 14:43:45 -0700 Subject: [PATCH 31/47] create: job status command --- src/codeflare_sdk/cli/commands/status.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/src/codeflare_sdk/cli/commands/status.py b/src/codeflare_sdk/cli/commands/status.py index dbd92a555..7c00b552e 100644 --- a/src/codeflare_sdk/cli/commands/status.py +++ b/src/codeflare_sdk/cli/commands/status.py @@ -1,6 +1,7 @@ import click from codeflare_sdk.cluster.cluster import get_cluster +from codeflare_sdk.cli.cli_utils import get_job @click.group() @@ -22,3 +23,12 @@ def raycluster(ctx, name, namespace): click.echo(f"Cluster {name} not found in {namespace} namespace") return cluster.status() + + +@cli.command() +@click.pass_context +@click.argument("submission-id", type=str) +def job(ctx, submission_id): + """Get the status of a specified job""" + job = get_job(submission_id) + click.echo(job["Status"]) From a611267657e5179ddfb659ca6cbb922f6cdbd41b Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 7 Aug 2023 22:02:34 -0700 Subject: [PATCH 32/47] create: cancel job function --- src/codeflare_sdk/cli/commands/cancel.py | 27 ++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 src/codeflare_sdk/cli/commands/cancel.py diff --git a/src/codeflare_sdk/cli/commands/cancel.py b/src/codeflare_sdk/cli/commands/cancel.py new file mode 100644 index 000000000..7222d3448 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/cancel.py @@ -0,0 +1,27 @@ +import click +from torchx.runner import get_runner + + +from codeflare_sdk.cli.cli_utils import get_job_app_handle + + +@click.group() +def cli(): + """Cancel a resource""" + pass + + +@cli.command() +@click.pass_context +@click.argument("submission-id", type=str) +def job(ctx, submission_id): + """Cancel a job""" + runner = get_runner() + try: + app_handle = get_job_app_handle(submission_id) + runner.cancel(app_handle=app_handle) + click.echo(f"{submission_id} cancelled successfully") + except FileNotFoundError: + click.echo(f"Submission ID {submission_id} not found in Kubernetes Cluster") + except Exception as e: + click.echo("Error cancelling job: " + str(e)) From 
979aaa4c3da86e95e219c2d76ba26ef4d8cff5b0 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 7 Aug 2023 22:03:27 -0700 Subject: [PATCH 33/47] create: jobs logs command --- src/codeflare_sdk/cli/commands/logs.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 src/codeflare_sdk/cli/commands/logs.py diff --git a/src/codeflare_sdk/cli/commands/logs.py b/src/codeflare_sdk/cli/commands/logs.py new file mode 100644 index 000000000..402cbf2ff --- /dev/null +++ b/src/codeflare_sdk/cli/commands/logs.py @@ -0,0 +1,25 @@ +import click +from torchx.runner import get_runner + +from codeflare_sdk.cli.cli_utils import get_job_app_handle + + +@click.group() +def cli(): + """Get the logs of a specified resource""" + pass + + +@cli.command() +@click.pass_context +@click.argument("submission-id", type=str) +def job(ctx, submission_id): + """Get the logs of a specified job""" + runner = get_runner() + try: + app_handle = get_job_app_handle(submission_id) + click.echo("".join(runner.log_lines(app_handle, None))) + except FileNotFoundError: + click.echo(f"Submission ID {submission_id} not found in Kubernetes Cluster") + except Exception as e: + click.echo("Error getting job logs: " + str(e)) From 1e81b8fb1076719f8ce5611cc37a11b23457550e Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 7 Aug 2023 22:04:16 -0700 Subject: [PATCH 34/47] change: slightly change messages and namespace options for job status + list --- src/codeflare_sdk/cli/commands/list.py | 5 +++-- src/codeflare_sdk/cli/commands/status.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index 0ae29d04d..5df1f803c 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -35,16 +35,17 @@ def raycluster(ctx, namespace, all): @cli.command() +@click.pass_context @click.option("--cluster-name", "-c", type=str) @click.option("--namespace", 
type=str) @click.option("--all", is_flag=True) -def job(cluster_name, namespace, all): +def job(ctx, cluster_name, namespace, all): """List all jobs in a specified RayCluster or in K8S cluster""" if all: list_all_jobs(True) return if cluster_name: - cluster = get_cluster(cluster_name, namespace) + cluster = get_cluster(cluster_name, namespace or ctx.obj.current_namespace) list_raycluster_jobs(_copy_to_ray(cluster), True) return list_all_kubernetes_jobs(True) diff --git a/src/codeflare_sdk/cli/commands/status.py b/src/codeflare_sdk/cli/commands/status.py index 7c00b552e..cce584c7d 100644 --- a/src/codeflare_sdk/cli/commands/status.py +++ b/src/codeflare_sdk/cli/commands/status.py @@ -1,7 +1,8 @@ import click +from torchx.runner import get_runner from codeflare_sdk.cluster.cluster import get_cluster -from codeflare_sdk.cli.cli_utils import get_job +from codeflare_sdk.cli.cli_utils import get_job_app_handle @click.group() @@ -30,5 +31,11 @@ def raycluster(ctx, name, namespace): @click.argument("submission-id", type=str) def job(ctx, submission_id): """Get the status of a specified job""" - job = get_job(submission_id) - click.echo(job["Status"]) + runner = get_runner() + try: + app_handle = get_job_app_handle(submission_id) + click.echo(runner.status(app_handle=app_handle)) + except FileNotFoundError: + click.echo(f"Submission ID {submission_id} not found in Kubernetes Cluster") + except Exception as e: + click.echo("Error getting job status: " + str(e)) From 216d255c761f9d6c458b3ffb96bfbd43178b3d35 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 7 Aug 2023 22:04:51 -0700 Subject: [PATCH 35/47] add: error handling and refactor to main CLI --- src/codeflare_sdk/cli/cli_utils.py | 44 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/codeflare_sdk/cli/cli_utils.py b/src/codeflare_sdk/cli/cli_utils.py index 8c8597a16..e4ce6432d 100644 --- a/src/codeflare_sdk/cli/cli_utils.py +++ b/src/codeflare_sdk/cli/cli_utils.py @@ 
-8,11 +8,9 @@ from rich.table import Table from rich import print -from codeflare_sdk.cluster.cluster import ( - list_clusters_all_namespaces, -) +from codeflare_sdk.cluster.cluster import list_clusters_all_namespaces, get_cluster from codeflare_sdk.cluster.model import RayCluster -from codeflare_sdk.cluster.auth import _create_api_client_config +from codeflare_sdk.cluster.auth import _create_api_client_config, config_check from codeflare_sdk.utils.kube_api_helpers import _kube_api_error_handling import codeflare_sdk.cluster.auth as sdk_auth @@ -81,13 +79,15 @@ def resolve_command(self, ctx, args): # always return the full command name _, cmd, args = super().resolve_command(ctx, args) return cmd.name, cmd, args + + def print_jobs(jobs): headers = ["Submission ID", "Job ID", "RayCluster", "Namespace", "Status"] table = Table(show_header=True) for header in headers: table.add_column(header) for job in jobs: - table.add_row(*job.values()) + table.add_row(*[job[header] for header in headers]) print(table) @@ -101,17 +101,17 @@ def list_all_kubernetes_jobs(print_to_console=True): for job in jobs: namespace, name = job.app_id.split(":") status = job.state - if name in rayclusters: - continue - k8s_jobs.append( - { - "Submission ID": name, - "Job ID": "N/A", - "RayCluster": "N/A", - "Namespace": namespace, - "Status": str(status), - } - ) + if name not in rayclusters: + k8s_jobs.append( + { + "Submission ID": name, + "Job ID": "N/A", + "RayCluster": "N/A", + "Namespace": namespace, + "Status": str(status), + "App Handle": job.app_handle, + } + ) if print_to_console: print_jobs(k8s_jobs) return k8s_jobs @@ -137,6 +137,7 @@ def list_raycluster_jobs(cluster: RayCluster, print_to_console=True): "RayCluster": cluster.name, "Namespace": cluster.namespace, "Status": str(job.status), + "App Handle": "ray://torchx/" + cluster.dashboard + "-" + job.submission_id, } rc_jobs.append(job_obj) if print_to_console: @@ -155,11 +156,18 @@ def 
list_all_raycluster_jobs(print_to_console=True): return rc_jobs -def get_job(job_submission): +def get_job_app_handle(job_submission): + job = get_job_object(job_submission) + return job["App Handle"] + + +def get_job_object(job_submission): all_jobs = list_all_jobs(False) for job in all_jobs: if job["Submission ID"] == job_submission: return job raise ( - f"Job {job_submission} not found. Try using 'codeflare list --all' to see all jobs" + FileNotFoundError( + f"Job {job_submission} not found. Try using 'codeflare list --all' to see all jobs" + ) ) From 4f652fd04b962cdca9e44b25b3858870d6db0de8 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 7 Aug 2023 22:05:27 -0700 Subject: [PATCH 36/47] test: change tests for job functions, refactor tests and add tests for view and cancel jobs --- tests/unit_test.py | 85 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 7 deletions(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index d5113f063..141377b09 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -119,16 +119,16 @@ def test_cluster_definition_cli(mocker): def test_login_cli(mocker): runner = CliRunner() mocker.patch.object(client, "ApiClient") + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) k8s_login_command = """ login --server=testserver:6443 --token=testtoken """ login_result = runner.invoke(cli, k8s_login_command) - assert ( - login_result.output - == "No authentication found, trying default kubeconfig\nLogged into 'testserver:6443'\n" - ) + assert login_result.output == "Logged into 'testserver:6443'\n" try: auth_file_path = os.path.expanduser("~/.codeflare/auth") with open(auth_file_path, "rb") as file: @@ -145,6 +145,9 @@ def test_login_cli(mocker): def test_login_tls_cli(mocker): runner = CliRunner() mocker.patch.object(client, "ApiClient") + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) k8s_tls_login_command = """ 
login --server=testserver:6443 @@ -160,8 +163,7 @@ def test_login_tls_cli(mocker): tls_result = runner.invoke(cli, k8s_tls_login_command) skip_tls_result = runner.invoke(cli, k8s_skip_tls_login_command) assert ( - "Logged into 'testserver:6443'\n" in tls_result.output - and "Logged into 'testserver:6443'\n" in skip_tls_result.output + "Logged into 'testserver:6443'\n" == tls_result.output == skip_tls_result.output ) @@ -416,7 +418,76 @@ def test_raycluster_list_cli(mocker): ) in result.output -# Keep this test at the end of CLI tests +def test_status_job_cli(mocker): + runner = CliRunner() + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", + return_value="opendatahub", + ) + mocker.patch("torchx.runner.Runner.status", return_value="fake-status") + mocker.patch( + "codeflare_sdk.cli.cli_utils.get_job_app_handle", + return_value="fake-handle", + ) + job_status_command = """ + status job test-job + """ + result = runner.invoke(cli, job_status_command) + assert result.output == "fake-status\n" + + +def test_logs_job_cli(mocker): + runner = CliRunner() + mocker.patch.object(client, "ApiClient") + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) + mocker.patch("torchx.runner.Runner.log_lines", return_value=["fake-logs"]) + mocker.patch( + "codeflare_sdk.cli.cli_utils.get_job_app_handle", + return_value="fake-handle", + ) + job_logs_command = """ + logs job test-job + """ + result = runner.invoke(cli, job_logs_command) + assert result.output == "fake-logs\n" + + +def test_list_jobs_cli(mocker): + runner = CliRunner() + mocker.patch.object(client, "ApiClient") + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) + test_job = { + "Submission ID": "fake-id", + "Job ID": "N/A", + "RayCluster": "N/A", + "Namespace": "default", + "Status": "Pending", + "App Handle": "test", + } + mocker.patch( + "codeflare_sdk.cli.cli_utils.list_all_kubernetes_jobs", 
return_value=[test_job] + ) + mocker.patch( + "codeflare_sdk.cli.cli_utils.list_all_raycluster_jobs", return_value=[test_job] + ) + list_jobs_command = """ + list jobs --all + """ + result = runner.invoke(cli, list_jobs_command) + assert result.output == ( + "┏━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┓\n" + + "┃ Submission ID ┃ Job ID ┃ RayCluster ┃ Namespace ┃ Status ┃\n" + + "┡━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━┩\n" + + "│ fake-id │ N/A │ N/A │ default │ Pending │\n" + + "│ fake-id │ N/A │ N/A │ default │ Pending │\n" + + "└───────────────┴────────┴────────────┴───────────┴─────────┘\n" + ) + + def test_logout_cli(mocker): runner = CliRunner() mocker.patch.object(client, "ApiClient") From 8f72ab4c7aacdcb19a53a32cbe2a31d5cf0c661d Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 7 Aug 2023 22:05:50 -0700 Subject: [PATCH 37/47] cleanup --- jobtest.yaml | 173 --------------------------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 jobtest.yaml diff --git a/jobtest.yaml b/jobtest.yaml deleted file mode 100644 index 92cd39b4c..000000000 --- a/jobtest.yaml +++ /dev/null @@ -1,173 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: jobtest - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 1G - nvidia.com/gpu: 0 - replicas: 2 - requests: - cpu: 1 - memory: 1G - nvidia.com/gpu: 0 - generictemplate: - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: jobtest - controller-tools.k8s.io: '1.0' - name: jobtest - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - 
enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - imagePullSecrets: [] - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-jobtest - maxReplicas: 2 - minReplicas: 2 - rayStartParams: - block: 'true' - num-gpus: '0' - replicas: 2 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 1G - nvidia.com/gpu: 0 - requests: - cpu: 1 - memory: 1G - nvidia.com/gpu: 0 - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - until nslookup $RAY_IP.$(cat 
/var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; - do echo waiting for myservice; sleep 2; done - image: busybox:1.28 - name: init-myservice - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: jobtest-head-svc - name: ray-dashboard-jobtest - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: jobtest-head-svc - replica: 1 - Items: [] From f256b45704382bc4f21a4e792d6a3dd5c9b55b3a Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 9 Aug 2023 11:59:25 -0700 Subject: [PATCH 38/47] make list command list all resources by default --- src/codeflare_sdk/cli/commands/list.py | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index 5df1f803c..d34153362 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -20,15 +20,13 @@ def cli(): @cli.command() @click.option("--namespace", type=str) -@click.option("--all", is_flag=True) @click.pass_context -def raycluster(ctx, namespace, all): - """List all rayclusters in a specified namespace""" - if all and namespace: - click.echo("--all and --namespace are mutually exclusive") - return - namespace = namespace or ctx.obj.current_namespace - if not all: +def raycluster(ctx, namespace): + """ + List all rayclusters in a specified namespace or + all namespaces if no namespace is given + """ + if namespace: list_all_clusters(namespace) return list_clusters_all_namespaces() @@ -37,15 +35,17 @@ def raycluster(ctx, namespace, all): @cli.command() @click.pass_context @click.option("--cluster-name", "-c", type=str) -@click.option("--namespace", type=str) -@click.option("--all", is_flag=True) -def job(ctx, cluster_name, namespace, all): - """List all jobs in a specified RayCluster or in K8S cluster""" - if all: - list_all_jobs(True) - 
return +@click.option("--namespace", "-n", type=str) +@click.option("--no-ray", is_flag=True) +def job(ctx, cluster_name, namespace, no_ray): + """ + List all jobs in a specified RayCluster or in K8S cluster + """ if cluster_name: cluster = get_cluster(cluster_name, namespace or ctx.obj.current_namespace) list_raycluster_jobs(_copy_to_ray(cluster), True) return - list_all_kubernetes_jobs(True) + if no_ray: + list_all_kubernetes_jobs(True) + return + list_all_jobs(True) From 65925d3a7a73ce837354f2fdb07b2d71319fe047 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 9 Aug 2023 15:42:55 -0700 Subject: [PATCH 39/47] fix unit tests --- tests/unit_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index 141377b09..a120f8374 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -475,7 +475,7 @@ def test_list_jobs_cli(mocker): "codeflare_sdk.cli.cli_utils.list_all_raycluster_jobs", return_value=[test_job] ) list_jobs_command = """ - list jobs --all + list jobs """ result = runner.invoke(cli, list_jobs_command) assert result.output == ( From df432c5799a201a802fc7870bce5cc1fa502725b Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 9 Aug 2023 15:43:30 -0700 Subject: [PATCH 40/47] fix description of list jobs/rayclusters and change --no-ray flag --- src/codeflare_sdk/cli/commands/list.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index d34153362..9991ee4b4 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -23,8 +23,7 @@ def cli(): @click.pass_context def raycluster(ctx, namespace): """ - List all rayclusters in a specified namespace or - all namespaces if no namespace is given + List all rayclusters """ if namespace: list_all_clusters(namespace) @@ -36,10 +35,10 @@ def raycluster(ctx, namespace): @click.pass_context @click.option("--cluster-name", 
"-c", type=str) @click.option("--namespace", "-n", type=str) -@click.option("--no-ray", is_flag=True) +@click.option("--kube-mcad-scheduler-only", is_flag=True) def job(ctx, cluster_name, namespace, no_ray): """ - List all jobs in a specified RayCluster or in K8S cluster + List all jobs submitted """ if cluster_name: cluster = get_cluster(cluster_name, namespace or ctx.obj.current_namespace) From 83e8367f64b42b727d68412733b764927c6e3af5 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 10 Aug 2023 11:47:18 -0700 Subject: [PATCH 41/47] fix unit tests and list function --- src/codeflare_sdk/cli/commands/list.py | 4 ++-- tests/unit_test.py | 13 +++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index 9991ee4b4..a91998e04 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -36,7 +36,7 @@ def raycluster(ctx, namespace): @click.option("--cluster-name", "-c", type=str) @click.option("--namespace", "-n", type=str) @click.option("--kube-mcad-scheduler-only", is_flag=True) -def job(ctx, cluster_name, namespace, no_ray): +def job(ctx, cluster_name, namespace, kube_mcad_scheduler_only): """ List all jobs submitted """ @@ -44,7 +44,7 @@ def job(ctx, cluster_name, namespace, no_ray): cluster = get_cluster(cluster_name, namespace or ctx.obj.current_namespace) list_raycluster_jobs(_copy_to_ray(cluster), True) return - if no_ray: + if kube_mcad_scheduler_only: list_all_kubernetes_jobs(True) return list_all_jobs(True) diff --git a/tests/unit_test.py b/tests/unit_test.py index a120f8374..896b429f3 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -236,8 +236,11 @@ def test_list_clusters_all_namespaces(mocker, capsys): ) -def test_job_definition_cli(): +def test_job_definition_cli(mocker): runner = CliRunner() + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) 
define_job_command = """ define job --script=test-script.py @@ -268,6 +271,9 @@ def test_job_submission_cli(mocker): "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_ray_obj, ) + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) mocker.patch( "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", return_value="test-url.com", @@ -288,7 +294,7 @@ def test_job_submission_cli(mocker): assert ( result.output == "Written to: quicktest.yaml\n" - + "test-1234 submitted onto quicktest RayCluster successfully\n" + + "Job test-1234 submitted onto quicktest RayCluster successfully\n" + "View dashboard: test-url.com\n" ) @@ -491,6 +497,9 @@ def test_list_jobs_cli(mocker): def test_logout_cli(mocker): runner = CliRunner() mocker.patch.object(client, "ApiClient") + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) k8s_logout_command = "logout" logout_result = runner.invoke(cli, k8s_logout_command) assert logout_result.output == "Successfully logged out of 'testserver:6443'\n" From c8206e5b24df57474753e6547979186ca0570a17 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Tue, 8 Aug 2023 14:30:41 -0700 Subject: [PATCH 42/47] add and implement option to not generate appwrapper in a Cluster --- src/codeflare_sdk/cluster/cluster.py | 21 ++++++++++++++------- tests/unit_test.py | 1 - 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 7bb4b3579..0488ff6fe 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -50,7 +50,7 @@ class Cluster: torchx_scheduler = "ray" - def __init__(self, config: ClusterConfiguration): + def __init__(self, config: ClusterConfiguration, generate_app_wrapper: bool = True): """ Create the resource cluster object by passing in a ClusterConfiguration (defined in the config sub-module). 
An AppWrapper will then be generated @@ -58,13 +58,17 @@ def __init__(self, config: ClusterConfiguration): request. """ self.config = config - self.app_wrapper_yaml = self.create_app_wrapper() - self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0] + self.app_wrapper_yaml = None + self.app_wrapper_name = None + + if generate_app_wrapper: + self.app_wrapper_yaml = self.create_app_wrapper() + self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0] def create_app_wrapper(self): """ - Called upon cluster object creation, creates an AppWrapper yaml based on - the specifications of the ClusterConfiguration. + Called upon cluster object creation if generate_app_wrapper is True, creates an AppWrapper yaml + based on the specifications of the ClusterConfiguration. """ if self.config.namespace is None: @@ -115,6 +119,9 @@ def up(self): Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue. """ + if self.app_wrapper_yaml is None: + print("Error putting up RayCluster: AppWrapper yaml not generated") + return namespace = self.config.namespace try: config_check() @@ -145,7 +152,7 @@ def down(self): version="v1beta1", namespace=namespace, plural="appwrappers", - name=self.app_wrapper_name, + name=self.config.name, ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) @@ -351,7 +358,7 @@ def from_k8_cluster_object(rc): ]["image"], local_interactive=local_interactive, ) - return Cluster(cluster_config) + return Cluster(cluster_config, False) def from_definition_yaml(yaml_path): try: diff --git a/tests/unit_test.py b/tests/unit_test.py index 896b429f3..aad909065 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -2652,7 +2652,6 @@ def test_cleanup(): os.remove("unit-test-default-cluster.yaml") os.remove("test.yaml") os.remove("raytest2.yaml") - os.remove("quicktest.yaml") os.remove("tls-cluster-namespace/ca.crt") os.remove("tls-cluster-namespace/tls.crt") os.remove("tls-cluster-namespace/tls.key") From 
4208ef8b2b268721d93b4a39e16aa46951c06233 Mon Sep 17 00:00:00 2001 From: Carson Harrell <64709520+carsonmh@users.noreply.github.com> Date: Thu, 10 Aug 2023 11:58:23 -0700 Subject: [PATCH 43/47] Change create_app_wrapper description Co-authored-by: Michael Clifford --- src/codeflare_sdk/cluster/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 0488ff6fe..d95c0bda4 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -67,7 +67,7 @@ def __init__(self, config: ClusterConfiguration, generate_app_wrapper: bool = Tr def create_app_wrapper(self): """ - Called upon cluster object creation if generate_app_wrapper is True, creates an AppWrapper yaml + Creates an AppWrapper yaml based on the specified cluster config based on the specifications of the ClusterConfiguration. """ From fb502f65bda4777d067624b9ddee0ac62ec5a249 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 10 Aug 2023 12:11:05 -0700 Subject: [PATCH 44/47] fix down function if no name available and changed up to create an app wrapper --- src/codeflare_sdk/cluster/cluster.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index d95c0bda4..1ba9e41d8 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -120,8 +120,8 @@ def up(self): the MCAD queue. """ if self.app_wrapper_yaml is None: - print("Error putting up RayCluster: AppWrapper yaml not generated") - return + self.app_wrapper_yaml = self.create_app_wrapper() + self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0] namespace = self.config.namespace try: config_check() @@ -144,6 +144,9 @@ def down(self): associated with the cluster. 
""" namespace = self.config.namespace + if not self.config.name and not self.app_wrapper_name: + print("Error taking down cluster: missing name or AppWrapper") + return try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) @@ -152,7 +155,7 @@ def down(self): version="v1beta1", namespace=namespace, plural="appwrappers", - name=self.config.name, + name=self.app_wrapper_name or self.config.name, ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) From de3af4f6efb5938523d14ee00cf7641f413dbe7b Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 11 Aug 2023 11:52:04 -0700 Subject: [PATCH 45/47] refactor and cleanup cli unit tests --- tests/unit_test.py | 326 +++++++++++++++++++++++---------------------- 1 file changed, 167 insertions(+), 159 deletions(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index aad909065..6619c825c 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -88,34 +88,6 @@ def test_cli_working(): assert result.exit_code == 0 -def test_cluster_definition_cli(mocker): - mocker.patch.object(client, "ApiClient") - runner = CliRunner() - define_cluster_command = """ - define raycluster - --name=cli-test-cluster - --namespace=default - --min-worker=1 - --max-worker=2 - --min-cpus=3 - --max-cpus=4 - --min-memory=5 - --max-memory=6 - --gpu=7 - --instascale=True - --machine-types='["cpu.small", "gpu.large"]' - --image-pull-secrets='["cli-test-pull-secret"]' - """ - result = runner.invoke(cli, define_cluster_command) - assert ( - result.output - == "No authentication found, trying default kubeconfig\nWritten to: cli-test-cluster.yaml\n" - ) - assert filecmp.cmp( - "cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=True - ) - - def test_login_cli(mocker): runner = CliRunner() mocker.patch.object(client, "ApiClient") @@ -172,8 +144,42 @@ def test_load_auth(): assert sdk_auth.api_client is not None +def test_cluster_definition_cli(mocker): + mocker.patch.object(client, 
"ApiClient") + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) + runner = CliRunner() + define_cluster_command = """ + define raycluster + --name=cli-test-cluster + --namespace=default + --min-worker=1 + --max-worker=2 + --min-cpus=3 + --max-cpus=4 + --min-memory=5 + --max-memory=6 + --gpu=7 + --instascale=True + --machine-types='["cpu.small", "gpu.large"]' + --image-pull-secrets='["cli-test-pull-secret"]' + """ + result = runner.invoke(cli, define_cluster_command) + assert ( + result.output + == "Written to: cli-test-cluster.yaml\n" + ) + assert filecmp.cmp( + "cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=True + ) + + def test_cluster_submission_cli(mocker): mocker.patch.object(client, "ApiClient") + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) runner = CliRunner() submit_cluster_command = """ submit raycluster @@ -182,7 +188,7 @@ def test_cluster_submission_cli(mocker): result = runner.invoke(cli, submit_cluster_command) assert result.exit_code == 0 - assert "Cluster submitted successfully" in result.output + assert result.output == "Written to: cli-test-cluster.yaml\nCluster submitted successfully\n" def test_cluster_deletion_cli(mocker): @@ -193,8 +199,7 @@ def test_cluster_deletion_cli(mocker): side_effect=get_ray_obj, ) mocker.patch( - "codeflare_sdk.cluster.cluster.get_current_namespace", - return_value="ns", + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" ) runner = CliRunner() delete_cluster_command = """ @@ -204,7 +209,136 @@ def test_cluster_deletion_cli(mocker): result = runner.invoke(cli, delete_cluster_command) assert result.exit_code == 0 - assert "Cluster deleted successfully" in result.output + assert result.output == "Cluster deleted successfully\n" + + +def test_raycluster_details_cli(mocker): + runner = CliRunner() + mocker.patch( + 
"kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.status", + return_value=(False, CodeFlareClusterStatus.UNKNOWN), + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) + mocker.patch("codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns") + mocker.patch.object(client, "ApiClient") + raycluster_details_command = """ + details raycluster quicktest --namespace=default + """ + result = runner.invoke(cli, raycluster_details_command) + assert result.output == ( + " 🚀 CodeFlare Cluster Details 🚀 \n" + " \n" + " ╭──────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ quicktest Inactive ❌ │ \n" + " │ │ \n" + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " │ Cluster Resources │ \n" + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" + " │ │ │ │ │ │ \n" + " │ │ 1 1 │ │ 2~2 1 0 │ │ \n" + " │ │ │ │ │ │ \n" + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" + " ╰──────────────────────────────────────────────────────────────╯ \n" + ) + + +def test_raycluster_status_cli(mocker): + runner = CliRunner() + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.get_current_namespace", + return_value="ns", + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) + mocker.patch.object(client, "ApiClient") + test_raycluster = RayCluster( + "quicktest", + RayClusterStatus.READY, + 1, + 1, + "1", + "1", + 1, + 1, + "default", + "dashboard-url", + ) + mocker.patch( + "codeflare_sdk.cluster.cluster._app_wrapper_status", + return_value=test_raycluster, + ) + mocker.patch( + 
"codeflare_sdk.cluster.cluster._ray_cluster_status", + return_value=test_raycluster, + ) + raycluster_status_command = """ + status raycluster quicktest --namespace=default + """ + result = runner.invoke(cli, raycluster_status_command) + assert "Active" in result.output + + +def test_raycluster_list_cli(mocker): + runner = CliRunner() + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", + return_value="ns", + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.status", + return_value=(False, CodeFlareClusterStatus.UNKNOWN), + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) + mocker.patch.object(client, "ApiClient") + list_rayclusters_command = """ + list rayclusters --namespace=ns + """ + result = runner.invoke(cli, list_rayclusters_command) + assert result.output == ( + " 🚀 CodeFlare Cluster Details 🚀 \n" + " \n" + " ╭──────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ quicktest Active ✅ │ \n" + " │ │ \n" + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " │ Cluster Resources │ \n" + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" + " │ │ │ │ │ │ \n" + " │ │ 1 1 │ │ 2G~2G 1 0 │ │ \n" + " │ │ │ │ │ │ \n" + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" + " ╰──────────────────────────────────────────────────────────────╯ \n" + ) def test_list_clusters_all_namespaces(mocker, capsys): @@ -293,137 +427,11 @@ def test_job_submission_cli(mocker): result = runner.invoke(cli, submit_job_command) assert ( result.output - == "Written to: quicktest.yaml\n" - + "Job test-1234 submitted onto quicktest RayCluster successfully\n" + == "Job test-1234 submitted onto quicktest RayCluster successfully\n" + "View 
dashboard: test-url.com\n" ) -def test_raycluster_details_cli(mocker): - runner = CliRunner() - mocker.patch( - "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", - side_effect=get_ray_obj, - ) - mocker.patch( - "codeflare_sdk.cluster.cluster.Cluster.status", - return_value=(False, CodeFlareClusterStatus.UNKNOWN), - ) - mocker.patch( - "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", - return_value="", - ) - mocker.patch.object(client, "ApiClient") - raycluster_details_command = """ - details raycluster quicktest --namespace=default - """ - result = runner.invoke(cli, raycluster_details_command) - quicktest_details = ( - " ╭──────────────────────────────────────────────────────────────╮ \n" - + " │ Name │ \n" - + " │ quicktest Inactive ❌ │ \n" - + " │ │ \n" - + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" - + " │ │ \n" - + " │ Dashboard🔗 │ \n" - + " │ │ \n" - + " │ Cluster Resources │ \n" - + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" - + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" - + " │ │ │ │ │ │ \n" - + " │ │ 1 1 │ │ 2~2 1 0 │ │ \n" - + " │ │ │ │ │ │ \n" - + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" - + " ╰──────────────────────────────────────────────────────────────╯ " - ) - assert quicktest_details in result.output - - -def test_raycluster_status_cli(mocker): - runner = CliRunner() - mocker.patch( - "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", - side_effect=get_ray_obj, - ) - mocker.patch( - "codeflare_sdk.cluster.cluster.get_current_namespace", - return_value="ns", - ) - mocker.patch( - "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", - return_value="", - ) - mocker.patch.object(client, "ApiClient") - test_raycluster = RayCluster( - "quicktest", - RayClusterStatus.READY, - 1, - 1, - "1", - "1", - 1, - 1, - "default", - "dashboard-url", - ) - mocker.patch( - "codeflare_sdk.cluster.cluster._app_wrapper_status", - 
return_value=test_raycluster, - ) - mocker.patch( - "codeflare_sdk.cluster.cluster._ray_cluster_status", - return_value=test_raycluster, - ) - raycluster_status_command = """ - status raycluster quicktest --namespace=default - """ - result = runner.invoke(cli, raycluster_status_command) - assert "Active" in result.output - - -def test_raycluster_list_cli(mocker): - runner = CliRunner() - mocker.patch( - "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", - side_effect=get_ray_obj, - ) - mocker.patch( - "codeflare_sdk.cluster.cluster.get_current_namespace", - return_value="ns", - ) - mocker.patch( - "codeflare_sdk.cluster.cluster.Cluster.status", - return_value=(False, CodeFlareClusterStatus.UNKNOWN), - ) - mocker.patch( - "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", - return_value="", - ) - mocker.patch.object(client, "ApiClient") - list_rayclusters_command = """ - list rayclusters --namespace=ns - """ - result = runner.invoke(cli, list_rayclusters_command) - assert ( - " ╭──────────────────────────────────────────────────────────────╮ \n" - + " │ Name │ \n" - + " │ quicktest Active ✅ │ \n" - + " │ │ \n" - + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" - + " │ │ \n" - + " │ Dashboard🔗 │ \n" - + " │ │ \n" - + " │ Cluster Resources │ \n" - + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" - + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" - + " │ │ │ │ │ │ \n" - + " │ │ 1 1 │ │ 2G~2G 1 0 │ │ \n" - + " │ │ │ │ │ │ \n" - + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" - + " ╰──────────────────────────────────────────────────────────────╯ " - ) in result.output - - def test_status_job_cli(mocker): runner = CliRunner() mocker.patch( From a03ec9c62455bd0293fcdbd9dd0f07a2cd20d91a Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 11 Aug 2023 13:24:51 -0700 Subject: [PATCH 46/47] fix CLI tests --- src/codeflare_sdk/cli/commands/define.py | 5 +- src/codeflare_sdk/cluster/cluster.py | 5 +- 
tests/cli-test-case.yaml | 8 +- tests/unit_test.py | 114 +++++++++++------------ 4 files changed, 64 insertions(+), 68 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/define.py b/src/codeflare_sdk/cli/commands/define.py index 7c8d4476f..d28262902 100644 --- a/src/codeflare_sdk/cli/commands/define.py +++ b/src/codeflare_sdk/cli/commands/define.py @@ -21,11 +21,10 @@ def cli(): @click.option("--machine-types", cls=PythonLiteralOption, type=list) @click.option("--min-cpus", type=int) @click.option("--max-cpus", type=int) -@click.option("--min-worker", type=int) -@click.option("--max-worker", type=int) +@click.option("--num-workers", type=int) @click.option("--min-memory", type=int) @click.option("--max-memory", type=int) -@click.option("--gpu", type=int) +@click.option("--num-gpus", type=int) @click.option("--template", type=str) @click.option("--instascale", type=bool) @click.option("--envs", cls=PythonLiteralOption, type=dict) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 1ba9e41d8..c2cfc1277 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -384,8 +384,7 @@ def from_definition_yaml(yaml_path): name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], machine_types=machine_types, - min_worker=worker_group_specs["minReplicas"], - max_worker=worker_group_specs["maxReplicas"], + num_workers=worker_group_specs["minReplicas"], min_cpus=worker_group_specs["template"]["spec"]["containers"][0][ "resources" ]["requests"]["cpu"], @@ -402,7 +401,7 @@ def from_definition_yaml(yaml_path): "resources" ]["limits"]["memory"][:-1] ), - gpu=worker_group_specs["template"]["spec"]["containers"][0][ + num_gpus=worker_group_specs["template"]["spec"]["containers"][0][ "resources" ]["requests"]["nvidia.com/gpu"], instascale=True if machine_types else False, diff --git a/tests/cli-test-case.yaml b/tests/cli-test-case.yaml index c312abfaa..41f62b97d 100644 --- 
a/tests/cli-test-case.yaml +++ b/tests/cli-test-case.yaml @@ -23,7 +23,7 @@ spec: cpu: 4 memory: 6G nvidia.com/gpu: 7 - replicas: 2 + replicas: 1 requests: cpu: 3 memory: 5G @@ -112,12 +112,12 @@ spec: rayVersion: 2.1.0 workerGroupSpecs: - groupName: small-group-cli-test-cluster - maxReplicas: 2 - minReplicas: 2 + maxReplicas: 1 + minReplicas: 1 rayStartParams: block: 'true' num-gpus: '7' - replicas: 2 + replicas: 1 template: metadata: annotations: diff --git a/tests/unit_test.py b/tests/unit_test.py index 6619c825c..8f3e318da 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -154,13 +154,12 @@ def test_cluster_definition_cli(mocker): define raycluster --name=cli-test-cluster --namespace=default - --min-worker=1 - --max-worker=2 + --num-workers=1 --min-cpus=3 --max-cpus=4 --min-memory=5 --max-memory=6 - --gpu=7 + --num-gpus=7 --instascale=True --machine-types='["cpu.small", "gpu.large"]' --image-pull-secrets='["cli-test-pull-secret"]' @@ -233,24 +232,24 @@ def test_raycluster_details_cli(mocker): """ result = runner.invoke(cli, raycluster_details_command) assert result.output == ( - " 🚀 CodeFlare Cluster Details 🚀 \n" - " \n" - " ╭──────────────────────────────────────────────────────────────╮ \n" - " │ Name │ \n" - " │ quicktest Inactive ❌ │ \n" - " │ │ \n" - " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" - " │ │ \n" - " │ Dashboard🔗 │ \n" - " │ │ \n" - " │ Cluster Resources │ \n" - " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" - " │ │ Min Max │ │ Memory CPU GPU │ │ \n" - " │ │ │ │ │ │ \n" - " │ │ 1 1 │ │ 2~2 1 0 │ │ \n" - " │ │ │ │ │ │ \n" - " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" - " ╰──────────────────────────────────────────────────────────────╯ \n" + " 🚀 CodeFlare Cluster Details 🚀 \n" + " \n" + " ╭───────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ quicktest Inactive ❌ │ \n" + " │ │ \n" + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ 
Dashboard🔗 │ \n" + " │ │ \n" + " │ Cluster Resources │ \n" + " │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + " │ │ # Workers │ │ Memory CPU GPU │ │ \n" + " │ │ │ │ │ │ \n" + " │ │ 1 │ │ 2~2 1 0 │ │ \n" + " │ │ │ │ │ │ \n" + " │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n" + " ╰───────────────────────────────────────────────────────────────╯ \n" ) @@ -277,7 +276,6 @@ def test_raycluster_status_cli(mocker): "1", "1", 1, - 1, "default", "dashboard-url", ) @@ -320,24 +318,24 @@ def test_raycluster_list_cli(mocker): """ result = runner.invoke(cli, list_rayclusters_command) assert result.output == ( - " 🚀 CodeFlare Cluster Details 🚀 \n" - " \n" - " ╭──────────────────────────────────────────────────────────────╮ \n" - " │ Name │ \n" - " │ quicktest Active ✅ │ \n" - " │ │ \n" - " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" - " │ │ \n" - " │ Dashboard🔗 │ \n" - " │ │ \n" - " │ Cluster Resources │ \n" - " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" - " │ │ Min Max │ │ Memory CPU GPU │ │ \n" - " │ │ │ │ │ │ \n" - " │ │ 1 1 │ │ 2G~2G 1 0 │ │ \n" - " │ │ │ │ │ │ \n" - " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" - " ╰──────────────────────────────────────────────────────────────╯ \n" + " 🚀 CodeFlare Cluster Details 🚀 \n" + " \n" + " ╭───────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ quicktest Active ✅ │ \n" + " │ │ \n" + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " │ Cluster Resources │ \n" + " │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + " │ │ # Workers │ │ Memory CPU GPU │ │ \n" + " │ │ │ │ │ │ \n" + " │ │ 1 │ │ 2G~2G 1 0 │ │ \n" + " │ │ │ │ │ │ \n" + " │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n" + " ╰───────────────────────────────────────────────────────────────╯ \n" ) @@ -349,24 +347,24 @@ def test_list_clusters_all_namespaces(mocker, capsys): 
list_clusters_all_namespaces() captured = capsys.readouterr() assert captured.out == ( - " 🚀 CodeFlare Cluster Details 🚀 \n" - " \n" - " ╭──────────────────────────────────────────────────────────────╮ \n" - " │ Name │ \n" - " │ quicktest Active ✅ │ \n" - " │ │ \n" - " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" - " │ │ \n" - " │ Dashboard🔗 │ \n" - " │ │ \n" - " │ Cluster Resources │ \n" - " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" - " │ │ Min Max │ │ Memory CPU GPU │ │ \n" - " │ │ │ │ │ │ \n" - " │ │ 1 1 │ │ 2G~2G 1 0 │ │ \n" - " │ │ │ │ │ │ \n" - " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" - " ╰──────────────────────────────────────────────────────────────╯ \n" + " 🚀 CodeFlare Cluster Details 🚀 \n" + " \n" + " ╭───────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ quicktest Active ✅ │ \n" + " │ │ \n" + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " │ Cluster Resources │ \n" + " │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + " │ │ # Workers │ │ Memory CPU GPU │ │ \n" + " │ │ │ │ │ │ \n" + " │ │ 1 │ │ 2G~2G 1 0 │ │ \n" + " │ │ │ │ │ │ \n" + " │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n" + " ╰───────────────────────────────────────────────────────────────╯ \n" ) From 27c24a604c57bd9db6f645a87ea6bd93fcfcf5d3 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 11 Aug 2023 13:28:01 -0700 Subject: [PATCH 47/47] refactor unit tests --- tests/unit_test.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index 8f3e318da..0b469b226 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -165,10 +165,7 @@ def test_cluster_definition_cli(mocker): --image-pull-secrets='["cli-test-pull-secret"]' """ result = runner.invoke(cli, define_cluster_command) - assert ( - result.output - == "Written to: cli-test-cluster.yaml\n" - 
) + assert result.output == "Written to: cli-test-cluster.yaml\n" assert filecmp.cmp( "cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=True ) @@ -187,7 +184,10 @@ def test_cluster_submission_cli(mocker): result = runner.invoke(cli, submit_cluster_command) assert result.exit_code == 0 - assert result.output == "Written to: cli-test-cluster.yaml\nCluster submitted successfully\n" + assert ( + result.output + == "Written to: cli-test-cluster.yaml\nCluster submitted successfully\n" + ) def test_cluster_deletion_cli(mocker): @@ -225,7 +225,9 @@ def test_raycluster_details_cli(mocker): "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", return_value="", ) - mocker.patch("codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns") + mocker.patch( + "codeflare_sdk.cli.codeflare_cli.get_current_namespace", return_value="ns" + ) mocker.patch.object(client, "ApiClient") raycluster_details_command = """ details raycluster quicktest --namespace=default