From f4639ddc809972c61d4ea718eb9adaa46567bb40 Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Mon, 26 Feb 2024 14:31:41 +0000 Subject: [PATCH 1/2] Made Kueue the default queueing strategy Updated oauth test to have mcad=True Changed .codeflare/appwrappers to .codeflare/resources Addressed comments & added SUSPENDED status Review changes & list_cluster functions Updated tests and load_components Update tests, Rebase --- src/codeflare_sdk/__init__.py | 2 + src/codeflare_sdk/cluster/__init__.py | 8 +- src/codeflare_sdk/cluster/cluster.py | 43 ++- src/codeflare_sdk/cluster/config.py | 4 +- src/codeflare_sdk/cluster/model.py | 2 + src/codeflare_sdk/utils/generate_yaml.py | 62 ++- src/codeflare_sdk/utils/pretty_print.py | 24 ++ tests/e2e/mnist_raycluster_sdk_oauth_test.py | 1 + tests/e2e/mnist_raycluster_sdk_test.py | 1 + tests/e2e/start_ray_cluster.py | 1 + tests/test-case-no-mcad.yamls | 2 +- tests/unit_test.py | 387 ++++++++++++++++++- tests/unit_test_support.py | 1 + 13 files changed, 515 insertions(+), 23 deletions(-) diff --git a/src/codeflare_sdk/__init__.py b/src/codeflare_sdk/__init__.py index 0ed41d15b..86b6da880 100644 --- a/src/codeflare_sdk/__init__.py +++ b/src/codeflare_sdk/__init__.py @@ -12,6 +12,8 @@ RayCluster, AppWrapper, get_cluster, + list_all_queued, + list_all_clusters, ) from .job import JobDefinition, Job, DDPJobDefinition, DDPJob, RayJobClient diff --git a/src/codeflare_sdk/cluster/__init__.py b/src/codeflare_sdk/cluster/__init__.py index 419561d7f..0b1849e51 100644 --- a/src/codeflare_sdk/cluster/__init__.py +++ b/src/codeflare_sdk/cluster/__init__.py @@ -13,6 +13,12 @@ AppWrapper, ) -from .cluster import Cluster, ClusterConfiguration, get_cluster +from .cluster import ( + Cluster, + ClusterConfiguration, + get_cluster, + list_all_queued, + list_all_clusters, +) from .awload import AWManager diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 707ea61d2..baf1e9746 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -189,6 +189,7 @@ def create_app_wrapper(self): dispatch_priority = self.config.dispatch_priority write_to_file = self.config.write_to_file verify_tls = self.config.verify_tls + local_queue = self.config.local_queue return generate_appwrapper( name=name, namespace=namespace, @@ -213,6 +214,7 @@ def create_app_wrapper(self): priority_val=priority_val, write_to_file=write_to_file, verify_tls=verify_tls, + local_queue=local_queue, ) # creates a new cluster with the provided or default spec @@ -319,6 +321,9 @@ def status( # check the ray cluster status cluster = _ray_cluster_status(self.config.name, self.config.namespace) if cluster: + if cluster.status == RayClusterStatus.SUSPENDED: + ready = False + status = CodeFlareClusterStatus.SUSPENDED if cluster.status == RayClusterStatus.UNKNOWN: ready = False status = CodeFlareClusterStatus.STARTING @@ -588,17 +593,24 @@ def list_all_clusters(namespace: str, print_to_console: bool = True): return clusters -def list_all_queued(namespace: str, print_to_console: bool = True): +def list_all_queued(namespace: str, print_to_console: bool = True, mcad: bool = False): """ - Returns (and prints by default) a list of all currently queued-up AppWrappers + Returns (and prints by default) a list of all currently queued-up Ray Clusters or AppWrappers in a given namespace. """ - app_wrappers = _get_app_wrappers( - namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING] - ) - if print_to_console: - pretty_print.print_app_wrappers_status(app_wrappers) - return app_wrappers + if mcad: + resources = _get_app_wrappers( + namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING] + ) + if print_to_console: + pretty_print.print_app_wrappers_status(resources) + else: + resources = _get_ray_clusters( + namespace, filter=[RayClusterStatus.READY, RayClusterStatus.SUSPENDED] + ) + if print_to_console: + pretty_print.print_ray_clusters_status(resources) + return resources def get_current_namespace(): # pragma: no cover @@ -798,7 +810,9 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: return None -def _get_ray_clusters(namespace="default") -> List[RayCluster]: +def _get_ray_clusters( + namespace="default", filter: Optional[List[RayClusterStatus]] = None +) -> List[RayCluster]: list_of_clusters = [] try: config_check() @@ -812,8 +826,15 @@ def _get_ray_clusters(namespace="default") -> List[RayCluster]: except Exception as e: # pragma: no cover return _kube_api_error_handling(e) - for rc in rcs["items"]: - list_of_clusters.append(_map_to_ray_cluster(rc)) + # Get a list of RCs with the filter if it is passed to the function + if filter is not None: + for rc in rcs["items"]: + ray_cluster = _map_to_ray_cluster(rc) + if filter and ray_cluster.status in filter: + list_of_clusters.append(ray_cluster) + else: + for rc in rcs["items"]: + list_of_clusters.append(_map_to_ray_cluster(rc)) return list_of_clusters diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index 7156495ff..f6bcac89c 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -46,7 +46,7 @@ class ClusterConfiguration: num_gpus: int = 0 template: str = f"{dir}/templates/base-template.yaml" instascale: bool = False - mcad: bool = True + mcad: bool = False envs: dict = field(default_factory=dict) image: str = "" local_interactive: bool = False @@ -60,3 +60,5 @@ def __post_init__(self): print( "Warning: TLS verification has been disabled - Endpoint checks will be bypassed" ) + + local_queue: str = None diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py index 2e1abaf75..e2dcb6522 100644 --- a/src/codeflare_sdk/cluster/model.py +++ b/src/codeflare_sdk/cluster/model.py @@ -32,6 +32,7 @@ class RayClusterStatus(Enum): UNHEALTHY = "unhealthy" FAILED = "failed" UNKNOWN = "unknown" + SUSPENDED = "suspended" class AppWrapperStatus(Enum): @@ -59,6 +60,7 @@ class CodeFlareClusterStatus(Enum): QUEUEING = 4 FAILED = 5 UNKNOWN = 6 + SUSPENDED = 7 @dataclass diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 7f14b5ba9..2088b9102 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -17,6 +17,7 @@ (in the cluster sub-module) for AppWrapper generation. """ +from typing import Optional import typing import yaml import sys @@ -460,7 +461,35 @@ def _create_oauth_sidecar_object( ) -def write_components(user_yaml: dict, output_file_name: str): +def get_default_kueue_name(namespace: str): + # If the local queue is set, use it. Otherwise, try to use the default queue. + try: + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + local_queues = api_instance.list_namespaced_custom_object( + group="kueue.x-k8s.io", + version="v1beta1", + namespace=namespace, + plural="localqueues", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + for lq in local_queues["items"]: + if ( + "annotations" in lq["metadata"] + and "kueue.x-k8s.io/default-queue" in lq["metadata"]["annotations"] + and lq["metadata"]["annotations"]["kueue.x-k8s.io/default-queue"].lower() + == "true" + ): + return lq["metadata"]["name"] + raise ValueError( + "Default Local Queue with kueue.x-k8s.io/default-queue: true annotation not found please create a default Local Queue or provide the local_queue name in Cluster Configuration" + ) + + +def write_components( + user_yaml: dict, output_file_name: str, namespace: str, local_queue: Optional[str] +): # Create the directory if it doesn't exist directory_path = os.path.dirname(output_file_name) if not os.path.exists(directory_path): @@ -468,9 +497,19 @@ def write_components(user_yaml: dict, output_file_name: str): components = user_yaml.get("spec", "resources")["resources"].get("GenericItems") open(output_file_name, "w").close() + lq_name = local_queue or get_default_kueue_name(namespace) with open(output_file_name, "a") as outfile: for component in components: if "generictemplate" in component: + if ( + "workload.codeflare.dev/appwrapper" + in component["generictemplate"]["metadata"]["labels"] + ): + del component["generictemplate"]["metadata"]["labels"][ + "workload.codeflare.dev/appwrapper" + ] + labels = component["generictemplate"]["metadata"]["labels"] + labels.update({"kueue.x-k8s.io/queue-name": lq_name}) outfile.write("---\n") yaml.dump( component["generictemplate"], outfile, default_flow_style=False @@ -478,11 +517,23 @@ def write_components(user_yaml: dict, output_file_name: str): print(f"Written to: {output_file_name}") -def load_components(user_yaml: dict, name: str): +def load_components( + user_yaml: dict, name: str, namespace: str, local_queue: Optional[str] +): component_list = [] components = user_yaml.get("spec", "resources")["resources"].get("GenericItems") + lq_name = local_queue or get_default_kueue_name(namespace) for component in components: if "generictemplate" in component: + if ( + "workload.codeflare.dev/appwrapper" + in component["generictemplate"]["metadata"]["labels"] + ): + del component["generictemplate"]["metadata"]["labels"][ + "workload.codeflare.dev/appwrapper" + ] + labels = component["generictemplate"]["metadata"]["labels"] + labels.update({"kueue.x-k8s.io/queue-name": lq_name}) component_list.append(component["generictemplate"]) resources = "---\n" + "---\n".join( @@ -523,6 +574,7 @@ def generate_appwrapper( priority_val: int, write_to_file: bool, verify_tls: bool, + local_queue: Optional[str], ): user_yaml = read_template(template) appwrapper_name, cluster_name = gen_names(name) @@ -575,18 +627,18 @@ def generate_appwrapper( if is_openshift_cluster(): enable_openshift_oauth(user_yaml, cluster_name, namespace) - directory_path = os.path.expanduser("~/.codeflare/appwrapper/") + directory_path = os.path.expanduser("~/.codeflare/resources/") outfile = os.path.join(directory_path, appwrapper_name + ".yaml") if write_to_file: if mcad: write_user_appwrapper(user_yaml, outfile) else: - write_components(user_yaml, outfile) + write_components(user_yaml, outfile, namespace, local_queue) return outfile else: if mcad: user_yaml = load_appwrapper(user_yaml, name) else: - user_yaml = load_components(user_yaml, name) + user_yaml = load_components(user_yaml, name, namespace, local_queue) return user_yaml diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index 0bdc185de..74678ecc3 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -56,6 +56,30 @@ def print_app_wrappers_status(app_wrappers: List[AppWrapper], starting: bool = F console.print(Panel.fit(table)) +def print_ray_clusters_status(app_wrappers: List[AppWrapper], starting: bool = False): + if not app_wrappers: + print_no_resources_found() + return # shortcircuit + + console = Console() + table = Table( + box=box.ASCII_DOUBLE_HEAD, + title="[bold] :rocket: Cluster Queue Status :rocket:", + ) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("Status", style="magenta") + + for app_wrapper in app_wrappers: + name = app_wrapper.name + status = app_wrapper.status.value + if starting: + status += " (starting)" + table.add_row(name, status) + table.add_row("") # empty row for spacing + + console.print(Panel.fit(table)) + + def print_cluster_status(cluster: RayCluster): "Pretty prints the status of a passed-in cluster" if not cluster: diff --git a/tests/e2e/mnist_raycluster_sdk_oauth_test.py b/tests/e2e/mnist_raycluster_sdk_oauth_test.py index 708a389d5..90bec08df 100644 --- a/tests/e2e/mnist_raycluster_sdk_oauth_test.py +++ b/tests/e2e/mnist_raycluster_sdk_oauth_test.py @@ -52,6 +52,7 @@ def run_mnist_raycluster_sdk_oauth(self): instascale=False, image=ray_image, write_to_file=True, + mcad=True, ) ) diff --git a/tests/e2e/mnist_raycluster_sdk_test.py b/tests/e2e/mnist_raycluster_sdk_test.py index b98b860b0..a38cb48df 100644 --- a/tests/e2e/mnist_raycluster_sdk_test.py +++ b/tests/e2e/mnist_raycluster_sdk_test.py @@ -52,6 +52,7 @@ def run_mnist_raycluster_sdk(self): instascale=False, image=ray_image, write_to_file=True, + mcad=True, ) ) diff --git a/tests/e2e/start_ray_cluster.py b/tests/e2e/start_ray_cluster.py index f4cf7e73a..8bb185808 100644 --- a/tests/e2e/start_ray_cluster.py +++ b/tests/e2e/start_ray_cluster.py @@ -22,6 +22,7 @@ num_gpus=0, instascale=False, image=ray_image, + mcad=True, ) ) diff --git a/tests/test-case-no-mcad.yamls b/tests/test-case-no-mcad.yamls index 997457604..e6bbcdd27 100644 --- a/tests/test-case-no-mcad.yamls +++ b/tests/test-case-no-mcad.yamls @@ -6,7 +6,7 @@ metadata: sdk.codeflare.dev/local_interactive: 'False' labels: controller-tools.k8s.io: '1.0' - workload.codeflare.dev/appwrapper: unit-test-cluster-ray + kueue.x-k8s.io/queue-name: local-queue-default name: unit-test-cluster-ray namespace: ns spec: diff --git a/tests/unit_test.py b/tests/unit_test.py index 6831ea651..b25d3dd0c 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -24,7 +24,7 @@ from codeflare_sdk.cluster import cluster parent = Path(__file__).resolve().parents[1] -aw_dir = os.path.expanduser("~/.codeflare/appwrapper/") +aw_dir = os.path.expanduser("~/.codeflare/resources/") sys.path.append(str(parent) + "/src") from kubernetes import client, config, dynamic @@ -299,8 +299,59 @@ def test_create_app_wrapper_raises_error_with_no_image(): ), "Error message did not match expected output." +def get_local_queue(group, version, namespace, plural): + assert group == "kueue.x-k8s.io" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "localqueues" + local_queues = { + "apiVersion": "kueue.x-k8s.io/v1beta1", + "items": [ + { + "apiVersion": "kueue.x-k8s.io/v1beta1", + "kind": "LocalQueue", + "metadata": { + "annotations": {"kueue.x-k8s.io/default-queue": "true"}, + "name": "local-queue-default", + "namespace": "ns", + }, + "spec": {"clusterQueue": "cluster-queue"}, + } + ], + "kind": "LocalQueueList", + "metadata": {"continue": "", "resourceVersion": "2266811"}, + } + return local_queues + + def test_cluster_creation_no_mcad(mocker): + # Create Ray Cluster with no local queue specified + mocker.patch("kubernetes.client.ApisApi.get_api_versions") + mocker.patch( + "kubernetes.client.CustomObjectsApi.get_cluster_custom_object", + return_value={"spec": {"domain": "apps.cluster.awsroute.org"}}, + ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) + config = createClusterConfig() + config.name = "unit-test-cluster-ray" + config.write_to_file = True + config.mcad = False + cluster = Cluster(config) + assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster-ray.yaml" + assert cluster.app_wrapper_name == "unit-test-cluster-ray" + assert filecmp.cmp( + f"{aw_dir}unit-test-cluster-ray.yaml", + f"{parent}/tests/test-case-no-mcad.yamls", + shallow=True, + ) + + +def test_cluster_creation_no_mcad_local_queue(mocker): # With written resources + # Create Ray Cluster with local queue specified mocker.patch("kubernetes.client.ApisApi.get_api_versions") mocker.patch( "kubernetes.client.CustomObjectsApi.get_cluster_custom_object", @@ -310,6 +361,7 @@ def test_cluster_creation_no_mcad(mocker): config.name = "unit-test-cluster-ray" config.mcad = False config.write_to_file = True + config.local_queue = "local-queue-default" cluster = Cluster(config) assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster-ray.yaml" assert cluster.app_wrapper_name == "unit-test-cluster-ray" @@ -334,6 +386,7 @@ def test_cluster_creation_no_mcad(mocker): image="quay.io/project-codeflare/ray:latest-py39-cu118", write_to_file=False, mcad=False, + local_queue="local-queue-default", ) cluster = Cluster(config) test_resources = [] @@ -383,6 +436,7 @@ def test_default_cluster_creation(mocker): default_config = ClusterConfiguration( name="unit-test-default-cluster", image="quay.io/project-codeflare/ray:latest-py39-cu118", + mcad=True, ) cluster = Cluster(default_config) test_aw = yaml.safe_load(cluster.app_wrapper_yaml) @@ -494,6 +548,10 @@ def test_cluster_up_down(mocker): def test_cluster_up_down_no_mcad(mocker): + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) mocker.patch("kubernetes.client.ApisApi.get_api_versions") mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch( @@ -832,6 +890,7 @@ def test_ray_details(mocker, capsys): namespace="ns", image="quay.io/project-codeflare/ray:latest-py39-cu118", write_to_file=True, + mcad=True, ) ) captured = capsys.readouterr() @@ -1389,7 +1448,263 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "observedGeneration": 1, "state": "ready", }, - } + }, + { + "apiVersion": "ray.io/v1", + "kind": "RayCluster", + "metadata": { + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 1, + "labels": { + "workload.codeflare.dev/appwrapper": "quicktest2", + "controller-tools.k8s.io": "1.0", + "resourceName": "quicktest2", + "orderedinstance": "m4.xlarge_g4dn.xlarge", + }, + "managedFields": [ + { + "apiVersion": "ray.io/v1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:labels": { + ".": {}, + "f:workload.codeflare.dev/appwrapper": {}, + "f:controller-tools.k8s.io": {}, + "f:resourceName": {}, + }, + "f:ownerReferences": { + ".": {}, + 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, + }, + }, + "f:spec": { + ".": {}, + "f:autoscalerOptions": { + ".": {}, + "f:idleTimeoutSeconds": {}, + "f:imagePullPolicy": {}, + "f:resources": { + ".": {}, + "f:limits": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + "f:requests": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + }, + "f:upscalingMode": {}, + }, + "f:enableInTreeAutoscaling": {}, + "f:headGroupSpec": { + ".": {}, + "f:rayStartParams": { + ".": {}, + "f:block": {}, + "f:dashboard-host": {}, + "f:num-gpus": {}, + }, + "f:serviceType": {}, + "f:template": { + ".": {}, + "f:spec": {".": {}, "f:containers": {}}, + }, + }, + "f:rayVersion": {}, + "f:workerGroupSpecs": {}, + }, + }, + "manager": "mcad-controller", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + { + "apiVersion": "ray.io/v1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:status": { + ".": {}, + "f:availableWorkerReplicas": {}, + "f:desiredWorkerReplicas": {}, + "f:endpoints": { + ".": {}, + "f:client": {}, + "f:dashboard": {}, + "f:gcs": {}, + }, + "f:lastUpdateTime": {}, + "f:maxWorkerReplicas": {}, + "f:minWorkerReplicas": {}, + "f:state": {}, + } + }, + "manager": "manager", + "operation": "Update", + "subresource": "status", + "time": "2023-02-22T16:26:16Z", + }, + ], + "name": "quicktest2", + "namespace": "ns", + "ownerReferences": [ + { + "apiVersion": "workload.codeflare.dev/v1beta1", + "blockOwnerDeletion": True, + "controller": True, + "kind": "AppWrapper", + "name": "quicktest2", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + } + ], + "resourceVersion": "9482407", + "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": {"cpu": "500m", "memory": "512Mi"}, + "requests": {"cpu": "500m", "memory": "512Mi"}, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { + "rayStartParams": { + "block": "true", + "dashboard-host": "0.0.0.0", + "num-gpus": "0", + }, + "serviceType": "ClusterIP", + "template": { + "spec": { + "containers": [ + { + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + "protocol": "TCP", + }, + { + "containerPort": 8265, + "name": "dashboard", + "protocol": "TCP", + }, + { + "containerPort": 10001, + "name": "client", + "protocol": "TCP", + }, + ], + "resources": { + "limits": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + }, + } + ] + } + }, + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest2", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": {"block": "true", "num-gpus": "0"}, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + }, + }, + } + ], + }, + "status": { + "availableWorkerReplicas": 2, + "desiredWorkerReplicas": 1, + "endpoints": { + "client": "10001", + "dashboard": "8265", + "gcs": "6379", + }, + "lastUpdateTime": "2023-02-22T16:26:16Z", + "maxWorkerReplicas": 1, + "minWorkerReplicas": 1, + "state": "suspended", + }, + }, ] } return api_obj @@ -2117,6 +2432,8 @@ def custom_side_effect(group, version, namespace, plural, **kwargs): return get_ray_obj("ray.io", "v1", "ns", "rayclusters") elif plural == "appwrappers": return get_aw_obj("workload.codeflare.dev", "v1beta1", "ns", "appwrappers") + elif plural == "localqueues": + return get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues") mocker.patch( "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", get_aw_obj @@ -2296,6 +2613,22 @@ def test_list_clusters(mocker, capsys): " │ │ │ │ │ │ \n" " │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n" " ╰───────────────────────────────────────────────────────────────╯ \n" + "╭───────────────────────────────────────────────────────────────╮\n" + "│ Name │\n" + "│ quicktest2 Inactive ❌ │\n" + "│ │\n" + "│ URI: ray://quicktest2-head-svc.ns.svc:10001 │\n" + "│ │\n" + "│ Dashboard🔗 │\n" + "│ │\n" + "│ Cluster Resources │\n" + "│ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │\n" + "│ │ # Workers │ │ Memory CPU GPU │ │\n" + "│ │ │ │ │ │\n" + "│ │ 1 │ │ 2G~2G 1 0 │ │\n" + "│ │ │ │ │ │\n" + "│ ╰─────────────╯ ╰──────────────────────────────────────╯ │\n" + "╰───────────────────────────────────────────────────────────────╯\n" ) @@ -2305,7 +2638,7 @@ def test_list_queue(mocker, capsys): "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_obj_none, ) - list_all_queued("ns") + list_all_queued("ns", mcad=True) captured = capsys.readouterr() assert captured.out == ( "╭──────────────────────────────────────────────────────────────────────────────╮\n" @@ -2316,7 +2649,7 @@ def test_list_queue(mocker, capsys): "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_aw_obj, ) - list_all_queued("ns") + list_all_queued("ns", mcad=True) captured = capsys.readouterr() assert captured.out == ( "╭──────────────────────────╮\n" @@ -2334,6 +2667,49 @@ def test_list_queue(mocker, capsys): ) +def test_list_queue_rayclusters(mocker, capsys): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mock_api = MagicMock() + mock_api.get_api_versions.return_value.groups = [ + MagicMock(versions=[MagicMock(group_version="route.openshift.io/v1")]) + ] + mocker.patch("kubernetes.client.ApisApi", return_value=mock_api) + + assert is_openshift_cluster() == True + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_obj_none, + ) + list_all_queued("ns") + captured = capsys.readouterr() + assert captured.out == ( + "╭──────────────────────────────────────────────────────────────────────────────╮\n" + "│ No resources found, have you run cluster.up() yet? │\n" + "╰──────────────────────────────────────────────────────────────────────────────╯\n" + ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + list_all_queued("ns") + captured = capsys.readouterr() + print(captured.out) + assert captured.out == ( + "╭────────────────────────────╮\n" + "│ 🚀 Cluster Queue Status │\n" + "│ 🚀 │\n" + "│ +------------+-----------+ │\n" + "│ | Name | Status | │\n" + "│ +============+===========+ │\n" + "│ | quicktest | ready | │\n" + "│ | | | │\n" + "│ | quicktest2 | suspended | │\n" + "│ | | | │\n" + "│ +------------+-----------+ │\n" + "╰────────────────────────────╯\n" + ) + + def test_cluster_status(mocker): mocker.patch("kubernetes.client.ApisApi.get_api_versions") mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") @@ -2360,6 +2736,7 @@ def test_cluster_status(mocker): namespace="ns", image="quay.io/project-codeflare/ray:latest-py39-cu118", write_to_file=True, + mcad=True, ) ) mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) @@ -2454,6 +2831,7 @@ def test_wait_ready(mocker, capsys): namespace="ns", image="quay.io/project-codeflare/ray:latest-py39-cu118", write_to_file=True, + mcad=True, ) ) try: @@ -3125,6 +3503,7 @@ def test_gen_app_wrapper_with_oauth(mocker: MockerFixture): "test_cluster", image="quay.io/project-codeflare/ray:latest-py39-cu118", write_to_file=True, + mcad=True, ) ) user_yaml = write_user_appwrapper.call_args.args[0] diff --git a/tests/unit_test_support.py b/tests/unit_test_support.py index 31328338e..190c4f1a9 100644 --- a/tests/unit_test_support.py +++ b/tests/unit_test_support.py @@ -43,6 +43,7 @@ def createClusterConfig(): min_memory=5, max_memory=6, num_gpus=7, + mcad=True, instascale=True, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], From b06a8715c7218d0653bf1cef9a23359c8368a093 Mon Sep 17 00:00:00 2001 From: Mark Campbell Date: Fri, 5 Apr 2024 17:35:56 +0100 Subject: [PATCH 2/2] Update src/codeflare_sdk/cluster/cluster.py Co-authored-by: Antonin Stefanutti --- src/codeflare_sdk/cluster/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index baf1e9746..be59c5c6c 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -595,7 +595,7 @@ def list_all_clusters(namespace: str, print_to_console: bool = True): def list_all_queued(namespace: str, print_to_console: bool = True, mcad: bool = False): """ - Returns (and prints by default) a list of all currently queued-up Ray Clusters or AppWrappers + Returns (and prints by default) a list of all currently queued-up Ray Clusters in a given namespace. """ if mcad: