From 5691a20dc7ccd158b0f4fa483b5c505da62dd850 Mon Sep 17 00:00:00 2001
From: Kevin
Date: Tue, 23 Apr 2024 15:32:00 -0400
Subject: [PATCH] use string for storing mem resources

Signed-off-by: Kevin
---
 src/codeflare_sdk/cluster/cluster.py     | 34 ++++++++++--------------
 src/codeflare_sdk/cluster/config.py      | 31 ++++++++++++++++-----
 src/codeflare_sdk/utils/generate_yaml.py | 12 ++++-----
 src/codeflare_sdk/utils/pretty_print.py  |  2 +-
 tests/unit_test.py                       | 16 +++++------
 5 files changed, 54 insertions(+), 41 deletions(-)

diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index 295332ae4..eef0ae51d 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -18,6 +18,7 @@
 cluster setup queue, a list of all existing clusters, and the user's working namespace.
 """
 
+import re
 from time import sleep
 from typing import List, Optional, Tuple, Dict
 
@@ -41,6 +42,7 @@
     RayClusterStatus,
 )
 from kubernetes import client, config
+from kubernetes.utils import parse_quantity
 import yaml
 import os
 import requests
@@ -488,26 +490,18 @@ def from_k8_cluster_object(
             namespace=rc["metadata"]["namespace"],
             machine_types=machine_types,
             num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
-            min_cpus=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["requests"]["cpu"]
-            ),
-            max_cpus=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["limits"]["cpu"]
-            ),
-            min_memory=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["requests"]["memory"][:-1]
-            ),
-            max_memory=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["limits"]["memory"][:-1]
-            ),
+            min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["requests"]["cpu"],
+            max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["limits"]["cpu"],
+            min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["requests"]["memory"],
+            max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["limits"]["memory"],
             num_gpus=int(
                 rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
                     "resources"
diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py
index e4d046f93..064b51cd4 100644
--- a/src/codeflare_sdk/cluster/config.py
+++ b/src/codeflare_sdk/cluster/config.py
@@ -20,6 +20,7 @@
 
 from dataclasses import dataclass, field
 import pathlib
+import typing
 
 dir = pathlib.Path(__file__).parent.parent.resolve()
 
@@ -34,15 +35,15 @@ class ClusterConfiguration:
     name: str
     namespace: str = None
     head_info: list = field(default_factory=list)
-    head_cpus: int = 2
-    head_memory: int = 8
+    head_cpus: typing.Union[int, str] = 2
+    head_memory: typing.Union[int, str] = 8
     head_gpus: int = 0
     machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
-    min_cpus: int = 1
-    max_cpus: int = 1
+    min_cpus: typing.Union[int, str] = 1
+    max_cpus: typing.Union[int, str] = 1
     num_workers: int = 1
-    min_memory: int = 2
-    max_memory: int = 2
+    min_memory: typing.Union[int, str] = 2
+    max_memory: typing.Union[int, str] = 2
     num_gpus: int = 0
     template: str = f"{dir}/templates/base-template.yaml"
     instascale: bool = False
@@ -59,5 +60,23 @@ def __post_init__(self):
             print(
                 "Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
             )
+        self._memory_to_string()
+        self._str_mem_no_unit_add_GB()
+
+    def _str_mem_no_unit_add_GB(self):
+        if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
+            self.head_memory = f"{self.head_memory}G"
+        if isinstance(self.min_memory, str) and self.min_memory.isdecimal():
+            self.min_memory = f"{self.min_memory}G"
+        if isinstance(self.max_memory, str) and self.max_memory.isdecimal():
+            self.max_memory = f"{self.max_memory}G"
+
+    def _memory_to_string(self):
+        if isinstance(self.head_memory, int):
+            self.head_memory = f"{self.head_memory}G"
+        if isinstance(self.min_memory, int):
+            self.min_memory = f"{self.min_memory}G"
+        if isinstance(self.max_memory, int):
+            self.max_memory = f"{self.max_memory}G"
 
     local_queue: str = None
diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py
index 95c3d04f0..97dda5ba7 100755
--- a/src/codeflare_sdk/utils/generate_yaml.py
+++ b/src/codeflare_sdk/utils/generate_yaml.py
@@ -140,8 +140,8 @@ def update_custompodresources(
                 # Leave head node resources as template default
                 resource["requests"]["cpu"] = head_cpus
                 resource["limits"]["cpu"] = head_cpus
-                resource["requests"]["memory"] = str(head_memory) + "G"
-                resource["limits"]["memory"] = str(head_memory) + "G"
+                resource["requests"]["memory"] = head_memory
+                resource["limits"]["memory"] = head_memory
                 resource["requests"]["nvidia.com/gpu"] = head_gpus
                 resource["limits"]["nvidia.com/gpu"] = head_gpus
 
@@ -158,9 +158,9 @@
                                 resource[k][spec] = min_cpu
                         if spec == "memory":
                             if k == "limits":
-                                resource[k][spec] = str(max_memory) + "G"
+                                resource[k][spec] = max_memory
                             else:
-                                resource[k][spec] = str(min_memory) + "G"
+                                resource[k][spec] = min_memory
                         if spec == "nvidia.com/gpu":
                             if i == 0:
                                 resource[k][spec] = 0
@@ -213,12 +213,12 @@ def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
         requests = resource.get("resources").get("requests")
         if requests is not None:
             requests["cpu"] = min_cpu
-            requests["memory"] = str(min_memory) + "G"
+            requests["memory"] = min_memory
             requests["nvidia.com/gpu"] = gpu
         limits = resource.get("resources").get("limits")
         if limits is not None:
             limits["cpu"] = max_cpu
-            limits["memory"] = str(max_memory) + "G"
+            limits["memory"] = max_memory
             limits["nvidia.com/gpu"] = gpu
 
 
diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py
index 74678ecc3..42ef8398b 100644
--- a/src/codeflare_sdk/utils/pretty_print.py
+++ b/src/codeflare_sdk/utils/pretty_print.py
@@ -136,7 +136,7 @@ def print_clusters(clusters: List[RayCluster]):
         name = cluster.name
         dashboard = cluster.dashboard
         workers = str(cluster.workers)
-        memory = str(cluster.worker_mem_min) + "~" + str(cluster.worker_mem_max)
+        memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
         cpu = str(cluster.worker_cpu)
         gpu = str(cluster.worker_gpu)
 
diff --git a/tests/unit_test.py b/tests/unit_test.py
index 322449fbf..d46c4908e 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -248,7 +248,7 @@ def test_config_creation():
     assert config.name == "unit-test-cluster" and config.namespace == "ns"
     assert config.num_workers == 2
     assert config.min_cpus == 3 and config.max_cpus == 4
-    assert config.min_memory == 5 and config.max_memory == 6
+    assert config.min_memory == "5G" and config.max_memory == "6G"
     assert config.num_gpus == 7
     assert config.image == "quay.io/project-codeflare/ray:latest-py39-cu118"
     assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml"
@@ -849,8 +849,8 @@ def test_ray_details(mocker, capsys):
         name="raytest1",
         status=RayClusterStatus.READY,
         workers=1,
-        worker_mem_min=2,
-        worker_mem_max=2,
+        worker_mem_min="2G",
+        worker_mem_max="2G",
         worker_cpu=1,
         worker_gpu=0,
         namespace="ns",
@@ -909,7 +909,7 @@
         " │  ╭── Workers ──╮  ╭───────── Worker specs(each) ────────╮  │ \n"
         " │  │  # Workers  │  │  Memory      CPU         GPU         │  │ \n"
         " │  │             │  │                                      │  │ \n"
-        " │  │  1          │  │  2~2         1           0           │  │ \n"
+        " │  │  1          │  │  2G~2G       1           0           │  │ \n"
         " │  │             │  │                                      │  │ \n"
         " │  ╰─────────────╯  ╰──────────────────────────────────────╯  │ \n"
         " ╰─────────────────────────────────────────────────────────────╯ \n"
@@ -927,7 +927,7 @@
         " │  ╭── Workers ──╮  ╭───────── Worker specs(each) ────────╮  │ \n"
         " │  │  # Workers  │  │  Memory      CPU         GPU         │  │ \n"
         " │  │             │  │                                      │  │ \n"
-        " │  │  1          │  │  2~2         1           0           │  │ \n"
+        " │  │  1          │  │  2G~2G       1           0           │  │ \n"
         " │  │             │  │                                      │  │ \n"
         " │  ╰─────────────╯  ╰──────────────────────────────────────╯  │ \n"
         " ╰─────────────────────────────────────────────────────────────╯ \n"
@@ -943,7 +943,7 @@
         "│  ╭── Workers ──╮  ╭───────── Worker specs(each) ────────╮  │\n"
         "│  │  # Workers  │  │  Memory      CPU         GPU         │  │\n"
         "│  │             │  │                                      │  │\n"
-        "│  │  1          │  │  2~2         1           0           │  │\n"
+        "│  │  1          │  │  2G~2G       1           0           │  │\n"
         "│  │             │  │                                      │  │\n"
        "│  ╰─────────────╯  ╰──────────────────────────────────────╯  │\n"
        "╰─────────────────────────────────────────────────────────────╯\n"
@@ -2436,7 +2436,7 @@ def custom_side_effect(group, version, namespace, plural, **kwargs):
         and "g4dn.xlarge" in cluster_config.machine_types
     )
     assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
-    assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
+    assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
     assert cluster_config.num_gpus == 0
     assert (
         cluster_config.image
@@ -2468,7 +2468,7 @@ def test_get_cluster(mocker):
         and "g4dn.xlarge" in cluster_config.machine_types
     )
     assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
-    assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
+    assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
    assert cluster_config.num_gpus == 0
     assert cluster_config.instascale
     assert (