From 5691a20dc7ccd158b0f4fa483b5c505da62dd850 Mon Sep 17 00:00:00 2001
From: Kevin
Date: Tue, 23 Apr 2024 15:32:00 -0400
Subject: [PATCH] use string for storing mem resources

Signed-off-by: Kevin
---
 src/codeflare_sdk/cluster/cluster.py     | 34 ++++++++++--------------
 src/codeflare_sdk/cluster/config.py      | 31 ++++++++++++++++-----
 src/codeflare_sdk/utils/generate_yaml.py | 12 ++++-----
 src/codeflare_sdk/utils/pretty_print.py  |  2 +-
 tests/unit_test.py                       | 16 +++++------
 5 files changed, 54 insertions(+), 41 deletions(-)

diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index 295332ae4..eef0ae51d 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -18,6 +18,7 @@
 cluster setup queue, a list of all existing clusters, and the user's working namespace.
 """
 
+import re
 from time import sleep
 from typing import List, Optional, Tuple, Dict
 
@@ -41,6 +42,7 @@
     RayClusterStatus,
 )
 from kubernetes import client, config
+from kubernetes.utils import parse_quantity
 import yaml
 import os
 import requests
@@ -488,26 +490,18 @@ def from_k8_cluster_object(
             namespace=rc["metadata"]["namespace"],
             machine_types=machine_types,
             num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
-            min_cpus=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["requests"]["cpu"]
-            ),
-            max_cpus=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["limits"]["cpu"]
-            ),
-            min_memory=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["requests"]["memory"][:-1]
-            ),
-            max_memory=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["limits"]["memory"][:-1]
-            ),
+            min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["requests"]["cpu"],
+            max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["limits"]["cpu"],
+            min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["requests"]["memory"],
+            max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["limits"]["memory"],
             num_gpus=int(
                 rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
                     "resources"
diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py
index e4d046f93..064b51cd4 100644
--- a/src/codeflare_sdk/cluster/config.py
+++ b/src/codeflare_sdk/cluster/config.py
@@ -20,6 +20,7 @@
 
 from dataclasses import dataclass, field
 import pathlib
+import typing
 
 dir = pathlib.Path(__file__).parent.parent.resolve()
 
@@ -34,15 +35,15 @@ class ClusterConfiguration:
     name: str
     namespace: str = None
     head_info: list = field(default_factory=list)
-    head_cpus: int = 2
-    head_memory: int = 8
+    head_cpus: typing.Union[int, str] = 2
+    head_memory: typing.Union[int, str] = 8
     head_gpus: int = 0
     machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
-    min_cpus: int = 1
-    max_cpus: int = 1
+    min_cpus: typing.Union[int, str] = 1
+    max_cpus: typing.Union[int, str] = 1
     num_workers: int = 1
-    min_memory: int = 2
-    max_memory: int = 2
+    min_memory: typing.Union[int, str] = 2
+    max_memory: typing.Union[int, str] = 2
     num_gpus: int = 0
     template: str = f"{dir}/templates/base-template.yaml"
     instascale: bool = False
@@ -59,5 +60,23 @@ def __post_init__(self):
             print(
                 "Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
             )
+        self._memory_to_string()
+        self._str_mem_no_unit_add_GB()
+
+    def _str_mem_no_unit_add_GB(self):
+        if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
+            self.head_memory = f"{self.head_memory}G"
+        if isinstance(self.min_memory, str) and self.min_memory.isdecimal():
+            self.min_memory = f"{self.min_memory}G"
+        if isinstance(self.max_memory, str) and self.max_memory.isdecimal():
+            self.max_memory = f"{self.max_memory}G"
+
+    def _memory_to_string(self):
+        if isinstance(self.head_memory, int):
+            self.head_memory = f"{self.head_memory}G"
+        if isinstance(self.min_memory, int):
+            self.min_memory = f"{self.min_memory}G"
+        if isinstance(self.max_memory, int):
+            self.max_memory = f"{self.max_memory}G"
 
     local_queue: str = None
diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py
index 95c3d04f0..97dda5ba7 100755
--- a/src/codeflare_sdk/utils/generate_yaml.py
+++ b/src/codeflare_sdk/utils/generate_yaml.py
@@ -140,8 +140,8 @@ def update_custompodresources(
                 # Leave head node resources as template default
                 resource["requests"]["cpu"] = head_cpus
                 resource["limits"]["cpu"] = head_cpus
-                resource["requests"]["memory"] = str(head_memory) + "G"
-                resource["limits"]["memory"] = str(head_memory) + "G"
+                resource["requests"]["memory"] = head_memory
+                resource["limits"]["memory"] = head_memory
                 resource["requests"]["nvidia.com/gpu"] = head_gpus
                 resource["limits"]["nvidia.com/gpu"] = head_gpus
 
@@ -158,9 +158,9 @@
                                 resource[k][spec] = min_cpu
                         if spec == "memory":
                             if k == "limits":
-                                resource[k][spec] = str(max_memory) + "G"
+                                resource[k][spec] = max_memory
                             else:
-                                resource[k][spec] = str(min_memory) + "G"
+                                resource[k][spec] = min_memory
                         if spec == "nvidia.com/gpu":
                             if i == 0:
                                 resource[k][spec] = 0
@@ -213,12 +213,12 @@ def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
         requests = resource.get("resources").get("requests")
         if requests is not None:
             requests["cpu"] = min_cpu
-            requests["memory"] = str(min_memory) + "G"
+            requests["memory"] = min_memory
             requests["nvidia.com/gpu"] = gpu
         limits = resource.get("resources").get("limits")
         if limits is not None:
             limits["cpu"] = max_cpu
-            limits["memory"] = str(max_memory) + "G"
+            limits["memory"] = max_memory
             limits["nvidia.com/gpu"] = gpu
 
 
diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py
index 74678ecc3..42ef8398b 100644
--- a/src/codeflare_sdk/utils/pretty_print.py
+++ b/src/codeflare_sdk/utils/pretty_print.py
@@ -136,7 +136,7 @@ def print_clusters(clusters: List[RayCluster]):
         name = cluster.name
         dashboard = cluster.dashboard
         workers = str(cluster.workers)
-        memory = str(cluster.worker_mem_min) + "~" + str(cluster.worker_mem_max)
+        memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
         cpu = str(cluster.worker_cpu)
         gpu = str(cluster.worker_gpu)
 
diff --git a/tests/unit_test.py b/tests/unit_test.py
index 322449fbf..d46c4908e 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -248,7 +248,7 @@ def test_config_creation():
     assert config.name == "unit-test-cluster" and config.namespace == "ns"
     assert config.num_workers == 2
     assert config.min_cpus == 3 and config.max_cpus == 4
-    assert config.min_memory == 5 and config.max_memory == 6
+    assert config.min_memory == "5G" and config.max_memory == "6G"
     assert config.num_gpus == 7
     assert config.image == "quay.io/project-codeflare/ray:latest-py39-cu118"
     assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml"
@@ -849,8 +849,8 @@ def test_ray_details(mocker, capsys):
         name="raytest1",
         status=RayClusterStatus.READY,
         workers=1,
-        worker_mem_min=2,
-        worker_mem_max=2,
+        worker_mem_min="2G",
+        worker_mem_max="2G",
         worker_cpu=1,
         worker_gpu=0,
         namespace="ns",
@@ -909,7 +909,7 @@
         " │  ╭── Workers ──╮  ╭───────── Worker specs(each) ────────╮  │ \n"
         " │  │  # Workers  │  │  Memory      CPU         GPU         │  │ \n"
         " │  │             │  │                                      │  │ \n"
-        " │  │  1          │  │  2~2         1           0           │  │ \n"
+        " │  │  1          │  │  2G~2G       1           0           │  │ \n"
         " │  │             │  │                                      │  │ \n"
         " │  ╰─────────────╯  ╰──────────────────────────────────────╯  │ \n"
         " ╰─────────────────────────────────────────────────────────────╯ \n"
@@ -927,7 +927,7 @@
         " │  ╭── Workers ──╮  ╭───────── Worker specs(each) ────────╮  │ \n"
         " │  │  # Workers  │  │  Memory      CPU         GPU         │  │ \n"
         " │  │             │  │                                      │  │ \n"
-        " │  │  1          │  │  2~2         1           0           │  │ \n"
+        " │  │  1          │  │  2G~2G       1           0           │  │ \n"
         " │  │             │  │                                      │  │ \n"
         " │  ╰─────────────╯  ╰──────────────────────────────────────╯  │ \n"
         " ╰─────────────────────────────────────────────────────────────╯ \n"
@@ -943,7 +943,7 @@
         "│  ╭── Workers ──╮  ╭───────── Worker specs(each) ────────╮  │\n"
         "│  │  # Workers  │  │  Memory      CPU         GPU         │  │\n"
         "│  │             │  │                                      │  │\n"
-        "│  │  1          │  │  2~2         1           0           │  │\n"
+        "│  │  1          │  │  2G~2G       1           0           │  │\n"
         "│  │             │  │                                      │  │\n"
        "│  ╰─────────────╯  ╰──────────────────────────────────────╯  │\n"
        "╰─────────────────────────────────────────────────────────────╯\n"
@@ -2436,7 +2436,7 @@ def custom_side_effect(group, version, namespace, plural, **kwargs):
         and "g4dn.xlarge" in cluster_config.machine_types
     )
     assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
-    assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
+    assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
     assert cluster_config.num_gpus == 0
     assert (
         cluster_config.image
@@ -2468,7 +2468,7 @@ def test_get_cluster(mocker):
         and "g4dn.xlarge" in cluster_config.machine_types
     )
     assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
-    assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
+    assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
    assert cluster_config.num_gpus == 0
     assert cluster_config.instascale
     assert (