Skip to content

Commit 59cbccc

Browse files
authored
use string for storing mem resources (project-codeflare#522)
Signed-off-by: Kevin <[email protected]>
1 parent 82d2c5b commit 59cbccc

File tree

5 files changed

+54
-41
lines changed

5 files changed

+54
-41
lines changed

src/codeflare_sdk/cluster/cluster.py

+14-20
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
cluster setup queue, a list of all existing clusters, and the user's working namespace.
1919
"""
2020

21+
import re
2122
from time import sleep
2223
from typing import List, Optional, Tuple, Dict
2324

@@ -41,6 +42,7 @@
4142
RayClusterStatus,
4243
)
4344
from kubernetes import client, config
45+
from kubernetes.utils import parse_quantity
4446
import yaml
4547
import os
4648
import requests
@@ -513,26 +515,18 @@ def from_k8_cluster_object(
513515
namespace=rc["metadata"]["namespace"],
514516
machine_types=machine_types,
515517
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
516-
min_cpus=int(
517-
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
518-
"resources"
519-
]["requests"]["cpu"]
520-
),
521-
max_cpus=int(
522-
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
523-
"resources"
524-
]["limits"]["cpu"]
525-
),
526-
min_memory=int(
527-
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
528-
"resources"
529-
]["requests"]["memory"][:-1]
530-
),
531-
max_memory=int(
532-
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
533-
"resources"
534-
]["limits"]["memory"][:-1]
535-
),
518+
min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
519+
"containers"
520+
][0]["resources"]["requests"]["cpu"],
521+
max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
522+
"containers"
523+
][0]["resources"]["limits"]["cpu"],
524+
min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
525+
"containers"
526+
][0]["resources"]["requests"]["memory"],
527+
max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
528+
"containers"
529+
][0]["resources"]["limits"]["memory"],
536530
num_gpus=int(
537531
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
538532
"resources"

src/codeflare_sdk/cluster/config.py

+25-6
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from dataclasses import dataclass, field
2222
import pathlib
23+
import typing
2324

2425
dir = pathlib.Path(__file__).parent.parent.resolve()
2526

@@ -34,15 +35,15 @@ class ClusterConfiguration:
3435
name: str
3536
namespace: str = None
3637
head_info: list = field(default_factory=list)
37-
head_cpus: int = 2
38-
head_memory: int = 8
38+
head_cpus: typing.Union[int, str] = 2
39+
head_memory: typing.Union[int, str] = 8
3940
head_gpus: int = 0
4041
machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"]
41-
min_cpus: int = 1
42-
max_cpus: int = 1
42+
min_cpus: typing.Union[int, str] = 1
43+
max_cpus: typing.Union[int, str] = 1
4344
num_workers: int = 1
44-
min_memory: int = 2
45-
max_memory: int = 2
45+
min_memory: typing.Union[int, str] = 2
46+
max_memory: typing.Union[int, str] = 2
4647
num_gpus: int = 0
4748
template: str = f"{dir}/templates/base-template.yaml"
4849
instascale: bool = False
@@ -59,5 +60,23 @@ def __post_init__(self):
5960
print(
6061
"Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
6162
)
63+
self._memory_to_string()
64+
self._str_mem_no_unit_add_GB()
65+
66+
def _str_mem_no_unit_add_GB(self):
    """Append a "G" suffix to memory settings given as unit-less digit strings.

    Inspects ``head_memory``, ``min_memory`` and ``max_memory`` in that order.
    A value that is a ``str`` consisting only of decimal digits (e.g. ``"8"``)
    is rewritten in place as ``"8G"``. Any other value -- a string that already
    contains non-digit characters such as ``"8Gi"``, an empty string, or a
    non-string object -- is left untouched.
    """
    for attr in ("head_memory", "min_memory", "max_memory"):
        value = getattr(self, attr)
        # str.isdecimal() is False for "" and for anything carrying a unit
        # suffix, so only bare digit strings get the default "G" unit.
        if isinstance(value, str) and value.isdecimal():
            setattr(self, attr, f"{value}G")
73+
74+
def _memory_to_string(self):
    """Convert integer memory settings to strings with a "G" unit suffix.

    Inspects ``head_memory``, ``min_memory`` and ``max_memory`` in that order.
    A value that is an ``int`` (e.g. ``8``) is rewritten in place as the string
    ``"8G"``. Values of any other type, including strings, are left untouched.
    """
    for attr in ("head_memory", "min_memory", "max_memory"):
        value = getattr(self, attr)
        if isinstance(value, int):
            setattr(self, attr, f"{value}G")
6281

6382
local_queue: str = None

src/codeflare_sdk/utils/generate_yaml.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,8 @@ def update_custompodresources(
140140
# Leave head node resources as template default
141141
resource["requests"]["cpu"] = head_cpus
142142
resource["limits"]["cpu"] = head_cpus
143-
resource["requests"]["memory"] = str(head_memory) + "G"
144-
resource["limits"]["memory"] = str(head_memory) + "G"
143+
resource["requests"]["memory"] = head_memory
144+
resource["limits"]["memory"] = head_memory
145145
resource["requests"]["nvidia.com/gpu"] = head_gpus
146146
resource["limits"]["nvidia.com/gpu"] = head_gpus
147147

@@ -158,9 +158,9 @@ def update_custompodresources(
158158
resource[k][spec] = min_cpu
159159
if spec == "memory":
160160
if k == "limits":
161-
resource[k][spec] = str(max_memory) + "G"
161+
resource[k][spec] = max_memory
162162
else:
163-
resource[k][spec] = str(min_memory) + "G"
163+
resource[k][spec] = min_memory
164164
if spec == "nvidia.com/gpu":
165165
if i == 0:
166166
resource[k][spec] = 0
@@ -213,12 +213,12 @@ def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
213213
requests = resource.get("resources").get("requests")
214214
if requests is not None:
215215
requests["cpu"] = min_cpu
216-
requests["memory"] = str(min_memory) + "G"
216+
requests["memory"] = min_memory
217217
requests["nvidia.com/gpu"] = gpu
218218
limits = resource.get("resources").get("limits")
219219
if limits is not None:
220220
limits["cpu"] = max_cpu
221-
limits["memory"] = str(max_memory) + "G"
221+
limits["memory"] = max_memory
222222
limits["nvidia.com/gpu"] = gpu
223223

224224

src/codeflare_sdk/utils/pretty_print.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def print_clusters(clusters: List[RayCluster]):
136136
name = cluster.name
137137
dashboard = cluster.dashboard
138138
workers = str(cluster.workers)
139-
memory = str(cluster.worker_mem_min) + "~" + str(cluster.worker_mem_max)
139+
memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
140140
cpu = str(cluster.worker_cpu)
141141
gpu = str(cluster.worker_gpu)
142142

tests/unit_test.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def test_config_creation():
248248
assert config.name == "unit-test-cluster" and config.namespace == "ns"
249249
assert config.num_workers == 2
250250
assert config.min_cpus == 3 and config.max_cpus == 4
251-
assert config.min_memory == 5 and config.max_memory == 6
251+
assert config.min_memory == "5G" and config.max_memory == "6G"
252252
assert config.num_gpus == 7
253253
assert config.image == "quay.io/project-codeflare/ray:latest-py39-cu118"
254254
assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml"
@@ -851,8 +851,8 @@ def test_ray_details(mocker, capsys):
851851
name="raytest1",
852852
status=RayClusterStatus.READY,
853853
workers=1,
854-
worker_mem_min=2,
855-
worker_mem_max=2,
854+
worker_mem_min="2G",
855+
worker_mem_max="2G",
856856
worker_cpu=1,
857857
worker_gpu=0,
858858
namespace="ns",
@@ -911,7 +911,7 @@ def test_ray_details(mocker, capsys):
911911
" │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n"
912912
" │ │ # Workers │ │ Memory CPU GPU │ │ \n"
913913
" │ │ │ │ │ │ \n"
914-
" │ │ 1 │ │ 2~2 1 0 │ │ \n"
914+
" │ │ 1 │ │ 2G~2G 1 0 │ │ \n"
915915
" │ │ │ │ │ │ \n"
916916
" │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n"
917917
" ╰───────────────────────────────────────────────────────────────╯ \n"
@@ -929,7 +929,7 @@ def test_ray_details(mocker, capsys):
929929
" │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n"
930930
" │ │ # Workers │ │ Memory CPU GPU │ │ \n"
931931
" │ │ │ │ │ │ \n"
932-
" │ │ 1 │ │ 2~2 1 0 │ │ \n"
932+
" │ │ 1 │ │ 2G~2G 1 0 │ │ \n"
933933
" │ │ │ │ │ │ \n"
934934
" │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n"
935935
" ╰───────────────────────────────────────────────────────────────╯ \n"
@@ -945,7 +945,7 @@ def test_ray_details(mocker, capsys):
945945
"│ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │\n"
946946
"│ │ # Workers │ │ Memory CPU GPU │ │\n"
947947
"│ │ │ │ │ │\n"
948-
"│ │ 1 │ │ 2~2 1 0 │ │\n"
948+
"│ │ 1 │ │ 2G~2G 1 0 │ │\n"
949949
"│ │ │ │ │ │\n"
950950
"│ ╰─────────────╯ ╰──────────────────────────────────────╯ │\n"
951951
"╰───────────────────────────────────────────────────────────────╯\n"
@@ -2438,7 +2438,7 @@ def custom_side_effect(group, version, namespace, plural, **kwargs):
24382438
and "g4dn.xlarge" in cluster_config.machine_types
24392439
)
24402440
assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
2441-
assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
2441+
assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
24422442
assert cluster_config.num_gpus == 0
24432443
assert (
24442444
cluster_config.image
@@ -2470,7 +2470,7 @@ def test_get_cluster(mocker):
24702470
and "g4dn.xlarge" in cluster_config.machine_types
24712471
)
24722472
assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
2473-
assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
2473+
assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
24742474
assert cluster_config.num_gpus == 0
24752475
assert cluster_config.instascale
24762476
assert (

0 commit comments

Comments
 (0)