feat: add custom volumes/volume mounts for ray clusters

Bobbins228 · Bobbins228 · commit da3041d14322 · 2025-01-08T09:56:46.000Z
diff --git a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py
@@ -249,7 +249,7 @@ def get_pod_spec(cluster: "codeflare_sdk.ray.cluster.Cluster", containers):
     """
     pod_spec = V1PodSpec(
         containers=containers,
-        volumes=VOLUMES,
+        volumes=generate_custom_storage(cluster.config.volumes, VOLUMES),
     )
     if cluster.config.image_pull_secrets != []:
         pod_spec.image_pull_secrets = generate_image_pull_secrets(cluster)
@@ -295,7 +295,9 @@ def get_head_container_spec(
             cluster.config.head_memory_limits,
             cluster.config.head_extended_resource_requests,
         ),
-        volume_mounts=VOLUME_MOUNTS,
+        volume_mounts=generate_custom_storage(
+            cluster.config.volume_mounts, VOLUME_MOUNTS
+        ),
     )
     if cluster.config.envs != {}:
         head_container.env = generate_env_vars(cluster)
@@ -337,7 +339,9 @@ def get_worker_container_spec(
             cluster.config.worker_memory_limits,
             cluster.config.worker_extended_resource_requests,
         ),
-        volume_mounts=VOLUME_MOUNTS,
+        volume_mounts=generate_custom_storage(
+            cluster.config.volume_mounts, VOLUME_MOUNTS
+        ),
     )
 
     if cluster.config.envs != {}:
@@ -521,6 +525,22 @@ def wrap_cluster(
 
 
 # Etc.
+def generate_custom_storage(provided_storage: list, default_storage: list):
+    """
+    Return the user-provided volumes/volume mounts followed by the defaults.
+
+    Args:
+        provided_storage: custom volumes or volume mounts; None or empty means "no custom entries".
+        default_storage: default volumes or volume mounts, always included after the custom ones.
+
+    Returns:
+        A new list, so neither argument is aliased or mutated.
+    """
+    # Always build a fresh list: returning default_storage itself would let callers
+    # mutate the shared defaults through the returned value.
+    return [*(provided_storage or []), *default_storage]
+
+
 def write_to_file(cluster: "codeflare_sdk.ray.cluster.Cluster", resource: dict):
     """
     The write_to_file function writes the built Ray Cluster/AppWrapper dict as a yaml file in the .codeflare folder
diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py
@@ -22,6 +22,7 @@
 import warnings
 from dataclasses import dataclass, field, fields
 from typing import Dict, List, Optional, Union, get_args, get_origin
+from kubernetes.client import V1Volume, V1VolumeMount
 
 dir = pathlib.Path(__file__).parent.parent.resolve()
 
@@ -41,56 +42,63 @@
 @dataclass
 class ClusterConfiguration:
     """
-    This dataclass is used to specify resource requirements and other details, and
-    is passed in as an argument when creating a Cluster object.
+        This dataclass is used to specify resource requirements and other details, and
+        is passed in as an argument when creating a Cluster object.
 
-    Args:
-        name:
-            The name of the cluster.
-        namespace:
-            The namespace in which the cluster should be created.
-        head_cpus:
-            The number of CPUs to allocate to the head node.
-        head_memory:
-            The amount of memory to allocate to the head node.
-        head_gpus:
-            The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests)
-        head_extended_resource_requests:
-            A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1}
-        min_cpus:
-            The minimum number of CPUs to allocate to each worker.
-        max_cpus:
-            The maximum number of CPUs to allocate to each worker.
-        num_workers:
-            The number of workers to create.
-        min_memory:
-            The minimum amount of memory to allocate to each worker.
-        max_memory:
-            The maximum amount of memory to allocate to each worker.
-        num_gpus:
-            The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests)
-        appwrapper:
-            A boolean indicating whether to use an AppWrapper.
-        envs:
-            A dictionary of environment variables to set for the cluster.
-        image:
-            The image to use for the cluster.
-        image_pull_secrets:
-            A list of image pull secrets to use for the cluster.
-        write_to_file:
-            A boolean indicating whether to write the cluster configuration to a file.
-        verify_tls:
-            A boolean indicating whether to verify TLS when connecting to the cluster.
-        labels:
-            A dictionary of labels to apply to the cluster.
-        worker_extended_resource_requests:
-            A dictionary of extended resource requests for each worker. ex: {"nvidia.com/gpu": 1}
-        extended_resource_mapping:
-            A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names
-        overwrite_default_resource_mapping:
-            A boolean indicating whether to overwrite the default resource mapping.
-        annotations:
-            A dictionary of annotations to apply to the cluster.
+        Args:
+            name:
+                The name of the cluster.
+            namespace:
+                The namespace in which the cluster should be created.
+            head_cpus:
+                The number of CPUs to allocate to the head node.
+            head_memory:
+                The amount of memory to allocate to the head node.
+            head_gpus:
+                The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests)
+            head_extended_resource_requests:
+                A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1}
+            min_cpus:
+                The minimum number of CPUs to allocate to each worker.
+            max_cpus:
+                The maximum number of CPUs to allocate to each worker.
+            num_workers:
+                The number of workers to create.
+            min_memory:
+                The minimum amount of memory to allocate to each worker.
+            max_memory:
+                The maximum amount of memory to allocate to each worker.
+            num_gpus:
+                The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests)
+            appwrapper:
+                A boolean indicating whether to use an AppWrapper.
+            envs:
+                A dictionary of environment variables to set for the cluster.
+            image:
+                The image to use for the cluster.
+            image_pull_secrets:
+                A list of image pull secrets to use for the cluster.
+            write_to_file:
+                A boolean indicating whether to write the cluster configuration to a file.
+            verify_tls:
+                A boolean indicating whether to verify TLS when connecting to the cluster.
+            labels:
+                A dictionary of labels to apply to the cluster.
+            worker_extended_resource_requests:
+                A dictionary of extended resource requests for each worker. ex: {"nvidia.com/gpu": 1}
+            extended_resource_mapping:
+                A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names
+            overwrite_default_resource_mapping:
+                A boolean indicating whether to overwrite the default resource mapping.
+            annotations:
+                A dictionary of annotations to apply to the cluster.
+            volumes:
+                A list of V1Volume objects to add to the cluster's pod spec.
+                The default volumes are appended after them.
+            volume_mounts:
+                A list of V1VolumeMount objects to add to the head and worker
+                containers.
+                The default volume mounts are appended after them.
     """
 
     name: str
@@ -129,6 +137,8 @@ class ClusterConfiguration:
     overwrite_default_resource_mapping: bool = False
     local_queue: Optional[str] = None
     annotations: Dict[str, str] = field(default_factory=dict)
+    volumes: list[V1Volume] = field(default_factory=list)
+    volume_mounts: list[V1VolumeMount] = field(default_factory=list)
 
     def __post_init__(self):
         if not self.verify_tls: