From b2077aef305f82804f44d73e404565093f62446b Mon Sep 17 00:00:00 2001 From: ted chang Date: Mon, 28 Aug 2023 19:35:38 -0700 Subject: [PATCH 1/4] Update mcad kuberay example --- .../kuberay/config/aw-raycluster-1.yaml | 153 ++++++++++++++++ .../kuberay/config/aw-raycluster.yaml | 167 +++++++++--------- doc/usage/examples/kuberay/kuberay-mcad.md | 56 +++++- 3 files changed, 284 insertions(+), 92 deletions(-) create mode 100644 doc/usage/examples/kuberay/config/aw-raycluster-1.yaml diff --git a/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml b/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml new file mode 100644 index 000000000..22293c1b5 --- /dev/null +++ b/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml @@ -0,0 +1,153 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: raycluster-complete-1 + namespace: default +spec: + resources: + GenericItems: + - replicas: 1 + custompodresources: # Optional section that specifies resource requirements + # for non-standard k8s resources, follows same format as + # that of standard k8s resources. + - replicas: 2 # because AppWrappers are generic they must define the resultant pods that will be needed + # to fulfill a request as the request values cannot be reliably extracted from the + # generictemplate below + requests: + cpu: 8 + memory: 512Mi + limits: + cpu: 10 + memory: 1G + generictemplate: + # The resource requests and limits in this config are too small for production! + # For examples with more realistic resource configuration, see + # ray-cluster.complete.large.yaml and + # ray-cluster.autoscaler.large.yaml. + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + controller-tools.k8s.io: "1.0" + # A unique identifier for the head node and workers of this cluster. + name: raycluster-complete-1 + spec: + rayVersion: '2.5.0' + # Ray head pod configuration + headGroupSpec: + # Kubernetes Service Type. This is an optional field, and the default value is ClusterIP. 
+ # Refer to https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types. + serviceType: ClusterIP + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. + rayStartParams: + dashboard-host: '0.0.0.0' + # pod template + template: + metadata: + # Custom labels. NOTE: To avoid conflicts with KubeRay operator, do not define custom labels start with `raycluster`. + # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ + labels: {} + spec: + containers: + - name: ray-head + image: rayproject/ray:2.5.0 + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # For an example with more realistic resource configuration, see + # ray-cluster.autoscaler.large.yaml. + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. + resources: + limits: + cpu: "1" + memory: "2G" + requests: + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. + # For this example, we use a 500m CPU request to accomodate resource-constrained local + # Kubernetes testing environments such as KinD and minikube. 
+ cpu: "500m" + memory: "2G" + volumes: + - name: ray-logs + emptyDir: {} + workerGroupSpecs: + # the pod replicas in this group typed worker + - replicas: 1 + minReplicas: 1 + maxReplicas: 10 + # logical group name, for this called small-group, also can be functional + groupName: small-group + # If worker pods need to be added, we can increment the replicas. + # If worker pods need to be removed, we decrement the replicas, and populate the workersToDelete list. + # The operator will remove pods from the list until the desired number of replicas is satisfied. + # If the difference between the current replica count and the desired replicas is greater than the + # number of entries in workersToDelete, random worker pods will be deleted. + #scaleStrategy: + # workersToDelete: + # - raycluster-complete-worker-small-group-bdtwh + # - raycluster-complete-worker-small-group-hv457 + # - raycluster-complete-worker-small-group-k8tj7 + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. + rayStartParams: {} + #pod template + template: + spec: + containers: + - name: ray-worker + image: rayproject/ray:2.5.0 + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + # use volumeMounts.Optional. + # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # For an example with more realistic resource configuration, see + # ray-cluster.autoscaler.large.yaml. + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. 
+ resources: + limits: + cpu: "1" + memory: "1G" + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. + # For this example, we use a 500m CPU request to accomodate resource-constrained local + # Kubernetes testing environments such as KinD and minikube. + requests: + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. + # For this example, we use a 500m CPU request to accomodate resource-constrained local + # Kubernetes testing environments such as KinD and minikube. + cpu: "500m" + # For production use-cases, we recommend allocating at least 8Gb memory for each Ray container. + memory: "1G" + # use volumes + # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ + volumes: + - name: ray-logs + emptyDir: {} + diff --git a/doc/usage/examples/kuberay/config/aw-raycluster.yaml b/doc/usage/examples/kuberay/config/aw-raycluster.yaml index bbef20d0d..dc33b268f 100644 --- a/doc/usage/examples/kuberay/config/aw-raycluster.yaml +++ b/doc/usage/examples/kuberay/config/aw-raycluster.yaml @@ -1,25 +1,28 @@ apiVersion: mcad.ibm.com/v1beta1 kind: AppWrapper metadata: - name: raycluster-autoscaler + name: raycluster-complete namespace: default spec: resources: - Items: [] GenericItems: - replicas: 1 - custompodresources: - - replicas: 2 + custompodresources: # Optional section that specifies resource requirements + # for non-standard k8s resources, follows same format as + # that of standard k8s resources. 
+ - replicas: 2 # because AppWrappers are generic they must define the resultant pods that will be needed + # to fulfill a request as the request values cannot be reliably extracted from the + # generictemplate below requests: - cpu: 10 + cpu: 8 memory: 512Mi limits: cpu: 10 memory: 1G generictemplate: - # This config demonstrates KubeRay's Ray autoscaler integration. # The resource requests and limits in this config are too small for production! - # For an example with more realistic resource configuration, see + # For examples with more realistic resource configuration, see + # ray-cluster.complete.large.yaml and # ray-cluster.autoscaler.large.yaml. apiVersion: ray.io/v1alpha1 kind: RayCluster @@ -27,66 +30,29 @@ spec: labels: controller-tools.k8s.io: "1.0" # A unique identifier for the head node and workers of this cluster. - name: raycluster-autoscaler + name: raycluster-complete spec: - # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '2.0.0' - # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. - # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 - # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - enableInTreeAutoscaling: true - # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. - # The example configuration shown below below represents the DEFAULT values. - # (You may delete autoscalerOptions if the defaults are suitable.) - autoscalerOptions: - # upscalingMode is "Default" or "Aggressive." - # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster. - # Default: Upscaling is not rate-limited. - # Aggressive: An alias for Default; upscaling is not rate-limited. 
- upscalingMode: Default - # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. - idleTimeoutSeconds: 60 - # image optionally overrides the autoscaler's container image. - # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as - # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. - ## image: "my-repo/my-custom-autoscaler-image:tag" - # imagePullPolicy optionally overrides the autoscaler container's image pull policy. - imagePullPolicy: Always - # resources specifies optional resource request and limit overrides for the autoscaler container. - # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. - resources: - limits: - cpu: "500m" - memory: "512Mi" - requests: - cpu: "500m" - memory: "512Mi" - ######################headGroupSpec################################# - # head group template and specs, (perhaps 'group' is not needed in the name) + rayVersion: '2.5.0' + # Ray head pod configuration headGroupSpec: - # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' + # Kubernetes Service Type. This is an optional field, and the default value is ClusterIP. + # Refer to https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types. serviceType: ClusterIP - # logical group name, for this called head-group, also can be functional - # pod type head or worker - # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block ... + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. 
+ # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. rayStartParams: - # Flag "no-monitor" will be automatically set when autoscaling is enabled. dashboard-host: '0.0.0.0' - block: 'true' - # num-cpus: '1' # can be auto-completed from the limits - # Use `resources` to optionally specify custom resource annotations for the Ray node. - # The value of `resources` is a string-integer mapping. - # Currently, `resources` must be provided in the specific format demonstrated below: - # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' - #pod template + # pod template template: + metadata: + # Custom labels. NOTE: To avoid conflicts with KubeRay operator, do not define custom labels start with `raycluster`. + # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ + labels: {} spec: containers: - # The Ray head pod - name: ray-head - image: rayproject/ray:2.0.0 - imagePullPolicy: Always + image: rayproject/ray:2.5.0 ports: - containerPort: 6379 name: gcs @@ -98,59 +64,90 @@ spec: preStop: exec: command: ["/bin/sh","-c","ray stop"] + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # For an example with more realistic resource configuration, see + # ray-cluster.autoscaler.large.yaml. + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. resources: limits: cpu: "1" - memory: "1G" + memory: "2G" requests: + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. + # For this example, we use a 500m CPU request to accomodate resource-constrained local + # Kubernetes testing environments such as KinD and minikube. 
cpu: "500m" - memory: "512Mi" + memory: "2G" + volumes: + - name: ray-logs + emptyDir: {} workerGroupSpecs: # the pod replicas in this group typed worker - replicas: 1 minReplicas: 1 - maxReplicas: 300 + maxReplicas: 10 # logical group name, for this called small-group, also can be functional groupName: small-group - # if worker pods need to be added, we can simply increment the replicas - # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list - # the operator will remove pods from the list until the number of replicas is satisfied - # when a pod is confirmed to be deleted, its name will be removed from the list below + # If worker pods need to be added, we can increment the replicas. + # If worker pods need to be removed, we decrement the replicas, and populate the workersToDelete list. + # The operator will remove pods from the list until the desired number of replicas is satisfied. + # If the difference between the current replica count and the desired replicas is greater than the + # number of entries in workersToDelete, random worker pods will be deleted. #scaleStrategy: # workersToDelete: # - raycluster-complete-worker-small-group-bdtwh # - raycluster-complete-worker-small-group-hv457 # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block ... - rayStartParams: - block: 'true' + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. 
+ rayStartParams: {} #pod template template: - metadata: - labels: - key: value - # annotations for pod - annotations: - key: value spec: - initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - - name: init-myservice - image: busybox:1.28 - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] containers: - - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: rayproject/ray:2.0.0 - # environment variables to set in the container.Optional. - # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ + - name: ray-worker + image: rayproject/ray:2.5.0 lifecycle: preStop: exec: command: ["/bin/sh","-c","ray stop"] + # use volumeMounts.Optional. + # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # For an example with more realistic resource configuration, see + # ray-cluster.autoscaler.large.yaml. + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. resources: limits: cpu: "1" - memory: "512Mi" + memory: "1G" + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. + # For this example, we use a 500m CPU request to accomodate resource-constrained local + # Kubernetes testing environments such as KinD and minikube. requests: + # For production use-cases, we recommend specifying integer CPU reqests and limits. 
+ # We also recommend setting requests equal to limits for both CPU and memory. + # For this example, we use a 500m CPU request to accomodate resource-constrained local + # Kubernetes testing environments such as KinD and minikube. cpu: "500m" - memory: "256Mi" \ No newline at end of file + # For production use-cases, we recommend allocating at least 8Gb memory for each Ray container. + memory: "1G" + # use volumes + # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ + volumes: + - name: ray-logs + emptyDir: {} + diff --git a/doc/usage/examples/kuberay/kuberay-mcad.md b/doc/usage/examples/kuberay/kuberay-mcad.md index 922217e19..7aa7fbe80 100644 --- a/doc/usage/examples/kuberay/kuberay-mcad.md +++ b/doc/usage/examples/kuberay/kuberay-mcad.md @@ -4,13 +4,55 @@ This integration will help in queuing on [kuberay](https://github.com/ray-projec #### Prerequisites -- kubernetes or Openshift cluster -- Install MCAD using instructions present under `deployment` directory -- Make sure MCAD has clusterrole to create ray resources, please patch using configuration file present in `config` directory with name `xqueuejob-controller.yaml` +- Kubernetes(see [KinD](https://helm.sh/docs/intro/install/)) or Openshift cluster(see [OpenShift Local](https://developers.redhat.com/products/openshift-local/overview)) +- Kubernetes client tools such as [kubectl](https://kubernetes.io/docs/tasks/tools/) or [OpenShift CLI](https://docs.openshift.com/container-platform/4.13/cli_reference/openshift_cli/getting-started-cli.html) +- [Helm](https://helm.sh/docs/intro/install/) +- Install MCAD and KubeRay operators: + - KinD cluster: + + Install the stable release of MCAD operator from local charts + ```bash + git clone https://github.com/project-codeflare/multi-cluster-app-dispatcher + cd multi-cluster-app-dispatcher + helm install mcad --set image.repository=quay.io/project-codeflare/mcad-controller --set image.tag=stable deployment/mcad-controller + ``` + + Make sure MCAD has 
clusterrole to create ray resources, please patch using [xqueuejob-controller.yaml](doc/usage/examples/kuberay/config/xqueuejob-controller.yaml). For example: + ``` + kubectl apply -f doc/usage/examples/kuberay/config/xqueuejob-controller.yaml + ``` + + See [deployment.md](../../../../doc/deploy/deployment.md) for more options. + + Install kuberay operator using the [instructions](https://github.com/ray-project/kuberay#quick-start). For example, install kuberay v0.6.0 from remote helm repo: + ``` + helm repo add kuberay https://ray-project.github.io/kuberay-helm/ + helm install kuberay-operator kuberay/kuberay-operator --version 0.6.0 + ``` + + - OpenShift cluster: + + MCAD and KubeRay Operators are part of the CodeFlare stack which provides a simple, user-friendly abstraction for scaling, +queuing and resource management of distributed AI/ML and Python workloads. Please follow the `Distributed Workloads` [Quick-Start](https://github.com/opendatahub-io/distributed-workloads/blob/main/Quick-Start.md) for installation. 
+ #### Steps -- Install kuberay operator from [link](https://docs.ray.io/en/latest/cluster/kubernetes/getting-started.html#deploying-the-kuberay-operator) -- Submit ray cluster to MCAD as appwrapper using the config file `aw-raycluster.yaml` present in the `config` directory using command `kubectl create -f aw-raycluster.yaml` -- Check the status of the appwrapper using command `kubectl describe appwrapper ` -- Check running pods using command `kubectl get pods -n ` \ No newline at end of file + +- Submit the RayCluster custom resource to MCAD as AppWrapper using the [aw-raycluster.yaml](doc/usage/examples/kuberay/config/aw-raycluster.yaml) example: + ```bash + kubectl create -f doc/usage/examples/kuberay/config/aw-raycluster.yaml + ``` +- Check the status of the AppWrapper custom resource using command + ```bash + kubectl describe appwrapper raycluster-complete -n default + ``` +- Check the raycluster status is ready using command + ```bash + kubectl get raycluster -n default + ``` + Expect: + ``` + NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE + raycluster-complete 1 1 ready 6m45s + ``` From 3a03149287995f872fcca793fca009d4fda67cdd Mon Sep 17 00:00:00 2001 From: ted chang Date: Thu, 31 Aug 2023 00:22:12 -0700 Subject: [PATCH 2/4] Update doc/usage/examples/kuberay/kuberay-mcad.md Co-authored-by: Anish Asthana --- doc/usage/examples/kuberay/kuberay-mcad.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/usage/examples/kuberay/kuberay-mcad.md b/doc/usage/examples/kuberay/kuberay-mcad.md index 7aa7fbe80..f133c8e20 100644 --- a/doc/usage/examples/kuberay/kuberay-mcad.md +++ b/doc/usage/examples/kuberay/kuberay-mcad.md @@ -32,8 +32,7 @@ This integration will help in queuing on [kuberay](https://github.com/ray-projec - OpenShift cluster: - MCAD and KubeRay Operators are part of the CodeFlare stack which provides a simple, user-friendly abstraction for scaling, -queuing and resource management of distributed AI/ML and Python workloads. 
Please follow the `Distributed Workloads` [Quick-Start](https://github.com/opendatahub-io/distributed-workloads/blob/main/Quick-Start.md) for installation. + On OpenShift, MCAD and KubeRay are already part of the Open Data Hub Distributed Workload Stack. The stack provides a simple, user-friendly abstraction for scaling, queuing and resource management of distributed AI/ML and Python workloads. Please follow the Quick Start in the [Distributed Workloads](https://github.com/opendatahub-io/distributed-workloads) for installation. #### Steps From 5eefdcd790d3127852b85dfa7d940d6309b3e2f5 Mon Sep 17 00:00:00 2001 From: ted chang Date: Thu, 31 Aug 2023 00:28:59 -0700 Subject: [PATCH 3/4] Update doc/usage/examples/kuberay/kuberay-mcad.md Co-authored-by: Kai-Hsun Chen --- doc/usage/examples/kuberay/kuberay-mcad.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/usage/examples/kuberay/kuberay-mcad.md b/doc/usage/examples/kuberay/kuberay-mcad.md index f133c8e20..a15eb1b1f 100644 --- a/doc/usage/examples/kuberay/kuberay-mcad.md +++ b/doc/usage/examples/kuberay/kuberay-mcad.md @@ -4,7 +4,7 @@ This integration will help in queuing on [kuberay](https://github.com/ray-projec #### Prerequisites -- Kubernetes(see [KinD](https://helm.sh/docs/intro/install/)) or Openshift cluster(see [OpenShift Local](https://developers.redhat.com/products/openshift-local/overview)) +- Kubernetes(see [KinD](https://kind.sigs.k8s.io/)) or Openshift cluster(see [OpenShift Local](https://developers.redhat.com/products/openshift-local/overview)) - Kubernetes client tools such as [kubectl](https://kubernetes.io/docs/tasks/tools/) or [OpenShift CLI](https://docs.openshift.com/container-platform/4.13/cli_reference/openshift_cli/getting-started-cli.html) - [Helm](https://helm.sh/docs/intro/install/) - Install MCAD and KubeRay operators: From 65abae1a4ff96655d4dcf1be057f69964d8f4fca Mon Sep 17 00:00:00 2001 From: ted chang Date: Thu, 31 Aug 2023 10:20:31 -0700 Subject: [PATCH 4/4] update 
appwrapper yamls --- .../kuberay/config/aw-raycluster-1.yaml | 55 +++++++++++-------- .../kuberay/config/aw-raycluster.yaml | 55 +++++++++++-------- 2 files changed, 62 insertions(+), 48 deletions(-) diff --git a/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml b/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml index 22293c1b5..ffb1f9701 100644 --- a/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml +++ b/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml @@ -7,18 +7,31 @@ spec: resources: GenericItems: - replicas: 1 - custompodresources: # Optional section that specifies resource requirements - # for non-standard k8s resources, follows same format as - # that of standard k8s resources. - - replicas: 2 # because AppWrappers are generic they must define the resultant pods that will be needed - # to fulfill a request as the request values cannot be reliably extracted from the - # generictemplate below + custompodresources: + # Optional section that specifies resource requirements + # for non-standard k8s resources, follows same format as + # that of standard k8s resources. + # Each item in the custompodresources stanza should include resources consumed by target Item. + # In this example, the 2 items correspond to 1 Ray head pod and 1 Ray worker pod + - replicas: 1 + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 requests: - cpu: 8 - memory: 512Mi + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + # The replica should match the number of worker pods + - replicas: 1 limits: - cpu: 10 - memory: 1G + cpu: 8 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 8 + memory: 8G + nvidia.com/gpu: 0 generictemplate: # The resource requests and limits in this config are too small for production! # For examples with more realistic resource configuration, see @@ -75,15 +88,13 @@ spec: # entire Kubernetes node on which it is scheduled. 
resources: limits: - cpu: "1" - memory: "2G" + cpu: "2" + memory: "8G" requests: # For production use-cases, we recommend specifying integer CPU reqests and limits. # We also recommend setting requests equal to limits for both CPU and memory. - # For this example, we use a 500m CPU request to accomodate resource-constrained local - # Kubernetes testing environments such as KinD and minikube. - cpu: "500m" - memory: "2G" + cpu: "2" + memory: "8G" volumes: - name: ray-logs emptyDir: {} @@ -131,20 +142,16 @@ spec: # entire Kubernetes node on which it is scheduled. resources: limits: - cpu: "1" - memory: "1G" + cpu: "8" + memory: "8G" # For production use-cases, we recommend specifying integer CPU reqests and limits. # We also recommend setting requests equal to limits for both CPU and memory. - # For this example, we use a 500m CPU request to accomodate resource-constrained local - # Kubernetes testing environments such as KinD and minikube. requests: # For production use-cases, we recommend specifying integer CPU reqests and limits. # We also recommend setting requests equal to limits for both CPU and memory. - # For this example, we use a 500m CPU request to accomodate resource-constrained local - # Kubernetes testing environments such as KinD and minikube. - cpu: "500m" + cpu: "8" # For production use-cases, we recommend allocating at least 8Gb memory for each Ray container. 
- memory: "1G" + memory: "8G" # use volumes # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ volumes: diff --git a/doc/usage/examples/kuberay/config/aw-raycluster.yaml b/doc/usage/examples/kuberay/config/aw-raycluster.yaml index dc33b268f..932b004e9 100644 --- a/doc/usage/examples/kuberay/config/aw-raycluster.yaml +++ b/doc/usage/examples/kuberay/config/aw-raycluster.yaml @@ -7,18 +7,31 @@ spec: resources: GenericItems: - replicas: 1 - custompodresources: # Optional section that specifies resource requirements - # for non-standard k8s resources, follows same format as - # that of standard k8s resources. - - replicas: 2 # because AppWrappers are generic they must define the resultant pods that will be needed - # to fulfill a request as the request values cannot be reliably extracted from the - # generictemplate below + custompodresources: + # Optional section that specifies resource requirements + # for non-standard k8s resources, follows same format as + # that of standard k8s resources. + # Each item in the custompodresources stanza should include resources consumed by target Item. + # In this example, the 2 items correspond to 1 Ray head pod and 1 Ray worker pod + - replicas: 1 + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 requests: - cpu: 8 - memory: 512Mi + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + # The replica should match the number of worker pods + - replicas: 1 limits: - cpu: 10 - memory: 1G + cpu: 8 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 8 + memory: 8G + nvidia.com/gpu: 0 generictemplate: # The resource requests and limits in this config are too small for production! # For examples with more realistic resource configuration, see @@ -75,15 +88,13 @@ spec: # entire Kubernetes node on which it is scheduled. resources: limits: - cpu: "1" - memory: "2G" + cpu: "2" + memory: "8G" requests: # For production use-cases, we recommend specifying integer CPU reqests and limits. 
# We also recommend setting requests equal to limits for both CPU and memory. - # For this example, we use a 500m CPU request to accomodate resource-constrained local - # Kubernetes testing environments such as KinD and minikube. - cpu: "500m" - memory: "2G" + cpu: "2" + memory: "8G" volumes: - name: ray-logs emptyDir: {} @@ -131,20 +142,16 @@ spec: # entire Kubernetes node on which it is scheduled. resources: limits: - cpu: "1" - memory: "1G" + cpu: "8" + memory: "8G" # For production use-cases, we recommend specifying integer CPU reqests and limits. # We also recommend setting requests equal to limits for both CPU and memory. - # For this example, we use a 500m CPU request to accomodate resource-constrained local - # Kubernetes testing environments such as KinD and minikube. requests: # For production use-cases, we recommend specifying integer CPU reqests and limits. # We also recommend setting requests equal to limits for both CPU and memory. - # For this example, we use a 500m CPU request to accomodate resource-constrained local - # Kubernetes testing environments such as KinD and minikube. - cpu: "500m" + cpu: "8" # For production use-cases, we recommend allocating at least 8Gb memory for each Ray container. - memory: "1G" + memory: "8G" # use volumes # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ volumes: