diff --git a/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml b/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml new file mode 100644 index 000000000..ffb1f9701 --- /dev/null +++ b/doc/usage/examples/kuberay/config/aw-raycluster-1.yaml @@ -0,0 +1,160 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: raycluster-complete-1 + namespace: default +spec: + resources: + GenericItems: + - replicas: 1 + custompodresources: + # Optional section that specifies resource requirements + # for non-standard k8s resources, follows same format as + # that of standard k8s resources. + # Each item in the custompodresources stanza should include resources consumed by target Item. + # In this example, the 2 items correspond to 1 Ray head pod and 1 Ray worker pod + - replicas: 1 + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + # The replica should match the number of worker pods + - replicas: 1 + limits: + cpu: 8 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 8 + memory: 8G + nvidia.com/gpu: 0 + generictemplate: + # The resource requests and limits in this config are too small for production! + # For examples with more realistic resource configuration, see + # ray-cluster.complete.large.yaml and + # ray-cluster.autoscaler.large.yaml. + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + controller-tools.k8s.io: "1.0" + # A unique identifier for the head node and workers of this cluster. + name: raycluster-complete-1 + spec: + rayVersion: '2.5.0' + # Ray head pod configuration + headGroupSpec: + # Kubernetes Service Type. This is an optional field, and the default value is ClusterIP. + # Refer to https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types. + serviceType: ClusterIP + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. + rayStartParams: + dashboard-host: '0.0.0.0' + # pod template + template: + metadata: + # Custom labels. NOTE: To avoid conflicts with KubeRay operator, do not define custom labels start with `raycluster`. + # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ + labels: {} + spec: + containers: + - name: ray-head + image: rayproject/ray:2.5.0 + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # For an example with more realistic resource configuration, see + # ray-cluster.autoscaler.large.yaml. + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. + resources: + limits: + cpu: "2" + memory: "8G" + requests: + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. 
+ cpu: "2" + memory: "8G" + volumes: + - name: ray-logs + emptyDir: {} + workerGroupSpecs: + # the pod replicas in this group typed worker + - replicas: 1 + minReplicas: 1 + maxReplicas: 10 + # logical group name, for this called small-group, also can be functional + groupName: small-group + # If worker pods need to be added, we can increment the replicas. + # If worker pods need to be removed, we decrement the replicas, and populate the workersToDelete list. + # The operator will remove pods from the list until the desired number of replicas is satisfied. + # If the difference between the current replica count and the desired replicas is greater than the + # number of entries in workersToDelete, random worker pods will be deleted. + #scaleStrategy: + # workersToDelete: + # - raycluster-complete-worker-small-group-bdtwh + # - raycluster-complete-worker-small-group-hv457 + # - raycluster-complete-worker-small-group-k8tj7 + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. + rayStartParams: {} + #pod template + template: + spec: + containers: + - name: ray-worker + image: rayproject/ray:2.5.0 + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + # use volumeMounts.Optional. + # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # For an example with more realistic resource configuration, see + # ray-cluster.autoscaler.large.yaml. + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. + resources: + limits: + cpu: "8" + memory: "8G" + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. + requests: + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. + cpu: "8" + # For production use-cases, we recommend allocating at least 8Gb memory for each Ray container. + memory: "8G" + # use volumes + # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ + volumes: + - name: ray-logs + emptyDir: {} + diff --git a/doc/usage/examples/kuberay/config/aw-raycluster.yaml b/doc/usage/examples/kuberay/config/aw-raycluster.yaml index bbef20d0d..932b004e9 100644 --- a/doc/usage/examples/kuberay/config/aw-raycluster.yaml +++ b/doc/usage/examples/kuberay/config/aw-raycluster.yaml @@ -1,25 +1,41 @@ apiVersion: mcad.ibm.com/v1beta1 kind: AppWrapper metadata: - name: raycluster-autoscaler + name: raycluster-complete namespace: default spec: resources: - Items: [] GenericItems: - replicas: 1 custompodresources: - - replicas: 2 + # Optional section that specifies resource requirements + # for non-standard k8s resources, follows same format as + # that of standard k8s resources. + # Each item in the custompodresources stanza should include resources consumed by target Item. 
+ # In this example, the 2 items correspond to 1 Ray head pod and 1 Ray worker pod + - replicas: 1 + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 requests: - cpu: 10 - memory: 512Mi + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + # The replica should match the number of worker pods + - replicas: 1 limits: - cpu: 10 - memory: 1G + cpu: 8 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 8 + memory: 8G + nvidia.com/gpu: 0 generictemplate: - # This config demonstrates KubeRay's Ray autoscaler integration. # The resource requests and limits in this config are too small for production! - # For an example with more realistic resource configuration, see + # For examples with more realistic resource configuration, see + # ray-cluster.complete.large.yaml and # ray-cluster.autoscaler.large.yaml. apiVersion: ray.io/v1alpha1 kind: RayCluster @@ -27,66 +43,29 @@ spec: labels: controller-tools.k8s.io: "1.0" # A unique identifier for the head node and workers of this cluster. - name: raycluster-autoscaler + name: raycluster-complete spec: - # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '2.0.0' - # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. - # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 - # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - enableInTreeAutoscaling: true - # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. - # The example configuration shown below below represents the DEFAULT values. - # (You may delete autoscalerOptions if the defaults are suitable.) - autoscalerOptions: - # upscalingMode is "Default" or "Aggressive." - # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster. - # Default: Upscaling is not rate-limited. - # Aggressive: An alias for Default; upscaling is not rate-limited. - upscalingMode: Default - # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. - idleTimeoutSeconds: 60 - # image optionally overrides the autoscaler's container image. - # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as - # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. - ## image: "my-repo/my-custom-autoscaler-image:tag" - # imagePullPolicy optionally overrides the autoscaler container's image pull policy. - imagePullPolicy: Always - # resources specifies optional resource request and limit overrides for the autoscaler container. - # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. - resources: - limits: - cpu: "500m" - memory: "512Mi" - requests: - cpu: "500m" - memory: "512Mi" - ######################headGroupSpec################################# - # head group template and specs, (perhaps 'group' is not needed in the name) + rayVersion: '2.5.0' + # Ray head pod configuration headGroupSpec: - # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' + # Kubernetes Service Type. This is an optional field, and the default value is ClusterIP. + # Refer to https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types. 
serviceType: ClusterIP - # logical group name, for this called head-group, also can be functional - # pod type head or worker - # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block ... + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. rayStartParams: - # Flag "no-monitor" will be automatically set when autoscaling is enabled. dashboard-host: '0.0.0.0' - block: 'true' - # num-cpus: '1' # can be auto-completed from the limits - # Use `resources` to optionally specify custom resource annotations for the Ray node. - # The value of `resources` is a string-integer mapping. - # Currently, `resources` must be provided in the specific format demonstrated below: - # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' - #pod template + # pod template template: + metadata: + # Custom labels. NOTE: To avoid conflicts with KubeRay operator, do not define custom labels start with `raycluster`. + # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ + labels: {} spec: containers: - # The Ray head pod - name: ray-head - image: rayproject/ray:2.0.0 - imagePullPolicy: Always + image: rayproject/ray:2.5.0 ports: - containerPort: 6379 name: gcs @@ -98,59 +77,84 @@ spec: preStop: exec: command: ["/bin/sh","-c","ray stop"] + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # For an example with more realistic resource configuration, see + # ray-cluster.autoscaler.large.yaml. + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. resources: limits: - cpu: "1" - memory: "1G" + cpu: "2" + memory: "8G" requests: - cpu: "500m" - memory: "512Mi" + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. + cpu: "2" + memory: "8G" + volumes: + - name: ray-logs + emptyDir: {} workerGroupSpecs: # the pod replicas in this group typed worker - replicas: 1 minReplicas: 1 - maxReplicas: 300 + maxReplicas: 10 # logical group name, for this called small-group, also can be functional groupName: small-group - # if worker pods need to be added, we can simply increment the replicas - # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list - # the operator will remove pods from the list until the number of replicas is satisfied - # when a pod is confirmed to be deleted, its name will be removed from the list below + # If worker pods need to be added, we can increment the replicas. + # If worker pods need to be removed, we decrement the replicas, and populate the workersToDelete list. + # The operator will remove pods from the list until the desired number of replicas is satisfied. + # If the difference between the current replica count and the desired replicas is greater than the + # number of entries in workersToDelete, random worker pods will be deleted. 
#scaleStrategy: # workersToDelete: # - raycluster-complete-worker-small-group-bdtwh # - raycluster-complete-worker-small-group-hv457 # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block ... - rayStartParams: - block: 'true' + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. + rayStartParams: {} #pod template template: - metadata: - labels: - key: value - # annotations for pod - annotations: - key: value spec: - initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - - name: init-myservice - image: busybox:1.28 - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] containers: - - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: rayproject/ray:2.0.0 - # environment variables to set in the container.Optional. - # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ + - name: ray-worker + image: rayproject/ray:2.5.0 lifecycle: preStop: exec: command: ["/bin/sh","-c","ray stop"] + # use volumeMounts.Optional. + # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # For an example with more realistic resource configuration, see + # ray-cluster.autoscaler.large.yaml. + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. resources: limits: - cpu: "1" - memory: "512Mi" + cpu: "8" + memory: "8G" + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. requests: - cpu: "500m" - memory: "256Mi" \ No newline at end of file + # For production use-cases, we recommend specifying integer CPU reqests and limits. + # We also recommend setting requests equal to limits for both CPU and memory. + cpu: "8" + # For production use-cases, we recommend allocating at least 8Gb memory for each Ray container. 
+ memory: "8G" + # use volumes + # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ + volumes: + - name: ray-logs + emptyDir: {} + diff --git a/doc/usage/examples/kuberay/kuberay-mcad.md b/doc/usage/examples/kuberay/kuberay-mcad.md index 922217e19..a15eb1b1f 100644 --- a/doc/usage/examples/kuberay/kuberay-mcad.md +++ b/doc/usage/examples/kuberay/kuberay-mcad.md @@ -4,13 +4,54 @@ This integration will help in queuing on [kuberay](https://github.com/ray-projec #### Prerequisites -- kubernetes or Openshift cluster -- Install MCAD using instructions present under `deployment` directory -- Make sure MCAD has clusterrole to create ray resources, please patch using configuration file present in `config` directory with name `xqueuejob-controller.yaml` +- Kubernetes(see [KinD](https://kind.sigs.k8s.io/)) or Openshift cluster(see [OpenShift Local](https://developers.redhat.com/products/openshift-local/overview)) +- Kubernetes client tools such as [kubectl](https://kubernetes.io/docs/tasks/tools/) or [OpenShift CLI](https://docs.openshift.com/container-platform/4.13/cli_reference/openshift_cli/getting-started-cli.html) +- [Helm](https://helm.sh/docs/intro/install/) +- Install MCAD and KubeRay operators: + - KinD cluster: + + Install the stable release of MCAD opeartor from local charts + ```bash + git clone https://github.com/project-codeflare/multi-cluster-app-dispatcher + cd multi-cluster-app-dispatcher + helm install mcad --set image.repository=quay.io/project-codeflare/mcad-controller --set image.tag=stable deployment/mcad-controller + ``` + + Make sure MCAD has clusterrole to create ray resources, please patch using [xqueuejob-controller.yaml](doc/usage/examples/kuberay/config/xqueuejob-controller.yaml). For example: + ``` + kubectl apply -f doc/usage/examples/kuberay/config/xqueuejob-controller.yaml + ``` + + See [deployment.md](../../../../doc/deploy/deployment.md) for more options. + + Install kuberay operator using the [instructions](https://github.com/ray-project/kuberay#quick-start). For example, install kuberay v0.6.0 from remote helm repo: + ``` + helm repo add kuberay https://ray-project.github.io/kuberay-helm/ + helm install kuberay-operator kuberay/kuberay-operator --version 0.6.0 + ``` + + - OpenShift cluster: + + On OpenShift, MCAD and KubeRay are already part of the Open Data Hub Distributed Workload Stack. The stack provides a simple, user-friendly abstraction for scaling, queuing and resource management of distributed AI/ML and Python workloads. Please follow the Quick Start in the [Distributed Workloads](https://github.com/opendatahub-io/distributed-workloads) for installation. 
+
 #### Steps
-- Install kuberay operator from [link](https://docs.ray.io/en/latest/cluster/kubernetes/getting-started.html#deploying-the-kuberay-operator)
-- Submit ray cluster to MCAD as appwrapper using the config file `aw-raycluster.yaml` present in the `config` directory using command `kubectl create -f aw-raycluster.yaml`
-- Check the status of the appwrapper using command `kubectl describe appwrapper `
-- Check running pods using command `kubectl get pods -n `
\ No newline at end of file
+
+- Submit the RayCluster custom resource to MCAD as an AppWrapper using the [aw-raycluster.yaml](doc/usage/examples/kuberay/config/aw-raycluster.yaml) example:
+  ```bash
+  kubectl create -f doc/usage/examples/kuberay/config/aw-raycluster.yaml
+  ```
+- Check the status of the AppWrapper custom resource using the command:
+  ```bash
+  kubectl describe appwrapper raycluster-complete -n default
+  ```
+- Check that the RayCluster is ready using the command:
+  ```bash
+  kubectl get raycluster -n default
+  ```
+  Expected output:
+  ```
+  NAME                  DESIRED WORKERS   AVAILABLE WORKERS   STATUS   AGE
+  raycluster-complete   1                 1                   ready    6m45s
+  ```
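+- Check that the Ray head and worker pods are running:
+  ```bash
+  kubectl get pods -n default
+  ```
+- (Optional) Smoke-test the cluster. The commands below are a minimal sketch: they assume KubeRay's default head service naming (`<cluster-name>-head-svc`) and a local Ray CLI (`pip install "ray[default]"`) matching the cluster's Ray version:
+  ```bash
+  kubectl port-forward svc/raycluster-complete-head-svc 8265:8265 &
+  ray job submit --address http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
+  ```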