diff --git a/demo-notebooks/additional-demos/hf_interactive.ipynb b/demo-notebooks/additional-demos/hf_interactive.ipynb index 37216b5d5..9181f3aac 100644 --- a/demo-notebooks/additional-demos/hf_interactive.ipynb +++ b/demo-notebooks/additional-demos/hf_interactive.ipynb @@ -68,7 +68,7 @@ "id": "bc27f84c", "metadata": {}, "source": [ - "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).\n", + "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding Ray Cluster).\n", "\n", "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n", "The example here is a community image." @@ -89,9 +89,10 @@ } ], "source": [ - "# Create our cluster and submit appwrapper\n", + "# Create our cluster and submit\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(name='hfgputest', \n", - " namespace=\"default\",\n", + " namespace=\"default\", # Update to your namespace\n", " num_workers=1,\n", " min_cpus=8, \n", " max_cpus=8, \n", @@ -99,7 +100,9 @@ " max_memory=16, \n", " num_gpus=4,\n", " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))" + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", + " ))" ] }, { @@ -107,7 +110,7 @@ "id": "12eef53c", "metadata": {}, "source": [ - "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster 
AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster." + "Next, we want to bring our cluster up, so we call the `up()` function below to submit our Ray Cluster onto the queue, and begin the process of obtaining our resource cluster." ] }, { diff --git a/demo-notebooks/additional-demos/local_interactive.ipynb b/demo-notebooks/additional-demos/local_interactive.ipynb index 674a655ea..36adfb500 100644 --- a/demo-notebooks/additional-demos/local_interactive.ipynb +++ b/demo-notebooks/additional-demos/local_interactive.ipynb @@ -48,13 +48,12 @@ }, "outputs": [], "source": [ - "# Create our cluster and submit appwrapper\n", - "namespace = \"default\"\n", + "# Create and submit our cluster\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", + "namespace = \"default\" # Update to your namespace\n", "cluster_name = \"hfgputest-1\"\n", - "local_interactive = True\n", "\n", - "cluster = Cluster(ClusterConfiguration(local_interactive=local_interactive,\n", - " namespace=namespace,\n", + "cluster = Cluster(ClusterConfiguration(namespace=namespace,\n", " name=cluster_name,\n", " num_workers=1,\n", " min_cpus=1,\n", @@ -63,8 +62,9 @@ " max_memory=4,\n", " num_gpus=0,\n", " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=False,\n", - " machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))" + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", + " ))" ] }, { @@ -117,9 +117,8 @@ "source": [ "from codeflare_sdk import generate_cert\n", "\n", - "if local_interactive:\n", - " generate_cert.generate_tls_cert(cluster_name, namespace)\n", - " generate_cert.export_env(cluster_name, namespace)" + "generate_cert.generate_tls_cert(cluster_name, namespace)\n", + 
"generate_cert.export_env(cluster_name, namespace)" ] }, { @@ -339,7 +338,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.17" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/demo-notebooks/guided-demos/2_job_client.ipynb b/demo-notebooks/additional-demos/ray_job_client.ipynb similarity index 93% rename from demo-notebooks/guided-demos/2_job_client.ipynb rename to demo-notebooks/additional-demos/ray_job_client.ipynb index 7b3d619b4..e3d90cd39 100644 --- a/demo-notebooks/guided-demos/2_job_client.ipynb +++ b/demo-notebooks/additional-demos/ray_job_client.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this third demo we will go over the basics of the Ray Job Submission Client in the SDK" + "In this demo we will go over the basics of the RayJobClient in the SDK" ] }, { @@ -26,7 +26,6 @@ "# Create authentication object for user permissions\n", "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n", "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n", - "\n", "auth_token = \"XXXXX\" # The auth_token is used later for the RayJobClient\n", "auth = TokenAuthentication(\n", " token = auth_token,\n", @@ -43,16 +42,18 @@ "outputs": [], "source": [ "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='jobtest',\n", - " namespace='default',\n", + " namespace='default', # Update to your namespace\n", " num_workers=2,\n", " min_cpus=1,\n", " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", " num_gpus=0,\n", - " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\"\n", + " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", + " write_to_file=False # 
When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", "))" ] }, diff --git a/demo-notebooks/guided-demos/0_basic_ray.ipynb b/demo-notebooks/guided-demos/0_basic_ray.ipynb index 205f02175..6a3b37108 100644 --- a/demo-notebooks/guided-demos/0_basic_ray.ipynb +++ b/demo-notebooks/guided-demos/0_basic_ray.ipynb @@ -5,7 +5,7 @@ "id": "8d4a42f6", "metadata": {}, "source": [ - "In this first notebook, we will go through the basics of using the SDK to:\n", + "In this notebook, we will go through the basics of using the SDK to:\n", " - Spin up a Ray cluster with our desired resources\n", " - View the status and specs of our Ray cluster\n", " - Take down the Ray cluster when finished" @@ -45,7 +45,7 @@ "id": "bc27f84c", "metadata": {}, "source": [ - "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).\n", + "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding RayCluster).\n", "\n", "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n", "The example here is a community image." 
@@ -58,10 +58,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Create and configure our cluster object (and appwrapper)\n", + "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='raytest',\n", - " namespace='default',\n", + " namespace='default', # Update to your namespace\n", " num_workers=2,\n", " min_cpus=1,\n", " max_cpus=1,\n", @@ -69,7 +70,8 @@ " max_memory=4,\n", " num_gpus=0,\n", " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=False\n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" ] }, @@ -78,7 +80,7 @@ "id": "12eef53c", "metadata": {}, "source": [ - "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster." + "Next, we want to bring our cluster up, so we call the `up()` function below to submit our Ray Cluster onto the queue, and begin the process of obtaining our resource cluster." ] }, { diff --git a/demo-notebooks/guided-demos/1_basic_instascale.ipynb b/demo-notebooks/guided-demos/1_basic_instascale.ipynb deleted file mode 100644 index 418737eb6..000000000 --- a/demo-notebooks/guided-demos/1_basic_instascale.ipynb +++ /dev/null @@ -1,177 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "9865ee8c", - "metadata": {}, - "source": [ - "In this second notebook, we will go over the basics of using InstaScale to scale up/down necessary resources that are not currently available on your OpenShift Cluster (in cloud environments)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", - "metadata": {}, - "outputs": [], - "source": [ - "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "614daa0c", - "metadata": {}, - "outputs": [], - "source": [ - "# Create authentication object for user permissions\n", - "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n", - "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n", - "auth = TokenAuthentication(\n", - " token = \"XXXXX\",\n", - " server = \"XXXXX\",\n", - " skip_tls=False\n", - ")\n", - "auth.login()" - ] - }, - { - "cell_type": "markdown", - "id": "bc27f84c", - "metadata": {}, - "source": [ - "This time, we are working in a cloud environment, and our OpenShift cluster does not have the resources needed for our desired workloads. We will use InstaScale to dynamically scale-up guaranteed resources based on our request (that will also automatically scale-down when we are finished working):\n", - "\n", - "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n", - "The example here is a community image." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f4bc870-091f-4e11-9642-cba145710159", - "metadata": {}, - "outputs": [], - "source": [ - "# Create and configure our cluster object (and appwrapper)\n", - "cluster = Cluster(ClusterConfiguration(\n", - " name='instascaletest',\n", - " namespace='default',\n", - " num_workers=2,\n", - " min_cpus=2,\n", - " max_cpus=2,\n", - " min_memory=8,\n", - " max_memory=8,\n", - " num_gpus=1,\n", - " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=True, # InstaScale now enabled, will scale OCP cluster to guarantee resource request\n", - " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Head, worker AWS machine types desired\n", - "))" - ] - }, - { - "cell_type": "markdown", - "id": "12eef53c", - "metadata": {}, - "source": [ - "Same as last time, we will bring the cluster up, wait for it to be ready, and confirm that the specs are as-requested:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", - "metadata": {}, - "outputs": [], - "source": [ - "# Bring up the cluster\n", - "cluster.up()\n", - "cluster.wait_ready()" - ] - }, - { - "cell_type": "markdown", - "id": "6abfe904", - "metadata": {}, - "source": [ - "While the resources are being scaled, we can also go into the console and take a look at the InstaScale logs, as well as the new machines/nodes spinning up.\n", - "\n", - "Once the cluster is ready, we can confirm the specs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.details()" - ] - }, - { - "cell_type": "markdown", - "id": "5af8cd32", - "metadata": {}, - "source": [ - "Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.down()" - ] - }, - { - "cell_type": "markdown", - "id": "c883caea", - "metadata": {}, - "source": [ - "Once again, we can look at the machines/nodes and see that everything has been successfully scaled down!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d41b90e", - "metadata": {}, - "outputs": [], - "source": [ - "auth.logout()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/demo-notebooks/guided-demos/preview_nbs/2_job_client.ipynb b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb similarity index 67% rename from demo-notebooks/guided-demos/preview_nbs/2_job_client.ipynb rename to demo-notebooks/guided-demos/1_cluster_job_client.ipynb index 7b3d619b4..e46dc62a9 100644 --- a/demo-notebooks/guided-demos/preview_nbs/2_job_client.ipynb +++ b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this third demo we will go over the basics of the Ray Job Submission Client in the SDK" + "In this demo we will go over the basics of the Ray Job Submission Client in the SDK" ] }, { @@ -14,7 +14,7 @@ "outputs": [], "source": [ "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, RayJobClient" + "from codeflare_sdk 
import Cluster, ClusterConfiguration, TokenAuthentication" ] }, { @@ -26,10 +26,8 @@ "# Create authentication object for user permissions\n", "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n", "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n", - "\n", - "auth_token = \"XXXXX\" # The auth_token is used later for the RayJobClient\n", "auth = TokenAuthentication(\n", - " token = auth_token,\n", + " token = \"XXXXX\",\n", " server = \"XXXXX\",\n", " skip_tls=False\n", ")\n", @@ -43,16 +41,19 @@ "outputs": [], "source": [ "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='jobtest',\n", - " namespace='default',\n", + " namespace='default', # Update to your namespace\n", " num_workers=2,\n", " min_cpus=1,\n", " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", " num_gpus=0,\n", - " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\"\n", + " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" ] }, @@ -80,14 +81,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Ray Job Submission - Authorized Ray Cluster" + "### Ray Job Submission" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "* Submit a job using an authorized Ray dashboard and the Job Submission Client\n", + "* Initialise the Cluster Job Client \n", "* Provide an entrypoint command directed to your job script\n", "* Set up your runtime environment" ] @@ -98,16 +99,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Gather the dashboard URL\n", - "ray_dashboard = 
cluster.cluster_dashboard_uri()\n", - "\n", - "# Create the header for passing your bearer token\n", - "header = {\n", - " 'Authorization': f'Bearer {auth_token}'\n", - "}\n", - "\n", - "# Initialize the RayJobClient\n", - "client = RayJobClient(address=ray_dashboard, headers=header, verify=True)" + "# Initialize the Job Submission Client\n", + "\"\"\"\n", + "The SDK will automatically gather the dashboard address and authenticate using the Ray Job Submission Client\n", + "\"\"\"\n", + "client = cluster.job_client" ] }, { @@ -116,7 +112,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Submit an example mnist job using the RayJobClient\n", + "# Submit an example mnist job using the Job Submission Client\n", "submission_id = client.submit_job(\n", " entrypoint=\"python mnist.py\",\n", " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n", @@ -186,60 +182,6 @@ "client.delete_job(submission_id)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Unauthorized Ray Cluster with the Ray Job Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Initialise the RayJobClient with the Ray Dashboard\n", - "\"\"\"\n", - "ray_dashboard = cluster.cluster_dashboard_uri()\n", - "client = RayJobClient(address=ray_dashboard, verify=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Submit an example mnist job using the RayJobClient\n", - "submission_id = client.submit_job(\n", - " entrypoint=\"python mnist.py\",\n", - " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n", - ")\n", - "print(submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Stop the job \n", - "client.stop_job(submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": 
[], - "source": [ - "# Delete the job\n", - "client.delete_job(submission_id)" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/demo-notebooks/guided-demos/3_basic_interactive.ipynb b/demo-notebooks/guided-demos/2_basic_interactive.ipynb similarity index 89% rename from demo-notebooks/guided-demos/3_basic_interactive.ipynb rename to demo-notebooks/guided-demos/2_basic_interactive.ipynb index 090a4a305..943425a89 100644 --- a/demo-notebooks/guided-demos/3_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/2_basic_interactive.ipynb @@ -5,7 +5,7 @@ "id": "bbc21043", "metadata": {}, "source": [ - "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development." + "In this notebook, we will go over how to leverage the SDK to directly work interactively with a Ray Cluster during development." ] }, { @@ -55,10 +55,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Create and configure our cluster object (and appwrapper)\n", + "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", + "namespace = \"default\" # Update to your namespace\n", + "cluster_name = \"interactivetest\"\n", "cluster = Cluster(ClusterConfiguration(\n", - " name='interactivetest',\n", - " namespace='default',\n", + " name=cluster_name,\n", + " namespace=namespace,\n", " num_workers=2,\n", " min_cpus=2,\n", " max_cpus=2,\n", @@ -66,9 +69,8 @@ " max_memory=8,\n", " num_gpus=1,\n", " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=True, #<---instascale enabled\n", - " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n", - " \n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify the 
local queue manually\n", "))" ] }, @@ -125,6 +127,19 @@ "Now we can connect directly to our Ray cluster via the Ray python client:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9436436", + "metadata": {}, + "outputs": [], + "source": [ + "from codeflare_sdk import generate_cert\n", + "# Create required TLS cert and export the environment variables to enable TLS\n", + "generate_cert.generate_tls_cert(cluster_name, namespace)\n", + "generate_cert.export_env(cluster_name, namespace)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb index d8f6c34c4..6771e9d7a 100644 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb +++ b/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb @@ -5,7 +5,7 @@ "id": "8d4a42f6", "metadata": {}, "source": [ - "In this first notebook, we will go through the basics of using the SDK to:\n", + "In this notebook, we will go through the basics of using the SDK to:\n", " - Spin up a Ray cluster with our desired resources\n", " - View the status and specs of our Ray cluster\n", " - Take down the Ray cluster when finished" @@ -45,7 +45,7 @@ "id": "bc27f84c", "metadata": {}, "source": [ - "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).\n", + "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding RayCluster).\n", "\n", "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n", "The example here is a community image." 
@@ -66,10 +66,11 @@ } ], "source": [ - "# Create and configure our cluster object (and appwrapper)\n", + "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='raytest',\n", - " namespace='default',\n", + " namespace='default', # Update to your namespace\n", " num_workers=2,\n", " min_cpus=1,\n", " max_cpus=1,\n", @@ -77,7 +78,8 @@ " max_memory=4,\n", " num_gpus=0,\n", " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=False\n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" ] }, @@ -86,7 +88,7 @@ "id": "12eef53c", "metadata": {}, "source": [ - "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster." + "Next, we want to bring our cluster up, so we call the `up()` function below to submit our Ray Cluster onto the queue, and begin the process of obtaining our resource cluster." 
] }, { @@ -354,7 +356,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.17" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/1_basic_instascale.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/1_basic_instascale.ipynb deleted file mode 100644 index 4b28b2058..000000000 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/1_basic_instascale.ipynb +++ /dev/null @@ -1,252 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "9865ee8c", - "metadata": {}, - "source": [ - "In this second notebook, we will go over the basics of using InstaScale to scale up/down necessary resources that are not currently available on your OpenShift Cluster (in cloud environments)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", - "metadata": {}, - "outputs": [], - "source": [ - "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "614daa0c", - "metadata": {}, - "outputs": [], - "source": [ - "# Create authentication object for user permissions\n", - "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n", - "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n", - "auth = TokenAuthentication(\n", - " token = \"XXXXX\",\n", - " server = \"XXXXX\",\n", - " skip_tls=False\n", - ")\n", - "auth.login()" - ] - }, - { - "cell_type": "markdown", - "id": "bc27f84c", - "metadata": {}, - "source": [ - "This time, we are working in a cloud environment, and our OpenShift cluster does not have the resources needed for our desired workloads. 
We will use InstaScale to dynamically scale-up guaranteed resources based on our request (that will also automatically scale-down when we are finished working):\n", - "\n", - "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n", - "The example here is a community image." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "0f4bc870-091f-4e11-9642-cba145710159", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Written to: instascaletest.yaml\n" - ] - } - ], - "source": [ - "# Create and configure our cluster object (and appwrapper)\n", - "cluster = Cluster(ClusterConfiguration(\n", - " name='instascaletest',\n", - " namespace='default',\n", - " num_workers=2,\n", - " min_cpus=2,\n", - " max_cpus=2,\n", - " min_memory=8,\n", - " max_memory=8,\n", - " num_gpus=1,\n", - " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=True, # InstaScale now enabled, will scale OCP cluster to guarantee resource request\n", - " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Head, worker AWS machine types desired\n", - "))" - ] - }, - { - "cell_type": "markdown", - "id": "12eef53c", - "metadata": {}, - "source": [ - "Same as last time, we will bring the cluster up, wait for it to be ready, and confirm that the specs are as-requested:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Waiting for requested resources to be set up...\n", - "Requested cluster up and running!\n" - ] - } - ], - "source": [ - "# Bring up the cluster\n", - "cluster.up()\n", - "cluster.wait_ready()" - ] - }, - { - "cell_type": "markdown", - "id": "6abfe904", - "metadata": {}, - "source": [ - "While the resources are being scaled, we can also go into the console and take a 
look at the InstaScale logs, as well as the new machines/nodes spinning up.\n", - "\n", - "Once the cluster is ready, we can confirm the specs:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
                     🚀 CodeFlare Cluster Details 🚀                     \n",
-       "                                                                         \n",
-       " ╭─────────────────────────────────────────────────────────────────────╮ \n",
-       " │   Name                                                              │ \n",
-       " │   instascaletest                                        Active ✅   │ \n",
-       " │                                                                     │ \n",
-       " │   URI: ray://instascaletest-head-svc.default.svc:10001              │ \n",
-       " │                                                                     │ \n",
-       " │   Dashboard🔗                                                       │ \n",
-       " │                                                                     │ \n",
-       " │                       Cluster Resources                             │ \n",
-       " │   ╭── Workers ──╮  ╭───────── Worker specs(each) ─────────╮         │ \n",
-       " │   │  # Workers  │  │  Memory      CPU         GPU         │         │ \n",
-       " │   │             │  │                                      │         │ \n",
-       " │   │  2          │  │  8~8         2           1           │         │ \n",
-       " │   │             │  │                                      │         │ \n",
-       " │   ╰─────────────╯  ╰──────────────────────────────────────╯         │ \n",
-       " ╰─────────────────────────────────────────────────────────────────────╯ \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ CodeFlare Cluster Details ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", - "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", - " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", - " โ”‚ \u001b[1;37;42mName\u001b[0m โ”‚ \n", - " โ”‚ \u001b[1;4minstascaletest\u001b[0m Active โœ… โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b[1mURI:\u001b[0m ray://instascaletest-head-svc.default.svc:10001 โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b]8;id=65933;http://ray-dashboard-instascaletest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", - " โ”‚ โ•ญโ”€โ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", - " โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1m# Workers\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m8~8 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m 
\u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ \n", - " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "RayCluster(name='instascaletest', status=, workers=2, worker_mem_min=8, worker_mem_max=8, worker_cpu=2, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-instascaletest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org')" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cluster.details()" - ] - }, - { - "cell_type": "markdown", - "id": "5af8cd32", - "metadata": {}, - "source": [ - "Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.down()" - ] - }, - { - "cell_type": "markdown", - "id": "c883caea", - "metadata": {}, - "source": [ - "Once again, we can look at the machines/nodes and see that everything has been successfully scaled down!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d41b90e", - "metadata": {}, - "outputs": [], - "source": [ - "auth.logout()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.17" - }, - "vscode": { - "interpreter": { - "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb new file mode 100644 index 000000000..e46dc62a9 --- /dev/null +++ b/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this demo we will go over the basics of the Ray Job Submission Client in the SDK" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import pieces from codeflare-sdk\n", + "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create authentication object for user permissions\n", + "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n", + "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n", + "auth = TokenAuthentication(\n", + " token = \"XXXXX\",\n", + " server = \"XXXXX\",\n", + " skip_tls=False\n", + ")\n", + "auth.login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", + "cluster = Cluster(ClusterConfiguration(\n", + " name='jobtest',\n", + " namespace='default', # Update to your namespace\n", + " num_workers=2,\n", + " min_cpus=1,\n", + " max_cpus=1,\n", + " min_memory=4,\n", + " max_memory=4,\n", + " num_gpus=0,\n", + " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bring up the cluster\n", + "cluster.up()\n", + "cluster.wait_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.details()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ray Job Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Initialise the Cluster Job Client \n", + "* Provide an entrypoint command directed to your job script\n", + "* Set up your runtime environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the Job Submission Client\n", + "\"\"\"\n", + "The SDK will automatically gather the dashboard address and authenticate using the Ray Job Submission Client\n", + "\"\"\"\n", + "client = cluster.job_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Submit an example mnist job using the Job Submission Client\n", + "submission_id = client.submit_job(\n", + 
" entrypoint=\"python mnist.py\",\n", + " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n", + ")\n", + "print(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the job's logs\n", + "client.get_job_logs(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the job's status\n", + "client.get_job_status(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get job related info\n", + "client.get_job_info(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all existing jobs\n", + "client.list_jobs()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Iterate through the logs of a job \n", + "async for lines in client.tail_job_logs(submission_id):\n", + " print(lines, end=\"\") " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Delete a job\n", + "# Can run client.cancel_job(submission_id) first if job is still running\n", + "client.delete_job(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.down()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "auth.logout()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } 
+ }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb similarity index 99% rename from demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb rename to demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb index 7ac004706..62d34f3f8 100644 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb @@ -5,7 +5,7 @@ "id": "bbc21043", "metadata": {}, "source": [ - "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development." + "In this notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development." ] }, { @@ -63,10 +63,13 @@ } ], "source": [ - "# Create and configure our cluster object (and appwrapper)\n", + "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", + "namespace = \"default\" # Update to your namespace\n", + "cluster_name = \"interactivetest\"\n", "cluster = Cluster(ClusterConfiguration(\n", - " name='interactivetest',\n", - " namespace='default',\n", + " name=cluster_name,\n", + " namespace=namespace,\n", " num_workers=2,\n", " min_cpus=2,\n", " max_cpus=2,\n", @@ -74,9 +77,8 @@ " max_memory=8,\n", " num_gpus=1,\n", " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=True, #<---instascale enabled\n", - " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n", - " \n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify 
the local queue manually\n", "))" ] }, @@ -209,6 +211,19 @@ "Now we can connect directly to our Ray cluster via the Ray python client:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "13eb52f6", + "metadata": {}, + "outputs": [], + "source": [ + "from codeflare_sdk import generate_cert\n", + "# Create required TLS cert and export the environment variables to enable TLS\n", + "generate_cert.generate_tls_cert(cluster_name, namespace)\n", + "generate_cert.export_env(cluster_name, namespace)" + ] + }, { "cell_type": "code", "execution_count": 6, diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/2_job_client.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/2_job_client.ipynb deleted file mode 100644 index 75000ce46..000000000 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/2_job_client.ipynb +++ /dev/null @@ -1,430 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this third demo we will go over the basics of the Ray Job Submission Client in the SDK" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, RayJobClient" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create authentication object for user permissions\n", - "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n", - "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n", - "\n", - "auth_token = \"XXXXX\" # The auth_token is used later for the RayJobClient\n", - "auth = TokenAuthentication(\n", - " token = auth_token,\n", - " server = \"XXXXX\",\n", - " skip_tls=False\n", - ")\n", - "auth.login()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Yaml resources loaded for jobtest\n" - ] - } - ], - "source": [ - "# Create and configure our cluster object\n", - "cluster = Cluster(ClusterConfiguration(\n", - " name='jobtest',\n", - " namespace='default',\n", - " num_workers=2,\n", - " min_cpus=1,\n", - " max_cpus=1,\n", - " min_memory=4,\n", - " max_memory=4,\n", - " num_gpus=0,\n", - " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\"\n", - "))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Waiting for requested resources to be set up...\n", - "Requested cluster is up and running!\n", - "Dashboard is ready!\n" - ] - } - ], - "source": [ - "# Bring up the cluster\n", - "cluster.up()\n", - "cluster.wait_ready()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
                  🚀 CodeFlare Cluster Details 🚀                  \n",
-       "                                                                   \n",
-       " ╭───────────────────────────────────────────────────────────────╮ \n",
-       " │   Name                                                        │ \n",
-       " │   jobtest                                        Active ✅    │ \n",
-       " │                                                               │ \n",
-       " │   URI: ray://jobtest-head-svc.default.svc:10001               │ \n",
-       " │                                                               │ \n",
-       " │   Dashboard🔗                                                 │ \n",
-       " │                                                               │ \n",
-       " │                       Cluster Resources                       │ \n",
-       " │   ╭── Workers ──╮  ╭───────── Worker specs(each) ─────────╮   │ \n",
-       " │   │  # Workers  │  │  Memory      CPU         GPU         │   │ \n",
-       " │   │             │  │                                      │   │ \n",
-       " │   │  2          │  │  4~4         1           0           │   │ \n",
-       " │   │             │  │                                      │   │ \n",
-       " │   ╰─────────────╯  ╰──────────────────────────────────────╯   │ \n",
-       " ╰───────────────────────────────────────────────────────────────╯ \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ CodeFlare Cluster Details ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", - "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", - " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", - " โ”‚ \u001b[1;37;42mName\u001b[0m โ”‚ \n", - " โ”‚ \u001b[1;4mjobtest\u001b[0m Active โœ… โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b[1mURI:\u001b[0m ray://jobtest-head-svc.default.svc:10001 โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b]8;id=561347;https://ray-dashboard-jobtest-default.apps.rosa.mcampbel.af68.p3.openshiftapps.com\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", - " โ”‚ โ•ญโ”€โ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", - " โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1m# Workers\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m4~4 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m0 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m 
\u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ \n", - " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "RayCluster(name='jobtest', status=, head_cpus=2, head_mem=8, head_gpu=0, workers=2, worker_mem_min=4, worker_mem_max=4, worker_cpu=1, worker_gpu=0, namespace='default', dashboard='https://ray-dashboard-jobtest-default.apps.rosa.mcampbel.af68.p3.openshiftapps.com')" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cluster.details()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ray Job Submission - Authorized Ray Cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Submit a job using an authorized Ray dashboard and the Job Submission Client\n", - "* Provide an entrypoint command directed to your job script\n", - "* Set up your runtime environment" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Gather the dashboard URL\n", - "ray_dashboard = cluster.cluster_dashboard_uri()\n", - "\n", - "# Create the header for passing your bearer token\n", - "header = {\n", - " 'Authorization': f'Bearer {auth_token}'\n", - "}\n", - "\n", - "# Initialize the RayJobClient\n", - "client = RayJobClient(address=ray_dashboard, headers=header, verify=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", 
- "text": [ - "2024-04-03 12:16:07,112\tINFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_431abdedbcc7e123.zip.\n", - "2024-04-03 12:16:07,115\tINFO packaging.py:518 -- Creating a file package for local directory './'.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "raysubmit_NvXkkh1QP1kdq4LG\n" - ] - } - ], - "source": [ - "# Submit an example mnist job using the RayJobClient\n", - "submission_id = client.submit_job(\n", - " entrypoint=\"python mnist.py\",\n", - " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n", - ")\n", - "print(submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "''" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get the job's logs\n", - "client.get_job_logs(submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get the job's status\n", - "client.get_job_status(submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "JobDetails(type=, job_id=None, submission_id='raysubmit_NvXkkh1QP1kdq4LG', driver_info=None, status=, entrypoint='python mnist.py', message='Job has not started yet. 
It may be waiting for the runtime environment to be set up.', error_type=None, start_time=1712142968879, end_time=None, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_431abdedbcc7e123.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address=None, driver_node_id=None)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get job related info\n", - "client.get_job_info(submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[JobDetails(type=, job_id=None, submission_id='raysubmit_NvXkkh1QP1kdq4LG', driver_info=None, status=, entrypoint='python mnist.py', message='Job has not started yet. It may be waiting for the runtime environment to be set up.', error_type=None, start_time=1712142968879, end_time=None, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_431abdedbcc7e123.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address=None, driver_node_id=None)]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# List all existing jobs\n", - "client.list_jobs()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Iterate through the logs of a job \n", - "async for lines in client.tail_job_logs(submission_id):\n", - " print(lines, end=\"\") " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(True, 'Successfully deleted Job raysubmit_NvXkkh1QP1kdq4LG')" - ] - }, - 
"execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Delete a job\n", - "# Can run client.cancel_job(submission_id) first if job is still running\n", - "client.delete_job(submission_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Unauthorized Ray Cluster with the Ray Job Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Initialise the RayJobClient with the Ray Dashboard\n", - "\"\"\"\n", - "ray_dashboard = cluster.cluster_dashboard_uri()\n", - "client = RayJobClient(address=ray_dashboard, verify=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Submit an example mnist job using the RayJobClient\n", - "submission_id = client.submit_job(\n", - " entrypoint=\"python mnist.py\",\n", - " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n", - ")\n", - "print(submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Stop the job \n", - "client.stop_job(submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Delete the job\n", - "client.delete_job(submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "cluster.down()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "auth.logout()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": 
"ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/instascaletest.yaml b/demo-notebooks/guided-demos/notebook-ex-outputs/instascaletest.yaml deleted file mode 100644 index 8cb96a794..000000000 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/instascaletest.yaml +++ /dev/null @@ -1,185 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta1 -kind: AppWrapper -metadata: - labels: - orderedinstance: m5.xlarge_g4dn.xlarge - name: instascaletest - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 1 - replicas: 2 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 1 - generictemplate: - apiVersion: ray.io/v1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: instascaletest - controller-tools.k8s.io: '1.0' - name: instascaletest - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: instascaletest - operator: In - values: - - instascaletest - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: 
quay.io/project-codeflare/ray:latest-py39-cu118 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - imagePullSecrets: [] - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-instascaletest - maxReplicas: 2 - minReplicas: 2 - rayStartParams: - block: 'true' - num-gpus: '1' - replicas: 2 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: instascaletest - operator: In - values: - - instascaletest - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:latest-py39-cu118 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 1 - imagePullSecrets: [] - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: instascaletest-head-svc - name: ray-dashboard-instascaletest - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: instascaletest-head-svc - replicas: 1 - Items: [] diff --git a/demo-notebooks/guided-demos/preview_nbs/0_basic_ray.ipynb b/demo-notebooks/guided-demos/preview_nbs/0_basic_ray.ipynb 
index b0f12d4ba..6a3b37108 100644 --- a/demo-notebooks/guided-demos/preview_nbs/0_basic_ray.ipynb +++ b/demo-notebooks/guided-demos/preview_nbs/0_basic_ray.ipynb @@ -5,7 +5,7 @@ "id": "8d4a42f6", "metadata": {}, "source": [ - "In this first notebook, we will go through the basics of using the SDK to:\n", + "In this notebook, we will go through the basics of using the SDK to:\n", " - Spin up a Ray cluster with our desired resources\n", " - View the status and specs of our Ray cluster\n", " - Take down the Ray cluster when finished" @@ -45,7 +45,7 @@ "id": "bc27f84c", "metadata": {}, "source": [ - "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).\n", + "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding RayCluster).\n", "\n", "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n", "The example here is a community image." 
@@ -58,10 +58,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Create and configure our cluster object (and appwrapper)\n", + "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='raytest',\n", - " namespace='default',\n", + " namespace='default', # Update to your namespace\n", " num_workers=2,\n", " min_cpus=1,\n", " max_cpus=1,\n", @@ -69,7 +70,8 @@ " max_memory=4,\n", " num_gpus=0,\n", " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=False\n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" ] }, @@ -78,7 +80,7 @@ "id": "12eef53c", "metadata": {}, "source": [ - "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster." + "Next, we want to bring our cluster up, so we call the `up()` function below to submit our Ray Cluster onto the queue, and begin the process of obtaining our resource cluster." 
] }, { @@ -193,7 +195,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/demo-notebooks/guided-demos/preview_nbs/1_basic_instascale.ipynb b/demo-notebooks/guided-demos/preview_nbs/1_basic_instascale.ipynb deleted file mode 100644 index 418737eb6..000000000 --- a/demo-notebooks/guided-demos/preview_nbs/1_basic_instascale.ipynb +++ /dev/null @@ -1,177 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "9865ee8c", - "metadata": {}, - "source": [ - "In this second notebook, we will go over the basics of using InstaScale to scale up/down necessary resources that are not currently available on your OpenShift Cluster (in cloud environments)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", - "metadata": {}, - "outputs": [], - "source": [ - "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "614daa0c", - "metadata": {}, - "outputs": [], - "source": [ - "# Create authentication object for user permissions\n", - "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n", - "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n", - "auth = TokenAuthentication(\n", - " token = \"XXXXX\",\n", - " server = \"XXXXX\",\n", - " skip_tls=False\n", - ")\n", - "auth.login()" - ] - }, - { - "cell_type": "markdown", - "id": "bc27f84c", - "metadata": {}, - "source": [ - "This time, we are working in a cloud environment, and our OpenShift cluster does not have the resources needed for our desired workloads. 
We will use InstaScale to dynamically scale-up guaranteed resources based on our request (that will also automatically scale-down when we are finished working):\n", - "\n", - "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n", - "The example here is a community image." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f4bc870-091f-4e11-9642-cba145710159", - "metadata": {}, - "outputs": [], - "source": [ - "# Create and configure our cluster object (and appwrapper)\n", - "cluster = Cluster(ClusterConfiguration(\n", - " name='instascaletest',\n", - " namespace='default',\n", - " num_workers=2,\n", - " min_cpus=2,\n", - " max_cpus=2,\n", - " min_memory=8,\n", - " max_memory=8,\n", - " num_gpus=1,\n", - " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=True, # InstaScale now enabled, will scale OCP cluster to guarantee resource request\n", - " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Head, worker AWS machine types desired\n", - "))" - ] - }, - { - "cell_type": "markdown", - "id": "12eef53c", - "metadata": {}, - "source": [ - "Same as last time, we will bring the cluster up, wait for it to be ready, and confirm that the specs are as-requested:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", - "metadata": {}, - "outputs": [], - "source": [ - "# Bring up the cluster\n", - "cluster.up()\n", - "cluster.wait_ready()" - ] - }, - { - "cell_type": "markdown", - "id": "6abfe904", - "metadata": {}, - "source": [ - "While the resources are being scaled, we can also go into the console and take a look at the InstaScale logs, as well as the new machines/nodes spinning up.\n", - "\n", - "Once the cluster is ready, we can confirm the specs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", - "metadata": {}, - 
"outputs": [], - "source": [ - "cluster.details()" - ] - }, - { - "cell_type": "markdown", - "id": "5af8cd32", - "metadata": {}, - "source": [ - "Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.down()" - ] - }, - { - "cell_type": "markdown", - "id": "c883caea", - "metadata": {}, - "source": [ - "Once again, we can look at the machines/nodes and see that everything has been successfully scaled down!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d41b90e", - "metadata": {}, - "outputs": [], - "source": [ - "auth.logout()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb new file mode 100644 index 000000000..b20f920bd --- /dev/null +++ b/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this demo we will go over the basics of the Ray Job Submission Client in the SDK" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import 
pieces from codeflare-sdk\n", + "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create authentication object for user permissions\n", + "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n", + "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n", + "auth = TokenAuthentication(\n", + " token = \"XXXXX\",\n", + " server = \"XXXXX\",\n", + " skip_tls=False\n", + ")\n", + "auth.login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", + "cluster = Cluster(ClusterConfiguration(\n", + " name='jobtest',\n", + " namespace='default', # Update to your namespace\n", + " num_workers=2,\n", + " min_cpus=1,\n", + " max_cpus=1,\n", + " min_memory=4,\n", + " max_memory=4,\n", + " num_gpus=0,\n", + " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n", + " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bring up the cluster\n", + "cluster.up()\n", + "cluster.wait_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.details()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ray Job Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Initialise the Cluster Job 
Client \n", + "* Provide an entrypoint command directed to your job script\n", + "* Set up your runtime environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the Job Submission Client\n", + "\"\"\"\n", + "The SDK will automatically gather the dashboard address and authenticate using the Ray Job Submission Client\n", + "\"\"\"\n", + "client = cluster.job_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Submit an example mnist job using the Job Submission Client\n", + "submission_id = client.submit_job(\n", + " entrypoint=\"python mnist.py\",\n", + " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n", + ")\n", + "print(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the job's logs\n", + "client.get_job_logs(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the job's status\n", + "client.get_job_status(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get job related info\n", + "client.get_job_info(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all existing jobs\n", + "client.list_jobs()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Iterate through the logs of a job \n", + "async for lines in client.tail_job_logs(submission_id):\n", + " print(lines, end=\"\") " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Delete a job\n", + "# Can run client.cancel_job(submission_id) first if job is still running\n", + 
"client.delete_job(submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.down()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "auth.logout()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/demo-notebooks/guided-demos/preview_nbs/3_basic_interactive.ipynb b/demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb similarity index 89% rename from demo-notebooks/guided-demos/preview_nbs/3_basic_interactive.ipynb rename to demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb index 090a4a305..28e05a26a 100644 --- a/demo-notebooks/guided-demos/preview_nbs/3_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb @@ -5,7 +5,7 @@ "id": "bbc21043", "metadata": {}, "source": [ - "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development." + "In this notebook, we will go over how to leverage the SDK to directly work interactively with a Ray Cluster during development." 
] }, { @@ -55,10 +55,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Create and configure our cluster object (and appwrapper)\n", + "# Create and configure our cluster object\n", + "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", + "namespace = \"default\" # Update to your namespace\n", + "cluster_name = \"interactivetest\"\n", "cluster = Cluster(ClusterConfiguration(\n", - " name='interactivetest',\n", - " namespace='default',\n", + " name=cluster_name,\n", + " namespace=namespace,\n", " num_workers=2,\n", " min_cpus=2,\n", " max_cpus=2,\n", @@ -66,9 +69,8 @@ " max_memory=8,\n", " num_gpus=1,\n", " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n", - " instascale=True, #<---instascale enabled\n", - " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n", - " \n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" ] }, @@ -125,6 +127,19 @@ "Now we can connect directly to our Ray cluster via the Ray python client:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5308271", + "metadata": {}, + "outputs": [], + "source": [ + "from codeflare_sdk import generate_cert\n", + "# Create required TLS cert and export the environment variables to enable TLS\n", + "generate_cert.generate_tls_cert(cluster_name, namespace)\n", + "generate_cert.export_env(cluster_name, namespace)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -295,7 +310,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.18" }, "vscode": { "interpreter": {