diff --git a/demo-notebooks/additional-demos/hf_interactive.ipynb b/demo-notebooks/additional-demos/hf_interactive.ipynb
index 37216b5d5..9181f3aac 100644
--- a/demo-notebooks/additional-demos/hf_interactive.ipynb
+++ b/demo-notebooks/additional-demos/hf_interactive.ipynb
@@ -68,7 +68,7 @@
"id": "bc27f84c",
"metadata": {},
"source": [
- "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).\n",
+ "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding Ray Cluster).\n",
"\n",
"NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n",
"The example here is a community image."
@@ -89,9 +89,10 @@
}
],
"source": [
- "# Create our cluster and submit appwrapper\n",
+ "# Create our cluster and submit\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(name='hfgputest', \n",
- " namespace=\"default\",\n",
+ " namespace=\"default\", # Update to your namespace\n",
" num_workers=1,\n",
" min_cpus=8, \n",
" max_cpus=8, \n",
@@ -99,7 +100,9 @@
" max_memory=16, \n",
" num_gpus=4,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))"
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
+ " ))"
]
},
{
@@ -107,7 +110,7 @@
"id": "12eef53c",
"metadata": {},
"source": [
- "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster."
+ "Next, we want to bring our cluster up, so we call the `up()` function below to submit our Ray Cluster onto the queue, and begin the process of obtaining our resource cluster."
]
},
{
diff --git a/demo-notebooks/additional-demos/local_interactive.ipynb b/demo-notebooks/additional-demos/local_interactive.ipynb
index 674a655ea..36adfb500 100644
--- a/demo-notebooks/additional-demos/local_interactive.ipynb
+++ b/demo-notebooks/additional-demos/local_interactive.ipynb
@@ -48,13 +48,12 @@
},
"outputs": [],
"source": [
- "# Create our cluster and submit appwrapper\n",
- "namespace = \"default\"\n",
+ "# Create and submit our cluster\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
+ "namespace = \"default\" # Update to your namespace\n",
"cluster_name = \"hfgputest-1\"\n",
- "local_interactive = True\n",
"\n",
- "cluster = Cluster(ClusterConfiguration(local_interactive=local_interactive,\n",
- " namespace=namespace,\n",
+ "cluster = Cluster(ClusterConfiguration(namespace=namespace,\n",
" name=cluster_name,\n",
" num_workers=1,\n",
" min_cpus=1,\n",
@@ -63,8 +62,9 @@
" max_memory=4,\n",
" num_gpus=0,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=False,\n",
- " machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))"
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
+ " ))"
]
},
{
@@ -117,9 +117,8 @@
"source": [
"from codeflare_sdk import generate_cert\n",
"\n",
- "if local_interactive:\n",
- " generate_cert.generate_tls_cert(cluster_name, namespace)\n",
- " generate_cert.export_env(cluster_name, namespace)"
+ "generate_cert.generate_tls_cert(cluster_name, namespace)\n",
+ "generate_cert.export_env(cluster_name, namespace)"
]
},
{
@@ -339,7 +338,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.17"
+ "version": "3.9.18"
},
"vscode": {
"interpreter": {
diff --git a/demo-notebooks/guided-demos/2_job_client.ipynb b/demo-notebooks/additional-demos/ray_job_client.ipynb
similarity index 93%
rename from demo-notebooks/guided-demos/2_job_client.ipynb
rename to demo-notebooks/additional-demos/ray_job_client.ipynb
index 7b3d619b4..e3d90cd39 100644
--- a/demo-notebooks/guided-demos/2_job_client.ipynb
+++ b/demo-notebooks/additional-demos/ray_job_client.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "In this third demo we will go over the basics of the Ray Job Submission Client in the SDK"
+ "In this demo we will go over the basics of the RayJobClient in the SDK"
]
},
{
@@ -26,7 +26,6 @@
"# Create authentication object for user permissions\n",
"# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
"# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
- "\n",
"auth_token = \"XXXXX\" # The auth_token is used later for the RayJobClient\n",
"auth = TokenAuthentication(\n",
" token = auth_token,\n",
@@ -43,16 +42,18 @@
"outputs": [],
"source": [
"# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name='jobtest',\n",
- " namespace='default',\n",
+ " namespace='default', # Update to your namespace\n",
" num_workers=2,\n",
" min_cpus=1,\n",
" max_cpus=1,\n",
" min_memory=4,\n",
" max_memory=4,\n",
" num_gpus=0,\n",
- " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\"\n",
+ " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
+ " write_to_file=False # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
"))"
]
},
diff --git a/demo-notebooks/guided-demos/0_basic_ray.ipynb b/demo-notebooks/guided-demos/0_basic_ray.ipynb
index 205f02175..6a3b37108 100644
--- a/demo-notebooks/guided-demos/0_basic_ray.ipynb
+++ b/demo-notebooks/guided-demos/0_basic_ray.ipynb
@@ -5,7 +5,7 @@
"id": "8d4a42f6",
"metadata": {},
"source": [
- "In this first notebook, we will go through the basics of using the SDK to:\n",
+ "In this notebook, we will go through the basics of using the SDK to:\n",
" - Spin up a Ray cluster with our desired resources\n",
" - View the status and specs of our Ray cluster\n",
" - Take down the Ray cluster when finished"
@@ -45,7 +45,7 @@
"id": "bc27f84c",
"metadata": {},
"source": [
- "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).\n",
+ "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding RayCluster).\n",
"\n",
"NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n",
"The example here is a community image."
@@ -58,10 +58,11 @@
"metadata": {},
"outputs": [],
"source": [
- "# Create and configure our cluster object (and appwrapper)\n",
+ "# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name='raytest',\n",
- " namespace='default',\n",
+ " namespace='default', # Update to your namespace\n",
" num_workers=2,\n",
" min_cpus=1,\n",
" max_cpus=1,\n",
@@ -69,7 +70,8 @@
" max_memory=4,\n",
" num_gpus=0,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=False\n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
"))"
]
},
@@ -78,7 +80,7 @@
"id": "12eef53c",
"metadata": {},
"source": [
- "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster."
+ "Next, we want to bring our cluster up, so we call the `up()` function below to submit our Ray Cluster onto the queue, and begin the process of obtaining our resource cluster."
]
},
{
diff --git a/demo-notebooks/guided-demos/1_basic_instascale.ipynb b/demo-notebooks/guided-demos/1_basic_instascale.ipynb
deleted file mode 100644
index 418737eb6..000000000
--- a/demo-notebooks/guided-demos/1_basic_instascale.ipynb
+++ /dev/null
@@ -1,177 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "9865ee8c",
- "metadata": {},
- "source": [
- "In this second notebook, we will go over the basics of using InstaScale to scale up/down necessary resources that are not currently available on your OpenShift Cluster (in cloud environments)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import pieces from codeflare-sdk\n",
- "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "614daa0c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create authentication object for user permissions\n",
- "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
- "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
- "auth = TokenAuthentication(\n",
- " token = \"XXXXX\",\n",
- " server = \"XXXXX\",\n",
- " skip_tls=False\n",
- ")\n",
- "auth.login()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bc27f84c",
- "metadata": {},
- "source": [
- "This time, we are working in a cloud environment, and our OpenShift cluster does not have the resources needed for our desired workloads. We will use InstaScale to dynamically scale-up guaranteed resources based on our request (that will also automatically scale-down when we are finished working):\n",
- "\n",
- "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n",
- "The example here is a community image."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0f4bc870-091f-4e11-9642-cba145710159",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create and configure our cluster object (and appwrapper)\n",
- "cluster = Cluster(ClusterConfiguration(\n",
- " name='instascaletest',\n",
- " namespace='default',\n",
- " num_workers=2,\n",
- " min_cpus=2,\n",
- " max_cpus=2,\n",
- " min_memory=8,\n",
- " max_memory=8,\n",
- " num_gpus=1,\n",
- " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=True, # InstaScale now enabled, will scale OCP cluster to guarantee resource request\n",
- " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Head, worker AWS machine types desired\n",
- "))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "12eef53c",
- "metadata": {},
- "source": [
- "Same as last time, we will bring the cluster up, wait for it to be ready, and confirm that the specs are as-requested:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Bring up the cluster\n",
- "cluster.up()\n",
- "cluster.wait_ready()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6abfe904",
- "metadata": {},
- "source": [
- "While the resources are being scaled, we can also go into the console and take a look at the InstaScale logs, as well as the new machines/nodes spinning up.\n",
- "\n",
- "Once the cluster is ready, we can confirm the specs:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084",
- "metadata": {},
- "outputs": [],
- "source": [
- "cluster.details()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5af8cd32",
- "metadata": {},
- "source": [
- "Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57",
- "metadata": {},
- "outputs": [],
- "source": [
- "cluster.down()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c883caea",
- "metadata": {},
- "source": [
- "Once again, we can look at the machines/nodes and see that everything has been successfully scaled down!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0d41b90e",
- "metadata": {},
- "outputs": [],
- "source": [
- "auth.logout()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.13"
- },
- "vscode": {
- "interpreter": {
- "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/demo-notebooks/guided-demos/preview_nbs/2_job_client.ipynb b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb
similarity index 67%
rename from demo-notebooks/guided-demos/preview_nbs/2_job_client.ipynb
rename to demo-notebooks/guided-demos/1_cluster_job_client.ipynb
index 7b3d619b4..e46dc62a9 100644
--- a/demo-notebooks/guided-demos/preview_nbs/2_job_client.ipynb
+++ b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "In this third demo we will go over the basics of the Ray Job Submission Client in the SDK"
+ "In this demo we will go over the basics of the Ray Job Submission Client in the SDK"
]
},
{
@@ -14,7 +14,7 @@
"outputs": [],
"source": [
"# Import pieces from codeflare-sdk\n",
- "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, RayJobClient"
+ "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
]
},
{
@@ -26,10 +26,8 @@
"# Create authentication object for user permissions\n",
"# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
"# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
- "\n",
- "auth_token = \"XXXXX\" # The auth_token is used later for the RayJobClient\n",
"auth = TokenAuthentication(\n",
- " token = auth_token,\n",
+ " token = \"XXXXX\",\n",
" server = \"XXXXX\",\n",
" skip_tls=False\n",
")\n",
@@ -43,16 +41,19 @@
"outputs": [],
"source": [
"# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name='jobtest',\n",
- " namespace='default',\n",
+ " namespace='default', # Update to your namespace\n",
" num_workers=2,\n",
" min_cpus=1,\n",
" max_cpus=1,\n",
" min_memory=4,\n",
" max_memory=4,\n",
" num_gpus=0,\n",
- " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\"\n",
+ " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
"))"
]
},
@@ -80,14 +81,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Ray Job Submission - Authorized Ray Cluster"
+ "### Ray Job Submission"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "* Submit a job using an authorized Ray dashboard and the Job Submission Client\n",
+ "* Initialise the Cluster Job Client \n",
"* Provide an entrypoint command directed to your job script\n",
"* Set up your runtime environment"
]
@@ -98,16 +99,11 @@
"metadata": {},
"outputs": [],
"source": [
- "# Gather the dashboard URL\n",
- "ray_dashboard = cluster.cluster_dashboard_uri()\n",
- "\n",
- "# Create the header for passing your bearer token\n",
- "header = {\n",
- " 'Authorization': f'Bearer {auth_token}'\n",
- "}\n",
- "\n",
- "# Initialize the RayJobClient\n",
- "client = RayJobClient(address=ray_dashboard, headers=header, verify=True)"
+ "# Initialize the Job Submission Client\n",
+ "\"\"\"\n",
+ "The SDK will automatically gather the dashboard address and authenticate using the Ray Job Submission Client\n",
+ "\"\"\"\n",
+ "client = cluster.job_client"
]
},
{
@@ -116,7 +112,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Submit an example mnist job using the RayJobClient\n",
+ "# Submit an example mnist job using the Job Submission Client\n",
"submission_id = client.submit_job(\n",
" entrypoint=\"python mnist.py\",\n",
" runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
@@ -186,60 +182,6 @@
"client.delete_job(submission_id)"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Unauthorized Ray Cluster with the Ray Job Client"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\"\"\"\n",
- "Initialise the RayJobClient with the Ray Dashboard\n",
- "\"\"\"\n",
- "ray_dashboard = cluster.cluster_dashboard_uri()\n",
- "client = RayJobClient(address=ray_dashboard, verify=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Submit an example mnist job using the RayJobClient\n",
- "submission_id = client.submit_job(\n",
- " entrypoint=\"python mnist.py\",\n",
- " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
- ")\n",
- "print(submission_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Stop the job \n",
- "client.stop_job(submission_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Delete the job\n",
- "client.delete_job(submission_id)"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/demo-notebooks/guided-demos/3_basic_interactive.ipynb b/demo-notebooks/guided-demos/2_basic_interactive.ipynb
similarity index 89%
rename from demo-notebooks/guided-demos/3_basic_interactive.ipynb
rename to demo-notebooks/guided-demos/2_basic_interactive.ipynb
index 090a4a305..943425a89 100644
--- a/demo-notebooks/guided-demos/3_basic_interactive.ipynb
+++ b/demo-notebooks/guided-demos/2_basic_interactive.ipynb
@@ -5,7 +5,7 @@
"id": "bbc21043",
"metadata": {},
"source": [
- "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development."
+ "In this notebook, we will go over how to leverage the SDK to directly work interactively with a Ray Cluster during development."
]
},
{
@@ -55,10 +55,13 @@
"metadata": {},
"outputs": [],
"source": [
- "# Create and configure our cluster object (and appwrapper)\n",
+ "# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
+ "namespace = \"default\" # Update to your namespace\n",
+ "cluster_name = \"interactivetest\"\n",
"cluster = Cluster(ClusterConfiguration(\n",
- " name='interactivetest',\n",
- " namespace='default',\n",
+ " name=cluster_name,\n",
+ " namespace=namespace,\n",
" num_workers=2,\n",
" min_cpus=2,\n",
" max_cpus=2,\n",
@@ -66,9 +69,8 @@
" max_memory=8,\n",
" num_gpus=1,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=True, #<---instascale enabled\n",
- " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n",
- " \n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
"))"
]
},
@@ -125,6 +127,19 @@
"Now we can connect directly to our Ray cluster via the Ray python client:"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c9436436",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from codeflare_sdk import generate_cert\n",
+ "# Create required TLS cert and export the environment variables to enable TLS\n",
+ "generate_cert.generate_tls_cert(cluster_name, namespace)\n",
+ "generate_cert.export_env(cluster_name, namespace)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb
index d8f6c34c4..6771e9d7a 100644
--- a/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb
+++ b/demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb
@@ -5,7 +5,7 @@
"id": "8d4a42f6",
"metadata": {},
"source": [
- "In this first notebook, we will go through the basics of using the SDK to:\n",
+ "In this notebook, we will go through the basics of using the SDK to:\n",
" - Spin up a Ray cluster with our desired resources\n",
" - View the status and specs of our Ray cluster\n",
" - Take down the Ray cluster when finished"
@@ -45,7 +45,7 @@
"id": "bc27f84c",
"metadata": {},
"source": [
- "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).\n",
+ "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding RayCluster).\n",
"\n",
"NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n",
"The example here is a community image."
@@ -66,10 +66,11 @@
}
],
"source": [
- "# Create and configure our cluster object (and appwrapper)\n",
+ "# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name='raytest',\n",
- " namespace='default',\n",
+ " namespace='default', # Update to your namespace\n",
" num_workers=2,\n",
" min_cpus=1,\n",
" max_cpus=1,\n",
@@ -77,7 +78,8 @@
" max_memory=4,\n",
" num_gpus=0,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=False\n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
"))"
]
},
@@ -86,7 +88,7 @@
"id": "12eef53c",
"metadata": {},
"source": [
- "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster."
+ "Next, we want to bring our cluster up, so we call the `up()` function below to submit our Ray Cluster onto the queue, and begin the process of obtaining our resource cluster."
]
},
{
@@ -354,7 +356,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.17"
+ "version": "3.9.18"
},
"vscode": {
"interpreter": {
diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/1_basic_instascale.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/1_basic_instascale.ipynb
deleted file mode 100644
index 4b28b2058..000000000
--- a/demo-notebooks/guided-demos/notebook-ex-outputs/1_basic_instascale.ipynb
+++ /dev/null
@@ -1,252 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "9865ee8c",
- "metadata": {},
- "source": [
- "In this second notebook, we will go over the basics of using InstaScale to scale up/down necessary resources that are not currently available on your OpenShift Cluster (in cloud environments)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import pieces from codeflare-sdk\n",
- "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "614daa0c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create authentication object for user permissions\n",
- "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
- "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
- "auth = TokenAuthentication(\n",
- " token = \"XXXXX\",\n",
- " server = \"XXXXX\",\n",
- " skip_tls=False\n",
- ")\n",
- "auth.login()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bc27f84c",
- "metadata": {},
- "source": [
- "This time, we are working in a cloud environment, and our OpenShift cluster does not have the resources needed for our desired workloads. We will use InstaScale to dynamically scale-up guaranteed resources based on our request (that will also automatically scale-down when we are finished working):\n",
- "\n",
- "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n",
- "The example here is a community image."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "0f4bc870-091f-4e11-9642-cba145710159",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Written to: instascaletest.yaml\n"
- ]
- }
- ],
- "source": [
- "# Create and configure our cluster object (and appwrapper)\n",
- "cluster = Cluster(ClusterConfiguration(\n",
- " name='instascaletest',\n",
- " namespace='default',\n",
- " num_workers=2,\n",
- " min_cpus=2,\n",
- " max_cpus=2,\n",
- " min_memory=8,\n",
- " max_memory=8,\n",
- " num_gpus=1,\n",
- " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=True, # InstaScale now enabled, will scale OCP cluster to guarantee resource request\n",
- " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Head, worker AWS machine types desired\n",
- "))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "12eef53c",
- "metadata": {},
- "source": [
- "Same as last time, we will bring the cluster up, wait for it to be ready, and confirm that the specs are as-requested:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Waiting for requested resources to be set up...\n",
- "Requested cluster up and running!\n"
- ]
- }
- ],
- "source": [
- "# Bring up the cluster\n",
- "cluster.up()\n",
- "cluster.wait_ready()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6abfe904",
- "metadata": {},
- "source": [
- "While the resources are being scaled, we can also go into the console and take a look at the InstaScale logs, as well as the new machines/nodes spinning up.\n",
- "\n",
- "Once the cluster is ready, we can confirm the specs:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
๐ CodeFlare Cluster Details ๐ \n",
- " \n",
- " โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ \n",
- " โ Name โ \n",
- " โ instascaletest Active โ
โ \n",
- " โ โ \n",
- " โ URI: ray://instascaletest-head-svc.default.svc:10001 โ \n",
- " โ โ \n",
- " โ Dashboard๐ โ \n",
- " โ โ \n",
- " โ Cluster Resources โ \n",
- " โ โญโโ Workers โโโฎ โญโโโโโโโโโ Worker specs(each) โโโโโโโโโโฎ โ \n",
- " โ โ # Workers โ โ Memory CPU GPU โ โ \n",
- " โ โ โ โ โ โ \n",
- " โ โ 2 โ โ 8~8 2 1 โ โ \n",
- " โ โ โ โ โ โ \n",
- " โ โฐโโโโโโโโโโโโโโฏ โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ โ \n",
- " โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ \n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[3m \u001b[0m\u001b[1;3m ๐ CodeFlare Cluster Details ๐\u001b[0m\u001b[3m \u001b[0m\n",
- "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n",
- " โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ \n",
- " โ \u001b[1;37;42mName\u001b[0m โ \n",
- " โ \u001b[1;4minstascaletest\u001b[0m Active โ
โ \n",
- " โ โ \n",
- " โ \u001b[1mURI:\u001b[0m ray://instascaletest-head-svc.default.svc:10001 โ \n",
- " โ โ \n",
- " โ \u001b]8;id=65933;http://ray-dashboard-instascaletest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard๐\u001b[0m\u001b]8;;\u001b\\ โ \n",
- " โ โ \n",
- " โ \u001b[3m Cluster Resources \u001b[0m โ \n",
- " โ โญโโ Workers โโโฎ โญโโโโโโโโโ Worker specs(each) โโโโโโโโโโฎ โ \n",
- " โ โ \u001b[1m \u001b[0m\u001b[1m# Workers\u001b[0m\u001b[1m \u001b[0m โ โ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ โ \n",
- " โ โ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \n",
- " โ โ \u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ โ \u001b[36m \u001b[0m\u001b[36m8~8 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m โ โ \n",
- " โ โ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \n",
- " โ โฐโโโโโโโโโโโโโโฏ โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ โ \n",
- " โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ \n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "RayCluster(name='instascaletest', status=, workers=2, worker_mem_min=8, worker_mem_max=8, worker_cpu=2, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-instascaletest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org')"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cluster.details()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5af8cd32",
- "metadata": {},
- "source": [
- "Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57",
- "metadata": {},
- "outputs": [],
- "source": [
- "cluster.down()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c883caea",
- "metadata": {},
- "source": [
- "Once again, we can look at the machines/nodes and see that everything has been successfully scaled down!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0d41b90e",
- "metadata": {},
- "outputs": [],
- "source": [
- "auth.logout()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.17"
- },
- "vscode": {
- "interpreter": {
- "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb
new file mode 100644
index 000000000..e46dc62a9
--- /dev/null
+++ b/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this demo we will go over the basics of the Ray Job Submission Client in the SDK"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import pieces from codeflare-sdk\n",
+ "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create authentication object for user permissions\n",
+ "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
+ "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
+ "auth = TokenAuthentication(\n",
+ " token = \"XXXXX\",\n",
+ " server = \"XXXXX\",\n",
+ " skip_tls=False\n",
+ ")\n",
+ "auth.login()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
+ "cluster = Cluster(ClusterConfiguration(\n",
+ " name='jobtest',\n",
+ " namespace='default', # Update to your namespace\n",
+ " num_workers=2,\n",
+ " min_cpus=1,\n",
+ " max_cpus=1,\n",
+ " min_memory=4,\n",
+ " max_memory=4,\n",
+ " num_gpus=0,\n",
+ " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
+ "))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bring up the cluster\n",
+ "cluster.up()\n",
+ "cluster.wait_ready()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cluster.details()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Ray Job Submission"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Initialise the Cluster Job Client \n",
+ "* Provide an entrypoint command directed to your job script\n",
+ "* Set up your runtime environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize the Job Submission Client\n",
+ "\"\"\"\n",
+ "The SDK will automatically gather the dashboard address and authenticate using the Ray Job Submission Client\n",
+ "\"\"\"\n",
+ "client = cluster.job_client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Submit an example mnist job using the Job Submission Client\n",
+ "submission_id = client.submit_job(\n",
+ " entrypoint=\"python mnist.py\",\n",
+ " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
+ ")\n",
+ "print(submission_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get the job's logs\n",
+ "client.get_job_logs(submission_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get the job's status\n",
+ "client.get_job_status(submission_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get job related info\n",
+ "client.get_job_info(submission_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# List all existing jobs\n",
+ "client.list_jobs()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Iterate through the logs of a job \n",
+ "async for lines in client.tail_job_logs(submission_id):\n",
+ " print(lines, end=\"\") "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Delete a job\n",
+ "# Can run client.cancel_job(submission_id) first if job is still running\n",
+ "client.delete_job(submission_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cluster.down()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "auth.logout()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb
similarity index 99%
rename from demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb
rename to demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb
index 7ac004706..62d34f3f8 100644
--- a/demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb
+++ b/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb
@@ -5,7 +5,7 @@
"id": "bbc21043",
"metadata": {},
"source": [
- "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development."
+ "In this notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development."
]
},
{
@@ -63,10 +63,13 @@
}
],
"source": [
- "# Create and configure our cluster object (and appwrapper)\n",
+ "# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
+ "namespace = \"default\" # Update to your namespace\n",
+ "cluster_name = \"interactivetest\"\n",
"cluster = Cluster(ClusterConfiguration(\n",
- " name='interactivetest',\n",
- " namespace='default',\n",
+ " name=cluster_name,\n",
+ " namespace=namespace,\n",
" num_workers=2,\n",
" min_cpus=2,\n",
" max_cpus=2,\n",
@@ -74,9 +77,8 @@
" max_memory=8,\n",
" num_gpus=1,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=True, #<---instascale enabled\n",
- " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n",
- " \n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
"))"
]
},
@@ -209,6 +211,19 @@
"Now we can connect directly to our Ray cluster via the Ray python client:"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "13eb52f6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from codeflare_sdk import generate_cert\n",
+ "# Create required TLS cert and export the environment variables to enable TLS\n",
+ "generate_cert.generate_tls_cert(cluster_name, namespace)\n",
+ "generate_cert.export_env(cluster_name, namespace)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 6,
diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/2_job_client.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/2_job_client.ipynb
deleted file mode 100644
index 75000ce46..000000000
--- a/demo-notebooks/guided-demos/notebook-ex-outputs/2_job_client.ipynb
+++ /dev/null
@@ -1,430 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this third demo we will go over the basics of the Ray Job Submission Client in the SDK"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import pieces from codeflare-sdk\n",
- "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, RayJobClient"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create authentication object for user permissions\n",
- "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
- "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
- "\n",
- "auth_token = \"XXXXX\" # The auth_token is used later for the RayJobClient\n",
- "auth = TokenAuthentication(\n",
- " token = auth_token,\n",
- " server = \"XXXXX\",\n",
- " skip_tls=False\n",
- ")\n",
- "auth.login()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Yaml resources loaded for jobtest\n"
- ]
- }
- ],
- "source": [
- "# Create and configure our cluster object\n",
- "cluster = Cluster(ClusterConfiguration(\n",
- " name='jobtest',\n",
- " namespace='default',\n",
- " num_workers=2,\n",
- " min_cpus=1,\n",
- " max_cpus=1,\n",
- " min_memory=4,\n",
- " max_memory=4,\n",
- " num_gpus=0,\n",
- " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\"\n",
- "))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Waiting for requested resources to be set up...\n",
- "Requested cluster is up and running!\n",
- "Dashboard is ready!\n"
- ]
- }
- ],
- "source": [
- "# Bring up the cluster\n",
- "cluster.up()\n",
- "cluster.wait_ready()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- " ๐ CodeFlare Cluster Details ๐ \n",
- " \n",
- " โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ \n",
- " โ Name โ \n",
- " โ jobtest Active โ
โ \n",
- " โ โ \n",
- " โ URI: ray://jobtest-head-svc.default.svc:10001 โ \n",
- " โ โ \n",
- " โ Dashboard๐ โ \n",
- " โ โ \n",
- " โ Cluster Resources โ \n",
- " โ โญโโ Workers โโโฎ โญโโโโโโโโโ Worker specs(each) โโโโโโโโโโฎ โ \n",
- " โ โ # Workers โ โ Memory CPU GPU โ โ \n",
- " โ โ โ โ โ โ \n",
- " โ โ 2 โ โ 4~4 1 0 โ โ \n",
- " โ โ โ โ โ โ \n",
- " โ โฐโโโโโโโโโโโโโโฏ โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ โ \n",
- " โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ \n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[3m \u001b[0m\u001b[1;3m ๐ CodeFlare Cluster Details ๐\u001b[0m\u001b[3m \u001b[0m\n",
- "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n",
- " โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ \n",
- " โ \u001b[1;37;42mName\u001b[0m โ \n",
- " โ \u001b[1;4mjobtest\u001b[0m Active โ
โ \n",
- " โ โ \n",
- " โ \u001b[1mURI:\u001b[0m ray://jobtest-head-svc.default.svc:10001 โ \n",
- " โ โ \n",
- " โ \u001b]8;id=561347;https://ray-dashboard-jobtest-default.apps.rosa.mcampbel.af68.p3.openshiftapps.com\u001b\\\u001b[4;34mDashboard๐\u001b[0m\u001b]8;;\u001b\\ โ \n",
- " โ โ \n",
- " โ \u001b[3m Cluster Resources \u001b[0m โ \n",
- " โ โญโโ Workers โโโฎ โญโโโโโโโโโ Worker specs(each) โโโโโโโโโโฎ โ \n",
- " โ โ \u001b[1m \u001b[0m\u001b[1m# Workers\u001b[0m\u001b[1m \u001b[0m โ โ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ โ \n",
- " โ โ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \n",
- " โ โ \u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ โ \u001b[36m \u001b[0m\u001b[36m4~4 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m0 \u001b[0m\u001b[35m \u001b[0m โ โ \n",
- " โ โ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \n",
- " โ โฐโโโโโโโโโโโโโโฏ โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ โ \n",
- " โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ \n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "RayCluster(name='jobtest', status=, head_cpus=2, head_mem=8, head_gpu=0, workers=2, worker_mem_min=4, worker_mem_max=4, worker_cpu=1, worker_gpu=0, namespace='default', dashboard='https://ray-dashboard-jobtest-default.apps.rosa.mcampbel.af68.p3.openshiftapps.com')"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cluster.details()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Ray Job Submission - Authorized Ray Cluster"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "* Submit a job using an authorized Ray dashboard and the Job Submission Client\n",
- "* Provide an entrypoint command directed to your job script\n",
- "* Set up your runtime environment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Gather the dashboard URL\n",
- "ray_dashboard = cluster.cluster_dashboard_uri()\n",
- "\n",
- "# Create the header for passing your bearer token\n",
- "header = {\n",
- " 'Authorization': f'Bearer {auth_token}'\n",
- "}\n",
- "\n",
- "# Initialize the RayJobClient\n",
- "client = RayJobClient(address=ray_dashboard, headers=header, verify=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2024-04-03 12:16:07,112\tINFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_431abdedbcc7e123.zip.\n",
- "2024-04-03 12:16:07,115\tINFO packaging.py:518 -- Creating a file package for local directory './'.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "raysubmit_NvXkkh1QP1kdq4LG\n"
- ]
- }
- ],
- "source": [
- "# Submit an example mnist job using the RayJobClient\n",
- "submission_id = client.submit_job(\n",
- " entrypoint=\"python mnist.py\",\n",
- " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
- ")\n",
- "print(submission_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "''"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Get the job's logs\n",
- "client.get_job_logs(submission_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Get the job's status\n",
- "client.get_job_status(submission_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "JobDetails(type=, job_id=None, submission_id='raysubmit_NvXkkh1QP1kdq4LG', driver_info=None, status=, entrypoint='python mnist.py', message='Job has not started yet. It may be waiting for the runtime environment to be set up.', error_type=None, start_time=1712142968879, end_time=None, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_431abdedbcc7e123.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address=None, driver_node_id=None)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Get job related info\n",
- "client.get_job_info(submission_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[JobDetails(type=, job_id=None, submission_id='raysubmit_NvXkkh1QP1kdq4LG', driver_info=None, status=, entrypoint='python mnist.py', message='Job has not started yet. It may be waiting for the runtime environment to be set up.', error_type=None, start_time=1712142968879, end_time=None, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_431abdedbcc7e123.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address=None, driver_node_id=None)]"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# List all existing jobs\n",
- "client.list_jobs()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Iterate through the logs of a job \n",
- "async for lines in client.tail_job_logs(submission_id):\n",
- " print(lines, end=\"\") "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(True, 'Successfully deleted Job raysubmit_NvXkkh1QP1kdq4LG')"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Delete a job\n",
- "# Can run client.cancel_job(submission_id) first if job is still running\n",
- "client.delete_job(submission_id)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Unauthorized Ray Cluster with the Ray Job Client"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\"\"\"\n",
- "Initialise the RayJobClient with the Ray Dashboard\n",
- "\"\"\"\n",
- "ray_dashboard = cluster.cluster_dashboard_uri()\n",
- "client = RayJobClient(address=ray_dashboard, verify=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Submit an example mnist job using the RayJobClient\n",
- "submission_id = client.submit_job(\n",
- " entrypoint=\"python mnist.py\",\n",
- " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
- ")\n",
- "print(submission_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Stop the job \n",
- "client.stop_job(submission_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Delete the job\n",
- "client.delete_job(submission_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "cluster.down()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "auth.logout()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.18"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/instascaletest.yaml b/demo-notebooks/guided-demos/notebook-ex-outputs/instascaletest.yaml
deleted file mode 100644
index 8cb96a794..000000000
--- a/demo-notebooks/guided-demos/notebook-ex-outputs/instascaletest.yaml
+++ /dev/null
@@ -1,185 +0,0 @@
-apiVersion: workload.codeflare.dev/v1beta1
-kind: AppWrapper
-metadata:
- labels:
- orderedinstance: m5.xlarge_g4dn.xlarge
- name: instascaletest
- namespace: default
-spec:
- priority: 9
- resources:
- GenericItems:
- - custompodresources:
- - limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- replicas: 1
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- - limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 1
- replicas: 2
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 1
- generictemplate:
- apiVersion: ray.io/v1
- kind: RayCluster
- metadata:
- labels:
- appwrapper.mcad.ibm.com: instascaletest
- controller-tools.k8s.io: '1.0'
- name: instascaletest
- namespace: default
- spec:
- autoscalerOptions:
- idleTimeoutSeconds: 60
- imagePullPolicy: Always
- resources:
- limits:
- cpu: 500m
- memory: 512Mi
- requests:
- cpu: 500m
- memory: 512Mi
- upscalingMode: Default
- enableInTreeAutoscaling: false
- headGroupSpec:
- rayStartParams:
- block: 'true'
- dashboard-host: 0.0.0.0
- num-gpus: '0'
- serviceType: ClusterIP
- template:
- spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: instascaletest
- operator: In
- values:
- - instascaletest
- containers:
- - env:
- - name: MY_POD_IP
- valueFrom:
- fieldRef:
- fieldPath: status.podIP
- - name: RAY_USE_TLS
- value: '0'
- - name: RAY_TLS_SERVER_CERT
- value: /home/ray/workspace/tls/server.crt
- - name: RAY_TLS_SERVER_KEY
- value: /home/ray/workspace/tls/server.key
- - name: RAY_TLS_CA_CERT
- value: /home/ray/workspace/tls/ca.crt
- image: quay.io/project-codeflare/ray:latest-py39-cu118
- imagePullPolicy: Always
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: ray-head
- ports:
- - containerPort: 6379
- name: gcs
- - containerPort: 8265
- name: dashboard
- - containerPort: 10001
- name: client
- resources:
- limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- imagePullSecrets: []
- rayVersion: 2.1.0
- workerGroupSpecs:
- - groupName: small-group-instascaletest
- maxReplicas: 2
- minReplicas: 2
- rayStartParams:
- block: 'true'
- num-gpus: '1'
- replicas: 2
- template:
- metadata:
- annotations:
- key: value
- labels:
- key: value
- spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: instascaletest
- operator: In
- values:
- - instascaletest
- containers:
- - env:
- - name: MY_POD_IP
- valueFrom:
- fieldRef:
- fieldPath: status.podIP
- - name: RAY_USE_TLS
- value: '0'
- - name: RAY_TLS_SERVER_CERT
- value: /home/ray/workspace/tls/server.crt
- - name: RAY_TLS_SERVER_KEY
- value: /home/ray/workspace/tls/server.key
- - name: RAY_TLS_CA_CERT
- value: /home/ray/workspace/tls/ca.crt
- image: quay.io/project-codeflare/ray:latest-py39-cu118
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: machine-learning
- resources:
- limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 1
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 1
- imagePullSecrets: []
- replicas: 1
- - generictemplate:
- apiVersion: route.openshift.io/v1
- kind: Route
- metadata:
- labels:
- odh-ray-cluster-service: instascaletest-head-svc
- name: ray-dashboard-instascaletest
- namespace: default
- spec:
- port:
- targetPort: dashboard
- to:
- kind: Service
- name: instascaletest-head-svc
- replicas: 1
- Items: []
diff --git a/demo-notebooks/guided-demos/preview_nbs/0_basic_ray.ipynb b/demo-notebooks/guided-demos/preview_nbs/0_basic_ray.ipynb
index b0f12d4ba..6a3b37108 100644
--- a/demo-notebooks/guided-demos/preview_nbs/0_basic_ray.ipynb
+++ b/demo-notebooks/guided-demos/preview_nbs/0_basic_ray.ipynb
@@ -5,7 +5,7 @@
"id": "8d4a42f6",
"metadata": {},
"source": [
- "In this first notebook, we will go through the basics of using the SDK to:\n",
+ "In this notebook, we will go through the basics of using the SDK to:\n",
" - Spin up a Ray cluster with our desired resources\n",
" - View the status and specs of our Ray cluster\n",
" - Take down the Ray cluster when finished"
@@ -45,7 +45,7 @@
"id": "bc27f84c",
"metadata": {},
"source": [
- "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).\n",
+ "Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding RayCluster).\n",
"\n",
"NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n",
"The example here is a community image."
@@ -58,10 +58,11 @@
"metadata": {},
"outputs": [],
"source": [
- "# Create and configure our cluster object (and appwrapper)\n",
+ "# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name='raytest',\n",
- " namespace='default',\n",
+ " namespace='default', # Update to your namespace\n",
" num_workers=2,\n",
" min_cpus=1,\n",
" max_cpus=1,\n",
@@ -69,7 +70,8 @@
" max_memory=4,\n",
" num_gpus=0,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=False\n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
"))"
]
},
@@ -78,7 +80,7 @@
"id": "12eef53c",
"metadata": {},
"source": [
- "Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster."
+ "Next, we want to bring our cluster up, so we call the `up()` function below to submit our Ray Cluster onto the queue, and begin the process of obtaining our resource cluster."
]
},
{
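Taken together with the cells that follow in this notebook, the lifecycle is short; as a rough sketch (assuming the `raytest` configuration defined above and a successful login):

cluster.up()          # submit the Ray Cluster onto the queue
cluster.wait_ready()  # block until the requested pods are up
cluster.status()      # check the current state of the cluster
cluster.details()     # print the full specs, including the dashboard URL
cluster.down()        # tear the cluster down and release its resources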
@@ -193,7 +195,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.13"
+ "version": "3.9.18"
},
"vscode": {
"interpreter": {
diff --git a/demo-notebooks/guided-demos/preview_nbs/1_basic_instascale.ipynb b/demo-notebooks/guided-demos/preview_nbs/1_basic_instascale.ipynb
deleted file mode 100644
index 418737eb6..000000000
--- a/demo-notebooks/guided-demos/preview_nbs/1_basic_instascale.ipynb
+++ /dev/null
@@ -1,177 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "9865ee8c",
- "metadata": {},
- "source": [
- "In this second notebook, we will go over the basics of using InstaScale to scale up/down necessary resources that are not currently available on your OpenShift Cluster (in cloud environments)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import pieces from codeflare-sdk\n",
- "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "614daa0c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create authentication object for user permissions\n",
- "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
- "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
- "auth = TokenAuthentication(\n",
- " token = \"XXXXX\",\n",
- " server = \"XXXXX\",\n",
- " skip_tls=False\n",
- ")\n",
- "auth.login()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bc27f84c",
- "metadata": {},
- "source": [
- "This time, we are working in a cloud environment, and our OpenShift cluster does not have the resources needed for our desired workloads. We will use InstaScale to dynamically scale-up guaranteed resources based on our request (that will also automatically scale-down when we are finished working):\n",
- "\n",
- "NOTE: We must specify the `image` which will be used in our RayCluster, we recommend you bring your own image which suits your purposes. \n",
- "The example here is a community image."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0f4bc870-091f-4e11-9642-cba145710159",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create and configure our cluster object (and appwrapper)\n",
- "cluster = Cluster(ClusterConfiguration(\n",
- " name='instascaletest',\n",
- " namespace='default',\n",
- " num_workers=2,\n",
- " min_cpus=2,\n",
- " max_cpus=2,\n",
- " min_memory=8,\n",
- " max_memory=8,\n",
- " num_gpus=1,\n",
- " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=True, # InstaScale now enabled, will scale OCP cluster to guarantee resource request\n",
- " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Head, worker AWS machine types desired\n",
- "))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "12eef53c",
- "metadata": {},
- "source": [
- "Same as last time, we will bring the cluster up, wait for it to be ready, and confirm that the specs are as-requested:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Bring up the cluster\n",
- "cluster.up()\n",
- "cluster.wait_ready()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6abfe904",
- "metadata": {},
- "source": [
- "While the resources are being scaled, we can also go into the console and take a look at the InstaScale logs, as well as the new machines/nodes spinning up.\n",
- "\n",
- "Once the cluster is ready, we can confirm the specs:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084",
- "metadata": {},
- "outputs": [],
- "source": [
- "cluster.details()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5af8cd32",
- "metadata": {},
- "source": [
- "Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57",
- "metadata": {},
- "outputs": [],
- "source": [
- "cluster.down()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c883caea",
- "metadata": {},
- "source": [
- "Once again, we can look at the machines/nodes and see that everything has been successfully scaled down!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0d41b90e",
- "metadata": {},
- "outputs": [],
- "source": [
- "auth.logout()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.13"
- },
- "vscode": {
- "interpreter": {
- "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb
new file mode 100644
index 000000000..b20f920bd
--- /dev/null
+++ b/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this demo we will go over the basics of the Ray Job Submission Client in the SDK"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import pieces from codeflare-sdk\n",
+ "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create authentication object for user permissions\n",
+ "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
+ "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
+ "auth = TokenAuthentication(\n",
+ " token = \"XXXXX\",\n",
+ " server = \"XXXXX\",\n",
+ " skip_tls=False\n",
+ ")\n",
+ "auth.login()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
+ "cluster = Cluster(ClusterConfiguration(\n",
+ " name='jobtest',\n",
+ " namespace='default', # Update to your namespace\n",
+ " num_workers=2,\n",
+ " min_cpus=1,\n",
+ " max_cpus=1,\n",
+ " min_memory=4,\n",
+ " max_memory=4,\n",
+ " num_gpus=0,\n",
+ " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
+ "))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bring up the cluster\n",
+ "cluster.up()\n",
+ "cluster.wait_ready()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cluster.details()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Ray Job Submission"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Initialise the Cluster Job Client \n",
+ "* Provide an entrypoint command directed to your job script\n",
+ "* Set up your runtime environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize the Job Submission Client\n",
+ "\"\"\"\n",
+ "The SDK will automatically gather the dashboard address and authenticate using the Ray Job Submission Client\n",
+ "\"\"\"\n",
+ "client = cluster.job_client"
+ ]
+ },
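`cluster.job_client` hands back a Ray job-submission client pointed at the cluster's dashboard, so the standard `JobSubmissionClient` methods are available on it. If you need to construct one yourself (for example from a separate process), a hedged equivalent, assuming this SDK version exposes `cluster_dashboard_uri()`, would be:

from ray.job_submission import JobSubmissionClient

# Roughly what cluster.job_client provides (an assumption, not verified here);
# clusters with an authenticated dashboard may also require auth headers.
client = JobSubmissionClient(cluster.cluster_dashboard_uri())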
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Submit an example mnist job using the Job Submission Client\n",
+ "submission_id = client.submit_job(\n",
+ " entrypoint=\"python mnist.py\",\n",
+ " runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
+ ")\n",
+ "print(submission_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get the job's logs\n",
+ "client.get_job_logs(submission_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get the job's status\n",
+ "client.get_job_status(submission_id)"
+ ]
+ },
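`get_job_status` returns immediately with the job's current state, so a simple way to block until completion is to poll for a terminal state; a minimal sketch using Ray's `JobStatus` enum:

import time
from ray.job_submission import JobStatus

# Poll until the submitted job reaches a terminal state.
while True:
    status = client.get_job_status(submission_id)
    print(f"Job status: {status}")
    if status in {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.STOPPED}:
        break
    time.sleep(5)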
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get job related info\n",
+ "client.get_job_info(submission_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# List all existing jobs\n",
+ "client.list_jobs()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Iterate through the logs of a job \n",
+ "async for lines in client.tail_job_logs(submission_id):\n",
+ " print(lines, end=\"\") "
+ ]
+ },
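`tail_job_logs` is an async generator, and the cell above relies on Jupyter's support for top-level `await`/`async for`. Outside a notebook you would drive it with asyncio yourself, for example:

import asyncio

# Stream the job's logs from a plain Python script rather than a notebook cell.
async def stream_logs():
    async for lines in client.tail_job_logs(submission_id):
        print(lines, end="")

asyncio.run(stream_logs())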
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Delete a job\n",
+ "# Can run client.cancel_job(submission_id) first if job is still running\n",
+ "client.delete_job(submission_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cluster.down()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "auth.logout()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/demo-notebooks/guided-demos/preview_nbs/3_basic_interactive.ipynb b/demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb
similarity index 89%
rename from demo-notebooks/guided-demos/preview_nbs/3_basic_interactive.ipynb
rename to demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb
index 090a4a305..28e05a26a 100644
--- a/demo-notebooks/guided-demos/preview_nbs/3_basic_interactive.ipynb
+++ b/demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb
@@ -5,7 +5,7 @@
"id": "bbc21043",
"metadata": {},
"source": [
- "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development."
+ "In this notebook, we will go over how to leverage the SDK to directly work interactively with a Ray Cluster during development."
]
},
{
@@ -55,10 +55,13 @@
"metadata": {},
"outputs": [],
"source": [
- "# Create and configure our cluster object (and appwrapper)\n",
+ "# Create and configure our cluster object\n",
+ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
+ "namespace = \"default\" # Update to your namespace\n",
+ "cluster_name = \"interactivetest\"\n",
"cluster = Cluster(ClusterConfiguration(\n",
- " name='interactivetest',\n",
- " namespace='default',\n",
+ " name=cluster_name,\n",
+ " namespace=namespace,\n",
" num_workers=2,\n",
" min_cpus=2,\n",
" max_cpus=2,\n",
@@ -66,9 +69,8 @@
" max_memory=8,\n",
" num_gpus=1,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
- " instascale=True, #<---instascale enabled\n",
- " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n",
- " \n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
"))"
]
},
@@ -125,6 +127,19 @@
"Now we can connect directly to our Ray cluster via the Ray python client:"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5308271",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from codeflare_sdk import generate_cert\n",
+ "# Create required TLS cert and export the environment variables to enable TLS\n",
+ "generate_cert.generate_tls_cert(cluster_name, namespace)\n",
+ "generate_cert.export_env(cluster_name, namespace)"
+ ]
+ },
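With the certificate generated and the environment variables exported, the connection itself is the usual Ray client call against the cluster's URI; a sketch, assuming this SDK version exposes `cluster_uri()`:

import ray

ray.shutdown()                           # clear any previous session
ray.init(address=cluster.cluster_uri())  # typically ray://<name>-head-svc.<namespace>.svc:10001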
{
"cell_type": "code",
"execution_count": null,
@@ -295,7 +310,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.13"
+ "version": "3.9.18"
},
"vscode": {
"interpreter": {