Skip to content

Commit 446bcde

Browse files
committed
Added demo notebook for Kueue by default
1 parent 403cca6 commit 446bcde

20 files changed

+578
-3
lines changed

Diff for: demo-notebooks/additional-demos/hf_interactive.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
" max_memory=16, \n",
100100
" num_gpus=4,\n",
101101
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
102+
" mcad=True,\n",
102103
" instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))"
103104
]
104105
},

Diff for: demo-notebooks/additional-demos/local_interactive.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
" max_memory=4,\n",
6464
" num_gpus=0,\n",
6565
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
66+
" mcad=True,\n",
6667
" instascale=False,\n",
6768
" machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))"
6869
]

Diff for: demo-notebooks/guided-demos/0_basic_ray.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
" max_memory=4,\n",
7070
" num_gpus=0,\n",
7171
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
72+
" mcad=True,\n",
7273
" instascale=False\n",
7374
"))"
7475
]

Diff for: demo-notebooks/guided-demos/1_basic_instascale.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
" max_memory=8,\n",
6767
" num_gpus=1,\n",
6868
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
69+
" mcad=True,\n",
6970
" instascale=True, # InstaScale now enabled, will scale OCP cluster to guarantee resource request\n",
7071
" machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Head, worker AWS machine types desired\n",
7172
"))"

Diff for: demo-notebooks/guided-demos/2_basic_jobs.ipynb

+3-1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
" max_memory=4,\n",
6767
" num_gpus=0,\n",
6868
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
69+
" mcad=True,\n",
6970
" instascale=False\n",
7071
"))"
7172
]
@@ -230,7 +231,8 @@
230231
" gpu=0,\n",
231232
" cpu=1,\n",
232233
" memMB=8000,\n",
233-
" image=\"quay.io/project-codeflare/mnist-job-test:v0.0.1\"\n",
234+
" image=\"quay.io/project-codeflare/mnist-job-test:v0.0.1\",\n",
235+
" mcad=True\n",
234236
")\n",
235237
"job = jobdef.submit()"
236238
]

Diff for: demo-notebooks/guided-demos/3_basic_interactive.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
" max_memory=8,\n",
6767
" num_gpus=1,\n",
6868
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
69+
" mcad=True,\n",
6970
" instascale=True, #<---instascale enabled\n",
7071
" machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n",
7172
" \n",

Diff for: demo-notebooks/guided-demos/4_gpt.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
" max_memory=8,\n",
5656
" num_gpus=1,\n",
5757
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
58+
" mcad=True,\n",
5859
" instascale=True, #<---instascale enabled\n",
5960
" machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"],\n",
6061
"))"

Diff for: demo-notebooks/guided-demos/5_basic_kueue.ipynb

+185
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"In this notebook we will go over the basics of creating Kueue-enabled resources with the SDK."
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": null,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"# Import pieces from codeflare-sdk\n",
17+
"from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": null,
23+
"metadata": {},
24+
"outputs": [],
25+
"source": [
26+
"# Create authentication object for user permissions\n",
27+
"# If unused, the SDK will automatically check for the default kubeconfig, then the in-cluster config\n",
28+
"# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
29+
"auth = TokenAuthentication(\n",
30+
" token = \"XXXXX\",\n",
31+
" server = \"XXXXX\",\n",
32+
" skip_tls=False\n",
33+
")\n",
34+
"auth.login()"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"metadata": {},
41+
"outputs": [],
42+
"source": [
43+
"# Create and configure our cluster object\n",
44+
"# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\"\n",
45+
"cluster = Cluster(ClusterConfiguration(\n",
46+
" name='kueue-test',\n",
47+
" namespace='default',\n",
48+
" num_workers=2,\n",
49+
" min_cpus=1,\n",
50+
" max_cpus=1,\n",
51+
" min_memory=4,\n",
52+
" max_memory=4,\n",
53+
" num_gpus=0,\n",
54+
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
55+
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
56+
"))"
57+
]
58+
},
59+
{
60+
"cell_type": "code",
61+
"execution_count": null,
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"cluster.up()"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": null,
71+
"metadata": {},
72+
"outputs": [],
73+
"source": [
74+
"cluster.wait_ready()"
75+
]
76+
},
77+
{
78+
"cell_type": "code",
79+
"execution_count": null,
80+
"metadata": {},
81+
"outputs": [],
82+
"source": [
83+
"cluster.status()"
84+
]
85+
},
86+
{
87+
"cell_type": "code",
88+
"execution_count": null,
89+
"metadata": {},
90+
"outputs": [],
91+
"source": [
92+
"cluster.details()"
93+
]
94+
},
95+
{
96+
"cell_type": "code",
97+
"execution_count": null,
98+
"metadata": {},
99+
"outputs": [],
100+
"source": [
101+
"cluster.down()"
102+
]
103+
},
104+
{
105+
"cell_type": "markdown",
106+
"metadata": {},
107+
"source": [
108+
"Now, we will submit Jobs directly to Kueue, which will schedule a Batch Job to run with the requested resources using the Kueue TorchX scheduler:\n",
109+
"\n",
110+
"NOTE: To test this demo in an air-gapped/disconnected environment, alter the training script to use a local dataset."
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": null,
116+
"metadata": {},
117+
"outputs": [],
118+
"source": [
119+
"from codeflare_sdk import DDPJobDefinition\n",
120+
"jobdef = DDPJobDefinition(\n",
121+
" name=\"mnistjob\",\n",
122+
" script=\"mnist.py\",\n",
123+
" # script=\"mnist_disconnected.py\", # training script for disconnected environment\n",
124+
" scheduler_args={\"namespace\": \"default\"},\n",
125+
" j=\"1x1\",\n",
126+
" gpu=0,\n",
127+
" cpu=1,\n",
128+
" memMB=8000,\n",
129+
" image=\"quay.io/project-codeflare/mnist-job-test:v0.0.1\",\n",
130+
")\n",
131+
"job = jobdef.submit()"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": null,
137+
"metadata": {},
138+
"outputs": [],
139+
"source": [
140+
"job.status()"
141+
]
142+
},
143+
{
144+
"cell_type": "code",
145+
"execution_count": null,
146+
"metadata": {},
147+
"outputs": [],
148+
"source": [
149+
"job.logs()"
150+
]
151+
},
152+
{
153+
"cell_type": "markdown",
154+
"metadata": {},
155+
"source": [
156+
"Once the pods complete, we can clean them up alongside any other associated resources. The following command can also be used to cancel a job submitted to Kueue before it finishes:"
157+
]
158+
},
159+
{
160+
"cell_type": "code",
161+
"execution_count": null,
162+
"metadata": {},
163+
"outputs": [],
164+
"source": [
165+
"job.cancel()"
166+
]
167+
},
168+
{
169+
"cell_type": "code",
170+
"execution_count": null,
171+
"metadata": {},
172+
"outputs": [],
173+
"source": [
174+
"auth.logout()"
175+
]
176+
}
177+
],
178+
"metadata": {
179+
"language_info": {
180+
"name": "python"
181+
}
182+
},
183+
"nbformat": 4,
184+
"nbformat_minor": 2
185+
}

Diff for: demo-notebooks/guided-demos/notebook-ex-outputs/0_basic_ray.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
" max_memory=4,\n",
7878
" num_gpus=0,\n",
7979
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
80+
" mcad=True,\n",
8081
" instascale=False\n",
8182
"))"
8283
]

Diff for: demo-notebooks/guided-demos/notebook-ex-outputs/1_basic_instascale.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575
" num_gpus=1,\n",
7676
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
7777
" instascale=True, # InstaScale now enabled, will scale OCP cluster to guarantee resource request\n",
78+
" mcad=True,\n",
7879
" machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"] # Head, worker AWS machine types desired\n",
7980
"))"
8081
]

Diff for: demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_jobs.ipynb

+4-2
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@
7474
" max_memory=4,\n",
7575
" num_gpus=0,\n",
7676
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
77-
" instascale=False\n",
77+
" instascale=False,\n",
78+
" mcad=True\n",
7879
"))"
7980
]
8081
},
@@ -387,7 +388,8 @@
387388
" gpu=0,\n",
388389
" cpu=1,\n",
389390
" memMB=8000,\n",
390-
" image=\"quay.io/project-codeflare/mnist-job-test:v0.0.1\"\n",
391+
" image=\"quay.io/project-codeflare/mnist-job-test:v0.0.1\",\n",
392+
" mcad=True\n",
391393
")\n",
392394
"job = jobdef.submit()"
393395
]

Diff for: demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
" max_memory=8,\n",
7575
" num_gpus=1,\n",
7676
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
77+
" mcad=True,\n",
7778
" instascale=True, #<---instascale enabled\n",
7879
" machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n",
7980
" \n",

Diff for: demo-notebooks/guided-demos/notebook-ex-outputs/4_gpt.ipynb

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
" max_memory=8,\n",
6565
" num_gpus=1,\n",
6666
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
67+
" mcad=True,\n",
6768
" instascale=True, #<---instascale enabled\n",
6869
" machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"],\n",
6970
"))"

0 commit comments

Comments
 (0)