Skip to content

Commit 333e8d9

Browse files
committed Mar 4, 2023
feat: in BYOC mode, ray helm chart should also submit the job
1 parent bc60f29 commit 333e8d9

File tree

13 files changed

+50
-42
lines changed

13 files changed

+50
-42
lines changed
 

‎.github/workflows/kind.yml

-1
Original file line number | Diff line number | Diff line change
@@ -47,5 +47,4 @@ jobs:
4747
env:
4848
EXECUTABLE_PATH: github-actions-production
4949
DEBUG_KUBERNETES: true
50-
TEST_LOG_AGGREGATOR: true
5150
run: ./tests/kind/run.sh ${{ matrix.profile }}

‎package-lock.json

+17-17
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎plugins/plugin-codeflare/package.json

+2-2
Original file line number | Diff line number | Diff line change
@@ -30,13 +30,13 @@
3030
"@types/split2": "^3.2.1"
3131
},
3232
"dependencies": {
33-
"@guidebooks/store": "^5.3.4",
33+
"@guidebooks/store": "^5.4.8",
3434
"@logdna/tail-file": "^3.0.1",
3535
"@patternfly/react-charts": "^6.94.18",
3636
"@patternfly/react-core": "^4.276.6",
3737
"asciinema-player": "^3.1.0",
3838
"chokidar": "^3.5.3",
39-
"madwizard": "^6.2.5",
39+
"madwizard": "^6.2.8",
4040
"needle": "^3.2.0",
4141
"open": "^8.4.2",
4242
"pretty-bytes": "^6.1.0",

‎tests/kind/profiles/non-gpu1/keep-it-simple

+3-3
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
"name": "keep-it-simple",
33
"creationTime": 1664149787016,
44
"lastModifiedTime": 1676828268772,
5-
"lastUsedTime": 1676836671254,
5+
"lastUsedTime": 1677870297449,
66
"choices": {
77
"madwizard/apriori/use-gpu": "don't use gpus",
88
"madwizard/apriori/arch": "x64",
@@ -11,7 +11,7 @@
1111
"madwizard/apriori/in-terminal": "HTML",
1212
"ml/codeflare": "Submit a new Run",
1313
"ml/codeflare/run": "Bring Your Own Code",
14-
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
14+
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
1717
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
@@ -22,4 +22,4 @@
2222
"ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
2323
"ml/ray/cluster/kubernetes/choose-pod-scheduler": "Keep It Simple"
2424
}
25-
}
25+
}

‎tests/kind/profiles/non-gpu1/mcad-coscheduler

+2-2
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
"madwizard/apriori/in-terminal": "HTML",
1212
"ml/codeflare": "Submit a new Run",
1313
"ml/codeflare/run": "Bring Your Own Code",
14-
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
14+
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
1717
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
@@ -24,4 +24,4 @@
2424
"kubernetes/mcad/choose/job-priority": "Default Priority",
2525
"kubernetes/mcad/choose/scheduler": "MCAD with the Advanced Coscheduler"
2626
}
27-
}
27+
}

‎tests/kind/profiles/non-gpu1/mcad-default

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
"madwizard/apriori/in-terminal": "HTML",
1212
"ml/codeflare": "Submit a new Run",
1313
"ml/codeflare/run": "Bring Your Own Code",
14-
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
14+
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
1717
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",

‎tests/kind/profiles/non-gpu1/mcad-preinstalled

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
"madwizard/apriori/in-terminal": "HTML",
1212
"ml/codeflare": "Submit a new Run",
1313
"ml/codeflare/run": "Bring Your Own Code",
14-
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
14+
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
1717
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",

‎tests/kind/profiles/non-gpu1/ray-autoscaler

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
"madwizard/apriori/in-terminal": "HTML",
1212
"ml/codeflare": "Submit a new Run",
1313
"ml/codeflare/run": "Bring Your Own Code",
14-
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
14+
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
1717
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"0\",\"Maximum Workers\":\"0\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"2.5Gi\",\"Ephemeral Storage\":\"5Gi\"}",

‎tests/kind/profiles/non-gpu2/keep-it-simple

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
"madwizard/apriori/in-terminal": "HTML",
1212
"ml/codeflare": "Submit a new Run",
1313
"ml/codeflare/run": "Bring Your Own Code",
14-
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
14+
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
1717
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",

‎tests/kind/profiles/non-gpu3/keep-it-simple

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
"madwizard/apriori/in-terminal": "HTML",
1212
"ml/codeflare": "Submit a new Run",
1313
"ml/codeflare/run": "Bring Your Own Code",
14-
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
14+
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
1717
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",

‎tests/kind/profiles/non-gpu4/keep-it-simple

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
"madwizard/apriori/in-terminal": "HTML",
1212
"ml/codeflare": "Submit a new Run",
1313
"ml/codeflare/run": "Bring Your Own Code",
14-
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
14+
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
1717
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",

‎tests/kind/run.sh

+13-10
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010

1111
set -e
1212
set -o pipefail
13+
if [ -z "$TERM" ]; then export TERM=xterm-256color; fi
1314

1415
SCRIPTDIR=$(cd $(dirname "$0") && pwd)
1516
. "$SCRIPTDIR"/values.sh
@@ -87,7 +88,7 @@ function run {
8788
fi
8889

8990
echo "[Test] Running with variant=$variant profile=$profile yes=$yes"
90-
GUIDEBOOK_NAME="main-job-run" "$ROOT"/bin/codeflare -p $profile $yes $guidebook
91+
GUIDEBOOK_NAME="main-job-run" "$ROOT"/bin/codeflare -p $profile $yes $guidebook | tee $OUTPUT
9192
}
9293

9394
#
@@ -159,6 +160,9 @@ function onexit {
159160
(pkill -P $EVENTS_PID || exit 0)
160161
fi
161162

163+
# just in case...
164+
(kubectl delete job ray-cleaner-codeflare-test-ray-cluster || exit 0)
165+
162166
if [ -z "$NO_KIND" ]; then
163167
# don't kill ourselves if we're running in a container
164168
sleep 10
@@ -200,21 +204,20 @@ function test {
200204
echo "[Test] Using JOB_ID=$JOB_ID"
201205

202206
# 1. launch codeflare guidebook run
203-
run "$1" | tee $OUTPUT &
204-
local RUN_PID=$!
205-
echo "[Test] Run submitted pid=$RUN_PID"
207+
echo "[Test] Submitting run"
208+
run "$1"
206209

207210
# wait to attach until the job has been submitted
208-
while true; do
209-
grep -q 'Run it' "$OUTPUT" && break
210-
echo "[Test] Waiting to attach log streamer..."
211-
sleep 1
212-
done
211+
# while true; do
212+
# grep -q 'Run it' "$OUTPUT" && break
213+
# echo "[Test] Waiting to attach log streamer..."
214+
# sleep 1
215+
#done
213216

214217
# echo "[Test] Preparing to attach log streamer jobid=$JOB_ID"
215218
# attach "$1" "$JOB_ID"
216219

217-
wait $RUN_PID
220+
# wait $RUN_PID
218221
echo "[Test] Run has finished"
219222
# the job should be done now
220223

‎tests/self-test/self-test.yaml

+7-1
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,14 @@ metadata:
99
name: codeflare-self-test-role
1010
rules:
1111
- apiGroups: [""]
12-
resources: ["pods", "pods/exec", "services", "events", "secrets", "serviceaccounts"]
12+
resources: ["pods", "pods/exec", "services", "events", "serviceaccounts", "configmaps"]
1313
verbs: ["create", "delete", "get", "watch", "list"]
14+
- apiGroups: [""]
15+
resources: ["secrets"]
16+
verbs: ["create", "get", "list", "delete", "update"]
17+
- apiGroups: ["batch"]
18+
resources: ["jobs"]
19+
verbs: ["create", "get", "list", "delete"]
1420
- apiGroups: ["rbac.authorization.k8s.io"]
1521
resources: ["roles", "rolebindings"]
1622
verbs: ["create", "delete", "get", "list"]

0 commit comments

Comments
 (0)
Please sign in to comment.