
Commit 9925290

fix: gpu utilization may not be streamed out
This PR also backs out assigning a GPU to the Ray head, and adopts the hostname convention already used in pod-vmstat.sh and pod-vmstat-memory.sh.
1 parent 1eb8e9d commit 9925290

3 files changed: +3, -4 lines

guidebooks/ml/ray/aggregator/setup.md

Lines changed: 1 addition & 1 deletion
@@ -41,5 +41,5 @@ export JOB_ENV=$(
 ```
 
 ```shell
-export NUM_GPUS=$(echo "$JOB_ENV" | jq -cr '.runtime_env.env_vars.NUM_GPUS')
+export NUM_GPUS=${NUM_GPUS-$(echo "$JOB_ENV" | jq -cr '.runtime_env.env_vars.NUM_GPUS')}
 ```
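The new form relies on the POSIX `${VAR-default}` expansion: if NUM_GPUS is already set in the caller's environment it wins, and the jq lookup against JOB_ENV only runs as a fallback. A minimal illustration of the behavior (the JOB_ENV payload below is made up for this sketch):

```shell
# Made-up JOB_ENV payload, for illustration only
JOB_ENV='{"runtime_env":{"env_vars":{"NUM_GPUS":"2"}}}'

unset NUM_GPUS
echo "${NUM_GPUS-$(echo "$JOB_ENV" | jq -cr '.runtime_env.env_vars.NUM_GPUS')}"  # prints 2 (falls back to jq)

NUM_GPUS=4
echo "${NUM_GPUS-$(echo "$JOB_ENV" | jq -cr '.runtime_env.env_vars.NUM_GPUS')}"  # prints 4 (caller's value wins)
```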

guidebooks/ml/ray/run/gpu-utilization.sh

Lines changed: 2 additions & 2 deletions
@@ -13,13 +13,13 @@ if [ -z "$QUIET_CONSOLE" ]; then
 kubectl get pod -l ${KUBE_POD_LABEL_SELECTOR} ${KUBE_CONTEXT_ARG} ${KUBE_NS_ARG} -o name \
 --field-selector=status.phase==Running \
 | xargs ${REPLSIZE} -P128 -I {} -n1 \
-sh -c "sleep 0.\$(shuf -i 100-2000 -n1); kubectl exec --pod-running-timeout=1h ${KUBE_CONTEXT_ARG} ${KUBE_NS_ARG} {} -- sh -c \"nvidia-smi --query-gpu=timestamp,utilization.gpu,utilization.memory,memory.total,temperature.gpu,name --format=csv,noheader -l 10 | awk -Winteractive -v pod=\\\$(hostname) -F, '{printf \\\"\n\033[31;1m%s \033[0;31mGPUType\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$6, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mUtilization.GPU\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$2, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mUtilization.Memory\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$3, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mMemory.Total\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$4, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mTemperature.GPU\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$5, pod, \\\$1; }'\"" \
+sh -c "sleep 0.\$(shuf -i 100-2000 -n1); kubectl exec --pod-running-timeout=1h ${KUBE_CONTEXT_ARG} ${KUBE_NS_ARG} {} -- sh -c \"nvidia-smi --query-gpu=timestamp,utilization.gpu,utilization.memory,memory.total,temperature.gpu,name --format=csv,noheader -l 10 | awk -Winteractive -v pod=\\\$(hostname | sed -E 's/-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}//') -F, '{printf \\\"\n\033[31;1m%s \033[0;31mGPUType\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$6, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mUtilization.GPU\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$2, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mUtilization.Memory\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$3, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mMemory.Total\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$4, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mTemperature.GPU\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$5, pod, \\\$1; }'\"" \
 | tee "${STREAMCONSUMER_RESOURCES}gpu.txt" \
 1>&2
 else
 kubectl get pod -l ${KUBE_POD_LABEL_SELECTOR} ${KUBE_CONTEXT_ARG} ${KUBE_NS_ARG} -o name \
 --field-selector=status.phase==Running \
 | xargs ${REPLSIZE} -P128 -I {} -n1 \
-sh -c "sleep 0.\$(shuf -i 100-2000 -n1); kubectl exec --pod-running-timeout=1h ${KUBE_CONTEXT_ARG} ${KUBE_NS_ARG} {} -- sh -c \"nvidia-smi --query-gpu=timestamp,utilization.gpu,utilization.memory,memory.total,temperature.gpu,name --format=csv,noheader -l 10 | awk -Winteractive -v pod=\\\$(hostname) -F, '{printf \\\"\n\033[31;1m%s \033[0;31mGPUType\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$6, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mUtilization.GPU\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$2, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mUtilization.Memory\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$3, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mMemory.Total\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$4, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mTemperature.GPU\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$5, pod, \\\$1; }'\"" \
+sh -c "sleep 0.\$(shuf -i 100-2000 -n1); kubectl exec --pod-running-timeout=1h ${KUBE_CONTEXT_ARG} ${KUBE_NS_ARG} {} -- sh -c \"nvidia-smi --query-gpu=timestamp,utilization.gpu,utilization.memory,memory.total,temperature.gpu,name --format=csv,noheader -l 10 | awk -Winteractive -v pod=\\\$(hostname | sed -E 's/-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}//') -F, '{printf \\\"\n\033[31;1m%s \033[0;31mGPUType\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$6, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mUtilization.GPU\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$2, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mUtilization.Memory\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$3, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mMemory.Total\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$4, pod, \\\$1; printf \\\"\033[31;1m%s \033[0;31mTemperature.GPU\t\t\t\033[0;2m%s %s\033[0m\n\\\", \\\$5, pod, \\\$1; }'\"" \
 > "${STREAMCONSUMER_RESOURCES}gpu.txt"
 fi
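Both branches now derive the per-pod label from `hostname | sed ...`, which strips a trailing UUID from the pod hostname so the prefix on each streamed metric stays short, matching what pod-vmstat.sh and pod-vmstat-memory.sh already do. A quick illustration with a hypothetical hostname (the worker name and UUID suffix below are invented):

```shell
# Hypothetical worker pod hostname with a trailing UUID
echo "mycluster-ray-worker-type-5f3a9b2c-1d2e-4f56-8a7b-9c0d1e2f3a4b" \
  | sed -E 's/-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}//'
# prints: mycluster-ray-worker-type
```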

guidebooks/ml/ray/start/kubernetes/install-via-helm.sh

Lines changed: 0 additions & 1 deletion
@@ -184,7 +184,6 @@ cd $REPO/$SUBDIR && \
 --set clusterNamespace=${KUBE_NS_FOR_REAL-${KUBE_NS}} \
 --set podTypes.rayHeadType.CPU=${NUM_CPUS-1} \
 --set podTypes.rayHeadType.CPUInteger=${NUM_CPUS_INTEGER-1} \
---set podTypes.rayHeadType.GPU=${NUM_GPUS-0} \
 --set podTypes.rayHeadType.memory=${HEAD_MEMORY-1Gi} \
 --set podTypes.rayHeadType.storage=${RAY_EPHEMERAL_STORAGE-5Gi} \
 --set podTypes.rayWorkerType.CPU=${NUM_CPUS-1} \
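With the rayHeadType.GPU override removed, the head pod should no longer request a GPU from the scheduler (this is the "backs out assigning a GPU to the ray head" part of the commit message). One hedged way to spot-check this after a redeploy; the ray-node-type=head label selector is an assumption about the chart's pod labeling and may differ per chart version:

```shell
# List each head pod and its container resource limits;
# nvidia.com/gpu should no longer appear for the head.
kubectl get pod -l ray-node-type=head ${KUBE_CONTEXT_ARG} ${KUBE_NS_ARG} \
  -o jsonpath='{range .items[*]}{.metadata.name}{": "}{.spec.containers[*].resources.limits}{"\n"}{end}'
```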
