Skip to content

Commit 7de999a

Browse files
Borda, pre-commit-ci[bot], carmocca, akihironitta, otaj
authored and committed
CI: Use self-hosted Azure GPU runners (#14632)
* move config * Apply suggestions from code review Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí <[email protected]> Co-authored-by: Akihiro Nitta <[email protected]> Co-authored-by: otaj <[email protected]>
1 parent 0dd4f41 commit 7de999a

File tree

7 files changed

+81
-108
lines changed

7 files changed

+81
-108
lines changed

.azure/README.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Creating a GPU self-hosted agent pool
2+
3+
## Prepare the machine
4+
5+
This is a slightly modified version of the script from
6+
https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
7+
8+
```bash
9+
apt-get update
10+
apt-get install -y --no-install-recommends \
11+
ca-certificates \
12+
curl \
13+
jq \
14+
git \
15+
iputils-ping \
16+
libcurl4 \
17+
libunwind8 \
18+
netcat \
19+
libssl1.0
20+
21+
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
22+
mkdir /azp
23+
```
24+
25+
## Starting the agents
26+
27+
```bash
28+
export TARGETARCH=linux-x64
29+
export AZP_URL="https://dev.azure.com/Lightning-AI"
30+
export AZP_TOKEN="xxxxxxxxxxxxxxxxxxxxxxxxxx"
31+
export AZP_POOL="lit-rtx-3090"
32+
33+
for i in {0..7..2}
34+
do
35+
nohup bash .azure/start.sh \
36+
"AZP_AGENT_NAME=litGPU-YX_$i,$((i+1))" \
37+
"CUDA_VISIBLE_DEVICES=$i,$((i+1))" \
38+
> "agent-$i.log" &
39+
done
40+
```
41+
42+
## Check running agents
43+
44+
```bash
45+
ps aux | grep start.sh
46+
```

.azure/gpu-tests-lite.yml

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,14 @@ jobs:
4141
timeoutInMinutes: "20"
4242
# how much time to give 'run always even if cancelled tasks' before stopping them
4343
cancelTimeoutInMinutes: "2"
44-
pool: azure-jirka-spot
44+
pool: lit-rtx-3090
45+
variables:
46+
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
4547
container:
4648
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
4749
# default shm size is 64m. Increase it to avoid:
4850
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
49-
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
51+
options: "--gpus=all --shm-size=2gb"
5052
workspace:
5153
clean: all
5254

@@ -61,6 +63,10 @@ jobs:
6163
pip list
6264
displayName: 'Image info & NVIDIA'
6365
66+
- bash: |
67+
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
68+
displayName: 'set visible devices'
69+
6470
- bash: |
6571
set -e
6672
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
@@ -78,8 +84,9 @@ jobs:
7884
7985
- bash: |
8086
set -e
87+
echo $CUDA_VISIBLE_DEVICES
8188
python requirements/collect_env_details.py
82-
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
89+
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
8390
displayName: 'Env details'
8491
8592
- bash: python -m coverage run --source lightning_lite -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50

.azure/gpu-tests.yml

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,14 @@ jobs:
6767
timeoutInMinutes: "80"
6868
# how much time to give 'run always even if cancelled tasks' before stopping them
6969
cancelTimeoutInMinutes: "2"
70-
pool: azure-jirka-spot
70+
pool: lit-rtx-3090
71+
variables:
72+
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
7173
container:
7274
image: $(image)
7375
# default shm size is 64m. Increase it to avoid:
7476
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
75-
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
77+
options: "--gpus=all --shm-size=2gb"
7678
workspace:
7779
clean: all
7880

@@ -87,6 +89,10 @@ jobs:
8789
pip list
8890
displayName: 'Image info & NVIDIA'
8991
92+
- bash: |
93+
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
94+
displayName: 'set visible devices'
95+
9096
- bash: |
9197
set -e
9298
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
@@ -112,8 +118,9 @@ jobs:
112118
113119
- bash: |
114120
set -e
121+
echo $CUDA_VISIBLE_DEVICES
115122
python requirements/collect_env_details.py
116-
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
123+
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
117124
python requirements/pytorch/check-avail-strategies.py
118125
python requirements/pytorch/check-avail-extras.py
119126
displayName: 'Env details'

dockers/ci-runner-hpu/start.sh renamed to .azure/start.sh

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,15 @@
55

66
set -e
77

8+
# export all args as env variables
9+
for var in "$@"
10+
do
11+
echo "$var"
12+
eval "export $var"
13+
done
14+
15+
printenv
16+
817
if [ -z "$AZP_URL" ]; then
918
echo 1>&2 "error: missing AZP_URL environment variable"
1019
exit 1
@@ -26,9 +35,9 @@ if [ -n "$AZP_WORK" ]; then
2635
mkdir -p "$AZP_WORK"
2736
fi
2837

29-
rm -rf /azp/agent
30-
mkdir /azp/agent
31-
cd /azp/agent
38+
rm -rf /azp/agent-$AZP_AGENT_NAME
39+
mkdir /azp/agent-$AZP_AGENT_NAME
40+
cd /azp/agent-$AZP_AGENT_NAME
3241

3342
export AGENT_ALLOW_RUNASROOT="1"
3443

@@ -74,7 +83,7 @@ curl -LsS $AZP_AGENTPACKAGE_URL | tar -xz & wait $!
7483

7584
source ./env.sh
7685

77-
print_header "3. Configuring Azure Pipelines agent..."
86+
print_header "3. Configuring Azure Pipelines agent $AZP_AGENT_NAME..."
7887

7988
./config.sh --unattended \
8089
--agent "${AZP_AGENT_NAME:-$(hostname)}" \

dockers/ci-runner-hpu/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ RUN pip uninstall pytorch-lightning -y
5959

6060
WORKDIR /azp
6161

62-
COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/
62+
COPY ./.azure/start.sh /usr/local/bin/
6363
RUN chmod +x /usr/local/bin/start.sh
6464

6565
ENTRYPOINT ["/usr/local/bin/start.sh"]

dockers/ci-runner-ipu/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
2323

2424
WORKDIR /azp
2525

26-
COPY ./dockers/ci-runner-ipu/start.sh /usr/local/bin/
26+
COPY ./.azure/start.sh /usr/local/bin/
2727

2828
RUN curl -o /usr/local/bin/installdependencies.sh \
2929
"https://raw.githubusercontent.com/microsoft/azure-pipelines-agent/d2acd5f77c6b3914cdb6ed0e5fbea672929c7da9/src/Misc/layoutbin/installdependencies.sh" && \

dockers/ci-runner-ipu/start.sh

Lines changed: 0 additions & 96 deletions
This file was deleted.

0 commit comments

Comments
 (0)