File tree 7 files changed +81
-108
lines changed
7 files changed +81
-108
lines changed Original file line number Diff line number Diff line change
1
+ # Creating a GPU self-hosted agent pool
2
+
3
+ ## Prepare the machine
4
+
5
+ This is a slightly modified version of the script from
6
+ https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
7
+
8
+ ``` bash
9
+ apt-get update
10
+ apt-get install -y --no-install-recommends \
11
+ ca-certificates \
12
+ curl \
13
+ jq \
14
+ git \
15
+ iputils-ping \
16
+ libcurl4 \
17
+ libunwind8 \
18
+ netcat \
19
+ libssl1.0
20
+
21
+ curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
22
+ mkdir /azp
23
+ ```
24
+
25
+ ## Starting the agents
26
+
27
+ ``` bash
28
+ export TARGETARCH=linux-x64
29
+ export AZP_URL=" https://dev.azure.com/Lightning-AI"
30
+ export AZP_TOKEN=" xxxxxxxxxxxxxxxxxxxxxxxxxx"
31
+ export AZP_POOL=" lit-rtx-3090"
32
+
33
+ for i in {0..7..2}
34
+ do
35
+ nohup bash .azure/start.sh \
36
+ " AZP_AGENT_NAME=litGPU-YX_$i ,$(( i+ 1 )) " \
37
+ " CUDA_VISIBLE_DEVICES=$i ,$(( i+ 1 )) " \
38
+ > " agent-$i .log" &
39
+ done
40
+ ```
41
+
42
+ ## Check running agents
43
+
44
+ ``` bash
45
+ ps aux | grep start.sh
46
+ ```
Original file line number Diff line number Diff line change @@ -41,12 +41,14 @@ jobs:
41
41
timeoutInMinutes : " 20"
42
42
# how much time to give 'run always even if cancelled tasks' before stopping them
43
43
cancelTimeoutInMinutes : " 2"
44
- pool : azure-jirka-spot
44
+ pool : lit-rtx-3090
45
+ variables :
46
+ DEVICES : $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
45
47
container :
46
48
image : " pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
47
49
# default shm size is 64m. Increase it to avoid:
48
50
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
49
- options : " --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES= all --shm-size=512m "
51
+ options : " --gpus= all --shm-size=2gb "
50
52
workspace :
51
53
clean : all
52
54
61
63
pip list
62
64
displayName: 'Image info & NVIDIA'
63
65
66
+ - bash : |
67
+ echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
68
+ displayName: 'set visible devices'
69
+
64
70
- bash : |
65
71
set -e
66
72
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
78
84
79
85
- bash : |
80
86
set -e
87
+ echo $CUDA_VISIBLE_DEVICES
81
88
python requirements/collect_env_details.py
82
- python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu > = 2, f'GPU: {mgpu}'"
89
+ python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu = = 2, f'GPU: {mgpu}'"
83
90
displayName: 'Env details'
84
91
85
92
- bash : python -m coverage run --source lightning_lite -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
Original file line number Diff line number Diff line change @@ -67,12 +67,14 @@ jobs:
67
67
timeoutInMinutes : " 80"
68
68
# how much time to give 'run always even if cancelled tasks' before stopping them
69
69
cancelTimeoutInMinutes : " 2"
70
- pool : azure-jirka-spot
70
+ pool : lit-rtx-3090
71
+ variables :
72
+ DEVICES : $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
71
73
container :
72
74
image : $(image)
73
75
# default shm size is 64m. Increase it to avoid:
74
76
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
75
- options : " --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES= all --shm-size=512m "
77
+ options : " --gpus= all --shm-size=2gb "
76
78
workspace :
77
79
clean : all
78
80
87
89
pip list
88
90
displayName: 'Image info & NVIDIA'
89
91
92
+ - bash : |
93
+ echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
94
+ displayName: 'set visible devices'
95
+
90
96
- bash : |
91
97
set -e
92
98
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
@@ -112,8 +118,9 @@ jobs:
112
118
113
119
- bash : |
114
120
set -e
121
+ echo $CUDA_VISIBLE_DEVICES
115
122
python requirements/collect_env_details.py
116
- python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu > = 2, f'GPU: {mgpu}'"
123
+ python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu = = 2, f'GPU: {mgpu}'"
117
124
python requirements/pytorch/check-avail-strategies.py
118
125
python requirements/pytorch/check-avail-extras.py
119
126
displayName: 'Env details'
Original file line number Diff line number Diff line change 5
5
6
6
set -e
7
7
8
+ # export all args as env variables
9
+ for var in " $@ "
10
+ do
11
+ echo " $var "
12
+ eval " export $var "
13
+ done
14
+
15
+ printenv
16
+
8
17
if [ -z " $AZP_URL " ]; then
9
18
echo 1>&2 " error: missing AZP_URL environment variable"
10
19
exit 1
@@ -26,9 +35,9 @@ if [ -n "$AZP_WORK" ]; then
26
35
mkdir -p " $AZP_WORK "
27
36
fi
28
37
29
- rm -rf /azp/agent
30
- mkdir /azp/agent
31
- cd /azp/agent
38
+ rm -rf /azp/agent- $AZP_AGENT_NAME
39
+ mkdir /azp/agent- $AZP_AGENT_NAME
40
+ cd /azp/agent- $AZP_AGENT_NAME
32
41
33
42
export AGENT_ALLOW_RUNASROOT=" 1"
34
43
@@ -74,7 +83,7 @@ curl -LsS $AZP_AGENTPACKAGE_URL | tar -xz & wait $!
74
83
75
84
source ./env.sh
76
85
77
- print_header " 3. Configuring Azure Pipelines agent..."
86
+ print_header " 3. Configuring Azure Pipelines agent $AZP_AGENT_NAME ..."
78
87
79
88
./config.sh --unattended \
80
89
--agent " ${AZP_AGENT_NAME:- $(hostname)} " \
Original file line number Diff line number Diff line change @@ -59,7 +59,7 @@ RUN pip uninstall pytorch-lightning -y
59
59
60
60
WORKDIR /azp
61
61
62
- COPY ./dockers/ci-runner-hpu /start.sh /usr/local/bin/
62
+ COPY ./.azure /start.sh /usr/local/bin/
63
63
RUN chmod +x /usr/local/bin/start.sh
64
64
65
65
ENTRYPOINT ["/usr/local/bin/start.sh" ]
Original file line number Diff line number Diff line change @@ -23,7 +23,7 @@ RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
23
23
24
24
WORKDIR /azp
25
25
26
- COPY ./dockers/ci-runner-ipu /start.sh /usr/local/bin/
26
+ COPY ./.azure /start.sh /usr/local/bin/
27
27
28
28
RUN curl -o /usr/local/bin/installdependencies.sh \
29
29
"https://raw.githubusercontent.com/microsoft/azure-pipelines-agent/d2acd5f77c6b3914cdb6ed0e5fbea672929c7da9/src/Misc/layoutbin/installdependencies.sh" && \
Load Diff This file was deleted.
You can’t perform that action at this time.
0 commit comments