Skip to content

Commit e87c11a

Browse files
awaelchli, carmocca, Borda
authored
Upgrade GPU CI to PyTorch 1.13 (#15583)
Co-authored-by: Carlos Mocholí <[email protected]>
Co-authored-by: Jirka <[email protected]>
1 parent c32c435 commit e87c11a

File tree

11 files changed

+94
-45
lines changed

11 files changed

+94
-45
lines changed

.azure/gpu-benchmark.yml

+28 -5
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ jobs:
3939
variables:
4040
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
4141
container:
42-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
42+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
4343
options: "--gpus=all --shm-size=32g"
4444
workspace:
4545
clean: all
@@ -49,18 +49,41 @@ jobs:
4949
- bash: |
5050
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
5151
cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
52-
echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver"
5352
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
5453
displayName: 'set env. vars'
5554
5655
- bash: |
57-
pip install -e .[strategies] --find-links ${TORCH_URL}
56+
echo $CUDA_VISIBLE_DEVICES
57+
echo $TORCH_URL
58+
lspci | egrep 'VGA|3D'
59+
whereis nvidia
60+
nvidia-smi
61+
which python && which pip
62+
python --version
63+
pip --version
5864
pip list
65+
displayName: 'Image info & NVIDIA'
66+
67+
- bash: |
68+
python .actions/assistant.py requirements_prune_pkgs --packages [horovod,bagua,colossalai] --req_files [requirements/pytorch/strategies.txt]
69+
70+
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
71+
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
72+
displayName: 'Adjust dependencies'
73+
74+
- bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
5975
env:
60-
PACKAGE_NAME: pytorch
61-
FREEZE_REQUIREMENTS: 1
76+
PACKAGE_NAME: "pytorch"
77+
FREEZE_REQUIREMENTS: "1"
6278
displayName: 'Install package'
6379

80+
- bash: |
81+
set -e
82+
pip list
83+
python requirements/collect_env_details.py
84+
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
85+
displayName: 'Env details'
86+
6487
- bash: python -m pytest benchmarks -v --durations=0
6588
env:
6689
PL_RUNNING_BENCHMARKS: "1"

.azure/gpu-tests-lite.yml

+15 -8
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ jobs:
4141
variables:
4242
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
4343
container:
44-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
44+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
4545
# default shm size is 64m. Increase it to avoid:
4646
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
4747
options: "--gpus=all --shm-size=2gb"
@@ -51,6 +51,14 @@ jobs:
5151

5252
steps:
5353
- bash: |
54+
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
55+
cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
56+
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
57+
displayName: 'set env. vars'
58+
59+
- bash: |
60+
echo $CUDA_VISIBLE_DEVICES
61+
echo $TORCH_URL
5462
lspci | egrep 'VGA|3D'
5563
whereis nvidia
5664
nvidia-smi
@@ -61,22 +69,21 @@ jobs:
6169
displayName: 'Image info & NVIDIA'
6270
6371
- bash: |
64-
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
65-
displayName: 'set visible devices'
72+
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
73+
python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION}
74+
python ./requirements/pytorch/adjust-versions.py requirements/lite/examples.txt ${PYTORCH_VERSION}
75+
displayName: 'Adjust dependencies'
6676
6777
- bash: |
68-
set -e
69-
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
70-
pip install -e .[dev,strategies,examples] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
71-
pip list
78+
pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
7279
env:
7380
PACKAGE_NAME: "lite"
7481
FREEZE_REQUIREMENTS: "1"
7582
displayName: 'Install package & dependencies'
7683
7784
- bash: |
7885
set -e
79-
echo $CUDA_VISIBLE_DEVICES
86+
pip list
8087
python requirements/collect_env_details.py
8188
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
8289
displayName: 'Env details'

.azure/gpu-tests-pytorch.yml

+13 -7
Original file line number | Diff line number | Diff line change
@@ -39,9 +39,12 @@ jobs:
3939
- job: testing
4040
strategy:
4141
matrix:
42-
# TODO: package parametrization
43-
'PyTorch - stable':
42+
'PyTorch & strategies': # this uses torch 1.12 as not all strategies support 1.13 yet
4443
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
44+
scope: "strategies"
45+
'PyTorch - latest':
46+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
47+
scope: ""
4548
# how long to run the job before automatically cancelling
4649
timeoutInMinutes: "80"
4750
# how much time to give 'run always even if cancelled tasks' before stopping them
@@ -90,11 +93,11 @@ jobs:
9093
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
9194
displayName: 'Adjust dependencies'
9295
93-
- bash: pip install -e .[strategies] -r requirements/pytorch/devel.txt -r requirements/pytorch/examples.txt --find-links ${TORCH_URL}
96+
- bash: pip install -e .[dev,examples] --find-links ${TORCH_URL}
9497
env:
9598
PACKAGE_NAME: "pytorch"
9699
FREEZE_REQUIREMENTS: "1"
97-
displayName: 'Install package'
100+
displayName: 'Install package & extras'
98101

99102
- bash: |
100103
set -e
@@ -106,14 +109,17 @@ jobs:
106109
CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])")
107110
pip install "colossalai==0.1.10+torch${PYTORCH_VERSION_COLOSSALAI}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org
108111
109-
pip list
110-
displayName: 'Install dependencies'
112+
pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL}
113+
114+
python requirements/pytorch/check-avail-strategies.py
115+
condition: eq(variables['scope'], 'strategies')
116+
displayName: 'Install strategies'
111117
112118
- bash: |
113119
set -e
120+
pip list
114121
python requirements/collect_env_details.py
115122
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
116-
python requirements/pytorch/check-avail-strategies.py
117123
python requirements/pytorch/check-avail-extras.py
118124
displayName: 'Env details'
119125

.github/checkgroup.yml

+2
Original file line number | Diff line number | Diff line change
@@ -138,12 +138,14 @@ subprojects:
138138
- "build-cuda (3.9, 1.10, 11.3.1)"
139139
- "build-cuda (3.9, 1.11, 11.3.1)"
140140
- "build-cuda (3.9, 1.12, 11.6.1)"
141+
- "build-cuda (3.9, 1.13, 11.6.1)"
141142
- "build-hpu (1.5.0, 1.11.0)"
142143
- "build-ipu (3.9, 1.10)"
143144
- "build-NGC"
144145
- "build-pl (3.9, 1.10, 11.3.1)"
145146
- "build-pl (3.9, 1.11, 11.3.1)"
146147
- "build-pl (3.9, 1.12, 11.6.1)"
148+
# TODO: add 1.13
147149
- "build-xla (3.7, 1.12)"
148150

149151
# SECTION: lightning_lite

.github/workflows/ci-pytorch-dockers.yml

+2
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ jobs:
3737
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
3838
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
3939
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
40+
- {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
4041
steps:
4142
- uses: actions/checkout@v3
4243
- uses: docker/setup-buildx-action@v2
@@ -98,6 +99,7 @@ jobs:
9899
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
99100
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
100101
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
102+
- {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
101103
steps:
102104
- uses: actions/checkout@v3
103105
- uses: docker/setup-buildx-action@v2

.github/workflows/release-docker.yml

+2 -1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ jobs:
1919
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
2020
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
2121
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
22+
- {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
2223
steps:
2324
- name: Checkout
2425
uses: actions/checkout@v3
@@ -47,7 +48,7 @@ jobs:
4748
- name: Publish Latest to Docker
4849
uses: docker/build-push-action@v3
4950
# Only latest Python and PyTorch
50-
if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12'
51+
if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.13'
5152
with:
5253
repository: pytorchlightning/pytorch_lightning
5354
username: ${{ secrets.DOCKER_USERNAME }}

dockers/base-cuda/Dockerfile

+27 -14
Original file line number | Diff line number | Diff line change
@@ -13,12 +13,13 @@
1313
# limitations under the License.
1414

1515
ARG UBUNTU_VERSION=20.04
16-
ARG CUDA_VERSION=11.3.1
16+
ARG CUDA_VERSION=11.6.1
17+
1718

1819
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
1920

2021
ARG PYTHON_VERSION=3.9
21-
ARG PYTORCH_VERSION=1.12
22+
ARG PYTORCH_VERSION=1.13
2223

2324
SHELL ["/bin/bash", "-c"]
2425
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
@@ -35,7 +36,12 @@ ENV \
3536
RUN \
3637
# TODO: Remove the manual key installation once the base image is updated.
3738
# https://github.com/NVIDIA/nvidia-docker/issues/1631
38-
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
39+
# https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1264715214
40+
apt-get update && apt-get install -y wget && \
41+
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
42+
mkdir -p /etc/apt/keyrings/ && mv 3bf863cc.pub /etc/apt/keyrings/ && \
43+
echo "deb [signed-by=/etc/apt/keyrings/3bf863cc.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" /etc/apt/sources.list.d/cuda.list && \
44+
apt-get update && \
3945
apt-get update -qq --fix-missing && \
4046
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
4147
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
@@ -132,24 +138,32 @@ RUN \
132138

133139
RUN \
134140
# install Bagua
135-
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
136-
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \
137-
pip install "bagua-cuda$CUDA_VERSION_BAGUA" && \
138-
if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \
139-
python -c "import bagua; print(bagua.__version__)"
141+
if [[ $PYTORCH_VERSION != "1.13" ]]; then \
142+
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") ; \
143+
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") ; \
144+
pip install "bagua-cuda$CUDA_VERSION_BAGUA" ; \
145+
if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then \
146+
python -c "import bagua_core; bagua_core.install_deps()"; \
147+
fi ; \
148+
python -c "import bagua; print(bagua.__version__)"; \
149+
fi
140150

141151
RUN \
142152
# install ColossalAI
143-
PYTORCH_VERSION_COLOSSALAI=$(python -c "import torch; print(torch.__version__.split('+')[0][:4])") ; \
144-
CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \
145-
CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \
146-
pip install "colossalai==0.1.10+torch${PYTORCH_VERSION_COLOSSALAI}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org ; \
147-
python -c "import colossalai; print(colossalai.__version__)" ; \
153+
# TODO: 1.13 wheels are not released, remove skip once they are
154+
if [[ $PYTORCH_VERSION != "1.13" ]]; then \
155+
PYTORCH_VERSION_COLOSSALAI=$(python -c "import torch; print(torch.__version__.split('+')[0][:4])") ; \
156+
CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \
157+
CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \
158+
pip install "colossalai==0.1.10+torch${PYTORCH_VERSION_COLOSSALAI}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org ; \
159+
python -c "import colossalai; print(colossalai.__version__)" ; \
160+
fi
148161

149162
RUN \
150163
# install rest of strategies
151164
# remove colossalai from requirements since they are installed separately
152165
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \
166+
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" ; \
153167
cat requirements/pytorch/strategies.txt && \
154168
pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
155169

@@ -163,5 +177,4 @@ RUN \
163177
python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
164178
python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
165179
python requirements/pytorch/check-avail-extras.py && \
166-
python requirements/pytorch/check-avail-strategies.py && \
167180
rm -rf requirements/

requirements/pytorch/strategies.txt

+1 -1
Original file line number | Diff line number | Diff line change
@@ -5,5 +5,5 @@ colossalai>=0.1.10
55
fairscale>=0.4.5, <=0.4.6
66
deepspeed>=0.6.0, <=0.7.0
77
# no need to install with [pytorch] as pytorch is already installed
8-
horovod>=0.21.2, !=0.24.0, <0.25.1
8+
horovod>=0.21.2, !=0.24.0, <=0.26.1
99
hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'

tests/tests_lite/conftest.py

+1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def restore_env_variables():
5454
"HOROVOD_FUSION_THRESHOLD",
5555
"RANK", # set by DeepSpeed
5656
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
57+
"CUDA_MODULE_LOADING", # leaked since PyTorch 1.13
5758
}
5859
leaked_vars.difference_update(allowlist)
5960
assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"

tests/tests_pytorch/conftest.py

+3
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,9 @@ def restore_env_variables():
7272
"HOROVOD_FUSION_THRESHOLD",
7373
"RANK", # set by DeepSpeed
7474
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
75+
"CUDA_MODULE_LOADING", # leaked since PyTorch 1.13
76+
"KMP_INIT_AT_FORK", # leaked since PyTorch 1.13
77+
"KMP_DUPLICATE_LIB_OK", # leaked since PyTorch 1.13
7578
}
7679
leaked_vars.difference_update(allowlist)
7780
assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"

tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py

-9
Original file line number | Diff line number | Diff line change
@@ -61,19 +61,13 @@ def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> Non
6161
def _assert_layer_fsdp_instance(self) -> None:
6262
assert isinstance(self.layer, FullyShardedDataParallel)
6363
assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin)
64-
# root should not be resharding
65-
assert self.layer.reshard_after_forward is False
66-
6764
precision = torch.float16 if self.precision == 16 else torch.bfloat16
6865
assert self.layer.mixed_precision.param_dtype == precision
6966
assert self.layer.mixed_precision.reduce_dtype == precision
7067
assert self.layer.mixed_precision.buffer_dtype == precision
7168

7269
for layer_num in [0, 2]:
7370
assert isinstance(self.layer.module[layer_num], FullyShardedDataParallel)
74-
# Assert that the nested layers are set reshard_after_forward to True
75-
assert self.layer.module[layer_num].reshard_after_forward is True
76-
7771
assert self.layer[layer_num].mixed_precision.param_dtype == precision
7872
assert self.layer[layer_num].mixed_precision.reduce_dtype == precision
7973
assert self.layer[layer_num].mixed_precision.buffer_dtype == precision
@@ -106,9 +100,6 @@ def _assert_layer_fsdp_instance(self) -> None:
106100
precision = torch.float16 if self.precision == 16 else torch.bfloat16
107101
for layer_num in [0, 2]:
108102
assert isinstance(self.layer[layer_num], FullyShardedDataParallel)
109-
# Assert that the nested layers are set reshard_after_forward to True
110-
assert self.layer[layer_num].reshard_after_forward
111-
112103
assert self.layer[layer_num].mixed_precision.param_dtype == precision
113104
assert self.layer[layer_num].mixed_precision.reduce_dtype == precision
114105
assert self.layer[layer_num].mixed_precision.buffer_dtype == precision

0 commit comments

Comments
 (0)