diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml
index 66fc3951b9ce1..bf0ed0a0b9e2f 100644
--- a/.azure/gpu-tests-lite.yml
+++ b/.azure/gpu-tests-lite.yml
@@ -63,11 +63,13 @@ jobs:
 
   - bash: |
       set -e
-      TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
-      CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
+      PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
       python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION}
+
+      CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
       pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
       pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
+
       pip list
     env:
       PACKAGE_NAME: pytorch
diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index 1e589e708cb39..42c153a5b50ae 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -91,15 +91,19 @@ jobs:
       set -e
       python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
       python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)"
-      TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
-      CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
-      CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
+
+      PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
       python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
       python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION}
       python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
-      pip install "bagua-cuda$CUDA_VERSION_BAGUA"
+
+      CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
       pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
       pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
+
+      CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
+      pip install "bagua-cuda$CUDA_VERSION_BAGUA"
+
       pip list
     env:
       PACKAGE_NAME: pytorch
diff --git a/tests/tests_pytorch/profilers/test_profiler.py b/tests/tests_pytorch/profilers/test_profiler.py
index 2e3b868407d7f..1ed1212840234 100644
--- a/tests/tests_pytorch/profilers/test_profiler.py
+++ b/tests/tests_pytorch/profilers/test_profiler.py
@@ -474,10 +474,9 @@ def look_for_trace(trace_dir):
 
 
 @RunIf(min_cuda_gpus=1, standalone=True)
-def test_pytorch_profiler_nested_emit_nvtx(tmpdir):
+def test_pytorch_profiler_nested_emit_nvtx():
     """This test check emit_nvtx is correctly supported."""
     profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
-
     model = BoringModel()
     trainer = Trainer(
         fast_dev_run=True,
diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh
index 0abe25d76c638..9c9971dad01b7 100644
--- a/tests/tests_pytorch/run_standalone_tasks.sh
+++ b/tests/tests_pytorch/run_standalone_tasks.sh
@@ -15,7 +15,11 @@ set -e
 
 # THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_pytorch DIRECTORY
 
-if nvcc --version; then
+# this environment variable allows special tests to run
+export PL_RUN_STANDALONE_TESTS=1
+
+can_run_nvprof=$(python -c "import torch; print(torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8)")
+if [[ $can_run_nvprof == "True" ]]; then
   echo "Running profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx"
   nvprof --profile-from-start off -o trace_name.prof -- python -m coverage run --source pytorch_lightning --append -m pytest --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
 fi
diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh
index fa6bda6706bc8..43021ddbf7d14 100644
--- a/tests/tests_pytorch/run_standalone_tests.sh
+++ b/tests/tests_pytorch/run_standalone_tests.sh
@@ -43,8 +43,6 @@ path_suffix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https:/
 parametrizations=${parametrizations//$path_suffix/}
 parametrizations_arr=($parametrizations)
 
-# tests to skip - space separated
-blocklist='profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx utilities/test_warnings.py'
 report=''
 
 rm -f standalone_test_output.txt # in case it exists, remove it
@@ -60,7 +58,8 @@ for i in "${!parametrizations_arr[@]}"; do
   parametrization=${parametrizations_arr[$i]}
 
   # check blocklist
-  if echo $blocklist | grep -F "${parametrization}"; then
+  if [[ "${parametrization}" == *"test_pytorch_profiler_nested_emit_nvtx"* ]]; then
+    echo "Skipping $parametrization"
     report+="Skipped\t$parametrization\n"
     # do not continue the loop because we might need to wait for batched jobs
   else
diff --git a/tests/tests_pytorch/strategies/test_bagua_strategy.py b/tests/tests_pytorch/strategies/test_bagua_strategy.py
index 9c36552789615..4a9912ca00c90 100644
--- a/tests/tests_pytorch/strategies/test_bagua_strategy.py
+++ b/tests/tests_pytorch/strategies/test_bagua_strategy.py
@@ -45,6 +45,10 @@ def test_bagua_default(tmpdir):
     assert isinstance(trainer.strategy, BaguaStrategy)
 
 
+@pytest.mark.skipif(
+    torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,
+    reason="Async does not support this CUDA architecture",
+)
 @RunIf(min_cuda_gpus=2, standalone=True, bagua=True)
 def test_async_algorithm(tmpdir):
     model = BoringModel()
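Aside (illustrative, not part of the patch): both the can_run_nvprof gate added to run_standalone_tasks.sh and the skipif marker added to test_bagua_strategy.py key off the compute-capability major version reported by torch.cuda.get_device_capability(). A minimal standalone sketch of that check, with the helper name borrowed from the shell variable:

import torch


def can_run_nvprof() -> bool:
    # Same condition the patch uses in run_standalone_tasks.sh: legacy nvprof
    # profiling only works on GPUs whose compute-capability major version is
    # below 8 (pre-Ampere). The bagua async test skips on the inverse condition.
    return torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8


if __name__ == "__main__":
    # The shell script compares the printed string against "True".
    print(can_run_nvprof())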