Skip to content

Commit 561c953

Browse files
committed
[fbgemm_gpu] OSS build updates
- Upgrade gcc version to support newer libstdc++, which is required now that pytorch/pytorch#141035 has landed. - Deprecate support for CUDA 12.1 and add support for CUDA 12.6, per changes in pytorch/pytorch#138899.
1 parent e118f0c commit 561c953

19 files changed

+52
-45
lines changed

.github/scripts/fbgemm_gpu_build.bash

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,9 @@ __configure_fbgemm_gpu_build_cuda () {
192192
cuda_version_nvcc=$(conda run -n "${env_name}" nvcc --version)
193193
echo "[BUILD] Using the default architectures for CUDA $cuda_version_nvcc ..."
194194

195-
if [[ $cuda_version_nvcc == *"V12.1"* ]] || [[ $cuda_version_nvcc == *"V12.4"* ]]; then
195+
if [[ $cuda_version_nvcc == *"V12.1"* ]] ||
196+
[[ $cuda_version_nvcc == *"V12.4"* ]] ||
197+
[[ $cuda_version_nvcc == *"V12.6"* ]]; then
196198
# sm_90 and sm_90a are only available for CUDA 12.1+
197199
# NOTE: CUTLASS kernels for Hopper require sm_90a to be enabled
198200
# See:

.github/scripts/fbgemm_gpu_install.bash

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ install_fbgemm_gpu_pip () {
182182
echo " ${FUNCNAME[0]} build_env 0.8.0 cpu # Install the CPU variant, specific version from release channel"
183183
echo " ${FUNCNAME[0]} build_env release cuda/12.4.1 # Install the CUDA variant, latest version from release channel"
184184
echo " ${FUNCNAME[0]} build_env test/0.8.0 cuda/12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
185-
echo " ${FUNCNAME[0]} build_env nightly rocm/6.1 # Install the ROCM 6.1 variant, latest version from nightly channel"
185+
echo " ${FUNCNAME[0]} build_env nightly rocm/6.2 # Install the ROCM 6.2 variant, latest version from nightly channel"
186186
return 1
187187
else
188188
echo "################################################################################"

.github/scripts/fbgemm_gpu_test.bash

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ test_setup_conda_environment () {
332332
if [ "$pytorch_variant_type" == "" ]; then
333333
echo "Usage: ${FUNCNAME[0]} ENV_NAME COMPILER PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_CHANNEL[/VERSION] PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]"
334334
echo "Example(s):"
335-
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.8.0 cuda 12.1.0 # Setup environment with pytorch-test 0.8.0 for Clang + Python 3.12 + CUDA 12.1.0"
335+
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.8.0 cuda 12.4.1 # Setup environment with pytorch-test 0.8.0 for Clang + Python 3.12 + CUDA 12.4.1"
336336
return 1
337337
else
338338
echo "################################################################################"
@@ -494,8 +494,8 @@ test_fbgemm_gpu_setup_and_pip_install () {
494494
if [ "$variant_type" == "cuda" ] || [ "$variant_type" == "genai" ]; then
495495
local variant_versions=(
496496
11.8.0
497-
12.1.1
498497
12.4.1
498+
12.6.3
499499
)
500500
elif [ "$variant_type" == "rocm" ]; then
501501
local variant_versions=(

.github/scripts/test_torchrec.bash

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ usage () {
2929
echo " e.g., torchrec needs fbgemm-gpu while torchrec_nightly needs fbgemm-gpu-nightly"
3030
echo "PYTHON_VERSION : Python version (e.g., 3.10)"
3131
echo "PYTORCH_CHANNEL_NAME: PyTorch's channel name (e.g., pytorch-nightly, pytorch-test (=pre-release), pytorch (=stable release))"
32-
echo "CUDA_VERSION : PyTorch's CUDA version (e.g., 12.1)"
32+
echo "CUDA_VERSION : PyTorch's CUDA version (e.g., 12.4)"
3333
echo "FBGEMM_WHEEL_PATH : path to FBGEMM_GPU's wheel file"
3434
echo "MINICONDA_PREFIX : path to install Miniconda (default: \$HOME/miniconda)"
35-
echo "Example: Python 3.10 + PyTorch nightly (CUDA 12.1), install miniconda at \$HOME/miniconda, using dist/fbgemm_gpu_nightly.whl"
35+
echo "Example: Python 3.10 + PyTorch nightly (CUDA 12.4), install miniconda at \$HOME/miniconda, using dist/fbgemm_gpu_nightly.whl"
3636
# shellcheck disable=SC2086
3737
echo " bash $(basename ${BASH_SOURCE[0]}) -v -o torchrec_nightly -p 3.10 -P pytorch-nightly -c 11.7 -w dist/fbgemm_gpu_nightly.whl"
3838
}

.github/scripts/utils_build.bash

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -109,14 +109,18 @@ __conda_install_gcc () {
109109
# shellcheck disable=SC2155
110110
local env_prefix=$(env_name_or_prefix "${env_name}")
111111

112-
# NOTE: g++ 10.x is installed by default instead of 11.x+ becaue 11.x+ builds
113-
# binaries that reference GLIBCXX_3.4.29, which may not be available on
114-
# systems with older versions of libstdc++.so.6 such as CentOS Stream 8 and
115-
# Ubuntu 20.04. However, if libfolly is used, GLIBCXX_3.4.30+ will be
116-
# required, which will require 11.x+.
112+
# NOTE: Previously, g++ 10.x was installed by default instead of 11.x+ because
113+
# 11.x+ builds binaries that reference GLIBCXX_3.4.29, which may not be
114+
# available on systems with older versions of libstdc++.so.6 such as CentOS
115+
# Stream 8 and Ubuntu 20.04.
116+
#
117+
# However, since https://github.com/pytorch/pytorch/pull/141035 landed, g++
118+
# 11.1+ became a requirement, since std::bit_cast is only available with
119+
# libstdc++ 11.1+. See for details:
120+
# https://gcc.gnu.org/onlinedocs/libstdc++/manual/status.html#manual.intro.status.iso
117121
#
118122
# shellcheck disable=SC2155
119-
local gcc_version="${GCC_VERSION:-10.4.0}"
123+
local gcc_version="${GCC_VERSION:-11.4.0}"
120124

121125
echo "[INSTALL] Installing GCC (${gcc_version}, ${archname}) through Conda ..."
122126
# shellcheck disable=SC2086

.github/scripts/utils_cuda.bash

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ install_cuda () {
5959

6060
# Ensure that the libraries are properly installed
6161
(test_filepath "${env_name}" libcuda.so) || return 1
62-
(test_filepath "${env_name}" libnvToolsExt.so) || return 1
62+
# (test_filepath "${env_name}" libnvToolsExt.so) || return 1
6363
(test_filepath "${env_name}" libnvidia-ml.so) || return 1
6464

6565
echo "[INSTALL] Appending libcuda.so path to LD_LIBRARY_PATH ..."
@@ -163,6 +163,7 @@ install_cudnn () {
163163
["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-${PLATFORM_NAME_LC}-8.7.0.84_cuda11-archive.tar.xz"
164164
["121"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz"
165165
["124"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz"
166+
["126"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.5.1.17_cuda12-archive.tar.xz"
166167
)
167168

168169
# Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]

.github/scripts/utils_pip.bash

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ __export_package_channel_info () {
4141
__export_package_variant_info () {
4242
local package_variant_type_version="$1"
4343

44-
local FALLBACK_VERSION_CUDA="12.1.1"
44+
local FALLBACK_VERSION_CUDA="12.4.1"
4545
local FALLBACK_VERSION_ROCM="6.0.2"
4646

4747
if [ "$package_variant_type_version" == "cuda" ]; then
@@ -60,7 +60,7 @@ __export_package_variant_info () {
6060
local variant_version=""
6161

6262
else
63-
# Split along '/', e.g. cuda/12.1.0
63+
# Split along '/', e.g. cuda/12.4.0
6464
# shellcheck disable=SC2207
6565
local package_variant_type_version_arr=($(echo "${package_variant_type_version}" | tr '/' '\n'))
6666
local variant_type="${package_variant_type_version_arr[0]}"
@@ -71,7 +71,7 @@ __export_package_variant_info () {
7171
local cuda_version="${variant_version:-${FALLBACK_VERSION_CUDA}}"
7272
# shellcheck disable=SC2206
7373
local cuda_version_arr=(${cuda_version//./ })
74-
# Convert, i.e. cuda 12.1.0 => cu121
74+
# Convert, i.e. cuda 12.4.0 => cu124
7575
local variant_type="cu"
7676
local variant_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"
7777

@@ -204,7 +204,7 @@ install_from_pytorch_pip () {
204204
echo "Example(s):"
205205
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant, specific version from release channel"
206206
echo " ${FUNCNAME[0]} build_env torch release cpu # Install the CPU variant, latest version from release channel"
207-
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.8.0 cuda/12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
207+
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.8.0 cuda/12.4.0 # Install the CUDA 12.4 variant, specific version from test channel"
208208
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/6.1 # Install the ROCM 6.1 variant, latest version from nightly channel"
209209
echo " ${FUNCNAME[0]} build_env pytorch_triton 1.11.0 # Install specific version from release channel"
210210
echo " ${FUNCNAME[0]} build_env pytorch_triton release # Install latest version from release channel"
@@ -249,7 +249,7 @@ download_from_pytorch_pip () {
249249
echo "Example(s):"
250250
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Download the CPU variant, specific version from release channel"
251251
echo " ${FUNCNAME[0]} build_env torch release cpu # Download the CPU variant, latest version from release channel"
252-
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.8.0 cuda/12.1.0 # Download the CUDA 12.1 variant, specific version from test channel"
252+
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.8.0 cuda/12.4.0 # Download the CUDA 12.4 variant, specific version from test channel"
253253
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/6.1 # Download the ROCM 6.1 variant, latest version from nightly channel"
254254
return 1
255255
else

.github/scripts/utils_pytorch.bash

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ install_pytorch_pip () {
112112
echo "Example(s):"
113113
echo " ${FUNCNAME[0]} build_env test/2.1.0 cpu # Install the CPU variant for a specific version"
114114
echo " ${FUNCNAME[0]} build_env release cpu # Install the CPU variant, latest release version"
115-
echo " ${FUNCNAME[0]} build_env test cuda/12.1.0 # Install the CUDA 12.1 variant, latest test version"
115+
echo " ${FUNCNAME[0]} build_env test cuda/12.4.0 # Install the CUDA 12.4 variant, latest test version"
116116
echo " ${FUNCNAME[0]} build_env nightly rocm/6.1 # Install the ROCM 6.1 variant, latest nightly version"
117117
return 1
118118
else

.github/scripts/utils_torchrec.bash

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ install_torchrec_pip () {
2626
echo "Example(s):"
2727
echo " ${FUNCNAME[0]} build_env test/2.1.0rc0 cpu # Install the CPU variant for a specific version"
2828
echo " ${FUNCNAME[0]} build_env release cpu # Install the CPU variant, latest release version"
29-
echo " ${FUNCNAME[0]} build_env nightly/0.9.0.dev20240716 cuda/12.1.0 # Install the CUDA 12.1 variant, nightly version"
29+
echo " ${FUNCNAME[0]} build_env nightly/0.9.0.dev20240716 cuda/12.4.0 # Install the CUDA 12.4 variant, nightly version"
3030
return 1
3131
else
3232
echo "################################################################################"

.github/workflows/fbgemm_ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ jobs:
144144
run: . $PRELUDE; create_conda_environment $BUILD_ENV 3.12
145145

146146
- name: Install C/C++ Compilers
147-
run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}
147+
run: . $PRELUDE; GCC_VERSION=10.4.0 install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}
148148

149149
- name: Install Build Tools
150150
run: . $PRELUDE; install_build_tools $BUILD_ENV

.github/workflows/fbgemm_gpu_ci_cuda.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ jobs:
6767
{ arch: x86, instance: "linux.24xlarge" },
6868
]
6969
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
70-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
70+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
7171
compiler: [ "gcc", "clang" ]
7272

7373
steps:
@@ -150,7 +150,7 @@ jobs:
150150
# { arch: x86, instance: "linux.gcp.a100" },
151151
]
152152
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
153-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
153+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
154154
# Specify exactly ONE CUDA version for artifact publish
155155
cuda-version-publish: [ "12.4.1" ]
156156
compiler: [ "gcc", "clang" ]

.github/workflows/fbgemm_gpu_ci_genai.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ jobs:
6767
{ arch: x86, instance: "linux.24xlarge" },
6868
]
6969
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
70-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
70+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
7171
compiler: [ "gcc", "clang" ]
7272

7373
steps:
@@ -149,7 +149,7 @@ jobs:
149149
# { arch: x86, instance: "linux.gcp.a100" },
150150
]
151151
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
152-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
152+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
153153
# Specify exactly ONE CUDA version for artifact publish
154154
cuda-version-publish: [ "12.4.1" ]
155155
compiler: [ "gcc", "clang" ]

.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ jobs:
5454
{ arch: x86, instance: "ubuntu-latest" },
5555
]
5656
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
57-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
57+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
5858
compiler: [ "gcc", "clang" ]
5959

6060
steps:
@@ -141,7 +141,7 @@ jobs:
141141
{ arch: x86, instance: "ubuntu-latest" },
142142
]
143143
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
144-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
144+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
145145
# Specify exactly ONE CUDA version for artifact publish
146146
cuda-version-publish: [ "12.4.1" ]
147147
compiler: [ "gcc", "clang" ]

.github/workflows/fbgemm_gpu_ci_rocm.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,9 @@ jobs:
6464
host-machine: [
6565
{ arch: x86, instance: "linux.24xlarge" },
6666
]
67-
container-image: [ "ubuntu:20.04" ]
67+
container-image: [ "ubuntu:22.04" ]
6868
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
69-
rocm-version: [ "6.1" ]
69+
rocm-version: [ "6.1", "6.2" ]
7070
compiler: [ "gcc", "clang" ]
7171

7272
steps:
@@ -129,7 +129,7 @@ jobs:
129129
if: ${{ github.repository_owner == 'pytorch' }}
130130
runs-on: ${{ matrix.host-machine.instance }}
131131
container:
132-
image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete"
132+
image: "rocm/dev-ubuntu-22.04:${{ matrix.rocm-version }}-complete"
133133
options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
134134
defaults:
135135
run:
@@ -147,7 +147,7 @@ jobs:
147147
]
148148
# ROCm machines are limited, so we only test a subset of Python versions
149149
python-version: [ "3.12" ]
150-
rocm-version: [ "6.1" ]
150+
rocm-version: [ "6.2" ]
151151
compiler: [ "gcc", "clang" ]
152152
needs: build_artifact
153153

.github/workflows/fbgemm_gpu_pip.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ jobs:
121121
{ instance: "linux.g5.4xlarge.nvidia.gpu" },
122122
]
123123
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
124-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
124+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
125125

126126
steps:
127127
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
@@ -168,7 +168,7 @@ jobs:
168168
if: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'rocm')) }}
169169
runs-on: ${{ matrix.host-machine.instance }}
170170
container:
171-
image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete"
171+
image: "rocm/dev-ubuntu-22.04:${{ matrix.rocm-version }}-complete"
172172
options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
173173
defaults:
174174
run:
@@ -186,7 +186,7 @@ jobs:
186186
]
187187
# ROCm machines are limited, so we only test a subset of Python versions
188188
python-version: [ "3.11", "3.12" ]
189-
rocm-version: [ "6.1" ]
189+
rocm-version: [ "6.2" ]
190190

191191
steps:
192192
- name: Setup Build Container

.github/workflows/fbgemm_gpu_release_cuda.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ on:
3434
description: CUDA Version to Use for Building Artifact
3535
type: choice
3636
required: false
37-
options: [ "11.8.0", "12.1.1", "12.4.1" ]
37+
options: [ "11.8.0", "12.4.1", "12.6.3" ]
3838
default: "12.4.1"
3939
publish_to_pypi:
4040
description: Publish Artifact to PyPI
@@ -71,7 +71,7 @@ jobs:
7171
{ arch: x86, instance: "linux.24xlarge" },
7272
]
7373
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
74-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
74+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
7575

7676
steps:
7777
- name: Setup Build Container
@@ -144,7 +144,7 @@ jobs:
144144
{ arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
145145
]
146146
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
147-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
147+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
148148
needs: build_artifact
149149

150150
steps:

.github/workflows/fbgemm_gpu_release_genai.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ on:
3434
description: CUDA Version to Use for Building Artifact
3535
type: choice
3636
required: false
37-
options: [ "11.8.0", "12.1.1", "12.4.1" ]
37+
options: [ "11.8.0", "12.4.1", "12.6.3" ]
3838
default: "12.4.1"
3939
publish_to_pypi:
4040
description: Publish Artifact to PyPI
@@ -71,7 +71,7 @@ jobs:
7171
{ arch: x86, instance: "linux.24xlarge" },
7272
]
7373
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
74-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
74+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
7575

7676
steps:
7777
- name: Setup Build Container
@@ -144,7 +144,7 @@ jobs:
144144
{ arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
145145
]
146146
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
147-
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
147+
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
148148
needs: build_artifact
149149

150150
steps:

fbgemm_gpu/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PyTorch GPU operator libraries for training and inference. The library provides
99
efficient table batched embedding bag, data layout transformation, and
1010
quantization supports.
1111

12-
FBGEMM_GPU is currently tested with CUDA 12.1 and 11.8 in CI, and with PyTorch
12+
FBGEMM_GPU is currently tested with CUDA 12.4 and 11.8 in CI, and with PyTorch
1313
packages (2.1+) that are built against those CUDA versions.
1414

1515
See the full [Documentation](https://pytorch.org/FBGEMM) for more information

fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ Install the full CUDA package through Conda, which includes
117117
.. code:: sh
118118
119119
# See https://anaconda.org/nvidia/cuda for all available versions of CUDA
120-
cuda_version=12.1.0
120+
cuda_version=12.4.1
121121
122122
# Install the full CUDA package
123123
conda install -n ${env_name} -y cuda -c "nvidia/label/cuda-${cuda_version}"
@@ -177,10 +177,10 @@ desired ROCm version:
177177

178178
.. code:: sh
179179
180-
# Run for ROCm 6.1.2
181-
docker run -it --entrypoint "/bin/bash" rocm/rocm-terminal:6.1.2
180+
# Run for ROCm 6.2.0
181+
docker run -it --entrypoint "/bin/bash" rocm/rocm-terminal:6.2.0
182182
183-
While the `full ROCm Docker image <https://hub.docker.com/r/rocm/dev-ubuntu-20.04>`__
183+
While the `full ROCm Docker image <https://hub.docker.com/r/rocm/dev-ubuntu-22.04>`__
184184
comes with all ROCm packages pre-installed, it results in a very large Docker
185185
container, and so for this reason, the minimal image is recommended for building
186186
and running FBGEMM_GPU.

0 commit comments

Comments
 (0)