fix: Improve Docker build robustness, add validation (#1873)

gs-olive · web-flow · commit 25db25768c31 · 2023-05-03T17:36:40.000-07:00
diff --git a/README.md b/README.md
@@ -31,12 +31,7 @@ In the case of building on top of a custom base container, you first must determ
 version of the PyTorch C++ ABI. If your source of PyTorch is pytorch.org, likely this is the pre-cxx11-abi in which case you must modify `//docker/dist-build.sh` to not build the
 C++11 ABI version of Torch-TensorRT.
 
-You can then build the container using:
-
-
-```bash
-docker build --build-arg BASE_IMG=<IMAGE> -f docker/Dockerfile -t torch_tensorrt:latest .
-```
+You can then build the container using the build command in the [docker README](docker/README.md#instructions).
 
 If you would like to build outside a docker container, please follow the section [Compiling Torch-TensorRT](#compiling-torch-tensorrt)
 
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,10 +1,16 @@
 # Base image starts with CUDA
 ARG BASE_IMG=nvidia/cuda:11.7.1-devel-ubuntu22.04
 FROM ${BASE_IMG} as base
+# An ARG declared before FROM is only visible to FROM, so re-declare it inside the stage
+# and persist the value actually used; later steps then see any --build-arg BASE_IMG override.
+ARG BASE_IMG
+ENV BASE_IMG=${BASE_IMG}
 
 ARG TENSORRT_VERSION
+ENV TENSORRT_VERSION=${TENSORRT_VERSION}
 RUN test -n "$TENSORRT_VERSION" || (echo "No tensorrt version specified, please use --build-arg TENSORRT_VERSION=x.y.z to specify a version." && exit 1)
 ARG CUDNN_VERSION
+ENV CUDNN_VERSION=${CUDNN_VERSION}
 RUN test -n "$CUDNN_VERSION" || (echo "No cudnn version specified, please use --build-arg CUDNN_VERSION=x.y.z to specify a version." && exit 1)
 
 ARG PYTHON_VERSION=3.10
@@ -44,7 +47,16 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
 RUN add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /"
 RUN apt-get update
 
-RUN apt-get install -y libnvinfer8=${TENSORRT_VERSION}* libnvinfer-plugin8=${TENSORRT_VERSION}* libnvinfer-dev=${TENSORRT_VERSION}* libnvinfer-plugin-dev=${TENSORRT_VERSION}* libnvonnxparsers8=${TENSORRT_VERSION}-1* libnvonnxparsers-dev=${TENSORRT_VERSION}-1* libnvparsers8=${TENSORRT_VERSION}-1*  libnvparsers-dev=${TENSORRT_VERSION}-1*
+# Install the libnvinfer / libnvonnxparsers / libnvparsers packages whose version matches TENSORRT_VERSION
+RUN apt-get install -y \
+    libnvinfer8=${TENSORRT_VERSION}.* \
+    libnvinfer-plugin8=${TENSORRT_VERSION}.* \
+    libnvinfer-dev=${TENSORRT_VERSION}.* \
+    libnvinfer-plugin-dev=${TENSORRT_VERSION}.* \
+    libnvonnxparsers8=${TENSORRT_VERSION}.* \
+    libnvonnxparsers-dev=${TENSORRT_VERSION}.* \
+    libnvparsers8=${TENSORRT_VERSION}.* \
+    libnvparsers-dev=${TENSORRT_VERSION}.*
 
 # Setup Bazel via Bazelisk
 RUN wget -q https://github.com/bazelbuild/bazelisk/releases/download/v1.16.0/bazelisk-linux-amd64 -O /usr/bin/bazel &&\
@@ -71,7 +74,23 @@ WORKDIR /workspace/torch_tensorrt/src
 RUN cp ./docker/WORKSPACE.docker WORKSPACE
 
 # Symlink the path pyenv is using for python with the /opt directory for package sourcing
-RUN ln -s "`pyenv which python | xargs dirname | xargs dirname`/lib/python$PYTHON_VERSION/site-packages" "/opt/python3"
+RUN mkdir -p "/opt/python3/" &&\
+    ln -s "$(pyenv which python | xargs dirname | xargs dirname)/lib/python$PYTHON_VERSION/site-packages" "/opt/python3/"
+
+# Extract the CUDA major.minor version from the BASE_IMG tag, e.g.
+# nvidia/cuda:11.7.1-devel-ubuntu22.04 -> 11.7.1-devel-ubuntu22.04 -> 11.7.1 -> 11.7
+# (everything after the first ":", before the first "-", with the final ".component" dropped).
+# Point the default cuda folder at that version, but validate the versioned folder first so a
+# non-matching base image tag fails here instead of leaving a dangling /usr/local/cuda symlink.
+RUN CUDA_BASE_IMG_TAG="${BASE_IMG#*:}" &&\
+    CUDA_BASE_IMG_VERSION="${CUDA_BASE_IMG_TAG%%-*}" &&\
+    CUDA_MAJOR_MINOR_VERSION="${CUDA_BASE_IMG_VERSION%.*}" &&\
+    CUDA_VERSIONED_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR_VERSION}" &&\
+    (test -d "${CUDA_VERSIONED_DIR}" || (echo "Expected CUDA toolkit directory ${CUDA_VERSIONED_DIR} (derived from BASE_IMG=${BASE_IMG}) does not exist." && exit 1)) &&\
+    rm -fr /usr/local/cuda &&\
+    ln -s "${CUDA_VERSIONED_DIR}" /usr/local/cuda
+
+ENV CUDA_HOME=/usr/local/cuda
 
 # This script builds both libtorchtrt bin/lib/include tarball and the Python wheel, in dist/
 RUN bash ./docker/dist-build.sh
diff --git a/docker/README.md b/docker/README.md
@@ -3,7 +3,7 @@
 * Use `Dockerfile` to build a container which provides the exact development environment that our master branch is usually tested against.
 
 * The `Dockerfile` currently uses <a href="https://github.com/bazelbuild/bazelisk">Bazelisk</a> to select the Bazel version, and uses the exact library versions of Torch and CUDA listed in <a href="https://github.com/pytorch/TensorRT#dependencies">dependencies</a>.
-  * The desired versions of CUDNN and TensorRT must be specified as build-args, with major, minor, and patch versions as in: `--build-arg TENSORRT_VERSION=a.b.c --build-arg CUDNN_VERSION=x.y.z`
+  * The desired versions of CUDNN and TensorRT must be specified as build-args, with major and minor versions as in: `--build-arg TENSORRT_VERSION=a.b --build-arg CUDNN_VERSION=x.y`
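-  * [**Optional**] The desired base image be changed by explicitly setting a base image, as in `--build-arg BASE_IMG=nvidia/cuda:11.7.1-devel-ubuntu22.04`, though this is optional
+  * [**Optional**] The desired base image can be changed by explicitly setting a base image, as in `--build-arg BASE_IMG=nvidia/cuda:11.7.1-devel-ubuntu22.04`, though this is optional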
   * [**Optional**] Additionally, the desired Python version can be changed by explicitly setting a version, as in `--build-arg PYTHON_VERSION=3.10`, though this is optional as well.
 
@@ -17,14 +17,14 @@ Note: By default the container uses the `pre-cxx11-abi` version of Torch + Torch
 
 ### Instructions
 
-- The example below uses CUDNN 8.5.0 and TensorRT 8.5.1
+- The example below uses CUDNN 8.5 and TensorRT 8.5
 - See <a href="https://github.com/pytorch/TensorRT#dependencies">dependencies</a> for a list of current default dependencies.
 
 > From root of Torch-TensorRT repo
 
 Build:
 ```
-DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=8.5.1 --build-arg CUDNN_VERSION=8.5.0 -f docker/Dockerfile -t torch_tensorrt:latest .
+DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=8.5 --build-arg CUDNN_VERSION=8.5 -f docker/Dockerfile -t torch_tensorrt:latest .
 ```
 
 Run:
diff --git a/docker/dist-build.sh b/docker/dist-build.sh
@@ -3,17 +3,18 @@
 TOP_DIR=$(cd $(dirname $0); pwd)/..
 
 if [[ -z "${USE_CXX11}" ]]; then
-    BUILD_CMD="python3 setup.py bdist_wheel"
+    BUILD_CMD="python setup.py bdist_wheel"
 else
-    BUILD_CMD="python3 setup.py bdist_wheel  --use-cxx11-abi"
+    BUILD_CMD="python setup.py bdist_wheel  --use-cxx11-abi"
 fi
 
+# Enter py/ and install the Python build dependencies (plus wheel, used by bdist_wheel);
+# abort on any failure instead of building in the wrong directory or without dependencies.
 cd ${TOP_DIR} \
     && mkdir -p dist && cd py \
-    && pip install -r requirements.txt
-
-# Symlink the path pyenv is using for python with the /opt directory for package sourcing
-ln -s "`pyenv which python | xargs dirname | xargs dirname`/lib/python$PYTHON_VERSION/site-packages" "/opt/python3"
+    && pip install -r requirements.txt \
+    && pip install wheel \
+    || exit 1
 
 # Build Torch-TRT
 MAX_JOBS=1 LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8 ${BUILD_CMD} $* || exit 1