add Dockerfile to build vLLM against torch nightly #16936
@@ -0,0 +1,307 @@
# The vLLM Dockerfile is used to construct a vLLM image against torch nightly that can be used directly for testing

# for torch nightly, CUDA >= 12.6 is required;
# use 12.8 due to a FlashAttention issue with CUDA 12.6 (https://github.com/vllm-project/vllm/issues/15435#issuecomment-2775924628)
ARG CUDA_VERSION=12.8.0
#
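# A minimal, illustrative build invocation. The stage names (base, build,
# vllm-base, test) come from this file; the Dockerfile path and image tag are
# assumptions, so adjust them to wherever this file lands in the repo:
#   DOCKER_BUILDKIT=1 docker build \
#     --build-arg CUDA_VERSION=12.8.0 \
#     --target vllm-base \
#     -t vllm-nightly-torch \
#     -f Dockerfile.nightly_torch .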
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.8.0
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version \
    && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv, since the download is likely to time out otherwise
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml

# install build and runtime dependencies without a stable torch version
RUN python3 use_existing_torch.py
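# Note: use_existing_torch.py is expected to strip the torch, torchaudio, and
# torchvision pins from the requirements files so that the nightly wheels
# installed below are not overridden by a stable release (an assumption based
# on how the script is used in this file).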

# install torch nightly
ARG PINNED_TORCH_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -n "$PINNED_TORCH_VERSION" ]; then \
        pkgs="$PINNED_TORCH_VERSION"; \
    else \
        pkgs="torch torchaudio torchvision"; \
    fi && \
    uv pip install --system $pkgs --index-url https://download.pytorch.org/whl/nightly/cu128
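# Illustrative use of PINNED_TORCH_VERSION (the exact version strings below are
# hypothetical; any specifiers accepted by uv and present on the nightly index
# would work):
#   docker build \
#     --build-arg PINNED_TORCH_VERSION="torch==2.8.0.dev20250420+cu128 torchvision==0.22.0.dev20250420+cu128" \
#     ...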

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system numba==0.61.2

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/common.txt

# must be set before installing xformers, so the correct version of xformers is built.
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

# Build xformers with cuda and torch nightly,
# following the official xformers guidance: https://github.com/facebookresearch/xformers#build
# todo(elainewy): cache the xformers build result for faster builds
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    echo 'git clone xformers...' \
    && git clone https://github.com/facebookresearch/xformers.git --recursive \
    && cd xformers \
    && git checkout ${XFORMERS_COMMIT} \
    && git submodule update --init --recursive \
    && echo 'finish git clone xformers...' \
    && rm -rf build \
    && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
    && cd .. \
    && rm -rf xformers

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system xformers-dist/*.whl --verbose

# The build can take a long time, and the torch nightly version fetched from the URL can differ in the next docker stage.
# Track the nightly torch version used in the build, so that when we set up the runtime environment we can make sure the version is the same.
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
RUN cat torch_build_versions.txt
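# torch_build_versions.txt ends up with one pinned line per package, e.g.
# (illustrative versions only):
#   torch==2.8.0.dev20250420+cu128
#   torchaudio==2.6.0.dev20250420+cu128
#   torchvision==0.22.0.dev20250420+cu128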

# cuda arch list used by torch
# can be useful for `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243

# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
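# The '-real' suffix follows CMake's CUDA_ARCHITECTURES convention: '80-real'
# emits SASS for sm_80 only, with no PTX fallback, which is what keeps the
# flash-attn binary small (plain '80' would embed both SASS and PTX).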
#################### BASE BUILD IMAGE ####################

#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM

# This timeout (in seconds) is necessary when installing some dependencies via uv, since the download is likely to time out otherwise
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

COPY . .

RUN python3 use_existing_torch.py

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt

ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi

# Max jobs used by Ninja to build extensions
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=2
ENV NVCC_THREADS=$nvcc_threads

ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0

# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
        && tar -xzf sccache.tar.gz \
        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
        && sccache --show-stats; \
    fi
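# Illustrative sccache-enabled build of the wheel stage (the bucket and region
# values are the defaults declared above; callers without S3 credentials would
# also pass --build-arg SCCACHE_S3_NO_CREDENTIALS=1):
#   docker build \
#     --build-arg USE_SCCACHE=1 \
#     --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
#     --build-arg SCCACHE_REGION_NAME=us-west-2 \
#     --target build .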

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" != "1" ]; then \
        # Clean any existing CMake artifacts
        rm -rf .deps && \
        mkdir -p .deps && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi

#################### WHEEL BUILD IMAGE ####################

#################### VLLM INSTALLED IMAGE ####################
# Set up a clean environment for vLLM and its dependencies, for tests and the API server, using ubuntu22.04 with AOT flashinfer
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
# environment preparation starts here
ARG CUDA_VERSION=12.8.0
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv, since the download is likely to time out otherwise
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# get the nightly torch version used in the build, to make sure the runtime version is the same
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128

# install the vllm wheel
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system vllm-dist/*.whl --verbose

# install xformers again for the new environment
RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose

ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'

# install packages needed to build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1

# Build flashinfer from source against torch nightly (takes around 10 minutes)
# release version: v0.2.2.post1
# todo(elainewy): cache the flashinfer build result for faster builds
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    echo "git clone flashinfer..." \
    && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
    && cd flashinfer \
    && git checkout v0.2.2.post1 \
    && git submodule update --init --recursive \
    && echo "finish git clone flashinfer..." \
    && rm -rf build \
    && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \
    && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \
    && cd .. \
    && rm -rf flashinfer

# install flashinfer
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system flashinfer-dist/*.whl --verbose

# install common packages
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml

COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .

RUN python3 use_existing_torch.py
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/common.txt

#################### VLLM INSTALLED IMAGE ####################

#################### UNITTEST IMAGE #############################
FROM vllm-base AS test
COPY tests/ tests/

# install test dependencies that do not pin a stable torch version
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt

# This timeout (in seconds) is necessary when installing some dependencies via uv, since the download is likely to time out otherwise
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils

# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/nightly_torch_test.txt

#################### UNITTEST IMAGE #############################
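# Illustrative commands to build the unit-test image and run a test suite in it
# (the image tag, Dockerfile path, and test selection are assumptions):
#   docker build --target test -t vllm-nightly-test -f Dockerfile.nightly_torch .
#   docker run --gpus all --rm vllm-nightly-test python3 -m pytest tests/ -v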
requirements/nightly_torch_test.txt
@@ -0,0 +1,28 @@
# Dependencies required to run the entrypoints tests
# pytest and its extensions
pytest
pytest-asyncio
pytest-forked
pytest-mock
pytest-rerunfailures
pytest-shard
pytest-timeout

librosa  # required by audio tests in entrypoints/openai
sentence-transformers
numba == 0.61.2; python_version > '3.9'
[review] how about python_version >= "3.10"? Wondering if 3.9.1 is > 3.9 or not.
[author] this follows the other requirements; I can do this here and upgrade later. Right now this works.
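[note] Under PEP 508 environment markers, python_version carries only major.minor ("3.9" for CPython 3.9.1), and comparisons use version ordering, so python_version > '3.9' excludes every 3.9.x release and is effectively equivalent to >= '3.10' here.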

# testing utils
awscli
boto3
botocore
datasets
ray >= 2.10.0
peft
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
tensorizer>=2.9.0
lm-eval==0.4.8

buildkite-test-collector==0.1.9

lm-eval[api]==0.4.8  # required for model evaluation test