# syntax=docker/dockerfile:1
# check=skip=InvalidDefaultArgInFrom # defaults are specified in bakefiles
# vim: ft=dockerfile

ARG BASE_UBI_IMAGE_TAG
ARG PYTHON_VERSION
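# The ARGs above (and the per-stage ARGs below) intentionally have no defaults;
# values come from the bakefiles, as noted in the check directive. Illustrative
# only -- the values here are placeholders, not pinned versions:
#   docker build --target vllm-openai \
#     --build-arg BASE_UBI_IMAGE_TAG=<ubi9-minimal tag> \
#     --build-arg PYTHON_VERSION=<python version, e.g. 3.12> \
#     --build-arg LIBSODIUM_VERSION=<libsodium release> \
#     --build-arg FLASHINFER_VERSION=<flashinfer requirement spec> \
#     .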
## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf -y update && microdnf install -y \
    python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
    && microdnf clean all

WORKDIR /workspace

ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp
RUN microdnf install -y \
    which procps findutils tar \
    && microdnf clean all

## Python Installer ############################################################
FROM base AS python-install
ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf install -y \
    python${PYTHON_VERSION}-devel && \
    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
    pip install --no-cache -U pip wheel uv && \
    microdnf clean all

## CUDA Base ###################################################################
FROM python-install AS cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="${CUDA_HOME}/bin:${PATH}"
ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/lib64/stubs/:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
# The driver stub libcuda.so is linked into /usr/lib64 so that build steps can
# link against libcuda without a GPU driver present; at run time the real
# library is typically injected by the NVIDIA container runtime.
RUN microdnf install -y \
    cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
    microdnf clean all && \
    ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/

## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# install cuda and common dependencies; requirements-common.txt is bind-mounted
# as well because requirements-cuda.txt references it
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    uv pip install \
    -r requirements-cuda.txt

#################### libsodium Build IMAGE ####################
FROM base AS libsodium-builder

RUN microdnf install -y gcc gzip \
    && microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
    && tar -xzvf libsodium*.tar.gz \
    && rm -f libsodium*.tar.gz \
    && mv libsodium*/* ./

# MAX_JOBS, like the other build args, has no default here and is expected to
# be supplied by the build invocation / bakefiles
ARG MAX_JOBS
RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection" \
    ./configure --prefix="/usr/" && make -j $MAX_JOBS && make check

## Release #####################################################################
FROM python-install AS vllm-openai
ARG PYTHON_VERSION

WORKDIR /workspace

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin:$PATH

# force using the python venv's cuda runtime libraries
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton needs a CC compiler
RUN microdnf install -y gcc && \
    microdnf clean all


# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
    make -C /usr/src/libsodium install

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

# install vllm by running the payload script and then install flashinfer
ARG FLASHINFER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=payload,target=/workspace/payload \
    ./payload/run.sh && \
    uv pip install "${FLASHINFER_VERSION}"

ENV HF_HUB_OFFLINE=1 \
    HOME=/home/vllm \
    # Allow requested max length to exceed what is extracted from the
    # config.json
    # see: https://github.com/vllm-project/vllm/pull/7080
    VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
    VLLM_USAGE_SOURCE=production-docker-image \
    VLLM_WORKER_MULTIPROC_METHOD=fork \
    VLLM_NO_USAGE_STATS=1 \
    OUTLINES_CACHE_DIR=/tmp/outlines \
    NUMBA_CACHE_DIR=/tmp/numba \
    TRITON_CACHE_DIR=/tmp/triton \
    # Setup NCCL monitoring with torch
    # For tensor-parallel workloads, this monitors for NCCL deadlocks when
    # one rank dies, and tears down the NCCL process groups so that the driver
    # can cleanly exit.
    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=15 \
    TORCH_NCCL_DUMP_ON_TIMEOUT=0

# Set up a non-root user for OpenShift: uid 2000 in the root group (gid 0),
# with a group-writable home so the image also works when OpenShift runs it
# under an arbitrarily assigned UID.
RUN umask 002 && \
    useradd --uid 2000 --gid 0 vllm && \
    mkdir -p /home/vllm && \
    chmod g+rwx /home/vllm

USER 2000
WORKDIR /home/vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
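# Illustrative usage only -- the image name, model path, and served name below
# are placeholders. With HF_HUB_OFFLINE=1 set above, model weights must already
# be available locally (or that variable overridden at run time):
#   docker run --gpus all -p 8000:8000 \
#     -v /path/to/model:/mnt/model \
#     <image> --model /mnt/model --served-model-name <name>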

## TGIS Adapter layer #####################################################################
FROM vllm-openai AS vllm-grpc-adapter

USER root

ARG VLLM_TGIS_ADAPTER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=payload,target=/workspace/payload \
    cd /workspace && \
    env HOME=/root VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
    ./payload/run.sh

ENV GRPC_PORT=8033 \
    PORT=8000 \
    # As an optimization, vLLM disables logprobs when using spec decoding by
    # default, but this would be unexpected to users of a hosted model that
    # happens to have spec decoding enabled
    # see: https://github.com/vllm-project/vllm/pull/6485
    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false

USER 2000
ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]
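# Illustrative usage only -- image and model names are placeholders. The
# adapter listens for gRPC on $GRPC_PORT and HTTP on $PORT, so a local run
# might look like:
#   docker run --gpus all -p 8033:8033 -p 8000:8000 \
#     -v /path/to/model:/mnt/model \
#     <image> --model /mnt/model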