Skip to content

Commit 2602d9d

Browse files
authored
add UBI-based CUDA dockerfile/bake definition (vllm-project#198)
- add CUDA UBI Dockerfile
- add docker-bake definition

This Dockerfile expects a `payload` directory to be present in the repo's root; it should contain a `run.sh` script with all the instructions needed to install vllm.
1 parent cea4c48 commit 2602d9d

File tree

2 files changed

+240
-0
lines changed

2 files changed

+240
-0
lines changed

Dockerfile.ubi

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
# syntax=docker/dockerfile:1
# check=skip=InvalidDefaultArgInFrom # defaults are specified in bakefiles
# vim: ft=dockerfile

ARG BASE_UBI_IMAGE_TAG
ARG PYTHON_VERSION

## Base Layer ##################################################################
# Minimal UBI9 base with Python's pip/wheel and a few debug utilities.
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
# ARG declared before FROM is not visible inside the stage; redeclare it here.
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
# NOTE(review): `microdnf -y update` upgrades all base packages; consider
# bumping BASE_UBI_IMAGE_TAG instead for reproducible builds.
RUN microdnf -y update && microdnf install -y \
        python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
    && microdnf clean all

WORKDIR /workspace

ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp
RUN microdnf install -y \
        which procps findutils tar \
    && microdnf clean all
## Python Installer ############################################################
# Creates the /opt/vllm virtualenv used by all downstream stages and installs
# the uv package installer into it.
FROM base AS python-install
ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
# Put the venv first on PATH so `pip`/`python` resolve inside it.
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf install -y \
        python${PYTHON_VERSION}-devel && \
    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
    pip install --no-cache -U pip wheel uv && \
    microdnf clean all
## CUDA Base ###################################################################
# Adds the NVIDIA CUDA 12.4 toolchain (nvcc, nvtx, devel libraries) on top of
# the Python venv stage; used only for building, not shipped in the release.
FROM python-install AS cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
        https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="${CUDA_HOME}/bin:${PATH}"
ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/lib64/stubs/:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
# The libcuda.so stub lets builds link against the driver API without a
# real GPU driver being present at image-build time.
RUN microdnf install -y \
        cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
    microdnf clean all && \
    ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/
## Python cuda base #################################################################
# Pre-installs vLLM's CUDA Python requirements into the venv so later stages
# get warm uv caches.
FROM cuda-base AS python-cuda-base

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# install cuda and common dependencies
# NOTE(review): requirements-common.txt is bind-mounted but not passed to uv;
# presumably requirements-cuda.txt references it via `-r` — confirm.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    uv pip install \
        -r requirements-cuda.txt
#################### libsodium Build IMAGE ####################
# Builds libsodium from a release tarball; the release stage bind-mounts this
# build tree and runs `make install` from it.
FROM base AS libsodium-builder

# gcc/make to compile, gzip so tar can unpack the .tar.gz release archive.
# (make was missing from the original list but is required by the build below.)
RUN microdnf install -y gcc make gzip \
    && microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
    && tar -xzvf libsodium*.tar.gz \
    && rm -f libsodium*.tar.gz \
    && mv libsodium*/* ./

# MAX_JOBS caps make parallelism; when unset, `make -j` runs unbounded.
# (It was referenced but never declared as a build arg.)
ARG MAX_JOBS
# Hardened CFLAGS matching the distro defaults; `make check` runs the
# libsodium test suite before the artifacts are consumed downstream.
RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection" \
    ./configure --prefix="/usr/" && make -j $MAX_JOBS && make check
## Release #####################################################################
# Final OpenAI-compatible API server image: installs vllm via the payload
# script, flashinfer from a wheel URL, and runs as a non-root user.
FROM python-install AS vllm-openai
ARG PYTHON_VERSION

WORKDIR /workspace

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin:$PATH

# force using the python venv's cuda runtime libraries
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton needs a CC compiler; make is required by the libsodium install below
# (it was missing from the original package list on ubi-minimal).
RUN microdnf install -y gcc make && \
    microdnf clean all

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
    make -C /usr/src/libsodium install

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

# install vllm by running the payload script and then install flashinfer
ARG FLASHINFER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=payload,target=/workspace/payload \
    ./payload/run.sh && \
    uv pip install "${FLASHINFER_VERSION}"

ENV HF_HUB_OFFLINE=1 \
    HOME=/home/vllm \
    # Allow requested max length to exceed what is extracted from the
    # config.json
    # see: https://github.com/vllm-project/vllm/pull/7080
    VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
    VLLM_USAGE_SOURCE=production-docker-image \
    VLLM_WORKER_MULTIPROC_METHOD=fork \
    VLLM_NO_USAGE_STATS=1 \
    # Cache dirs under /tmp so the non-root user can always write them.
    OUTLINES_CACHE_DIR=/tmp/outlines \
    NUMBA_CACHE_DIR=/tmp/numba \
    TRITON_CACHE_DIR=/tmp/triton \
    # Setup NCCL monitoring with torch
    # For tensor-parallel workloads, this monitors for NCCL deadlocks when
    # one rank dies, and tears down the NCCL process groups so that the driver
    # can cleanly exit.
    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=15 \
    TORCH_NCCL_DUMP_ON_TIMEOUT=0

# setup non-root user for OpenShift
# gid 0 + group-writable home follows OpenShift arbitrary-UID conventions.
RUN umask 002 && \
    useradd --uid 2000 --gid 0 vllm && \
    mkdir -p /home/vllm && \
    chmod g+rwx /home/vllm

USER 2000
WORKDIR /home/vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
## TGIS Adapter layer #####################################################################
# Variant of the release image that additionally installs vllm-tgis-adapter
# (re-runs the payload script with VLLM_TGIS_ADAPTER_VERSION set) and serves
# a gRPC endpoint alongside HTTP.
FROM vllm-openai AS vllm-grpc-adapter

# Temporarily back to root: the payload script installs into the system venv.
USER root

ARG VLLM_TGIS_ADAPTER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=payload,target=/workspace/payload \
    cd /workspace && \
    env HOME=/root VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
        ./payload/run.sh

ENV GRPC_PORT=8033 \
    PORT=8000 \
    # As an optimization, vLLM disables logprobs when using spec decoding by
    # default, but this would be unexpected to users of a hosted model that
    # happens to have spec decoding
    # see: https://github.com/vllm-project/vllm/pull/6485
    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false

# Drop back to the non-root user created in the vllm-openai stage.
USER 2000
ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]

docker-bake.hcl

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
variable "REPOSITORY" {
  default = "quay.io/vllm/vllm"
}

# GITHUB_* variables are only available in github actions
variable "GITHUB_SHA" {
  default = "$GITHUB_SHA"
}
variable "GITHUB_REPO" {
  default = "$GITHUB_REPO"
}
variable "GITHUB_RUN_ID" {
  default = "$GITHUB_RUN_ID"
}

variable "VLLM_VERSION" { # set by github actions or manually?
  default = "$VLLM_VERSION"
}
target "docker-metadata-action" {} // populated by gha docker/metadata-action

// Shared settings inherited by every concrete build target.
target "_common" {
  context = "."

  args = {
    BASE_UBI_IMAGE_TAG = "9.5-1736404155"
    PYTHON_VERSION     = "3.12"
  }

  inherits = ["docker-metadata-action"]

  platforms = [
    "linux/amd64",
  ]
  labels = {
    "org.opencontainers.image.source" = "https://github.com/${GITHUB_REPO}"
    "vcs-ref"                         = "${GITHUB_SHA}"
    "vcs-type"                        = "git"
  }
}
// `docker buildx bake` with no target builds the CUDA image.
group "default" {
  targets = [
    "cuda",
  ]
}
target "cuda" {
  inherits   = ["_common"]
  dockerfile = "Dockerfile.ubi"

  args = {
    // NOTE(review): overrides the BASE_UBI_IMAGE_TAG default in _common —
    // confirm the divergence is intentional.
    BASE_UBI_IMAGE_TAG = "9.5-1739420147"
    PYTHON_VERSION     = "3.12"
    # CUDA_VERSION = "12.4" # TODO: the dockerfile cannot consume the cuda version
    LIBSODIUM_VERSION         = "1.0.20"
    VLLM_TGIS_ADAPTER_VERSION = "0.6.3"

    FLASHINFER_VERSION = "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl"
  }

  tags = [
    "${REPOSITORY}:${replace(VLLM_VERSION, "+", "_")}", # vllm_version might contain local version specifiers (+) which are not valid tags
    "${REPOSITORY}:${GITHUB_SHA}",
    "${REPOSITORY}:${formatdate("YYYY-MM-DD-hh-mm", timestamp())}"
  ]
}

0 commit comments

Comments
 (0)