Skip to content

Commit 818fb83

Browse files
committed
Merge branch 'main' of github.com:vllm-project/vllm into sliding_window
Signed-off-by: Chen Zhang <[email protected]>
2 parents 9b966b1 + f3f8d8f commit 818fb83

File tree

141 files changed

+3229
-2322
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

141 files changed

+3229
-2322
lines changed

.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ steps:
8282
queue: cpu_queue_postmerge
8383
commands:
8484
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
85-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
85+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f Dockerfile.cpu ."
8686
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
8787
env:
8888
DOCKER_BUILDKIT: "1"

.buildkite/run-cpu-test.sh

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,19 @@ set -ex
88
CORE_RANGE=${CORE_RANGE:-48-95}
99
NUMA_NODE=${NUMA_NODE:-1}
1010

11-
# Try building the docker image
12-
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
13-
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
14-
1511
# Setup cleanup
16-
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
12+
remove_docker_container() {
13+
set -e;
14+
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
15+
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
16+
}
1717
trap remove_docker_container EXIT
1818
remove_docker_container
1919

20+
# Try building the docker image
21+
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f Dockerfile.cpu .
22+
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f Dockerfile.cpu .
23+
2024
# Run the image, setting --shm-size=4g for tensor parallel.
2125
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
2226
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@@ -36,8 +40,6 @@ function cpu_tests() {
3640
# Run basic model test
3741
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
3842
set -e
39-
pip install -r vllm/requirements/test.txt
40-
pip install -r vllm/requirements/cpu.txt
4143
pytest -v -s tests/kernels/test_cache.py -m cpu_model
4244
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
4345
pytest -v -s tests/models/decoder_only/language -m cpu_model

.buildkite/run-tpu-v1-test.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,16 @@ docker run --privileged --net host --shm-size=16G -it \
2828
&& echo TEST_3 \
2929
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
3030
&& echo TEST_4 \
31-
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
32-
&& echo TEST_5 \
3331
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
32+
&& echo TEST_5 \
33+
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
3434
&& echo TEST_6 \
35-
&& pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \
35+
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
3636

3737

3838
# TODO: This test fails because it uses RANDOM_SEED sampling
3939
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
4040

41+
# TODO: Re-enable this after fixing recompilation in quantization.
42+
# && echo TEST_4 \
43+
# && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \

.buildkite/test-pipeline.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,12 +135,14 @@ steps:
135135
- examples/offline_inference/rlhf.py
136136
- examples/offline_inference/rlhf_colocate.py
137137
- tests/examples/offline_inference/data_parallel.py
138+
- tests/v1/test_async_llm_dp.py
138139
commands:
139140
# test with tp=2 and external_dp=2
140141
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
141142
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
142143
# test with internal dp
143144
- python3 ../examples/offline_inference/data_parallel.py
145+
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
144146
- pytest -v -s distributed/test_utils.py
145147
- pytest -v -s compile/test_basic_correctness.py
146148
- pytest -v -s distributed/test_pynccl.py
@@ -514,7 +516,10 @@ steps:
514516
- vllm/worker/worker.py
515517
- vllm/worker/model_runner.py
516518
- entrypoints/llm/test_collective_rpc.py
519+
- tests/v1/test_async_llm_dp.py
520+
- vllm/v1/engine/
517521
commands:
522+
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
518523
- VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
519524
- pytest -v -s ./compile/test_basic_correctness.py
520525
- pytest -v -s ./compile/test_wrapper.py

.github/mergify.yml

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,15 +90,34 @@ pull_request_rules:
9090

9191
- name: label-tpu
9292
description: Automatically apply tpu label
93+
# Keep this list in sync with `label-tpu-remove` conditions
9394
conditions:
9495
- or:
95-
- files~=tpu
96+
- files~=tpu.py
97+
- files~=_tpu
98+
- files~=tpu_
99+
- files~=/tpu/
96100
- files~=pallas
97101
actions:
98102
label:
99103
add:
100104
- tpu
101105

106+
- name: label-tpu-remove
107+
description: Automatically remove tpu label
108+
# Keep this list in sync with `label-tpu` conditions
109+
conditions:
110+
- and:
111+
- -files~=tpu.py
112+
- -files~=_tpu
113+
- -files~=tpu_
114+
- -files~=/tpu/
115+
- -files~=pallas
116+
actions:
117+
label:
118+
remove:
119+
- tpu
120+
102121
- name: ping author on conflicts and add 'needs-rebase' label
103122
conditions:
104123
- conflict

Dockerfile.cpu

Lines changed: 107 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,69 +1,138 @@
11
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
2+
#
3+
# Build targets:
4+
# vllm-openai (default): used for serving deployment
5+
# vllm-test: used for CI tests
6+
# vllm-dev: used for development
7+
#
8+
# Build arguments:
9+
# PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9
10+
# VLLM_CPU_DISABLE_AVX512=false (default)|true
11+
#
12+
13+
######################### BASE IMAGE #########################
14+
FROM ubuntu:22.04 AS base
215

3-
FROM ubuntu:22.04 AS cpu-test-1
16+
WORKDIR /workspace/
417

5-
ENV CCACHE_DIR=/root/.cache/ccache
18+
ARG PYTHON_VERSION=3.12
19+
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
620

21+
# Install minimal dependencies and uv
22+
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
23+
--mount=type=cache,target=/var/lib/apt,sharing=locked \
24+
apt-get update -y \
25+
&& apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
26+
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \
27+
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
28+
&& curl -LsSf https://astral.sh/uv/install.sh | sh
29+
30+
ENV CCACHE_DIR=/root/.cache/ccache
731
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
832

9-
RUN --mount=type=cache,target=/var/cache/apt \
10-
apt-get update -y \
11-
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
12-
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
13-
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
33+
ENV PATH="/root/.local/bin:$PATH"
34+
ENV VIRTUAL_ENV="/opt/venv"
35+
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
36+
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
1437

15-
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
16-
# intel-openmp provides additional performance improvement vs. openmp
17-
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
18-
RUN --mount=type=cache,target=/root/.cache/pip \
19-
pip install intel-openmp==2025.0.1
38+
ENV UV_HTTP_TIMEOUT=500
2039

21-
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
40+
# Install Python dependencies
41+
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
42+
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
43+
ENV UV_INDEX_STRATEGY="unsafe-best-match"
44+
ENV UV_LINK_MODE="copy"
45+
RUN --mount=type=cache,target=/root/.cache/uv \
46+
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
47+
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
48+
uv pip install --upgrade pip && \
49+
uv pip install -r requirements/cpu.txt
2250

23-
RUN echo 'ulimit -c 0' >> ~/.bashrc
51+
RUN --mount=type=cache,target=/root/.cache/uv \
52+
uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
2453

25-
RUN pip install intel_extension_for_pytorch==2.6.0
54+
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
2655

27-
WORKDIR /workspace
56+
RUN echo 'ulimit -c 0' >> ~/.bashrc
2857

29-
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
30-
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
31-
RUN --mount=type=cache,target=/root/.cache/pip \
32-
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
33-
pip install --upgrade pip && \
34-
pip install -r requirements/build.txt
58+
######################### BUILD IMAGE #########################
59+
FROM base AS vllm-build
3560

36-
FROM cpu-test-1 AS build
61+
ARG GIT_REPO_CHECK=0
62+
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
63+
ARG VLLM_CPU_DISABLE_AVX512
64+
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
3765

3866
WORKDIR /workspace/vllm
3967

40-
RUN --mount=type=cache,target=/root/.cache/pip \
41-
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
42-
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
43-
pip install -v -r requirements/cpu.txt
68+
RUN --mount=type=cache,target=/root/.cache/uv \
69+
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
70+
uv pip install -r requirements/build.txt
4471

4572
COPY . .
46-
ARG GIT_REPO_CHECK=0
4773
RUN --mount=type=bind,source=.git,target=.git \
4874
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
4975

50-
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
51-
ARG VLLM_CPU_DISABLE_AVX512
52-
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
76+
RUN --mount=type=cache,target=/root/.cache/uv \
77+
--mount=type=cache,target=/root/.cache/ccache \
78+
--mount=type=bind,source=.git,target=.git \
79+
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
80+
81+
######################### DEV IMAGE #########################
82+
FROM vllm-build AS vllm-dev
83+
84+
WORKDIR /workspace/vllm
85+
86+
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
87+
--mount=type=cache,target=/var/lib/apt,sharing=locked \
88+
apt-get install -y --no-install-recommends vim numactl
89+
90+
# install development dependencies (for testing)
91+
RUN --mount=type=cache,target=/root/.cache/uv \
92+
uv pip install -e tests/vllm_test_utils
5393

54-
RUN --mount=type=cache,target=/root/.cache/pip \
94+
RUN --mount=type=cache,target=/root/.cache/uv \
5595
--mount=type=cache,target=/root/.cache/ccache \
5696
--mount=type=bind,source=.git,target=.git \
57-
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
58-
pip install dist/*.whl && \
59-
rm -rf dist
97+
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
98+
99+
RUN --mount=type=cache,target=/root/.cache/uv \
100+
uv pip install -r requirements/dev.txt && \
101+
pre-commit install --hook-type pre-commit --hook-type commit-msg
102+
103+
ENTRYPOINT ["bash"]
104+
105+
######################### TEST IMAGE #########################
106+
FROM base AS vllm-test
60107

61108
WORKDIR /workspace/
62109

63-
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
110+
RUN --mount=type=cache,target=/root/.cache/uv \
111+
--mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
112+
uv pip install -r requirements/test.txt
113+
114+
RUN --mount=type=cache,target=/root/.cache/uv \
115+
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
116+
uv pip install dist/*.whl
117+
118+
ADD ./tests/ ./tests/
119+
ADD ./examples/ ./examples/
120+
ADD ./benchmarks/ ./benchmarks/
64121

65122
# install development dependencies (for testing)
66-
RUN --mount=type=cache,target=/root/.cache/pip \
67-
pip install -e tests/vllm_test_utils
123+
RUN --mount=type=cache,target=/root/.cache/uv \
124+
uv pip install -e tests/vllm_test_utils
125+
126+
ENTRYPOINT ["bash"]
127+
128+
######################### RELEASE IMAGE #########################
129+
FROM base AS vllm-openai
130+
131+
WORKDIR /workspace/
132+
133+
RUN --mount=type=cache,target=/root/.cache/uv \
134+
--mount=type=cache,target=/root/.cache/ccache \
135+
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
136+
uv pip install dist/*.whl
68137

69138
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

benchmarks/benchmark_dataset.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -715,3 +715,66 @@ def sample(
715715
))
716716
self.maybe_oversample_requests(sampled_requests, num_requests)
717717
return sampled_requests
718+
719+
720+
# -----------------------------------------------------------------------------
721+
# Instruct Coder Dataset Implementation
722+
# -----------------------------------------------------------------------------
723+
724+
725+
class InstructCoderDataset(HuggingFaceDataset):
    """
    InstructCoder Dataset.
    https://huggingface.co/datasets/likaixin/InstructCoder

    InstructCoder is a dataset designed for general code editing.
    It consists of 114,239 instruction-input-output triplets
    and covers multiple distinct code editing scenarios.
    """

    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
    DEFAULT_NUM_REQUESTS = 1000
    # The only dataset path this class accepts (checked in __init__).
    INSTRUCT_CODER_DATASET_PATH = "likaixin/InstructCoder"

    def __init__(
        self,
        **kwargs,
    ) -> None:
        """Forward ``kwargs`` to the parent class, then validate the config.

        Raises:
            ValueError: if ``dataset_path`` is not
                ``INSTRUCT_CODER_DATASET_PATH``, or if no subset is set and
                the split is anything other than ``"train"``.
        """
        super().__init__(**kwargs)
        if self.dataset_path != self.INSTRUCT_CODER_DATASET_PATH:
            # Adjacent literals instead of a backslash continuation inside
            # the string: the continuation embedded the next line's
            # indentation in the message shown to the user.
            raise ValueError(
                "Only support likaixin/InstructCoder dataset. "
                f"This data path {self.dataset_path} is not valid.")
        if self.dataset_subset is None and self.dataset_split != "train":
            raise ValueError("Dataset split must be 'train'.")

    def load_data(self) -> None:
        """Open the dataset in streaming mode and store it shuffled.

        The shuffled dataset is kept on ``self.data``; the shuffle is seeded
        with ``self.random_seed``.
        """
        dataset = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=True,
        )
        self.data = dataset.shuffle(seed=self.random_seed)

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        """Build up to ``num_requests`` requests from ``self.data``.

        Args:
            tokenizer: Used to tokenize each prompt so its length in tokens
                can be recorded.
            num_requests: Number of requests to collect before stopping.
            output_len: Expected output length recorded on every request;
                ``DEFAULT_OUTPUT_LEN`` is used when this is ``None``.
            enable_multimodal_chat: Accepted but not used by this method.
            **kwargs: Accepted but not used by this method.

        Returns:
            The list of ``SampleRequest`` objects that were collected.
        """
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            # Prompt layout: the instruction, a colon, then the code to edit.
            prompt = f"{item['instruction']}:\n{item['input']}"
            prompt_len = len(tokenizer(prompt).input_ids)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                ))
        # NOTE(review): presumably tops the list up to num_requests when the
        # dataset yields fewer items; the helper is defined outside this view.
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests

0 commit comments

Comments
 (0)