vllm-project
diff --git a/‎.buildkite/release-pipeline.yaml
Lines changed: 1 addition & 1 deletion b/‎.buildkite/release-pipeline.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎.buildkite/run-cpu-test.sh
Lines changed: 9 additions & 7 deletions b/‎.buildkite/run-cpu-test.sh
Lines changed: 9 additions & 7 deletions
diff --git a/‎.buildkite/run-tpu-v1-test.sh
Lines changed: 6 additions & 3 deletions b/‎.buildkite/run-tpu-v1-test.sh
Lines changed: 6 additions & 3 deletions
diff --git a/‎.buildkite/test-pipeline.yaml
Lines changed: 5 additions & 0 deletions b/‎.buildkite/test-pipeline.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/‎.github/mergify.yml
Lines changed: 20 additions & 1 deletion b/‎.github/mergify.yml
Lines changed: 20 additions & 1 deletion
diff --git a/‎Dockerfile.cpu
Lines changed: 107 additions & 38 deletions b/‎Dockerfile.cpu
Lines changed: 107 additions & 38 deletions
diff --git a/‎benchmarks/benchmark_dataset.py
Lines changed: 63 additions & 0 deletions b/‎benchmarks/benchmark_dataset.py
Lines changed: 63 additions & 0 deletions
@@ -82,7 +82,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
@@ -8,15 +8,19 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
-
 # Setup cleanup
-remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { 
+    set -e; 
+    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
+    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
+}
 trap remove_docker_container EXIT
 remove_docker_container
 
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f Dockerfile.cpu .
+
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@@ -36,8 +40,6 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pip install -r vllm/requirements/test.txt
-    pip install -r vllm/requirements/cpu.txt
     pytest -v -s tests/kernels/test_cache.py -m cpu_model
     pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/decoder_only/language -m cpu_model
 
@@ -28,13 +28,16 @@ docker run --privileged --net host --shm-size=16G -it \
     && echo TEST_3 \
     && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
     && echo TEST_4 \
-    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && echo TEST_5 \
     && python3 /workspace/vllm/examples/offline_inference/tpu.py \
+    && echo TEST_5 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
     && echo TEST_6 \
-    && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
 
 
 # TODO: This test fails because it uses RANDOM_SEED sampling
 # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 
+# TODO: Re-enable this after fixing recompilation in quantization.
+# && echo TEST_4 \
+# && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
@@ -135,12 +135,14 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
+  - tests/v1/test_async_llm_dp.py
   commands:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
@@ -514,7 +516,10 @@ steps:
   - vllm/worker/worker.py
   - vllm/worker/model_runner.py
   - entrypoints/llm/test_collective_rpc.py
+  - tests/v1/test_async_llm_dp.py
+  - vllm/v1/engine/
   commands:
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
   - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
 
@@ -90,15 +90,34 @@ pull_request_rules:
 
 - name: label-tpu
   description: Automatically apply tpu label
+  # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
     - or:
-      - files~=tpu
+      - files~=tpu.py
+      - files~=_tpu
+      - files~=tpu_
+      - files~=/tpu/
       - files~=pallas
   actions:
     label:
       add:
         - tpu
 
+- name: label-tpu-remove
+  description: Automatically remove tpu label
+  # Keep this list in sync with `label-tpu` conditions
+  conditions:
+    - and:
+      - -files~=tpu.py
+      - -files~=_tpu
+      - -files~=tpu_
+      - -files~=/tpu/
+      - -files~=pallas
+  actions:
+    label:
+      remove:
+        - tpu
+
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
       - conflict
 
@@ -1,69 +1,138 @@
 # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+#
+# Build targets:
+#   vllm-openai (default): used for serving deployment
+#   vllm-test: used for CI tests
+#   vllm-dev: used for development
+#
+# Build arguments:
+#   PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9
+#   VLLM_CPU_DISABLE_AVX512=false (default)|true
+#
+
+######################### BASE IMAGE #########################
+FROM ubuntu:22.04 AS base
 
-FROM ubuntu:22.04 AS cpu-test-1
+WORKDIR /workspace/
 
-ENV CCACHE_DIR=/root/.cache/ccache
+ARG PYTHON_VERSION=3.12
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 
+# Install minimal dependencies and uv
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
+        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
+    && curl -LsSf https://astral.sh/uv/install.sh | sh
+
+ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 
-RUN --mount=type=cache,target=/var/cache/apt \
-    apt-get update -y \
-    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV PATH="/root/.local/bin:$PATH"
+ENV VIRTUAL_ENV="/opt/venv"
+RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
-# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
-# intel-openmp provides additional performance improvement vs. openmp
-# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install intel-openmp==2025.0.1
+ENV UV_HTTP_TIMEOUT=500
 
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
+# Install Python dependencies 
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_LINK_MODE="copy"
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
+    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
+    uv pip install --upgrade pip && \
+    uv pip install -r requirements/cpu.txt
 
-RUN echo 'ulimit -c 0' >> ~/.bashrc
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
 
-RUN pip install intel_extension_for_pytorch==2.6.0
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
 
-WORKDIR /workspace
+RUN echo 'ulimit -c 0' >> ~/.bashrc
 
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
-ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
-    pip install --upgrade pip && \
-    pip install -r requirements/build.txt
+######################### BUILD IMAGE #########################
+FROM base AS vllm-build
 
-FROM cpu-test-1 AS build
+ARG GIT_REPO_CHECK=0
+# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ARG VLLM_CPU_DISABLE_AVX512
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 
 WORKDIR /workspace/vllm
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
-    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
-    pip install -v -r requirements/cpu.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    uv pip install -r requirements/build.txt
 
 COPY . .
-ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
-# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
-ARG VLLM_CPU_DISABLE_AVX512
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel 
+
+######################### DEV IMAGE #########################
+FROM vllm-build AS vllm-dev
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    apt-get install -y --no-install-recommends vim numactl
+
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -e tests/vllm_test_utils 
 
-RUN --mount=type=cache,target=/root/.cache/pip \
+RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-    pip install dist/*.whl && \
-    rm -rf dist
+    VLLM_TARGET_DEVICE=cpu python3 setup.py develop 
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -r requirements/dev.txt && \
+    pre-commit install --hook-type pre-commit --hook-type commit-msg
+
+ENTRYPOINT ["bash"]
+
+######################### TEST IMAGE #########################
+FROM base AS vllm-test
 
 WORKDIR /workspace/
 
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
+    uv pip install -r requirements/test.txt
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
+    uv pip install dist/*.whl
+
+ADD ./tests/ ./tests/
+ADD ./examples/ ./examples/
+ADD ./benchmarks/ ./benchmarks/
 
 # install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -e tests/vllm_test_utils
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -e tests/vllm_test_utils 
+
+ENTRYPOINT ["bash"]
+
+######################### RELEASE IMAGE #########################
+FROM base AS vllm-openai
+
+WORKDIR /workspace/
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
+    uv pip install dist/*.whl
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -715,3 +715,66 @@ def sample(
                 ))
         self.maybe_oversample_requests(sampled_requests, num_requests)
         return sampled_requests
+
+
+# -----------------------------------------------------------------------------
+# Instruct Coder Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class InstructCoderDataset(HuggingFaceDataset):
+    """
+    InstructCoder Dataset.
+    https://huggingface.co/datasets/likaixin/InstructCoder
+
+    InstructCoder is the dataset designed for general code editing.
+    It consists of 114,239 instruction-input-output triplets,
+    and covers multiple distinct code editing scenario.
+    """
+
+    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
+    DEFAULT_NUM_REQUESTS = 1000
+    INSTRUCT_CODER_DATASET_PATH = "likaixin/InstructCoder"
+
+    def __init__(
+        self,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        if self.dataset_path != self.INSTRUCT_CODER_DATASET_PATH:
+            raise ValueError(f"Only support likaixin/InstructCoder dataset.\
+                    This data path {self.dataset_path} is not valid.")
+        if self.dataset_subset is None and self.dataset_split != "train":
+            raise ValueError("Dataset split must be 'train'.")
+
+    def load_data(self) -> None:
+        dataset = load_dataset(
+            self.dataset_path,
+            name=self.dataset_subset,
+            split=self.dataset_split,
+            streaming=True,
+        )
+        self.data = dataset.shuffle(seed=self.random_seed)
+
+    def sample(self,
+               tokenizer: PreTrainedTokenizerBase,
+               num_requests: int,
+               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
+               **kwargs) -> list:
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        sampled_requests = []
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = f"{item['instruction']}:\n{item['input']}"
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests