Commit 4a7f8d6

Merge pull request vllm-project#174 from ROCm/upstream_merge_24_9_9
Sanity check done: Server mode; BS1 perf; Llama405b FP8
2 parents b3fc9f4 + dc1d65a commit 4a7f8d6

81 files changed (+1695, -926 lines)


.buildkite/run-cpu-test-ppc64le.sh

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.ppc64le .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image, setting --shm-size=4g for tensor parallel.
+#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test
+
+# Run basic model test
+docker exec cpu-test bash -c "
+  pip install pytest matplotlib einops transformers_stream_generator
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+
+# online inference
+docker exec cpu-test bash -c "
+  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+  python3 benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --dataset-name random \
+    --model facebook/opt-125m \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer facebook/opt-125m"

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -194,7 +194,6 @@ set(VLLM_EXT_SRC
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
-  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
   "csrc/quantization/fp8/common.cu"

Dockerfile

Lines changed: 6 additions & 4 deletions
@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 ENV DEBIAN_FRONTEND=noninteractive
 
 # Install Python and other dependencies
@@ -37,7 +37,6 @@ WORKDIR /workspace
 
 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-cuda.txt
@@ -66,7 +65,6 @@ COPY setup.py setup.py
 COPY cmake cmake
 COPY CMakeLists.txt CMakeLists.txt
 COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm vllm
@@ -135,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # image with vLLM installed
 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -181,6 +179,10 @@ FROM vllm-base AS test
 ADD . /vllm-workspace/
 
 # install development dependencies (for testing)
+# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
+# This installation must complete before the test dependencies are collected and installed.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install "setuptools>=74.1.1"
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-dev.txt
 
Dockerfile.ppc64le

Lines changed: 11 additions & 5 deletions
@@ -2,21 +2,27 @@ FROM mambaorg/micromamba
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
 USER root
 
-RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
+
+RUN apt-get update -y && apt-get install -y git wget vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential
 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
 
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
 
 # These packages will be in rocketce eventually
-RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN pip install -v cmake torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
 
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
-WORKDIR /vllm-workspace
-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+
MANIFEST.in

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 include LICENSE
-include requirements-adag.txt
 include requirements-common.txt
 include requirements-cuda.txt
 include requirements-rocm.txt

benchmarks/backend_request_func.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@ class RequestFuncInput:
     model: str
     best_of: int = 1
     use_beam_search: bool = False
+    logprobs: Optional[int] = None
 
 
 @dataclass
@@ -236,6 +237,7 @@ async def async_request_openai_completions(
             "temperature": 0.0,
             "best_of": request_func_input.best_of,
             "max_tokens": request_func_input.output_len,
+            "logprobs": request_func_input.logprobs,
             "stream": True,
         }
         headers = {

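As a quick orientation for readers skimming the diff, the following is a minimal, hypothetical sketch (not the benchmark's actual helper) of where the new optional logprobs value ends up. The payload keys follow the OpenAI-compatible completions request built in async_request_openai_completions; the helper name and example values are illustrative only.

    # Hypothetical helper, for illustration only.
    from typing import Optional


    def build_completions_payload(model: str,
                                  prompt: str,
                                  output_len: int,
                                  best_of: int = 1,
                                  logprobs: Optional[int] = None) -> dict:
        # logprobs stays None when --logprobs is not passed on the CLI,
        # matching the new RequestFuncInput default above.
        return {
            "model": model,
            "prompt": prompt,
            "temperature": 0.0,
            "best_of": best_of,
            "max_tokens": output_len,
            "logprobs": logprobs,
            "stream": True,
        }


    print(build_completions_payload("facebook/opt-125m", "Hello", 16, logprobs=5))
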
benchmarks/benchmark_serving.py

Lines changed: 39 additions & 4 deletions
@@ -195,8 +195,16 @@ def sample_sonnet_requests(
 
 
 def sample_random_requests(
-    input_len: int, output_len: int, num_prompts: int, range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+    prefix_len: int,
+    input_len: int,
+    output_len: int,
+    num_prompts: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+    prefix_token_ids = np.random.randint(0,
+                                         tokenizer.vocab_size,
+                                         size=prefix_len).tolist()
 
     input_lens = np.random.randint(
         int(input_len * range_ratio),
@@ -211,10 +219,12 @@ def sample_random_requests(
     offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
     input_requests = []
     for i in range(num_prompts):
-        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+        prompt = tokenizer.decode(prefix_token_ids +
+                                  [(offsets[i] + i + j) % tokenizer.vocab_size
                                    for j in range(input_lens[i])])
+
         input_requests.append(
-            (prompt, int(input_lens[i]), int(output_lens[i])))
+            (prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
 
     return input_requests
 
@@ -318,6 +328,7 @@ async def benchmark(
     model_id: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
+    logprobs: Optional[int],
     best_of: int,
     use_beam_search: bool,
     request_rate: float,
@@ -339,6 +350,7 @@ async def benchmark(
         api_url=api_url,
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        logprobs=logprobs,
         best_of=best_of,
         use_beam_search=use_beam_search,
     )
@@ -358,6 +370,7 @@ async def benchmark(
         api_url=base_url + "/start_profile",
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        logprobs=logprobs,
         best_of=best_of,
         use_beam_search=use_beam_search,
     )
@@ -379,6 +392,7 @@ async def benchmark(
            api_url=api_url,
            prompt_len=prompt_len,
            output_len=output_len,
+            logprobs=logprobs,
            best_of=best_of,
            use_beam_search=use_beam_search,
        )
@@ -396,6 +410,7 @@ async def benchmark(
         api_url=base_url + "/stop_profile",
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        logprobs=logprobs,
         best_of=best_of,
         use_beam_search=use_beam_search,
     )
@@ -562,6 +577,7 @@ def main(args: argparse.Namespace):
 
     elif args.dataset_name == "random":
         input_requests = sample_random_requests(
+            prefix_len=args.random_prefix_len,
             input_len=args.random_input_len,
             output_len=args.random_output_len,
             num_prompts=args.num_prompts,
@@ -580,6 +596,7 @@ def main(args: argparse.Namespace):
            model_id=model_id,
            tokenizer=tokenizer,
            input_requests=input_requests,
+            logprobs=args.logprobs,
            best_of=args.best_of,
            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
@@ -721,6 +738,16 @@ def main(args: argparse.Namespace):
         help=
         "Number of output tokens per request, used only for sonnet dataset.",
     )
+    parser.add_argument(
+        "--logprobs",
+        type=int,
+        default=None,
+        help=("Number of logprobs-per-token to compute & return as part of "
+              "the request. If unspecified, then either (1) if beam search "
+              "is disabled, no logprobs are computed & a single dummy "
+              "logprob is returned for each token; or (2) if beam search "
+              "is enabled 1 logprob per token is computed"),
+    )
     parser.add_argument(
         "--sonnet-prefix-len",
         type=int,
@@ -749,6 +776,14 @@ def main(args: argparse.Namespace):
         help="Range of sampled ratio of input/output length, "
         "used only for random sampling.",
     )
+    parser.add_argument(
+        "--random-prefix-len",
+        type=int,
+        default=0,
+        help="Number of fixed prefix tokens before random "
+        " context. The length range of context in a random "
+        " request is [random-prefix-len, "
+        " random-prefix-len + random-prefix-len * random-range-ratio).")
     parser.add_argument(
         "--request-rate",
         type=float,

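To make the new random-dataset options concrete, here is a minimal standalone sketch of the prompt construction that sample_random_requests now performs with a fixed prefix. It assumes a Hugging Face tokenizer and uses illustrative lengths; the exact sampling bounds are an assumption, not copied from the file.

    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    prefix_len, input_len, output_len, num_prompts, range_ratio = 16, 64, 32, 4, 0.5

    # One fixed block of random token ids, shared by every request.
    prefix_token_ids = np.random.randint(0, tokenizer.vocab_size,
                                         size=prefix_len).tolist()

    # Per-request context and output lengths (upper bound assumed inclusive here).
    input_lens = np.random.randint(int(input_len * range_ratio), input_len + 1,
                                   size=num_prompts)
    output_lens = np.random.randint(int(output_len * range_ratio), output_len + 1,
                                    size=num_prompts)
    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)

    input_requests = []
    for i in range(num_prompts):
        # The shared prefix is prepended to each request's random context ...
        prompt = tokenizer.decode(prefix_token_ids +
                                  [(offsets[i] + i + j) % tokenizer.vocab_size
                                   for j in range(input_lens[i])])
        # ... and the reported prompt length now includes the prefix.
        input_requests.append(
            (prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))

In a serving run this corresponds to passing, for example, --dataset-name random --random-prefix-len 16 --logprobs 5, which exercises both the shared-prefix sampling and the logprobs plumbing added in this commit.
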
csrc/ops.h

Lines changed: 0 additions & 3 deletions
@@ -170,9 +170,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
 void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                                torch::Tensor& scales);
 
-void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
-                     torch::Tensor lookup_table);
-
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
                         torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
