Commit 4a7f8d6

Merge pull request vllm-project#174 from ROCm/upstream_merge_24_9_9
Sanity check done: Server mode; BS1 perf; Llama405b FP8
2 parents b3fc9f4 + dc1d65a commit 4a7f8d6

81 files changed (+1695, -926 lines)


.buildkite/run-cpu-test-ppc64le.sh

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.ppc64le .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image, setting --shm-size=4g for tensor parallel.
+#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test
+
+# Run basic model test
+docker exec cpu-test bash -c "
+  pip install pytest matplotlib einops transformers_stream_generator
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+
+# online inference
+docker exec cpu-test bash -c "
+  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+  python3 benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --dataset-name random \
+    --model facebook/opt-125m \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer facebook/opt-125m"

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -194,7 +194,6 @@ set(VLLM_EXT_SRC
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
-  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
   "csrc/quantization/fp8/common.cu"

Dockerfile

Lines changed: 6 additions & 4 deletions
@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 ENV DEBIAN_FRONTEND=noninteractive
 
 # Install Python and other dependencies
@@ -37,7 +37,6 @@ WORKDIR /workspace
 
 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-cuda.txt
@@ -66,7 +65,6 @@ COPY setup.py setup.py
 COPY cmake cmake
 COPY CMakeLists.txt CMakeLists.txt
 COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm vllm
@@ -135,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # image with vLLM installed
 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -181,6 +179,10 @@ FROM vllm-base AS test
 ADD . /vllm-workspace/
 
 # install development dependencies (for testing)
+# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
+# This installation must complete before the test dependencies are collected and installed.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install "setuptools>=74.1.1"
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-dev.txt
 
Dockerfile.ppc64le

Lines changed: 11 additions & 5 deletions
@@ -2,21 +2,27 @@ FROM mambaorg/micromamba
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
 USER root
 
-RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
+
+RUN apt-get update -y && apt-get install -y git wget vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential
 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
 
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
 
 # These packages will be in rocketce eventually
-RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN pip install -v cmake torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
 
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
-WORKDIR /vllm-workspace
-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+
MANIFEST.in

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 include LICENSE
-include requirements-adag.txt
 include requirements-common.txt
 include requirements-cuda.txt
 include requirements-rocm.txt

benchmarks/backend_request_func.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@ class RequestFuncInput:
     model: str
     best_of: int = 1
     use_beam_search: bool = False
+    logprobs: Optional[int] = None
 
 
 @dataclass
@@ -236,6 +237,7 @@ async def async_request_openai_completions(
             "temperature": 0.0,
             "best_of": request_func_input.best_of,
             "max_tokens": request_func_input.output_len,
+            "logprobs": request_func_input.logprobs,
             "stream": True,
         }
         headers = {

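As a quick orientation for readers skimming the diff, the following is a minimal, hypothetical sketch (not the benchmark's actual helper) of where the new optional logprobs value ends up. The payload keys follow the OpenAI-compatible completions request built in async_request_openai_completions; the helper name and example values are illustrative only.

    # Hypothetical helper, for illustration only.
    from typing import Optional


    def build_completions_payload(model: str,
                                  prompt: str,
                                  output_len: int,
                                  best_of: int = 1,
                                  logprobs: Optional[int] = None) -> dict:
        # logprobs stays None when --logprobs is not passed on the CLI,
        # matching the new RequestFuncInput default above.
        return {
            "model": model,
            "prompt": prompt,
            "temperature": 0.0,
            "best_of": best_of,
            "max_tokens": output_len,
            "logprobs": logprobs,
            "stream": True,
        }


    print(build_completions_payload("facebook/opt-125m", "Hello", 16, logprobs=5))
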
benchmarks/benchmark_serving.py

Lines changed: 39 additions & 4 deletions
@@ -195,8 +195,16 @@ def sample_sonnet_requests(
 
 
 def sample_random_requests(
-    input_len: int, output_len: int, num_prompts: int, range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+    prefix_len: int,
+    input_len: int,
+    output_len: int,
+    num_prompts: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+    prefix_token_ids = np.random.randint(0,
+                                         tokenizer.vocab_size,
+                                         size=prefix_len).tolist()
 
     input_lens = np.random.randint(
         int(input_len * range_ratio),
@@ -211,10 +219,12 @@ def sample_random_requests(
     offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
     input_requests = []
     for i in range(num_prompts):
-        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+        prompt = tokenizer.decode(prefix_token_ids +
+                                  [(offsets[i] + i + j) % tokenizer.vocab_size
                                    for j in range(input_lens[i])])
+
         input_requests.append(
-            (prompt, int(input_lens[i]), int(output_lens[i])))
+            (prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
 
     return input_requests
 
@@ -318,6 +328,7 @@ async def benchmark(
     model_id: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
+    logprobs: Optional[int],
     best_of: int,
     use_beam_search: bool,
     request_rate: float,
@@ -339,6 +350,7 @@ async def benchmark(
         api_url=api_url,
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        logprobs=logprobs,
         best_of=best_of,
         use_beam_search=use_beam_search,
     )
@@ -358,6 +370,7 @@ async def benchmark(
         api_url=base_url + "/start_profile",
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        logprobs=logprobs,
         best_of=best_of,
         use_beam_search=use_beam_search,
     )
@@ -379,6 +392,7 @@ async def benchmark(
            api_url=api_url,
            prompt_len=prompt_len,
            output_len=output_len,
+            logprobs=logprobs,
            best_of=best_of,
            use_beam_search=use_beam_search,
        )
@@ -396,6 +410,7 @@ async def benchmark(
         api_url=base_url + "/stop_profile",
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        logprobs=logprobs,
         best_of=best_of,
         use_beam_search=use_beam_search,
     )
@@ -562,6 +577,7 @@ def main(args: argparse.Namespace):
 
     elif args.dataset_name == "random":
         input_requests = sample_random_requests(
+            prefix_len=args.random_prefix_len,
             input_len=args.random_input_len,
             output_len=args.random_output_len,
             num_prompts=args.num_prompts,
@@ -580,6 +596,7 @@ def main(args: argparse.Namespace):
            model_id=model_id,
            tokenizer=tokenizer,
            input_requests=input_requests,
+            logprobs=args.logprobs,
            best_of=args.best_of,
            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
@@ -721,6 +738,16 @@ def main(args: argparse.Namespace):
         help=
         "Number of output tokens per request, used only for sonnet dataset.",
     )
+    parser.add_argument(
+        "--logprobs",
+        type=int,
+        default=None,
+        help=("Number of logprobs-per-token to compute & return as part of "
+              "the request. If unspecified, then either (1) if beam search "
+              "is disabled, no logprobs are computed & a single dummy "
+              "logprob is returned for each token; or (2) if beam search "
+              "is enabled 1 logprob per token is computed"),
+    )
     parser.add_argument(
         "--sonnet-prefix-len",
         type=int,
@@ -749,6 +776,14 @@ def main(args: argparse.Namespace):
         help="Range of sampled ratio of input/output length, "
         "used only for random sampling.",
     )
+    parser.add_argument(
+        "--random-prefix-len",
+        type=int,
+        default=0,
+        help="Number of fixed prefix tokens before random "
+        " context. The length range of context in a random "
+        " request is [random-prefix-len, "
+        " random-prefix-len + random-prefix-len * random-range-ratio).")
     parser.add_argument(
         "--request-rate",
         type=float,

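To make the new random-dataset options concrete, here is a minimal standalone sketch of the prompt construction that sample_random_requests now performs with a fixed prefix. It assumes a Hugging Face tokenizer and uses illustrative lengths; the exact sampling bounds are an assumption, not copied from the file.

    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    prefix_len, input_len, output_len, num_prompts, range_ratio = 16, 64, 32, 4, 0.5

    # One fixed block of random token ids, shared by every request.
    prefix_token_ids = np.random.randint(0, tokenizer.vocab_size,
                                         size=prefix_len).tolist()

    # Per-request context and output lengths (upper bound assumed inclusive here).
    input_lens = np.random.randint(int(input_len * range_ratio), input_len + 1,
                                   size=num_prompts)
    output_lens = np.random.randint(int(output_len * range_ratio), output_len + 1,
                                    size=num_prompts)
    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)

    input_requests = []
    for i in range(num_prompts):
        # The shared prefix is prepended to each request's random context ...
        prompt = tokenizer.decode(prefix_token_ids +
                                  [(offsets[i] + i + j) % tokenizer.vocab_size
                                   for j in range(input_lens[i])])
        # ... and the reported prompt length now includes the prefix.
        input_requests.append(
            (prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))

In a serving run this corresponds to passing, for example, --dataset-name random --random-prefix-len 16 --logprobs 5, which exercises both the shared-prefix sampling and the logprobs plumbing added in this commit.
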
csrc/ops.h

Lines changed: 0 additions & 3 deletions
@@ -170,9 +170,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
 void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                                torch::Tensor& scales);
 
-void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
-                     torch::Tensor lookup_table);
-
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
                         torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
