
Commit 959f6c4

Merge branch 'main' into batch_scores
Signed-off-by: Pooya Davoodi <[email protected]>
2 parents: b21a0a9 + 72bac73

17 files changed: +442 −205 lines


CMakeLists.txt

Lines changed: 5 additions & 1 deletion
@@ -446,6 +446,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 
 message(STATUS "Enabling C extension.")
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_C_LIBS cuda)
+endif()
 define_gpu_extension_target(
   _C
   DESTINATION vllm
@@ -454,6 +457,7 @@ define_gpu_extension_target(
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+  LIBRARIES ${VLLM_C_LIBS}
   USE_SABI 3
   WITH_SOABI)
 
@@ -576,7 +580,7 @@ else()
   FetchContent_Declare(
     vllm-flash-attn
     GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG 9732b0ce005d1e6216864788502d5570004678f5
+    GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
     GIT_PROGRESS TRUE
     # Don't share the vllm-flash-attn build between build types
     BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

tests/entrypoints/openai/test_serving_chat.py

Lines changed: 110 additions & 0 deletions
@@ -103,6 +103,116 @@ def test_serving_chat_should_set_correct_max_tokens():
 
     assert mock_engine.generate.call_args.args[1].max_tokens == 10
 
+    # Setting server's max_tokens in the generation_config.json
+    # lower than context_window - prompt_tokens
+    mock_model_config = MockModelConfig()
+    mock_model_config.diff_sampling_param = {
+        "max_tokens": 10  # Setting server-side max_tokens limit
+    }
+
+    # Reinitialize the engine with new settings
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+
+    # Test Case 1: No max_tokens specified in request
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+        guided_decoding_backend="outlines",
+    )
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+
+    # Test Case 2: Request's max_tokens set higher than server accepts
+    req.max_tokens = 15
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+
+    # Test Case 3: Request's max_tokens set lower than server accepts
+    req.max_tokens = 5
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 5
+
+    # Setting server's max_tokens in the generation_config.json
+    # higher than context_window - prompt_tokens
+    mock_model_config = MockModelConfig()
+    mock_model_config.diff_sampling_param = {
+        "max_tokens": 200  # Setting server-side max_tokens limit
+    }
+
+    # Reinitialize the engine with new settings
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+
+    # Test case 1: No max_tokens specified, defaults to context_window
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+        guided_decoding_backend="outlines",
+    )
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 93
+
+    # Test Case 2: Request's max_tokens set higher than server accepts
+    req.max_tokens = 100
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 93
+
+    # Test Case 3: Request's max_tokens set lower than server accepts
+    req.max_tokens = 5
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 5
+
 
 
 def test_serving_chat_could_load_correct_generation_config():
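
The new test cases pin down how the server-side max_tokens from generation_config.json interacts with the per-request max_tokens and the remaining context budget: the effective limit is the smallest of the three. Below is a minimal sketch of that clamping rule, separate from vLLM's actual implementation; the function name and parameters are illustrative, and the 100-token context window / 7-token prompt split is an assumption chosen only to reproduce the 93-token budget seen in the assertions above.

    # Minimal sketch (not vLLM's code) of the clamping rule the new test cases
    # exercise; all names here are illustrative assumptions.
    from typing import Optional


    def resolve_max_tokens(request_max_tokens: Optional[int],
                           server_max_tokens: Optional[int],
                           context_window: int,
                           prompt_tokens: int) -> int:
        """Pick the effective max_tokens for a chat completion request."""
        # Hard ceiling: generation can never run past the context window.
        budget = context_window - prompt_tokens
        candidates = [budget]
        if server_max_tokens is not None:   # server default from generation_config.json
            candidates.append(server_max_tokens)
        if request_max_tokens is not None:  # value supplied by the client
            candidates.append(request_max_tokens)
        return min(candidates)


    # Mirrors the assertions above, assuming a 100-token context window and a
    # 7-token prompt (so the unconstrained budget is 93 tokens).
    assert resolve_max_tokens(None, 10, 100, 7) == 10
    assert resolve_max_tokens(15, 10, 100, 7) == 10
    assert resolve_max_tokens(5, 10, 100, 7) == 5
    assert resolve_max_tokens(None, 200, 100, 7) == 93
    assert resolve_max_tokens(100, 200, 100, 7) == 93
    assert resolve_max_tokens(5, 200, 100, 7) == 5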

tests/kernels/test_cutlass.py

Lines changed: 3 additions & 25 deletions
@@ -2,7 +2,7 @@
 
 Run `pytest tests/kernels/test_cutlass.py`.
 """
-from typing import Optional, Type
+from typing import Type
 
 import pytest
 import torch
@@ -11,6 +11,8 @@
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 
+from .utils import baseline_scaled_mm, to_fp8, to_int8
+
 MNK_FACTORS = [
     (1, 256, 128),
     (1, 16384, 1024),
@@ -41,34 +43,10 @@
 capability = capability[0] * 10 + capability[1]
 
 
-def to_fp8(tensor: torch.Tensor):
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.Tensor):
-    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
 def rand_int8(shape: tuple, device: str = "cuda"):
     return to_int8(torch.rand(shape, device=device) * 255 - 128)
 
 
-def baseline_scaled_mm(a: torch.Tensor,
-                       b: torch.Tensor,
-                       scale_a: torch.Tensor,
-                       scale_b: torch.Tensor,
-                       out_dtype: Type[torch.dtype],
-                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-    output = (scale_a * (scale_b * (torch.mm(
-        a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype)
-    if bias is not None:
-        output = output + bias
-
-    return output
-
-
 def cutlass_fp8_gemm_helper(m: int,
                             n: int,
                             k: int,
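
The helpers to_fp8, to_int8, and baseline_scaled_mm are no longer defined locally; they are imported from the shared tests/kernels/utils module so other kernel tests can reuse the same reference path. The sketch below reproduces two of the removed helper bodies and shows one way they might be called on CPU; the shapes, scale layout, and float16 output dtype are illustrative assumptions, not taken from the test file.

    # Sketch of the reference computation that the relocated helpers provide;
    # helper bodies mirror the definitions removed above.
    from typing import Optional, Type

    import torch


    def to_int8(tensor: torch.Tensor) -> torch.Tensor:
        # Clamp to the int8 range and round before casting.
        return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)


    def baseline_scaled_mm(a: torch.Tensor,
                           b: torch.Tensor,
                           scale_a: torch.Tensor,
                           scale_b: torch.Tensor,
                           out_dtype: Type[torch.dtype],
                           bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Reference scaled matmul in fp32, then cast to the requested output dtype.
        output = (scale_a * (scale_b * torch.mm(a.to(torch.float32),
                                                b.to(torch.float32)))).to(out_dtype)
        if bias is not None:
            output = output + bias
        return output


    # Illustrative usage: per-row scales for A, per-column scales for B.
    m, n, k = 4, 8, 16
    a = to_int8(torch.randn(m, k) * 5)
    b = to_int8(torch.randn(k, n) * 5)
    scale_a = torch.rand(m, 1) / 100
    scale_b = torch.rand(1, n) / 100
    ref = baseline_scaled_mm(a, b, scale_a, scale_b, torch.float16)
    print(ref.shape, ref.dtype)  # torch.Size([4, 8]) torch.float16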
