Skip to content

benchmark_serving support --served-model-name param #12109

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions benchmarks/backend_request_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class RequestFuncInput:
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
best_of: int = 1
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
Expand Down Expand Up @@ -78,7 +79,7 @@ async def async_request_tgi(
continue
chunk_bytes = chunk_bytes.decode("utf-8")

#NOTE: Sometimes TGI returns a ping response without
# NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
Expand Down Expand Up @@ -235,7 +236,8 @@ async def async_request_openai_completions(

async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model,
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"best_of": request_func_input.best_of,
Expand Down Expand Up @@ -328,7 +330,8 @@ async def async_request_openai_chat_completions(
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = {
"model": request_func_input.model,
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"messages": [
{
"role": "user",
Expand Down
13 changes: 13 additions & 0 deletions benchmarks/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,7 @@ async def benchmark(
api_url: str,
base_url: str,
model_id: str,
model_name: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[Tuple[str, int, int]],
logprobs: Optional[int],
Expand Down Expand Up @@ -553,6 +554,7 @@ async def benchmark(
"Multi-modal content is only supported on 'openai-chat' backend.")
test_input = RequestFuncInput(
model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
Expand All @@ -573,6 +575,7 @@ async def benchmark(
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
Expand Down Expand Up @@ -616,6 +619,7 @@ async def limited_request_func(request_func_input, pbar):
async for request in get_request(input_requests, request_rate, burstiness):
prompt, prompt_len, output_len, mm_content = request
request_func_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt=prompt,
api_url=api_url,
prompt_len=prompt_len,
Expand Down Expand Up @@ -780,6 +784,7 @@ def main(args: argparse.Namespace):

backend = args.backend
model_id = args.model
model_name = args.served_model_name
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
tokenizer_mode = args.tokenizer_mode

Expand Down Expand Up @@ -877,6 +882,7 @@ def main(args: argparse.Namespace):
api_url=api_url,
base_url=base_url,
model_id=model_id,
model_name=model_name,
tokenizer=tokenizer,
input_requests=input_requests,
logprobs=args.logprobs,
Expand Down Expand Up @@ -1222,5 +1228,12 @@ def main(args: argparse.Namespace):
'always use the slow tokenizer. \n* '
'"mistral" will always use the `mistral_common` tokenizer.')

parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ")

args = parser.parse_args()
main(args)
Loading