25
25
import argparse
26
26
import asyncio
27
27
import base64
28
+ import gc
28
29
import io
29
30
import json
30
31
import os
@@ -423,7 +424,7 @@ def calculate_metrics(
423
424
tokenizer : PreTrainedTokenizerBase ,
424
425
selected_percentile_metrics : List [str ],
425
426
selected_percentiles : List [float ],
426
- gootput_config_dict : Dict [str , float ],
427
+ goodput_config_dict : Dict [str , float ],
427
428
) -> Tuple [BenchmarkMetrics , List [int ]]:
428
429
actual_output_lens : List [int ] = []
429
430
total_input = 0
@@ -436,19 +437,23 @@ def calculate_metrics(
436
437
e2els : List [float ] = []
437
438
for i in range (len (outputs )):
438
439
if outputs [i ].success :
439
- # We use the tokenizer to count the number of output tokens for all
440
- # serving backends instead of looking at len(outputs[i].itl) since
441
- # multiple output tokens may be bundled together
442
- # Note : this may inflate the output token count slightly
443
- output_len = len (
444
- tokenizer (outputs [i ].generated_text ,
445
- add_special_tokens = False ).input_ids )
440
+ output_len = outputs [i ].output_tokens
441
+
442
+ if output_len is None :
443
+ # We use the tokenizer to count the number of output tokens
444
+ # for some serving backends instead of looking at
445
+ # len(outputs[i].itl) since multiple output tokens may be
446
+ # bundled together
447
+ # Note : this may inflate the output token count slightly
448
+ output_len = len (
449
+ tokenizer (outputs [i ].generated_text ,
450
+ add_special_tokens = False ).input_ids )
446
451
actual_output_lens .append (output_len )
447
452
total_input += input_requests [i ][1 ]
448
453
tpot = 0
449
454
if output_len > 1 :
450
- tpot = ( outputs [i ].latency - outputs [i ].ttft ) / ( output_len -
451
- 1 )
455
+ latency_minus_ttft = outputs [i ].latency - outputs [i ].ttft
456
+ tpot = latency_minus_ttft / ( output_len - 1 )
452
457
tpots .append (tpot )
453
458
# Note: if output_len <= 1, we regard tpot as 0 for goodput
454
459
all_tpots .append (tpot )
@@ -459,21 +464,21 @@ def calculate_metrics(
459
464
else :
460
465
actual_output_lens .append (0 )
461
466
462
- if gootput_config_dict :
467
+ if goodput_config_dict :
463
468
valid_metrics = []
464
469
slo_values = []
465
470
466
- if "ttft" in gootput_config_dict :
471
+ if "ttft" in goodput_config_dict :
467
472
valid_metrics .append (ttfts )
468
- slo_values .append (gootput_config_dict ["ttft" ] /
473
+ slo_values .append (goodput_config_dict ["ttft" ] /
469
474
MILLISECONDS_TO_SECONDS_CONVERSION )
470
- if "tpot" in gootput_config_dict :
475
+ if "tpot" in goodput_config_dict :
471
476
valid_metrics .append (all_tpots )
472
- slo_values .append (gootput_config_dict ["tpot" ] /
477
+ slo_values .append (goodput_config_dict ["tpot" ] /
473
478
MILLISECONDS_TO_SECONDS_CONVERSION )
474
- if "e2el" in gootput_config_dict :
479
+ if "e2el" in goodput_config_dict :
475
480
valid_metrics .append (e2els )
476
- slo_values .append (gootput_config_dict ["e2el" ] /
481
+ slo_values .append (goodput_config_dict ["e2el" ] /
477
482
MILLISECONDS_TO_SECONDS_CONVERSION )
478
483
479
484
for req_metric in zip (* valid_metrics ):
@@ -537,7 +542,7 @@ async def benchmark(
537
542
selected_percentile_metrics : List [str ],
538
543
selected_percentiles : List [str ],
539
544
ignore_eos : bool ,
540
- gootput_config_dict : Dict [str , float ],
545
+ goodput_config_dict : Dict [str , float ],
541
546
max_concurrency : Optional [int ],
542
547
):
543
548
if backend in ASYNC_REQUEST_FUNCS :
@@ -661,7 +666,7 @@ async def limited_request_func(request_func_input, pbar):
661
666
tokenizer = tokenizer ,
662
667
selected_percentile_metrics = selected_percentile_metrics ,
663
668
selected_percentiles = selected_percentiles ,
664
- gootput_config_dict = gootput_config_dict ,
669
+ goodput_config_dict = goodput_config_dict ,
665
670
)
666
671
667
672
print ("{s:{c}^{n}}" .format (s = ' Serving Benchmark Result ' , n = 50 , c = '=' ))
@@ -673,7 +678,7 @@ async def limited_request_func(request_func_input, pbar):
673
678
metrics .total_output ))
674
679
print ("{:<40} {:<10.2f}" .format ("Request throughput (req/s):" ,
675
680
metrics .request_throughput ))
676
- if gootput_config_dict :
681
+ if goodput_config_dict :
677
682
print ("{:<40} {:<10.2f}" .format ("Request goodput (req/s):" ,
678
683
metrics .request_goodput ))
679
684
print ("{:<40} {:<10.2f}" .format ("Output token throughput (tok/s):" ,
@@ -688,7 +693,7 @@ async def limited_request_func(request_func_input, pbar):
688
693
"total_output_tokens" : metrics .total_output ,
689
694
"request_throughput" : metrics .request_throughput ,
690
695
"request_goodput:" :
691
- metrics .request_goodput if gootput_config_dict else None ,
696
+ metrics .request_goodput if goodput_config_dict else None ,
692
697
"output_throughput" : metrics .output_throughput ,
693
698
"total_token_throughput" : metrics .total_token_throughput ,
694
699
"input_lens" : [output .prompt_len for output in outputs ],
@@ -744,11 +749,11 @@ def process_one_metric(
744
749
745
750
def check_goodput_args (args ):
746
751
# Check and parse goodput arguments
747
- gootput_config_dict = {}
752
+ goodput_config_dict = {}
748
753
VALID_NAMES = ["ttft" , "tpot" , "e2el" ]
749
754
if args .goodput :
750
- gootput_config_dict = parse_goodput (args .goodput )
751
- for slo_name , slo_val in gootput_config_dict .items ():
755
+ goodput_config_dict = parse_goodput (args .goodput )
756
+ for slo_name , slo_val in goodput_config_dict .items ():
752
757
if slo_name not in VALID_NAMES :
753
758
raise ValueError (
754
759
f"Invalid metric name found, { slo_name } : { slo_val } . "
@@ -759,22 +764,22 @@ def check_goodput_args(args):
759
764
f"Invalid value found, { slo_name } : { slo_val } . "
760
765
"The service level objective value should be "
761
766
"non-negative." )
762
- return gootput_config_dict
767
+ return goodput_config_dict
763
768
764
769
765
770
def parse_goodput(slo_pairs):
    """Parse ``NAME:VALUE`` service level objective strings into a dict.

    Args:
        slo_pairs: Iterable of strings, each expected to contain exactly one
            ``":"`` separating a name from a value parseable by ``float``.

    Returns:
        Dict mapping each name to its value converted to ``float``. If a name
        appears more than once, the last occurrence wins.

    Raises:
        argparse.ArgumentTypeError: If any pair does not split into exactly
            two parts on ``":"``, or its value cannot be converted to
            ``float``. The underlying ``ValueError`` is chained as the cause.
    """
    goodput_config_dict = {}
    try:
        for slo_pair in slo_pairs:
            # Tuple unpacking raises ValueError unless the pair contains
            # exactly one ":"; float() raises ValueError on a bad number.
            slo_name, slo_val = slo_pair.split(":")
            goodput_config_dict[slo_name] = float(slo_val)
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            "Specify service level objectives for goodput as \"KEY:VALUE\" "
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds.") from err
    return goodput_config_dict
778
783
779
784
780
785
def main (args : argparse .Namespace ):
@@ -874,7 +879,11 @@ def main(args: argparse.Namespace):
874
879
else :
875
880
raise ValueError (f"Unknown dataset: { args .dataset_name } " )
876
881
877
- gootput_config_dict = check_goodput_args (args )
882
+ goodput_config_dict = check_goodput_args (args )
883
+
884
+ # Avoid GC processing "static" data - reduce pause times.
885
+ gc .collect ()
886
+ gc .freeze ()
878
887
879
888
benchmark_result = asyncio .run (
880
889
benchmark (
@@ -896,7 +905,7 @@ def main(args: argparse.Namespace):
896
905
float (p ) for p in args .metric_percentiles .split ("," )
897
906
],
898
907
ignore_eos = args .ignore_eos ,
899
- gootput_config_dict = gootput_config_dict ,
908
+ goodput_config_dict = goodput_config_dict ,
900
909
max_concurrency = args .max_concurrency ,
901
910
))
902
911
0 commit comments