@@ -145,7 +145,7 @@ class EngineArgs:
     max_cpu_loras: Optional[int] = None
     device: str = 'auto'
     num_scheduler_steps: int = 1
-    multi_step_stream_outputs: bool = False
+    multi_step_stream_outputs: bool = True
     ray_workers_use_nsight: bool = False
     num_gpu_blocks_override: Optional[int] = None
     num_lookahead_slots: int = 0
@@ -603,13 +603,17 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

         parser.add_argument(
             '--multi-step-stream-outputs',
-            action='store_true',
-            help='If True, then multi-step will stream outputs for every step')
+            action=StoreBoolean,
+            default=EngineArgs.multi_step_stream_outputs,
+            nargs="?",
+            const="True",
+            help='If False, then multi-step will stream outputs at the end '
+            'of all steps')
         parser.add_argument(
             '--scheduler-delay-factor',
             type=float,
             default=EngineArgs.scheduler_delay_factor,
-            help='Apply a delay (of delay factor multiplied by previous'
+            help='Apply a delay (of delay factor multiplied by previous '
             'prompt latency) before scheduling next prompt.')
         parser.add_argument(
             '--enable-chunked-prefill',
@@ -632,7 +636,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=nullable_str,
             choices=[*QUANTIZATION_METHODS, None],
             default=EngineArgs.speculative_model_quantization,
-            help='Method used to quantize the weights of speculative model.'
+            help='Method used to quantize the weights of speculative model. '
             'If None, we first check the `quantization_config` '
             'attribute in the model config file. If that is '
             'None, we assume the model weights are not '
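Context for the second hunk: replacing action='store_true' with action=StoreBoolean plus nargs="?" and const="True" turns --multi-step-stream-outputs into a flag that defaults to the (now True) EngineArgs value, means True when passed bare, and also accepts an explicit true/false value. Below is a minimal, hypothetical sketch of a StoreBoolean-style argparse action for illustration only; the class body, accepted spellings, and error handling are assumptions, not vLLM's actual helper.

import argparse


class StoreBoolean(argparse.Action):
    """Hypothetical boolean-flag action (assumed behavior, not vLLM's exact code)."""

    def __call__(self, parser, namespace, values, option_string=None):
        # With nargs="?" and const="True", a bare flag arrives here as the
        # string "True"; an explicit value arrives as whatever the user typed.
        text = str(values).strip().lower()
        if text == "true":
            setattr(namespace, self.dest, True)
        elif text == "false":
            setattr(namespace, self.dest, False)
        else:
            raise argparse.ArgumentError(
                self, f"expected 'true' or 'false', got {values!r}")


parser = argparse.ArgumentParser()
parser.add_argument("--multi-step-stream-outputs",
                    action=StoreBoolean,
                    default=True,  # mirrors the new EngineArgs default
                    nargs="?",
                    const="True")

print(parser.parse_args([]))
# Namespace(multi_step_stream_outputs=True)   <- flag omitted, default used
print(parser.parse_args(["--multi-step-stream-outputs"]))
# Namespace(multi_step_stream_outputs=True)   <- bare flag uses const="True"
print(parser.parse_args(["--multi-step-stream-outputs", "false"]))
# Namespace(multi_step_stream_outputs=False)  <- explicit opt-out

With the default flipped to True, streaming outputs on every step becomes the out-of-the-box behavior, and the updated help text describes the opt-out: passing --multi-step-stream-outputs false makes multi-step emit outputs only at the end of all steps.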