
Commit 6cce960

mgoin authored and LeiWang1999 committed
[Misc] Enable multi-step output streaming by default (vllm-project#9047)
Signed-off-by: LeiWang1999 <[email protected]>
1 parent bedc7be · commit 6cce960

File tree: 1 file changed (+9, -5 lines)


vllm/engine/arg_utils.py

Lines changed: 9 additions & 5 deletions
@@ -145,7 +145,7 @@ class EngineArgs:
     max_cpu_loras: Optional[int] = None
     device: str = 'auto'
     num_scheduler_steps: int = 1
-    multi_step_stream_outputs: bool = False
+    multi_step_stream_outputs: bool = True
     ray_workers_use_nsight: bool = False
     num_gpu_blocks_override: Optional[int] = None
     num_lookahead_slots: int = 0
@@ -603,13 +603,17 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

         parser.add_argument(
             '--multi-step-stream-outputs',
-            action='store_true',
-            help='If True, then multi-step will stream outputs for every step')
+            action=StoreBoolean,
+            default=EngineArgs.multi_step_stream_outputs,
+            nargs="?",
+            const="True",
+            help='If False, then multi-step will stream outputs at the end '
+            'of all steps')
         parser.add_argument(
             '--scheduler-delay-factor',
             type=float,
             default=EngineArgs.scheduler_delay_factor,
-            help='Apply a delay (of delay factor multiplied by previous'
+            help='Apply a delay (of delay factor multiplied by previous '
             'prompt latency) before scheduling next prompt.')
         parser.add_argument(
             '--enable-chunked-prefill',
@@ -632,7 +636,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=nullable_str,
             choices=[*QUANTIZATION_METHODS, None],
             default=EngineArgs.speculative_model_quantization,
-            help='Method used to quantize the weights of speculative model.'
+            help='Method used to quantize the weights of speculative model. '
             'If None, we first check the `quantization_config` '
             'attribute in the model config file. If that is '
             'None, we assume the model weights are not '
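
Note on the CLI change: the diff replaces a plain store_true flag with a boolean-valued option that combines a default, nargs="?", and const="True". The sketch below illustrates how that combination parses; the StoreBoolean class here is an illustrative stand-in (vLLM's real StoreBoolean lives in vllm.utils and may differ in detail), and this script is not part of the commit.

import argparse


class StoreBoolean(argparse.Action):
    """Illustrative stand-in: turn a 'true'/'false' string into a real bool."""

    def __call__(self, parser, namespace, values, option_string=None):
        if isinstance(values, str) and values.lower() in ("true", "false"):
            setattr(namespace, self.dest, values.lower() == "true")
        else:
            raise argparse.ArgumentError(
                self, f"expected 'true' or 'false', got {values!r}")


parser = argparse.ArgumentParser()
parser.add_argument("--multi-step-stream-outputs",
                    action=StoreBoolean,
                    default=True,   # mirrors the new EngineArgs default
                    nargs="?",      # the value after the flag is optional
                    const="True")   # a bare flag is treated as "True"

# Flag omitted -> default (True); the action is never invoked.
print(parser.parse_args([]).multi_step_stream_outputs)
# Bare flag -> const "True" is passed to StoreBoolean -> True.
print(parser.parse_args(["--multi-step-stream-outputs"]).multi_step_stream_outputs)
# Explicit value -> "false" is passed to StoreBoolean -> False.
print(parser.parse_args(["--multi-step-stream-outputs", "false"]).multi_step_stream_outputs)

In other words, after this commit multi-step output streaming is on by default, and passing `--multi-step-stream-outputs false` restores the previous end-of-all-steps behavior.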
