Commit 3ea2dc2

[Misc] Remove deprecated arg for cuda graph capture (#9864)
Signed-off-by: Roger Wang <[email protected]>
1 parent: d087bf8 | commit: 3ea2dc2

File tree

4 files changed: +1 line, -23 lines


vllm/config.py

Lines changed: 0 additions & 7 deletions

@@ -84,9 +84,6 @@ class ModelConfig:
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
             If None, the user did not specify, so default to False.
-        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
-            When a sequence has context length larger than this, we fall back
-            to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
         max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
             to eager mode. Additionally for encoder-decoder models, if the
@@ -147,7 +144,6 @@ def __init__(
         quantization: Optional[str] = None,
         quantization_param_path: Optional[str] = None,
         enforce_eager: Optional[bool] = None,
-        max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: Optional[int] = None,
         max_logprobs: int = 20,
         disable_sliding_window: bool = False,
@@ -181,9 +177,6 @@ def __init__(
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
         self.enforce_eager = enforce_eager
-        if max_context_len_to_capture is not None:
-            raise ValueError("`max_context_len_to_capture` is deprecated. "
-                             "Use `max_seq_len_to_capture` instead.")
         self.max_seq_len_to_capture = max_seq_len_to_capture
         self.max_logprobs = max_logprobs
         self.disable_sliding_window = disable_sliding_window
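For downstream code that still passes the removed keyword, a small rename shim keeps existing configuration dictionaries compatible with this commit. This is a minimal sketch, not part of the change itself; the dictionary name and values below are hypothetical.

# Hypothetical caller-side kwargs; only the key names mirror ModelConfig.
legacy_kwargs = {"enforce_eager": False, "max_context_len_to_capture": 8192}

# After this commit the old name no longer exists on ModelConfig at all
# (previously it raised a deprecation ValueError), so rename it before
# constructing ModelConfig, EngineArgs, or LLM.
if "max_context_len_to_capture" in legacy_kwargs:
    legacy_kwargs["max_seq_len_to_capture"] = legacy_kwargs.pop(
        "max_context_len_to_capture")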

vllm/engine/arg_utils.py

Lines changed: 0 additions & 10 deletions

@@ -126,7 +126,6 @@ class EngineArgs:
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
     enforce_eager: Optional[bool] = None
-    max_context_len_to_capture: Optional[int] = None
     max_seq_len_to_capture: int = 8192
     disable_custom_all_reduce: bool = False
     tokenizer_pool_size: int = 0
@@ -504,14 +503,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                             help='Always use eager-mode PyTorch. If False, '
                             'will use eager mode and CUDA graph in hybrid '
                             'for maximal performance and flexibility.')
-        parser.add_argument('--max-context-len-to-capture',
-                            type=int,
-                            default=EngineArgs.max_context_len_to_capture,
-                            help='Maximum context length covered by CUDA '
-                            'graphs. When a sequence has context length '
-                            'larger than this, we fall back to eager mode. '
-                            '(DEPRECATED. Use --max-seq-len-to-capture instead'
-                            ')')
         parser.add_argument('--max-seq-len-to-capture',
                             type=int,
                             default=EngineArgs.max_seq_len_to_capture,
@@ -939,7 +930,6 @@ def create_model_config(self) -> ModelConfig:
             quantization=self.quantization,
             quantization_param_path=self.quantization_param_path,
             enforce_eager=self.enforce_eager,
-            max_context_len_to_capture=self.max_context_len_to_capture,
             max_seq_len_to_capture=self.max_seq_len_to_capture,
             max_logprobs=self.max_logprobs,
             disable_sliding_window=self.disable_sliding_window,
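On the CLI side, only --max-seq-len-to-capture survives; the removed flag now fails argument parsing instead of printing a deprecation notice. Below is a minimal sketch of driving the same parser from Python, assuming FlexibleArgumentParser is importable from vllm.utils at this commit and that the parser's other arguments all have defaults.

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser  # assumed import path

# Build the same parser the vllm CLI uses.
parser = FlexibleArgumentParser(description="sketch only")
parser = EngineArgs.add_cli_args(parser)

# Parses cleanly; "--max-context-len-to-capture 8192" would now be rejected
# as an unrecognized argument rather than accepted with a deprecation error.
args = parser.parse_args(["--max-seq-len-to-capture", "8192"])
engine_args = EngineArgs.from_cli_args(args)
print(engine_args.max_seq_len_to_capture)  # 8192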

vllm/entrypoints/llm.py

Lines changed: 0 additions & 5 deletions

@@ -93,9 +93,6 @@ class LLM:
         enforce_eager: Whether to enforce eager execution. If True, we will
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
-        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
-            When a sequence has context length larger than this, we fall back
-            to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead).
         max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
             to eager mode. Additionally for encoder-decoder models, if the
@@ -152,7 +149,6 @@ def __init__(
         swap_space: float = 4,
         cpu_offload_gb: float = 0,
         enforce_eager: Optional[bool] = None,
-        max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
         disable_async_output_proc: bool = False,
@@ -193,7 +189,6 @@ def __init__(
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
             disable_async_output_proc=disable_async_output_proc,
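The same rename applies to the LLM entrypoint: max_seq_len_to_capture is the only remaining knob, and passing the old keyword now fails with a TypeError from the underlying argument handling rather than the old deprecation ValueError. A usage sketch with a placeholder model name (actually running it requires a GPU and model download):

from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",    # placeholder model, not from the commit
    enforce_eager=False,          # keep CUDA graphs enabled
    max_seq_len_to_capture=8192,  # replaces max_context_len_to_capture
)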

vllm/worker/model_runner.py

Lines changed: 1 addition & 1 deletion

@@ -995,7 +995,7 @@ def __init__(
         # Python can be expensive. To optimize this, we cache the block table
         # in numpy and only copy the actual input content at every iteration.
         # The shape of the cached block table will be
-        # (max batch size to capture, max context len to capture / block size).
+        # (max batch size to capture, max seq len to capture / block size).
         self.graph_block_tables = np.zeros(
             (self.max_batchsize_to_capture, self.get_max_block_per_batch()),
             dtype=np.int32)
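The shape arithmetic the updated comment describes can be reproduced standalone. A sketch with illustrative values, assuming get_max_block_per_batch rounds max_seq_len_to_capture up to whole KV-cache blocks:

import numpy as np

max_batchsize_to_capture = 256   # illustrative, not taken from the commit
max_seq_len_to_capture = 8192    # EngineArgs default in this commit
block_size = 16                  # illustrative KV-cache block size

# One row per captured sequence slot, one column per block it may reference.
max_blocks_per_batch = -(-max_seq_len_to_capture // block_size)  # ceil division
graph_block_tables = np.zeros(
    (max_batchsize_to_capture, max_blocks_per_batch), dtype=np.int32)
print(graph_block_tables.shape)  # (256, 512)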
