
Commit 6223dd8

Update deprecated type hinting in model_executor/layers (vllm-project#18056)
Signed-off-by: Harry Mellor <[email protected]>
1 parent 906f059, commit 6223dd8


87 files changed, +523 -523 lines changed

pyproject.toml

Lines changed: 0 additions & 1 deletion

@@ -80,7 +80,6 @@ exclude = [
 "vllm/engine/**/*.py" = ["UP006", "UP035"]
 "vllm/executor/**/*.py" = ["UP006", "UP035"]
 "vllm/lora/**/*.py" = ["UP006", "UP035"]
-"vllm/model_executor/layers/**/*.py" = ["UP006", "UP035"]
 "vllm/model_executor/model_loader/**/*.py" = ["UP006", "UP035"]
 "vllm/model_executor/models/**/*.py" = ["UP006", "UP035"]
 "vllm/platforms/**/*.py" = ["UP006", "UP035"]

vllm/model_executor/layers/fused_moe/__init__.py

Lines changed: 3 additions & 3 deletions

@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from contextlib import contextmanager
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
 
-_config: Optional[Dict[str, Any]] = None
+_config: Optional[dict[str, Any]] = None
 
 
 @contextmanager
@@ -19,7 +19,7 @@ def override_config(config):
     _config = old_config
 
 
-def get_config() -> Optional[Dict[str, Any]]:
+def get_config() -> Optional[dict[str, Any]]:
     return _config
 
 
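
For context, here is a minimal usage sketch of the override hook whose annotations change above. The config key is hypothetical and vLLM is assumed to be importable; this snippet is not part of the commit:

from vllm.model_executor.layers.fused_moe import get_config, override_config

assert get_config() is None                    # no override installed by default
with override_config({"BLOCK_SIZE_M": 64}):    # hypothetical kernel-config key
    active = get_config()                      # dict[str, Any] while the context is open
assert get_config() is None                    # previous value restored on exit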

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import importlib.util
-from typing import Optional, Tuple
+from typing import Optional
 
 import torch
 
@@ -61,7 +61,7 @@ def _moe_permute(
     global_num_experts: int,
     expert_map: Optional[torch.Tensor],
     block_m: int,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
+) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
           Optional[torch.Tensor]]:
     """
     Determine the sorted_token_ids, expert_ids for the given problem size.

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 23 additions & 23 deletions

@@ -3,7 +3,7 @@
 import functools
 import json
 import os
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Optional
 
 import torch
 
@@ -472,14 +472,14 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
                            num_tokens_post_padded: torch.Tensor,
                            mul_routed_weight: bool,
                            top_k: int,
-                           config: Dict[str, Any],
+                           config: dict[str, Any],
                            compute_type: tl.dtype,
                            use_fp8_w8a8: bool,
                            use_int8_w8a8: bool,
                            use_int8_w8a16: bool,
                            use_int4_w4a16: bool,
                            per_channel_quant: bool,
-                           block_shape: Optional[List[int]] = None) -> None:
+                           block_shape: Optional[list[int]] = None) -> None:
     assert topk_weights is not None or not mul_routed_weight
     assert topk_weights is None or topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
@@ -622,7 +622,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
 def get_config_file_name(E: int,
                          N: int,
                          dtype: Optional[str],
-                         block_shape: Optional[List[int]] = None) -> str:
+                         block_shape: Optional[list[int]] = None) -> str:
     device_name = current_platform.get_device_name().replace(" ", "_")
     dtype_selector = "" if not dtype else f",dtype={dtype}"
     block_shape_selector = ("" if not block_shape or not all(block_shape) else
@@ -638,7 +638,7 @@ def get_moe_configs(
     dtype: Optional[str],
     block_n: Optional[int] = None,
     block_k: Optional[int] = None,
-) -> Optional[Dict[int, Any]]:
+) -> Optional[dict[int, Any]]:
     """
     Return optimized configurations for the fused MoE kernel.
 
@@ -670,7 +670,7 @@ def get_moe_configs(
     return None
 
 
-def get_moe_wna16_block_config(config: Dict[str,
+def get_moe_wna16_block_config(config: dict[str,
                                             int], use_moe_wna16_cuda: bool,
                               num_valid_tokens: int, size_k: int, size_n: int,
                               num_experts: int, group_size: int,
@@ -742,8 +742,8 @@ def get_default_config(
     topk: int,
     dtype: Optional[str],
     is_marlin: bool,
-    block_shape: Optional[List[int]] = None,
-) -> Dict[str, int]:
+    block_shape: Optional[list[int]] = None,
+) -> dict[str, int]:
     if dtype == "fp8_w8a8" and block_shape is not None:
         # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
         # BLOCK_SIZE_K must be divisible by block_shape[1]
@@ -795,13 +795,13 @@ def get_default_config(
 
 
 def try_get_optimal_moe_config(
-    w1_shape: Tuple[int, ...],
-    w2_shape: Tuple[int, ...],
+    w1_shape: tuple[int, ...],
+    w2_shape: tuple[int, ...],
     top_k: int,
     dtype: Optional[str],
     M: int,
     is_marlin: bool = False,
-    block_shape: Optional[List[int]] = None,
+    block_shape: Optional[list[int]] = None,
 ):
     from vllm.model_executor.layers.fused_moe import get_config
     override_config = get_config()
@@ -855,7 +855,7 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")
 
@@ -895,7 +895,7 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
     e_score_correction_bias: Optional[torch.Tensor] = None
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
 
     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")
@@ -982,7 +982,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
                          w2_zp: Optional[torch.Tensor] = None,
                          a1_scale: Optional[torch.Tensor] = None,
                          a2_scale: Optional[torch.Tensor] = None,
-                         block_shape: Optional[List[int]] = None) -> None:
+                         block_shape: Optional[list[int]] = None) -> None:
     fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
                        activation, apply_router_weight_on_input, use_fp8_w8a8,
                        use_int8_w8a8, use_int8_w8a16, use_int4_w4a16,
@@ -1012,7 +1012,7 @@ def inplace_fused_experts_fake(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> None:
+        block_shape: Optional[list[int]] = None) -> None:
     pass
 
 
@@ -1046,7 +1046,7 @@ def outplace_fused_experts(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> torch.Tensor:
+        block_shape: Optional[list[int]] = None) -> torch.Tensor:
     return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
                               False, activation, apply_router_weight_on_input,
                               use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16,
@@ -1076,7 +1076,7 @@ def outplace_fused_experts_fake(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> torch.Tensor:
+        block_shape: Optional[list[int]] = None) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
 
@@ -1129,7 +1129,7 @@ def fused_experts(hidden_states: torch.Tensor,
                  w2_zp: Optional[torch.Tensor] = None,
                  a1_scale: Optional[torch.Tensor] = None,
                  a2_scale: Optional[torch.Tensor] = None,
-                 block_shape: Optional[List[int]] = None,
+                 block_shape: Optional[list[int]] = None,
                  allow_deep_gemm: bool = False) -> torch.Tensor:
     if (allow_deep_gemm and use_fp8_w8a8
             and _valid_deep_gemm(hidden_states, w1, w2, expert_map)):
@@ -1184,8 +1184,8 @@ def moe_kernel_prepare_input(
     use_int8_w8a16: bool,
     use_int4_w4a16: bool,
     per_channel_quant: bool,
-    block_shape: Optional[List[int]] = None,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    block_shape: Optional[list[int]] = None,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     if use_fp8_w8a8:
         assert B_scale is not None
         if block_shape is None:
@@ -1248,7 +1248,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                       w2_zp: Optional[torch.Tensor] = None,
                       a1_scale: Optional[torch.Tensor] = None,
                       a2_scale: Optional[torch.Tensor] = None,
-                      block_shape: Optional[List[int]] = None):
+                      block_shape: Optional[list[int]] = None):
     # Check constraints.
     if use_int4_w4a16:
         assert hidden_states.shape[1] // 2 == w1.shape[
@@ -1452,7 +1452,7 @@ def fused_moe(
     w2_zp: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[List[int]] = None,
+    block_shape: Optional[list[int]] = None,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -1497,7 +1497,7 @@ def fused_moe(
         a1.
     - a2_scale (Optional[torch.Tensor]): Optional scale to be used for
         a2.
-    - block_shape: (Optional[List[int]]): Optional block size for block-wise
+    - block_shape: (Optional[list[int]]): Optional block size for block-wise
         quantization.
 
     Returns:
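
The comment kept in get_default_config above spells out the constraint that block-wise fp8 quantization imposes: BLOCK_SIZE_N must be divisible by block_shape[0] and BLOCK_SIZE_K by block_shape[1]. A toy checker for that rule, using the modernized dict/list annotations (hypothetical helper, not part of vLLM):

def block_shape_compatible(config: dict[str, int], block_shape: list[int]) -> bool:
    # Mirrors the documented divisibility requirement for fp8_w8a8 block quant.
    return (config["BLOCK_SIZE_N"] % block_shape[0] == 0
            and config["BLOCK_SIZE_K"] % block_shape[1] == 0)

assert block_shape_compatible({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128}, [128, 128])
assert not block_shape_compatible({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, [128, 128])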

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 4 additions & 4 deletions

@@ -2,7 +2,7 @@
 
 from abc import abstractmethod
 from enum import Enum
-from typing import Callable, List, Optional, Tuple
+from typing import Callable, Optional
 
 import torch
 import torch.nn.functional as F
@@ -326,7 +326,7 @@ def forward_tpu(
 
 def determine_expert_map(
         ep_size: int, ep_rank: int,
-        global_num_experts: int) -> Tuple[int, Optional[torch.Tensor]]:
+        global_num_experts: int) -> tuple[int, Optional[torch.Tensor]]:
     """
     Calculates how many experts should be assigned to each rank for EP and
     creates a mapping from global to local expert index. Experts are
@@ -338,7 +338,7 @@ def determine_expert_map(
         global_num_experts (int): The total number of experts in the model.
 
     Returns:
-        Tuple[int, Optional[torch.Tensor]]: A tuple containing:
+        tuple[int, Optional[torch.Tensor]]: A tuple containing:
            - local_num_experts (int): The number of experts assigned
                to the current rank.
            - expert_map (Optional[torch.Tensor]): A tensor of shape
@@ -909,7 +909,7 @@ def forward_impl(self, hidden_states: torch.Tensor,
     def make_expert_params_mapping(
            cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
            ckpt_up_proj_name: str,
-           num_experts: int) -> List[Tuple[str, str, int, str]]:
+           num_experts: int) -> list[tuple[str, str, int, str]]:
 
        return [
            # (param_name, weight_name, expert_id, shard_id)
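
A small sketch of the global-to-local expert map that determine_expert_map's docstring describes, for EP rank 1 of 2 over 8 experts. The even split and the -1 sentinel for non-local experts are assumptions for illustration, not necessarily vLLM's exact behavior:

import torch

def toy_expert_map(ep_size: int, ep_rank: int,
                   global_num_experts: int) -> tuple[int, torch.Tensor]:
    # Evenly split experts across ranks; mark non-local experts with -1.
    local_num_experts = global_num_experts // ep_size
    expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
    start = ep_rank * local_num_experts
    expert_map[start:start + local_num_experts] = torch.arange(
        local_num_experts, dtype=torch.int32)
    return local_num_experts, expert_map

local_num_experts, expert_map = toy_expert_map(ep_size=2, ep_rank=1,
                                               global_num_experts=8)
assert local_num_experts == 4 and expert_map[4].item() == 0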

vllm/model_executor/layers/fused_moe/moe_align_block_size.py

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional, Tuple
+from typing import Optional
 
 import torch
 
@@ -153,7 +153,7 @@ def moe_align_block_size(
     num_experts: int,
     expert_map: Optional[torch.Tensor] = None,
     pad_sorted_ids: bool = False
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Aligns the token distribution across experts to be compatible with block
     size for matrix multiplication.
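
A toy illustration of the alignment idea in moe_align_block_size's docstring: each expert's token count is rounded up to a multiple of the block size so the expert GEMMs can be tiled cleanly. The helper and numbers are hypothetical, not the kernel's actual output:

def pad_to_block(tokens_per_expert: list[int], block_size: int) -> list[int]:
    # Round each expert's token count up to a multiple of block_size.
    return [(count + block_size - 1) // block_size * block_size
            for count in tokens_per_expert]

assert pad_to_block([3, 7, 0, 16], block_size=8) == [8, 8, 0, 16]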

vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional, Tuple
+from typing import Optional
 
 import torch
 
@@ -15,7 +15,7 @@ def moe_permute(
     expert_map: Optional[torch.Tensor] = None,
     align_block_size: Optional[int] = None,
     fill_invalid_expert: int = -1
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     This function expands and permutes activation to gather uncontinuous tokens
     for each expert.

vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py

Lines changed: 7 additions & 7 deletions

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 from functools import cache
-from typing import List, Optional, Tuple
+from typing import Optional
 
 import torch
 
@@ -97,7 +97,7 @@ def rocm_aiter_fmoe_fp8_blockscale_g1u1_impl(
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a1_scale: torch.Tensor,
-       block_shape: List[int],
+       block_shape: list[int],
        smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
     from aiter import fmoe_fp8_blockscale_g1u1
     from aiter.fused_moe_bf16_asm import moe_sorting_ck
@@ -142,7 +142,7 @@ def rocm_aiter_fmoe_fp8_blockscale_g1u1_fake(
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a1_scale: torch.Tensor,
-       block_shape: List[int],
+       block_shape: list[int],
        smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
 
     return torch.empty_like(a1, dtype=hidden_states_dtype)
@@ -280,7 +280,7 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor,
                             w2_zp: Optional[torch.Tensor] = None,
                             a1_scale: Optional[torch.Tensor] = None,
                             a2_scale: Optional[torch.Tensor] = None,
-                            block_shape: Optional[List[int]] = None,
+                            block_shape: Optional[list[int]] = None,
                             allow_deep_gemm: bool = False) -> torch.Tensor:
 
     from vllm.model_executor.layers.quantization.utils.fp8_utils import (
@@ -372,14 +372,14 @@ def rocm_aiter_topk_softmax(topk_weights: torch.Tensor,
                            topk_indices: torch.Tensor,
                            token_expert_indices: torch.Tensor,
                            gating_output: torch.Tensor,
-                           renormalize: bool) -> Tuple[torch.Tensor, ...]:
+                           renormalize: bool) -> tuple[torch.Tensor, ...]:
     torch.ops.vllm.rocm_aiter_topk_softmax(topk_weights, topk_indices,
                                            token_expert_indices, gating_output,
                                            renormalize)
     return topk_weights, topk_indices
 
 
-def shuffle_weights(*tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
+def shuffle_weights(*tensors: torch.Tensor) -> tuple[torch.Tensor, ...]:
     """
     Applies shuffle_weight function from AITER to each
     input tensor and returns them.
@@ -395,7 +395,7 @@ def shuffle_weights(*tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
 
 
 def expand_weights(*tensors: torch.Tensor,
-                   expansion_dims: list[int]) -> Tuple[torch.Tensor, ...]:
+                   expansion_dims: list[int]) -> tuple[torch.Tensor, ...]:
     """
     Expands the dimensions of input tensors.
 
vllm/model_executor/layers/fused_moe/utils.py

Lines changed: 4 additions & 4 deletions

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 from math import prod
-from typing import List, Optional, Tuple
+from typing import Optional
 
 import torch
 
@@ -10,7 +10,7 @@
 from vllm.utils import cdiv
 
 
-def _resize_cache(x: torch.Tensor, v: Tuple[int, ...]) -> torch.Tensor:
+def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor:
     """
     Shrink the given tensor and apply the given view to it. This is
     used to resize the intermediate fused_moe caches.
@@ -22,8 +22,8 @@ def _resize_cache(x: torch.Tensor, v: Tuple[int, ...]) -> torch.Tensor:
 def _fp8_quantize(
     A: torch.Tensor,
     A_scale: Optional[torch.Tensor],
-    block_shape: Optional[List[int]],
-) -> Tuple[torch.Tensor, torch.Tensor]:
+    block_shape: Optional[list[int]],
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Perform fp8 quantization on the inputs. If a block_shape
     is provided, the output will be blocked.
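
As a rough mental model of _resize_cache, based only on its docstring rather than the actual implementation: view a prefix of a flat scratch buffer with the requested shape.

from math import prod

import torch

def resize_cache_sketch(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor:
    # Reuse the front of the scratch buffer as a tensor with the requested view.
    assert prod(v) <= x.numel()
    return x.flatten()[:prod(v)].view(*v)

scratch = torch.empty(1024)
cache = resize_cache_sketch(scratch, (4, 16, 8))
assert cache.shape == (4, 16, 8)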
