@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import ast
 import copy
 import enum
@@ -22,42 +24,45 @@
 import torch
 from pydantic import BaseModel, Field, PrivateAttr
 from torch.distributed import ProcessGroup, ReduceOp
-from transformers import PretrainedConfig
 
 import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
-                                                     get_quantization_config)
-from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import CpuArchEnum, current_platform
+from vllm.platforms import CpuArchEnum
 from vllm.sampling_params import GuidedDecodingParams
-from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
     get_sentence_transformer_tokenizer_config, is_encoder_decoder,
     try_get_generation_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
-from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, get_open_port, is_torch_equal_or_newer,
-                        random_uuid, resolve_obj_by_qualname)
+from vllm.utils import (GiB_bytes, LayerBlockType, LazyLoader,
+                        cuda_device_count_stateless, get_cpu_memory,
+                        get_open_port, is_torch_equal_or_newer, random_uuid,
+                        resolve_obj_by_qualname)
 
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
     from ray.util.placement_group import PlacementGroup
+    from transformers import PretrainedConfig
 
     from vllm.executor.executor_base import ExecutorBase
     from vllm.model_executor.layers.quantization.base_config import (
         QuantizationConfig)
     from vllm.model_executor.model_loader.loader import BaseModelLoader
 
     ConfigType = type[DataclassInstance]
+    HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
+                                                 PretrainedConfig]]
 else:
-    QuantizationConfig = None
+    HfOverrides = None
     ConfigType = type
 
+me_quant = LazyLoader("model_executor", globals(),
+                      "vllm.model_executor.layers.quantization")
+me_models = LazyLoader("model_executor", globals(),
+                       "vllm.model_executor.models")
 logger = init_logger(__name__)
 
 ConfigT = TypeVar("ConfigT", bound=ConfigType)
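For readers unfamiliar with the idiom: `LazyLoader` defers the actual import until the first attribute access, which is what lets the eager `vllm.model_executor` imports above be dropped from module import time. A minimal sketch of the pattern (my own illustration of the common importlib-based recipe, not vLLM's implementation; the class name is hypothetical):

```python
import importlib
import types


class _LazyModule(types.ModuleType):
    """Defers importing `module_name` until an attribute is first accessed."""

    def __init__(self, local_name: str, parent_globals: dict,
                 module_name: str):
        super().__init__(module_name)
        self._local_name = local_name
        self._parent_globals = parent_globals
        self._module_name = module_name

    def _load(self) -> types.ModuleType:
        # Import for real, then replace this stub in the parent namespace so
        # subsequent lookups bypass the shim entirely.
        module = importlib.import_module(self._module_name)
        self._parent_globals[self._local_name] = module
        self.__dict__.update(module.__dict__)
        return module

    def __getattr__(self, item: str):
        return getattr(self._load(), item)


# Usage mirroring the diff: nothing under vllm.model_executor is imported
# until an attribute such as QUANTIZATION_METHODS is actually touched.
me_quant = _LazyModule("me_quant", globals(),
                       "vllm.model_executor.layers.quantization")
```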
@@ -89,9 +94,6 @@
     for task in tasks
 }
 
-HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
-                                             PretrainedConfig]]
-
 
 class SupportsHash(Protocol):
 
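The `HfOverrides` alias relocated above accepts either a plain dict or a config-transforming callable. Two hypothetical values satisfying the alias (illustrative only; how vLLM applies each form is outside this diff):

```python
from typing import Any

from transformers import PretrainedConfig

# Dict form: plain keys to merge into the Hugging Face config.
overrides_dict: dict[str, Any] = {"num_hidden_layers": 2}


# Callable form: receives the HF config, returns the (modified) config.
def overrides_fn(config: PretrainedConfig) -> PretrainedConfig:
    config.update({"num_hidden_layers": 2})
    return config
```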
@@ -365,7 +367,7 @@ def __init__(
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
         disable_mm_preprocessor_cache: bool = False,
         override_neuron_config: Optional[dict[str, Any]] = None,
-        override_pooler_config: Optional["PoolerConfig"] = None,
+        override_pooler_config: Optional[PoolerConfig] = None,
         logits_processor_pattern: Optional[str] = None,
         generation_config: str = "auto",
         enable_sleep_mode: bool = False,
@@ -548,7 +550,7 @@ def __init__(
 
     @property
     def registry(self):
-        return ModelRegistry
+        return me_models.ModelRegistry
 
     @property
     def architectures(self) -> list[str]:
@@ -581,7 +583,7 @@ def maybe_pull_model_tokenizer_for_s3(self, model: str,
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[dict[str, int]]
-    ) -> Optional["MultiModalConfig"]:
+    ) -> Optional[MultiModalConfig]:
         if self.registry.is_multimodal_model(self.architectures):
             return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
 
@@ -597,8 +599,8 @@ def _get_encoder_config(self):
 
     def _init_pooler_config(
         self,
-        override_pooler_config: Optional["PoolerConfig"],
-    ) -> Optional["PoolerConfig"]:
+        override_pooler_config: Optional[PoolerConfig],
+    ) -> Optional[PoolerConfig]:
 
         if self.runner_type == "pooling":
             user_config = override_pooler_config or PoolerConfig()
@@ -749,7 +751,8 @@ def _parse_quant_hf_config(self):
         return quant_cfg
 
     def _verify_quantization(self) -> None:
-        supported_quantization = QUANTIZATION_METHODS
+        supported_quantization = me_quant.QUANTIZATION_METHODS
+
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
             "awq_marlin", "fbgemm_fp8", "compressed_tensors",
@@ -766,8 +769,8 @@ def _verify_quantization(self) -> None:
             quant_method = quant_cfg.get("quant_method", "").lower()
 
             # Detect which checkpoint is it
-            for name in QUANTIZATION_METHODS:
-                method = get_quantization_config(name)
+            for name in me_quant.QUANTIZATION_METHODS:
+                method = me_quant.get_quantization_config(name)
                 quantization_override = method.override_quantization_method(
                     quant_cfg, self.quantization)
                 if quantization_override:
@@ -799,6 +802,8 @@ def _verify_quantization(self) -> None:
                 "non-quantized models.", self.quantization)
 
     def _verify_cuda_graph(self) -> None:
+        from vllm.platforms import current_platform
+
         if self.max_seq_len_to_capture is None:
             self.max_seq_len_to_capture = self.max_model_len
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
@@ -885,7 +890,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
 
     def verify_with_parallel_config(
         self,
-        parallel_config: "ParallelConfig",
+        parallel_config: ParallelConfig,
     ) -> None:
 
         if parallel_config.distributed_executor_backend == "external_launcher":
@@ -1038,7 +1043,7 @@ def get_total_num_kv_heads(self) -> int:
             # equal to the number of attention heads.
             return self.hf_text_config.num_attention_heads
 
-    def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
+    def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
         """Returns the number of KV heads per GPU."""
         if self.use_mla:
             # When using MLA during decode it becomes MQA
@@ -1052,13 +1057,12 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
         return max(1,
                    total_num_kv_heads // parallel_config.tensor_parallel_size)
 
-    def get_num_attention_heads(self,
-                                parallel_config: "ParallelConfig") -> int:
+    def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int:
         num_heads = getattr(self.hf_text_config, "num_attention_heads", 0)
         return num_heads // parallel_config.tensor_parallel_size
 
     def get_layers_start_end_indices(
-            self, parallel_config: "ParallelConfig") -> tuple[int, int]:
+            self, parallel_config: ParallelConfig) -> tuple[int, int]:
         from vllm.distributed.utils import get_pp_indices
         if self.hf_text_config.model_type == "deepseek_mtp":
             total_num_hidden_layers = getattr(self.hf_text_config,
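A worked example of the KV-head arithmetic at the top of the hunk above (illustrative numbers, not from the diff): the floor division shards KV heads across tensor-parallel ranks, and the `max(1, ...)` clamp means ranks replicate a KV head rather than ending up with zero when `tensor_parallel_size` exceeds the head count.

```python
# Illustrative only: the sharding math from get_num_kv_heads.
total_num_kv_heads = 8
assert max(1, total_num_kv_heads // 4) == 2   # tp=4: 2 KV heads per rank
assert max(1, total_num_kv_heads // 16) == 1  # tp=16: replicated, never 0
```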
@@ -1073,13 +1077,13 @@ def get_layers_start_end_indices(
         start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size)
         return start, end
 
-    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
+    def get_num_layers(self, parallel_config: ParallelConfig) -> int:
         start, end = self.get_layers_start_end_indices(parallel_config)
         return end - start
 
     def get_num_layers_by_block_type(
         self,
-        parallel_config: "ParallelConfig",
+        parallel_config: ParallelConfig,
         block_type: LayerBlockType = LayerBlockType.attention,
     ) -> int:
         # This function relies on 'layers_block_type' in hf_config,
@@ -1132,7 +1136,7 @@ def get_num_layers_by_block_type(
 
         return sum(t == 1 for t in attn_type_list[start:end])
 
-    def get_multimodal_config(self) -> "MultiModalConfig":
+    def get_multimodal_config(self) -> MultiModalConfig:
         """
         Get the multimodal configuration of the model.
 
@@ -1241,7 +1245,7 @@ def runner_type(self) -> RunnerType:
     @property
     def is_v1_compatible(self) -> bool:
         architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.is_v1_compatible(architectures)
+        return me_models.ModelRegistry.is_v1_compatible(architectures)
 
     @property
     def is_matryoshka(self) -> bool:
@@ -1392,7 +1396,7 @@ def _verify_prefix_caching(self) -> None:
 
     def verify_with_parallel_config(
         self,
-        parallel_config: "ParallelConfig",
+        parallel_config: ParallelConfig,
     ) -> None:
         total_cpu_memory = get_cpu_memory()
         # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
@@ -1460,7 +1464,7 @@ class LoadConfig:
     """Configuration for loading the model weights."""
 
     load_format: Union[str, LoadFormat,
-                       "BaseModelLoader"] = LoadFormat.AUTO.value
+                       BaseModelLoader] = LoadFormat.AUTO.value
     """The format of the model weights to load:\n
     - "auto" will try to load the weights in the safetensors format and fall
       back to the pytorch bin format if safetensors format is not available.\n
@@ -1582,11 +1586,11 @@ def data_parallel_rank_local(self, value: int) -> None:
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
 
-    placement_group: Optional["PlacementGroup"] = None
+    placement_group: Optional[PlacementGroup] = None
     """ray distributed model workers placement group."""
 
     distributed_executor_backend: Optional[Union[DistributedExecutorBackend,
-                                                 type["ExecutorBase"]]] = None
+                                                 type[ExecutorBase]]] = None
     """Backend to use for distributed model
     workers, either "ray" or "mp" (multiprocessing). If the product
     of pipeline_parallel_size and tensor_parallel_size is less than
@@ -1629,7 +1633,7 @@ def get_next_dp_init_port(self) -> int:
         self.data_parallel_master_port += 1
         return answer
 
-    def stateless_init_dp_group(self) -> "ProcessGroup":
+    def stateless_init_dp_group(self) -> ProcessGroup:
         from vllm.distributed.utils import (
             stateless_init_torch_distributed_process_group)
 
@@ -1644,7 +1648,7 @@ def stateless_init_dp_group(self) -> "ProcessGroup":
         return dp_group
 
     @staticmethod
-    def has_unfinished_dp(dp_group: "ProcessGroup",
+    def has_unfinished_dp(dp_group: ProcessGroup,
                           has_unfinished: bool) -> bool:
         tensor = torch.tensor([has_unfinished],
                               dtype=torch.int32,
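The hunk above only shows the flag tensor being built; as a sketch of the collective it presumably feeds (an assumption on my part: a MAX all-reduce over int32 flags, which acts as a logical OR across data-parallel ranks — the reduction op itself is not visible in this hunk):

```python
import torch
import torch.distributed as dist


def any_rank_unfinished(dp_group: dist.ProcessGroup,
                        has_unfinished: bool) -> bool:
    # Encode the local flag as int32; after the MAX all-reduce, every rank
    # sees 1 iff any rank reported unfinished work.
    tensor = torch.tensor([has_unfinished], dtype=torch.int32)
    dist.all_reduce(tensor, op=dist.ReduceOp.MAX, group=dp_group)
    return bool(tensor.item())
```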
@@ -2227,7 +2231,7 @@ def compute_hash(self) -> str:
         return hash_str
 
     @classmethod
-    def from_dict(cls, dict_value: dict) -> "SpeculativeConfig":
+    def from_dict(cls, dict_value: dict) -> SpeculativeConfig:
         """Parse the CLI value for the speculative config."""
         return cls(**dict_value)
 
@@ -2819,7 +2823,7 @@ def compute_hash(self) -> str:
         return hash_str
 
     @staticmethod
-    def from_json(json_str: str) -> "PoolerConfig":
+    def from_json(json_str: str) -> PoolerConfig:
         return PoolerConfig(**json.loads(json_str))
 
 
@@ -3176,6 +3180,7 @@ def compute_hash(self) -> str:
         return hash_str
 
     def __post_init__(self):
+        from vllm.tracing import is_otel_available, otel_import_error_traceback
         if not is_otel_available() and self.otlp_traces_endpoint is not None:
             raise ValueError(
                 "OpenTelemetry is not available. Unable to configure "
@@ -3239,7 +3244,7 @@ def compute_hash(self) -> str:
         return hash_str
 
     @classmethod
-    def from_cli(cls, cli_value: str) -> "KVTransferConfig":
+    def from_cli(cls, cli_value: str) -> KVTransferConfig:
         """Parse the CLI value for the kv cache transfer config."""
         return KVTransferConfig.model_validate_json(cli_value)
 
@@ -3476,7 +3481,7 @@ def __repr__(self) -> str:
     __str__ = __repr__
 
     @classmethod
-    def from_cli(cls, cli_value: str) -> "CompilationConfig":
+    def from_cli(cls, cli_value: str) -> CompilationConfig:
         """Parse the CLI value for the compilation config."""
         if cli_value in ["0", "1", "2", "3"]:
             return cls(level=int(cli_value))
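A quick usage note for the path visible in this hunk: the strings "0" through "3" map directly to compilation levels (whatever handles other inputs lies outside this hunk).

```python
# Usage implied by the hunk above (illustrative).
cfg = CompilationConfig.from_cli("3")  # same as CompilationConfig(level=3)
assert cfg.level == 3
```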
@@ -3528,7 +3533,7 @@ def model_post_init(self, __context: Any) -> None:
         self.static_forward_context = {}
         self.compilation_time = 0.0
 
-    def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
+    def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]:
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")
 
@@ -3744,9 +3749,7 @@ def _get_quantization_config(
         """Get the quantization config."""
        from vllm.platforms import current_platform
        if model_config.quantization is not None:
-            from vllm.model_executor.model_loader.weight_utils import (
-                get_quant_config)
-            quant_config = get_quant_config(model_config, load_config)
+            quant_config = me_quant.get_quant_config(model_config, load_config)
             capability_tuple = current_platform.get_device_capability()
 
             if capability_tuple is not None:
@@ -3770,7 +3773,7 @@ def with_hf_config(
         self,
         hf_config: PretrainedConfig,
         architectures: Optional[list[str]] = None,
-    ) -> "VllmConfig":
+    ) -> VllmConfig:
         if architectures is not None:
             hf_config = copy.deepcopy(hf_config)
             hf_config.architectures = architectures