diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index bc697ef93b3..21e6fe7a226 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -42,7 +42,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
 
         if not sparse_cutlass_supported():
             raise ValueError(
-                "Sparse CUTLASS not supported. vLLM must be built with"
+                "Sparse CUTLASS not supported. vLLM must be built with "
                 "CUDA 12.2 or later to use this feature")
 
         self.output_dtype = params_dtype
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 91225c0ddc9..5b97eced62d 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -390,8 +390,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 6517422697c..989056bf5c1 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -440,8 +440,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index ff1f1c2a939..b2aa3c0709b 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -452,8 +452,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         for name, loaded_weight in weights:
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index ac679d6ff43..eab3bf0756f 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -533,8 +533,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 56343ca9a71..08298cc0db3 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -316,8 +316,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 67e04b57658..ddd2d7a16b2 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -475,8 +475,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 4667f275ecd..a5bd418801f 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -105,9 +105,9 @@ def __init__(self,
                  max_position_embeddings: int = 8192,
                  quant_config: Optional[QuantizationConfig] = None,
                  bias: bool = False,
+                 bias_o_proj: bool = False,
                  cache_config: Optional[CacheConfig] = None,
-                 prefix: str = "",
-                 bias_o_proj: bool = False) -> None:
+                 prefix: str = "") -> None:
         super().__init__()
         layer_idx = extract_layer_index(prefix)
         self.hidden_size = hidden_size
@@ -397,8 +397,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 2c8895e8429..da415cdae96 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -431,8 +431,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index bd261f31499..2554281610a 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1432,8 +1432,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index e7875e6fb88..2340283b696 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -492,8 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index dc76818e22c..881c09ea9db 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -626,8 +626,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index b9c259ad73c..d015f60c6d0 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -367,8 +367,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index b27d2b10850..37c5a4b5713 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -492,8 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
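The compressed_tensors_24.py hunk fixes an implicit string-concatenation bug: Python joins adjacent string literals with no separator, so the missing trailing space rendered the error message as "...built withCUDA 12.2...". A minimal sketch of the failure mode, using the message text from the hunk above:

```python
# Adjacent string literals concatenate with no implicit separator.
broken = ("Sparse CUTLASS not supported. vLLM must be built with"
          "CUDA 12.2 or later to use this feature")
fixed = ("Sparse CUTLASS not supported. vLLM must be built with "
         "CUDA 12.2 or later to use this feature")

assert "withCUDA" in broken   # words run together
assert "with CUDA" in fixed   # trailing space restores the gap
```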
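The remaining hunks all reword the same comment in the kv-cache-scale branch of each model's `load_weights`. For readers unfamiliar with the pattern, here is a self-contained sketch of how that branch behaves; `QuantConfig`, `default_weight_loader`, and the tensor names below are toy stand-ins for illustration, not the real vLLM classes:

```python
from typing import Optional


class QuantConfig:
    """Toy stand-in: maps a checkpoint tensor name to the name of a
    kv-cache scale parameter, or returns None for non-scale tensors."""

    def get_cache_scale(self, name: str) -> Optional[str]:
        if name.endswith(".k_scale"):
            return name.replace(".k_scale", ".attn.k_scale")
        return None


def default_weight_loader(param: dict, value: float) -> None:
    param["data"] = value


quant_config: Optional[QuantConfig] = QuantConfig()
params_dict = {"layers.0.attn.k_scale": {"data": None}}

for name, loaded_weight in [("layers.0.k_scale", 0.5),
                            ("layers.0.mlp.weight", 1.0)]:
    # The walrus operator both tests for a remapped scale name and binds
    # it; non-scale tensors fall through to the regular loading path.
    if (quant_config is not None and
            (scale_name := quant_config.get_cache_scale(name))):
        # Loading kv cache quantization scales
        param = params_dict[scale_name]
        weight_loader = getattr(param, "weight_loader",
                                default_weight_loader)
        weight_loader(param, loaded_weight)
        continue
    # ... regular weight loading would continue here ...

assert params_dict["layers.0.attn.k_scale"]["data"] == 0.5
```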