diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index e2d4a8de605..065715cbde4 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -370,7 +370,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 4fb68e7b48d..7e2b7c862e5 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -267,7 +267,6 @@ def __init__(
         quant_config = vllm_config.quant_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 82684dfa730..5847c50862e 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -725,7 +725,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.config = config
-        self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 137bfc0f98c..68284a018af 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -851,7 +851,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index c04e7a02bae..f0212f37657 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -339,7 +339,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index cf244ff572c..548f913c83c 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -570,7 +570,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         if get_pp_group().is_first_rank:
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 79939f6f40e..7d01dd37826 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -313,7 +313,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         lora_config = vllm_config.lora_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         lora_vocab = ((lora_config.lora_extra_vocab_size *
                        (lora_config.max_loras or 1)) if lora_config else 0)
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index 7c226ea47bd..e892a1a4fc6 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -592,7 +592,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.config = config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model)
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 6f4b6cdda33..51c79ba846c 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -255,7 +255,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.config = config
         self.multimodal_config = multimodal_config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.text_config.vocab_size
         self.image_token_id = _IMAGE_TOKEN_ID
         self.image_feature_size = config.patch_size**2 * config.num_channels
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 201e15d3a30..eba8207d2cd 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -260,7 +260,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         lora_config = vllm_config.lora_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 9b56874a8ad..5152539c68f 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -252,7 +252,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
-        self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 3e0b3c768b6..19d5a4c2599 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -404,7 +404,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         self.config = config
-        self.padding_idx = self.config.text_config.pad_token_id
         self.vocab_size = self.config.text_config.vocab_size
         self.vision_model = Idefics3VisionTransformer(
             config.vision_config,
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 41ca399b9ef..520b85c0cdf 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -261,7 +261,6 @@ def __init__(
         quant_config = vllm_config.quant_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.tok_embeddings = VocabParallelEmbedding(
             config.vocab_size,
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 58eccd6a6b8..4ac83c6ece1 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -271,7 +271,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         lora_config = vllm_config.lora_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         lora_vocab = ((lora_config.lora_extra_vocab_size *
                        (lora_config.max_loras or 1)) if lora_config else 0)
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index a0aff9e609d..81b5d9bda9a 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -302,7 +302,6 @@ def __init__(self,
         self.config = config
         self.quant_config = quant_config
-        self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index 46b9182f2d7..7a525ad8e49 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -90,7 +90,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         is_lora_enabled = bool(lora_config)
         self.config = config
-        self.padding_idx = config.pad_token_id
         lora_vocab = ((lora_config.lora_extra_vocab_size *
                        (lora_config.max_loras or 1)) if lora_config else 0)
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 34e1f3927a9..cf03396a9ca 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -365,7 +365,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.config = config
         self.cache_config = cache_config
         self.quant_config = quant_config
-        self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index c8dea557e57..b21aa601879 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -254,7 +254,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
-        self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 21b52d9f54c..8a893b6d858 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -302,7 +302,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 2a829bf0e61..b1ccd8e851c 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1031,7 +1031,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(config.vocab_size + 8,
                                                    config.hidden_size)
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 3b86b91465c..a2b49494968 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -300,7 +300,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         lora_config = vllm_config.lora_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index e27ff5deace..8e72b36e7e5 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -252,7 +252,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index e4775478a54..d4c2b4c48d9 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -200,7 +200,6 @@ def __init__(
     ):
         super().__init__()
         self.config = config
-        self.padding_idx = config.pad_token_id
         self.max_target_positions = config.max_position_embeddings
         self.vocab_size = config.vocab_size
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index 6668ede91ee..0b42666e02d 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -217,7 +217,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index c35c7e9fcce..d14425f4a70 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -441,7 +441,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
-        self.padding_idx = config.pad_token_id
         lora_vocab = ((lora_config.lora_extra_vocab_size *
                        (lora_config.max_loras or 1)) if lora_config else 0)
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index fe615c41aea..c4d02e5ddeb 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -284,7 +284,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.config = config
         self.quant_config = quant_config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         if get_pp_group().is_first_rank or (config.tie_word_embeddings
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 41536b34b2f..92a66568b0f 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -325,7 +325,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index 0f9e517aeb5..1cae0a7fe0d 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -269,7 +269,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         lora_config = vllm_config.lora_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         lora_vocab = ((lora_config.lora_extra_vocab_size *
                        (lora_config.max_loras or 1)) if lora_config else 0)
         self.vocab_size = config.vocab_size + lora_vocab
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 90098af9dde..3d11dfd7792 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -212,10 +212,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         self.config = config
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
-        # TODO: consider padding_idx (currently removed)
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 1cb026f4bcd..8ed68bd89e5 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -49,10 +49,7 @@ class WhisperAudioInputs(TypedDict):
 
 class WhisperPositionalEmbedding(nn.Embedding):
 
-    def __init__(self,
-                 num_positions: int,
-                 embedding_dim: int,
-                 padding_idx: Optional[int] = None):
+    def __init__(self, num_positions: int, embedding_dim: int):
         super().__init__(num_positions, embedding_dim)
 
     def forward(self, position_ids):
@@ -359,7 +356,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         embed_dim = config.d_model
         self.num_mel_bins = config.num_mel_bins
-        self.padding_idx = config.pad_token_id
         self.max_source_positions = config.max_source_positions
         self.embed_scale = (math.sqrt(embed_dim)
                             if config.scale_embedding else 1.0)