[CI] Enable test_initialization to run on V1 #16736

Merged: 11 commits, May 23, 2025
8 changes: 4 additions & 4 deletions .buildkite/test-pipeline.yaml
@@ -399,10 +399,10 @@ steps:
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
# There are memory leak issues with these models
- pytest -v -s models/test_initialization.py -k 'llama4'
- pytest -v -s models/test_initialization.py -k 'plamo2'

- label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd]
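
Dropping the `VLLM_USE_V1=0` prefix means these initialization tests now exercise whatever engine vLLM selects by default, which is V1 on current main; the llama4 and plamo2 cases stay in separate invocations because of the memory-leak note above. A minimal sketch of the environment-variable behaviour the change relies on (the `selected_engine` helper below is hypothetical, not part of vLLM):

```python
import os


# Hypothetical helper mirroring how the CI change behaves: "0" forces the
# legacy V0 engine, anything else (including unset) leaves V1 enabled.
def selected_engine() -> str:
    return "V0" if os.environ.get("VLLM_USE_V1") == "0" else "V1"


# With the VLLM_USE_V1=0 prefix removed from the pytest commands, the suite
# runs against V1.
os.environ.pop("VLLM_USE_V1", None)
assert selected_engine() == "V1"
```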
6 changes: 6 additions & 0 deletions tests/models/test_initialization.py
@@ -37,6 +37,12 @@ def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
"num_local_experts": 2,
})

if hasattr(hf_config, "vision_config"):
hf_config.vision_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
})

return hf_config

# Avoid calling model.forward()
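
The new `hf_overrides` branch keeps multimodal models cheap to build by shrinking any vision tower to a single layer. A rough sketch of its effect, using a bare `transformers.PretrainedConfig` as a stand-in for a real model config (the field names follow the diff; a given model usually honours only one of `num_layers`/`num_hidden_layers`):

```python
from transformers import PretrainedConfig

# Stand-in for a multimodal HF config with a 32-layer vision tower.
cfg = PretrainedConfig(vision_config=PretrainedConfig(num_hidden_layers=32))

# Same logic as the added branch: only touch configs that have a vision tower.
if hasattr(cfg, "vision_config"):
    cfg.vision_config.update({
        "num_layers": 1,
        "num_hidden_layers": 1,
    })

assert cfg.vision_config.num_hidden_layers == 1
```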
32 changes: 8 additions & 24 deletions vllm/model_executor/models/grok1.py
@@ -21,13 +21,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Grok1 model."""
from typing import Iterable, List, Optional, Set, Tuple, Union
from typing import Iterable, Optional, Set, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from vllm.attention import Attention, AttentionMetadata
from vllm.attention import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -182,25 +182,20 @@ def __init__(
quant_config=quant_config,
logits_soft_cap=attn_logits_soft_cap,
prefix=f"{prefix}.attn")
self.attn_multiplier = getattr(self.config, "attn_output_multiplier",
1.0) if self.config else 1.0

def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)

# Apply attention output multiplier if specified in config
attn_multiplier = getattr(self.config, "attn_output_multiplier",
None) if self.config else None
if attn_multiplier is not None:
output = output * attn_multiplier
output *= self.attn_multiplier
return output
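
The grok1 edits follow the V1 attention interface: `kv_cache` and `attn_metadata` are no longer threaded through every `forward`, since the `Attention` layer is expected to look them up itself (in V1, via the forward context), and the optional `attn_output_multiplier` is resolved once in `__init__` so the hot path is a single unconditional multiply. A simplified sketch of the resulting calling convention (illustrative only, not the actual vLLM classes):

```python
import torch
from torch import nn


class AttentionBlockSketch(nn.Module):
    """Illustrative sketch mirroring the post-diff per-layer signature."""

    def __init__(self, attn: nn.Module, config=None):
        super().__init__()
        self.attn = attn
        # Resolved once at construction; 1.0 makes the multiply a no-op for
        # models that do not define attn_output_multiplier.
        self.attn_multiplier = getattr(config, "attn_output_multiplier",
                                       1.0) if config else 1.0

    def forward(self, positions: torch.Tensor,
                hidden_states: torch.Tensor) -> torch.Tensor:
        # positions is kept to mirror the diff's signature (it feeds rotary
        # embeddings in the real layer). No kv_cache / attn_metadata here: the
        # attention backend is assumed to fetch them from the forward context.
        attn_output = self.attn(hidden_states)
        return attn_output * self.attn_multiplier
```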


@@ -261,8 +256,6 @@ def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
residual: Optional[torch.Tensor],
) -> Tuple[torch.Tensor, torch.Tensor]:
# Self Attention
@@ -276,8 +269,6 @@
hidden_states = self.attn(
positions=positions,
hidden_states=hidden_states,
kv_cache=kv_cache,
attn_metadata=attn_metadata,
)

# Post attention normalization
@@ -341,8 +332,6 @@ def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors],
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
@@ -359,9 +348,7 @@

for i in range(self.start_layer, self.end_layer):
layer = self.layers[i]
hidden_states, residual = layer(positions, hidden_states,
kv_caches[i - self.start_layer],
attn_metadata, residual)
hidden_states, residual = layer(positions, hidden_states, residual)

if not get_pp_group().is_last_rank:
return IntermediateTensors({
@@ -532,13 +519,10 @@ def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
hidden_states = self.model(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors,
hidden_states = self.model(input_ids, positions, intermediate_tensors,
inputs_embeds)
return hidden_states

15 changes: 9 additions & 6 deletions vllm/utils.py
@@ -2619,14 +2619,17 @@ def wrapper(*args, **kwargs):

# Only relevant for models using ALiBi (e.g, MPT)
def check_use_alibi(model_config: ModelConfig) -> bool:
return (getattr(model_config.hf_text_config, "alibi", False) # Falcon
cfg = model_config.hf_text_config
return (getattr(cfg, "alibi", False) # Falcon
or ("BloomForCausalLM" in getattr(model_config.hf_config,
"architectures", [])) # Bloom
or getattr(model_config.hf_text_config, "position_encoding_type",
"") == "alibi" # codellm_1b_alibi
or
(hasattr(model_config.hf_text_config, "attn_config") # MPT
and model_config.hf_text_config.attn_config.get("alibi", False)))
or getattr(cfg, "position_encoding_type", "") ==
"alibi" # codellm_1b_alibi
or (hasattr(cfg, "attn_config") # MPT
and ((isinstance(cfg.attn_config, dict)
and cfg.attn_config.get("alibi", False)) or
(not isinstance(cfg.attn_config, dict)
and getattr(cfg.attn_config, "alibi", False)))))


def sha256(input) -> int:
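
The `check_use_alibi` rewrite mainly adds handling for MPT-style configs where `attn_config` may be either a plain dict or a config object, so the alibi flag has to be read with `.get()` in one case and `getattr()` in the other. A minimal standalone sketch of that dict-or-object check (the helper name is hypothetical):

```python
from typing import Any


def attn_config_uses_alibi(cfg: Any) -> bool:
    """Sketch of the MPT branch: tolerate attn_config as dict or object."""
    attn_config = getattr(cfg, "attn_config", None)
    if attn_config is None:
        return False
    if isinstance(attn_config, dict):
        return bool(attn_config.get("alibi", False))
    return bool(getattr(attn_config, "alibi", False))
```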