Skip to content

Commit e7d8492

Browse files
authored
Merge pull request #3 from DrownFish19/dev_20250505_add_qwen3
Fix EP for Qwen3Moe
2 parents 8602183 + f060f79 commit e7d8492

File tree

2 files changed: +12 additions, -9 deletions

paddlenlp/transformers/moe_gate.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle.Tensor:
226226
chosen_expert = topk_idx.reshape([-1])
227227
# Shape: [seq_len * k, num_experts].
228228
token_priority = F.one_hot(chosen_expert, self.num_experts).cast(paddle.int32)
229-
token_priority = paddle.logical_and(token_priority > 0, token_priority.cumsum(axis=0) < capacity)
229+
token_priority = paddle.logical_and(token_priority > 0, token_priority.cumsum(axis=0) <= capacity)
230230
# Shape: [seq_len, num_experts].
231231
token_priority = token_priority.reshape([-1, k, self.num_experts]).sum(axis=1)
232232

@@ -532,12 +532,14 @@ def topkgating(
532532
token_priority = self._priority(top_idx, capacity)
533533

534534
# normalize gates
535+
# gates_masked is equal to top_gate.
535536
gates_masked = gates * mask
536-
if self.training:
537-
gates_s = paddle.sum(gates_masked, axis=-1, keepdim=True)
538-
denom_s = paddle.clip(gates_s, min=paddle.finfo(gates_masked.dtype).eps)
539-
if self.norm_topk_prob:
540-
gates_masked = gates_masked / denom_s
537+
# if self.training:
538+
gates_s = paddle.sum(gates_masked, axis=-1, keepdim=True)
539+
denom_s = paddle.clip(gates_s, min=paddle.finfo(gates_masked.dtype).eps)
540+
if self.norm_topk_prob:
541+
gates_masked = gates_masked / denom_s
542+
gates_masked *= self.routed_scaling_factor
541543

542544
return (
543545
capacity,

paddlenlp/transformers/qwen3_moe/modeling.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def __init__(self, config: Qwen3MoeConfig):
8484
config.hidden_size,
8585
top_k=config.num_experts_per_tok,
8686
drop_tokens=False,
87+
norm_topk_prob=config.norm_topk_prob,
8788
)
8889

8990
super().__init__(
@@ -148,7 +149,7 @@ def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
148149
# states by `routing_weights` on the corresponding tokens (top-1 and top-2)
149150

150151
current_state = hidden_states[idx, None].reshape([-1, hidden_dim])
151-
current_hidden_states = expert_layer(current_state) * routing_weights[idx, top_x]
152+
current_hidden_states = expert_layer(current_state) * routing_weights[idx, top_x].unsqueeze(-1)
152153
final_hidden_states.index_add_(
153154
index=idx.reshape([-1]), axis=0, value=current_hidden_states.to(hidden_states.dtype)
154155
)
@@ -165,7 +166,7 @@ def __init__(self, config: Qwen3MoeConfig, layerwise_recompute: bool = False):
165166
self.self_attn = Qwen3MoeAttention(config, layerwise_recompute)
166167

167168
if config.num_experts > 0:
168-
self.mlp = Qwen3MoeSparseMoeBlock(config)
169+
self.mlp = ExpertParallelQwen3MoeSparseMoeBlock(config)
169170
else:
170171
# num_experts == 0 or this layer is not sparse layer
171172
self.mlp = Qwen3MoeMLP(config)
@@ -828,7 +829,7 @@ def prepare_inputs_for_generation(
828829
attention_mask=None,
829830
inputs_embeds=None,
830831
output_router_logits=False,
831-
**kwargs
832+
**kwargs,
832833
):
833834
batch_size, seq_length = input_ids.shape
834835
position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length)))

0 commit comments

Comments (0)