Bugfix: Align expert map shapes with redundant experts in EPLB adjustment (#5285)
#### Overview
This PR fixes a shape mismatch between `expert_placement_map` and
`log2phy_expert_map` when **redundant experts** are enabled on the
vLLM-Ascend platform. The mismatch arose both when the expert maps were
first initialized and when they were updated by an EPLB (Expert
Parallelism Load Balancer) adjustment, leading to tensor shape errors
and incorrect expert routing in distributed MoE deployments.
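To make the invariant concrete, here is a minimal standalone sketch; the variable names, sizes, and dtypes are illustrative assumptions, not the actual vLLM-Ascend initialization code:

```python
import torch

# Assumed example counts (not from the PR itself).
num_logical_experts = 256           # experts defined by the model
global_redundant_expert_num = 16    # extra replicas managed by EPLB
total_physical_experts = num_logical_experts + global_redundant_expert_num

# Hypothetical stand-ins for expert_placement_map and log2phy_expert_map:
# both must be allocated over the *total* physical expert count, or later
# indexing and scatter operations hit a shape mismatch.
expert_placement_map = torch.full((total_physical_experts,), -1, dtype=torch.int32)
log2phy_expert_map = torch.full((total_physical_experts,), -1, dtype=torch.int32)
assert expert_placement_map.shape == log2phy_expert_map.shape
```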
#### Key Changes
1. **Unify expert map shape calculation logic**
- Size both `expert_placement_map` and `log2phy_expert_map` to the total
number of physical experts (logical plus redundant) at initialization,
as illustrated in the sketch above.
- Update the shape adjustment logic in the EPLB dynamic update path so
it matches the initial expert map dimensions.
2. **Add shape consistency checks**
- Add assertions that verify the two maps share a shape after
initialization and after each EPLB adjustment, preventing silent shape
mismatches in subsequent operations (see the sketch after this list).
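A minimal sketch of the kind of check item 2 describes; the helper name `_verify_expert_map_shapes` is hypothetical, not the PR's actual function:

```python
import torch


def _verify_expert_map_shapes(expert_placement_map: torch.Tensor,
                              log2phy_expert_map: torch.Tensor) -> None:
    """Fail fast instead of letting a shape mismatch surface downstream."""
    assert expert_placement_map.shape == log2phy_expert_map.shape, (
        f"expert map shape mismatch: expert_placement_map "
        f"{tuple(expert_placement_map.shape)} vs log2phy_expert_map "
        f"{tuple(log2phy_expert_map.shape)}")
```

Called once after initialization and once after every EPLB adjustment, a check like this turns silent routing corruption into an immediate, attributable failure.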
#### Impact
- Resolves tensor shape errors when using redundant experts with EPLB on
the Ascend platform.
- Ensures correct expert routing and load balancing for MoE models with
redundant expert configurations.
- No breaking changes to existing functionality; compatible with
non-redundant expert deployments.
- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: Che Ruan <cr623@ic.ac.uk>
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
Co-authored-by: Che Ruan <cr623@ic.ac.uk>
Co-authored-by: shenchuxiaofugui <1311027364@qq.com>
```diff
@@ -210,7 +210,7 @@ class AscendFusedMoE(FusedMoE):
         self.moe_config.num_experts = self.global_num_experts
         self.moe_config.num_local_experts = self.local_num_experts
-        self.moe_config.original_num_experts = num_experts
+        self.moe_config.global_redundant_expert_num = self.global_redundant_expert_num
 
         moe_quant_params = {
             "num_experts": self.local_num_experts,
```
```diff
@@ -114,7 +114,6 @@ class MoECommMethod(ABC):
         dynamic_scale_for_share: Optional[Any] = None,
         # For load balance
         log2phy: torch.Tensor = None,
-        global_redundant_expert_num: int = 0,
         need_trans: bool = False,
         dynamic_eplb: bool = False,
         mc2_mask: torch.Tensor = None,
```
```diff
@@ -133,7 +132,8 @@ class MoECommMethod(ABC):
             topk_ids=topk_ids,
             expert_map=expert_map,
             log2phy=log2phy,
-            global_redundant_expert_num=global_redundant_expert_num,
+            global_redundant_expert_num=self.moe_config.
+            global_redundant_expert_num,
             shared_experts=shared_experts,
             quantized_x_for_share=quantized_x_for_share,
             dynamic_scale_for_share=dynamic_scale_for_share,
```
```diff
@@ -290,7 +290,6 @@ class FusedMC2CommImpl(MoECommMethod):
         dynamic_scale_for_share: Optional[Any] = None,
         # For load balance
         log2phy: torch.Tensor = None,
-        global_redundant_expert_num: int = 0,
         need_trans: bool = False,
         dynamic_eplb: bool = False,
         mc2_mask: torch.Tensor = None,
```
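The three hunks above move `global_redundant_expert_num` off the per-call argument lists of `MoECommMethod` and `FusedMC2CommImpl` and instead read it from `self.moe_config`, where the first hunk stores it at initialization. This keeps a single source of truth for the redundant-expert count rather than threading it through every call site.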
```diff
@@ -152,18 +152,14 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
         mc2_mask: torch.Tensor,
         global_redundant_expert_num: int = 0,
     ):
-        if self.with_quant:
-            quant_mode = 2
-            moe_expert_num = len(expert_map)
-        else:
-            quant_mode = 0
-            moe_expert_num = len(expert_map)
+        quant_mode = 2 if self.with_quant else 0
+        self.moe_expert_num = len(expert_map) + global_redundant_expert_num
         kwargs_mc2 = {
             "x": hidden_states,
             "expert_ids": topk_ids,
             "expert_shard_type": 0,
             "shared_expert_rank_num": 0,
-            "moe_expert_num": moe_expert_num,
+            "moe_expert_num": self.moe_expert_num,
             "global_bs": self.global_bs,
             "expert_token_nums_type": 0,
         }
```
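Note the design choice in the hunk above: the physical expert count is now computed once at dispatch time as `len(expert_map) + global_redundant_expert_num` and cached on `self.moe_expert_num`, so the combine path (next two hunks) reuses the same count instead of recomputing `len(expert_map)` and silently dropping the redundant replicas.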
```diff
@@ -253,7 +249,6 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
         expand_scales = context_metadata["expand_scales"]
 
         assert expert_map is not None
-        moe_expert_num = len(expert_map)
 
         kwargs_mc2 = {
             "expand_x": hidden_states,
```
```diff
@@ -261,7 +256,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
             "expert_scales": topk_weights.to(torch.float32),
             "expert_shard_type": 0,
             "shared_expert_rank_num": 0,
-            "moe_expert_num": moe_expert_num,
+            "moe_expert_num": self.moe_expert_num,
             "global_bs": self.global_bs,
         }
```
```diff
@@ -347,7 +342,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
         hidden_states = hidden_states * \
             topk_weights.to(hidden_states.dtype)
         if expert_map is not None:
-            global_num_experts = len(expert_map)
+            global_num_experts = len(expert_map) + global_redundant_expert_num
             mask = (expert_map[topk_ids] != -1)
             topk_weights = topk_weights * mask
             first_expert_idx = get_ep_group(
```
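The all-gather path gets the same off-by-redundancy correction. With the assumed example counts from the earlier sketch, the difference is:

```python
# Assumed example values (not from the PR itself).
len_expert_map = 256                # logical experts, len(expert_map)
global_redundant_expert_num = 16    # redundant replicas

old_global_num_experts = len_expert_map                                 # 256
new_global_num_experts = len_expert_map + global_redundant_expert_num  # 272
```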