Bugfix: Align expert map shapes with redundant experts in EPLB adjustment (#5285)

#### Overview
This PR fixes a shape mismatch bug between `expert_placement_map` and
`log2phy_expert_map` when **redundant experts** are enabled in the
vLLM-Ascend platform. The issue occurred during the initialization of
expert maps and their updates via EPLB (Expert Load Balancer)
adjustment, leading to potential tensor shape errors and incorrect
expert routing in distributed MoE deployments.

#### Key Changes
1. **Unify expert map shape calculation logic**
- Ensure the shape of `expert_placement_map` and `log2phy_expert_map`
strictly aligns with the total number of experts (including redundant
experts) during initialization.
- Update the shape adjustment logic in EPLB dynamic update process to
match the initial expert map dimensions.

2. **Add shape consistency checks**
- Add assertion statements to verify the shape consistency of the two
maps after initialization and EPLB adjustment, preventing silent shape
mismatches in subsequent operations.

#### Impact
- Resolves tensor shape errors when using redundant experts with EPLB on
Ascend platform.
- Ensures correct expert routing and load balancing for MoE models with
redundant expert configurations.
- No breaking changes to existing functionality; compatible with
non-redundant expert deployments.

- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: Che Ruan <cr623@ic.ac.uk>
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
Co-authored-by: Che Ruan <cr623@ic.ac.uk>
Co-authored-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
Mercykid-bash
2026-01-06 17:22:36 +08:00
committed by GitHub
parent fe3f2c7702
commit 29e2f9a43e
10 changed files with 12 additions and 25 deletions

View File

@@ -47,8 +47,8 @@ def test_generate_task_and_state_flow(mock_adaptor):
loader_obj.state = loader.ExpertWeightUpdateState.WAITING
loader_obj.generate_expert_d2d_transfer_task([], [], {}, 0)
assert loader_obj.comm_op_list is None
assert loader_obj.state == loader.ExpertWeightUpdateState.WAITING
assert not loader_obj.comm_op_list
assert loader_obj.state == loader.ExpertWeightUpdateState.READY
def test_asyn_transfer_and_update(mock_adaptor):

View File

@@ -26,7 +26,7 @@ class TestMoECommMethod(TestBase):
self.moe_config.tp_size = 1
self.moe_config.ep_size = 1
self.moe_config.dp_group = MagicMock()
self.moe_config.num_global_redundant_experts = 0
self.moe_config.global_redundant_expert_num = 0
@patch("vllm_ascend.ops.fused_moe.moe_comm_method.get_forward_context")
@patch(

View File

@@ -143,7 +143,7 @@ class TestTokenDispatcherWithMC2(TestBase):
self.dispatcher.need_extra_args = True
self.dispatcher.enable_dispatch_v2 = True
self.dispatcher.moe_expert_num = len(expert_map)
kwargs = self.dispatcher.get_combine_mc_kwargs(hidden_states,
context_metadata)
self.assertIn("tp_send_counts", kwargs)

View File

@@ -50,10 +50,6 @@ class D2DExpertWeightLoader:
)
return
# If neither send nor receive task is needed for this layer on this rank, return
if not (expert_send_info or expert_recv_info):
return
self.updated_expert_map = updated_expert_map
self.layer_id = layer_id

View File

@@ -210,7 +210,7 @@ class AscendFusedMoE(FusedMoE):
self.moe_config.num_experts = self.global_num_experts
self.moe_config.num_local_experts = self.local_num_experts
self.moe_config.original_num_experts = num_experts
self.moe_config.global_redundant_expert_num = self.global_redundant_expert_num
moe_quant_params = {
"num_experts": self.local_num_experts,

View File

@@ -114,7 +114,6 @@ class MoECommMethod(ABC):
dynamic_scale_for_share: Optional[Any] = None,
# For load balance
log2phy: torch.Tensor = None,
global_redundant_expert_num: int = 0,
need_trans: bool = False,
dynamic_eplb: bool = False,
mc2_mask: torch.Tensor = None,
@@ -133,7 +132,8 @@ class MoECommMethod(ABC):
topk_ids=topk_ids,
expert_map=expert_map,
log2phy=log2phy,
global_redundant_expert_num=global_redundant_expert_num,
global_redundant_expert_num=self.moe_config.
global_redundant_expert_num,
shared_experts=shared_experts,
quantized_x_for_share=quantized_x_for_share,
dynamic_scale_for_share=dynamic_scale_for_share,
@@ -290,7 +290,6 @@ class FusedMC2CommImpl(MoECommMethod):
dynamic_scale_for_share: Optional[Any] = None,
# For load balance
log2phy: torch.Tensor = None,
global_redundant_expert_num: int = 0,
need_trans: bool = False,
dynamic_eplb: bool = False,
mc2_mask: torch.Tensor = None,

View File

@@ -152,18 +152,14 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
mc2_mask: torch.Tensor,
global_redundant_expert_num: int = 0,
):
if self.with_quant:
quant_mode = 2
moe_expert_num = len(expert_map)
else:
quant_mode = 0
moe_expert_num = len(expert_map)
quant_mode = 2 if self.with_quant else 0
self.moe_expert_num = len(expert_map) + global_redundant_expert_num
kwargs_mc2 = {
"x": hidden_states,
"expert_ids": topk_ids,
"expert_shard_type": 0,
"shared_expert_rank_num": 0,
"moe_expert_num": moe_expert_num,
"moe_expert_num": self.moe_expert_num,
"global_bs": self.global_bs,
"expert_token_nums_type": 0,
}
@@ -253,7 +249,6 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
expand_scales = context_metadata["expand_scales"]
assert expert_map is not None
moe_expert_num = len(expert_map)
kwargs_mc2 = {
"expand_x": hidden_states,
@@ -261,7 +256,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
"expert_scales": topk_weights.to(torch.float32),
"expert_shard_type": 0,
"shared_expert_rank_num": 0,
"moe_expert_num": moe_expert_num,
"moe_expert_num": self.moe_expert_num,
"global_bs": self.global_bs,
}
@@ -347,7 +342,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
hidden_states = hidden_states * \
topk_weights.to(hidden_states.dtype)
if expert_map is not None:
global_num_experts = len(expert_map)
global_num_experts = len(expert_map) + global_redundant_expert_num
mask = (expert_map[topk_ids] != -1)
topk_weights = topk_weights * mask
first_expert_idx = get_ep_group(

View File

@@ -243,7 +243,6 @@ class AscendW4A16FusedMoEMethod:
use_int4_w4a16=True,
expert_map=expert_map,
log2phy=log2phy,
global_redundant_expert_num=global_redundant_expert_num,
shared_experts=shared_experts,
quantized_x_for_share=quantized_x_for_share,
dynamic_scale_for_share=dynamic_scale_for_share,

View File

@@ -391,7 +391,6 @@ class AscendW4A8DynamicFusedMoEMethod:
use_int4_w4a8=True,
expert_map=expert_map,
log2phy=log2phy,
global_redundant_expert_num=global_redundant_expert_num,
shared_experts=shared_experts,
quantized_x_for_share=quantized_x_for_share,
dynamic_scale_for_share=dynamic_scale_for_share,

View File

@@ -279,7 +279,6 @@ class AscendW8A8DynamicFusedMoEMethod:
use_int8_w8a8=True,
expert_map=expert_map,
log2phy=log2phy,
global_redundant_expert_num=global_redundant_expert_num,
shared_experts=shared_experts,
quantized_x_for_share=quantized_x_for_share,
dynamic_scale_for_share=dynamic_scale_for_share,