eplb redundant expert bugfix (#4291)
### What this PR does / why we need it?
Fixes a bug in the handling of redundant experts for EPLB.
### Does this PR introduce _any_ user-facing change?
After configuring the expert map path (`expert_map_path`), users no longer need to
configure `init_redundancy_expert`.
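For illustration only, a minimal sketch of the intended usage. The `additional_config` keys and all paths below are assumptions for the example, not something this PR adds:

```python
from vllm import LLM

# Hedged sketch: with an expert placement map supplied via expert_map_path,
# the redundant-expert count is derived from the map file itself, so
# init_redundancy_expert does not have to be set alongside it.
llm = LLM(
    model="/path/to/moe-model",       # placeholder model path
    tensor_parallel_size=16,
    enable_expert_parallel=True,
    additional_config={
        "expert_map_path": "/path/to/expert_map.json",  # placeholder map file
        # "init_redundancy_expert": 16,  # no longer needed when a map path is given
    },
)
```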
### How was this patch tested?
The accuracy of EPLB was tested with and without the use of redundant
experts.
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
---------
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
@@ -43,8 +43,7 @@ from vllm.model_executor.layers.quantization.base_config import \
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import FusedMoEState
 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
-                                              determine_default_log2phy_map)
+from vllm_ascend.eplb.core.eplb_utils import determine_default_log2phy_map
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
 from vllm_ascend.torchair.ops.sequence_parallel import MetadataForPadding
@@ -1042,7 +1041,7 @@ class TorchairAscendFusedMoE(FusedMoE):
                     self.expert_map_path) and os.access(self.expert_map_path,
                                                         os.R_OK):
                 self.expert_load_balancer = ExpertLoadBalancer(
-                    self.expert_map_path, self.global_num_experts)
+                    self.expert_map_path, num_experts)
                 self.expert_load_balancer.check_expert_map_tensor()
                 self.global_redundant_expert_num = (
                     self.expert_load_balancer.get_global_redundant_expert_num())
@@ -1052,15 +1051,14 @@ class TorchairAscendFusedMoE(FusedMoE):
                         self.moe_instance_id, self.ep_rank))
                 self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
                     self.moe_instance_id, self.ep_rank).npu()
+                self.global_num_experts = num_experts + self.global_redundant_expert_num
             except Exception as e:
                 logger.warning(
                     f"Init expert map of mtp/eagle when using sample.{e}")
-                self.local_num_experts, self.expert_map = determine_default_expert_map(
-                    self.global_num_experts, self.ep_size, self.ep_rank,
-                    self.global_redundant_expert_num)
+                self.local_num_experts, self.expert_map = determine_expert_map(
+                    self.ep_size, self.ep_rank, self.global_num_experts)
                 self.log2phy = determine_default_log2phy_map(
-                    self.global_num_experts, self.ep_size, self.ep_rank,
-                    self.global_redundant_expert_num).npu()
+                    self.global_num_experts, self.ep_size, self.ep_rank).npu()
         if self.expert_map is not None and isinstance(
                 self.expert_map, torch.Tensor):
             logger.info_once(
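The hunk above changes the initialization order: the load balancer is constructed with the logical expert count, the redundant-expert count is then read from the map, and only afterwards is `global_num_experts` widened. A hedged, self-contained sketch of that ordering (the stand-in class and numbers below are hypothetical; only the call names mirror the diff):

```python
class _FakeLoadBalancer:
    """Hypothetical stand-in for ExpertLoadBalancer; pretends the map
    file declares 2 redundant physical expert slots."""

    def __init__(self, expert_map_path: str, num_experts: int):
        self.expert_map_path = expert_map_path
        self.num_experts = num_experts  # logical experts, as in the fixed call site

    def get_global_redundant_expert_num(self) -> int:
        return 2


num_experts = 64  # logical experts defined by the checkpoint
balancer = _FakeLoadBalancer("expert_map.json", num_experts)

# Redundancy now comes from the map file, not from init_redundancy_expert.
global_redundant_expert_num = balancer.get_global_redundant_expert_num()

# Only after the map is parsed is the global count widened to physical slots.
global_num_experts = num_experts + global_redundant_expert_num
assert global_num_experts == 66
```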
@@ -1079,13 +1077,8 @@ class TorchairAscendFusedMoE(FusedMoE):
                 self.ep_size, self.ep_rank, self.global_num_experts)
             # dynamic eplb initializing with not expert_map_path
             if self.dynamic_eplb:
-                self.global_redundant_expert_num = ascend_config.init_redundancy_expert
-                self.local_num_experts, self.expert_map = determine_default_expert_map(
-                    self.global_num_experts, self.ep_size, self.ep_rank,
-                    self.global_redundant_expert_num)
                 self.log2phy = determine_default_log2phy_map(
-                    self.global_num_experts, self.ep_size, self.ep_rank,
-                    self.global_redundant_expert_num).npu()
+                    self.global_num_experts, self.ep_size, self.ep_rank).npu()
         if self.expert_map is not None and isinstance(
                 self.expert_map, torch.Tensor):
             logger.info_once(

@@ -990,7 +990,9 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
         # to avoid accumulating too much tokens on a single rank.
         # currently it is only activated when doing profile runs.
         if enable_force_load_balance:
-            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
+            topk_ids = torch.randint_like(
+                topk_ids, 0,
+                global_num_experts - global_redundant_expert_num)

         topk_weights = topk_weights.to(x.dtype)
         if fused_moe_state == FusedMoEState.AllGatherEP:
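The last hunk narrows the fake routing used for force load balancing so it only draws logical expert ids. A small self-contained sketch with made-up sizes:

```python
import torch

global_num_experts = 66            # e.g. 64 logical experts + 2 redundant slots
global_redundant_expert_num = 2
topk_ids = torch.zeros((8, 4), dtype=torch.int64)  # dummy top-k routing table

# Profile-run traffic is now sampled only from the logical expert range
# [0, global_num_experts - global_redundant_expert_num), so the random
# routing never points at redundant physical slots.
topk_ids = torch.randint_like(
    topk_ids, 0, global_num_experts - global_redundant_expert_num)
assert int(topk_ids.max()) < global_num_experts - global_redundant_expert_num
```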