eplb redundant expert bugfix (#4291)

### What this PR does / why we need it? Redundant experts bugfix ### Does this PR introduce _any_ user-facing change? After configuring the path for experts_map, users do not need to configure iinit_redundancy_expert. ### How was this patch tested? The accuracy of EPLB was tested with and without the use of redundant experts. - vLLM version: v0.11.0 - vLLM main: 2918c1b49c --------- Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
2025-11-21 14:24:35 +08:00
parent 5a4e8cdeba
commit 019c7ded91
10 changed files with 63 additions and 140 deletions
--- a/vllm_ascend/quantization/w4a8_dynamic.py
+++ b/vllm_ascend/quantization/w4a8_dynamic.py
@@ -342,7 +342,7 @@ class AscendW4A8DynamicFusedMoEMethod:
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        is_prefill: bool = True,
-        enable_force_load_balance: bool = True,
+        enable_force_load_balance: bool = False,
        log2phy: torch.Tensor = None,
        global_redundant_expert_num: int = 0,
        shared_experts: Optional[Any] = None,
@@ -371,7 +371,8 @@ class AscendW4A8DynamicFusedMoEMethod:
        # to avoid accumulating too much tokens on a single rank.
        # currently it is only activated when doing profile runs.
        if enable_force_load_balance:
-            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
+            topk_ids = torch.randint_like(
+                topk_ids, 0, global_num_experts - global_redundant_expert_num)

        topk_weights = topk_weights.to(x.dtype)

--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -213,7 +213,7 @@ class AscendW8A8DynamicFusedMoEMethod:
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        is_prefill: bool = True,
-        enable_force_load_balance: bool = True,
+        enable_force_load_balance: bool = False,
        log2phy: torch.Tensor = None,
        global_redundant_expert_num: int = 0,
        shared_experts: Optional[Any] = None,
@@ -242,7 +242,8 @@ class AscendW8A8DynamicFusedMoEMethod:
        # to avoid accumulating too much tokens on a single rank.
        # currently it is only activated when doing profile runs.
        if enable_force_load_balance:
-            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
+            topk_ids = torch.randint_like(
+                topk_ids, 0, global_num_experts - global_redundant_expert_num)

        topk_weights = topk_weights.to(self.in_dtype)