[BugFix] Support redundant experts in EPLB (#3473)
This PR adds support for redundant experts in the EPLB. Key points: (1) use global_num_experts = num_experts + num_redundant_experts consistently; (2) backward compatible when num_redundant_experts=0. Tested on a 16-rank setup (W8A8) with static EPLB and expert_map_path, verifying the router logits shape and successful requests. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: yechao237 <yechao20180411@gmail.com>
This commit is contained in:
@@ -40,14 +40,6 @@ def determine_default_expert_map(global_expert_num, world_size, rank_id,
|
||||
end = global_expert_num
|
||||
local_count = global_expert_num - rank_id * local_num_experts
|
||||
|
||||
if isinstance(global_redundant_expert_num,
|
||||
int) and rank_id < global_redundant_expert_num:
|
||||
local_count += 1
|
||||
if end < global_expert_num:
|
||||
end += 1
|
||||
else:
|
||||
start -= 1
|
||||
|
||||
if isinstance(local_count, int):
|
||||
local_ids = torch.arange(local_count, dtype=torch.int32)
|
||||
expert_map[start:end] = local_ids
|
||||
@@ -118,14 +110,6 @@ def determine_default_log2phy_map(global_expert_num, world_size, rank_id,
|
||||
end = global_expert_num
|
||||
local_count = global_expert_num - r * local_num_experts
|
||||
|
||||
if isinstance(global_redundant_expert_num,
|
||||
int) and rank_id < global_redundant_expert_num:
|
||||
local_count += 1
|
||||
if end < global_expert_num:
|
||||
end += 1
|
||||
else:
|
||||
start -= 1
|
||||
|
||||
if isinstance(local_count, int):
|
||||
local_ids = torch.arange(local_count, dtype=torch.int32)
|
||||
expert_map_all[r, start:end] = local_ids
|
||||
|
||||
Reference in New Issue
Block a user