[BugFix]Fix eplb problems when using dynamic eplb. (#3364)

### What this PR does / why we need it? When using dynamic eplb,it will be blocking by nz tensor.We fix these prolems by clone src tensor and recv tensor. ### Does this PR introduce any user-facing change? ### How was this patch tested? Qwen3_moe in A3. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: offline0806 <3337230449@qq.com> Co-authored-by: offline0806 <3337230449@qq.com>
2025-10-11 14:04:02 +08:00
parent ca05f7d632
commit 82b6c846ca
8 changed files with 58 additions and 34 deletions
--- a/vllm_ascend/ops/common_fused_moe.py
+++ b/vllm_ascend/ops/common_fused_moe.py
@@ -23,6 +23,7 @@ from vllm.config import CompilationLevel, get_current_vllm_config
 from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group,
                              tensor_model_parallel_all_reduce)
 from vllm.forward_context import get_forward_context
+from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
 from vllm.model_executor.layers.fused_moe.layer import (
    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
@@ -185,13 +186,23 @@ class AscendFusedMoE(FusedMoE):
                                                    os.R_OK):
            self.expert_load_balancer = ExpertLoadBalancer(
                self.expert_map_path, self.global_num_experts)
-            self.local_num_experts, self.expert_map = (
-                self.expert_load_balancer.get_rank_placement_map(
-                    self.moe_instance_id, self.ep_rank))
-            self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
-                self.moe_instance_id, self.ep_rank).npu()
            self.global_redundant_expert_num = (
                self.expert_load_balancer.get_global_redundant_expert_num())
+            try:
+                self.local_num_experts, self.expert_map = (
+                    self.expert_load_balancer.get_rank_placement_map(
+                        self.moe_instance_id, self.ep_rank))
+                self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
+                    self.moe_instance_id, self.ep_rank).npu()
+            except Exception as e:
+                logger.warning(
+                    f"Init expert map of mtp/eagle when using sample.{e}")
+                self.local_num_experts, self.expert_map = determine_default_expert_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num)
+                self.log2phy = determine_default_log2phy_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num).npu()
        else:
            # init moe.
            self.local_num_experts, self.expert_map = determine_expert_map(
@@ -227,6 +238,7 @@ class AscendFusedMoE(FusedMoE):
        if (self.quant_method.__class__.__name__
                in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod")):
            moe_quant_params["intermediate_size_full"] = intermediate_size
+        self.quant_method.create_weights(layer=self, **moe_quant_params)

        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

--- a/vllm_ascend/ops/moe/moe_comm_method.py
+++ b/vllm_ascend/ops/moe/moe_comm_method.py
@@ -150,7 +150,8 @@ class MoECommMethod(ABC):
                                       with_quant=use_int8_w8a8
                                       or use_int4_w4a8,
                                       fusion=use_int8_w8a8,
-                                       need_trans=need_trans)
+                                       need_trans=need_trans,
+                                       dynamic_eplb=dynamic_eplb)

        final_hidden_states = self.token_dispatcher.token_combine(
            hidden_states=mlp_output)
--- a/vllm_ascend/ops/moe/moe_mlp.py
+++ b/vllm_ascend/ops/moe/moe_mlp.py
@@ -63,7 +63,8 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
                    dynamic_scale: torch.Tensor = None,
                    w1_scale_bias: torch.Tensor = None,
                    w2_scale_bias: torch.Tensor = None,
-                    fusion: bool = False) -> torch.Tensor:
+                    fusion: bool = False,
+                    dynamic_eplb: bool = False) -> torch.Tensor:
    if dynamic_scale is None:
        unquantized_hidden_states = hidden_states
        hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
@@ -79,7 +80,7 @@ def quant_apply_mlp(hidden_states: torch.Tensor,

    is_mc2 = get_forward_context().moe_comm_type == MoECommType.MC2
    if w1_scale_bias is None and is_mc2:
-        if fusion:
+        if fusion and not dynamic_eplb:
            # gmm1: gate_up_proj & act_fn: swiglu
            hidden_states, swiglu_out_scale, _ = torch_npu.npu_grouped_matmul_swiglu_quant(
                x=hidden_states,
@@ -134,7 +135,7 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
            # TODO w4a8 scene: dynamic acquisition of dtype in the future
            _output_dtype = torch.bfloat16

-        if fusion:
+        if fusion and not dynamic_eplb:
            # gmm1: gate_up_proj & act_fn: swiglu
            hidden_states, swiglu_out_scale, _ = torch_npu.npu_grouped_matmul_swiglu_quant(
                x=hidden_states,
@@ -229,7 +230,8 @@ def unified_apply_mlp(hidden_states: torch.Tensor,
                      topk_scales: Optional[torch.Tensor] = None,
                      with_quant: bool = False,
                      fusion: bool = False,
-                      need_trans: bool = True) -> torch.Tensor:
+                      need_trans: bool = True,
+                      dynamic_eplb: bool = False) -> torch.Tensor:
    if with_quant:
        return quant_apply_mlp(hidden_states=hidden_states,
                               w1=w1,
@@ -241,7 +243,8 @@ def unified_apply_mlp(hidden_states: torch.Tensor,
                               group_list_type=group_list_type,
                               w1_scale_bias=w1_scale_bias,
                               w2_scale_bias=w2_scale_bias,
-                               fusion=fusion)
+                               fusion=fusion,
+                               dynamic_eplb=dynamic_eplb)
    else:
        return unquant_apply_mlp(hidden_states=hidden_states,
                                 w1=w1,