[Feature] Support moe multi-stream for aclgraph. (#2946)

This PR puts the calculation of shared experts into a separate stream, overlaping with routing experts. - vLLM version: v0.10.2 - vLLM main: fbd6523ac0 --------- Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-19 11:06:45 +08:00
parent 0c04bf1e36
commit 0a526768f5
14 changed files with 170 additions and 49 deletions
--- a/vllm_ascend/torchair/models/torchair_deepseek_v2.py
+++ b/vllm_ascend/torchair/models/torchair_deepseek_v2.py
@@ -322,8 +322,8 @@ class TorchairDeepseekV2MoE(nn.Module):

        ascend_config = get_ascend_config()
        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
-        self.enable_multistream_moe = \
-            ascend_config.torchair_graph_config.enable_multistream_moe and \
+        self.multistream_overlap_shared_expert = \
+            ascend_config.multistream_overlap_shared_expert and \
            self.torchair_graph_enabled

        self.gate = ReplicatedLinear(config.hidden_size,
@@ -364,7 +364,7 @@ class TorchairDeepseekV2MoE(nn.Module):
                hidden_act=config.hidden_act,
                quant_config=quant_config,
                reduce_results=reduce_results,
-                force_replicate=self.enable_multistream_moe
+                force_replicate=self.multistream_overlap_shared_expert
                or enable_shared_expert_dp,
                prefix=f"{prefix}.shared_experts",
            )
@@ -406,7 +406,7 @@ class TorchairDeepseekV2MoE(nn.Module):

        # router_logits: (num_tokens, n_experts)
        router_logits = None
-        if not self.rm_router_logits and not self.enable_multistream_moe:
+        if not self.rm_router_logits and not self.multistream_overlap_shared_expert:
            router_logits, _ = self.gate(hidden_states)

        experts_hidden_states = self.experts(
@@ -524,7 +524,7 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
        elif (config.n_routed_experts is not None
              and self.debug_layer_idx >= config.first_k_dense_replace
              and self.debug_layer_idx % config.moe_layer_freq == 0
-              and (ascend_config.torchair_graph_config.enable_multistream_moe
+              and (ascend_config.multistream_overlap_shared_expert
                   or self.enable_shared_expert_dp)):
            self.o_proj = TorchairDeepseekV2RowParallelLinearReplaceAllreduce(
                self.num_heads * self.v_head_dim,
@@ -697,7 +697,7 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
                quant_config=quant_config,
                prefix=f"{prefix}.mlp",
            )
-            self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
+            self.mla_moe_communication = ascend_config.multistream_overlap_shared_expert \
                and model_config.use_mla and self.tp_size > 1
        else:
            self.mlp = TorchairDeepseekV2MLP(