[Feature] Support moe multi-stream for aclgraph. (#2946)

This PR moves the computation of shared experts onto a separate stream so that it overlaps with the computation of the routed experts. - vLLM version: v0.10.2 - vLLM main: fbd6523ac0 --------- Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-19 11:06:45 +08:00
parent 0c04bf1e36
commit 0a526768f5
14 changed files with 170 additions and 49 deletions
--- a/vllm_ascend/ops/common_fused_moe.py
+++ b/vllm_ascend/ops/common_fused_moe.py
@@ -37,7 +37,7 @@ from vllm_ascend.ops.moe.experts_selector import select_experts
 from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
                                                 AlltoAllCommImpl, MC2CommImpl,
                                                 NaiveMulticastCommImpl)
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, npu_stream_switch

 original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__

@@ -426,24 +426,39 @@ class AscendSharedFusedMoE(AscendFusedMoE):
        super().__init__(**kwargs)
        self._shared_experts = shared_experts
        self.use_overlapped = use_overlapped
+        self.shared_expert_stream = None
+        ascend_config = get_ascend_config()
+        self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert
+        if self.multistream_overlap_shared_expert:
+            self.shared_expert_stream = torch.npu.Stream()

    def forward(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
-        shared_out = self._shared_experts(hidden_states)
+        # Make sure the shared experts stream begins after hidden_states are ready.
+        if self.multistream_overlap_shared_expert:
+            self.shared_expert_stream.wait_stream(  # type: ignore
+                torch.npu.current_stream())
+        with npu_stream_switch(self.shared_expert_stream,
+                               enabled=self.multistream_overlap_shared_expert):
+            # Use a separate stream to run shared experts.
+            shared_out = self._shared_experts(hidden_states)

-        # NOTE: This is exactly the opposite of `maybe_all_reduce_tensor_model_parallel`
-        forward_context = get_forward_context()
-        moe_comm_method_name = forward_context.moe_comm_method_name
-        if moe_comm_method_name in {"alltoallcommimpl", "mc2commimpl"}:
-            shared_out = tensor_model_parallel_all_reduce(shared_out)
+            # NOTE: This is exactly the opposite of `maybe_all_reduce_tensor_model_parallel`
+            forward_context = get_forward_context()
+            moe_comm_method_name = forward_context.moe_comm_method_name
+            if moe_comm_method_name in {"alltoallcommimpl", "mc2commimpl"}:
+                shared_out = tensor_model_parallel_all_reduce(shared_out)

        fused_out = super().forward(
            hidden_states=hidden_states,
            router_logits=router_logits,
        )
+        # Make sure the default stream waits for the shared experts stream to finish.
+        if self.multistream_overlap_shared_expert:
+            torch.npu.current_stream().wait_stream(self.shared_expert_stream)
        return shared_out, fused_out