[Cherry-pick] Port MoE multi-stream fix to v0.11.0-dev (#3753)

This PR moves the communication operation of shared experts out of the extra
stream, because I found that issuing it on the extra stream can cause
rtMemcpy-related errors when running shared-experts multi-stream with aclgraph.

Furthermore, I use a global variable as the extra stream object so that a new
stream is not allocated for each layer in full-graph mode.
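
In other words, only the shared-expert compute runs on the extra stream; the collective stays on the default stream, and the extra stream is a lazily created, process-wide singleton. Below is a minimal sketch of that pattern. It assumes the torch_npu stream API mirrors torch.cuda (Stream, current_stream, wait_stream, and the stream() context manager); the function and tensor names are illustrative and are not the actual vllm-ascend code.

```python
import torch
import torch_npu  # Ascend NPU backend; importing it enables torch_npu.npu APIs

_EXTRA_STREAM = None  # one process-wide extra stream, reused by every MoE layer


def _extra_stream():
    # Lazily create the extra stream once instead of per layer, so that
    # full-graph mode does not allocate a new stream for every layer.
    global _EXTRA_STREAM
    if _EXTRA_STREAM is None:
        _EXTRA_STREAM = torch_npu.npu.Stream()
    return _EXTRA_STREAM


def shared_experts_forward(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    main = torch_npu.npu.current_stream()
    extra = _extra_stream()

    # Shared-expert compute runs on the extra stream so it can overlap
    # with routed-expert work issued on the main stream.
    extra.wait_stream(main)          # inputs produced on main are ready
    with torch_npu.npu.stream(extra):
        y = torch.matmul(x, w)       # compute only; no collectives here

    # The communication op stays on the default stream (the point of this PR);
    # issuing it from the extra stream is what triggered rtMemcpy errors
    # under aclgraph capture.
    main.wait_stream(extra)
    y.record_stream(main)            # keep the allocator from reusing y too early
    torch.distributed.all_reduce(y)  # communication issued on the main stream
    return y
```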

Signed-off-by: whx-sjtu <2952154980@qq.com>
whx
2025-10-25 15:51:43 +08:00
committed by GitHub
parent 1bc61031e5
commit a58ff9e92f
3 changed files with 25 additions and 13 deletions


@@ -52,6 +52,7 @@ _IS_310P = None
 _SLEEP_MODE_ENABLED = None
 _CURRENT_STREAM = None
 _PREFETCH_STREAM = None
+_SHARED_EXPERTS_COMPUTE_STREAM = None
 _ASCEND_CUSTOMOP_IS_REIGISTERED = False
 _DEFAULT_BUFFER_SIZE = 200
 _MIN_DP_BUFFER_SIZE = 50
@@ -259,6 +260,15 @@ def prefetch_stream() -> torch.npu.Stream:
     return _PREFETCH_STREAM
 
 
+def shared_experts_compute_stream() -> torch.npu.Stream:
+    global _SHARED_EXPERTS_COMPUTE_STREAM
+    if _SHARED_EXPERTS_COMPUTE_STREAM is None:
+        # when this function is called before any stream is set,
+        # we return the default stream.
+        _SHARED_EXPERTS_COMPUTE_STREAM = torch_npu.npu.Stream()
+    return _SHARED_EXPERTS_COMPUTE_STREAM
+
+
 def adapt_patch(is_global_patch: bool = False):
     if is_global_patch:
         from vllm_ascend.patch import platform  # noqa: F401
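
For context, a caller of the new helper would look roughly like the sketch below. This is a hypothetical consumer only; the real MoE-layer changes are in the other files of this commit and are not shown here, and the import path is assumed.

```python
import torch_npu
from vllm_ascend.utils import shared_experts_compute_stream  # assumed import path


def run_shared_experts(x, shared_mlp):
    main = torch_npu.npu.current_stream()
    extra = shared_experts_compute_stream()  # same global stream for every layer
    extra.wait_stream(main)
    with torch_npu.npu.stream(extra):
        out = shared_mlp(x)                  # compute only; no collectives here
    main.wait_stream(extra)
    out.record_stream(main)                  # out was allocated on the extra stream
    return out
```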