[Feature] Support moe multi-stream for aclgraph. (#2946)

This PR puts the calculation of shared experts into a separate stream, overlaping with routing experts. - vLLM version: v0.10.2 - vLLM main: fbd6523ac0 --------- Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-19 11:06:45 +08:00
parent 0c04bf1e36
commit 0a526768f5
14 changed files with 170 additions and 49 deletions
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -61,6 +61,8 @@ class AscendConfig:
        self.enable_shared_expert_dp = additional_config.get(
            "enable_shared_expert_dp", False
        ) and not self.torchair_graph_config.enabled and vllm_config.parallel_config.enable_expert_parallel
+        self.multistream_overlap_shared_expert = additional_config.get(
+            "multistream_overlap_shared_expert", False)
        self.enable_prefetch = additional_config.get("enable_prefetch", False)
        self.lmhead_tensor_parallel_size = additional_config.get(
            "lmhead_tensor_parallel_size", None)
@@ -110,8 +112,6 @@ class TorchairGraphConfig:
            "graph_batch_sizes_init", False)
        self.enable_multistream_mla = torchair_graph_config.get(
            "enable_multistream_mla", False)
-        self.enable_multistream_moe = torchair_graph_config.get(
-            "enable_multistream_moe", False)
        self.enable_view_optimize = torchair_graph_config.get(
            "enable_view_optimize", True)
        self.enable_frozen_parameter = torchair_graph_config.get(
@@ -148,10 +148,6 @@ class TorchairGraphConfig:
                raise RuntimeError(
                    "enable_multistream_mla is valid only when Torchair graph mode is enabled"
                )
-            if self.enable_multistream_moe:
-                raise RuntimeError(
-                    "enable_multistream_moe is valid only when Torchair graph mode is enabled"
-                )
            if self.enable_kv_nz:
                raise RuntimeError(
                    "enable_kv_nz is valid only when Torchair graph mode is enabled"