[Feature] Support moe multi-stream for aclgraph. (#2946)
This PR puts the calculation of shared experts into a separate stream,
overlapping it with the computation of the routed experts.
- vLLM version: v0.10.2
- vLLM main:
fbd6523ac0
---------
Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
@@ -61,6 +61,8 @@ class AscendConfig:
|
||||
self.enable_shared_expert_dp = additional_config.get(
|
||||
"enable_shared_expert_dp", False
|
||||
) and not self.torchair_graph_config.enabled and vllm_config.parallel_config.enable_expert_parallel
|
||||
self.multistream_overlap_shared_expert = additional_config.get(
|
||||
"multistream_overlap_shared_expert", False)
|
||||
self.enable_prefetch = additional_config.get("enable_prefetch", False)
|
||||
self.lmhead_tensor_parallel_size = additional_config.get(
|
||||
"lmhead_tensor_parallel_size", None)
|
||||
@@ -110,8 +112,6 @@ class TorchairGraphConfig:
|
||||
"graph_batch_sizes_init", False)
|
||||
self.enable_multistream_mla = torchair_graph_config.get(
|
||||
"enable_multistream_mla", False)
|
||||
self.enable_multistream_moe = torchair_graph_config.get(
|
||||
"enable_multistream_moe", False)
|
||||
self.enable_view_optimize = torchair_graph_config.get(
|
||||
"enable_view_optimize", True)
|
||||
self.enable_frozen_parameter = torchair_graph_config.get(
|
||||
@@ -148,10 +148,6 @@ class TorchairGraphConfig:
|
||||
raise RuntimeError(
|
||||
"enable_multistream_mla is valid only when Torchair graph mode is enabled"
|
||||
)
|
||||
if self.enable_multistream_moe:
|
||||
raise RuntimeError(
|
||||
"enable_multistream_moe is valid only when Torchair graph mode is enabled"
|
||||
)
|
||||
if self.enable_kv_nz:
|
||||
raise RuntimeError(
|
||||
"enable_kv_nz is valid only when Torchair graph mode is enabled"
|
||||
|
||||
Reference in New Issue
Block a user