[Feature] Support moe multi-stream for aclgraph. (#2946)
This PR puts the calculation of shared experts into a separate stream,
overlaping with routing experts.
- vLLM version: v0.10.2
- vLLM main:
fbd6523ac0
---------
Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
@@ -43,4 +43,4 @@ vllm serve model_path \
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "enable_multistream_moe":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true}'
|
||||
'{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true, "enable_multistream_moe":false}'
|
||||
@@ -29,4 +29,4 @@ vllm serve Qwen/Qwen1.5-MoE-A2.7B \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "enable_multistream_moe":false, "use_cached_graph":false}}'
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "use_cached_graph":false}}'
|
||||
|
||||
Reference in New Issue
Block a user