[Perf][PCP][DCP] add multi-stream for GQA to enable computation-communication overlap (#5382)
### What this PR does / why we need it?
This PR adds multi-stream for GQA to enable computation-communication
overlap. For chunked prefill, we reduce TTFT by approximately 4%.
### Does this PR introduce _any_ user-facing change?
No
- vLLM version: release/v0.13.0
- vLLM main:
bc0a5a0c08
---------
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
This commit is contained in:
@@ -55,6 +55,7 @@ _PREFETCH_STREAM = None
 _WEIGHT_PREFETCH_METHOD = None
 _GLOBAL_STREAM = None
 _SHARED_EXPERTS_CALCULATION_STREAM = None
+_CP_CHUNKEDPREFILL_COMM_STREAM = None
 _ASCEND_CUSTOMOP_IS_REIGISTERED = False
 _DEFAULT_BUFFER_SIZE = 200
 _MIN_DP_BUFFER_SIZE = 50
@@ -340,6 +341,13 @@ def shared_experts_calculation_stream() -> torch.npu.Stream:
     return _SHARED_EXPERTS_CALCULATION_STREAM


def cp_chunkedprefill_comm_stream() -> torch.npu.Stream:
    """Return the NPU side stream used for context-parallel chunked-prefill
    communication, creating it on first use.

    The stream is a lazily-initialized module-level singleton: the first call
    allocates a dedicated ``torch_npu.npu.Stream`` and caches it in
    ``_CP_CHUNKEDPREFILL_COMM_STREAM``; subsequent calls return the same
    stream object so compute and communication can overlap on it.
    """
    global _CP_CHUNKEDPREFILL_COMM_STREAM
    stream = _CP_CHUNKEDPREFILL_COMM_STREAM
    if stream is None:
        stream = torch_npu.npu.Stream()
        _CP_CHUNKEDPREFILL_COMM_STREAM = stream
    return stream


def adapt_patch(is_global_patch: bool = False):
    if is_global_patch:
        from vllm_ascend.patch import platform  # noqa: F401
Reference in New Issue
Block a user