[Perf][PCP][DCP] add multi-stream for GQA to enable computation-communication overlap (#5382)

### What this PR does / why we need it?
This PR adds multi-stream for GQA to enable computation-communication
overlap. For chunked prefill, we reduce TTFT by approximately 4%.

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: release/v0.13.0
- vLLM main: bc0a5a0c08

---------

Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
This commit is contained in:
Qiu
2026-01-04 16:33:18 +08:00
committed by GitHub
parent 37fd48bee5
commit 7c210225a2
5 changed files with 276 additions and 224 deletions

View File

@@ -55,6 +55,7 @@ _PREFETCH_STREAM = None
# Lazily-initialized process-wide singletons; `None` until first use.
_WEIGHT_PREFETCH_METHOD = None
_GLOBAL_STREAM = None
_SHARED_EXPERTS_CALCULATION_STREAM = None
# Side stream for context-parallel chunked-prefill communication, created on
# first call to cp_chunkedprefill_comm_stream().
_CP_CHUNKEDPREFILL_COMM_STREAM = None
_ASCEND_CUSTOMOP_IS_REIGISTERED = False
# NOTE(review): buffer sizes appear to be element/entry counts — units not
# visible from this chunk; confirm against their consumers.
_DEFAULT_BUFFER_SIZE = 200
_MIN_DP_BUFFER_SIZE = 50
@@ -340,6 +341,13 @@ def shared_experts_calculation_stream() -> torch.npu.Stream:
return _SHARED_EXPERTS_CALCULATION_STREAM
def cp_chunkedprefill_comm_stream() -> torch.npu.Stream:
    """Return the singleton NPU side stream for context-parallel
    chunked-prefill communication, creating it on first use.

    The stream is cached in the module-level global
    ``_CP_CHUNKEDPREFILL_COMM_STREAM`` so every caller in the process
    shares one stream (enables computation/communication overlap).
    """
    global _CP_CHUNKEDPREFILL_COMM_STREAM
    if _CP_CHUNKEDPREFILL_COMM_STREAM is not None:
        return _CP_CHUNKEDPREFILL_COMM_STREAM
    # First call: allocate the dedicated communication stream once.
    _CP_CHUNKEDPREFILL_COMM_STREAM = torch_npu.npu.Stream()
    return _CP_CHUNKEDPREFILL_COMM_STREAM
def adapt_patch(is_global_patch: bool = False):
if is_global_patch:
from vllm_ascend.patch import platform # noqa: F401