[Perf][PCP][DCP] add multi-stream for GQA to enable computation-communication overlap (#5382)
### What this PR does / why we need it?
This PR adds multi-stream for GQA to enable computation-communication
overlap. For chunked prefill, we reduce TTFT by approximately 4%.
### Does this PR introduce _any_ user-facing change?
No
- vLLM version: release/v0.13.0
- vLLM main:
bc0a5a0c08
---------
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
This commit is contained in:
@@ -55,6 +55,7 @@ _PREFETCH_STREAM = None
 _WEIGHT_PREFETCH_METHOD = None
 _GLOBAL_STREAM = None
 _SHARED_EXPERTS_CALCULATION_STREAM = None
+_CP_CHUNKEDPREFILL_COMM_STREAM = None
 _ASCEND_CUSTOMOP_IS_REIGISTERED = False
 _DEFAULT_BUFFER_SIZE = 200
 _MIN_DP_BUFFER_SIZE = 50
@@ -340,6 +341,13 @@ def shared_experts_calculation_stream() -> torch.npu.Stream:
     return _SHARED_EXPERTS_CALCULATION_STREAM


def cp_chunkedprefill_comm_stream() -> torch.npu.Stream:
    """Return the NPU side stream used for context-parallel chunked-prefill
    communication, creating it on first use.

    The stream is a lazily-initialized module-level singleton: the first call
    allocates a dedicated ``torch_npu.npu.Stream`` and caches it in
    ``_CP_CHUNKEDPREFILL_COMM_STREAM``; subsequent calls return the same
    stream object so compute and communication can overlap on it.
    """
    global _CP_CHUNKEDPREFILL_COMM_STREAM
    stream = _CP_CHUNKEDPREFILL_COMM_STREAM
    if stream is None:
        stream = torch_npu.npu.Stream()
        _CP_CHUNKEDPREFILL_COMM_STREAM = stream
    return stream


def adapt_patch(is_global_patch: bool = False):
    if is_global_patch:
        from vllm_ascend.patch import platform  # noqa: F401
Reference in New Issue
Block a user