[P/D] Improve the performance of Layerwise Connector (#5303)

### What this PR does / why we need it? Improves the performance of the Layerwise Connector. The main changes are: 1. Use event synchronization instead of stream synchronization. 2. Access the metaserver when scheduling. 3. Transfer the KV cache for each chunked-prefill segment. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By CI. - vLLM version: release/v0.13.0 - vLLM main: 5fbfa8d9ef --------- Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com> Signed-off-by: liziyu <liziyu16@huawei.com> Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com> Co-authored-by: liziyu <liziyu16@huawei.com> Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
2025-12-31 15:09:01 +08:00
parent 7d5242faca
commit 46a1614387
5 changed files with 354 additions and 202 deletions
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -176,6 +176,8 @@ class AscendMetadata:
    causal: bool = True
    # runner_type in model_config.
    model_runner_type: str = ""
+    # prefill reshape_and_cache event
+    reshape_cache_event: torch.npu.Event = None

    # sliding window attention mask
    swa_mask: Optional[torch.Tensor] = None
@@ -333,6 +335,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
        self.key_cache = None
        self.value_cache = None
+        self.is_kv_producer = self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer

    def full_graph_fia(self, query: torch.Tensor, key: torch.Tensor,
                       value: torch.Tensor, attn_metadata: AscendMetadata,
@@ -654,6 +657,8 @@ class AscendAttentionBackendImpl(AttentionImpl):
    ):

        if len(kv_cache) > 1:
+            if self.is_kv_producer:
+                attn_metadata.reshape_cache_event = torch.npu.Event()
            if self.key_cache is None:
                self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
            slots = attn_metadata.slot_mapping
@@ -674,6 +679,8 @@ class AscendAttentionBackendImpl(AttentionImpl):
                    key_cache=self.key_cache,
                    value_cache=self.value_cache,
                    slot_indices=slots[:attn_metadata.num_actual_tokens])
+            if self.is_kv_producer:
+                attn_metadata.reshape_cache_event.record()
        return key, value

    def forward_impl(