[P/D] Improve the performance of Layerwise Connector (#5303)

### What this PR does / why we need it?
Improve the performance of the Layerwise Connector. The main changes are:
1. Use event synchronization instead of full stream synchronization (see the sketch after this list).
2. Access the metaserver during scheduling.
3. Transfer the KV cache per chunked-prefill segment.
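
A minimal sketch of point 1 (hypothetical, not this PR's exact code): the producer hands a layer's KV cache to the connector by waiting on an event recorded right after the cache write, instead of draining the whole NPU stream; `send_fn` is an illustrative stand-in for the connector's transfer call.

```python
import torch
import torch_npu  # noqa: F401  # registers the torch.npu device backend

def send_layer_kv_old(kv_cache, send_fn):
    # Before: block the host until every queued NPU kernel has finished.
    torch.npu.current_stream().synchronize()
    send_fn(kv_cache)

def send_layer_kv_new(kv_cache, reshape_cache_event, send_fn):
    # After: block only until the reshape_and_cache write behind this event
    # has completed; unrelated kernels on the stream keep running.
    reshape_cache_event.synchronize()
    send_fn(kv_cache)
```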

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By CI.
- vLLM version: release/v0.13.0
- vLLM main: 5fbfa8d9ef

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
Authored by zxr2333 on 2025-12-31 15:09:01 +08:00, committed by GitHub
parent 7d5242faca
commit 46a1614387
5 changed files with 354 additions and 202 deletions


@@ -176,6 +176,8 @@ class AscendMetadata:
causal: bool = True
# runner_type in model_config.
model_runner_type: str = ""
# prefill reshape_and_cache event
reshape_cache_event: torch.npu.Event = None
# sliding window attention mask
swa_mask: Optional[torch.Tensor] = None
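
The new `reshape_cache_event` field defaults to `None` and is only set on KV-producer (prefill) instances, so a reader of the metadata should guard the wait. A minimal consumer-side sketch (the helper name is hypothetical):

```python
def wait_for_kv_cache_write(attn_metadata):
    # reshape_cache_event stays None on decode-only / non-producer instances;
    # wait only when the prefill path actually recorded it.
    event = attn_metadata.reshape_cache_event
    if event is not None:
        event.synchronize()
```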
@@ -333,6 +335,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
self.key_cache = None
self.value_cache = None
self.is_kv_producer = self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
def full_graph_fia(self, query: torch.Tensor, key: torch.Tensor,
value: torch.Tensor, attn_metadata: AscendMetadata,
@@ -654,6 +657,8 @@ class AscendAttentionBackendImpl(AttentionImpl):
):
if len(kv_cache) > 1:
if self.is_kv_producer:
attn_metadata.reshape_cache_event = torch.npu.Event()
if self.key_cache is None:
self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
slots = attn_metadata.slot_mapping
@@ -674,6 +679,8 @@ class AscendAttentionBackendImpl(AttentionImpl):
key_cache=self.key_cache,
value_cache=self.value_cache,
slot_indices=slots[:attn_metadata.num_actual_tokens])
if self.is_kv_producer:
attn_metadata.reshape_cache_event.record()
return key, value
def forward_impl(
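
Points 1 and 3 combine on the producer into a per-chunk, per-layer transfer loop. The sketch below is hypothetical (the layer map, block ids, and `send_blocks` are illustrative names, not this PR's API): once a chunked-prefill segment has written a layer's KV into the cache, the connector waits on that layer's event and ships only that segment's blocks, overlapping the transfer with the remaining computation.

```python
def transfer_chunk(layer_metadata, chunk_block_ids, send_blocks):
    # layer_metadata: {layer_name: attn_metadata for this prefill segment}
    # chunk_block_ids: paged-cache block ids written by this segment
    for layer_name, attn_metadata in layer_metadata.items():
        event = attn_metadata.reshape_cache_event
        if event is not None:
            event.synchronize()  # this layer's KV for the chunk is now cached
        send_blocks(layer_name, chunk_block_ids)
```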


@@ -166,6 +166,7 @@ class AscendMLAMetadata:
decode: Optional[AscendMLADecodeMetadata] = None
prefill: Optional[AscendMLAPrefillMetadata] = None
reshape_cache_event: torch.npu.Event = None
def __post_init__(self):
pass
@@ -705,6 +706,7 @@ class AscendMLAImpl(MLAAttentionImpl):
kv_sharing_target_layer_name: Optional[str],
**kwargs,
):
self.vllm_config = get_current_vllm_config()
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
@@ -752,6 +754,8 @@ class AscendMLAImpl(MLAAttentionImpl):
self.speculative_config = self.vllm_config.speculative_config
self.enable_mlapo = envs.VLLM_ASCEND_ENABLE_MLAPO
self.is_kv_producer = self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
def _v_up_proj(self, x):
# Convert from (N, B, L)/(N, B, 1, L) to (N, B, L)
x = x.view(self.num_heads, -1, self.kv_lora_rank)
@@ -1351,8 +1355,12 @@ class AscendMLAImpl(MLAAttentionImpl):
prefill_slots = attn_metadata.slot_mapping[
num_decode_tokens:num_actual_tokens]
prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin)
if self.is_kv_producer:
attn_metadata.reshape_cache_event = torch.npu.Event()
prefill_k_pe, prefill_k_c_normed = self.exec_kv_prefill(
prefill_kv_no_split, cos, sin, kv_cache, prefill_slots)
if self.is_kv_producer:
attn_metadata.reshape_cache_event.record()
prefill_k_nope, prefill_value = self.kv_b_proj(
prefill_k_c_normed)[0].view(
-1, self.num_heads,
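
Both attention backends follow the same producer-side lifecycle, condensed below into one hedged sketch (the names mirror the diffs above; `write_kv_into_paged_cache` is a placeholder for `_npu_reshape_and_cache` in the standard backend or `exec_kv_prefill` in MLA): the flag is computed once in `__init__`, the event is created just before the cache write, and it is recorded immediately after.

```python
import torch
import torch_npu  # noqa: F401  # registers the torch.npu device backend

def producer_prefill_step(vllm_config, attn_metadata, write_kv_into_paged_cache):
    # Computed once per backend instance in the real code's __init__.
    is_kv_producer = (vllm_config.kv_transfer_config is not None
                      and vllm_config.kv_transfer_config.is_kv_producer)

    if is_kv_producer:
        # Created just before this layer's KV-cache write.
        attn_metadata.reshape_cache_event = torch.npu.Event()

    write_kv_into_paged_cache()

    if is_kv_producer:
        # Recorded right after the write so the connector can wait on this
        # single event rather than synchronizing the whole stream.
        attn_metadata.reshape_cache_event.record()
```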