[P/D] Improve the performance of Layerwise Connector (#5303)

### What this PR does / why we need it?
Improve the performance of Layerwise Connector, mainly includes the
following points:
1. Use event synchronization instead of stream synchronization.
2. Access the metaserver during scheduling.
3. Transfer the KV cache once per chunked-prefill segment.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By CI.
- vLLM version: release/v0.13.0
- vLLM main:
5fbfa8d9ef

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
This commit is contained in:
zxr2333
2025-12-31 15:09:01 +08:00
committed by GitHub
parent 7d5242faca
commit 46a1614387
5 changed files with 354 additions and 202 deletions

View File

@@ -166,6 +166,7 @@ class AscendMLAMetadata:
decode: Optional[AscendMLADecodeMetadata] = None
prefill: Optional[AscendMLAPrefillMetadata] = None
reshape_cache_event: torch.npu.Event = None
def __post_init__(self):
    # Intentionally a no-op: all dataclass fields (including the new
    # reshape_cache_event added by this change) are plain defaults and
    # need no post-construction validation or derivation.
    pass
@@ -705,6 +706,7 @@ class AscendMLAImpl(MLAAttentionImpl):
kv_sharing_target_layer_name: Optional[str],
**kwargs,
):
self.vllm_config = get_current_vllm_config()
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
@@ -752,6 +754,8 @@ class AscendMLAImpl(MLAAttentionImpl):
self.speculative_config = self.vllm_config.speculative_config
self.enable_mlapo = envs.VLLM_ASCEND_ENABLE_MLAPO
self.is_kv_producer = self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
def _v_up_proj(self, x):
# Convert from (N, B, L)/(N, B, 1, L) to (N, B, L)
x = x.view(self.num_heads, -1, self.kv_lora_rank)
@@ -1351,8 +1355,12 @@ class AscendMLAImpl(MLAAttentionImpl):
prefill_slots = attn_metadata.slot_mapping[
num_decode_tokens:num_actual_tokens]
prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin)
if self.is_kv_producer:
attn_metadata.reshape_cache_event = torch.npu.Event()
prefill_k_pe, prefill_k_c_normed = self.exec_kv_prefill(
prefill_kv_no_split, cos, sin, kv_cache, prefill_slots)
if self.is_kv_producer:
attn_metadata.reshape_cache_event.record()
prefill_k_nope, prefill_value = self.kv_b_proj(
prefill_k_c_normed)[0].view(
-1, self.num_heads,