[P/D] Improve the performance of Layerwise Connector (#5303)
### What this PR does / why we need it?
Improve the performance of the Layerwise Connector. The main changes are:
1. Use event synchronization instead of stream synchronization.
2. Access the metaserver during scheduling.
3. Transfer the KV cache for each chunked-prefill segment.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By CI.
- vLLM version: release/v0.13.0
- vLLM main: 5fbfa8d9ef
---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
This commit is contained in:
@@ -176,6 +176,8 @@ class AscendMetadata:
|
||||
causal: bool = True
|
||||
# runner_type in model_config.
|
||||
model_runner_type: str = ""
|
||||
# prefill reshape_and_cache event
|
||||
reshape_cache_event: torch.npu.Event = None
|
||||
|
||||
# sliding window attention mask
|
||||
swa_mask: Optional[torch.Tensor] = None
|
||||
@@ -333,6 +335,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
||||
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||
self.key_cache = None
|
||||
self.value_cache = None
|
||||
self.is_kv_producer = self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
|
||||
|
||||
def full_graph_fia(self, query: torch.Tensor, key: torch.Tensor,
|
||||
value: torch.Tensor, attn_metadata: AscendMetadata,
|
||||
@@ -654,6 +657,8 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
||||
):
|
||||
|
||||
if len(kv_cache) > 1:
|
||||
if self.is_kv_producer:
|
||||
attn_metadata.reshape_cache_event = torch.npu.Event()
|
||||
if self.key_cache is None:
|
||||
self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
|
||||
slots = attn_metadata.slot_mapping
|
||||
@@ -674,6 +679,8 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
||||
key_cache=self.key_cache,
|
||||
value_cache=self.value_cache,
|
||||
slot_indices=slots[:attn_metadata.num_actual_tokens])
|
||||
if self.is_kv_producer:
|
||||
attn_metadata.reshape_cache_event.record()
|
||||
return key, value
|
||||
|
||||
def forward_impl(
|
||||
|
||||
Reference in New Issue
Block a user