[P/D] Improve the performance of Layerwise Connector (#5303)
### What this PR does / why we need it?
Improve the performance of the Layerwise Connector. The main changes
are the following:
1. Use event synchronization instead of stream synchronization.
2. Access the metaserver when scheduling.
3. Transfer the KV cache for each chunked-prefill segment.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By CI.
- vLLM version: release/v0.13.0
- vLLM main:
5fbfa8d9ef
---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
This commit is contained in:
@@ -166,6 +166,7 @@ class AscendMLAMetadata:
|
||||
|
||||
decode: Optional[AscendMLADecodeMetadata] = None
|
||||
prefill: Optional[AscendMLAPrefillMetadata] = None
|
||||
reshape_cache_event: torch.npu.Event = None
|
||||
|
||||
def __post_init__(self):
|
||||
pass
|
||||
@@ -705,6 +706,7 @@ class AscendMLAImpl(MLAAttentionImpl):
|
||||
kv_sharing_target_layer_name: Optional[str],
|
||||
**kwargs,
|
||||
):
|
||||
self.vllm_config = get_current_vllm_config()
|
||||
self.num_heads = num_heads
|
||||
self.head_size = head_size
|
||||
self.scale = float(scale)
|
||||
@@ -752,6 +754,8 @@ class AscendMLAImpl(MLAAttentionImpl):
|
||||
self.speculative_config = self.vllm_config.speculative_config
|
||||
self.enable_mlapo = envs.VLLM_ASCEND_ENABLE_MLAPO
|
||||
|
||||
self.is_kv_producer = self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
|
||||
|
||||
def _v_up_proj(self, x):
|
||||
# Convert from (N, B, L)/(N, B, 1, L) to (N, B, L)
|
||||
x = x.view(self.num_heads, -1, self.kv_lora_rank)
|
||||
@@ -1351,8 +1355,12 @@ class AscendMLAImpl(MLAAttentionImpl):
|
||||
prefill_slots = attn_metadata.slot_mapping[
|
||||
num_decode_tokens:num_actual_tokens]
|
||||
prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin)
|
||||
if self.is_kv_producer:
|
||||
attn_metadata.reshape_cache_event = torch.npu.Event()
|
||||
prefill_k_pe, prefill_k_c_normed = self.exec_kv_prefill(
|
||||
prefill_kv_no_split, cos, sin, kv_cache, prefill_slots)
|
||||
if self.is_kv_producer:
|
||||
attn_metadata.reshape_cache_event.record()
|
||||
prefill_k_nope, prefill_value = self.kv_b_proj(
|
||||
prefill_k_c_normed)[0].view(
|
||||
-1, self.num_heads,
|
||||
|
||||
Reference in New Issue
Block a user