[P/D]Mooncake Layerwise Connector supports hybrid attention manager with multiple kvcache groups (#7022)
### What this PR does / why we need it?
Mooncake Layerwise Connector supports hybrid attention manager with
multiple kvcache groups.
### Does this PR introduce _any_ user-facing change?
Yes.
### How was this patch tested?
By CI.
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
This commit is contained in:
@@ -59,6 +59,7 @@ def init_ascend_model_parallel(
|
||||
global _P_TP
|
||||
assert _P_TP is None, "distributed prefill tensor parallel group is already initialized"
|
||||
prefill_tensor_model_parallel_size = pd_tp_ratio
|
||||
pcp_size = parallel_config.prefill_context_parallel_size
|
||||
# divide alltoall groups
|
||||
if pd_head_ratio > 1 and get_current_vllm_config().kv_transfer_config.is_kv_producer:
|
||||
num_head_replica = get_ascend_config().num_head_replica
|
||||
@@ -67,13 +68,13 @@ def init_ascend_model_parallel(
|
||||
group_ranks = all_ranks.view(-1, prefill_tensor_model_parallel_size).unbind(0)
|
||||
else:
|
||||
group_ranks = all_ranks.clone().view(
|
||||
global_dp_size, -1, num_head_replica
|
||||
global_dp_size * pcp_size, -1, num_head_replica
|
||||
) # [DP_size, num_head, num_head_replica]
|
||||
group_ranks = group_ranks.permute(0, 2, 1)
|
||||
group_ranks = group_ranks.reshape(-1, group_ranks.size(-1)) # [DP_size * num_head_replica, num_head]
|
||||
alltoall_group_size = group_ranks.size(-1) // remote_tp_size
|
||||
group_ranks = group_ranks.unsqueeze(-1).view(
|
||||
global_dp_size, num_head_replica, -1, alltoall_group_size
|
||||
global_dp_size * pcp_size, num_head_replica, -1, alltoall_group_size
|
||||
) # [DP_size, num_head_replica, num_alltoall_group, alltoall_group_size]
|
||||
group_ranks = group_ranks.reshape(-1, alltoall_group_size).unbind(0)
|
||||
group_ranks = [x.tolist() for x in group_ranks]
|
||||
|
||||
Reference in New Issue
Block a user