[P/D] Mooncake Layerwise Connector supports hybrid attention manager with multiple KV cache groups (#7022)

### What this PR does / why we need it?
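This PR adds support in the Mooncake Layerwise Connector for the hybrid
attention manager with multiple KV cache groups. When building transfer
mappings, local and remote block IDs are now looked up per KV cache group
via the new `block_group_idx` argument.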

### Does this PR introduce _any_ user-facing change?
Yes.

### How was this patch tested?
By CI.

- vLLM version: v0.16.0
- vLLM main: 15d76f74e2

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
This commit is contained in:
zxr2333
2026-03-10 23:59:20 +08:00
committed by GitHub
parent 0f289fa2a8
commit 239683c7a6
6 changed files with 565 additions and 224 deletions

View File

@@ -261,6 +261,7 @@ def get_transfer_mappings(
pd_head_mapping: dict[int, set],
d_trans_count_mapping: dict[tuple[str, int], int],
req_meta,
block_group_idx: int,
p_parallel_info: parallel_info,
req_id: str,
transed_idx: int,
@@ -272,15 +273,17 @@ def get_transfer_mappings(
transfer_mappings: dict[tuple[str, int], dict[str, Any]] = {}
p_head_group_rank = (tp_rank - dcp_rank) // p_parallel_info.dcp_size
p_block_idxs: list[int] = p_rank_block_mapping[pcp_rank][p_head_group_rank][dcp_rank]
p_block_ids = req_meta.local_block_ids[block_group_idx]
d_block_ids = req_meta.remote_block_ids[block_group_idx]
for p_block_idx, logic_block_idx in enumerate(p_block_idxs):
if logic_block_idx < transed_idx or logic_block_idx >= to_trans_idx:
continue
for d_head_group_rank in pd_head_mapping[p_head_group_rank]:
p_block_id = req_meta.local_block_ids[p_block_idx]
p_block_id = p_block_ids[p_block_idx]
remote_host = d_block_rank_mapping[logic_block_idx][d_head_group_rank]["host"]
remote_port = d_block_rank_mapping[logic_block_idx][d_head_group_rank]["port"]
d_block_idx = d_block_rank_mapping[logic_block_idx][d_head_group_rank]["block_idx"]
d_block_id = req_meta.remote_block_ids[d_block_idx]
d_block_id = d_block_ids[d_block_idx]
if (remote_host, remote_port) not in transfer_mappings:
transfer_mappings[(remote_host, remote_port)] = {
"local_block_ids": [],

View File

@@ -59,6 +59,7 @@ def init_ascend_model_parallel(
global _P_TP
assert _P_TP is None, "distributed prefill tensor parallel group is already initialized"
prefill_tensor_model_parallel_size = pd_tp_ratio
pcp_size = parallel_config.prefill_context_parallel_size
# divide alltoall groups
if pd_head_ratio > 1 and get_current_vllm_config().kv_transfer_config.is_kv_producer:
num_head_replica = get_ascend_config().num_head_replica
@@ -67,13 +68,13 @@ def init_ascend_model_parallel(
group_ranks = all_ranks.view(-1, prefill_tensor_model_parallel_size).unbind(0)
else:
group_ranks = all_ranks.clone().view(
global_dp_size, -1, num_head_replica
global_dp_size * pcp_size, -1, num_head_replica
) # [DP_size, num_head, num_head_replica]
group_ranks = group_ranks.permute(0, 2, 1)
group_ranks = group_ranks.reshape(-1, group_ranks.size(-1)) # [DP_size * num_head_replica, num_head]
alltoall_group_size = group_ranks.size(-1) // remote_tp_size
group_ranks = group_ranks.unsqueeze(-1).view(
global_dp_size, num_head_replica, -1, alltoall_group_size
global_dp_size * pcp_size, num_head_replica, -1, alltoall_group_size
) # [DP_size, num_head_replica, num_alltoall_group, alltoall_group_size]
group_ranks = group_ranks.reshape(-1, alltoall_group_size).unbind(0)
group_ranks = [x.tolist() for x in group_ranks]