KVCache Transfer via Layer-wise Strategy in Disaggregation (#2602)

### What this PR does / why we need it?
See RFC: https://github.com/vllm-project/vllm-ascend/issues/2470. This PR
adds a new KV connector for layer-wise KV cache transfer.

### Does this PR introduce _any_ user-facing change?
Yes. A new KV connector is added, and users can now enable layer-wise KV cache transfer.
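
For illustration, a hedged sketch of how a prefill (producer) instance might select a KV connector through vLLM's `KVTransferConfig`. The connector name and model below are placeholders, not necessarily what this PR registers:

```python
# Hypothetical sketch: the connector name is a placeholder; check this PR's
# connector registration code for the actual string.
from vllm import LLM
from vllm.config import KVTransferConfig

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",       # example model
    kv_transfer_config=KVTransferConfig(
        kv_connector="LayerwiseConnector",  # placeholder connector name
        kv_role="kv_producer",              # this instance runs prefill
    ),
)
```
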
### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: leichao.lc <leichao139636@163.com>
Signed-off-by: CaveNightingale <2859066733@qq.com>
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Signed-off-by: hanxinlong <50882499@qq.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Co-authored-by: CaveNightingale <2859066733@qq.com>
Co-authored-by: nwpu-zxr <zhouxuerong2@huawei.com>
Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: hanxinlong <50882499@qq.com>
Commit a486ff8c11 (parent f8c93d8d24), authored by Chao Lei on 2025-09-30 15:10:29 +08:00 and committed by GitHub.
10 changed files with 3012 additions and 4 deletions


@@ -13,6 +13,7 @@
 _MC2: Optional[GroupCoordinator] = None
 _MLP_TP: Optional[GroupCoordinator] = None
 _OTP: Optional[GroupCoordinator] = None
 _LMTP: Optional[GroupCoordinator] = None
+_P_TP: Optional[GroupCoordinator] = None
 
 def get_mc2_group() -> GroupCoordinator:
@@ -37,6 +38,12 @@ def get_mlp_tp_group() -> GroupCoordinator:
     return _MLP_TP
 
 
+def get_p_tp_group() -> GroupCoordinator:
+    assert _P_TP is not None, (
+        "distributed prefill tensor parallel group is not initialized")
+    return _P_TP
+
+
 def model_parallel_initialized():
     return (_MC2 is not None)
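
As context, a minimal usage sketch of the new accessor; the import path is assumed to be vllm-ascend's `parallel_state` module, and `rank_in_group`/`world_size` are standard `GroupCoordinator` attributes in upstream vLLM:

```python
# Hypothetical usage (not part of the PR): query this worker's position in
# the prefill tensor-parallel group, mirroring how get_mlp_tp_group() is used.
from vllm_ascend.distributed.parallel_state import get_p_tp_group

def prefill_tp_info() -> tuple[int, int]:
    group = get_p_tp_group()  # raises AssertionError if not initialized
    return group.rank_in_group, group.world_size
```
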
@@ -54,6 +61,22 @@ def init_ascend_model_parallel(parallel_config: ParallelConfig, ):
     all_ranks = torch.arange(world_size).reshape(
         -1, parallel_config.data_parallel_size *
         parallel_config.tensor_parallel_size)
+    pd_tp_ratio = get_ascend_config().pd_tp_ratio
+    global _P_TP
+    assert _P_TP is None, (
+        "distributed prefill tensor parallel group is already initialized")
+    prefill_tensor_model_parallel_size = pd_tp_ratio if \
+        pd_tp_ratio > 0 and pd_tp_ratio < parallel_config.tensor_parallel_size \
+        else parallel_config.tensor_parallel_size
+    group_ranks = all_ranks.view(
+        -1, prefill_tensor_model_parallel_size).unbind(0)
+    group_ranks = [x.tolist() for x in group_ranks]
+    num = get_world_group().local_rank // pd_tp_ratio
+    _P_TP = init_model_parallel_group(group_ranks,
+                                      get_world_group().local_rank,
+                                      backend,
+                                      group_name=f"p_tp_{num}")
+
     global _MC2
     group_ranks = all_ranks.unbind(0)
     group_ranks = [x.tolist() for x in group_ranks]
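
To make the grouping concrete, here is a standalone illustration (with assumed values, not code from the PR) of how `all_ranks.view(-1, prefill_tensor_model_parallel_size).unbind(0)` partitions ranks. With 8 ranks, TP size 4, and `pd_tp_ratio=2`, each prefill TP group spans two consecutive ranks, and `local_rank // pd_tp_ratio` picks the `p_tp_<num>` group-name suffix:

```python
import torch

world_size = 8           # assumed total ranks (dp=2, tp=4)
tensor_parallel_size = 4
pd_tp_ratio = 2          # prefill TP size; must satisfy 0 < ratio < tp_size

# Same view/unbind pattern as in init_ascend_model_parallel above.
all_ranks = torch.arange(world_size).reshape(-1, tensor_parallel_size)
group_ranks = [x.tolist() for x in all_ranks.view(-1, pd_tp_ratio).unbind(0)]
print(group_ranks)  # [[0, 1], [2, 3], [4, 5], [6, 7]]

local_rank = 5
print(local_rank // pd_tp_ratio)  # 2 -> this worker joins group "p_tp_2"
```
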
@@ -142,3 +165,8 @@ def destroy_ascend_model_parallel():
     if _OTP:
         _OTP.destroy()
     _OTP = None
+
+    global _P_TP
+    if _P_TP:
+        _P_TP.destroy()
+    _P_TP = None