[P/D] layerwise connector supports DeepSeek-V3.2 sparse attention && distributes transfer tasks to redundant kv_head cards (#5722)

### What this PR does / why we need it?
Add new functionality to the mooncake layerwise connector:
1. Support sparse attention, for DeepSeek-V3.2.
2. Distribute transfer tasks to redundant kv_head cards (see the sketch below the RFC link).

This PR is related to [[RFC]: CDCP Scheduling for Disaggregated
Prefilling with KV Cache Layerwise Push
Support](https://github.com/vllm-project/vllm-ascend/issues/4842)
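
To illustrate point 2: when the tensor-parallel size exceeds the number of KV heads, several cards hold redundant copies of the same kv_head, and the connector can spread transfer work across them instead of always using one designated sender. The sketch below is a minimal illustration under that assumption; `pick_sender_rank` and the round-robin policy are hypothetical, not the connector's actual API.

```python
# Hypothetical helper, for illustration only: the name and the round-robin
# policy are assumptions, not the connector's real implementation.
def pick_sender_rank(block_id: int, replica_ranks: list[int]) -> int:
    """Map a KV-cache block to one of the TP ranks holding a redundant
    copy of the same kv_head, so the transfer load is shared."""
    return replica_ranks[block_id % len(replica_ranks)]

# Example: kv_head 0 is replicated on TP ranks 0 and 1 (num_head_replica = 2).
assignments = {b: pick_sender_rank(b, [0, 1]) for b in range(4)}
print(assignments)  # {0: 0, 1: 1, 2: 0, 3: 1} -- the two replicas alternate
```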

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By CI.

- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
Commit 78b554dda9 (parent c316679e65)
Author: zxr2333, committed by GitHub
Date: 2026-01-10 23:04:16 +08:00
3 changed files with 142 additions and 111 deletions


```diff
@@ -170,36 +170,6 @@ class TestKVCacheSendingLayerThread(unittest.TestCase):
         self.thread._transfer_kv_cache(send_task)
         self.engine.batch_transfer_sync_write.assert_not_called()
-
-    def test_transfer_skips_when_tp_not_sender(self):
-        thread = KVCacheSendingLayerThread(
-            engine=self.engine,
-            total_layers=2,
-            ready_event=self.ready_event,
-            tp_rank=1,
-            pd_head_ratio=1,
-            num_head_replica=2,
-            kv_cache_base_addr=[1000, 2000, 3000, 4000],
-            use_mla=False,
-            block_len=[1024],
-            decode_tp_size=1,
-            first_kv_cache=self.first_kv_cache,
-            k_buffer=MagicMock(),
-            v_buffer=MagicMock(),
-            resharding_stream=MagicMock(),
-            callback_func=MagicMock())
-        req_meta = self.req_meta_base
-        send_task = SendTask(
-            send_request={"req3": req_meta},
-            wait_event=MagicMock(),
-            k_cache=self.key,
-            v_cache=self.value,
-            layer_idx=1,
-            rearrange_block_ids=[],
-        )
-        thread._transfer_kv_cache(send_task)
-        self.engine.batch_transfer_sync_write.assert_not_called()
-
 
     @patch(
         "vllm_ascend.distributed.mooncake_layerwise_connector.group_concurrent_contiguous",
         side_effect=group_concurrent_contiguous)
```
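
The test removed above pinned the old behavior: with `num_head_replica=2`, the rank with `tp_rank=1` skipped the transfer entirely, so `batch_transfer_sync_write` was never called. Because this PR distributes transfer tasks across the redundant kv_head cards, that invariant no longer holds and the test is dropped. Below is a sketch of the kind of check the test exercised; the exact condition inside `_transfer_kv_cache` is an assumption here:

```python
# Assumed shape of the pre-PR sender check, for illustration only; the real
# logic lives in KVCacheSendingLayerThread._transfer_kv_cache.
def is_designated_sender(tp_rank: int, num_head_replica: int) -> bool:
    """Old behavior: only one replica per kv_head sends; the rest skip."""
    return num_head_replica <= 1 or tp_rank % num_head_replica == 0

assert not is_designated_sender(tp_rank=1, num_head_replica=2)  # the deleted test's case
assert is_designated_sender(tp_rank=0, num_head_replica=2)
```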
```diff
@@ -425,6 +395,7 @@ class MockVllmConfig:
         self.parallel_config.data_parallel_size = 1
         self.parallel_config.data_parallel_rank = 0
         self.cache_config.block_size = 16
+        self.model_config.hf_config.num_key_value_heads = 1
         self.kv_transfer_config.engine_id = "test_engine"
         self.kv_transfer_config.kv_port = 5000
```
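
The one-line addition gives the mock a `num_key_value_heads` value so tests can derive head-replication scenarios. A minimal sketch of the replica count under the usual TP sharding rule; the formula is an assumption for illustration, not the connector's code:

```python
# Illustrative only: assumes standard tensor-parallel sharding, where each
# KV head is replicated once tp_size exceeds num_key_value_heads.
def head_replica_count(tp_size: int, num_key_value_heads: int) -> int:
    return max(1, tp_size // num_key_value_heads)

assert head_replica_count(tp_size=2, num_key_value_heads=1) == 2  # the tests' num_head_replica=2 case
assert head_replica_count(tp_size=8, num_key_value_heads=8) == 1  # no redundancy
```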