[P/D] layerwise connector: support DeepSeek-V3.2 sparse attention and distribute transfer tasks to redundant kv_head cards (#5722)
### What this PR does / why we need it?
Add new functionality to the mooncake layerwise connector:
1. Support sparse attention for DeepSeek-V3.2.
2. Distribute transfer tasks across cards holding redundant kv_heads.
This PR is related to [[RFC]: CDCP Scheduling for Disaggregated
Prefilling with KV Cache Layerwise Push
Support](https://github.com/vllm-project/vllm-ascend/issues/4842)
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By CI.
- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef
---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
@@ -170,36 +170,6 @@ class TestKVCacheSendingLayerThread(unittest.TestCase):
        self.thread._transfer_kv_cache(send_task)
        self.engine.batch_transfer_sync_write.assert_not_called()

    def test_transfer_skips_when_tp_not_sender(self):
        thread = KVCacheSendingLayerThread(
            engine=self.engine,
            total_layers=2,
            ready_event=self.ready_event,
            tp_rank=1,
            pd_head_ratio=1,
            num_head_replica=2,
            kv_cache_base_addr=[1000, 2000, 3000, 4000],
            use_mla=False,
            block_len=[1024],
            decode_tp_size=1,
            first_kv_cache=self.first_kv_cache,
            k_buffer=MagicMock(),
            v_buffer=MagicMock(),
            resharding_stream=MagicMock(),
            callback_func=MagicMock())
        req_meta = self.req_meta_base
        send_task = SendTask(
            send_request={"req3": req_meta},
            wait_event=MagicMock(),
            k_cache=self.key,
            v_cache=self.value,
            layer_idx=1,
            rearrange_block_ids=[],
        )
        thread._transfer_kv_cache(send_task)
        self.engine.batch_transfer_sync_write.assert_not_called()

    @patch(
        "vllm_ascend.distributed.mooncake_layerwise_connector.group_concurrent_contiguous",
        side_effect=group_concurrent_contiguous)
@@ -425,6 +395,7 @@ class MockVllmConfig:
        self.parallel_config.data_parallel_size = 1
        self.parallel_config.data_parallel_rank = 0
        self.cache_config.block_size = 16
        self.model_config.hf_config.num_key_value_heads = 1

        self.kv_transfer_config.engine_id = "test_engine"
        self.kv_transfer_config.kv_port = 5000
||||
Reference in New Issue
Block a user