[Bugfix] TP size larger than the number of KV cache heads causes accuracy issues (#3366)

### What this PR does / why we need it?
Fix an accuracy issue in the unequal-TP (tensor parallelism) case. When
the TP size is larger than the number of the model's attention KV cache
heads, the KV cache contains duplicated heads. The original code did not
handle these duplicates, which led to KV cache transmission errors.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By CI.
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: nwpu-zxr <zhouxuerong2@huawei.com>
This commit is contained in:
wangxiaoteng888
2025-10-11 11:22:23 +08:00
committed by GitHub
parent ace300a549
commit ca05f7d632
8 changed files with 685 additions and 36 deletions

View File

@@ -32,8 +32,13 @@ def test_init_ascend_model_parallel(mock_distributed, parallel_config):
mock_ascend_config.lmhead_tensor_parallel_size = 2
mock_ascend_config.oproj_tensor_parallel_size = 2
mock_ascend_config.pd_tp_ratio = 2
mock_ascend_config.num_head_replica = 0
mock_ascend_config.pd_head_ratio = 2
mock_vllm_config = MagicMock()
mock_vllm_config.kv_transfer_config.is_kv_producer = True
with patch('vllm_ascend.distributed.parallel_state.model_parallel_initialized', return_value=False), \
patch('vllm_ascend.distributed.parallel_state.init_model_parallel_group'), \
patch('vllm_ascend.distributed.parallel_state.get_current_vllm_config', return_value=mock_vllm_config), \
patch('vllm_ascend.distributed.parallel_state.get_ascend_config', return_value=mock_ascend_config):
init_ascend_model_parallel(parallel_config)

View File

@@ -78,7 +78,8 @@ class TestKVCacheSendingLayerThreadBasic(unittest.TestCase):
def setUp(self):
self.p1 = patch(
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
new=MagicMock(return_value=SimpleNamespace(pd_tp_ratio=1)))
new=MagicMock(return_value=SimpleNamespace(
pd_tp_ratio=1, num_head_replica=0, pd_head_ratio=1)))
self.p2 = patch(
'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config',
new=MagicMock(return_value=SimpleNamespace(
@@ -242,7 +243,8 @@ class TestSendingLayerThread(unittest.TestCase):
def setUp(self):
self.p1 = patch(
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
new=MagicMock(return_value=SimpleNamespace(pd_tp_ratio=1)))
new=MagicMock(return_value=SimpleNamespace(
pd_tp_ratio=1, num_head_replica=0, pd_head_ratio=1)))
self.p2 = patch(
'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config',
new=MagicMock(return_value=SimpleNamespace(
@@ -900,7 +902,9 @@ class TestMooncakeLayerwiseConnectorWorker(unittest.TestCase):
{'vllm_ascend.envs': self.envs_ascend_mock}),
patch(
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
return_value=SimpleNamespace(pd_tp_ratio=1),
return_value=SimpleNamespace(pd_tp_ratio=1,
num_head_replica=0,
pd_head_ratio=1),
),
patch(
'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config',