[Bugfix] TP size larger than KV cache head causes accuracy issues (#3366)

### What this PR does / why we need it?
Resolve the issue where, in the case of unequal TP (Tensor Parallelism),
the TP size is larger than the number of model attention kvcache heads,
causing the KV cache to generate duplicates, which leads to transmission
errors in the original code.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By ci
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: nwpu-zxr <zhouxuerong2@huawei.com>
This commit is contained in:
wangxiaoteng888
2025-10-11 11:22:23 +08:00
committed by GitHub
parent ace300a549
commit ca05f7d632
8 changed files with 685 additions and 36 deletions

View File

@@ -100,13 +100,28 @@ class AscendConfig:
"oproj_tensor_parallel_size is only supported in pd scenario and can only be used in D node."
)
self.pd_tp_ratio = 1
self.pd_head_ratio = 1
self.num_head_replica = 0
if vllm_config.kv_transfer_config is not None and not vllm_config.model_config.is_deepseek_mla:
prefill_tp_size = vllm_config.kv_transfer_config.get_from_extra_config(
"prefill", {"tp_size": 1})["tp_size"]
decode_tp_size = vllm_config.kv_transfer_config.get_from_extra_config(
"decode", {"tp_size": 1})["tp_size"]
pd_tp_ratio: int = prefill_tp_size // decode_tp_size
self.pd_tp_ratio = pd_tp_ratio
assert prefill_tp_size % decode_tp_size == 0, "Prefill TP size must be divisible by Decode TP size."
self.pd_tp_ratio = prefill_tp_size // decode_tp_size
if self.pd_tp_ratio > 1:
try:
# only support Qwen model now
# TODO: use a more robust method to get kv_head_num
num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads
self.num_head_replica = prefill_tp_size // num_kv_head
prefill_tp_size = min(prefill_tp_size, num_kv_head)
decode_tp_size = min(decode_tp_size, num_kv_head)
self.pd_head_ratio = prefill_tp_size // decode_tp_size
except Exception:
raise AssertionError(
"Can not get num_key_value_heads from model_config")
if self.pd_tp_ratio == 0:
raise AssertionError(
"Only support P node tp size lagger then D node tp size")