[Bugfix] Fix accuracy issues when TP size is larger than the number of KV cache heads (#3366)

### What this PR does / why we need it? Fixes an issue in the unequal-TP (Tensor Parallelism) case: when the TP size is larger than the number of the model's attention KV cache heads, the KV cache contains duplicated (replicated) heads, which led to transmission errors in the original code. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By CI - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com> Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com> Co-authored-by: nwpu-zxr <zhouxuerong2@huawei.com>
2025-10-11 11:22:23 +08:00
parent ace300a549
commit ca05f7d632
8 changed files with 685 additions and 36 deletions
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -100,13 +100,28 @@ class AscendConfig:
                    "oproj_tensor_parallel_size is only supported in pd scenario and can only be used in D node."
                )
        self.pd_tp_ratio = 1
+        self.pd_head_ratio = 1
+        self.num_head_replica = 0
        if vllm_config.kv_transfer_config is not None and not vllm_config.model_config.is_deepseek_mla:
            prefill_tp_size = vllm_config.kv_transfer_config.get_from_extra_config(
                "prefill", {"tp_size": 1})["tp_size"]
            decode_tp_size = vllm_config.kv_transfer_config.get_from_extra_config(
                "decode", {"tp_size": 1})["tp_size"]
-            pd_tp_ratio: int = prefill_tp_size // decode_tp_size
-            self.pd_tp_ratio = pd_tp_ratio
+            assert prefill_tp_size % decode_tp_size == 0, "Prefill TP size must be divisible by Decode TP size."
+            self.pd_tp_ratio = prefill_tp_size // decode_tp_size
+            if self.pd_tp_ratio > 1:
+                try:
+                    # only support Qwen model now
+                    # TODO: use a more robust method to get kv_head_num
+                    num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads
+                    self.num_head_replica = prefill_tp_size // num_kv_head
+                    prefill_tp_size = min(prefill_tp_size, num_kv_head)
+                    decode_tp_size = min(decode_tp_size, num_kv_head)
+                    self.pd_head_ratio = prefill_tp_size // decode_tp_size
+                except Exception:
+                    raise AssertionError(
+                        "Can not get num_key_value_heads from model_config")
+
            if self.pd_tp_ratio == 0:
                raise AssertionError(
                    "Only support P node tp size lagger then D node tp size")