From 156976b982ad7fc0a67e60312aa958309d6c9e5f Mon Sep 17 00:00:00 2001
From: Wang Kunpeng <1289706727@qq.com>
Date: Mon, 9 Feb 2026 18:53:56 +0800
Subject: [PATCH] [refactor]Optimized the kvcache usage of Deepseek v3.2 (#6610)

### What this PR does / why we need it?
For DeepSeek V3.2, DSA uses `FullAttentionSpec`, which allocates 2 * MLA page size bytes, and half of that space is reserved for the DSA k cache. However, the DSA k cache actually needs only a small part of it, so a large amount of kvcache is wasted: with kv_lora_rank + qk_rope_head_dim = 576 and index_head_dim = 128, the proportion of discarded kvcache is (576 - 128) / (576 x 2) ≈ 0.388.

Running the same startup script for DeepSeek V3.2 on a single A3 server gives the following kvcache usage comparison:

Before refactoring
```
[kv_cache_utils.py:1307] GPU KV cache size: 15,872 tokens
```
After refactoring
```
[kv_cache_utils.py:1307] GPU KV cache size: 25,984 tokens
```

This pull request refactors the KV cache allocation for DeepSeek V3.2 models that use sparse attention. It replaces the use of `FullAttentionSpec` with `MLAAttentionSpec` and introduces a more principled way of calculating the KV cache tensor split factors based on the model configuration. This change removes hardcoded values and correctly sizes the cache tensors, leading to optimized memory usage and improved code maintainability.

### Does this PR introduce _any_ user-facing change?
No, this is an internal optimization and does not introduce any user-facing changes.

### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a

---------

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
---
 vllm_ascend/worker/model_runner_v1.py | 43 ++++++++++++++++++---------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 1ffefd0b..db701ed2 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -53,11 +53,11 @@ from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import (
     AttentionSpec,
     EncoderOnlyAttentionSpec,
-    FullAttentionSpec,
     KVCacheConfig,
     KVCacheGroupSpec,
     KVCacheSpec,
     MambaSpec,
+    MLAAttentionSpec,
     UniformTypeKVCacheSpecs,
 )
 from vllm.v1.outputs import (
@@ -2438,12 +2438,11 @@ class NPUModelRunner(GPUModelRunner):
             k_tensor_split_factor = 2
             v_tensor_split_factor = 2
         elif self.use_sparse:
-            # for deepseek v3.2, DSA use FullAttentionSpec
-            # FullAttentionSpec allocate 2 * mla page size bytes,
-            # and we use half of that for k cache in DSA
-            dsa_k_cache_factor = 2
-            k_tensor_split_factor = 2 * head_size / self.model_config.hf_text_config.kv_lora_rank
-            v_tensor_split_factor = 2 * head_size / self.model_config.hf_text_config.qk_rope_head_dim
+            # for deepseek v3.2, we split the kv cache according to the corresponding ratio
+            sparse_sum_head_size = sum(self._get_sparse_kv_cache_ratio())
+            k_tensor_split_factor, v_tensor_split_factor, dsa_k_cache_factor = [  # type: ignore
+                sparse_sum_head_size / ratio for ratio in self._get_sparse_kv_cache_ratio()
+            ]
             dsa_k_cache_size = int(kv_cache_tensor.size // dsa_k_cache_factor)
         else:
             # for other deepseek models, use MLAAttentionSpec
@@ -2581,9 +2580,14 @@ class NPUModelRunner(GPUModelRunner):
                 v_cache = raw_v_tensor.view(dtype).view(v_shape)
 
                 if self.use_sparse and raw_dsa_k_tensor is not None:
-                    dsa_k_cache_shape = (num_blocks, kv_cache_spec.block_size, 1, 128)
-                    dsa_k_cache_size = (num_blocks) * kv_cache_spec.block_size * 128 * dtype.itemsize
-                    dsa_k_cache = raw_dsa_k_tensor[:dsa_k_cache_size].view(dtype).view(dsa_k_cache_shape)
+                    index_head_dim = self._get_sparse_kv_cache_ratio()[-1]
+                    dsa_k_cache_shape = (
+                        num_blocks,
+                        kv_cache_spec.block_size,
+                        kv_cache_spec.num_kv_heads,
+                        index_head_dim,
+                    )
+                    dsa_k_cache = raw_dsa_k_tensor.view(dtype).view(dsa_k_cache_shape)
                     kv_caches[layer_name] = (k_cache, v_cache, dsa_k_cache)
                 else:
                     kv_caches[layer_name] = (k_cache, v_cache)
@@ -2832,12 +2836,13 @@ class NPUModelRunner(GPUModelRunner):
             if self.use_sparse:
                 # TODO(cmq): This is a hack way to fix deepseek kvcache when
                 # using DSA. Fix the spec in vLLM is the final way.
-                block_size = self.vllm_config.cache_config.block_size
-                kv_cache_spec[layer_name] = FullAttentionSpec(
-                    block_size=block_size,
+                sparse_sum_head_size = sum(self._get_sparse_kv_cache_ratio())
+                kv_cache_spec[layer_name] = MLAAttentionSpec(
+                    block_size=self.block_size,
                     num_kv_heads=1,
-                    head_size=attn_module.head_size,
+                    head_size=sparse_sum_head_size,
                     dtype=self.kv_cache_dtype,
+                    cache_dtype_str=self.vllm_config.cache_config.cache_dtype,
                 )
             elif spec := attn_module.get_kv_cache_spec(self.vllm_config):
                 kv_cache_spec[layer_name] = spec
@@ -2854,6 +2859,16 @@ class NPUModelRunner(GPUModelRunner):
 
         return kv_cache_spec
 
+    def _get_sparse_kv_cache_ratio(self) -> list[int]:
+        # TODO: If C8 is supported, we need to consider the number of bytes occupied by different dtypes
+        # when calculating the ratio, for example:
+        # [kv_lora_rank * torch.int8.itemsize, qk_rope_head_dim * torch.bfloat16.itemsize, ...]
+        return [
+            self.model_config.hf_text_config.kv_lora_rank,
+            self.model_config.hf_text_config.qk_rope_head_dim,
+            self.model_config.hf_text_config.index_head_dim,
+        ]
+
     def _check_and_update_cudagraph_mode(
         self,
         attention_backends: list[set[type[AttentionBackend]]],
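
For reference, here is a minimal sketch (not part of the patch) of the split-factor arithmetic that the new `_get_sparse_kv_cache_ratio()` path performs, assuming the DeepSeek V3.2 config values kv_lora_rank = 512, qk_rope_head_dim = 64 and index_head_dim = 128 (512 + 64 = 576 and 128 are the figures quoted in the description above):

```python
# Sketch only: the config values below are assumed for DeepSeek V3.2 and are
# hard-coded here instead of being read from a real model config.
ratios = [512, 64, 128]              # [kv_lora_rank, qk_rope_head_dim, index_head_dim]
sparse_sum_head_size = sum(ratios)   # 704, used as the MLAAttentionSpec head_size

# Same list comprehension as the patch uses: one split factor per cache view.
k_factor, v_factor, dsa_k_factor = [sparse_sum_head_size / r for r in ratios]
print(k_factor, v_factor, dsa_k_factor)  # 1.375 11.0 5.5

# The three views now tile the per-layer kv cache tensor exactly:
# the k cache gets 512/704 of the bytes, the rope cache 64/704,
# and the DSA k cache 128/704, with nothing discarded.
assert abs(1 / k_factor + 1 / v_factor + 1 / dsa_k_factor - 1.0) < 1e-9
```

Under the old `FullAttentionSpec` path each token reserved 2 x 576 = 1152 elements per layer while only 576 + 128 = 704 of them were used, which is where the (576 - 128) / (576 x 2) ≈ 0.388 waste figure comes from and is consistent with the roughly 1.6x increase from 15,872 to 25,984 cached tokens.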