[KVCache] Refactor KVCache as page_size_bytes is ineffective (#3438)

### What this PR does / why we need it? Refactor KVCache as page_size_bytes is ineffective. 1. Currently the `AttentionSpec` is patched, but the `page_size_bytes` is still using that in vLLM in runtime, thus the patch is not working actually. Thus this pr removes the patch on `AttentionSpec`, and will do the final fix in vLLM. 2. Use `MLAAttentionSpec` instead of `FullAttentionSpec` to reduce `page_size_bytes` of spec, so that num_blocks in spec could double ### How was this patch tested? Test pass with Qwen3-Next and DeepSeek-V3.2-Exp - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-14 21:28:41 +08:00
parent c55d99d13e
commit 223cc34085
6 changed files with 38 additions and 131 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -80,6 +80,7 @@ from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
 from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
                                        KVCacheConfig, KVCacheGroupSpec,
                                        KVCacheSpec, MambaSpec,
+                                        MLAAttentionSpec,
                                        UniformTypeKVCacheSpecs)
 # yapf: enable
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
@@ -3220,13 +3221,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            # TODO(lucas): move the attention specs into the model layers like
            # the attention backends
            if attn_module.attn_type == AttentionType.DECODER:
-                kv_cache_spec[layer_name] = FullAttentionSpec(
-                    block_size=block_size,
-                    num_kv_heads=attn_module.num_kv_heads,
-                    head_size=attn_module.head_size,
-                    dtype=self.kv_cache_dtype,
-                    use_mla=use_mla,
-                    use_sfa=use_sfa)
+                if use_mla and not use_sfa:
+                    kv_cache_spec[layer_name] = MLAAttentionSpec(
+                        block_size=block_size,
+                        num_kv_heads=attn_module.num_kv_heads,
+                        head_size=attn_module.head_size,
+                        dtype=self.kv_cache_dtype,
+                        cache_dtype_str=self.cache_config.cache_dtype)
+                else:
+                    # TODO(cmq): This is a hack way to fix deepseek kvcache when
+                    # using DSA. Fix the spec in vLLM is a finnal way.
+                    kv_cache_spec[layer_name] = FullAttentionSpec(
+                        block_size=block_size,
+                        num_kv_heads=attn_module.num_kv_heads,
+                        head_size=attn_module.head_size,
+                        dtype=self.kv_cache_dtype)
            elif attn_module.attn_type in (AttentionType.ENCODER,
                                           AttentionType.ENCODER_ONLY):
                # encoder-only attention does not need KV cache.