[KVCache] Refactor KVCache as page_size_bytes is ineffective (#3438)

### What this PR does / why we need it? Refactor KVCache because the `page_size_bytes` patch is ineffective. 1. Currently `AttentionSpec` is patched, but at runtime vLLM still uses its own `page_size_bytes`, so the patch has no actual effect. This PR therefore removes the patch on `AttentionSpec`; the final fix will be made in vLLM. 2. Use `MLAAttentionSpec` instead of `FullAttentionSpec` to reduce the spec's `page_size_bytes`, so that `num_blocks` in the spec can double. ### How was this patch tested? Tests pass with Qwen3-Next and DeepSeek-V3.2-Exp. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-14 21:28:41 +08:00
parent c55d99d13e
commit 223cc34085
6 changed files with 38 additions and 131 deletions
--- a/vllm_ascend/patch/platform/patch_common/init.py
+++ b/vllm_ascend/patch/platform/patch_common/init.py
@@ -19,4 +19,3 @@ import vllm_ascend.patch.platform.patch_common.patch_config  # noqa
 import vllm_ascend.patch.platform.patch_common.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_common.patch_mamba_config  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_attention_selector  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_attentionspec  # noqa
--- a/vllm_ascend/patch/platform/patch_common/patch_mamba_config.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_mamba_config.py
@@ -6,8 +6,6 @@ from vllm.model_executor.models.config import MambaModelConfig
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec

-from vllm_ascend.ascend_config import get_ascend_config
-

@classmethod
 def verify_and_update_config(cls, vllm_config) -> None:
@@ -24,7 +22,6 @@ def verify_and_update_config(cls, vllm_config) -> None:
    logger = init_logger(__name__)
    # Enable FULL_AND_PIECEWISE by default
    MambaModelConfig.verify_and_update_config(vllm_config)
-    ascend_config = get_ascend_config()

    cache_config = vllm_config.cache_config
    model_config = vllm_config.model_config
@@ -40,8 +37,7 @@ def verify_and_update_config(cls, vllm_config) -> None:
        block_size=1,
        num_kv_heads=model_config.get_num_kv_heads(parallel_config),
        head_size=model_config.get_head_size(),
-        dtype=kv_cache_dtype,
-        use_mla=model_config.use_mla or ascend_config.use_sfa).page_size_bytes
+        dtype=kv_cache_dtype).page_size_bytes

    model_cls, _ = ModelRegistry.resolve_model_cls(
        model_config.architecture,