[KVCache] Refactor KVCache as page_size_bytes is ineffective (#3438)
### What this PR does / why we need it? Refactor KVCache as page_size_bytes is ineffective. 1. Currently the `AttentionSpec` is patched, but the `page_size_bytes` is still using that in vLLM in runtime, thus the patch is not working actually. Thus this pr removes the patch on `AttentionSpec`, and will do the final fix in vLLM. 2. Use `MLAAttentionSpec` instead of `FullAttentionSpec` to reduce `page_size_bytes` of spec, so that num_blocks in spec could double ### How was this patch tested? Test pass with Qwen3-Next and DeepSeek-V3.2-Exp - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -19,4 +19,3 @@ import vllm_ascend.patch.platform.patch_common.patch_config # noqa
|
||||
import vllm_ascend.patch.platform.patch_common.patch_distributed # noqa
|
||||
import vllm_ascend.patch.platform.patch_common.patch_mamba_config # noqa
|
||||
import vllm_ascend.patch.worker.patch_common.patch_attention_selector # noqa
|
||||
import vllm_ascend.patch.worker.patch_common.patch_attentionspec # noqa
|
||||
|
||||
@@ -6,8 +6,6 @@ from vllm.model_executor.models.config import MambaModelConfig
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
|
||||
|
||||
@classmethod
|
||||
def verify_and_update_config(cls, vllm_config) -> None:
|
||||
@@ -24,7 +22,6 @@ def verify_and_update_config(cls, vllm_config) -> None:
|
||||
logger = init_logger(__name__)
|
||||
# Enable FULL_AND_PIECEWISE by default
|
||||
MambaModelConfig.verify_and_update_config(vllm_config)
|
||||
ascend_config = get_ascend_config()
|
||||
|
||||
cache_config = vllm_config.cache_config
|
||||
model_config = vllm_config.model_config
|
||||
@@ -40,8 +37,7 @@ def verify_and_update_config(cls, vllm_config) -> None:
|
||||
block_size=1,
|
||||
num_kv_heads=model_config.get_num_kv_heads(parallel_config),
|
||||
head_size=model_config.get_head_size(),
|
||||
dtype=kv_cache_dtype,
|
||||
use_mla=model_config.use_mla or ascend_config.use_sfa).page_size_bytes
|
||||
dtype=kv_cache_dtype).page_size_bytes
|
||||
|
||||
model_cls, _ = ModelRegistry.resolve_model_cls(
|
||||
model_config.architecture,
|
||||
|
||||
Reference in New Issue
Block a user