[bugfix] adapt to new implemented get_kv_cache_spec in cpuoffload connector (#4311)

### What this PR does / why we need it? func `get_kv_cache_spec` in model_runner changed a lot and caused error in cpuoffloading connector which is copied from model_runner, this PR adapts to new implemented `get_kv_cache_spec` to fix it. ### How was this patch tested? - vLLM version: v0.11.0 - vLLM main: 2918c1b49c Signed-off-by: lidenghui <lidenghui1110@gmail.com>
2026-01-08 09:15:09 +08:00
parent f7db812ed7
commit 481138e1d2
2 changed files with 85 additions and 31 deletions
--- a/vllm_ascend/distributed/cpu_offload_manager/metadata.py
+++ b/vllm_ascend/distributed/cpu_offload_manager/metadata.py
@@ -12,7 +12,7 @@ from vllm.config import KVTransferConfig, VllmConfig
 from vllm.logger import logger
 from vllm.utils.network_utils import make_zmq_socket
 from vllm.utils.torch_utils import get_dtype_size
-from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

 from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
    CPUKVCacheManager
@@ -140,14 +140,15 @@ class MetadataServer:
            layer.page_size_bytes == any.page_size_bytes
            for any in kv_cache_specs.values()
        ])
+        use_mla = isinstance(layer, MLAAttentionSpec)
        # mla shares the same kv cache among different tp
-        if layer.use_mla:
+        if use_mla:
            tp_rank = 0
        if (pp_rank, tp_rank) in self.shared_memory:
            return self.shared_memory[(pp_rank, tp_rank)]
        available_memory = self.available_memory
        shared_memory_dict = {}
-        if layer.use_mla:
+        if use_mla:
            available_memory //= self.pipeline_parallel_size
            available_memory //= len(kv_cache_specs)
            num_blocks = available_memory // layer.page_size_bytes
@@ -165,7 +166,7 @@ class MetadataServer:
            shared_memory_dict[
                layer_name] = MetadataServer._safe_create_shared_memory(
                    f"cpu_kv_cache_{pp_rank}_{tp_rank}_{layer_name}", nbytes)
-        if layer.use_mla:
+        if use_mla:
            assert mla_config is not None
            assert layer.head_size == mla_config.rope_dim + mla_config.nope_dim
            self.shared_memory[(pp_rank,