Files
xc-llm-ascend/vllm_ascend/kv_offload/npu.py
whx 3f33ad23fe [BugFix] Fix npu-cpu offloading interface change bug. (#5290)
### What this PR does / why we need it?
Last month the interface of `OffloadingSpec` has
changed(https://github.com/vllm-project/vllm/pull/27743). This PR fixes
this bug and adds e2e test for cpu offloading.

### Does this PR introduce _any_ user-facing change?
None

### How was this patch tested?
CI passed with new added test.


- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-12-27 10:21:20 +08:00

65 lines
2.4 KiB
Python

from collections.abc import Iterator
from typing import Optional
import torch
from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig
from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
from vllm.v1.kv_offload.backends.cpu import CPUBackend
from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.spec import OffloadingSpec
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler
class NPUOffloadingSpec(OffloadingSpec):
def __init__(self, vllm_config: VllmConfig):
super().__init__(vllm_config)
num_cpu_blocks = self.extra_config.get("num_cpu_blocks")
if not num_cpu_blocks:
raise Exception(
"num_cpu_blocks must be specified in kv_connector_extra_config"
)
self.num_cpu_blocks: int = num_cpu_blocks
# scheduler-side
self._manager: Optional[OffloadingManager] = None
# worker-side
self._handler: Optional[OffloadingHandler] = None
def get_manager(self) -> OffloadingManager:
if not self._manager:
kv_events_config = self.vllm_config.kv_events_config
enable_events = (kv_events_config is not None
and kv_events_config.enable_kv_cache_events)
self._manager = LRUOffloadingManager(
CPUBackend(block_size=self.offloaded_block_size,
num_blocks=self.num_cpu_blocks),
enable_events=enable_events,
)
return self._manager
def get_handlers(
self,
kv_caches: dict[str, torch.Tensor],
attn_backends: dict[str, type[AttentionBackend]],
) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
OffloadingHandler]]:
if not self._handler:
self._handler = CpuNpuOffloadingHandler(
attn_backends=attn_backends,
gpu_block_size=self.gpu_block_size,
cpu_block_size=self.offloaded_block_size,
num_cpu_blocks=self.num_cpu_blocks,
gpu_caches=kv_caches,
)
assert self._handler is not None
yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
yield CPULoadStoreSpec, GPULoadStoreSpec, self._handler