From ab68d31a24110c3ed2b0ea5c70016ea8874059e7 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Sat, 19 Jul 2025 09:42:32 +0800 Subject: [PATCH] [Misc][V0 Deprecation] Remove Cache Engine Used for V0 Worker (#1878) ### What this PR does / why we need it? This PR is a part of https://github.com/vllm-project/vllm-ascend/issues/1620. - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/5895afd78047614a037cac1fc4634825c749fd59 --------- Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/worker/__init__.py | 17 ------ vllm_ascend/worker/cache_engine.py | 83 ------------------------------ 2 files changed, 100 deletions(-) delete mode 100644 vllm_ascend/worker/cache_engine.py diff --git a/vllm_ascend/worker/__init__.py b/vllm_ascend/worker/__init__.py index ee59a05..e69de29 100644 --- a/vllm_ascend/worker/__init__.py +++ b/vllm_ascend/worker/__init__.py @@ -1,17 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import vllm_ascend.worker.cache_engine # noqa \ No newline at end of file diff --git a/vllm_ascend/worker/cache_engine.py b/vllm_ascend/worker/cache_engine.py deleted file mode 100644 index d8d9087..0000000 --- a/vllm_ascend/worker/cache_engine.py +++ /dev/null @@ -1,83 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/vllm/worker/model_runner.py -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import Any, List - -import torch -from vllm.utils import is_pin_memory_available -from vllm.worker.cache_engine import CacheEngine - -from vllm_ascend.ascend_config import get_ascend_config - - -def allocate_kv_cache( - self, - num_blocks: int, - device: str, -) -> List[Any]: - """Allocates KV cache on the specified device.""" - kv_cache_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - pin_memory = is_pin_memory_available() if device == "cpu" else False - kv_cache: List[Any] = [] - - ascend_config = get_ascend_config() - if ascend_config.torchair_graph_config.enabled: - # Align entries so they are 256 byte aligned for better performance - # Primarily targets MLA as this typically only ends up having entries - # be 128 byte aligned. - alloc_shape = kv_cache_shape - - for _ in range(self.num_attention_layers): - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - layer_kv_cache_nope = torch.zeros( - alloc_shape[:-1] + - (self.model_config.hf_text_config.kv_lora_rank, ), - dtype=self.dtype, - pin_memory=pin_memory, - device=device) - layer_kv_cache_pe = torch.zeros( - alloc_shape[:-1] + - (self.model_config.hf_text_config.qk_rope_head_dim, ), - dtype=self.dtype, - pin_memory=pin_memory, - device=device) - - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases - # when entry_shape is higher than 1D - kv_cache.append((layer_kv_cache_nope, layer_kv_cache_pe)) - else: - for _ in range(self.num_attention_layers): - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - layer_kv_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device) - - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases - # when entry_shape is higher than 1D - kv_cache.append(layer_kv_cache) - return kv_cache - - -CacheEngine._allocate_kv_cache = allocate_kv_cache