From ab68d31a24110c3ed2b0ea5c70016ea8874059e7 Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Sat, 19 Jul 2025 09:42:32 +0800
Subject: [PATCH] [Misc][V0 Deprecation] Remove Cache Engine Used for V0 Worker
 (#1878)

### What this PR does / why we need it?
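This PR is part of the V0 deprecation work tracked in
https://github.com/vllm-project/vllm-ascend/issues/1620.

It deletes `vllm_ascend/worker/cache_engine.py`, which monkey-patched
`CacheEngine._allocate_kv_cache` (from `vllm.worker.cache_engine`), and
empties `vllm_ascend/worker/__init__.py`, whose only statement was the
import of that module.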

- vLLM version: v0.9.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/5895afd78047614a037cac1fc4634825c749fd59

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 vllm_ascend/worker/__init__.py     | 17 ------
 vllm_ascend/worker/cache_engine.py | 83 ------------------------------
 2 files changed, 100 deletions(-)
 delete mode 100644 vllm_ascend/worker/cache_engine.py

diff --git a/vllm_ascend/worker/__init__.py b/vllm_ascend/worker/__init__.py
index ee59a05..e69de29 100644
--- a/vllm_ascend/worker/__init__.py
+++ b/vllm_ascend/worker/__init__.py
@@ -1,17 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import vllm_ascend.worker.cache_engine  # noqa
\ No newline at end of file
diff --git a/vllm_ascend/worker/cache_engine.py b/vllm_ascend/worker/cache_engine.py
deleted file mode 100644
index d8d9087..0000000
--- a/vllm_ascend/worker/cache_engine.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-# Adapted from vllm-project/vllm/vllm/worker/model_runner.py
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from typing import Any, List
-
-import torch
-from vllm.utils import is_pin_memory_available
-from vllm.worker.cache_engine import CacheEngine
-
-from vllm_ascend.ascend_config import get_ascend_config
-
-
-def allocate_kv_cache(
-    self,
-    num_blocks: int,
-    device: str,
-) -> List[Any]:
-    """Allocates KV cache on the specified device."""
-    kv_cache_shape = self.attn_backend.get_kv_cache_shape(
-        num_blocks, self.block_size, self.num_kv_heads, self.head_size)
-    pin_memory = is_pin_memory_available() if device == "cpu" else False
-    kv_cache: List[Any] = []
-
-    ascend_config = get_ascend_config()
-    if ascend_config.torchair_graph_config.enabled:
-        # Align entries so they are 256 byte aligned for better performance
-        # Primarily targets MLA as this typically only ends up having entries
-        # be 128 byte aligned.
-        alloc_shape = kv_cache_shape
-
-        for _ in range(self.num_attention_layers):
-            # null block in CpuGpuBlockAllocator requires at least that
-            # block to be zeroed-out.
-            # We zero-out everything for simplicity.
-            layer_kv_cache_nope = torch.zeros(
-                alloc_shape[:-1] +
-                (self.model_config.hf_text_config.kv_lora_rank, ),
-                dtype=self.dtype,
-                pin_memory=pin_memory,
-                device=device)
-            layer_kv_cache_pe = torch.zeros(
-                alloc_shape[:-1] +
-                (self.model_config.hf_text_config.qk_rope_head_dim, ),
-                dtype=self.dtype,
-                pin_memory=pin_memory,
-                device=device)
-
-            # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases
-            # when entry_shape is higher than 1D
-            kv_cache.append((layer_kv_cache_nope, layer_kv_cache_pe))
-    else:
-        for _ in range(self.num_attention_layers):
-            # null block in CpuGpuBlockAllocator requires at least that
-            # block to be zeroed-out.
-            # We zero-out everything for simplicity.
-            layer_kv_cache = torch.zeros(kv_cache_shape,
-                                         dtype=self.dtype,
-                                         pin_memory=pin_memory,
-                                         device=device)
-
-            # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases
-            # when entry_shape is higher than 1D
-            kv_cache.append(layer_kv_cache)
-    return kv_cache
-
-
-CacheEngine._allocate_kv_cache = allocate_kv_cache