Drop torchair (#4814)

aclgraph is now stable and fast, so let's drop torchair graph mode.

TODO: some logic added to adapt torchair should be cleaned up as well. We'll
do that in a follow-up PR.

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Author: wangxiyuan
Date: 2025-12-10 09:20:40 +08:00
Committed by: GitHub
Parent: ba9cda9dfd
Commit: 835b4c8f1d
84 changed files with 77 additions and 16881 deletions


@@ -453,7 +453,6 @@ class KVCacheRecvingThread(threading.Thread):
     def _cat_kv_cache(self, block_ids: list[list[int]]):
         # Get necessary parameters
         k_cache = list(self.kv_caches.values())[0][0]
-        kv_shape = k_cache.shape
         dtype = k_cache.dtype
         device = k_cache.device
         head_dim = self.model_config.hf_config.head_dim
@@ -494,13 +493,6 @@ class KVCacheRecvingThread(threading.Thread):
         # Process each layer in the KV cache
         for _, (k_cache_layer, v_cache_layer) in self.kv_caches.items():
-            if len(
-                    k_cache_layer.shape
-            ) == 3:  # kv shape in torchair model is [num_block, block_size, num_kv_head*head_dim]
-                k_cache_layer = k_cache_layer.view(kv_shape[0], kv_shape[1],
-                                                   num_kv_head, head_dim)
-                v_cache_layer = v_cache_layer.view(kv_shape[0], kv_shape[1],
-                                                   num_kv_head, head_dim)
             # Load cache data into buffers
             torch_npu.atb.npu_paged_cache_load(
                 k_cache_layer,
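
For context, the branch deleted in the second hunk only re-viewed the 3-D KV cache layout used by torchair graph mode as the 4-D layout expected by the paged-cache load. Below is a minimal sketch of that reshape, using made-up sizes and plain torch tensors instead of the real NPU cache allocations, purely for illustration:

import torch

# Hypothetical sizes for illustration only; the real values come from the
# model config and the KV cache allocated by vllm-ascend.
num_block, block_size, num_kv_head, head_dim = 4, 128, 8, 64

# torchair graph mode stored each layer's K cache as a 3-D tensor:
# [num_block, block_size, num_kv_head * head_dim]
k_cache_layer = torch.zeros(num_block, block_size, num_kv_head * head_dim)

# The removed branch re-viewed it as the 4-D layout consumed downstream,
# without copying any data.
k_cache_layer = k_cache_layer.view(num_block, block_size, num_kv_head, head_dim)

print(k_cache_layer.shape)  # torch.Size([4, 128, 8, 64])

Per the removed comment, the 3-D layout only appears in torchair models, so with torchair dropped the branch can never trigger and is safe to delete.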