Drop torchair (#4814)
aclgraph is now stable and fast, so let's drop torchair graph mode.
TODO: some logic added to adapt torchair should be cleaned up as well. We'll
do that in a follow-up PR.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -453,7 +453,6 @@ class KVCacheRecvingThread(threading.Thread):
|
||||
def _cat_kv_cache(self, block_ids: list[list[int]]):
|
||||
# Get necessary parameters
|
||||
k_cache = list(self.kv_caches.values())[0][0]
|
||||
kv_shape = k_cache.shape
|
||||
dtype = k_cache.dtype
|
||||
device = k_cache.device
|
||||
head_dim = self.model_config.hf_config.head_dim
|
||||
@@ -494,13 +493,6 @@ class KVCacheRecvingThread(threading.Thread):
|
||||
|
||||
# Process each layer in the KV cache
|
||||
for _, (k_cache_layer, v_cache_layer) in self.kv_caches.items():
|
||||
if len(
|
||||
k_cache_layer.shape
|
||||
) == 3: # kv shape in torchair model is [num_block, block_size, num_kv_head*head_dim]
|
||||
k_cache_layer = k_cache_layer.view(kv_shape[0], kv_shape[1],
|
||||
num_kv_head, head_dim)
|
||||
v_cache_layer = v_cache_layer.view(kv_shape[0], kv_shape[1],
|
||||
num_kv_head, head_dim)
|
||||
# Load cache data into buffers
|
||||
torch_npu.atb.npu_paged_cache_load(
|
||||
k_cache_layer,
|
||||
|
||||
Reference in New Issue
Block a user