Drop torchair (#4814)
aclgraph is now stable and fast, so let's drop torchair graph mode.
TODO: some logic added to adapt torchair should be cleaned up as well. We'll
do that in a follow-up PR.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -453,7 +453,6 @@ class KVCacheRecvingThread(threading.Thread):
|
||||
def _cat_kv_cache(self, block_ids: list[list[int]]):
|
||||
# Get necessary parameters
|
||||
k_cache = list(self.kv_caches.values())[0][0]
|
||||
kv_shape = k_cache.shape
|
||||
dtype = k_cache.dtype
|
||||
device = k_cache.device
|
||||
head_dim = self.model_config.hf_config.head_dim
|
||||
@@ -494,13 +493,6 @@ class KVCacheRecvingThread(threading.Thread):
|
||||
|
||||
# Process each layer in the KV cache
|
||||
for _, (k_cache_layer, v_cache_layer) in self.kv_caches.items():
|
||||
if len(
|
||||
k_cache_layer.shape
|
||||
) == 3: # kv shape in torchair model is [num_block, block_size, num_kv_head*head_dim]
|
||||
k_cache_layer = k_cache_layer.view(kv_shape[0], kv_shape[1],
|
||||
num_kv_head, head_dim)
|
||||
v_cache_layer = v_cache_layer.view(kv_shape[0], kv_shape[1],
|
||||
num_kv_head, head_dim)
|
||||
# Load cache data into buffers
|
||||
torch_npu.atb.npu_paged_cache_load(
|
||||
k_cache_layer,
|
||||
|
||||
Reference in New Issue
Block a user