From fe4cad24e9efa97235a5ebff10b62d8a4d981ddc Mon Sep 17 00:00:00 2001 From: zxr2333 <64738772+nwpu-zxr@users.noreply.github.com> Date: Thu, 12 Mar 2026 23:51:40 +0800 Subject: [PATCH] [BugFix] fix qwen3.5 reshape_kvcache bug (#7209) ### What this PR does / why we need it? This PR fixes a bug in `reshape_kvcache_tensors` when reshaping the Mamba cache for models like Qwen3.5. The previous implementation did not correctly handle cases where the KV cache tensors have different data types. This change ensures that slicing is performed based on byte offsets before reshaping the tensors, which correctly handles heterogeneous dtypes. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By CI. - vLLM version: v0.16.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d Signed-off-by: nwpu-zxr --- vllm_ascend/worker/model_runner_v1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a102d0b1..cee31250 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2852,8 +2852,8 @@ class NPUModelRunner(GPUModelRunner): # a conv state in some special models. target_shape = (num_blocks, *shape) - target_idx += torch.prod(torch.tensor(target_shape)).item() - tensor = raw_tensor.view(dtype)[start_idx:target_idx].view(target_shape) + target_idx += math.prod(target_shape) * get_dtype_size(dtype) + tensor = raw_tensor[start_idx:target_idx].view(dtype).view(target_shape) start_idx = target_idx state_tensors.append(tensor) kv_caches[layer_name] = state_tensors