From fe4cad24e9efa97235a5ebff10b62d8a4d981ddc Mon Sep 17 00:00:00 2001 From: zxr2333 <64738772+nwpu-zxr@users.noreply.github.com> Date: Thu, 12 Mar 2026 23:51:40 +0800 Subject: [PATCH] [BugFix] fix qwen3.5 reshape_kvcache bug (#7209) ### What this PR does / why we need it? This PR fixes a bug in `reshape_kvcache_tensors` when reshaping the Mamba cache for models like Qwen3.5. The previous implementation did not correctly handle cases where the KV cache tensors have different data types. This change ensures that slicing is performed based on byte offsets before reshaping the tensors, which correctly handles heterogeneous dtypes. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By CI. - vLLM version: v0.16.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d Signed-off-by: nwpu-zxr --- vllm_ascend/worker/model_runner_v1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a102d0b1..cee31250 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2852,8 +2852,8 @@ class NPUModelRunner(GPUModelRunner): # a conv state in some special models. target_shape = (num_blocks, *shape) - target_idx += torch.prod(torch.tensor(target_shape)).item() - tensor = raw_tensor.view(dtype)[start_idx:target_idx].view(target_shape) + target_idx += math.prod(target_shape) * get_dtype_size(dtype) + tensor = raw_tensor[start_idx:target_idx].view(dtype).view(target_shape) start_idx = target_idx state_tensors.append(tensor) kv_caches[layer_name] = state_tensors