From f8315f5717c7e175e23461c0102f4f9a586da874 Mon Sep 17 00:00:00 2001 From: tanhaoan333 Date: Wed, 4 Mar 2026 22:02:29 +0800 Subject: [PATCH] [bugfix]Qwen2.5VL accurate question (#6975) ### What this PR does / why we need it? The attention mechanism in the ViT model architecture of Qwen2.5VL consists of two parts and does not support using cache to pass sequence lengths. ### Does this PR introduce _any_ user-facing change? remove seq_lens_cache ### How was this patch tested? - vLLM version: v0.16.0 - vLLM main: https://github.com/vllm-project/vllm/commit/15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 --------- Signed-off-by: tanhaoan333 --- vllm_ascend/ops/mm_encoder_attention.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/ops/mm_encoder_attention.py b/vllm_ascend/ops/mm_encoder_attention.py index 889b88c4..4122c7f1 100644 --- a/vllm_ascend/ops/mm_encoder_attention.py +++ b/vllm_ascend/ops/mm_encoder_attention.py @@ -62,7 +62,6 @@ class AscendMMEncoderAttention(MMEncoderAttention): prefix=prefix, ) - self.layer_index = int("".join(filter(str.isdigit, prefix))) self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE self.scale_value = self.head_size**-0.5 @@ -103,12 +102,9 @@ class AscendMMEncoderAttention(MMEncoderAttention): is_reshaped = query.dim() == 4 # Directly use seq_lens cpu cache to avoid d2h copy. - global seq_lens_cpu_cache - if self.layer_index == 0: - if cu_seqlens is None: - cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu") - # Update seq_lens cpu cache. - seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu") + if cu_seqlens is None: + cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu") + seq_lens_cpu = torch.diff(cu_seqlens).to("cpu") # q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim] q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len) @@ -128,7 +124,7 @@ class AscendMMEncoderAttention(MMEncoderAttention): query=q, key=k, value=v, - seq_len=seq_lens_cpu_cache, + seq_len=seq_lens_cpu, scale_value=self.scale_value, num_heads=self.num_heads, num_kv_heads=self.num_kv_heads,