[bugfix] Qwen2.5VL accuracy issue (#6975)

### What this PR does / why we need it?
The attention mechanism in the ViT part of the Qwen2.5VL model architecture
consists of two parts and does not support using a cache to pass sequence
lengths.
### Does this PR introduce _any_ user-facing change?
Yes — removes the `seq_lens_cpu_cache` global; sequence lengths are now computed per call instead of being cached.
### How was this patch tested?

- vLLM version: v0.16.0
- vLLM main:
15d76f74e2

---------

Signed-off-by: tanhaoan333 <tanhaoan@huawei.com>
This commit is contained in:
tanhaoan333
2026-03-04 22:02:29 +08:00
committed by GitHub
parent 566c367a10
commit f8315f5717

View File

@@ -62,7 +62,6 @@ class AscendMMEncoderAttention(MMEncoderAttention):
prefix=prefix, prefix=prefix,
) )
self.layer_index = int("".join(filter(str.isdigit, prefix)))
self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
self.scale_value = self.head_size**-0.5 self.scale_value = self.head_size**-0.5
@@ -103,12 +102,9 @@ class AscendMMEncoderAttention(MMEncoderAttention):
is_reshaped = query.dim() == 4 is_reshaped = query.dim() == 4
# Directly use seq_lens cpu cache to avoid d2h copy. # Directly use seq_lens cpu cache to avoid d2h copy.
global seq_lens_cpu_cache
if self.layer_index == 0:
if cu_seqlens is None: if cu_seqlens is None:
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu") cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu")
# Update seq_lens cpu cache. seq_lens_cpu = torch.diff(cu_seqlens).to("cpu")
seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")
# q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim] # q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len) q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)
@@ -128,7 +124,7 @@ class AscendMMEncoderAttention(MMEncoderAttention):
query=q, query=q,
key=k, key=k,
value=v, value=v,
seq_len=seq_lens_cpu_cache, seq_len=seq_lens_cpu,
scale_value=self.scale_value, scale_value=self.scale_value,
num_heads=self.num_heads, num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads, num_kv_heads=self.num_kv_heads,