[bugfix] Qwen2.5VL accuracy issue (#6975)

### What this PR does / why we need it?
The attention mechanism in the ViT part of the Qwen2.5VL model architecture
consists of two parts and does not support using a cache to pass sequence
lengths.
### Does this PR introduce _any_ user-facing change?
Yes — removes the `seq_lens_cpu_cache` global; sequence lengths are now computed per call instead of being cached.
### How was this patch tested?

- vLLM version: v0.16.0
- vLLM main:
15d76f74e2

---------

Signed-off-by: tanhaoan333 <tanhaoan@huawei.com>
This commit is contained in:
tanhaoan333
2026-03-04 22:02:29 +08:00
committed by GitHub
parent 566c367a10
commit f8315f5717

View File

@@ -62,7 +62,6 @@ class AscendMMEncoderAttention(MMEncoderAttention):
prefix=prefix, prefix=prefix,
) )
self.layer_index = int("".join(filter(str.isdigit, prefix)))
self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
self.scale_value = self.head_size**-0.5 self.scale_value = self.head_size**-0.5
@@ -103,12 +102,9 @@ class AscendMMEncoderAttention(MMEncoderAttention):
is_reshaped = query.dim() == 4 is_reshaped = query.dim() == 4
# Directly use seq_lens cpu cache to avoid d2h copy. # Directly use seq_lens cpu cache to avoid d2h copy.
global seq_lens_cpu_cache
if self.layer_index == 0:
if cu_seqlens is None: if cu_seqlens is None:
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu") cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu")
# Update seq_lens cpu cache. seq_lens_cpu = torch.diff(cu_seqlens).to("cpu")
seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")
# q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim] # q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len) q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)
@@ -128,7 +124,7 @@ class AscendMMEncoderAttention(MMEncoderAttention):
query=q, query=q,
key=k, key=k,
value=v, value=v,
seq_len=seq_lens_cpu_cache, seq_len=seq_lens_cpu,
scale_value=self.scale_value, scale_value=self.scale_value,
num_heads=self.num_heads, num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads, num_kv_heads=self.num_kv_heads,