[Bugfix] Correctly handle the output shape in multimodal attention (#5443)
### What this PR does / why we need it?
Fixes https://github.com/vllm-project/vllm-ascend/issues/5297. In the
`AscendMMEncoderAttention` forward pass, the output shape should stay
consistent with the input shape.
- vLLM version: release/v0.13.0
- vLLM main: 81786c8774
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
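For reference, a minimal standalone sketch of the shape contract this fix enforces (assumes only PyTorch and einops; `restore_output_shape` is a hypothetical helper for illustration, not part of vllm-ascend):

```python
import torch
import einops


def restore_output_shape(context_layer: torch.Tensor,
                         query: torch.Tensor,
                         bsz: int) -> torch.Tensor:
    """Toy illustration: reshape the attention output back to the rank of the input query."""
    # context_layer arrives as [b * s, head, head_dim] from the attention kernel
    if query.dim() == 4:
        # caller passed [b, s, head, head_dim]; hand back the same 4-D layout
        return einops.rearrange(context_layer, "(b s) h d -> b s h d",
                                b=bsz).contiguous()
    # caller passed a flattened layout; fold the heads back into the hidden dim
    return einops.rearrange(context_layer, "(b s) h d -> b s (h d)",
                            b=bsz).contiguous()


if __name__ == "__main__":
    bsz, seq, heads, head_dim = 2, 8, 4, 16
    query = torch.randn(bsz, seq, heads, head_dim)
    context = torch.randn(bsz * seq, heads, head_dim)
    out = restore_output_shape(context, query, bsz)
    assert out.shape == query.shape  # output shape matches the 4-D input
```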
@@ -781,6 +781,11 @@ PROMPT_CONFIGS = {
             "fps": 1,
         },
     },
+    "hunyuan-vl": {
+        "model": "Tencent-Hunyuan/HunyuanOCR",
+        "prompt_fn": hunyuan_prompt,
+        "mm_processor_kwargs": {},
+    },
 }
@@ -93,6 +93,7 @@ class AscendMMEncoderAttention(MMEncoderAttention):
     ):
         bsz, q_len = query.size()[:2]
         kv_len = key.size(1)
+        is_reshaped = query.dim() == 4
 
         # q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
         q, k, v = self.reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)
@@ -134,7 +135,12 @@ class AscendMMEncoderAttention(MMEncoderAttention):
         if enable_pad:
             context_layer = context_layer[..., :origin_shape]
 
-        context_layer = einops.rearrange(context_layer,
-                                         "(b s) h d -> b s (h d)",
-                                         b=bsz).contiguous()
+        if is_reshaped:
+            context_layer = einops.rearrange(context_layer,
+                                             "(b s) h d -> b s h d",
+                                             b=bsz).contiguous()
+        else:
+            context_layer = einops.rearrange(context_layer,
+                                             "(b s) h d -> b s (h d)",
+                                             b=bsz).contiguous()
         return context_layer
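As a quick sanity check of the new branch (a standalone snippet using only torch and einops; it does not import the real class or NPU kernels), the old pattern flattens the heads into the hidden dimension, while the new one restores the 4-D input shape:

```python
import torch
import einops

bsz, seq, heads, head_dim = 2, 8, 4, 16
query = torch.randn(bsz, seq, heads, head_dim)            # 4-D input -> is_reshaped is True
context_layer = torch.randn(bsz * seq, heads, head_dim)   # kernel output, flattened over tokens

old = einops.rearrange(context_layer, "(b s) h d -> b s (h d)", b=bsz)
new = einops.rearrange(context_layer, "(b s) h d -> b s h d", b=bsz)

assert old.shape == (bsz, seq, heads * head_dim)  # 3-D: does not match the 4-D query
assert new.shape == query.shape                   # 4-D: matches the input, as intended
```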