avoid mrope fusion op when running qwen2.5-vl on a+x machine (#4270)
### What this PR does / why we need it?
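Avoid the mrope fusion op when running qwen2.5-vl on an a+x machine (Ascend NPU with an x86 host CPU). On x86 hosts the code now falls back to the base `forward_oot` implementation as a temporary workaround until the mrope precision issue is fixed.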
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Tested text VQA accuracy on a G8600 with aisbench.
- vLLM version: v0.11.0
- vLLM main: 2918c1b49c
---------
Signed-off-by: 李少鹏 <lishaopeng21@huawei.com>
This commit is contained in:
@@ -24,6 +24,7 @@ from vllm.forward_context import get_forward_context
|
||||
from vllm.model_executor.layers.rotary_embedding import (
|
||||
DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding,
|
||||
YaRNScalingRotaryEmbedding)
|
||||
from vllm.platforms import CpuArchEnum
|
||||
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import enable_custom_op, is_310p
|
||||
@@ -405,7 +406,10 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
):
|
||||
if self.mrope_section != [16, 24, 24]:
|
||||
# TODO: This judgment will be removed once the mrope precision issue is fixed
|
||||
if self.mrope_section != [
|
||||
16, 24, 24
|
||||
] or NPUPlatform.get_cpu_architecture() == CpuArchEnum.X86:
|
||||
return super().forward_oot(positions, query, key)
|
||||
|
||||
import torch_npu
|
||||
@@ -428,4 +432,4 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
|
||||
mrope_section=mrope_section,
|
||||
rotary_mode='half')
|
||||
|
||||
return query, key
|
||||
return query, key
|
||||
|
||||
Reference in New Issue
Block a user