Cherry-pick from PR #4270 (#4285)

### What this PR does / why we need it?
Avoid the mrope fusion op when running Qwen2.5-VL on x86 machines.

---------

Signed-off-by: 李少鹏 <lishaopeng21@huawei.com>
This commit is contained in:
shaopeng-666
2025-11-19 22:32:02 +08:00
committed by GitHub
parent 277670730c
commit b6d59bdea2
2 changed files with 14 additions and 3 deletions

View File

@@ -24,6 +24,7 @@ from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.rotary_embedding import (
DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding,
YaRNScalingRotaryEmbedding)
from vllm.platforms import CpuArchEnum
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import enable_custom_op, is_310p
@@ -405,7 +406,10 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
query: torch.Tensor,
key: torch.Tensor,
):
if self.mrope_section != [16, 24, 24]:
# TODO: Remove this check once the mrope precision issue is fixed.
if self.mrope_section != [
16, 24, 24
] or NPUPlatform.get_cpu_architecture() == CpuArchEnum.X86:
return super().forward_oot(positions, query, key)
import torch_npu