Avoid the MRoPE fusion op when running Qwen2.5-VL on an A+X machine (#4270)

### What this PR does / why we need it?
Avoid the MRoPE fusion op when running Qwen2.5-VL on an A+X (Ascend + x86) machine, where the fused kernel is not applicable.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Tested text-VQA accuracy on a G8600 machine with AISBench.

- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

---------

Signed-off-by: 李少鹏 <lishaopeng21@huawei.com>
This commit is contained in:
shaopeng-666
2025-11-19 22:31:14 +08:00
committed by GitHub
parent c848da0687
commit 3653f33878
2 changed files with 15 additions and 4 deletions

View File

@@ -24,6 +24,7 @@ from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.rotary_embedding import (
DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding,
YaRNScalingRotaryEmbedding)
from vllm.platforms import CpuArchEnum
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import enable_custom_op, is_310p
@@ -405,7 +406,10 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
query: torch.Tensor,
key: torch.Tensor,
):
if self.mrope_section != [16, 24, 24]:
# TODO: This judgment will be removed once the mrope precision issue is fixed
if self.mrope_section != [
16, 24, 24
] or NPUPlatform.get_cpu_architecture() == CpuArchEnum.X86:
return super().forward_oot(positions, query, key)
import torch_npu
@@ -428,4 +432,4 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
mrope_section=mrope_section,
rotary_mode='half')
return query, key
return query, key