From b6d59bdea203ff56139fc3e58d74f934304c0abc Mon Sep 17 00:00:00 2001
From: shaopeng-666
Date: Wed, 19 Nov 2025 22:32:02 +0800
Subject: [PATCH] cherry-pick from PR 4270 (#4285)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
Avoid the mrope fusion op when running Qwen2.5-VL on x86 machines.

---------

Signed-off-by: 李少鹏
---
 tests/ut/ops/test_rotary_embedding.py | 11 +++++++++--
 vllm_ascend/ops/rotary_embedding.py   |  6 +++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tests/ut/ops/test_rotary_embedding.py b/tests/ut/ops/test_rotary_embedding.py
index 3a796ae..580a4fc 100644
--- a/tests/ut/ops/test_rotary_embedding.py
+++ b/tests/ut/ops/test_rotary_embedding.py
@@ -7,6 +7,7 @@ from transformers.configuration_utils import PretrainedConfig
 from vllm.config import ModelConfig, VllmConfig
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding)
+from vllm.platforms import CpuArchEnum
 
 from tests.ut.base import TestBase
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
@@ -424,11 +425,14 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
         return vllm_config
 
     @patch('torch_npu.npu_mrope')
+    @patch('vllm_ascend.platform.NPUPlatform.get_cpu_architecture')
     @patch('vllm.config.ModelConfig.__post_init__', MagicMock())
     @patch('vllm.config.VllmConfig.__post_init__', MagicMock())
     @patch('vllm.distributed.parallel_state._DP', MagicMock(world_size=1))
     @patch('vllm.distributed.parallel_state._TP', MagicMock(world_size=1))
-    def test_forward_oot_1d_positions(self, mock_npu_mrope):
+    def test_forward_oot_1d_positions(self, mock_cpu_arc, mock_npu_mrope):
+        mock_cpu_arc.return_value = CpuArchEnum.ARM
+
         mock_npu_mrope.return_value = (torch.zeros_like(self.query),
                                        torch.zeros_like(self.key))
 
@@ -443,11 +447,14 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
         self.assertEqual(result_q.shape, self.query.shape)
 
     @patch('torch_npu.npu_mrope')
+    @patch('vllm_ascend.platform.NPUPlatform.get_cpu_architecture')
     @patch('vllm.config.ModelConfig.__post_init__', MagicMock())
     @patch('vllm.config.VllmConfig.__post_init__', MagicMock())
     @patch('vllm.distributed.parallel_state._DP', MagicMock(world_size=1))
     @patch('vllm.distributed.parallel_state._TP', MagicMock(world_size=1))
-    def test_forward_oot_2d_positions(self, mock_npu_mrope):
+    def test_forward_oot_2d_positions(self, mock_cpu_arc, mock_npu_mrope):
+        mock_cpu_arc.return_value = CpuArchEnum.ARM
+
         mock_npu_mrope.return_value = (torch.zeros_like(self.query),
                                        torch.zeros_like(self.key))
 
diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py
index 0989455..0f8a68c 100644
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -24,6 +24,7 @@ from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding,
     YaRNScalingRotaryEmbedding)
+from vllm.platforms import CpuArchEnum
 
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import enable_custom_op, is_310p
@@ -405,7 +406,10 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
         query: torch.Tensor,
         key: torch.Tensor,
     ):
-        if self.mrope_section != [16, 24, 24]:
+        # TODO: remove this check once the mrope precision issue on x86 is fixed
+        if self.mrope_section != [
+                16, 24, 24
+        ] or NPUPlatform.get_cpu_architecture() == CpuArchEnum.X86:
             return super().forward_oot(positions, query, key)
 
         import torch_npu
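
For context, the gating logic this patch introduces can be illustrated with a minimal, self-contained sketch. This is not the vllm-ascend implementation; CpuArch, detect_cpu_arch, fused_mrope, and fallback_mrope below are hypothetical stand-ins for CpuArchEnum, NPUPlatform.get_cpu_architecture, the torch_npu.npu_mrope path, and MRotaryEmbedding.forward_oot respectively:

    # Minimal sketch of the arch-gated dispatch (hypothetical names; assumes
    # the same rule as the patch: use the fused kernel only for mrope_section
    # [16, 24, 24] on non-x86 hosts, otherwise fall back).
    import platform
    from enum import Enum
    from typing import Callable


    class CpuArch(Enum):
        X86 = "x86"
        ARM = "arm"


    def detect_cpu_arch() -> CpuArch:
        # Rough stand-in for NPUPlatform.get_cpu_architecture().
        machine = platform.machine().lower()
        return CpuArch.X86 if machine in ("x86_64", "amd64") else CpuArch.ARM


    def mrope_forward(mrope_section: list,
                      fused_mrope: Callable,
                      fallback_mrope: Callable,
                      *args):
        # Skip the fused op when its preconditions are not met: an unsupported
        # mrope_section layout, or an x86 host where the fused op currently
        # has a precision issue (per the patch's TODO).
        if mrope_section != [16, 24, 24] or detect_cpu_arch() == CpuArch.X86:
            return fallback_mrope(*args)
        return fused_mrope(*args)

The patch's unit tests pin the mocked architecture to CpuArchEnum.ARM so the existing assertions still exercise the fused npu_mrope path; on a real x86 host the new condition short-circuits into the base-class implementation instead.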