Cherry-pick from PR 4270 (#4285)
### What this PR does / why we need it?
Avoid the mrope fusion op (`torch_npu.npu_mrope`) when running Qwen2.5-VL on an x86 machine, where the fused kernel currently has a precision issue (see the TODO in the diff below).

---------

Signed-off-by: 李少鹏 <lishaopeng21@huawei.com>
Changes to the `TestAscendMRotaryEmbedding` unit tests, which now mock `NPUPlatform.get_cpu_architecture` so both `forward_oot` tests keep exercising the fused path:

```diff
@@ -7,6 +7,7 @@ from transformers.configuration_utils import PretrainedConfig
 from vllm.config import ModelConfig, VllmConfig
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding)
+from vllm.platforms import CpuArchEnum

 from tests.ut.base import TestBase
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
@@ -424,11 +425,14 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
         return vllm_config

     @patch('torch_npu.npu_mrope')
+    @patch('vllm_ascend.platform.NPUPlatform.get_cpu_architecture')
     @patch('vllm.config.ModelConfig.__post_init__', MagicMock())
     @patch('vllm.config.VllmConfig.__post_init__', MagicMock())
     @patch('vllm.distributed.parallel_state._DP', MagicMock(world_size=1))
     @patch('vllm.distributed.parallel_state._TP', MagicMock(world_size=1))
-    def test_forward_oot_1d_positions(self, mock_npu_mrope):
+    def test_forward_oot_1d_positions(self, mock_cpu_arc, mock_npu_mrope):
+        mock_cpu_arc.return_value = CpuArchEnum.ARM
+
         mock_npu_mrope.return_value = (torch.zeros_like(self.query),
                                        torch.zeros_like(self.key))

@@ -443,11 +447,14 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
         self.assertEqual(result_q.shape, self.query.shape)

     @patch('torch_npu.npu_mrope')
+    @patch('vllm_ascend.platform.NPUPlatform.get_cpu_architecture')
     @patch('vllm.config.ModelConfig.__post_init__', MagicMock())
     @patch('vllm.config.VllmConfig.__post_init__', MagicMock())
     @patch('vllm.distributed.parallel_state._DP', MagicMock(world_size=1))
     @patch('vllm.distributed.parallel_state._TP', MagicMock(world_size=1))
-    def test_forward_oot_2d_positions(self, mock_npu_mrope):
+    def test_forward_oot_2d_positions(self, mock_cpu_arc, mock_npu_mrope):
+        mock_cpu_arc.return_value = CpuArchEnum.ARM
+
         mock_npu_mrope.return_value = (torch.zeros_like(self.query),
                                        torch.zeros_like(self.key))
```
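Note the new argument order: stacked `unittest.mock.patch` decorators apply bottom-up, so the added `get_cpu_architecture` patch (sitting below `torch_npu.npu_mrope` in the stack) supplies `mock_cpu_arc` as the first mock argument, and forcing it to `CpuArchEnum.ARM` keeps both tests on the fused path regardless of the CI host's actual architecture. A minimal, self-contained sketch of the same pattern; `Platform`, `npu_mrope_stub`, and `forward_oot` here are hypothetical stand-ins, not the real vllm-ascend names:

```python
import unittest
from unittest.mock import patch


class Platform:
    """Hypothetical stand-in for vllm_ascend's NPUPlatform."""

    @classmethod
    def get_cpu_architecture(cls) -> str:
        return "X86"  # pretend the host is x86


def npu_mrope_stub() -> str:
    """Hypothetical stand-in for the fused torch_npu.npu_mrope kernel."""
    return "fused-result"


def forward_oot() -> str:
    # Mirrors the guard under test: use the fused kernel only off x86.
    if Platform.get_cpu_architecture() == "X86":
        return "fallback-result"
    return npu_mrope_stub()


class TestDispatch(unittest.TestCase):

    # Stacked patch decorators apply bottom-up: the bottom patch
    # supplies the first mock argument after `self` -- the same reason
    # the real test signature gained `mock_cpu_arc` before `mock_npu_mrope`.
    @patch(f"{__name__}.npu_mrope_stub")
    @patch(f"{__name__}.Platform.get_cpu_architecture")
    def test_arm_uses_fused_kernel(self, mock_cpu_arch, mock_mrope):
        mock_cpu_arch.return_value = "ARM"
        mock_mrope.return_value = "fused-result"
        self.assertEqual(forward_oot(), "fused-result")


if __name__ == "__main__":
    unittest.main()
```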
Changes to `AscendMRotaryEmbedding.forward_oot` in the rotary-embedding implementation:

```diff
@@ -24,6 +24,7 @@ from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding,
     YaRNScalingRotaryEmbedding)
+from vllm.platforms import CpuArchEnum

 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import enable_custom_op, is_310p
@@ -405,7 +406,10 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
         query: torch.Tensor,
         key: torch.Tensor,
     ):
-        if self.mrope_section != [16, 24, 24]:
+        # TODO: This judgment will be removed once the mrope precision issue is fixed
+        if self.mrope_section != [
+                16, 24, 24
+        ] or NPUPlatform.get_cpu_architecture() == CpuArchEnum.X86:
             return super().forward_oot(positions, query, key)

         import torch_npu
```
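The guard reads more clearly in its positive form: the fused `torch_npu.npu_mrope` kernel is used only when `mrope_section` is the Qwen2.5-VL `[16, 24, 24]` layout *and* the host CPU is not x86; every other case falls back to the base `MRotaryEmbedding.forward_oot`. A small sketch of just that predicate, with a local `CpuArchEnum` stand-in and a hypothetical `use_fused_mrope` helper so it runs without vLLM installed:

```python
from enum import Enum


class CpuArchEnum(Enum):
    """Local stand-in for vllm.platforms.CpuArchEnum."""
    X86 = "x86"
    ARM = "arm"


QWEN25_VL_MROPE_SECTION = [16, 24, 24]


def use_fused_mrope(mrope_section: list, cpu_arch: CpuArchEnum) -> bool:
    # De Morgan's dual of the new guard: fuse only for the supported
    # section layout on non-x86 hosts.
    return (mrope_section == QWEN25_VL_MROPE_SECTION
            and cpu_arch != CpuArchEnum.X86)


assert use_fused_mrope([16, 24, 24], CpuArchEnum.ARM)       # fused path
assert not use_fused_mrope([16, 24, 24], CpuArchEnum.X86)   # this PR's fix
assert not use_fused_mrope([8, 12, 12], CpuArchEnum.ARM)    # unsupported layout
```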