v0.10.1rc1

2025-09-09 09:40:35 +08:00
parent d6f6ef41fe
commit 9149384e03
432 changed files with 84698 additions and 1 deletions
--- a/tests/ut/torchair/ops/test_torchair_fused_moe.py
+++ b/tests/ut/torchair/ops/test_torchair_fused_moe.py
@@ -0,0 +1,410 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+from typing import List, TypedDict
+from unittest.mock import MagicMock, patch
+
+import pytest
+import torch
+import torch.nn as nn
+import torch_npu
+from pytest_mock import MockerFixture
+from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
+
+from vllm_ascend.ascend_forward_context import _get_fused_moe_state
+from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
+from vllm_ascend.quantization.quantizer import W8A8Quantizer
+from vllm_ascend.torchair.ops.torchair_fused_moe import (
+    TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
+from vllm_ascend.utils import AscendSocVersion, adapt_patch  # noqa E402
+
+adapt_patch(True)
+
+
+def mock_ep_and_mc2_group(mocker):
+    """Build a stand-in for an expert-parallel / MC2 communication group.
+
+    The returned mock reports rank 0 (both ``rank`` and ``rank_in_group``)
+    in a world of 4, carries the device group name ``"mock_group_ep"`` and
+    has an ``all_to_all`` mock returning a random ``(8, 8)`` tensor.
+
+    Args:
+        mocker: object exposing ``MagicMock`` (the pytest-mock fixture).
+    """
+    group = mocker.MagicMock()
+    group.configure_mock(
+        rank_in_group=0,
+        rank=0,
+        world_size=4,
+        device_group="mock_group_ep",
+        all_to_all=MagicMock(return_value=torch.randn(8, 8)),
+    )
+    return group
+
+
+def mock_dp_and_tp_group(mocker):
+    """Build a stand-in for a data-parallel / tensor-parallel group.
+
+    The returned mock reports ``rank_in_group`` 0 in a world of 2, carries
+    the device group name ``"mock_group"`` and has an ``all_gather`` mock
+    returning a random ``(10, 32)`` tensor.
+
+    Args:
+        mocker: object exposing ``MagicMock`` (the pytest-mock fixture).
+    """
+    group = mocker.MagicMock()
+    group.configure_mock(
+        rank_in_group=0,
+        world_size=2,
+        device_group="mock_group",
+        all_gather=MagicMock(return_value=torch.randn(10, 32)),
+    )
+    return group
+
+
+@pytest.fixture
+def mock_dist_env(mocker: MockerFixture):
+    """Patch the distributed/config environment while a test runs.
+
+    While the fixture is active the following are replaced:
+
+    * ``torch.distributed.get_rank`` -> 0 and ``get_world_size`` -> 4.
+    * ``get_ep_group`` / ``get_mc2_group`` in the torchair fused-MoE module
+      -> mocks from ``mock_ep_and_mc2_group``.
+    * ``get_tp_group`` / ``get_dp_group`` (in the torchair fused-MoE module
+      and in the patched vllm modules) -> mocks from ``mock_dp_and_tp_group``.
+    * ``torch.distributed.all_gather`` / ``all_to_all_single`` and the
+      module-level ``tensor_model_parallel_all_reduce`` /
+      ``data_parallel_reduce_scatter`` -> canned return values.
+    * ``get_ascend_config`` -> torchair graph and multistream MoE disabled,
+      ``expert_map_path`` None.
+    * ``determine_expert_map`` -> ``(3, tensor([0, 1, 2, -1, ...]))``.
+    * ``get_forward_context`` -> ``max_tokens_across_dp=10`` and
+      ``cu_tokens_across_dp_cpu=[5, 10]``.
+    * ``get_current_vllm_config`` -> tensor_parallel_size 2, max_num_seqs 4,
+      max_model_len 2048.
+
+    NOTE(review): the ``torch.distributed.all_gather`` patch returns a
+    ``MagicMock`` (whose own call returns a tensor) rather than a tensor;
+    confirm that this is intentional.
+    """
+    # init dist env patch
+
+    with patch('torch.distributed.get_rank', return_value=0), \
+         patch('torch.distributed.get_world_size', return_value=4), \
+         patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
+         patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
+         patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
+         patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
+         patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
+         patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
+         patch('torch.distributed.all_gather', return_value=MagicMock(return_value=torch.randn(10,32))), \
+         patch('torch.distributed.all_to_all_single', return_value=torch.randn(8, 32)), \
+         patch('vllm_ascend.torchair.ops.torchair_fused_moe.tensor_model_parallel_all_reduce',
+               return_value=torch.randn(5, 32)), \
+         patch('vllm_ascend.torchair.ops.torchair_fused_moe.data_parallel_reduce_scatter',
+               return_value=torch.randn(5, 32)), \
+         patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
+               return_value=mock_dp_and_tp_group(mocker)), \
+         patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config',
+               return_value=MagicMock(
+                   torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
+                   expert_map_path=None
+               )), \
+         patch('vllm_ascend.torchair.ops.torchair_fused_moe.determine_expert_map',
+               return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \
+         patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context',
+               return_value=MagicMock(
+                   max_tokens_across_dp=10,
+                   dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10])
+               )), \
+        patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_current_vllm_config',
+               return_value=MagicMock(
+                   parallel_config=MagicMock(tensor_parallel_size=2),
+                   scheduler_config=MagicMock(max_num_seqs=4),
+                   model_config=MagicMock(max_model_len=2048)
+               )):
+        yield
+
+
+@pytest.fixture
+def mock_moe_env(mocker: MockerFixture):
+    """Patch the ``torch_npu`` MoE kernels with canned return values.
+
+    Replaces gating/top-k, routing init/finalize, expert-token counting,
+    dispatch/combine, grouped matmul and swiglu on ``torch_npu`` so the MoE
+    code paths can run without the real kernels. When the installed
+    ``torch_npu`` exposes ``npu_moe_distribute_dispatch_v2``, the ``_v2``
+    dispatch and combine entry points are patched as well.
+
+    The ``mocker`` argument is not referenced in the body.
+    """
+    # init moe env patch
+
+    with patch('torch_npu.npu_moe_gating_top_k', return_value=(
+            torch.randn(8, 2),
+            torch.randint(0, 8, (8, 2)),
+            None
+        )), \
+        patch('torch_npu.npu_moe_init_routing', return_value=(
+                torch.randn(8, 2),
+                torch.randint(0, 8, (8, 2)),
+                torch.tensor([0, 1, 2, 4, 6, 2, 7, 1])
+        )), \
+        patch("torch_npu.npu_moe_compute_expert_tokens", return_value=(
+                torch.randn(8, 2)
+        )), \
+        patch("torch_npu.npu_moe_distribute_dispatch", return_value=(
+                torch.randn(16, 2)
+        )), \
+        patch("torch_npu.npu_moe_distribute_combine", return_value=(
+                torch.randn(16, 2)
+        )), \
+        patch("torch_npu.npu_grouped_matmul", return_value=(
+                [torch.randn(16, 2)]
+        )), \
+        patch("torch_npu.npu_swiglu", return_value=(
+                torch.randn(16, 2)
+        )), \
+        patch("torch_npu.npu_moe_gating_top_k_softmax", return_value=(
+                torch.randn(8, 2),
+                torch.randint(0, 8, (8, 2)),
+                torch.tensor([0, 1, 2, 4, 6, 2, 7, 1])
+        )), \
+        patch("torch_npu.npu_moe_finalize_routing", return_value=(
+                torch.randn(16, 2)
+        )):
+        # The _v2 kernels only exist in some torch_npu builds; patching a
+        # missing attribute would fail, hence the hasattr guard.
+        if hasattr(torch_npu, 'npu_moe_distribute_dispatch_v2'):
+            with patch("torch_npu.npu_moe_distribute_dispatch_v2", return_value=(
+                torch.randn(16, 2))), \
+                patch("torch_npu.npu_moe_distribute_combine_v2", return_value=(
+                torch.randn(16, 2))):
+                yield
+        else:
+            yield
+
+
+@pytest.fixture
+def default_moe_config():
+    """Provide the baseline keyword arguments used to construct a MoE layer.
+
+    Returns:
+        dict with ``num_experts``, ``top_k``, ``hidden_size`` and
+        ``intermediate_size``.
+    """
+    return dict(
+        num_experts=8,
+        top_k=2,
+        hidden_size=512,
+        intermediate_size=1024,
+    )
+
+
+@pytest.fixture
+def moe_method(mock_dist_env):
+    """Return an unquantized torchair MoE method built on a mocked MoE config.
+
+    Requests ``mock_dist_env`` so the distributed/config patches are active
+    while the method object is constructed.
+    """
+    moe = MagicMock()
+    # ``moe_parallel_config`` is accessed as a plain attribute in this file
+    # (e.g. ``moe.moe_parallel_config.ep_size``), not called. Configuring
+    # ``return_value`` would leave ``ep_size`` as an auto-created MagicMock,
+    # so set the value on the attribute itself.
+    moe.moe_parallel_config.ep_size = 4
+    return TorchairAscendUnquantizedFusedMoEMethod(moe)
+
+
+class Device(TypedDict):
+    """Typed dict for one device entry: its id and a list of expert ids.
+
+    NOTE(review): presumably one device record of an expert-map layout;
+    not referenced by the tests visible in this file - confirm it is needed.
+    """
+    device_id: int
+    device_expert: List[int]
+
+
+class Layer(TypedDict):
+    """Typed dict for one layer entry: id, device count and ``Device`` list.
+
+    NOTE(review): presumably ``device_count`` equals ``len(device_list)``;
+    nothing in this file enforces that.
+    """
+    layer_id: int
+    device_count: int
+    device_list: List[Device]
+
+
+class MockData(TypedDict):
+    """Typed dict for the top-level record: MoE layer count and ``Layer`` list.
+
+    NOTE(review): not referenced by the tests visible in this file.
+    """
+    moe_layer_count: int
+    layer_list: List[Layer]
+
+
+class MockQuantMethod(nn.Module):
+    """Fake quant method whose ``apply`` is a pre-programmed ``MagicMock``.
+
+    With a truthy ``shared_experts`` the mocked ``apply`` returns a pair of
+    random tensors shaped ``(num_tokens, 32)`` and ``(num_tokens, 10)``;
+    otherwise it returns a single ``(num_tokens, 32)`` tensor.
+    """
+
+    def __init__(self, shared_experts, num_tokens):
+        super().__init__()
+        routed_out = torch.randn(num_tokens, 32)
+        canned = ((routed_out, torch.randn(num_tokens, 10))
+                  if shared_experts else routed_out)
+        self.apply = MagicMock(return_value=canned)
+
+
+class MockFusedMoEMethod(FusedMoEMethodBase):
+    """No-op ``FusedMoEMethodBase`` subclass used as a placeholder quant method.
+
+    Both ``create_weights`` and ``apply`` do nothing; the class exists so a
+    mocked quant config can hand back an instance of the base type.
+    """
+    # Class-level mock passed to the base constructor; shared by all instances.
+    moe = MagicMock()
+
+    def __init__(self):
+        super().__init__(self.moe)
+
+    def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
+                       params_dtype: torch.dtype, **extra_weight_attrs):
+        """Intentionally empty: no weights are registered on ``layer``."""
+        pass
+
+    def apply(self, hidden_states: torch.Tensor,
+              expert_weights: torch.Tensor) -> torch.Tensor:
+        """Intentionally empty: returns ``None`` despite the annotation."""
+        pass
+
+
+class TestTorchairAscendFusedMoe:
+    """Construction and forward tests for ``TorchairAscendFusedMoE``.
+
+    Every test requests ``mock_dist_env`` so the layer is built against the
+    patched distributed/config environment.
+    """
+
+    def test_init_no_quant(self, mock_dist_env, default_moe_config):
+        """Build the layer without a quant config and check argument validation."""
+        layer = TorchairAscendFusedMoE(**default_moe_config)
+
+        layer.w13_weight = nn.Parameter(
+            torch.randn(default_moe_config['num_experts'],
+                        default_moe_config['intermediate_size'] * 2,
+                        default_moe_config['hidden_size']))
+        layer.w2_weight = nn.Parameter(
+            torch.randn(default_moe_config['num_experts'],
+                        default_moe_config['hidden_size'],
+                        default_moe_config['intermediate_size']))
+
+        assert layer.num_experts == default_moe_config['num_experts']
+        assert layer.top_k == default_moe_config['top_k']
+        assert hasattr(layer, 'w13_weight')
+        assert hasattr(layer, 'w2_weight')
+
+        # use_grouped_topk=True without any further grouping arguments is
+        # expected to raise AssertionError during construction.
+        with pytest.raises(AssertionError):
+            error_config = default_moe_config.copy()
+            error_config['use_grouped_topk'] = True
+            layer = TorchairAscendFusedMoE(**error_config)
+
+        # An unknown scoring_func value is expected to raise ValueError.
+        with pytest.raises(ValueError):
+            error_config = default_moe_config.copy()
+            error_config['scoring_func'] = "random"
+            layer = TorchairAscendFusedMoE(**error_config)
+
+    def test_init_with_quant(self, mock_dist_env, default_moe_config):
+        """With a non-skipped quant config the layer gets an ``AscendFusedMoEMethod``."""
+        mock_quant_config = MagicMock()
+        mock_quant_method = MockFusedMoEMethod()
+        mock_quant_config.get_quant_method.return_value = mock_quant_method
+        mock_quant_config.is_layer_skipped_ascend.return_value = False
+        # Force the quantizer lookup to resolve to W8A8Quantizer.
+        with patch(
+                'vllm_ascend.quantization.quantizer.AscendQuantizer.get_quantizer',
+                return_value=W8A8Quantizer):
+            moe = TorchairAscendFusedMoE(**default_moe_config,
+                                         quant_config=mock_quant_config)
+
+            assert moe.quant_method is not None
+            assert isinstance(moe.quant_method, AscendFusedMoEMethod)
+
+    def test_init_with_mixed_quant(self, mock_dist_env, default_moe_config):
+        """When the quant config reports the layer as skipped, the unquantized method is used."""
+        mock_quant_config = MagicMock()
+        mock_quant_method = MockFusedMoEMethod()
+        mock_quant_config.get_quant_method.return_value = mock_quant_method
+        mock_quant_config.is_layer_skipped_ascend.return_value = True
+
+        moe = TorchairAscendFusedMoE(**default_moe_config,
+                                     quant_config=mock_quant_config)
+
+        assert moe.quant_method is not None
+        assert isinstance(moe.quant_method,
+                          TorchairAscendUnquantizedFusedMoEMethod)
+
+    @pytest.mark.parametrize(
+        "others_param",
+        [[None,
+          MagicMock(return_value=torch.randn(5, 32)), False, 5, None],
+         [2, None, False, 5, None], [None, None, True, 5, None],
+         [None, None, False, 1, None], [None, None, True, 5, 1],
+         [None, None, False, 5, 1]])
+    def test_forward(self, mock_dist_env, default_moe_config, others_param):
+        """
+        Run ``forward`` with a mocked quant method.
+
+        ``others_param`` is ``[top_k, shared_experts, is_prefill, num_tokens,
+        ep_size]``. Cases, in parameter order:
+        1 shared_experts is provided
+        2 an explicit top_k is passed
+        3 is_prefill is True
+        4 a single token (decode)
+        5 ep_size is 1 and is_prefill is True
+        6 ep_size is 1 and is_prefill is False
+        """
+        top_k, shared_experts, is_prefill, num_tokens, ep_size = others_param
+        inputs = torch.randn(num_tokens, 32)
+        router_logits = torch.randn(num_tokens, 8)
+        moe = TorchairAscendFusedMoE(**default_moe_config)
+
+        # Only the ep_size == 1 cases override the layer's parallel config.
+        if ep_size == 1:
+            moe.moe_parallel_config.ep_size = 1
+
+        # Swap in a quant method whose apply() returns canned tensors.
+        moe.quant_method = MockQuantMethod(shared_experts, num_tokens)
+        forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens,
+                                                         dtype=torch.bool),
+                                    padded_num_tokens=num_tokens)
+        # Re-patch get_forward_context on top of the fixture's patch so this
+        # test's context (with mc2_mask / padded_num_tokens) is the one seen.
+        with patch(
+                "vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context",
+                return_value=forward_context):
+            output = moe.forward(inputs,
+                                 router_logits,
+                                 is_prefill=is_prefill,
+                                 top_k=top_k,
+                                 shared_experts=shared_experts)
+
+        moe.quant_method.apply.assert_called_once()
+
+        # With shared experts the mocked apply returns a pair of tensors.
+        if shared_experts:
+            assert output[0].shape == (num_tokens, 32)
+            assert output[1].shape == (num_tokens, 10)
+        else:
+            assert output.shape == (num_tokens, 32)
+
+    def test_forward_ms_fused_moe_comp(self, mock_dist_env,
+                                       default_moe_config):
+        """Call ``_forward_ms_fused_moe_comp`` directly with a mocked quant method."""
+        inputs = torch.randn(5, 32)
+        router_logits = torch.randn(5, 8)
+        moe = TorchairAscendFusedMoE(**default_moe_config)
+
+        moe.quant_method = MockQuantMethod(None, 5)
+        output = moe._forward_ms_fused_moe_comp(inputs,
+                                                router_logits,
+                                                is_prefill=False,
+                                                real_top_k=1)
+
+        moe.quant_method.apply.assert_called_once()
+
+        assert output.shape == (5, 32)
+
+
+class TestTorchairAscendUnquantizedFusedMoEMethod:
+    """Tests for ``TorchairAscendUnquantizedFusedMoEMethod`` (via ``moe_method``)."""
+
+    def test_process_weights_after_loading(self, moe_method, mock_dist_env):
+        """After processing, both weights are non-trainable ``nn.Parameter`` objects."""
+        layer = MagicMock()
+        layer.w13_weight.data = torch.randn(16, 32)
+        layer.w2_weight.data = torch.randn(16, 32)
+
+        moe_method.process_weights_after_loading(layer)
+
+        assert isinstance(layer.w13_weight, torch.nn.Parameter)
+        assert isinstance(layer.w2_weight, torch.nn.Parameter)
+        assert not layer.w13_weight.requires_grad
+        assert not layer.w2_weight.requires_grad
+
+    @pytest.mark.parametrize("others_param",
+                             [[256, 4], [128, 1], [128, 1], [128, 4]])
+    def test_apply_without_expert_map(self, moe_method, mock_dist_env,
+                                      mock_moe_env, others_param):
+        """
+        Run ``apply`` without an expert map.
+
+        ``others_param`` is ``[global_num_experts, ep_size]``. Intended
+        coverage according to the original notes, in parameter order:
+        1 is_deepseek_v3_r1=True with fused_experts_with_all2all
+        2 select_experts with fused_experts
+        3 select_gating_topk_softmax_experts with fused_experts
+        4 select_experts with fused_experts_with_all2all_buffer
+
+        NOTE(review): cases 2 and 3 use the identical parameters [128, 1],
+        so the parameters alone cannot make them take different paths -
+        confirm whether case 3 was meant to differ.
+        """
+        global_num_experts, ep_size = others_param
+        is_prefill = False
+        # Only the 256-expert case is treated as DeepSeek V3/R1 here.
+        is_deepseek_v3_r1 = global_num_experts == 256
+        forward_context = MagicMock(fused_moe_state=_get_fused_moe_state(
+            ep_size, is_prefill, is_deepseek_v3_r1))
+        with patch(
+                "vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context",
+                return_value=forward_context):
+            moe_method.ep_size = ep_size
+            x = torch.randn(8, 2, 2)
+            router_logits = torch.randn(8, 8)
+            layer = MagicMock()
+            layer.w13_weight = torch.randn(8, 16, 1)
+            layer.w2_weight = torch.randn(16, 8, 1)
+            result = moe_method.apply(layer=layer,
+                                      x=x,
+                                      router_logits=router_logits,
+                                      top_k=2,
+                                      renormalize=True,
+                                      global_num_experts=global_num_experts,
+                                      is_prefill=is_prefill)
+
+            # ep_size == 1 yields a (16, 2) result - presumably the canned
+            # output of a mocked kernel in mock_moe_env (TODO confirm which).
+            if ep_size == 1:
+                assert result.shape == (16, 2)
+            else:
+                assert result.shape == x.shape
+
+    @pytest.mark.parametrize("others_param", [16, 1, 4])
+    def test_apply_with_expert_map(self, moe_method, mock_dist_env,
+                                   mock_moe_env, others_param):
+        """
+        Run ``apply`` with an expert map for ``ep_size`` 16, 1 and 4.
+
+        The original notes listed four intended paths:
+        1 select_experts with fused_experts_with_mc2
+        2 select_experts with fused_experts_with_all2all_buffer
+        3 select_experts with fused_experts_with_all2all
+        4 select_experts with fused_experts
+
+        NOTE(review): only three ep_size values are parametrized, so at
+        most three of these paths run - confirm the mapping and whether a
+        fourth case is missing.
+        """
+        ep_size = others_param
+        is_prefill = False
+        forward_context = MagicMock(
+            fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True))
+        # The SoC version is pinned to A3 for every case.
+        with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
+             patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
+            expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
+            moe_method.ep_size = ep_size
+            x = torch.randn(8, 2, 2)
+            # The ep_size == 1 case feeds a flattened 2-D input.
+            if ep_size == 1:
+                x = x.view(-1, 2)
+            router_logits = torch.randn(8, 8)
+            layer = MagicMock()
+            layer.w13_weight = torch.randn(8, 16, 1)
+            layer.w2_weight = torch.randn(16, 8, 1)
+            result = moe_method.apply(layer=layer,
+                                      x=x,
+                                      router_logits=router_logits,
+                                      top_k=2,
+                                      renormalize=True,
+                                      global_num_experts=128,
+                                      expert_map=expert_map,
+                                      is_prefill=is_prefill)
+
+            if ep_size == 16 or ep_size == 1:
+                assert result.shape == (16, 2)
+            else:
+                assert result.shape == x.shape
--- a/tests/ut/torchair/ops/test_torchair_rotary_embedding.py
+++ b/tests/ut/torchair/ops/test_torchair_rotary_embedding.py
@@ -0,0 +1,332 @@
+import math
+from unittest.mock import MagicMock, patch
+
+import torch
+
+from tests.ut.base import TestBase
+from vllm_ascend.torchair.ops.torchair_rotary_embedding import (
+    custom_rotary_embedding_enabled, native_rope_deepseek_forward,
+    rope_forward_oot, rotate_half, yarn_find_correction_dim, yarn_get_mscale)
+
+
+class TestCustomRotaryEmbeddingEnabled(TestBase):
+    """Checks when ``custom_rotary_embedding_enabled`` returns True or False."""
+
+    def setUp(self):
+        # Common setup for tests
+        self.positions = torch.tensor([1, 2, 3])
+        self.query = torch.randn(3, 4, dtype=torch.float16)
+        self.key = torch.randn(3, 4, dtype=torch.float16)
+        self.head_size = 32
+        self.cos_sin_cache = torch.randn(3, 4)
+
+        # Mock `self` object shaped for rope_forward_oot. The test in this
+        # class only reads self.query and self.head_size, so this mock (and
+        # positions/key/cos_sin_cache above) is not exercised here.
+        self.mock_self = MagicMock()
+        self.mock_self.head_size = self.head_size
+        self.mock_self.cos_sin_cache = self.cos_sin_cache
+        self.mock_self.is_neox_style = True
+        self.mock_self.forward_native.return_value = (self.query, self.key)
+
+    def test_custom_rotary_embedding_enabled(self):
+        """Flip one condition at a time and expect False for each deviation."""
+        # Test when all conditions are True
+        with patch(
+                'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op',
+                return_value=True):
+            result = custom_rotary_embedding_enabled(self.query, True,
+                                                     self.head_size)
+            self.assertTrue(result)
+
+        # Test when dtype is not float16
+        with patch(
+                'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op',
+                return_value=True):
+            query = self.query.to(torch.float32)
+            result = custom_rotary_embedding_enabled(query, True,
+                                                     self.head_size)
+            self.assertFalse(result)
+
+        # Test when neox_style is False
+        with patch(
+                'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op',
+                return_value=True):
+            result = custom_rotary_embedding_enabled(self.query, False,
+                                                     self.head_size)
+            self.assertFalse(result)
+
+        # Test when head_size is not divisible by 32 (33 is passed)
+        with patch(
+                'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op',
+                return_value=True):
+            result = custom_rotary_embedding_enabled(self.query, True,
+                                                     self.head_size + 1)
+            self.assertFalse(result)
+
+        # Test when custom op is disabled
+        with patch(
+                'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op',
+                return_value=False):
+            result = custom_rotary_embedding_enabled(self.query, True,
+                                                     self.head_size)
+            self.assertFalse(result)
+
+
+class TestRopeForwardOot(TestBase):
+    """Exercises the branches of ``rope_forward_oot`` with mocked backends.
+
+    Stacked ``@patch`` decorators inject mocks bottom-up: the decorator
+    closest to the ``def`` supplies the first mock argument.
+    """
+
+    def setUp(self):
+        # Common setup for tests
+        self.positions = torch.tensor([1, 2, 3])
+        self.query = torch.randn(3, 4, dtype=torch.float16)
+        self.key = torch.randn(3, 4, dtype=torch.float16)
+        self.head_size = 32
+        self.cos_sin_cache = torch.randn(3, 4)
+
+        # Mock self object for rope_forward_oot
+        self.mock_self = MagicMock()
+        self.mock_self.head_size = self.head_size
+        self.mock_self.cos_sin_cache = self.cos_sin_cache
+        self.mock_self.is_neox_style = True
+        self.mock_self.forward_native.return_value = (self.query, self.key)
+
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config')
+    def test_rope_forward_oot_torchair_enabled_base(self,
+                                                    mock_get_ascend_config):
+        """With torchair graph enabled the call is delegated to ``forward_native``."""
+        # Setup mock for torchair enabled
+        mock_config = MagicMock()
+        mock_config.torchair_graph_config.enabled = True
+        mock_get_ascend_config.return_value = mock_config
+
+        result_q, result_k = rope_forward_oot(self.mock_self, self.positions,
+                                              self.query, self.key)
+
+        # The trailing None is the offsets argument left at its default.
+        self.mock_self.forward_native.assert_called_once_with(
+            self.positions, self.query, self.key, None)
+        self.assertTrue(torch.equal(result_q, self.query))
+        self.assertTrue(torch.equal(result_k, self.key))
+
+    @patch('torch.ops._C')
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config')
+    @patch('vllm_ascend.torchair.ops.torchair_rotary_embedding.is_310p',
+           return_value=False)
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.custom_rotary_embedding_enabled',
+        return_value=True)
+    @patch('torch.ops._npu_rotary_embedding')
+    def test_rope_forward_oot_custom_kernel(self, mock_rotary_embedding,
+                                            mock_custom_enabled, mock_is_310p,
+                                            mock_get_ascend_config, mock__c):
+        """Custom-op path: torchair off, custom kernel enabled, not a 310P device.
+
+        NOTE(review): this test patches ``torch.ops._npu_rotary_embedding``
+        while the other tests patch ``torch_npu._npu_rotary_embedding``, and
+        ``mock_rotary_embedding`` is never asserted on - confirm the target.
+        """
+        mock_config = MagicMock()
+        mock_config.torchair_graph_config.enabled = False
+        mock_get_ascend_config.return_value = mock_config
+
+        # Setup mock for custom kernel path
+
+        mock__c.rotary_embedding.return_value = self.query, self.key
+
+        result_q, result_k = rope_forward_oot(self.mock_self, self.positions,
+                                              self.query, self.key)
+
+        # Only shapes are checked; no call assertions are made on the mocks.
+        self.assertEqual(result_q.shape, self.query.shape)
+        self.assertEqual(result_k.shape, self.key.shape)
+
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config')
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.custom_rotary_embedding_enabled',
+        return_value=False)
+    @patch('torch_npu._npu_rotary_embedding')
+    def test_rope_forward_oot_contiguous(self, mock_npu_rotary,
+                                         mock_custom_enabled,
+                                         mock_get_ascend_config):
+        """Non-contiguous inputs still reach the NPU op once and keep their shape."""
+        mock_config = MagicMock()
+        mock_config.torchair_graph_config.enabled = False
+        mock_get_ascend_config.return_value = mock_config
+
+        # Test contiguous path when custom is disabled
+        # (transposing makes the inputs non-contiguous views)
+        non_contig_query = self.query.transpose(0, 1)
+        non_contig_key = self.key.transpose(0, 1)
+
+        result_q, result_k = rope_forward_oot(self.mock_self, self.positions,
+                                              non_contig_query, non_contig_key)
+
+        mock_npu_rotary.assert_called_once()
+        self.assertEqual(result_q.shape, non_contig_query.shape)
+        self.assertEqual(result_k.shape, non_contig_key.shape)
+
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config')
+    def test_rope_forward_oot_with_offsets(self, mock_get_ascend_config):
+        """Passing ``offsets`` with torchair disabled raises ``NotImplementedError``."""
+        mock_config = MagicMock()
+        mock_config.torchair_graph_config.enabled = False
+        mock_get_ascend_config.return_value = mock_config
+
+        # Test that NotImplementedError is raised when offsets is provided
+        offsets = torch.tensor([1, 2, 3])
+        with self.assertRaises(NotImplementedError):
+            rope_forward_oot(self.mock_self, self.positions, self.query,
+                             self.key, offsets)
+
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config')
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.custom_rotary_embedding_enabled',
+        return_value=False)
+    @patch('torch_npu._npu_rotary_embedding')
+    def test_rope_forward_oot_neox_style_override(self, mock_npu_rotary,
+                                                  mock_custom_enabled,
+                                                  mock_get_ascend_config):
+        """``is_neox_style_override=False`` wins over the module's ``is_neox_style=True``."""
+        mock_config = MagicMock()
+        mock_config.torchair_graph_config.enabled = False
+        mock_get_ascend_config.return_value = mock_config
+
+        # Test neox_style override
+        result_q, result_k = rope_forward_oot(self.mock_self,
+                                              self.positions,
+                                              self.query,
+                                              self.key,
+                                              is_neox_style_override=False)
+
+        # Check that neox_style=False was passed to the NPU function
+        # (asserted on the last positional argument of the call)
+        args, kwargs = mock_npu_rotary.call_args
+        self.assertFalse(args[-1])
+
+
+class MockRopeModule:
+    """Minimal attribute holder standing in for a rotary-embedding module.
+
+    Stores the given ``max_seq_len`` and ``is_neox_style``, starts with
+    empty cos/sin caches, and fixes ``rotary_dim`` and ``base`` at 1.
+    """
+
+    def __init__(self, max_seq_len=2048, is_neox_style=True):
+        self.max_seq_len, self.is_neox_style = max_seq_len, is_neox_style
+        # Caches start out unset.
+        self.cos_cached = self.sin_cached = None
+        self.rotary_dim = self.base = 1
+
+
+class TestNativeRopeDeepseekForward(TestBase):
+    """Shape checks for ``native_rope_deepseek_forward`` with ``rope_forward_oot`` mocked.
+
+    In every test the mocked ``rope_forward_oot`` returns the input query/key
+    unchanged, so only output shapes are checked.
+    """
+
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
+    def test_native_rope_deepseek_forward_base(self, mock_rope_forward_oot):
+        """Default module, 3-D query and key: output shapes match the inputs."""
+        module = MockRopeModule()
+        positions = torch.tensor([1, 2, 3])
+        query = torch.randn(1, 8, 128)
+        key = torch.randn(1, 8, 128)
+
+        mock_rope_forward_oot.return_value = (query, key)
+
+        q_pe, k_pe = native_rope_deepseek_forward(module, positions, query,
+                                                  key)
+
+        assert q_pe.shape == query.shape
+        assert k_pe.shape == key.shape
+
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding._set_cos_sin_cache'
+    )
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
+    def test_native_rope_deepseek_forward_cache_handling(
+            self, mock_rope_forward_oot, mock_set_cache):
+        """Call with ``max_seq_len`` (2048) larger than the module's (1024).
+
+        NOTE(review): presumably this should trigger a cache refresh via the
+        patched ``_set_cos_sin_cache``, but ``mock_set_cache`` is never
+        asserted on - consider asserting the call.
+        """
+        # Test cache situation is true
+        module = MockRopeModule(max_seq_len=1024)
+        positions = torch.tensor([1, 2, 3])
+        query = torch.randn(1, 8, 128)
+        key = torch.randn(1, 8, 128)
+
+        mock_rope_forward_oot.return_value = (query, key)
+
+        q_pe, k_pe = native_rope_deepseek_forward(module,
+                                                  positions,
+                                                  query,
+                                                  key,
+                                                  max_seq_len=2048)
+
+        assert q_pe.shape == query.shape
+        assert k_pe.shape == key.shape
+
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
+    def test_native_rope_deepseek_forward_key_reshaping(
+            self, mock_rope_forward_oot):
+        """A 2-D key of shape (1, 128) comes back with the same 2-D shape."""
+        module = MockRopeModule()
+        positions = torch.tensor([1, 2, 3])
+        query = torch.randn(1, 8, 128)
+        key = torch.randn(1, 128)
+
+        mock_rope_forward_oot.return_value = (query, key)
+
+        q_pe, k_pe = native_rope_deepseek_forward(module, positions, query,
+                                                  key)
+
+        assert q_pe.shape == query.shape
+        assert k_pe.shape == (1, 128)
+
+    @patch(
+        'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
+    def test_native_rope_deepseek_forward_non_neox_style(
+            self, mock_rope_forward_oot):
+        """Same shape check with the module configured as ``is_neox_style=False``."""
+        module = MockRopeModule(is_neox_style=False)
+        positions = torch.tensor([1, 2, 3])
+        query = torch.randn(1, 8, 128)
+        key = torch.randn(1, 8, 128)
+
+        mock_rope_forward_oot.return_value = (query, key)
+
+        q_pe, k_pe = native_rope_deepseek_forward(module, positions, query,
+                                                  key)
+
+        assert q_pe.shape == query.shape
+        assert k_pe.shape == key.shape
+
+
+class TestRotateHalf(TestBase):
+    """Checks ``rotate_half`` on a small even-length vector."""
+
+    def test_rotate_half_even_dim(self):
+        # [1, 2, 3, 4] is expected to become [-3, -4, 1, 2].
+        rotated = rotate_half(torch.tensor([1.0, 2.0, 3.0, 4.0]))
+        self.assertTrue(
+            torch.allclose(rotated, torch.tensor([-3.0, -4.0, 1.0, 2.0])))
+
+
+class TestYarnFindCorrectionDim(TestBase):
+    """Compares ``yarn_find_correction_dim`` with a hand-computed reference."""
+
+    def test_basic_case(self):
+        # Representative inputs.
+        num_rotations, dim = 100, 512
+        base, max_position_embeddings = 10000, 2048
+
+        actual = yarn_find_correction_dim(num_rotations, dim, base,
+                                          max_position_embeddings)
+
+        # Reference: dim * ln(max_pos / (rotations * 2 * pi)) / (2 * ln(base))
+        ratio = torch.tensor(max_position_embeddings) / (num_rotations * 2 *
+                                                         torch.pi)
+        numerator = dim * torch.log(ratio)
+        denominator = 2 * torch.log(torch.tensor(base))
+
+        self.assertTrue(torch.allclose(actual, numerator / denominator))
+
+
+class TestYarnGetMscale(TestBase):
+    """Checks ``yarn_get_mscale`` at and around the ``scale == 1`` threshold."""
+
+    def test_scale_less_than_or_equal_1(self):
+        # Scales at or below 1 are expected to give exactly 1.0.
+        for scale in (0.5, 1.0, 0.999):
+            self.assertEqual(yarn_get_mscale(scale=scale), 1.0)
+
+    def test_scale_greater_than_1(self):
+        # Each entry is (scale, mscale, expected result).
+        test_cases = (
+            (2.0, 1.0, 1.0 + 0.1 * math.log(2.0)),
+            (10.0, 1.0, 1.0 + 0.1 * math.log(10.0)),
+            (5.0, 2.0, 1.0 + 0.2 * math.log(5.0)),
+            (math.e, 1.0, 1.0 + 0.1),
+        )
+
+        for scale, mscale, expected in test_cases:
+            self.assertAlmostEqual(
+                yarn_get_mscale(scale, mscale),
+                expected,
+                places=6,
+                msg=f"Failed for scale={scale}, mscale={mscale}")