diff --git a/tests/ut/models/test_deepseek_v2.py b/tests/ut/models/test_deepseek_v2.py
new file mode 100644
index 0000000..f3a7d1a
--- /dev/null
+++ b/tests/ut/models/test_deepseek_v2.py
@@ -0,0 +1,309 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+from types import SimpleNamespace
+from unittest.mock import Mock, patch
+
+import pytest
+import torch
+from transformers import PretrainedConfig
+from vllm.config import CacheConfig
+from vllm.distributed.parallel_state import GroupCoordinator
+
+from vllm_ascend.models.deepseek_v2 import (
+    CustomDeepseekV2DecoderLayer, CustomDeepseekV2ForCausalLM,
+    CustomDeepseekV2MergedReplicatedLinear, CustomDeepseekV2MLAAttention,
+    CustomDeepseekV2MLP, CustomDeepseekV2MoE,
+    CustomDeepseekV2RowParallelLinear,
+    CustomDeepseekV2RowParallelLinearReplaceAllreduce,
+    CustomDeepseekV2SiluAndMul)
+
+
+@pytest.fixture
+def base_config():
+    config = PretrainedConfig(
+        hidden_size=128,
+        num_attention_heads=8,
+        num_hidden_layers=2,
+        intermediate_size=256,
+        hidden_act="silu",
+        rms_norm_eps=1e-6,
+        rope_theta=10000.0,
+        max_position_embeddings=2048,
+        n_routed_experts=4,
+        n_shared_experts=1,
+        moe_intermediate_size=256,
+        num_experts_per_tok=2,
+        routed_scaling_factor=1.0,
+        first_k_dense_replace=0,
+        moe_layer_freq=1,
+        kv_lora_rank=16,
+        qk_nope_head_dim=16,
+        qk_rope_head_dim=16,
+        v_head_dim=32,
+        topk_method="noaux_tc",
+        scoring_func="softmax",
+        norm_topk_prob=True,
+        n_group=1,
+        topk_group=1,
+        vocab_size=10000,
+    )
+    return config
+
+
+@pytest.fixture
+def vllm_config(base_config):
+    model_config = SimpleNamespace(
+        hf_config=base_config,
+        tensor_parallel_size=1,
+        dtype=torch.float32,
+        use_mla=False,
+        quant_config=None,
+        max_model_len=2048,
+    )
+
+    cache_config = CacheConfig()
+    vllm_config = Mock()
+    vllm_config.model_config = model_config
+    vllm_config.cache_config = cache_config
+    vllm_config.quant_config = None
+    return vllm_config
+
+
+@pytest.fixture
+def mock_distributed():
+    tp_group = Mock(spec=GroupCoordinator)
+    tp_group.rank_in_group = 0
+    tp_group.world_size = 1
+    tp_group.device_group = Mock()
+
+    dp_group = Mock(spec=GroupCoordinator)
+    dp_group.rank_in_group = 0
+    dp_group.world_size = 1
+
+    ep_group = Mock(spec=GroupCoordinator)
+    ep_group.rank_in_group = 0
+    ep_group.world_size = 1
+
+    pp_group = Mock(spec=GroupCoordinator)
+    pp_group.rank_in_group = 0
+    pp_group.world_size = 1
+
+    mock_vllm_config = Mock()
+    mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
+    mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)
+
+    with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
+            patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
+            patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
+            patch("vllm_ascend.models.deepseek_v2.get_ep_group", return_value=ep_group), \
+            patch("vllm_ascend.models.deepseek_v2.get_dp_group", return_value=dp_group), \
+            patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \
+            patch("vllm_ascend.models.deepseek_v2.get_pp_group",
+                  return_value=Mock(is_first_rank=False, is_last_rank=False)), \
+            patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
+            patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
+                       _PP=pp_group):
+        yield
+
+
+def test_custom_deepseek_v2_silu_and_mul():
+    torch.set_default_device("cpu")
+
+    silu = CustomDeepseekV2SiluAndMul()
+    assert silu.weight_scale is None
+
+    x = torch.randn(2, 4)
+    output = silu.forward_oot(x)
+    assert output.shape == (2, 2)
+
+    weight_scale = Mock(return_value=torch.tensor(0.1))
+    silu = CustomDeepseekV2SiluAndMul(weight_scale=weight_scale)
+    quant_x = torch.randint(-128, 127, (2, 4), dtype=torch.int32)
+    dynamic_scale = torch.randn(2, 1)
+    with patch("torch_npu.npu_dequant_swiglu_quant",
+               return_value=torch.randn(2, 4)):
+        output = silu.forward_oot((quant_x, dynamic_scale))
+        assert output.shape == (2, 4)
+
+
+def test_custom_deepseek_v2_merged_replicated_linear(mock_distributed):
+    linear = CustomDeepseekV2MergedReplicatedLinear(input_size=128,
+                                                    output_sizes=[64, 64],
+                                                    bias=False,
+                                                    quant_config=None)
+    assert linear.output_sizes == [64, 64]
+
+    param = Mock()
+    param.data = torch.zeros(128, 128)
+    param.output_dim = 1
+    param.is_gguf_weight = False
+    param.is_gguf_weight_type = False
+    loaded_weight = torch.randn(128, 64)
+    linear.weight_loader(param, loaded_weight, loaded_shard_id=0)
+
+    with pytest.raises(AssertionError):
+        linear.weight_loader(param, torch.randn(128, 32), loaded_shard_id=0)
+
+
+@pytest.mark.parametrize("cls", [
+    CustomDeepseekV2RowParallelLinearReplaceAllreduce,
+    CustomDeepseekV2RowParallelLinear
+])
+def test_row_parallel_linear(cls, mock_distributed):
+    linear = cls(input_size=128, output_size=64, bias=False, quant_config=None)
+    linear.quant_method = Mock()
+    linear.quant_method.apply.return_value = torch.randn(2, 4, 64)
+
+    input_ = torch.randn(2, 4, 128)
+    with patch("vllm_ascend.models.deepseek_v2.split_tensor_along_last_dim",
+               return_value=[torch.randn(2, 4, 64)]):
+        linear.input_is_parallel = False
+        output = linear(input_, is_prefill=True)
+        assert output[0].shape == (2, 4, 64)
+
+    linear.input_is_parallel = True
+    output = linear(input_, is_prefill=False)
+    assert output[0].shape == (2, 4, 64)
+
+
+def test_custom_deepseek_v2_mlp(mock_distributed, base_config):
+    mlp = CustomDeepseekV2MLP(hidden_size=128,
+                              intermediate_size=256,
+                              hidden_act="silu",
+                              quant_config=None)
+    assert isinstance(mlp.act_fn, CustomDeepseekV2SiluAndMul)
+
+    x = torch.randn(2, 4, 128)
+    output = mlp(x)
+    assert output.shape == (2, 4, 128)
+
+    with patch("vllm_ascend.models.deepseek_v2.QuantizationConfig"
+               ) as mock_quant_config:
+        mock_quant_config.name = "w8a8dynamic"
+        with pytest.raises(NotImplementedError):
+            CustomDeepseekV2MLP(hidden_size=128,
+                                intermediate_size=256,
+                                hidden_act="silu",
+                                quant_config=mock_quant_config,
+                                force_replicate=False)
+
+    with pytest.raises(ValueError):
+        CustomDeepseekV2MLP(hidden_size=128,
+                            intermediate_size=256,
+                            hidden_act="relu",
+                            quant_config=None)
+
+
+def test_custom_deepseek_v2_moe(mock_distributed, base_config):
+    base_config.n_shared_experts = 1
+    moe = CustomDeepseekV2MoE(config=base_config,
+                              quant_config=None,
+                              prefix="mlp")
+    assert moe.top_k == 2
+
+    x = torch.randn(2, 4, 128)
+    attn_metadata = Mock(num_prefills=1)
+    with patch("vllm_ascend.ops.fused_moe.AscendFusedMoE.__call__",
+               return_value=(torch.randn(2, 4, 128), torch.randn(2, 4, 128))):
+        output = moe(x, attn_metadata)
+        assert output.shape == (2, 4, 128)
+
+
+@patch("torch_npu.npu_rms_norm")
+def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed,
+                                          base_config):
+    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
+
+    attn = CustomDeepseekV2MLAAttention(config=base_config,
+                                        hidden_size=128,
+                                        num_heads=8,
+                                        qk_nope_head_dim=16,
+                                        qk_rope_head_dim=16,
+                                        v_head_dim=32,
+                                        q_lora_rank=16,
+                                        kv_lora_rank=16,
+                                        cache_config=CacheConfig(),
+                                        quant_config=None,
+                                        prefix="layers.0.self_attn")
+    assert attn.debug_layer_idx == 0
+
+    x = torch.randn(2, 4, 128)
+    positions = torch.arange(4).repeat(2, 1)
+    with patch.object(attn.mla_attn,
+                      "__call__",
+                      return_value=torch.randn(2, 4, 128)):
+        with pytest.raises(AssertionError):
+            attn(positions, x)
+
+    attn = CustomDeepseekV2MLAAttention(config=base_config,
+                                        hidden_size=128,
+                                        num_heads=8,
+                                        qk_nope_head_dim=16,
+                                        qk_rope_head_dim=16,
+                                        v_head_dim=32,
+                                        q_lora_rank=None,
+                                        kv_lora_rank=16,
+                                        prefix="layers.1.self_attn")
+    assert hasattr(attn, "q_proj")
+
+
+@patch("torch_npu.npu_add_rms_norm")
+@patch("torch_npu.npu_rms_norm")
+def test_custom_deepseek_v2_decoder_layer(mock_rms_norm, mock_add_norm,
+                                          mock_distributed, base_config,
+                                          vllm_config):
+    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
+    mock_add_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128),
+                                  torch.randn(2, 128))
+    base_config.n_routed_experts = 4
+    layer = CustomDeepseekV2DecoderLayer(config=base_config,
+                                         prefix="layers.0",
+                                         model_config=vllm_config.model_config,
+                                         cache_config=CacheConfig(),
+                                         quant_config=None)
+    assert isinstance(layer.mlp, CustomDeepseekV2MoE)
+
+    x = torch.randn(2, 4, 128)
+    positions = torch.arange(4).repeat(2, 1)
+
+    with patch.object(layer.self_attn, "forward", Mock(return_value=torch.randn(2, 4, 128))), \
+            patch.object(layer.mlp, "forward", Mock(return_value=torch.randn(2, 4, 128))):
+        hidden_states, residual = layer(positions, x, None)
+        assert hidden_states.shape == (2, 4, 128)
+
+    base_config.n_routed_experts = None
+    layer = CustomDeepseekV2DecoderLayer(config=base_config,
+                                         prefix="layers.0",
+                                         model_config=vllm_config.model_config,
+                                         quant_config=None)
+    assert isinstance(layer.mlp, CustomDeepseekV2MLP)
+
+
+def test_custom_deepseek_v2_for_causal_lm(mock_distributed, vllm_config):
+    model = CustomDeepseekV2ForCausalLM(vllm_config=vllm_config)
+
+    input_ids = torch.randint(0, 10000, (2, 4))
+    positions = torch.arange(4).repeat(2, 1)
+    with patch.object(model.model,
+                      "forward",
+                      return_value=torch.randn(2, 4, 128)):
+        output = model(input_ids, positions)
+        assert output.shape == (2, 4, 128)
+
+    weights = [("model.embed_tokens.weight", torch.randn(10000, 128))]
+    with patch(
+            "vllm.model_executor.model_loader.weight_utils.default_weight_loader"
+    ):
+        loaded = model.load_weights(weights)
+        assert loaded is not None
diff --git a/tests/ut/ops/test_fused_ops.py b/tests/ut/ops/test_fused_ops.py
index 366dd75..eb265b9 100644
--- a/tests/ut/ops/test_fused_ops.py
+++ b/tests/ut/ops/test_fused_ops.py
@@ -188,7 +188,6 @@ class TestAscendFusedMoe:
         assert layer.top_k == default_moe_config['top_k']
         assert hasattr(layer, 'w13_weight')
         assert hasattr(layer, 'w2_weight')
-        assert layer.moe_instance_id == 0
 
         # check group_topk
         with pytest.raises(AssertionError):