### What this PR does / why we need it?
A refactoring of forward_context and model_runner_v1, add some context
which is necessary in model inference into forward_context, and refactor
dummy_run logic, make it more reasonable.
Some details for this PR:
Add `ascend_forward_context`;
Update mc2_v2 op, and support `active_mask` param;
Update scripts in examples dir;
Refactor `dummy_run` logic;
Add soc_version for A2 and A3;
### Does this PR introduce _any_ user-facing change?
No user-facing changes.
### How was this patch tested?
- vLLM version: v0.10.0
- vLLM main:
57c22e57f9
Signed-off-by: zzzzwwjj <1183291235@qq.com>
320 lines
12 KiB
Python
320 lines
12 KiB
Python
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# This file is a part of the vllm-ascend project.
|
|
#
|
|
from types import SimpleNamespace
|
|
from unittest.mock import Mock, patch
|
|
|
|
import pytest
|
|
import torch
|
|
from transformers import PretrainedConfig
|
|
from vllm.config import CacheConfig
|
|
from vllm.distributed.parallel_state import GroupCoordinator
|
|
|
|
from vllm_ascend.models.deepseek_v2 import (
|
|
CustomDeepseekV2DecoderLayer, CustomDeepseekV2ForCausalLM,
|
|
CustomDeepseekV2MergedReplicatedLinear, CustomDeepseekV2MLAAttention,
|
|
CustomDeepseekV2MLP, CustomDeepseekV2MoE,
|
|
CustomDeepseekV2RowParallelLinear,
|
|
CustomDeepseekV2RowParallelLinearReplaceAllreduce,
|
|
CustomDeepseekV2SiluAndMul)
|
|
|
|
|
|
@pytest.fixture
|
|
def base_config():
|
|
config = PretrainedConfig(
|
|
hidden_size=128,
|
|
num_attention_heads=8,
|
|
num_hidden_layers=2,
|
|
intermediate_size=256,
|
|
hidden_act="silu",
|
|
rms_norm_eps=1e-6,
|
|
rope_theta=10000.0,
|
|
max_position_embeddings=2048,
|
|
n_routed_experts=4,
|
|
n_shared_experts=1,
|
|
moe_intermediate_size=256,
|
|
num_experts_per_tok=2,
|
|
routed_scaling_factor=1.0,
|
|
first_k_dense_replace=0,
|
|
moe_layer_freq=1,
|
|
kv_lora_rank=16,
|
|
qk_nope_head_dim=16,
|
|
qk_rope_head_dim=16,
|
|
v_head_dim=32,
|
|
topk_method="noaux_tc",
|
|
scoring_func="softmax",
|
|
norm_topk_prob=True,
|
|
n_group=1,
|
|
topk_group=1,
|
|
vocab_size=10000,
|
|
)
|
|
return config
|
|
|
|
|
|
@pytest.fixture
def vllm_config(base_config):
    """Mocked vLLM config object wrapping *base_config* as the HF config."""
    mocked = Mock()
    # Only the attributes the model code actually reads are provided.
    mocked.model_config = SimpleNamespace(
        hf_config=base_config,
        tensor_parallel_size=1,
        dtype=torch.float32,
        use_mla=False,
        quant_config=None,
        max_model_len=2048,
    )
    mocked.cache_config = CacheConfig()
    mocked.quant_config = None
    return mocked
|
|
|
|
|
|
@pytest.fixture
def mock_distributed():
    """Patch vLLM / vllm-ascend distributed state with single-rank mocks.

    Every parallel group (TP / DP / EP / PP / MC2) is replaced by a
    rank-0, world-size-1 ``GroupCoordinator`` mock so that model layers
    can be built without initializing ``torch.distributed``.
    """

    def _make_group():
        # Single-rank group: collectives become no-op Mock calls.
        group = Mock(spec=GroupCoordinator)
        group.rank_in_group = 0
        group.world_size = 1
        return group

    tp_group = _make_group()
    tp_group.device_group = Mock()
    dp_group = _make_group()
    ep_group = _make_group()
    pp_group = _make_group()

    mock_vllm_config = Mock()
    mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
    mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)

    # NOTE: the original fixture patched ``get_pp_group`` twice; only the
    # innermost patch (is_first_rank/is_last_rank = False) took effect, so
    # the redundant first patch has been removed.
    with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
            patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
            patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
            patch("vllm_ascend.models.deepseek_v2.get_ep_group", return_value=ep_group), \
            patch("vllm_ascend.models.deepseek_v2.get_dp_group", return_value=dp_group), \
            patch("vllm_ascend.models.deepseek_v2.get_pp_group",
                  return_value=Mock(is_first_rank=False, is_last_rank=False)), \
            patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
            patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
                       _PP=pp_group), \
            patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group):
        yield
|
|
|
|
|
|
@pytest.fixture
def mock_forward_context():
    """Replace ``get_forward_context`` with a decode-phase, non-profiling mock."""
    ctx = Mock(in_profile_run=False, with_prefill=False)
    patcher = patch("vllm_ascend.models.deepseek_v2.get_forward_context",
                    return_value=ctx)
    with patcher:
        yield
|
|
|
|
|
|
def test_custom_deepseek_v2_silu_and_mul():
|
|
torch.set_default_device("cpu")
|
|
|
|
silu = CustomDeepseekV2SiluAndMul()
|
|
assert silu.weight_scale is None
|
|
|
|
x = torch.randn(2, 4)
|
|
output = silu.forward_oot(x)
|
|
assert output.shape == (2, 2)
|
|
|
|
weight_scale = Mock(return_value=torch.tensor(0.1))
|
|
silu = CustomDeepseekV2SiluAndMul(weight_scale=weight_scale)
|
|
quant_x = torch.randint(-128, 127, (2, 4), dtype=torch.int32)
|
|
dynamic_scale = torch.randn(2, 1)
|
|
with patch("torch_npu.npu_dequant_swiglu_quant",
|
|
return_value=torch.randn(2, 4)):
|
|
output = silu.forward_oot((quant_x, dynamic_scale))
|
|
assert output.shape == (2, 4)
|
|
|
|
|
|
def test_custom_deepseek_v2_merged_replicated_linear(mock_distributed):
    """Weight loading for the merged replicated linear layer."""
    layer = CustomDeepseekV2MergedReplicatedLinear(input_size=128,
                                                   output_sizes=[64, 64],
                                                   bias=False,
                                                   quant_config=None)
    assert layer.output_sizes == [64, 64]

    # Fake parameter wide enough to hold both merged shards along dim 1.
    param = Mock()
    param.data = torch.zeros(128, 128)
    param.output_dim = 1
    param.is_gguf_weight = False
    param.is_gguf_weight_type = False

    # A correctly sized shard loads without error.
    layer.weight_loader(param, torch.randn(128, 64), loaded_shard_id=0)

    # A shard whose width does not match output_sizes[0] must be rejected.
    with pytest.raises(AssertionError):
        layer.weight_loader(param, torch.randn(128, 32), loaded_shard_id=0)
|
|
|
|
|
|
@pytest.mark.parametrize("cls", [
    CustomDeepseekV2RowParallelLinearReplaceAllreduce,
    CustomDeepseekV2RowParallelLinear
])
def test_row_parallel_linear(cls, mock_distributed):
    """Both row-parallel variants produce correctly shaped outputs."""
    layer = cls(input_size=128, output_size=64, bias=False, quant_config=None)
    # Bypass real quantized matmul; return a fixed-shape activation.
    layer.quant_method = Mock()
    layer.quant_method.apply.return_value = torch.randn(2, 4, 64)

    inputs = torch.randn(2, 4, 128)
    with patch("vllm_ascend.models.deepseek_v2.split_tensor_along_last_dim",
               return_value=[torch.randn(2, 4, 64)]):
        # Non-parallel input is split internally before the matmul.
        layer.input_is_parallel = False
        out = layer(inputs, is_prefill=True)
        assert out[0].shape == (2, 4, 64)

        # Already-parallel input skips the split.
        layer.input_is_parallel = True
        out = layer(inputs, is_prefill=False)
        assert out[0].shape == (2, 4, 64)
|
|
|
|
|
|
def test_custom_deepseek_v2_mlp(mock_distributed, base_config):
    """MLP forward pass plus the unsupported-configuration error paths."""
    mlp = CustomDeepseekV2MLP(hidden_size=128,
                              intermediate_size=256,
                              hidden_act="silu",
                              quant_config=None)
    assert isinstance(mlp.act_fn, CustomDeepseekV2SiluAndMul)

    # Shape is preserved through the MLP.
    hidden = torch.randn(2, 4, 128)
    assert mlp(hidden).shape == (2, 4, 128)

    # A quantized MLP without forced replication is not implemented.
    with patch("vllm_ascend.models.deepseek_v2.QuantizationConfig"
               ) as mock_quant_config:
        mock_quant_config.name = "w8a8dynamic"
        with pytest.raises(NotImplementedError):
            CustomDeepseekV2MLP(hidden_size=128,
                                intermediate_size=256,
                                hidden_act="silu",
                                quant_config=mock_quant_config,
                                force_replicate=False)

    # Only the SiLU activation is supported.
    with pytest.raises(ValueError):
        CustomDeepseekV2MLP(hidden_size=128,
                            intermediate_size=256,
                            hidden_act="relu",
                            quant_config=None)
|
|
|
|
|
|
def test_custom_deepseek_v2_moe(mock_distributed, base_config,
                                mock_forward_context):
    """MoE layer preserves the hidden-state shape with the kernel mocked."""
    base_config.n_shared_experts = 1
    moe = CustomDeepseekV2MoE(config=base_config,
                              quant_config=None,
                              prefix="mlp")
    assert moe.top_k == 2

    hidden = torch.randn(2, 4, 128)
    attn_metadata = Mock(num_prefills=1)
    # The fused-MoE kernel is mocked; only shape plumbing is exercised.
    with patch("vllm_ascend.ops.fused_moe.AscendFusedMoE.__call__",
               return_value=(torch.randn(2, 4, 128), torch.randn(2, 4, 128))):
        out = moe(hidden, attn_metadata)
        assert out.shape == (2, 4, 128)
|
|
|
|
|
|
@patch("torch_npu.npu_rms_norm")
def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed,
                                          base_config):
    """Build MLA attention both with and without a low-rank q projection."""
    # npu_rms_norm returns (normed, residual); only shapes need to be sane.
    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))

    # Variant with q_lora_rank set: uses the low-rank q path.
    attn = CustomDeepseekV2MLAAttention(config=base_config,
                                        hidden_size=128,
                                        num_heads=8,
                                        qk_nope_head_dim=16,
                                        qk_rope_head_dim=16,
                                        v_head_dim=32,
                                        q_lora_rank=16,
                                        kv_lora_rank=16,
                                        cache_config=CacheConfig(),
                                        quant_config=None,
                                        prefix="layers.0.self_attn")
    assert attn.debug_layer_idx == 0

    x = torch.randn(2, 4, 128)
    positions = torch.arange(4).repeat(2, 1)
    # Calling attention outside a proper forward context is expected to
    # trip an internal assertion even with mla_attn mocked out.
    with patch.object(attn.mla_attn,
                      "__call__",
                      return_value=torch.randn(2, 4, 128)):
        with pytest.raises(AssertionError):
            attn(positions, x)

    # Variant without q_lora_rank: a plain q_proj must be created instead.
    attn = CustomDeepseekV2MLAAttention(config=base_config,
                                        hidden_size=128,
                                        num_heads=8,
                                        qk_nope_head_dim=16,
                                        qk_rope_head_dim=16,
                                        v_head_dim=32,
                                        q_lora_rank=None,
                                        kv_lora_rank=16,
                                        prefix="layers.1.self_attn")
    assert hasattr(attn, "q_proj")
|
|
|
|
|
|
@patch("torch_npu.npu_add_rms_norm")
@patch("torch_npu.npu_rms_norm")
def test_custom_deepseek_v2_decoder_layer(mock_rms_norm, mock_add_norm,
                                          mock_distributed, base_config,
                                          vllm_config):
    """Decoder layer picks MoE vs dense MLP based on n_routed_experts."""
    # NPU norm kernels are mocked; return tuples with plausible shapes.
    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
    mock_add_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128),
                                  torch.randn(2, 128))
    # With routed experts configured, the MLP slot holds a MoE layer.
    base_config.n_routed_experts = 4
    layer = CustomDeepseekV2DecoderLayer(config=base_config,
                                         prefix="layers.0",
                                         model_config=vllm_config.model_config,
                                         cache_config=CacheConfig(),
                                         quant_config=None)
    assert isinstance(layer.mlp, CustomDeepseekV2MoE)

    x = torch.randn(2, 4, 128)
    positions = torch.arange(4).repeat(2, 1)

    # Attention and MLP forwards are mocked; only layer plumbing is tested.
    with patch.object(layer.self_attn, "forward", Mock(return_value=torch.randn(2, 4, 128))), \
            patch.object(layer.mlp, "forward", Mock(return_value=torch.randn(2, 4, 128))):
        hidden_states, residual = layer(positions, x, None)
        assert hidden_states.shape == (2, 4, 128)

    # Without routed experts, a dense MLP is instantiated instead.
    base_config.n_routed_experts = None
    layer = CustomDeepseekV2DecoderLayer(config=base_config,
                                         prefix="layers.0",
                                         model_config=vllm_config.model_config,
                                         quant_config=None)
    assert isinstance(layer.mlp, CustomDeepseekV2MLP)
|
|
|
|
|
|
def test_custom_deepseek_v2_for_causal_lm(mock_distributed, vllm_config):
    """Forward pass and weight loading of the top-level causal-LM wrapper."""
    model = CustomDeepseekV2ForCausalLM(vllm_config=vllm_config)

    # Forward: the inner model is mocked; check shape pass-through only.
    input_ids = torch.randint(0, 10000, (2, 4))
    positions = torch.arange(4).repeat(2, 1)
    with patch.object(model.model,
                      "forward",
                      return_value=torch.randn(2, 4, 128)):
        hidden = model(input_ids, positions)
    assert hidden.shape == (2, 4, 128)

    # Weight loading: a single embedding tensor with the loader mocked.
    named_weights = [("model.embed_tokens.weight", torch.randn(10000, 128))]
    with patch(
            "vllm.model_executor.model_loader.weight_utils.default_weight_loader"
    ):
        assert model.load_weights(named_weights) is not None
|