diff --git a/tests/ut/models/test_deepseek_v2.py b/tests/ut/models/test_deepseek_v2.py
new file mode 100644
index 0000000..f3a7d1a
--- /dev/null
+++ b/tests/ut/models/test_deepseek_v2.py
@@ -0,0 +1,309 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+from types import SimpleNamespace
+from unittest.mock import Mock, patch
+
+import pytest
+import torch
+from transformers import PretrainedConfig
+from vllm.config import CacheConfig
+from vllm.distributed.parallel_state import GroupCoordinator
+
+from vllm_ascend.models.deepseek_v2 import (
+    CustomDeepseekV2DecoderLayer, CustomDeepseekV2ForCausalLM,
+    CustomDeepseekV2MergedReplicatedLinear, CustomDeepseekV2MLAAttention,
+    CustomDeepseekV2MLP, CustomDeepseekV2MoE,
+    CustomDeepseekV2RowParallelLinear,
+    CustomDeepseekV2RowParallelLinearReplaceAllreduce,
+    CustomDeepseekV2SiluAndMul)
+
+
+@pytest.fixture
+def base_config():
+    config = PretrainedConfig(
+        hidden_size=128,
+        num_attention_heads=8,
+        num_hidden_layers=2,
+        intermediate_size=256,
+        hidden_act="silu",
+        rms_norm_eps=1e-6,
+        rope_theta=10000.0,
+        max_position_embeddings=2048,
+        n_routed_experts=4,
+        n_shared_experts=1,
+        moe_intermediate_size=256,
+        num_experts_per_tok=2,
+        routed_scaling_factor=1.0,
+        first_k_dense_replace=0,
+        moe_layer_freq=1,
+        kv_lora_rank=16,
+        qk_nope_head_dim=16,
+        qk_rope_head_dim=16,
+        v_head_dim=32,
+        topk_method="noaux_tc",
+        scoring_func="softmax",
+        norm_topk_prob=True,
+        n_group=1,
+        topk_group=1,
+        vocab_size=10000,
+    )
+    return config
+
+
+@pytest.fixture
+def vllm_config(base_config):
+    model_config = SimpleNamespace(
+        hf_config=base_config,
+        tensor_parallel_size=1,
+        dtype=torch.float32,
+        use_mla=False,
+        quant_config=None,
+        max_model_len=2048,
+    )
+
+    cache_config = CacheConfig()
+    vllm_config = Mock()
+    vllm_config.model_config = model_config
+    vllm_config.cache_config = cache_config
+    vllm_config.quant_config = None
+    return vllm_config
+
+
+@pytest.fixture
+def mock_distributed():
+    tp_group = Mock(spec=GroupCoordinator)
+    tp_group.rank_in_group = 0
+    tp_group.world_size = 1
+    tp_group.device_group = Mock()
+
+    dp_group = Mock(spec=GroupCoordinator)
+    dp_group.rank_in_group = 0
+    dp_group.world_size = 1
+
+    ep_group = Mock(spec=GroupCoordinator)
+    ep_group.rank_in_group = 0
+    ep_group.world_size = 1
+
+    pp_group = Mock(spec=GroupCoordinator)
+    pp_group.rank_in_group = 0
+    pp_group.world_size = 1
+
+    mock_vllm_config = Mock()
+    mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
+    mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)
+
+    with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
+            patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
+            patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
+            patch("vllm_ascend.models.deepseek_v2.get_ep_group", return_value=ep_group), \
+            patch("vllm_ascend.models.deepseek_v2.get_dp_group", return_value=dp_group), \
+            patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \
+            patch("vllm_ascend.models.deepseek_v2.get_pp_group",
+                  return_value=Mock(is_first_rank=False, is_last_rank=False)), \
+            patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
+            patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
+                       _PP=pp_group):
+        yield
+
+
+def test_custom_deepseek_v2_silu_and_mul():
+    torch.set_default_device("cpu")
+
+    silu = CustomDeepseekV2SiluAndMul()
+    assert silu.weight_scale is None
+
+    x = torch.randn(2, 4)
+    output = silu.forward_oot(x)
+    assert output.shape == (2, 2)
+
+    weight_scale = Mock(return_value=torch.tensor(0.1))
+    silu = CustomDeepseekV2SiluAndMul(weight_scale=weight_scale)
+    quant_x = torch.randint(-128, 127, (2, 4), dtype=torch.int32)
+    dynamic_scale = torch.randn(2, 1)
+    with patch("torch_npu.npu_dequant_swiglu_quant",
+               return_value=torch.randn(2, 4)):
+        output = silu.forward_oot((quant_x, dynamic_scale))
+        assert output.shape == (2, 4)
+
+
+def test_custom_deepseek_v2_merged_replicated_linear(mock_distributed):
+    linear = CustomDeepseekV2MergedReplicatedLinear(input_size=128,
+                                                    output_sizes=[64, 64],
+                                                    bias=False,
+                                                    quant_config=None)
+    assert linear.output_sizes == [64, 64]
+
+    param = Mock()
+    param.data = torch.zeros(128, 128)
+    param.output_dim = 1
+    param.is_gguf_weight = False
+    param.is_gguf_weight_type = False
+    loaded_weight = torch.randn(128, 64)
+    linear.weight_loader(param, loaded_weight, loaded_shard_id=0)
+
+    with pytest.raises(AssertionError):
+        linear.weight_loader(param, torch.randn(128, 32), loaded_shard_id=0)
+
+
+@pytest.mark.parametrize("cls", [
+    CustomDeepseekV2RowParallelLinearReplaceAllreduce,
+    CustomDeepseekV2RowParallelLinear
+])
+def test_row_parallel_linear(cls, mock_distributed):
+    linear = cls(input_size=128, output_size=64, bias=False, quant_config=None)
+    linear.quant_method = Mock()
+    linear.quant_method.apply.return_value = torch.randn(2, 4, 64)
+
+    input_ = torch.randn(2, 4, 128)
+    with patch("vllm_ascend.models.deepseek_v2.split_tensor_along_last_dim",
+               return_value=[torch.randn(2, 4, 64)]):
+        linear.input_is_parallel = False
+        output = linear(input_, is_prefill=True)
+        assert output[0].shape == (2, 4, 64)
+
+    linear.input_is_parallel = True
+    output = linear(input_, is_prefill=False)
+    assert output[0].shape == (2, 4, 64)
+
+
+def test_custom_deepseek_v2_mlp(mock_distributed, base_config):
+    mlp = CustomDeepseekV2MLP(hidden_size=128,
+                              intermediate_size=256,
+                              hidden_act="silu",
+                              quant_config=None)
+    assert isinstance(mlp.act_fn, CustomDeepseekV2SiluAndMul)
+
+    x = torch.randn(2, 4, 128)
+    output = mlp(x)
+    assert output.shape == (2, 4, 128)
+
+    with patch("vllm_ascend.models.deepseek_v2.QuantizationConfig"
+               ) as mock_quant_config:
+        mock_quant_config.name = "w8a8dynamic"
+        with pytest.raises(NotImplementedError):
+            CustomDeepseekV2MLP(hidden_size=128,
+                                intermediate_size=256,
+                                hidden_act="silu",
+                                quant_config=mock_quant_config,
+                                force_replicate=False)
+
+    with pytest.raises(ValueError):
+        CustomDeepseekV2MLP(hidden_size=128,
+                            intermediate_size=256,
+                            hidden_act="relu",
+                            quant_config=None)
+
+
+def test_custom_deepseek_v2_moe(mock_distributed, base_config):
+    base_config.n_shared_experts = 1
+    moe = CustomDeepseekV2MoE(config=base_config,
+                              quant_config=None,
+                              prefix="mlp")
+    assert moe.top_k == 2
+
+    x = torch.randn(2, 4, 128)
+    attn_metadata = Mock(num_prefills=1)
+    with patch("vllm_ascend.ops.fused_moe.AscendFusedMoE.__call__",
+               return_value=(torch.randn(2, 4, 128), torch.randn(2, 4, 128))):
+        output = moe(x, attn_metadata)
+        assert output.shape == (2, 4, 128)
+
+
+@patch("torch_npu.npu_rms_norm")
+def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed,
+                                          base_config):
+    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
+
+    attn = CustomDeepseekV2MLAAttention(config=base_config,
+                                        hidden_size=128,
+                                        num_heads=8,
+                                        qk_nope_head_dim=16,
+                                        qk_rope_head_dim=16,
+                                        v_head_dim=32,
+                                        q_lora_rank=16,
+                                        kv_lora_rank=16,
+                                        cache_config=CacheConfig(),
+                                        quant_config=None,
+                                        prefix="layers.0.self_attn")
+    assert attn.debug_layer_idx == 0
+
+    x = torch.randn(2, 4, 128)
+    positions = torch.arange(4).repeat(2, 1)
+    with patch.object(attn.mla_attn,
+                      "__call__",
+                      return_value=torch.randn(2, 4, 128)):
+        with pytest.raises(AssertionError):
+            attn(positions, x)
+
+    attn = CustomDeepseekV2MLAAttention(config=base_config,
+                                        hidden_size=128,
+                                        num_heads=8,
+                                        qk_nope_head_dim=16,
+                                        qk_rope_head_dim=16,
+                                        v_head_dim=32,
+                                        q_lora_rank=None,
+                                        kv_lora_rank=16,
+                                        prefix="layers.1.self_attn")
+    assert hasattr(attn, "q_proj")
+
+
+@patch("torch_npu.npu_add_rms_norm")
+@patch("torch_npu.npu_rms_norm")
+def test_custom_deepseek_v2_decoder_layer(mock_rms_norm, mock_add_norm,
+                                          mock_distributed, base_config,
+                                          vllm_config):
+    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
+    mock_add_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128),
+                                  torch.randn(2, 128))
+    base_config.n_routed_experts = 4
+    layer = CustomDeepseekV2DecoderLayer(config=base_config,
+                                         prefix="layers.0",
+                                         model_config=vllm_config.model_config,
+                                         cache_config=CacheConfig(),
+                                         quant_config=None)
+    assert isinstance(layer.mlp, CustomDeepseekV2MoE)
+
+    x = torch.randn(2, 4, 128)
+    positions = torch.arange(4).repeat(2, 1)
+
+    with patch.object(layer.self_attn, "forward", Mock(return_value=torch.randn(2, 4, 128))), \
+            patch.object(layer.mlp, "forward", Mock(return_value=torch.randn(2, 4, 128))):
+        hidden_states, residual = layer(positions, x, None)
+        assert hidden_states.shape == (2, 4, 128)
+
+    base_config.n_routed_experts = None
+    layer = CustomDeepseekV2DecoderLayer(config=base_config,
+                                         prefix="layers.0",
+                                         model_config=vllm_config.model_config,
+                                         quant_config=None)
+    assert isinstance(layer.mlp, CustomDeepseekV2MLP)
+
+
+def test_custom_deepseek_v2_for_causal_lm(mock_distributed, vllm_config):
+    model = CustomDeepseekV2ForCausalLM(vllm_config=vllm_config)
+
+    input_ids = torch.randint(0, 10000, (2, 4))
+    positions = torch.arange(4).repeat(2, 1)
+    with patch.object(model.model,
+                      "forward",
+                      return_value=torch.randn(2, 4, 128)):
+        output = model(input_ids, positions)
+        assert output.shape == (2, 4, 128)
+
+    weights = [("model.embed_tokens.weight", torch.randn(10000, 128))]
+    with patch(
+            "vllm.model_executor.model_loader.weight_utils.default_weight_loader"
+    ):
+        loaded = model.load_weights(weights)
+        assert loaded is not None
diff --git a/tests/ut/ops/test_fused_ops.py b/tests/ut/ops/test_fused_ops.py
index 366dd75..eb265b9 100644
--- a/tests/ut/ops/test_fused_ops.py
+++ b/tests/ut/ops/test_fused_ops.py
@@ -188,7 +188,6 @@ class TestAscendFusedMoe:
         assert layer.top_k == default_moe_config['top_k']
         assert hasattr(layer, 'w13_weight')
         assert hasattr(layer, 'w2_weight')
-        assert layer.moe_instance_id == 0
 
         # check group_topk
         with pytest.raises(AssertionError):