[Model][1/N] Delete deepseek v2/v3 modeling codes. (#3189)
This PR deletes the deepseek_v2 and deepseek_v3 model code so that the model files from vLLM are reused. vLLM Ascend now customizes behavior by registering custom ops instead of hard-coding it in its own model files. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
@@ -302,6 +302,7 @@ class TestAscendMLAImpl(TestBase):
|
||||
"v_head_dim": 128,
|
||||
"rotary_emb": MagicMock(),
|
||||
"q_proj": MagicMock(),
|
||||
"q_b_proj": MagicMock(),
|
||||
"kv_b_proj": MagicMock(),
|
||||
"o_proj": MagicMock(),
|
||||
"kv_a_proj_with_mqa": MagicMock(),
|
||||
|
||||
@@ -90,13 +90,7 @@ def mock_distributed():
|
||||
mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
|
||||
mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)
|
||||
|
||||
with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_pp_group",
|
||||
return_value=Mock(is_first_rank=False, is_last_rank=False)), \
|
||||
patch("vllm_ascend.ops.common_fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
|
||||
with patch("vllm_ascend.ops.common_fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
|
||||
patch("vllm_ascend.ops.moe.token_dispatcher.torch.distributed.get_rank", return_value=0), \
|
||||
patch("vllm_ascend.ops.moe.token_dispatcher.get_ascend_soc_version", return_value=None), \
|
||||
patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
|
||||
@@ -104,11 +98,3 @@ def mock_distributed():
|
||||
patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \
|
||||
patch("torch.npu.current_device", return_value=0):
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture
def mock_forward_context():
    """Patch ``get_forward_context`` in ``vllm_ascend.models.deepseek_v2``.

    While the fixture is active, the patched function returns a ``Mock``
    whose ``in_profile_run`` and ``with_prefill`` attributes are both
    ``False``. The patch is undone when the fixture is torn down.
    """
    stub_context = Mock(in_profile_run=False, with_prefill=False)
    context_patch = patch(
        "vllm_ascend.models.deepseek_v2.get_forward_context",
        return_value=stub_context)
    with context_patch:
        yield
|
||||
|
||||
@@ -37,8 +37,6 @@ class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase):
|
||||
mocker.patch(
|
||||
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch("vllm_ascend.models.deepseek_v2.get_ascend_config",
|
||||
return_value=mocker.Mock())
|
||||
|
||||
mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "0", None)
|
||||
mocker_deepseek_v2_decode_layer.assert_called_once()
|
||||
|
||||
@@ -1,130 +0,0 @@
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
from unittest.mock import MagicMock, Mock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.config import CacheConfig
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
|
||||
from vllm_ascend import ascend_config
|
||||
from vllm_ascend.models.deepseek_v2 import (CustomDeepseekV2MLAAttention,
|
||||
CustomDeepseekV2RowParallelLinear)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cls", [CustomDeepseekV2RowParallelLinear])
def test_row_parallel_linear(cls, mock_distributed):
    """Call the row-parallel linear layer in both input modes.

    The layer's ``quant_method`` is replaced by a ``Mock`` and
    ``split_tensor_along_last_dim`` is patched, then the layer is called
    once with ``input_is_parallel=False`` / ``is_prefill=True`` and once with
    ``input_is_parallel=True`` / ``is_prefill=False``. Each call must return
    something whose first element has shape ``(2, 4, 64)``.
    """
    layer = cls(input_size=128, output_size=64, bias=False, quant_config=None)
    layer.quant_method = Mock()
    layer.quant_method.apply.return_value = torch.randn(2, 4, 64)
    hidden = torch.randn(2, 4, 128)
    expected_shape = (2, 4, 64)

    split_patch = patch(
        "vllm_ascend.models.deepseek_v2.split_tensor_along_last_dim",
        return_value=[torch.randn(2, 4, 64)])
    with split_patch:
        # (input_is_parallel, is_prefill) pairs, in the original call order.
        for parallel_input, prefill in ((False, True), (True, False)):
            layer.input_is_parallel = parallel_input
            result = layer(hidden, is_prefill=prefill)
            assert result[0].shape == expected_shape
|
||||
|
||||
|
||||
@patch("vllm_ascend.models.layers.mla.get_forward_context")
@patch("torch.ops.vllm.mla_forward")
@patch("torch_npu.npu_rms_norm")
def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_mla_forward,
                                          mock_forward_context,
                                          mock_distributed, base_config):
    """Build ``CustomDeepseekV2MLAAttention`` with and without ``q_lora_rank``.

    With ``q_lora_rank=16`` the layer is constructed, its ``debug_layer_idx``
    is checked against the ``layers.0`` prefix, and one forward call must
    reach the patched ``torch.ops.vllm.mla_forward`` exactly once. With
    ``q_lora_rank=None`` the layer must expose a ``q_proj`` attribute.

    The global ascend config is initialised from a ``MagicMock`` for the
    duration of the test and is always reset afterwards, even on failure, so
    a failing assertion cannot leak the fake config into other tests.
    """
    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
    # Make a fake ascend config because of the AscendLinearBase
    vllm_config = MagicMock()
    vllm_config.additional_config = None
    vllm_config.parallel_config.enable_expert_parallel = False
    vllm_config.parallel_config.tensor_parallel_size = 1
    vllm_config.kv_transfer_config = None
    ascend_config.init_ascend_config(vllm_config)
    try:
        dummy_forward_context = MagicMock()
        dummy_forward_context.sp_enabled = False
        mock_forward_context.return_value = dummy_forward_context

        attn = CustomDeepseekV2MLAAttention(config=base_config,
                                            hidden_size=128,
                                            num_heads=8,
                                            qk_nope_head_dim=16,
                                            qk_rope_head_dim=16,
                                            v_head_dim=32,
                                            q_lora_rank=16,
                                            kv_lora_rank=16,
                                            cache_config=CacheConfig(),
                                            quant_config=None,
                                            prefix="layers.0.self_attn")
        assert attn.debug_layer_idx == 0

        x = torch.randn(2, 4, 128)
        positions = torch.arange(4).repeat(2, 1)
        # NOTE(review): Python resolves ``__call__`` on the type, so patching
        # it on the instance probably does not intercept ``attn.mla_attn(...)``.
        # Kept as in the original; confirm whether it is needed at all.
        with patch.object(attn.mla_attn,
                          "__call__",
                          return_value=torch.randn(2, 4, 128)):
            attn(positions, x)
            mock_mla_forward.assert_called_once()

        attn = CustomDeepseekV2MLAAttention(config=base_config,
                                            hidden_size=128,
                                            num_heads=8,
                                            qk_nope_head_dim=16,
                                            qk_rope_head_dim=16,
                                            v_head_dim=32,
                                            q_lora_rank=None,
                                            kv_lora_rank=16,
                                            prefix="layers.1.self_attn")
        assert hasattr(attn, "q_proj")
    finally:
        # Always drop the fake global config so later tests start clean.
        ascend_config._ASCEND_CONFIG = None
|
||||
|
||||
|
||||
def test_deepseek_v2_lmhead(mock_distributed, vllm_config):
    """Run ``LogitsProcessor`` over a ``ParallelLMHead`` with mocked internals.

    ``lmhead.quant_method.apply`` and ``logits_processor._gather_logits`` are
    both patched to return a fixed tensor, and the processor output must have
    shape ``(2, 4, vocab_size)``.

    The global ascend config is initialised from a ``MagicMock`` for the
    duration of the test and is always reset afterwards, even on failure.
    """

    # Create a simple config object
    class SimpleConfig:

        def __init__(self):
            self.vocab_size = 10000
            self.hidden_size = 128

    config = SimpleConfig()

    # Make a fake ascend config because of the AscendLinearBase.
    # A separate local name is used so the ``vllm_config`` fixture argument
    # is not silently rebound.
    fake_vllm_config = MagicMock()
    fake_vllm_config.additional_config = None
    fake_vllm_config.parallel_config.enable_expert_parallel = False
    fake_vllm_config.parallel_config.tensor_parallel_size = 1
    fake_vllm_config.kv_transfer_config = None
    ascend_config.init_ascend_config(fake_vllm_config)
    try:
        # Create the lmhead and logits_processor directly
        lmhead = ParallelLMHead(config.vocab_size, config.hidden_size)
        logits_processor = LogitsProcessor(config.vocab_size)

        # Create mock outputs
        mock_output = torch.randn(2, 4, config.hidden_size)
        mock_logits = torch.randn(2, 4, config.vocab_size)

        # Test the logits_processor directly
        with patch.object(lmhead.quant_method, "apply",
                          return_value=mock_logits), \
             patch.object(logits_processor, "_gather_logits",
                          return_value=mock_logits):
            logits = logits_processor(lmhead, mock_output)
        assert logits.shape == (2, 4, config.vocab_size)
    finally:
        # Always drop the fake global config so later tests start clean.
        ascend_config._ASCEND_CONFIG = None
|
||||
@@ -525,6 +525,7 @@ class TestAscendMLATorchairImpl(TestBase):
|
||||
"v_head_dim": 128,
|
||||
"rotary_emb": MagicMock(),
|
||||
"q_proj": MagicMock(),
|
||||
"q_b_proj": MagicMock(),
|
||||
"kv_b_proj": MagicMock(),
|
||||
"o_proj": MagicMock(),
|
||||
"kv_a_proj_with_mqa": MagicMock(),
|
||||
|
||||
Reference in New Issue
Block a user