### What this PR does / why we need it?
A refactoring of forward_context and model_runner_v1, add some context
which is necessary in model inference into forward_context, and refactor
dummy_run logic, make it more reasonable.
Some details for this PR:
Add `ascend_forward_context`;
Update mc2_v2 op, and support `active_mask` param;
Update scripts in examples dir;
Refactor `dummy_run` logic;
Add soc_version for A2 and A3;
### Does this PR introduce _any_ user-facing change?
No user-facing changes.
### How was this patch tested?
- vLLM version: v0.10.0
- vLLM main:
57c22e57f9
Signed-off-by: zzzzwwjj <1183291235@qq.com>
320 lines
12 KiB
Python
320 lines
12 KiB
Python
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# This file is a part of the vllm-ascend project.
|
|
#
|
|
from types import SimpleNamespace
|
|
from unittest.mock import Mock, patch
|
|
|
|
import pytest
|
|
import torch
|
|
from transformers import PretrainedConfig
|
|
from vllm.config import CacheConfig
|
|
from vllm.distributed.parallel_state import GroupCoordinator
|
|
|
|
from vllm_ascend.models.deepseek_v2 import (
|
|
CustomDeepseekV2DecoderLayer, CustomDeepseekV2ForCausalLM,
|
|
CustomDeepseekV2MergedReplicatedLinear, CustomDeepseekV2MLAAttention,
|
|
CustomDeepseekV2MLP, CustomDeepseekV2MoE,
|
|
CustomDeepseekV2RowParallelLinear,
|
|
CustomDeepseekV2RowParallelLinearReplaceAllreduce,
|
|
CustomDeepseekV2SiluAndMul)
|
|
|
|
|
|
@pytest.fixture
|
|
def base_config():
|
|
config = PretrainedConfig(
|
|
hidden_size=128,
|
|
num_attention_heads=8,
|
|
num_hidden_layers=2,
|
|
intermediate_size=256,
|
|
hidden_act="silu",
|
|
rms_norm_eps=1e-6,
|
|
rope_theta=10000.0,
|
|
max_position_embeddings=2048,
|
|
n_routed_experts=4,
|
|
n_shared_experts=1,
|
|
moe_intermediate_size=256,
|
|
num_experts_per_tok=2,
|
|
routed_scaling_factor=1.0,
|
|
first_k_dense_replace=0,
|
|
moe_layer_freq=1,
|
|
kv_lora_rank=16,
|
|
qk_nope_head_dim=16,
|
|
qk_rope_head_dim=16,
|
|
v_head_dim=32,
|
|
topk_method="noaux_tc",
|
|
scoring_func="softmax",
|
|
norm_topk_prob=True,
|
|
n_group=1,
|
|
topk_group=1,
|
|
vocab_size=10000,
|
|
)
|
|
return config
|
|
|
|
|
|
@pytest.fixture
def vllm_config(base_config):
    """Mocked vLLM config object wrapping *base_config* as the HF config."""
    mocked = Mock()
    # Only the attributes the model code actually reads are provided.
    mocked.model_config = SimpleNamespace(
        hf_config=base_config,
        tensor_parallel_size=1,
        dtype=torch.float32,
        use_mla=False,
        quant_config=None,
        max_model_len=2048,
    )
    mocked.cache_config = CacheConfig()
    mocked.quant_config = None
    return mocked
|
|
|
|
|
|
@pytest.fixture
def mock_distributed():
    """Patch vLLM / vllm-ascend distributed state with single-rank mocks.

    Every parallel group (TP / DP / EP / PP / MC2) is replaced by a
    rank-0, world-size-1 ``GroupCoordinator`` mock so that model layers
    can be built without initializing ``torch.distributed``.
    """

    def _make_group():
        # Single-rank group: collectives become no-op Mock calls.
        group = Mock(spec=GroupCoordinator)
        group.rank_in_group = 0
        group.world_size = 1
        return group

    tp_group = _make_group()
    tp_group.device_group = Mock()
    dp_group = _make_group()
    ep_group = _make_group()
    pp_group = _make_group()

    mock_vllm_config = Mock()
    mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
    mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)

    # NOTE: the original fixture patched ``get_pp_group`` twice; only the
    # innermost patch (is_first_rank/is_last_rank = False) took effect, so
    # the redundant first patch has been removed.
    with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
            patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
            patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
            patch("vllm_ascend.models.deepseek_v2.get_ep_group", return_value=ep_group), \
            patch("vllm_ascend.models.deepseek_v2.get_dp_group", return_value=dp_group), \
            patch("vllm_ascend.models.deepseek_v2.get_pp_group",
                  return_value=Mock(is_first_rank=False, is_last_rank=False)), \
            patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
            patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
                       _PP=pp_group), \
            patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group):
        yield
|
|
|
|
|
|
@pytest.fixture
def mock_forward_context():
    """Replace ``get_forward_context`` with a decode-phase, non-profiling mock."""
    ctx = Mock(in_profile_run=False, with_prefill=False)
    patcher = patch("vllm_ascend.models.deepseek_v2.get_forward_context",
                    return_value=ctx)
    with patcher:
        yield
|
|
|
|
|
|
def test_custom_deepseek_v2_silu_and_mul():
|
|
torch.set_default_device("cpu")
|
|
|
|
silu = CustomDeepseekV2SiluAndMul()
|
|
assert silu.weight_scale is None
|
|
|
|
x = torch.randn(2, 4)
|
|
output = silu.forward_oot(x)
|
|
assert output.shape == (2, 2)
|
|
|
|
weight_scale = Mock(return_value=torch.tensor(0.1))
|
|
silu = CustomDeepseekV2SiluAndMul(weight_scale=weight_scale)
|
|
quant_x = torch.randint(-128, 127, (2, 4), dtype=torch.int32)
|
|
dynamic_scale = torch.randn(2, 1)
|
|
with patch("torch_npu.npu_dequant_swiglu_quant",
|
|
return_value=torch.randn(2, 4)):
|
|
output = silu.forward_oot((quant_x, dynamic_scale))
|
|
assert output.shape == (2, 4)
|
|
|
|
|
|
def test_custom_deepseek_v2_merged_replicated_linear(mock_distributed):
    """Weight loading for the merged replicated linear layer."""
    layer = CustomDeepseekV2MergedReplicatedLinear(input_size=128,
                                                   output_sizes=[64, 64],
                                                   bias=False,
                                                   quant_config=None)
    assert layer.output_sizes == [64, 64]

    # Fake parameter wide enough to hold both merged shards along dim 1.
    param = Mock()
    param.data = torch.zeros(128, 128)
    param.output_dim = 1
    param.is_gguf_weight = False
    param.is_gguf_weight_type = False

    # A correctly sized shard loads without error.
    layer.weight_loader(param, torch.randn(128, 64), loaded_shard_id=0)

    # A shard whose width does not match output_sizes[0] must be rejected.
    with pytest.raises(AssertionError):
        layer.weight_loader(param, torch.randn(128, 32), loaded_shard_id=0)
|
|
|
|
|
|
@pytest.mark.parametrize("cls", [
    CustomDeepseekV2RowParallelLinearReplaceAllreduce,
    CustomDeepseekV2RowParallelLinear
])
def test_row_parallel_linear(cls, mock_distributed):
    """Both row-parallel variants produce correctly shaped outputs."""
    layer = cls(input_size=128, output_size=64, bias=False, quant_config=None)
    # Bypass real quantized matmul; return a fixed-shape activation.
    layer.quant_method = Mock()
    layer.quant_method.apply.return_value = torch.randn(2, 4, 64)

    inputs = torch.randn(2, 4, 128)
    with patch("vllm_ascend.models.deepseek_v2.split_tensor_along_last_dim",
               return_value=[torch.randn(2, 4, 64)]):
        # Non-parallel input is split internally before the matmul.
        layer.input_is_parallel = False
        out = layer(inputs, is_prefill=True)
        assert out[0].shape == (2, 4, 64)

        # Already-parallel input skips the split.
        layer.input_is_parallel = True
        out = layer(inputs, is_prefill=False)
        assert out[0].shape == (2, 4, 64)
|
|
|
|
|
|
def test_custom_deepseek_v2_mlp(mock_distributed, base_config):
    """MLP forward pass plus the unsupported-configuration error paths."""
    mlp = CustomDeepseekV2MLP(hidden_size=128,
                              intermediate_size=256,
                              hidden_act="silu",
                              quant_config=None)
    assert isinstance(mlp.act_fn, CustomDeepseekV2SiluAndMul)

    # Shape is preserved through the MLP.
    hidden = torch.randn(2, 4, 128)
    assert mlp(hidden).shape == (2, 4, 128)

    # A quantized MLP without forced replication is not implemented.
    with patch("vllm_ascend.models.deepseek_v2.QuantizationConfig"
               ) as mock_quant_config:
        mock_quant_config.name = "w8a8dynamic"
        with pytest.raises(NotImplementedError):
            CustomDeepseekV2MLP(hidden_size=128,
                                intermediate_size=256,
                                hidden_act="silu",
                                quant_config=mock_quant_config,
                                force_replicate=False)

    # Only the SiLU activation is supported.
    with pytest.raises(ValueError):
        CustomDeepseekV2MLP(hidden_size=128,
                            intermediate_size=256,
                            hidden_act="relu",
                            quant_config=None)
|
|
|
|
|
|
def test_custom_deepseek_v2_moe(mock_distributed, base_config,
                                mock_forward_context):
    """MoE layer preserves the hidden-state shape with the kernel mocked."""
    base_config.n_shared_experts = 1
    moe = CustomDeepseekV2MoE(config=base_config,
                              quant_config=None,
                              prefix="mlp")
    assert moe.top_k == 2

    hidden = torch.randn(2, 4, 128)
    attn_metadata = Mock(num_prefills=1)
    # The fused-MoE kernel is mocked; only shape plumbing is exercised.
    with patch("vllm_ascend.ops.fused_moe.AscendFusedMoE.__call__",
               return_value=(torch.randn(2, 4, 128), torch.randn(2, 4, 128))):
        out = moe(hidden, attn_metadata)
        assert out.shape == (2, 4, 128)
|
|
|
|
|
|
@patch("torch_npu.npu_rms_norm")
def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed,
                                          base_config):
    """Build MLA attention both with and without a low-rank q projection."""
    # npu_rms_norm returns (normed, residual); only shapes need to be sane.
    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))

    # Variant with q_lora_rank set: uses the low-rank q path.
    attn = CustomDeepseekV2MLAAttention(config=base_config,
                                        hidden_size=128,
                                        num_heads=8,
                                        qk_nope_head_dim=16,
                                        qk_rope_head_dim=16,
                                        v_head_dim=32,
                                        q_lora_rank=16,
                                        kv_lora_rank=16,
                                        cache_config=CacheConfig(),
                                        quant_config=None,
                                        prefix="layers.0.self_attn")
    assert attn.debug_layer_idx == 0

    x = torch.randn(2, 4, 128)
    positions = torch.arange(4).repeat(2, 1)
    # Calling attention outside a proper forward context is expected to
    # trip an internal assertion even with mla_attn mocked out.
    with patch.object(attn.mla_attn,
                      "__call__",
                      return_value=torch.randn(2, 4, 128)):
        with pytest.raises(AssertionError):
            attn(positions, x)

    # Variant without q_lora_rank: a plain q_proj must be created instead.
    attn = CustomDeepseekV2MLAAttention(config=base_config,
                                        hidden_size=128,
                                        num_heads=8,
                                        qk_nope_head_dim=16,
                                        qk_rope_head_dim=16,
                                        v_head_dim=32,
                                        q_lora_rank=None,
                                        kv_lora_rank=16,
                                        prefix="layers.1.self_attn")
    assert hasattr(attn, "q_proj")
|
|
|
|
|
|
@patch("torch_npu.npu_add_rms_norm")
@patch("torch_npu.npu_rms_norm")
def test_custom_deepseek_v2_decoder_layer(mock_rms_norm, mock_add_norm,
                                          mock_distributed, base_config,
                                          vllm_config):
    """Decoder layer picks MoE vs dense MLP based on n_routed_experts."""
    # NPU norm kernels are mocked; return tuples with plausible shapes.
    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
    mock_add_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128),
                                  torch.randn(2, 128))
    # With routed experts configured, the MLP slot holds a MoE layer.
    base_config.n_routed_experts = 4
    layer = CustomDeepseekV2DecoderLayer(config=base_config,
                                         prefix="layers.0",
                                         model_config=vllm_config.model_config,
                                         cache_config=CacheConfig(),
                                         quant_config=None)
    assert isinstance(layer.mlp, CustomDeepseekV2MoE)

    x = torch.randn(2, 4, 128)
    positions = torch.arange(4).repeat(2, 1)

    # Attention and MLP forwards are mocked; only layer plumbing is tested.
    with patch.object(layer.self_attn, "forward", Mock(return_value=torch.randn(2, 4, 128))), \
            patch.object(layer.mlp, "forward", Mock(return_value=torch.randn(2, 4, 128))):
        hidden_states, residual = layer(positions, x, None)
        assert hidden_states.shape == (2, 4, 128)

    # Without routed experts, a dense MLP is instantiated instead.
    base_config.n_routed_experts = None
    layer = CustomDeepseekV2DecoderLayer(config=base_config,
                                         prefix="layers.0",
                                         model_config=vllm_config.model_config,
                                         quant_config=None)
    assert isinstance(layer.mlp, CustomDeepseekV2MLP)
|
|
|
|
|
|
def test_custom_deepseek_v2_for_causal_lm(mock_distributed, vllm_config):
    """Forward pass and weight loading of the top-level causal-LM wrapper."""
    model = CustomDeepseekV2ForCausalLM(vllm_config=vllm_config)

    # Forward: the inner model is mocked; check shape pass-through only.
    input_ids = torch.randint(0, 10000, (2, 4))
    positions = torch.arange(4).repeat(2, 1)
    with patch.object(model.model,
                      "forward",
                      return_value=torch.randn(2, 4, 128)):
        hidden = model(input_ids, positions)
    assert hidden.shape == (2, 4, 128)

    # Weight loading: a single embedding tensor with the loader mocked.
    named_weights = [("model.embed_tokens.weight", torch.randn(10000, 128))]
    with patch(
            "vllm.model_executor.model_loader.weight_utils.default_weight_loader"
    ):
        assert model.load_weights(named_weights) is not None
|