v0.10.1rc1
This commit is contained in:
0
tests/ut/models/__init__.py
Normal file
0
tests/ut/models/__init__.py
Normal file
195
tests/ut/models/test_deepseek_mtp.py
Normal file
195
tests/ut/models/test_deepseek_mtp.py
Normal file
@@ -0,0 +1,195 @@
|
||||
import pytest
|
||||
import torch
|
||||
from pytest_mock import MockerFixture
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.config import CacheConfig, ModelConfig, VllmConfig
|
||||
|
||||
from tests.ut.base import PytestBase
|
||||
from vllm_ascend.models.deepseek_mtp import (
|
||||
CustomDeepSeekMTP, CustomDeepSeekMultiTokenPredictor,
|
||||
CustomDeepSeekMultiTokenPredictorLayer)
|
||||
|
||||
|
||||
class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase):
|
||||
|
||||
@pytest.fixture
|
||||
def setup_mtp_layer(self, mocker: MockerFixture):
|
||||
config = PretrainedConfig(vocab_size=1000,
|
||||
hidden_size=768,
|
||||
rms_norm_eps=1e-5)
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch("vllm.model_executor.layers.layernorm.RMSNorm.__init__",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm.model_executor.models.deepseek_mtp.SharedHead.__init__",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekShareHead.__init__",
|
||||
return_value=None)
|
||||
mocker_deepseek_v2_decode_layer = mocker.patch(
|
||||
"vllm_ascend.models.deepseek_v2.CustomDeepseekV2DecoderLayer.__init__",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch("vllm_ascend.utils.get_ascend_config",
|
||||
return_value=mocker.Mock())
|
||||
|
||||
mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "", None)
|
||||
mocker_deepseek_v2_decode_layer.assert_called_once()
|
||||
return mtp_layer
|
||||
|
||||
def test_init(self, mocker: MockerFixture, setup_mtp_layer):
|
||||
mtp_layer = setup_mtp_layer
|
||||
assert isinstance(mtp_layer, CustomDeepSeekMultiTokenPredictorLayer)
|
||||
|
||||
def test_forward(self, mocker: MockerFixture, setup_mtp_layer):
|
||||
mtp_layer = setup_mtp_layer
|
||||
mocker.patch("torch.nn.Module.__setattr__")
|
||||
mocker.patch("torch.nn.Module.__getattr__")
|
||||
mocker.patch("torch.nn.Module.__delattr__")
|
||||
mocker.patch.object(mtp_layer,
|
||||
'eh_proj',
|
||||
return_value=torch.randn(2, 3, 768))
|
||||
mocker.patch("torch.cat", return_value=torch.randn(2, 3, 768))
|
||||
mtp_layer.mtp_block.return_value = (torch.randn(2, 3, 768),
|
||||
torch.randn(2, 3, 768))
|
||||
|
||||
input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])
|
||||
positions = torch.tensor([[0, 1, 2], [0, 1, 2]])
|
||||
kv_cache = torch.randn(2, 3, 768)
|
||||
previous_hidden_states = torch.randn(2, 3, 768)
|
||||
inputs_embeds = torch.tensor([[1.0, 2.0, 3.0]])
|
||||
|
||||
output = mtp_layer(input_ids, positions, kv_cache, None,
|
||||
previous_hidden_states, inputs_embeds, 0)
|
||||
assert output.shape == (2, 3, 768)
|
||||
|
||||
|
||||
class TestCustomDeepSeekMultiTokenPredictor(PytestBase):
|
||||
|
||||
@pytest.fixture
|
||||
def setup_predictor(self, mocker: MockerFixture):
|
||||
mock_vllm_config = mocker.MagicMock(spec=VllmConfig)
|
||||
mock_model_config = mocker.MagicMock(spec=ModelConfig)
|
||||
mock_hf_config = mocker.MagicMock()
|
||||
mock_hf_config.num_hidden_layers = 12
|
||||
mock_hf_config.num_nextn_predict_layers = 3
|
||||
mock_hf_config.vocab_size = 30000
|
||||
mock_model_config.hf_config = mock_hf_config
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = CacheConfig()
|
||||
mock_vllm_config.quant_config = mocker.MagicMock()
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__init__",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch("vllm_ascend.utils.get_ascend_config",
|
||||
return_value=mocker.Mock())
|
||||
|
||||
predictor = CustomDeepSeekMultiTokenPredictor(
|
||||
vllm_config=mock_vllm_config)
|
||||
return predictor
|
||||
|
||||
def test_init(self, mocker: MockerFixture, setup_predictor):
|
||||
predictor = setup_predictor
|
||||
assert predictor.num_mtp_layers == 3
|
||||
assert isinstance(predictor, CustomDeepSeekMultiTokenPredictor)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'kv_caches, inputs_embeds',
|
||||
[(torch.tensor([[[0.1, 0.2, 0.3]]]), torch.tensor([[0.1, 0.2, 0.3]]))])
|
||||
def test_forward(self, mocker: MockerFixture, setup_predictor, kv_caches,
|
||||
inputs_embeds):
|
||||
predictor = setup_predictor
|
||||
mock_layer = mocker.MagicMock()
|
||||
mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0])
|
||||
predictor.layers_list = [mock_layer]
|
||||
|
||||
# todo: need or not?
|
||||
# predictor.num_mtp_layers = 1
|
||||
input_ids = torch.tensor([[1, 2, 3]])
|
||||
positions = torch.tensor([[0, 1, 2]])
|
||||
mocker.patch(
|
||||
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
|
||||
return_value=torch.tensor([[1.0, 2.0, 3.0]]))
|
||||
output = predictor.forward(input_ids, positions, kv_caches, None, None,
|
||||
inputs_embeds, 0)
|
||||
mock_layer.assert_called_once()
|
||||
assert torch.allclose(output, torch.tensor([1.0, 2.0, 3.0]))
|
||||
|
||||
def test_compute_logits(self, mocker: MockerFixture, setup_predictor):
|
||||
hidden_states = torch.tensor([[1, 2, 3], [4, 5, 6]])
|
||||
predictor = setup_predictor
|
||||
|
||||
mock_layer = mocker.MagicMock()
|
||||
mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0])
|
||||
predictor.layers_list = [mock_layer]
|
||||
mocker.patch("torch.nn.Module.__setattr__")
|
||||
mocker.patch("torch.nn.Module.__getattr__")
|
||||
mocker.patch("torch.nn.Module.__delattr__")
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.logits_processor.LogitsProcessor.__init__",
|
||||
return_value=None)
|
||||
predictor.logits_processor.return_value = torch.tensor([1.0, 2.0, 3.0])
|
||||
|
||||
result_logits = predictor.compute_logits(hidden_states=hidden_states,
|
||||
sampling_metadata=None)
|
||||
predictor.logits_processor.assert_called_once()
|
||||
assert torch.allclose(result_logits, torch.tensor([1.0, 2.0, 3.0]))
|
||||
|
||||
|
||||
class TestCustomDeepSeekMTP(PytestBase):
|
||||
|
||||
@pytest.fixture
|
||||
def setup_mtp(self, mocker: MockerFixture):
|
||||
vllm_config = mocker.MagicMock()
|
||||
vllm_config.model_config.hf_config.num_hidden_layers = 12
|
||||
vllm_config.model_config.hf_config.num_nextn_predict_layers = 3
|
||||
vllm_config.cache_config = mocker.MagicMock()
|
||||
vllm_config.quant_config = mocker.MagicMock()
|
||||
|
||||
mocker.patch("torch.nn.Module.__setattr__")
|
||||
mocker.patch("torch.nn.Module.__getattr__")
|
||||
mocker.patch("torch.nn.Module.__delattr__")
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
|
||||
return_value=None)
|
||||
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch("vllm_ascend.utils.get_ascend_config",
|
||||
return_value=mocker.Mock())
|
||||
|
||||
mtp = CustomDeepSeekMTP(vllm_config=vllm_config)
|
||||
return mtp
|
||||
|
||||
def test_init(self, mocker: MockerFixture, setup_mtp):
|
||||
mtp = setup_mtp
|
||||
assert isinstance(mtp, CustomDeepSeekMTP)
|
||||
|
||||
def test_forward(self, mocker: MockerFixture, setup_mtp):
|
||||
input_ids = torch.tensor([[1, 2, 3]])
|
||||
positions = torch.tensor([[0, 1, 2]])
|
||||
kv_caches = [torch.tensor([[0.1, 0.2, 0.3]])]
|
||||
previous_hidden_states = torch.tensor([[0.1, 0.2, 0.3]])
|
||||
inputs_embeds = torch.tensor([[0.1, 0.2, 0.3]])
|
||||
spec_step_idx = 0
|
||||
setup_mtp.model.return_value = torch.tensor([[1.0, 2.0, 3.0]])
|
||||
|
||||
output = setup_mtp.forward(input_ids, positions, kv_caches, None,
|
||||
previous_hidden_states, inputs_embeds,
|
||||
spec_step_idx)
|
||||
assert torch.allclose(output, torch.tensor([[1.0, 2.0, 3.0]]))
|
||||
295
tests/ut/models/test_deepseek_v2.py
Normal file
295
tests/ut/models/test_deepseek_v2.py
Normal file
@@ -0,0 +1,295 @@
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.config import CacheConfig
|
||||
from vllm.distributed.parallel_state import GroupCoordinator
|
||||
|
||||
from vllm_ascend.models.deepseek_v2 import (
|
||||
CustomDeepseekV2MergedReplicatedLinear, CustomDeepseekV2MLAAttention,
|
||||
CustomDeepseekV2MLP, CustomDeepseekV2MoE,
|
||||
CustomDeepseekV2RowParallelLinear,
|
||||
CustomDeepseekV2RowParallelLinearReplaceAllreduce,
|
||||
CustomDeepseekV2SiluAndMul, LogitsProcessor, ParallelLMHead)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def base_config():
|
||||
config = PretrainedConfig(
|
||||
hidden_size=128,
|
||||
num_attention_heads=8,
|
||||
num_hidden_layers=2,
|
||||
intermediate_size=256,
|
||||
hidden_act="silu",
|
||||
rms_norm_eps=1e-6,
|
||||
rope_theta=10000.0,
|
||||
max_position_embeddings=2048,
|
||||
n_routed_experts=4,
|
||||
n_shared_experts=1,
|
||||
moe_intermediate_size=256,
|
||||
num_experts_per_tok=2,
|
||||
routed_scaling_factor=1.0,
|
||||
first_k_dense_replace=0,
|
||||
moe_layer_freq=1,
|
||||
kv_lora_rank=16,
|
||||
qk_nope_head_dim=16,
|
||||
qk_rope_head_dim=16,
|
||||
v_head_dim=32,
|
||||
topk_method="noaux_tc",
|
||||
scoring_func="softmax",
|
||||
norm_topk_prob=True,
|
||||
n_group=1,
|
||||
topk_group=1,
|
||||
vocab_size=10000,
|
||||
)
|
||||
return config
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def vllm_config(base_config):
|
||||
model_config = SimpleNamespace(
|
||||
hf_config=base_config,
|
||||
tensor_parallel_size=1,
|
||||
dtype=torch.float32,
|
||||
use_mla=False,
|
||||
quant_config=None,
|
||||
max_model_len=2048,
|
||||
)
|
||||
|
||||
cache_config = CacheConfig()
|
||||
vllm_config = Mock()
|
||||
vllm_config.model_config = model_config
|
||||
vllm_config.cache_config = cache_config
|
||||
vllm_config.quant_config = None
|
||||
return vllm_config
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_distributed():
|
||||
tp_group = Mock(spec=GroupCoordinator)
|
||||
tp_group.rank_in_group = 0
|
||||
tp_group.world_size = 1
|
||||
tp_group.device_group = Mock()
|
||||
|
||||
dp_group = Mock(spec=GroupCoordinator)
|
||||
dp_group.rank_in_group = 0
|
||||
dp_group.world_size = 1
|
||||
|
||||
ep_group = Mock(spec=GroupCoordinator)
|
||||
ep_group.rank_in_group = 0
|
||||
ep_group.world_size = 1
|
||||
|
||||
pp_group = Mock(spec=GroupCoordinator)
|
||||
pp_group.rank_in_group = 0
|
||||
pp_group.world_size = 1
|
||||
|
||||
mock_vllm_config = Mock()
|
||||
mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
|
||||
mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)
|
||||
|
||||
with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_ep_group", return_value=ep_group), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_dp_group", return_value=dp_group), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_pp_group",
|
||||
return_value=Mock(is_first_rank=False, is_last_rank=False)), \
|
||||
patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
|
||||
patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
|
||||
_PP=pp_group), \
|
||||
patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \
|
||||
patch("torch.npu.current_device", return_value=0):
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_forward_context():
|
||||
forward_context = Mock(in_profile_run=False, with_prefill=False)
|
||||
with patch("vllm_ascend.models.deepseek_v2.get_forward_context",
|
||||
return_value=forward_context):
|
||||
yield
|
||||
|
||||
|
||||
def test_custom_deepseek_v2_silu_and_mul():
|
||||
torch.set_default_device("cpu")
|
||||
|
||||
silu = CustomDeepseekV2SiluAndMul()
|
||||
assert silu.weight_scale is None
|
||||
|
||||
x = torch.randn(2, 4)
|
||||
output = silu.forward_oot(x)
|
||||
assert output.shape == (2, 2)
|
||||
|
||||
weight_scale = Mock(return_value=torch.tensor(0.1))
|
||||
silu = CustomDeepseekV2SiluAndMul(weight_scale=weight_scale)
|
||||
quant_x = torch.randint(-128, 127, (2, 4), dtype=torch.int32)
|
||||
dynamic_scale = torch.randn(2, 1)
|
||||
with patch("torch_npu.npu_dequant_swiglu_quant",
|
||||
return_value=torch.randn(2, 4)):
|
||||
output = silu.forward_oot((quant_x, dynamic_scale))
|
||||
assert output.shape == (2, 4)
|
||||
|
||||
|
||||
def test_custom_deepseek_v2_merged_replicated_linear(mock_distributed):
|
||||
linear = CustomDeepseekV2MergedReplicatedLinear(input_size=128,
|
||||
output_sizes=[64, 64],
|
||||
bias=False,
|
||||
quant_config=None)
|
||||
assert linear.output_sizes == [64, 64]
|
||||
|
||||
param = Mock()
|
||||
param.data = torch.zeros(128, 128)
|
||||
param.output_dim = 1
|
||||
param.is_gguf_weight = False
|
||||
param.is_gguf_weight_type = False
|
||||
loaded_weight = torch.randn(128, 64)
|
||||
linear.weight_loader(param, loaded_weight, loaded_shard_id=0)
|
||||
|
||||
with pytest.raises(AssertionError):
|
||||
linear.weight_loader(param, torch.randn(128, 32), loaded_shard_id=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cls", [
|
||||
CustomDeepseekV2RowParallelLinearReplaceAllreduce,
|
||||
CustomDeepseekV2RowParallelLinear
|
||||
])
|
||||
def test_row_parallel_linear(cls, mock_distributed):
|
||||
linear = cls(input_size=128, output_size=64, bias=False, quant_config=None)
|
||||
linear.quant_method = Mock()
|
||||
linear.quant_method.apply.return_value = torch.randn(2, 4, 64)
|
||||
|
||||
input_ = torch.randn(2, 4, 128)
|
||||
with patch("vllm_ascend.models.deepseek_v2.split_tensor_along_last_dim",
|
||||
return_value=[torch.randn(2, 4, 64)]):
|
||||
linear.input_is_parallel = False
|
||||
output = linear(input_, is_prefill=True)
|
||||
assert output[0].shape == (2, 4, 64)
|
||||
|
||||
linear.input_is_parallel = True
|
||||
output = linear(input_, is_prefill=False)
|
||||
assert output[0].shape == (2, 4, 64)
|
||||
|
||||
|
||||
def test_custom_deepseek_v2_mlp(mock_distributed, base_config):
|
||||
mlp = CustomDeepseekV2MLP(hidden_size=128,
|
||||
intermediate_size=256,
|
||||
hidden_act="silu",
|
||||
quant_config=None)
|
||||
assert isinstance(mlp.act_fn, CustomDeepseekV2SiluAndMul)
|
||||
|
||||
x = torch.randn(2, 4, 128)
|
||||
output = mlp(x)
|
||||
assert output.shape == (2, 4, 128)
|
||||
|
||||
with patch("vllm_ascend.models.deepseek_v2.QuantizationConfig"
|
||||
) as mock_quant_config:
|
||||
mock_quant_config.name = "w8a8dynamic"
|
||||
with pytest.raises(NotImplementedError):
|
||||
CustomDeepseekV2MLP(hidden_size=128,
|
||||
intermediate_size=256,
|
||||
hidden_act="silu",
|
||||
quant_config=mock_quant_config,
|
||||
force_replicate=False)
|
||||
with pytest.raises(ValueError):
|
||||
CustomDeepseekV2MLP(hidden_size=128,
|
||||
intermediate_size=256,
|
||||
hidden_act="relu",
|
||||
quant_config=None)
|
||||
|
||||
|
||||
def test_custom_deepseek_v2_moe(mock_distributed, base_config,
|
||||
mock_forward_context):
|
||||
base_config.n_shared_experts = 1
|
||||
moe = CustomDeepseekV2MoE(config=base_config,
|
||||
quant_config=None,
|
||||
prefix="mlp")
|
||||
assert moe.top_k == 2
|
||||
|
||||
x = torch.randn(2, 4, 128)
|
||||
attn_metadata = Mock(num_prefills=1)
|
||||
with patch("vllm_ascend.ops.fused_moe.AscendFusedMoE.__call__",
|
||||
return_value=(torch.randn(2, 4, 128), torch.randn(2, 4, 128))):
|
||||
output = moe(x, attn_metadata)
|
||||
assert output.shape == (2, 4, 128)
|
||||
|
||||
|
||||
@patch("torch_npu.npu_rms_norm")
|
||||
def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed,
|
||||
base_config):
|
||||
mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
|
||||
|
||||
attn = CustomDeepseekV2MLAAttention(config=base_config,
|
||||
hidden_size=128,
|
||||
num_heads=8,
|
||||
qk_nope_head_dim=16,
|
||||
qk_rope_head_dim=16,
|
||||
v_head_dim=32,
|
||||
q_lora_rank=16,
|
||||
kv_lora_rank=16,
|
||||
cache_config=CacheConfig(),
|
||||
quant_config=None,
|
||||
prefix="layers.0.self_attn")
|
||||
assert attn.debug_layer_idx == 0
|
||||
|
||||
x = torch.randn(2, 4, 128)
|
||||
positions = torch.arange(4).repeat(2, 1)
|
||||
with patch.object(attn.mla_attn,
|
||||
"__call__",
|
||||
return_value=torch.randn(2, 4, 128)):
|
||||
with pytest.raises(AssertionError):
|
||||
attn(positions, x)
|
||||
|
||||
attn = CustomDeepseekV2MLAAttention(config=base_config,
|
||||
hidden_size=128,
|
||||
num_heads=8,
|
||||
qk_nope_head_dim=16,
|
||||
qk_rope_head_dim=16,
|
||||
v_head_dim=32,
|
||||
q_lora_rank=None,
|
||||
kv_lora_rank=16,
|
||||
prefix="layers.1.self_attn")
|
||||
assert hasattr(attn, "q_proj")
|
||||
|
||||
|
||||
def test_deepseek_v2_lmhead(mock_distributed, vllm_config):
|
||||
# 创建一个简单的配置对象
|
||||
class SimpleConfig:
|
||||
|
||||
def __init__(self):
|
||||
self.vocab_size = 10000
|
||||
self.hidden_size = 128
|
||||
|
||||
config = SimpleConfig()
|
||||
|
||||
# 直接创建lmhead和logits_processor
|
||||
lmhead = ParallelLMHead(config.vocab_size, config.hidden_size)
|
||||
logits_processor = LogitsProcessor(config.vocab_size)
|
||||
|
||||
# 创建模拟输出
|
||||
mock_output = torch.randn(2, 4, config.hidden_size)
|
||||
mock_logits = torch.randn(2, 4, config.vocab_size)
|
||||
|
||||
# 直接测试logits_processor
|
||||
with patch.object(lmhead.quant_method, "apply", return_value=mock_logits):
|
||||
with patch.object(logits_processor,
|
||||
"_gather_logits",
|
||||
return_value=mock_logits):
|
||||
logits = logits_processor(lmhead, mock_output)
|
||||
assert logits.shape == (2, 4, config.vocab_size)
|
||||
424
tests/ut/models/test_qwen2_5_vl.py
Normal file
424
tests/ut/models/test_qwen2_5_vl.py
Normal file
@@ -0,0 +1,424 @@
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from tests.ut.base import PytestBase
|
||||
from vllm_ascend.models.qwen2_5_vl import (
|
||||
AscendQwen2_5_VisionAttention, AscendQwen2_5_VisionBlock,
|
||||
AscendQwen2_5_VisionPatchEmbed, AscendQwen2_5_VisionRotaryEmbedding,
|
||||
AscendQwen2_5_VisionTransformer, AscendQwen2_5_VLForConditionalGeneration)
|
||||
|
||||
|
||||
class TestAscendQwen2_5_VisionAttention(PytestBase):
|
||||
|
||||
def init_attention(
|
||||
self,
|
||||
mocker,
|
||||
embed_dim=1000,
|
||||
num_heads=10,
|
||||
projection_size=100,
|
||||
quant_config=None,
|
||||
prefix="",
|
||||
):
|
||||
mocker_attn = mocker.patch(
|
||||
"vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionAttention.__init__")
|
||||
|
||||
attention = AscendQwen2_5_VisionAttention(
|
||||
embed_dim=embed_dim,
|
||||
num_heads=num_heads,
|
||||
projection_size=projection_size,
|
||||
quant_config=quant_config,
|
||||
prefix=prefix,
|
||||
)
|
||||
args, kwargs = mocker_attn.call_args
|
||||
assert args == (embed_dim, num_heads, projection_size, None, "")
|
||||
assert not kwargs
|
||||
attention.num_attention_heads_per_partition = num_heads
|
||||
return attention
|
||||
|
||||
def test_attn_init_should_normal(self, mocker: MockerFixture):
|
||||
embed_dim = 1000
|
||||
num_heads = 10
|
||||
projection_size = 100
|
||||
quant_config = None
|
||||
prefix = ""
|
||||
vit = self.init_attention(
|
||||
embed_dim=embed_dim,
|
||||
num_heads=num_heads,
|
||||
projection_size=projection_size,
|
||||
quant_config=quant_config,
|
||||
prefix=prefix,
|
||||
mocker=mocker,
|
||||
)
|
||||
assert vit.embed_dim == 1000
|
||||
assert vit.hidden_size_per_attention_head == 10
|
||||
|
||||
def test_attn_init_should_raise_error(self, mocker: MockerFixture):
|
||||
embed_dim = 1000
|
||||
num_heads = 7
|
||||
projection_size = 100
|
||||
quant_config = None
|
||||
prefix = ""
|
||||
with pytest.raises(AssertionError):
|
||||
# projection_size should divided by num heads
|
||||
self.init_attention(
|
||||
mocker=mocker,
|
||||
embed_dim=embed_dim,
|
||||
num_heads=num_heads,
|
||||
projection_size=projection_size,
|
||||
quant_config=quant_config,
|
||||
prefix=prefix,
|
||||
)
|
||||
|
||||
def test_split_qkv(self, mocker: MockerFixture):
|
||||
attention = self.init_attention(mocker=mocker)
|
||||
mocker.patch("torch.nn.Module.__setattr__")
|
||||
mocker.patch("torch.nn.Module.__getattr__")
|
||||
mocker.patch("torch.nn.Module.__delattr__")
|
||||
q, k, v = attention.split_qkv(torch.rand((100, 10, 300)))
|
||||
assert q.shape == (100, 10, 10, 10)
|
||||
assert k.shape == (100, 10, 10, 10)
|
||||
assert v.shape == (100, 10, 10, 10)
|
||||
|
||||
def test_attn_forward(self, mocker: MockerFixture):
|
||||
attention = self.init_attention(mocker=mocker)
|
||||
mocker.patch("torch.nn.Module.__setattr__")
|
||||
mocker.patch("torch.nn.Module.__getattr__")
|
||||
mocker.patch("torch.nn.Module.__delattr__")
|
||||
x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim
|
||||
cu_seqlens = torch.tensor([10, 50, 100])
|
||||
cos = torch.rand((1, 100, 1, 128))
|
||||
sin = torch.rand((1, 100, 1, 128))
|
||||
|
||||
qkv = lambda x: (x, 0) # noqa
|
||||
split_qkv = lambda x: [ #noqa
|
||||
torch.rand((100, 3, 10, 128)) for i in range(3)
|
||||
] # noqa
|
||||
npu_rotary_mul = lambda q, cos, sin: q # noqa
|
||||
_npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa
|
||||
proj = lambda x: (x, 0) # noqa
|
||||
|
||||
mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv)
|
||||
mocker_split_qkv = mocker.patch.object(
|
||||
attention,
|
||||
"split_qkv",
|
||||
side_effect=split_qkv,
|
||||
)
|
||||
mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul",
|
||||
side_effect=npu_rotary_mul)
|
||||
mocker_npu_flash_attention_unpad = mocker.patch(
|
||||
"torch_npu._npu_flash_attention_unpad",
|
||||
side_effect=_npu_flash_attention_unpad,
|
||||
)
|
||||
mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj)
|
||||
attention.__dict__["qkv"] = mocker_qkv
|
||||
attention.__dict__["split_qkv"] = mocker_split_qkv
|
||||
attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul
|
||||
attention.__dict__["_npu_flash_attention_unpad"] = (
|
||||
mocker_npu_flash_attention_unpad)
|
||||
attention.__dict__["proj"] = mocker_proj
|
||||
|
||||
output = attention.forward(
|
||||
x=x,
|
||||
cu_seqlens=cu_seqlens,
|
||||
cos=cos,
|
||||
sin=sin,
|
||||
)
|
||||
qkv_args, qkv_kwargs = mocker_qkv.call_args
|
||||
assert qkv_args == (x, )
|
||||
assert not qkv_kwargs
|
||||
|
||||
split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args
|
||||
assert split_qkv_args == (x, )
|
||||
assert not split_qkv_kwargs
|
||||
|
||||
npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args
|
||||
assert npu_rotary_mul_args[1:] == (cos, sin)
|
||||
assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128])
|
||||
assert not npu_rotary_mul_kwargs
|
||||
|
||||
assert output.shape == torch.Size([100, 3, 1280])
|
||||
|
||||
|
||||
class TestAscendQwen2_5_VisionBlock(PytestBase):
|
||||
|
||||
def init_vision_block(
|
||||
self,
|
||||
mocker,
|
||||
dim=100,
|
||||
num_heads=10,
|
||||
mlp_hidden_dim=100,
|
||||
):
|
||||
mocker_vit = mocker.patch(
|
||||
"vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__",
|
||||
return_value=None,
|
||||
)
|
||||
|
||||
mocker_attn = mocker.patch(
|
||||
"vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionAttention.__init__",
|
||||
return_value=None,
|
||||
)
|
||||
|
||||
mocker.patch("torch.nn.Module.__setattr__")
|
||||
mocker.patch("torch.nn.Module.__getattr__")
|
||||
mocker.patch("torch.nn.Module.__delattr__")
|
||||
vision_block = AscendQwen2_5_VisionBlock(
|
||||
dim=dim,
|
||||
num_heads=num_heads,
|
||||
mlp_hidden_dim=mlp_hidden_dim,
|
||||
)
|
||||
args, kwargs = mocker_vit.call_args
|
||||
assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "")
|
||||
assert not kwargs
|
||||
|
||||
args1, kwargs1 = mocker_attn.call_args
|
||||
assert not args1
|
||||
assert kwargs1 == {
|
||||
"embed_dim": dim,
|
||||
"num_heads": num_heads,
|
||||
"projection_size": dim,
|
||||
"quant_config": None,
|
||||
"prefix": ".attn",
|
||||
}
|
||||
return vision_block
|
||||
|
||||
def test_init_vision_block_should_normal(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
):
|
||||
vision_block = self.init_vision_block(mocker)
|
||||
assert isinstance(vision_block, AscendQwen2_5_VisionBlock)
|
||||
|
||||
def test_vision_block_forward(self, mocker: MockerFixture):
|
||||
x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d
|
||||
cu_seqlens = torch.tensor([10, 50, 100])
|
||||
cos = torch.rand((1, 100, 1, 128))
|
||||
sin = torch.rand((1, 100, 1, 128))
|
||||
vision_block = self.init_vision_block(mocker)
|
||||
mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x)
|
||||
mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x)
|
||||
vision_block.__dict__["attn"] = mocker_attn
|
||||
vision_block.__dict__["mlp"] = mocker_mlp
|
||||
|
||||
output = vision_block.forward(x.clone(), cu_seqlens, cos, sin)
|
||||
|
||||
_, attn_kwargs = mocker_attn.call_args
|
||||
assert attn_kwargs == {
|
||||
"cu_seqlens": cu_seqlens,
|
||||
"cos": cos,
|
||||
"sin": sin,
|
||||
}
|
||||
|
||||
assert torch.all(x * 3 == output)
|
||||
|
||||
|
||||
class TestAscendQwen2_5_VisionPatchEmbed(PytestBase):
|
||||
|
||||
def test_forward(self):
|
||||
patch_embed = AscendQwen2_5_VisionPatchEmbed()
|
||||
|
||||
ret = patch_embed(torch.rand((120, 1176)))
|
||||
assert ret.shape == (120, 1152)
|
||||
|
||||
|
||||
class TestAscendQwen2_5_VisionRotaryEmbedding(PytestBase):
|
||||
|
||||
def init_rotary_embedding(
|
||||
self,
|
||||
mocker,
|
||||
dim=128,
|
||||
):
|
||||
mocker_ebed = mocker.patch(
|
||||
"vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionRotaryEmbedding.__init__",
|
||||
return_value=None,
|
||||
)
|
||||
mocker.patch("torch.nn.Module.__setattr__")
|
||||
mocker.patch("torch.nn.Module.__getattr__")
|
||||
mocker.patch("torch.nn.Module.__delattr__")
|
||||
rotary_embedding = AscendQwen2_5_VisionRotaryEmbedding(dim=dim, )
|
||||
args, kwargs = mocker_ebed.call_args
|
||||
assert args == (dim, 10000.0)
|
||||
assert not kwargs
|
||||
return rotary_embedding
|
||||
|
||||
def test_init_rotary_embedding_should_normal(self, mocker: MockerFixture):
|
||||
rotary_embedding = self.init_rotary_embedding(mocker)
|
||||
assert isinstance(rotary_embedding,
|
||||
AscendQwen2_5_VisionRotaryEmbedding)
|
||||
|
||||
|
||||
class TestAscendQwen2_5_VisionTransformer(PytestBase):
|
||||
|
||||
input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]])
|
||||
|
||||
def init_vision_transformer(
|
||||
self,
|
||||
mocker,
|
||||
):
|
||||
norm_eps = 1e-6
|
||||
vision_config = mocker.MagicMock()
|
||||
vision_config.patch_size = 16
|
||||
vision_config.temporal_patch_size = 2
|
||||
vision_config.in_channels = 3
|
||||
vision_config.hidden_act = "gelu"
|
||||
vision_config.depth = 0
|
||||
vision_config.num_heads = 10
|
||||
vision_config.hidden_size = 300
|
||||
|
||||
mocker.patch(
|
||||
"vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_rank",
|
||||
return_value=0,
|
||||
)
|
||||
mocker.patch("vllm.distributed.utils.divide", return_value=100)
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size",
|
||||
return_value=2,
|
||||
)
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.linear.divide",
|
||||
return_value=2,
|
||||
)
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.linear.get_tensor_model_parallel_rank",
|
||||
return_value=0)
|
||||
mocker.patch(
|
||||
"vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_world_size",
|
||||
return_value=2,
|
||||
)
|
||||
|
||||
vision_transformer = AscendQwen2_5_VisionTransformer(
|
||||
vision_config,
|
||||
norm_eps,
|
||||
)
|
||||
|
||||
assert not vision_transformer.interleaved
|
||||
return vision_transformer
|
||||
|
||||
def test_init_vision_transformer(self, mocker: MockerFixture):
|
||||
vision_transformer = self.init_vision_transformer(mocker)
|
||||
assert isinstance(vision_transformer, AscendQwen2_5_VisionTransformer)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"interleaved, expected",
|
||||
[
|
||||
(
|
||||
False,
|
||||
torch.tensor([
|
||||
input_data[0, 0].cos(),
|
||||
input_data[0, 1].cos(),
|
||||
input_data[0, 0].cos(),
|
||||
input_data[0, 1].cos(),
|
||||
input_data[1, 0].cos(),
|
||||
input_data[1, 1].cos(),
|
||||
input_data[1, 0].cos(),
|
||||
input_data[1, 1].cos(),
|
||||
]),
|
||||
),
|
||||
(
|
||||
True,
|
||||
torch.tensor([
|
||||
input_data[0, 0].cos(),
|
||||
input_data[0, 0].cos(),
|
||||
input_data[0, 1].cos(),
|
||||
input_data[0, 1].cos(),
|
||||
input_data[1, 0].cos(),
|
||||
input_data[1, 0].cos(),
|
||||
input_data[1, 1].cos(),
|
||||
input_data[1, 1].cos(),
|
||||
]),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture):
|
||||
vision_transformer = self.init_vision_transformer(mocker)
|
||||
mocker.patch("torch.nn.Module.__setattr__")
|
||||
mocker.patch("torch.nn.Module.__getattr__")
|
||||
mocker.patch("torch.nn.Module.__delattr__")
|
||||
vision_transformer.__dict__["interleaved"] = interleaved
|
||||
vision_transformer.__dict__["hidden_size_per_attention_head"] = 2
|
||||
vision_transformer.hidden_size_per_attention_head = 4
|
||||
cos_new, _ = vision_transformer.cal_cos_sin(self.input_data)
|
||||
assert cos_new.shape == (1, 32, 1, 2)
|
||||
|
||||
    def test_forward(self, mocker: MockerFixture):
        """End-to-end forward() with every heavy sub-module stubbed out.

        16 patch tokens flow through one fake vision block; with
        spatial_merge_unit=2 the merger output is asserted to collapse to
        (8, 256).
        """
        vision_transformer = self.init_vision_transformer(mocker)
        # Neutralize nn.Module attribute plumbing so plain __dict__ writes work.
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
        x = torch.randn(1, 3, 224, 224)
        grid_thw = torch.tensor([[1, 4, 4]])  # t=1, h=4, w=4 -> 16 patches
        mocker_patch_embed = mocker.patch.object(
            vision_transformer,
            "patch_embed",
            side_effect=lambda _: torch.randn(16, 512),  # noqa
        )
        mocker_rot_pos_emb = mocker.patch.object(
            vision_transformer,
            "rot_pos_emb",
            side_effect=lambda _: torch.randn(16, 64),  # noqa
        )
        mocker_get_window_index = mocker.patch.object(
            vision_transformer,
            "get_window_index",
            side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]),  # noqa
        )
        mocker_cal_cos_sin = mocker.patch.object(
            vision_transformer,
            "cal_cos_sin",
            side_effect=lambda _:
            (torch.randn(16, 32), torch.randn(16, 32)),  # noqa
        )
        mocker_merger = mocker.patch.object(
            vision_transformer,
            "merger",
            side_effect=lambda _: torch.randn(16, 256),  # noqa
        )
        # Install the stubs directly in __dict__ (bypasses the mocked
        # nn.Module.__setattr__) so forward() resolves them.
        vision_transformer.__dict__["vision_blocks"] = [
            lambda *args, **kwargs: torch.randn(16, 1, 512)  # noqa
        ]
        vision_transformer.__dict__["patch_embed"] = mocker_patch_embed
        vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb
        vision_transformer.__dict__[
            "get_window_index"] = mocker_get_window_index
        vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin
        vision_transformer.__dict__["merger"] = mocker_merger
        vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2]
        vision_transformer.__dict__["spatial_merge_unit"] = 2
        ret = vision_transformer.forward(x, grid_thw)
        assert ret.shape == (8, 256)
        # Each stub must have been fed the expected argument exactly.
        mocker_patch_embed.assert_called_with(x)
        mocker_rot_pos_emb.assert_called_with(grid_thw)
        mocker_get_window_index.assert_called_with(grid_thw)
        mocker_cal_cos_sin.assert_called_once()
        mocker_merger.assert_called_once()
class TestAscendQwen2_5_VLForConditionalGeneration(PytestBase):
    """Constructor wiring test for the padded Ascend Qwen2.5-VL model."""

    def test_init_vl_for_conditional_generation(self, mocker: MockerFixture):
        """__init__ must delegate to the vLLM base with kwargs only and
        build the Ascend vision transformer exactly once."""
        vllm_config = mocker.MagicMock()
        vllm_config.vision_config = "vision_config"
        vllm_config.rms_norm_eps = 1e-5
        # Neutralize nn.Module attribute plumbing for the mocked construction.
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
        mocker_vl = mocker.patch(
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__",
            return_value=None,
        )
        mocker_vit = mocker.patch(
            "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionTransformer.__init__",
            return_value=None,
        )

        vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration(
            vllm_config=vllm_config)
        args, kwargs = mocker_vl.call_args
        assert not args
        assert kwargs == {"vllm_config": vllm_config, "prefix": ""}
        mocker_vit.assert_called_once()
        assert isinstance(
            vl_for_conditional_generation,
            AscendQwen2_5_VLForConditionalGeneration,
        )
422
tests/ut/models/test_qwen2_5_vl_without_padding.py
Normal file
422
tests/ut/models/test_qwen2_5_vl_without_padding.py
Normal file
@@ -0,0 +1,422 @@
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from pytest_mock import MockerFixture
|
||||
from vllm.model_executor.models.qwen2_5_vl import \
|
||||
Qwen2_5_VLForConditionalGeneration
|
||||
|
||||
from tests.ut.base import PytestBase
|
||||
from vllm_ascend.models.qwen2_5_vl_without_padding import (
|
||||
AscendQwen2_5_VisionAttention_Without_Padding,
|
||||
AscendQwen2_5_VisionBlock_Without_Padding,
|
||||
AscendQwen2_5_VisionPatchEmbed_Without_Padding,
|
||||
AscendQwen2_5_VisionTransformer_Without_Padding,
|
||||
AscendQwen2_5_VLForConditionalGeneration_Without_Padding)
|
||||
|
||||
|
||||
class TestAscendQwen2_5_VisionAttention_Without_Padding(PytestBase):
    """Unit tests for the unpadded Ascend vision attention layer."""

    def init_attention(
        self,
        mocker,
        embed_dim=1000,
        num_heads=10,
        projection_size=100,
        quant_config=None,
        prefix="",
    ):
        """Build the attention layer with the vLLM parent __init__ mocked.

        Verifies the parent is called positionally, then injects
        num_attention_heads_per_partition (normally set by the mocked parent).
        """
        mocker_attn = mocker.patch(
            "vllm_ascend.models.qwen2_5_vl_without_padding.Qwen2_5_VisionAttention.__init__"
        )

        attention = AscendQwen2_5_VisionAttention_Without_Padding(
            embed_dim=embed_dim,
            num_heads=num_heads,
            projection_size=projection_size,
            quant_config=quant_config,
            prefix=prefix,
        )
        args, kwargs = mocker_attn.call_args
        # NOTE(review): asserts literal (None, "") rather than
        # (quant_config, prefix); holds for every current caller.
        assert args == (embed_dim, num_heads, projection_size, None, "")
        assert not kwargs
        attention.num_attention_heads_per_partition = num_heads
        return attention

    def test_vit_init_should_normal(self, mocker: MockerFixture):
        """Happy path: head dim = projection_size / num_heads = 10."""
        embed_dim = 1000
        num_heads = 10
        projection_size = 100
        quant_config = None
        prefix = ""
        vit = self.init_attention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            projection_size=projection_size,
            quant_config=quant_config,
            prefix=prefix,
            mocker=mocker,
        )
        assert vit.embed_dim == 1000
        assert vit.hidden_size_per_attention_head == 10
    def test_vit_init_should_raise_error(self, mocker: MockerFixture):
        """projection_size not divisible by num_heads must assert."""
        embed_dim = 1000
        num_heads = 7
        projection_size = 100
        quant_config = None
        prefix = ""
        with pytest.raises(AssertionError):
            # projection_size should divided by num heads
            self.init_attention(
                mocker=mocker,
                embed_dim=embed_dim,
                num_heads=num_heads,
                projection_size=projection_size,
                quant_config=quant_config,
                prefix=prefix,
            )

    def test_vit_forward(self, mocker: MockerFixture):
        """forward() with qkv/proj and the NPU kernels stubbed out.

        Checks the argument flow into each stub and that the output keeps
        the (seq, batch, hidden) shape (100, 3, 1280).
        """
        # Neutralize nn.Module attribute plumbing so __dict__ stubs resolve.
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
        attention = self.init_attention(mocker=mocker)
        x = torch.rand((100, 3, 10 * 3 * 128))  # s,b, head*3*head_dim
        cu_seqlens = torch.tensor([10, 50, 100])
        cos = torch.rand((1, 100, 1, 128))
        sin = torch.rand((1, 100, 1, 128))

        # Identity-ish stand-ins for the linear layers and NPU kernels.
        qkv = lambda x: (x, 0)  # noqa
        split_qkv = lambda x: [  #noqa
            torch.rand((100, 3, 10, 128)) for i in range(3)
        ]  # noqa
        npu_rotary_mul = lambda q, cos, sin: q  # noqa
        _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"]  # noqa
        proj = lambda x: (x, 0)  # noqa

        mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv)
        mocker_split_qkv = mocker.patch.object(
            attention,
            "split_qkv",
            side_effect=split_qkv,
        )
        mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul",
                                             side_effect=npu_rotary_mul)
        mocker_npu_flash_attention_unpad = mocker.patch(
            "torch_npu._npu_flash_attention_unpad",
            side_effect=_npu_flash_attention_unpad,
        )
        mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj)
        attention.__dict__["qkv"] = mocker_qkv
        attention.__dict__["split_qkv"] = mocker_split_qkv
        attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul
        attention.__dict__["_npu_flash_attention_unpad"] = (
            mocker_npu_flash_attention_unpad)
        attention.__dict__["proj"] = mocker_proj

        output = attention.forward(
            x=x,
            cu_seqlens=cu_seqlens,
            cos=cos,
            sin=sin,
        )
        qkv_args, qkv_kwargs = mocker_qkv.call_args
        assert qkv_args == (x, )
        assert not qkv_kwargs

        split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args
        assert split_qkv_args == (x, )
        assert not split_qkv_kwargs

        # q is transposed to (batch, seq, heads, head_dim) before rotary.
        npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args
        assert npu_rotary_mul_args[1:] == (cos, sin)
        assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128])
        assert not npu_rotary_mul_kwargs

        assert output.shape == torch.Size([100, 3, 1280])
class TestAscendQwen2_5_VisionBlock_Without_Padding(PytestBase):
    """Unit tests for the unpadded Ascend vision transformer block."""

    def init_vision_block(
        self,
        mocker,
        dim=100,
        num_heads=10,
        mlp_hidden_dim=100,
    ):
        """Build the block with parent and attention __init__ mocked, and
        verify both delegation calls (parent positionally with F.silu,
        attention by keyword with prefix '.attn')."""
        mocker_vit = mocker.patch(
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__",
            return_value=None,
        )

        mocker_attn = mocker.patch(
            "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionAttention_Without_Padding.__init__",
            return_value=None,
        )

        # Neutralize nn.Module attribute plumbing for the mocked construction.
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
        vision_block = AscendQwen2_5_VisionBlock_Without_Padding(
            dim=dim,
            num_heads=num_heads,
            mlp_hidden_dim=mlp_hidden_dim,
        )
        args, kwargs = mocker_vit.call_args
        assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "")
        assert not kwargs

        args1, kwargs1 = mocker_attn.call_args
        assert not args1
        assert kwargs1 == {
            "embed_dim": dim,
            "num_heads": num_heads,
            "projection_size": dim,
            "quant_config": None,
            "prefix": ".attn",
        }
        return vision_block

    def test_init_vision_block_should_normal(
        self,
        mocker: MockerFixture,
    ):
        """Smoke test: construction succeeds and yields the right type."""
        vision_block = self.init_vision_block(mocker)
        assert isinstance(vision_block,
                          AscendQwen2_5_VisionBlock_Without_Padding)

    def test_vision_block_forward(self, mocker: MockerFixture):
        """With attn and mlp both returning x, the two residual adds give
        x + x + x, so forward must return 3 * x."""
        x = torch.randint(1, 100, (100, 3, 1280))  # s,b,d
        cu_seqlens = torch.tensor([10, 50, 100])
        cos = torch.rand((1, 100, 1, 128))
        sin = torch.rand((1, 100, 1, 128))
        vision_block = self.init_vision_block(mocker)
        mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x)
        mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x)
        vision_block.__dict__["attn"] = mocker_attn
        vision_block.__dict__["mlp"] = mocker_mlp

        output = vision_block.forward(x.clone(), cu_seqlens, cos, sin)

        _, attn_kwargs = mocker_attn.call_args
        assert attn_kwargs == {
            "cu_seqlens": cu_seqlens,
            "cos": cos,
            "sin": sin,
        }

        assert torch.all(x * 3 == output)
class TestAscendQwen2_5_VisionPatchEmbed_Without_Padding(PytestBase):
    """Shape test for the unpadded vision patch embedding."""

    def test_forward(self):
        # With default construction, 120 flattened patches of size 1176 are
        # projected to the embed dim — presumably 1152 by default; the
        # assertion below pins that. TODO confirm against the layer's defaults.
        patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding()

        ret = patch_embed(torch.rand((120, 1176)))
        assert ret.shape == (120, 1152)
class TestAscendQwen2_5_VisionTransformer_Without_Padding(PytestBase):
    """Unit tests for the unpadded Ascend vision transformer."""

    # Fixed 2x2 positions used to derive the expected rotary cos tables.
    input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]])

    def init_vision_transformer(
        self,
        mocker,
    ):
        """Build the transformer with all sub-modules and the TP parallel
        state mocked, asserting the vLLM parent is called positionally and
        the rotary embedding is built exactly once."""
        norm_eps = 1e-6
        vision_config = mocker.MagicMock()
        vision_config.patch_size = 16
        vision_config.temporal_patch_size = 2
        vision_config.in_channels = 3
        vision_config.hidden_act = "gelu"
        vision_config.depth = 0  # no real blocks needed
        vision_config.hidden_size = 1280
        vision_config.num_heads = 16

        # Neutralize nn.Module attribute plumbing for the mocked construction.
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
        mocker_vit = mocker.patch(
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionTransformer.__init__",
            return_value=None,
        )
        # NOTE(review): patches the rotary embedding at the *padded* module
        # path (qwen2_5_vl, not qwen2_5_vl_without_padding) — confirm the
        # without-padding module imports it from there.
        mocker_vision_rotary_embedding = mocker.patch(
            "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionRotaryEmbedding.__init__",
            return_value=None,
        )
        mocker.patch(
            "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionBlock_Without_Padding.__init__",
            return_value=None,
        )
        mocker.patch(
            "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionPatchEmbed_Without_Padding.__init__",
            return_value=None,
        )
        mocker.patch(
            "vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_world_size",
            return_value=1,
        )
        mocker.patch(
            "vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_rank",
            return_value=0,
        )
        mocker.patch("vllm.distributed.utils.divide", return_value=100)

        vision_transformer = AscendQwen2_5_VisionTransformer_Without_Padding(
            vision_config,
            norm_eps,
        )
        args, kwargs = mocker_vit.call_args
        assert args == (vision_config, norm_eps, None, "")
        assert not kwargs
        mocker_vision_rotary_embedding.assert_called_once()
        return vision_transformer

    def test_init_vision_transformer(self, mocker: MockerFixture):
        """Smoke test: the fixture builds the unpadded transformer type."""
        vision_transformer = self.init_vision_transformer(mocker)
        assert isinstance(vision_transformer,
                          AscendQwen2_5_VisionTransformer_Without_Padding)

    # Two layouts of the rotary table: interleaved=False repeats the whole
    # row pattern, interleaved=True repeats each element pairwise.
    @pytest.mark.parametrize(
        "interleaved, expected",
        [
            (
                False,
                torch.tensor([
                    input_data[0, 0].cos(),
                    input_data[0, 1].cos(),
                    input_data[0, 0].cos(),
                    input_data[0, 1].cos(),
                    input_data[1, 0].cos(),
                    input_data[1, 1].cos(),
                    input_data[1, 0].cos(),
                    input_data[1, 1].cos(),
                ]),
            ),
            (
                True,
                torch.tensor([
                    input_data[0, 0].cos(),
                    input_data[0, 0].cos(),
                    input_data[0, 1].cos(),
                    input_data[0, 1].cos(),
                    input_data[1, 0].cos(),
                    input_data[1, 0].cos(),
                    input_data[1, 1].cos(),
                    input_data[1, 1].cos(),
                ]),
            ),
        ],
    )
    def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture):
        """cal_cos_sin must match the hand-built table for both layouts."""
        vision_transformer = self.init_vision_transformer(mocker)
        vision_transformer.__dict__["interleaved"] = interleaved
        vision_transformer.__dict__["hidden_size_per_attention_head"] = 2
        # NOTE(review): with nn.Module.__setattr__ still mocked from the
        # fixture, this assignment presumably does not override the __dict__
        # value of 2 above — confirm.
        vision_transformer.hidden_size_per_attention_head = 4
        cos_new, _ = vision_transformer.cal_cos_sin(self.input_data)
        assert cos_new.shape == (1, 4, 1, 2)
        assert torch.allclose(cos_new.view(-1), expected)

    def test_forward(self, mocker: MockerFixture):
        """End-to-end forward() with every heavy sub-module stubbed out.

        16 patch tokens flow through one fake vision block; with
        spatial_merge_unit=2 the merger output is asserted as (8, 256).
        """
        vision_transformer = self.init_vision_transformer(mocker)
        x = torch.randn(1, 3, 224, 224)
        grid_thw = torch.tensor([[1, 4, 4]])  # t=1, h=4, w=4 -> 16 patches
        mocker_patch_embed = mocker.patch.object(
            vision_transformer,
            "patch_embed",
            side_effect=lambda _: torch.randn(16, 512),  # noqa
        )
        mocker_rot_pos_emb = mocker.patch.object(
            vision_transformer,
            "rot_pos_emb",
            side_effect=lambda _: torch.randn(16, 64),  # noqa
        )
        mocker_get_window_index = mocker.patch.object(
            vision_transformer,
            "get_window_index",
            side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]),  # noqa
        )
        mocker_cal_cos_sin = mocker.patch.object(
            vision_transformer,
            "cal_cos_sin",
            side_effect=lambda _:
            (torch.randn(16, 32), torch.randn(16, 32)),  # noqa
        )
        mocker_merger = mocker.patch.object(
            vision_transformer,
            "merger",
            side_effect=lambda _: torch.randn(16, 256),  # noqa
        )
        # Install the stubs directly in __dict__ (bypasses the mocked
        # nn.Module.__setattr__) so forward() resolves them.
        vision_transformer.__dict__["vision_blocks"] = [
            lambda *args, **kwargs: torch.randn(16, 1, 512)  # noqa
        ]
        vision_transformer.__dict__["patch_embed"] = mocker_patch_embed
        vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb
        vision_transformer.__dict__[
            "get_window_index"] = mocker_get_window_index
        vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin
        vision_transformer.__dict__["merger"] = mocker_merger
        vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2]
        vision_transformer.__dict__["spatial_merge_unit"] = 2
        ret = vision_transformer.forward(x, grid_thw)
        assert ret.shape == (8, 256)
        # Each stub must have been fed the expected argument exactly.
        mocker_patch_embed.assert_called_with(x)
        mocker_rot_pos_emb.assert_called_with(grid_thw)
        mocker_get_window_index.assert_called_with(grid_thw)
        mocker_cal_cos_sin.assert_called_once()
        mocker_merger.assert_called_once()
class TestAscendQwen2_5_VLForConditionalGeneration_Without_Padding(PytestBase):
    """Constructor wiring and override-surface tests for the unpadded
    Ascend Qwen2.5-VL model."""

    def test_init_vl_for_conditional_generation(self, mocker: MockerFixture):
        """__init__ must delegate to the vLLM base with kwargs only and
        build the unpadded Ascend vision transformer exactly once."""
        vllm_config = mocker.MagicMock()
        vllm_config.vision_config = "vision_config"
        vllm_config.rms_norm_eps = 1e-5
        # Neutralize nn.Module attribute plumbing for the mocked construction.
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
        mocker_vl = mocker.patch(
            "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__",
            return_value=None,
        )
        mocker_vit = mocker.patch(
            "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionTransformer_Without_Padding.__init__",
            return_value=None,
        )

        vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration_Without_Padding(
            vllm_config=vllm_config)
        args, kwargs = mocker_vl.call_args
        assert not args
        assert kwargs == {"vllm_config": vllm_config, "prefix": ""}
        mocker_vit.assert_called_once()
        assert isinstance(
            vl_for_conditional_generation,
            AscendQwen2_5_VLForConditionalGeneration_Without_Padding,
        )

    def test_overridden_methods(self):
        """The Ascend subclass must override the multimodal input processors."""
        self.assert_method_overridden(
            AscendQwen2_5_VLForConditionalGeneration_Without_Padding,
            Qwen2_5_VLForConditionalGeneration,
            "_process_image_input",
        )

        self.assert_method_overridden(
            AscendQwen2_5_VLForConditionalGeneration_Without_Padding,
            Qwen2_5_VLForConditionalGeneration,
            "_process_video_input",
        )

    @staticmethod
    def assert_method_overridden(subclass, parent, method_name: str):
        """Assert that *subclass* defines its own *method_name*, distinct
        from the implementation on *parent*."""
        parent_func = parent.__dict__.get(method_name)
        child_func = subclass.__dict__.get(method_name)

        # Fixed assertion messages (were ungrammatical: "should defined",
        # "should override in").
        assert child_func is not None, (
            f"{subclass.__name__} should define {method_name}")
        assert child_func is not parent_func, (
            f"{subclass.__name__} should override {method_name}")
200
tests/ut/models/test_qwen2_vl.py
Normal file
200
tests/ut/models/test_qwen2_vl.py
Normal file
@@ -0,0 +1,200 @@
|
||||
import pytest
|
||||
import torch
|
||||
from pytest_mock import MockerFixture
|
||||
from vllm.model_executor.layers.activation import QuickGELU
|
||||
|
||||
from tests.ut.base import PytestBase
|
||||
from vllm_ascend.models.qwen2_vl import (AscendQwen2VisionAttention,
|
||||
AscendQwen2VisionBlock)
|
||||
|
||||
|
||||
class TestAscendQwen2VisionAttention(PytestBase):
    """Unit tests for the Ascend Qwen2-VL vision attention layer."""

    def init_attention(
        self,
        mocker,
        embed_dim=1000,
        num_heads=10,
        projection_size=100,
        quant_config=None,
        prefix="",
    ):
        """Build the attention layer with the vLLM parent __init__ mocked.

        Verifies the parent is called positionally, then injects
        num_attention_heads_per_partition (normally set by the mocked parent).
        """
        mocker_attn = mocker.patch(
            "vllm_ascend.models.qwen2_vl.Qwen2VisionAttention.__init__")

        attention = AscendQwen2VisionAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            projection_size=projection_size,
            quant_config=quant_config,
            prefix=prefix,
        )
        args, kwargs = mocker_attn.call_args
        # NOTE(review): asserts literal (None, "") rather than
        # (quant_config, prefix); holds for every current caller.
        assert args == (embed_dim, num_heads, projection_size, None, "")
        assert not kwargs
        attention.num_attention_heads_per_partition = num_heads
        return attention

    def test_attn_init_should_normal(self, mocker: MockerFixture):
        """Happy path: head dim = projection_size / num_heads = 10."""
        embed_dim = 1000
        num_heads = 10
        projection_size = 100
        quant_config = None
        prefix = ""
        vit = self.init_attention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            projection_size=projection_size,
            quant_config=quant_config,
            prefix=prefix,
            mocker=mocker,
        )
        assert vit.hidden_size_per_attention_head == 10

    def test_attn_init_should_raise_error(self, mocker: MockerFixture):
        """projection_size not divisible by num_heads must assert."""
        embed_dim = 1000
        num_heads = 7
        projection_size = 100
        quant_config = None
        prefix = ""
        with pytest.raises(AssertionError):
            # projection_size should divided by num heads
            self.init_attention(
                mocker=mocker,
                embed_dim=embed_dim,
                num_heads=num_heads,
                projection_size=projection_size,
                quant_config=quant_config,
                prefix=prefix,
            )

    def test_attn_forward(self, mocker: MockerFixture):
        """forward() with qkv/proj and the NPU kernels stubbed out.

        Checks the argument flow into each stub and that the output keeps
        the (seq, batch, hidden) shape (100, 3, 1280).
        """
        attention = self.init_attention(mocker=mocker)
        # Neutralize nn.Module attribute plumbing so __dict__ stubs resolve.
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
        x = torch.rand((100, 3, 10 * 3 * 128))  # s,b, head*3*head_dim
        cu_seqlens = torch.tensor([10, 50, 100])
        cos = torch.rand((1, 100, 1, 128))
        sin = torch.rand((1, 100, 1, 128))

        # Identity-ish stand-ins for the linear layers and NPU kernels.
        qkv = lambda x: (x, 0)  # noqa
        split_qkv = lambda x: [  #noqa
            torch.rand((100, 3, 10, 128)) for i in range(3)
        ]  # noqa
        npu_rotary_mul = lambda q, cos, sin: q  # noqa
        _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"]  # noqa
        proj = lambda x: (x, 0)  # noqa

        mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv)
        mocker_split_qkv = mocker.patch.object(
            attention,
            "split_qkv",
            side_effect=split_qkv,
        )
        mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul",
                                             side_effect=npu_rotary_mul)
        mocker_npu_flash_attention_unpad = mocker.patch(
            "torch_npu._npu_flash_attention_unpad",
            side_effect=_npu_flash_attention_unpad,
        )
        mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj)
        attention.__dict__["qkv"] = mocker_qkv
        attention.__dict__["split_qkv"] = mocker_split_qkv
        attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul
        attention.__dict__["_npu_flash_attention_unpad"] = (
            mocker_npu_flash_attention_unpad)
        attention.__dict__["proj"] = mocker_proj

        output = attention.forward(
            x=x,
            cu_seqlens=cu_seqlens,
            cos=cos,
            sin=sin,
        )
        qkv_args, qkv_kwargs = mocker_qkv.call_args
        assert qkv_args == (x, )
        assert not qkv_kwargs

        split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args
        assert split_qkv_args == (x, )
        assert not split_qkv_kwargs

        # q is transposed to (batch, seq, heads, head_dim) before rotary.
        npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args
        assert npu_rotary_mul_args[1:] == (cos, sin)
        assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128])
        assert not npu_rotary_mul_kwargs

        assert output.shape == torch.Size([100, 3, 1280])
class TestAscendQwen2VisionBlock(PytestBase):
    """Unit tests for the Ascend Qwen2-VL vision transformer block."""

    def init_vision_block(
        self,
        mocker,
        dim=100,
        num_heads=10,
        mlp_ratio=0.5,
    ):
        """Build the block with parent and attention __init__ mocked, and
        verify both delegation calls (parent positionally with QuickGELU,
        attention by keyword with prefix '.attn')."""
        mocker_vit = mocker.patch(
            "vllm.model_executor.models.qwen2_vl.Qwen2VisionBlock.__init__",
            return_value=None,
        )

        mocker_attn = mocker.patch(
            "vllm_ascend.models.qwen2_vl.AscendQwen2VisionAttention.__init__",
            return_value=None,
        )

        # Neutralize nn.Module attribute plumbing for the mocked construction.
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
        vision_block = AscendQwen2VisionBlock(
            dim=dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
        )
        args, kwargs = mocker_vit.call_args
        assert args == (dim, num_heads, mlp_ratio, QuickGELU, None, None, "")
        assert not kwargs

        args1, kwargs1 = mocker_attn.call_args
        assert not args1
        assert kwargs1 == {
            "embed_dim": dim,
            "num_heads": num_heads,
            "projection_size": dim,
            "quant_config": None,
            "prefix": ".attn",
        }
        return vision_block

    def test_init_vision_block_should_normal(
        self,
        mocker: MockerFixture,
    ):
        """Smoke test: construction succeeds and yields the right type."""
        vision_block = self.init_vision_block(mocker)
        assert isinstance(vision_block, AscendQwen2VisionBlock)

    def test_vision_block_forward(self, mocker: MockerFixture):
        """With attn and mlp both returning x, the two residual adds give
        x + x + x, so forward must return 3 * x."""
        x = torch.randint(1, 100, (100, 3, 1280))  # s,b,d
        cu_seqlens = torch.tensor([10, 50, 100])
        cos = torch.rand((1, 100, 1, 128))
        sin = torch.rand((1, 100, 1, 128))
        vision_block = self.init_vision_block(mocker)
        mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x)
        mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x)
        vision_block.__dict__["attn"] = mocker_attn
        vision_block.__dict__["mlp"] = mocker_mlp

        output = vision_block.forward(x.clone(), cu_seqlens, cos, sin)

        _, attn_kwargs = mocker_attn.call_args
        assert attn_kwargs == {
            "cu_seqlens": cu_seqlens,
            "cos": cos,
            "sin": sin,
        }

        assert torch.all(x * 3 == output)
98
tests/ut/models/test_qwen3_moe.py
Normal file
98
tests/ut/models/test_qwen3_moe.py
Normal file
@@ -0,0 +1,98 @@
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
import math
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM
|
||||
|
||||
from vllm_ascend.models.qwen3_moe import CustomQwen3MoeForCausalLM
|
||||
from vllm_ascend.torchair.models.qwen3_moe import CustomQwen3MoeAttention
|
||||
|
||||
|
||||
class TestCustomQwen3MoeForCausalLM:
    """Static contract tests for the Ascend Qwen3-MoE causal LM class."""

    def test_class_inheritance(self):
        """The custom class must remain a subclass of the vLLM original."""
        assert issubclass(CustomQwen3MoeForCausalLM, Qwen3MoeForCausalLM)

    @pytest.mark.parametrize("key, expected", [
        ("qkv_proj", ["q_proj", "k_proj", "v_proj"]),
        ("gate_up_proj", ["gate_proj", "up_proj"]),
        ("experts",
         ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]),
    ])
    def test_packed_modules_mapping(self, key, expected):
        """Each fused-weight key must map to its expected source modules."""
        assert CustomQwen3MoeForCausalLM.packed_modules_mapping[
            key] == expected

    def test_packed_modules_mapping_structure(self):
        """The mapping must contain exactly these three keys and no others."""
        expected_mapping = {
            "qkv_proj": ["q_proj", "k_proj", "v_proj"],
            "gate_up_proj": ["gate_proj", "up_proj"],
            "experts": [
                "experts.0.gate_proj", "experts.0.up_proj",
                "experts.0.down_proj"
            ]
        }
        assert CustomQwen3MoeForCausalLM.packed_modules_mapping == expected_mapping
class DummyRMSNorm:
    """Minimal RMSNorm stand-in: divides by sqrt(mean(x^2) + eps) over the
    last dimension. Used so attention tests need no vLLM layer objects."""

    def __init__(self, dim: int, eps: float = 1e-6):
        # `dim` is recorded for parity with the real layer's signature.
        self.dim = dim
        self.eps = eps

    def __call__(self, x):
        denom = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x / denom
class TestCustomQwen3MoeAttention(unittest.TestCase):
    """Tests for CustomQwen3MoeAttention.normalize_qkv using DummyRMSNorm."""

    def setUp(self):
        # Geometry: 2 query heads + 2 kv heads of head_dim=4 packed into one
        # qkv tensor of width q_size + 2 * kv_size = 24.
        self.batch = 2
        self.seq_len = 3
        self.q_size = 8
        self.kv_size = 8
        self.head_dim = 4
        self.rms_eps = 1e-6

        total_dim = self.q_size + 2 * self.kv_size

        # NOTE(review): self.qkv is not used by the visible test below —
        # presumably kept for other tests; confirm before removing.
        self.qkv = torch.arange(self.batch * self.seq_len * total_dim,
                                dtype=torch.float32).reshape(
                                    self.batch, self.seq_len, total_dim)

    def test_constant_input_normalization(self):
        """For an all-ones qkv, RMSNorm maps q/k entries to
        1/sqrt(1 + eps) while v passes through untouched."""
        ones_qkv = torch.ones((1, 1, self.q_size + 2 * self.kv_size),
                              dtype=torch.float32)

        q_norm = DummyRMSNorm(self.head_dim, self.rms_eps)
        k_norm = DummyRMSNorm(self.head_dim, self.rms_eps)
        q, k, v = CustomQwen3MoeAttention.normalize_qkv(
            ones_qkv, self.q_size, self.kv_size, self.head_dim, q_norm, k_norm)

        norm_val = 1.0 / math.sqrt(1.0 + self.rms_eps)

        expected_q = torch.full((1, 1, self.q_size), norm_val)
        expected_k = torch.full((1, 1, self.kv_size), norm_val)
        expected_v = torch.ones((1, 1, self.kv_size), dtype=torch.float32)

        self.assertTrue(torch.allclose(q, expected_q, atol=1e-6))
        self.assertTrue(torch.allclose(k, expected_k, atol=1e-6))
        self.assertTrue(torch.equal(v, expected_v))
Reference in New Issue
Block a user