v0.10.1rc1

2025-09-09 09:40:35 +08:00
parent d6f6ef41fe
commit 9149384e03
432 changed files with 84698 additions and 1 deletion

@@ -0,0 +1,195 @@
import pytest
import torch
from pytest_mock import MockerFixture
from transformers import PretrainedConfig
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from tests.ut.base import PytestBase
from vllm_ascend.models.deepseek_mtp import (
CustomDeepSeekMTP, CustomDeepSeekMultiTokenPredictor,
CustomDeepSeekMultiTokenPredictorLayer)
class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase):
@pytest.fixture
def setup_mtp_layer(self, mocker: MockerFixture):
config = PretrainedConfig(vocab_size=1000,
hidden_size=768,
rms_norm_eps=1e-5)
mocker.patch(
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
return_value=None)
mocker.patch("vllm.model_executor.layers.layernorm.RMSNorm.__init__",
return_value=None)
mocker.patch(
"vllm.model_executor.models.deepseek_mtp.SharedHead.__init__",
return_value=None)
mocker.patch(
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekShareHead.__init__",
return_value=None)
mocker_deepseek_v2_decode_layer = mocker.patch(
"vllm_ascend.models.deepseek_v2.CustomDeepseekV2DecoderLayer.__init__",
return_value=None)
mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None)
mocker.patch("vllm_ascend.utils.get_ascend_config",
return_value=mocker.Mock())
mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "", None)
mocker_deepseek_v2_decode_layer.assert_called_once()
return mtp_layer
def test_init(self, mocker: MockerFixture, setup_mtp_layer):
mtp_layer = setup_mtp_layer
assert isinstance(mtp_layer, CustomDeepSeekMultiTokenPredictorLayer)
def test_forward(self, mocker: MockerFixture, setup_mtp_layer):
mtp_layer = setup_mtp_layer
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
mocker.patch.object(mtp_layer,
'eh_proj',
return_value=torch.randn(2, 3, 768))
mocker.patch("torch.cat", return_value=torch.randn(2, 3, 768))
mtp_layer.mtp_block.return_value = (torch.randn(2, 3, 768),
torch.randn(2, 3, 768))
input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])
positions = torch.tensor([[0, 1, 2], [0, 1, 2]])
kv_cache = torch.randn(2, 3, 768)
previous_hidden_states = torch.randn(2, 3, 768)
inputs_embeds = torch.tensor([[1.0, 2.0, 3.0]])
output = mtp_layer(input_ids, positions, kv_cache, None,
previous_hidden_states, inputs_embeds, 0)
assert output.shape == (2, 3, 768)
class TestCustomDeepSeekMultiTokenPredictor(PytestBase):
@pytest.fixture
def setup_predictor(self, mocker: MockerFixture):
mock_vllm_config = mocker.MagicMock(spec=VllmConfig)
mock_model_config = mocker.MagicMock(spec=ModelConfig)
mock_hf_config = mocker.MagicMock()
mock_hf_config.num_hidden_layers = 12
mock_hf_config.num_nextn_predict_layers = 3
mock_hf_config.vocab_size = 30000
mock_model_config.hf_config = mock_hf_config
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = CacheConfig()
mock_vllm_config.quant_config = mocker.MagicMock()
mocker.patch(
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
return_value=None)
mocker.patch(
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__init__",
return_value=None)
mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None)
mocker.patch("vllm_ascend.utils.get_ascend_config",
return_value=mocker.Mock())
predictor = CustomDeepSeekMultiTokenPredictor(
vllm_config=mock_vllm_config)
return predictor
def test_init(self, mocker: MockerFixture, setup_predictor):
predictor = setup_predictor
assert predictor.num_mtp_layers == 3
assert isinstance(predictor, CustomDeepSeekMultiTokenPredictor)
@pytest.mark.parametrize(
'kv_caches, inputs_embeds',
[(torch.tensor([[[0.1, 0.2, 0.3]]]), torch.tensor([[0.1, 0.2, 0.3]]))])
def test_forward(self, mocker: MockerFixture, setup_predictor, kv_caches,
inputs_embeds):
predictor = setup_predictor
mock_layer = mocker.MagicMock()
mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0])
predictor.layers_list = [mock_layer]
# TODO: confirm whether num_mtp_layers needs to be overridden for this test
# predictor.num_mtp_layers = 1
input_ids = torch.tensor([[1, 2, 3]])
positions = torch.tensor([[0, 1, 2]])
mocker.patch(
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
return_value=torch.tensor([[1.0, 2.0, 3.0]]))
output = predictor.forward(input_ids, positions, kv_caches, None, None,
inputs_embeds, 0)
mock_layer.assert_called_once()
assert torch.allclose(output, torch.tensor([1.0, 2.0, 3.0]))
def test_compute_logits(self, mocker: MockerFixture, setup_predictor):
hidden_states = torch.tensor([[1, 2, 3], [4, 5, 6]])
predictor = setup_predictor
mock_layer = mocker.MagicMock()
mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0])
predictor.layers_list = [mock_layer]
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
mocker.patch(
"vllm.model_executor.layers.logits_processor.LogitsProcessor.__init__",
return_value=None)
predictor.logits_processor.return_value = torch.tensor([1.0, 2.0, 3.0])
result_logits = predictor.compute_logits(hidden_states=hidden_states,
sampling_metadata=None)
predictor.logits_processor.assert_called_once()
assert torch.allclose(result_logits, torch.tensor([1.0, 2.0, 3.0]))
class TestCustomDeepSeekMTP(PytestBase):
@pytest.fixture
def setup_mtp(self, mocker: MockerFixture):
vllm_config = mocker.MagicMock()
vllm_config.model_config.hf_config.num_hidden_layers = 12
vllm_config.model_config.hf_config.num_nextn_predict_layers = 3
vllm_config.cache_config = mocker.MagicMock()
vllm_config.quant_config = mocker.MagicMock()
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
mocker.patch(
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
return_value=None)
mocker.patch(
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
return_value=None)
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
return_value=None)
mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None)
mocker.patch("vllm_ascend.utils.get_ascend_config",
return_value=mocker.Mock())
mtp = CustomDeepSeekMTP(vllm_config=vllm_config)
return mtp
def test_init(self, mocker: MockerFixture, setup_mtp):
mtp = setup_mtp
assert isinstance(mtp, CustomDeepSeekMTP)
def test_forward(self, mocker: MockerFixture, setup_mtp):
input_ids = torch.tensor([[1, 2, 3]])
positions = torch.tensor([[0, 1, 2]])
kv_caches = [torch.tensor([[0.1, 0.2, 0.3]])]
previous_hidden_states = torch.tensor([[0.1, 0.2, 0.3]])
inputs_embeds = torch.tensor([[0.1, 0.2, 0.3]])
spec_step_idx = 0
setup_mtp.model.return_value = torch.tensor([[1.0, 2.0, 3.0]])
output = setup_mtp.forward(input_ids, positions, kv_caches, None,
previous_hidden_states, inputs_embeds,
spec_step_idx)
assert torch.allclose(output, torch.tensor([[1.0, 2.0, 3.0]]))
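A note on the pattern above: every fixture stubs the heavy constructors (decoder layers, embeddings, shared heads) to no-ops so the model wrappers can be instantiated without NPU hardware or distributed setup. A minimal self-contained sketch of that technique, with illustrative Heavy/Wrapper names that are not part of this commit:

from unittest.mock import patch

class Heavy:
    def __init__(self):
        # stands in for initialization that would require an NPU
        raise RuntimeError("needs NPU")

class Wrapper:
    def __init__(self):
        self.inner = Heavy()

with patch.object(Heavy, "__init__", return_value=None):
    wrapper = Wrapper()  # Heavy.__init__ is a no-op while patched
assert isinstance(wrapper.inner, Heavy)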

@@ -0,0 +1,295 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from types import SimpleNamespace
from unittest.mock import Mock, patch
import pytest
import torch
from transformers import PretrainedConfig
from vllm.config import CacheConfig
from vllm.distributed.parallel_state import GroupCoordinator
from vllm_ascend.models.deepseek_v2 import (
CustomDeepseekV2MergedReplicatedLinear, CustomDeepseekV2MLAAttention,
CustomDeepseekV2MLP, CustomDeepseekV2MoE,
CustomDeepseekV2RowParallelLinear,
CustomDeepseekV2RowParallelLinearReplaceAllreduce,
CustomDeepseekV2SiluAndMul, LogitsProcessor, ParallelLMHead)
@pytest.fixture
def base_config():
config = PretrainedConfig(
hidden_size=128,
num_attention_heads=8,
num_hidden_layers=2,
intermediate_size=256,
hidden_act="silu",
rms_norm_eps=1e-6,
rope_theta=10000.0,
max_position_embeddings=2048,
n_routed_experts=4,
n_shared_experts=1,
moe_intermediate_size=256,
num_experts_per_tok=2,
routed_scaling_factor=1.0,
first_k_dense_replace=0,
moe_layer_freq=1,
kv_lora_rank=16,
qk_nope_head_dim=16,
qk_rope_head_dim=16,
v_head_dim=32,
topk_method="noaux_tc",
scoring_func="softmax",
norm_topk_prob=True,
n_group=1,
topk_group=1,
vocab_size=10000,
)
return config
@pytest.fixture
def vllm_config(base_config):
model_config = SimpleNamespace(
hf_config=base_config,
tensor_parallel_size=1,
dtype=torch.float32,
use_mla=False,
quant_config=None,
max_model_len=2048,
)
cache_config = CacheConfig()
vllm_config = Mock()
vllm_config.model_config = model_config
vllm_config.cache_config = cache_config
vllm_config.quant_config = None
return vllm_config
@pytest.fixture
def mock_distributed():
tp_group = Mock(spec=GroupCoordinator)
tp_group.rank_in_group = 0
tp_group.world_size = 1
tp_group.device_group = Mock()
dp_group = Mock(spec=GroupCoordinator)
dp_group.rank_in_group = 0
dp_group.world_size = 1
ep_group = Mock(spec=GroupCoordinator)
ep_group.rank_in_group = 0
ep_group.world_size = 1
pp_group = Mock(spec=GroupCoordinator)
pp_group.rank_in_group = 0
pp_group.world_size = 1
mock_vllm_config = Mock()
mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)
with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
patch("vllm_ascend.models.deepseek_v2.get_ep_group", return_value=ep_group), \
patch("vllm_ascend.models.deepseek_v2.get_dp_group", return_value=dp_group), \
patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \
patch("vllm_ascend.models.deepseek_v2.get_pp_group",
return_value=Mock(is_first_rank=False, is_last_rank=False)), \
patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
_PP=pp_group), \
patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \
patch("torch.npu.current_device", return_value=0):
yield
@pytest.fixture
def mock_forward_context():
forward_context = Mock(in_profile_run=False, with_prefill=False)
with patch("vllm_ascend.models.deepseek_v2.get_forward_context",
return_value=forward_context):
yield
def test_custom_deepseek_v2_silu_and_mul():
torch.set_default_device("cpu")
silu = CustomDeepseekV2SiluAndMul()
assert silu.weight_scale is None
x = torch.randn(2, 4)
output = silu.forward_oot(x)
assert output.shape == (2, 2)
weight_scale = Mock(return_value=torch.tensor(0.1))
silu = CustomDeepseekV2SiluAndMul(weight_scale=weight_scale)
quant_x = torch.randint(-128, 127, (2, 4), dtype=torch.int32)
dynamic_scale = torch.randn(2, 1)
with patch("torch_npu.npu_dequant_swiglu_quant",
return_value=torch.randn(2, 4)):
output = silu.forward_oot((quant_x, dynamic_scale))
assert output.shape == (2, 4)
def test_custom_deepseek_v2_merged_replicated_linear(mock_distributed):
linear = CustomDeepseekV2MergedReplicatedLinear(input_size=128,
output_sizes=[64, 64],
bias=False,
quant_config=None)
assert linear.output_sizes == [64, 64]
param = Mock()
param.data = torch.zeros(128, 128)
param.output_dim = 1
param.is_gguf_weight = False
param.is_gguf_weight_type = False
loaded_weight = torch.randn(128, 64)
linear.weight_loader(param, loaded_weight, loaded_shard_id=0)
with pytest.raises(AssertionError):
linear.weight_loader(param, torch.randn(128, 32), loaded_shard_id=0)
@pytest.mark.parametrize("cls", [
CustomDeepseekV2RowParallelLinearReplaceAllreduce,
CustomDeepseekV2RowParallelLinear
])
def test_row_parallel_linear(cls, mock_distributed):
linear = cls(input_size=128, output_size=64, bias=False, quant_config=None)
linear.quant_method = Mock()
linear.quant_method.apply.return_value = torch.randn(2, 4, 64)
input_ = torch.randn(2, 4, 128)
with patch("vllm_ascend.models.deepseek_v2.split_tensor_along_last_dim",
return_value=[torch.randn(2, 4, 64)]):
linear.input_is_parallel = False
output = linear(input_, is_prefill=True)
assert output[0].shape == (2, 4, 64)
linear.input_is_parallel = True
output = linear(input_, is_prefill=False)
assert output[0].shape == (2, 4, 64)
def test_custom_deepseek_v2_mlp(mock_distributed, base_config):
mlp = CustomDeepseekV2MLP(hidden_size=128,
intermediate_size=256,
hidden_act="silu",
quant_config=None)
assert isinstance(mlp.act_fn, CustomDeepseekV2SiluAndMul)
x = torch.randn(2, 4, 128)
output = mlp(x)
assert output.shape == (2, 4, 128)
with patch("vllm_ascend.models.deepseek_v2.QuantizationConfig"
) as mock_quant_config:
mock_quant_config.name = "w8a8dynamic"
with pytest.raises(NotImplementedError):
CustomDeepseekV2MLP(hidden_size=128,
intermediate_size=256,
hidden_act="silu",
quant_config=mock_quant_config,
force_replicate=False)
with pytest.raises(ValueError):
CustomDeepseekV2MLP(hidden_size=128,
intermediate_size=256,
hidden_act="relu",
quant_config=None)
def test_custom_deepseek_v2_moe(mock_distributed, base_config,
mock_forward_context):
base_config.n_shared_experts = 1
moe = CustomDeepseekV2MoE(config=base_config,
quant_config=None,
prefix="mlp")
assert moe.top_k == 2
x = torch.randn(2, 4, 128)
attn_metadata = Mock(num_prefills=1)
with patch("vllm_ascend.ops.fused_moe.AscendFusedMoE.__call__",
return_value=(torch.randn(2, 4, 128), torch.randn(2, 4, 128))):
output = moe(x, attn_metadata)
assert output.shape == (2, 4, 128)
@patch("torch_npu.npu_rms_norm")
def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed,
base_config):
mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
attn = CustomDeepseekV2MLAAttention(config=base_config,
hidden_size=128,
num_heads=8,
qk_nope_head_dim=16,
qk_rope_head_dim=16,
v_head_dim=32,
q_lora_rank=16,
kv_lora_rank=16,
cache_config=CacheConfig(),
quant_config=None,
prefix="layers.0.self_attn")
assert attn.debug_layer_idx == 0
x = torch.randn(2, 4, 128)
positions = torch.arange(4).repeat(2, 1)
with patch.object(attn.mla_attn,
"__call__",
return_value=torch.randn(2, 4, 128)):
with pytest.raises(AssertionError):
attn(positions, x)
attn = CustomDeepseekV2MLAAttention(config=base_config,
hidden_size=128,
num_heads=8,
qk_nope_head_dim=16,
qk_rope_head_dim=16,
v_head_dim=32,
q_lora_rank=None,
kv_lora_rank=16,
prefix="layers.1.self_attn")
assert hasattr(attn, "q_proj")
def test_deepseek_v2_lmhead(mock_distributed, vllm_config):
# Create a simple config object
class SimpleConfig:
def __init__(self):
self.vocab_size = 10000
self.hidden_size = 128
config = SimpleConfig()
# Create the lmhead and logits_processor directly
lmhead = ParallelLMHead(config.vocab_size, config.hidden_size)
logits_processor = LogitsProcessor(config.vocab_size)
# Create mock outputs
mock_output = torch.randn(2, 4, config.hidden_size)
mock_logits = torch.randn(2, 4, config.vocab_size)
# Test the logits_processor directly
with patch.object(lmhead.quant_method, "apply", return_value=mock_logits):
with patch.object(logits_processor,
"_gather_logits",
return_value=mock_logits):
logits = logits_processor(lmhead, mock_output)
assert logits.shape == (2, 4, config.vocab_size)
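The mock_distributed fixture above relies on patch.dict to swap the module-level parallel-state globals (_TP, _EP, _DP, _PP, _MC2) for the duration of a test; the originals are restored when the with-block exits. A small sketch of that mechanism against a throwaway module rather than vllm's real parallel_state:

import types
from unittest.mock import patch

state = types.ModuleType("fake_parallel_state")
state._TP = None  # stands in for a real group-coordinator global

with patch.dict(state.__dict__, _TP="mocked group"):
    assert state._TP == "mocked group"
assert state._TP is None  # restored automatically on exit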

@@ -0,0 +1,424 @@
import pytest
import torch
import torch.nn.functional as F
from pytest_mock import MockerFixture
from tests.ut.base import PytestBase
from vllm_ascend.models.qwen2_5_vl import (
AscendQwen2_5_VisionAttention, AscendQwen2_5_VisionBlock,
AscendQwen2_5_VisionPatchEmbed, AscendQwen2_5_VisionRotaryEmbedding,
AscendQwen2_5_VisionTransformer, AscendQwen2_5_VLForConditionalGeneration)
class TestAscendQwen2_5_VisionAttention(PytestBase):
def init_attention(
self,
mocker,
embed_dim=1000,
num_heads=10,
projection_size=100,
quant_config=None,
prefix="",
):
mocker_attn = mocker.patch(
"vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionAttention.__init__")
attention = AscendQwen2_5_VisionAttention(
embed_dim=embed_dim,
num_heads=num_heads,
projection_size=projection_size,
quant_config=quant_config,
prefix=prefix,
)
args, kwargs = mocker_attn.call_args
assert args == (embed_dim, num_heads, projection_size, None, "")
assert not kwargs
attention.num_attention_heads_per_partition = num_heads
return attention
def test_attn_init_should_normal(self, mocker: MockerFixture):
embed_dim = 1000
num_heads = 10
projection_size = 100
quant_config = None
prefix = ""
vit = self.init_attention(
embed_dim=embed_dim,
num_heads=num_heads,
projection_size=projection_size,
quant_config=quant_config,
prefix=prefix,
mocker=mocker,
)
assert vit.embed_dim == 1000
assert vit.hidden_size_per_attention_head == 10
def test_attn_init_should_raise_error(self, mocker: MockerFixture):
embed_dim = 1000
num_heads = 7
projection_size = 100
quant_config = None
prefix = ""
with pytest.raises(AssertionError):
# projection_size must be divisible by num_heads
self.init_attention(
mocker=mocker,
embed_dim=embed_dim,
num_heads=num_heads,
projection_size=projection_size,
quant_config=quant_config,
prefix=prefix,
)
def test_split_qkv(self, mocker: MockerFixture):
attention = self.init_attention(mocker=mocker)
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
q, k, v = attention.split_qkv(torch.rand((100, 10, 300)))
assert q.shape == (100, 10, 10, 10)
assert k.shape == (100, 10, 10, 10)
assert v.shape == (100, 10, 10, 10)
def test_attn_forward(self, mocker: MockerFixture):
attention = self.init_attention(mocker=mocker)
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim
cu_seqlens = torch.tensor([10, 50, 100])
cos = torch.rand((1, 100, 1, 128))
sin = torch.rand((1, 100, 1, 128))
qkv = lambda x: (x, 0) # noqa
split_qkv = lambda x: [ #noqa
torch.rand((100, 3, 10, 128)) for i in range(3)
] # noqa
npu_rotary_mul = lambda q, cos, sin: q # noqa
_npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa
proj = lambda x: (x, 0) # noqa
mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv)
mocker_split_qkv = mocker.patch.object(
attention,
"split_qkv",
side_effect=split_qkv,
)
mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul",
side_effect=npu_rotary_mul)
mocker_npu_flash_attention_unpad = mocker.patch(
"torch_npu._npu_flash_attention_unpad",
side_effect=_npu_flash_attention_unpad,
)
mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj)
attention.__dict__["qkv"] = mocker_qkv
attention.__dict__["split_qkv"] = mocker_split_qkv
attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul
attention.__dict__["_npu_flash_attention_unpad"] = (
mocker_npu_flash_attention_unpad)
attention.__dict__["proj"] = mocker_proj
output = attention.forward(
x=x,
cu_seqlens=cu_seqlens,
cos=cos,
sin=sin,
)
qkv_args, qkv_kwargs = mocker_qkv.call_args
assert qkv_args == (x, )
assert not qkv_kwargs
split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args
assert split_qkv_args == (x, )
assert not split_qkv_kwargs
npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args
assert npu_rotary_mul_args[1:] == (cos, sin)
assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128])
assert not npu_rotary_mul_kwargs
assert output.shape == torch.Size([100, 3, 1280])
class TestAscendQwen2_5_VisionBlock(PytestBase):
def init_vision_block(
self,
mocker,
dim=100,
num_heads=10,
mlp_hidden_dim=100,
):
mocker_vit = mocker.patch(
"vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__",
return_value=None,
)
mocker_attn = mocker.patch(
"vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionAttention.__init__",
return_value=None,
)
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
vision_block = AscendQwen2_5_VisionBlock(
dim=dim,
num_heads=num_heads,
mlp_hidden_dim=mlp_hidden_dim,
)
args, kwargs = mocker_vit.call_args
assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "")
assert not kwargs
args1, kwargs1 = mocker_attn.call_args
assert not args1
assert kwargs1 == {
"embed_dim": dim,
"num_heads": num_heads,
"projection_size": dim,
"quant_config": None,
"prefix": ".attn",
}
return vision_block
def test_init_vision_block_should_normal(
self,
mocker: MockerFixture,
):
vision_block = self.init_vision_block(mocker)
assert isinstance(vision_block, AscendQwen2_5_VisionBlock)
def test_vision_block_forward(self, mocker: MockerFixture):
x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d
cu_seqlens = torch.tensor([10, 50, 100])
cos = torch.rand((1, 100, 1, 128))
sin = torch.rand((1, 100, 1, 128))
vision_block = self.init_vision_block(mocker)
mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x)
mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x)
vision_block.__dict__["attn"] = mocker_attn
vision_block.__dict__["mlp"] = mocker_mlp
output = vision_block.forward(x.clone(), cu_seqlens, cos, sin)
_, attn_kwargs = mocker_attn.call_args
assert attn_kwargs == {
"cu_seqlens": cu_seqlens,
"cos": cos,
"sin": sin,
}
assert torch.all(x * 3 == output)
class TestAscendQwen2_5_VisionPatchEmbed(PytestBase):
def test_forward(self):
patch_embed = AscendQwen2_5_VisionPatchEmbed()
ret = patch_embed(torch.rand((120, 1176)))
assert ret.shape == (120, 1152)
class TestAscendQwen2_5_VisionRotaryEmbedding(PytestBase):
def init_rotary_embedding(
self,
mocker,
dim=128,
):
mocker_embed = mocker.patch(
"vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionRotaryEmbedding.__init__",
return_value=None,
)
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
rotary_embedding = AscendQwen2_5_VisionRotaryEmbedding(dim=dim, )
args, kwargs = mocker_embed.call_args
assert args == (dim, 10000.0)
assert not kwargs
return rotary_embedding
def test_init_rotary_embedding_should_normal(self, mocker: MockerFixture):
rotary_embedding = self.init_rotary_embedding(mocker)
assert isinstance(rotary_embedding,
AscendQwen2_5_VisionRotaryEmbedding)
class TestAscendQwen2_5_VisionTransformer(PytestBase):
input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]])
def init_vision_transformer(
self,
mocker,
):
norm_eps = 1e-6
vision_config = mocker.MagicMock()
vision_config.patch_size = 16
vision_config.temporal_patch_size = 2
vision_config.in_channels = 3
vision_config.hidden_act = "gelu"
vision_config.depth = 0
vision_config.num_heads = 10
vision_config.hidden_size = 300
mocker.patch(
"vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_rank",
return_value=0,
)
mocker.patch("vllm.distributed.utils.divide", return_value=100)
mocker.patch(
"vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size",
return_value=2,
)
mocker.patch(
"vllm.model_executor.layers.linear.divide",
return_value=2,
)
mocker.patch(
"vllm.model_executor.layers.linear.get_tensor_model_parallel_rank",
return_value=0)
mocker.patch(
"vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_world_size",
return_value=2,
)
vision_transformer = AscendQwen2_5_VisionTransformer(
vision_config,
norm_eps,
)
assert not vision_transformer.interleaved
return vision_transformer
def test_init_vision_transformer(self, mocker: MockerFixture):
vision_transformer = self.init_vision_transformer(mocker)
assert isinstance(vision_transformer, AscendQwen2_5_VisionTransformer)
@pytest.mark.parametrize(
"interleaved, expected",
[
(
False,
torch.tensor([
input_data[0, 0].cos(),
input_data[0, 1].cos(),
input_data[0, 0].cos(),
input_data[0, 1].cos(),
input_data[1, 0].cos(),
input_data[1, 1].cos(),
input_data[1, 0].cos(),
input_data[1, 1].cos(),
]),
),
(
True,
torch.tensor([
input_data[0, 0].cos(),
input_data[0, 0].cos(),
input_data[0, 1].cos(),
input_data[0, 1].cos(),
input_data[1, 0].cos(),
input_data[1, 0].cos(),
input_data[1, 1].cos(),
input_data[1, 1].cos(),
]),
),
],
)
def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture):
vision_transformer = self.init_vision_transformer(mocker)
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
vision_transformer.__dict__["interleaved"] = interleaved
vision_transformer.__dict__["hidden_size_per_attention_head"] = 2
vision_transformer.hidden_size_per_attention_head = 4
cos_new, _ = vision_transformer.cal_cos_sin(self.input_data)
assert cos_new.shape == (1, 32, 1, 2)
def test_forward(self, mocker: MockerFixture):
vision_transformer = self.init_vision_transformer(mocker)
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
x = torch.randn(1, 3, 224, 224)
grid_thw = torch.tensor([[1, 4, 4]])
mocker_patch_embed = mocker.patch.object(
vision_transformer,
"patch_embed",
side_effect=lambda _: torch.randn(16, 512), # noqa
)
mocker_rot_pos_emb = mocker.patch.object(
vision_transformer,
"rot_pos_emb",
side_effect=lambda _: torch.randn(16, 64), # noqa
)
mocker_get_window_index = mocker.patch.object(
vision_transformer,
"get_window_index",
side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]), # noqa
)
mocker_cal_cos_sin = mocker.patch.object(
vision_transformer,
"cal_cos_sin",
side_effect=lambda _:
(torch.randn(16, 32), torch.randn(16, 32)), # noqa
)
mocker_merger = mocker.patch.object(
vision_transformer,
"merger",
side_effect=lambda _: torch.randn(16, 256), # noqa
)
vision_transformer.__dict__["vision_blocks"] = [
lambda *args, **kwargs: torch.randn(16, 1, 512) # noqa
]
vision_transformer.__dict__["patch_embed"] = mocker_patch_embed
vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb
vision_transformer.__dict__[
"get_window_index"] = mocker_get_window_index
vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin
vision_transformer.__dict__["merger"] = mocker_merger
vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2]
vision_transformer.__dict__["spatial_merge_unit"] = 2
ret = vision_transformer.forward(x, grid_thw)
assert ret.shape == (8, 256)
mocker_patch_embed.assert_called_with(x)
mocker_rot_pos_emb.assert_called_with(grid_thw)
mocker_get_window_index.assert_called_with(grid_thw)
mocker_cal_cos_sin.assert_called_once()
mocker_merger.assert_called_once()
class TestAscendQwen2_5_VLForConditionalGeneration(PytestBase):
def test_init_vl_for_conditional_generation(self, mocker: MockerFixture):
vllm_config = mocker.MagicMock()
vllm_config.vision_config = "vision_config"
vllm_config.rms_norm_eps = 1e-5
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
mocker_vl = mocker.patch(
"vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__",
return_value=None,
)
mocker_vit = mocker.patch(
"vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionTransformer.__init__",
return_value=None,
)
vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration(
vllm_config=vllm_config)
args, kwargs = mocker_vl.call_args
assert not args
assert kwargs == {"vllm_config": vllm_config, "prefix": ""}
mocker_vit.assert_called_once()
assert isinstance(
vl_for_conditional_generation,
AscendQwen2_5_VLForConditionalGeneration,
)
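The repeated attention.__dict__["qkv"] = mocker_qkv assignments in these tests exist because torch.nn.Module.__setattr__ and __getattr__ are patched out: mocks must be placed directly in the instance dict so ordinary attribute lookup finds them without going through the module machinery. A tiny sketch of why that works:

class Hooked:
    def __getattr__(self, name):
        # stands in for the patched-out nn.Module attribute hook
        raise AttributeError(name)

obj = Hooked()
obj.__dict__["proj"] = lambda x: x  # instance __dict__ wins over __getattr__
assert obj.proj(3) == 3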

@@ -0,0 +1,422 @@
import pytest
import torch
import torch.nn.functional as F
from pytest_mock import MockerFixture
from vllm.model_executor.models.qwen2_5_vl import \
Qwen2_5_VLForConditionalGeneration
from tests.ut.base import PytestBase
from vllm_ascend.models.qwen2_5_vl_without_padding import (
AscendQwen2_5_VisionAttention_Without_Padding,
AscendQwen2_5_VisionBlock_Without_Padding,
AscendQwen2_5_VisionPatchEmbed_Without_Padding,
AscendQwen2_5_VisionTransformer_Without_Padding,
AscendQwen2_5_VLForConditionalGeneration_Without_Padding)
class TestAscendQwen2_5_VisionAttention_Without_Padding(PytestBase):
def init_attention(
self,
mocker,
embed_dim=1000,
num_heads=10,
projection_size=100,
quant_config=None,
prefix="",
):
mocker_attn = mocker.patch(
"vllm_ascend.models.qwen2_5_vl_without_padding.Qwen2_5_VisionAttention.__init__"
)
attention = AscendQwen2_5_VisionAttention_Without_Padding(
embed_dim=embed_dim,
num_heads=num_heads,
projection_size=projection_size,
quant_config=quant_config,
prefix=prefix,
)
args, kwargs = mocker_attn.call_args
assert args == (embed_dim, num_heads, projection_size, None, "")
assert not kwargs
attention.num_attention_heads_per_partition = num_heads
return attention
def test_vit_init_should_normal(self, mocker: MockerFixture):
embed_dim = 1000
num_heads = 10
projection_size = 100
quant_config = None
prefix = ""
vit = self.init_attention(
embed_dim=embed_dim,
num_heads=num_heads,
projection_size=projection_size,
quant_config=quant_config,
prefix=prefix,
mocker=mocker,
)
assert vit.embed_dim == 1000
assert vit.hidden_size_per_attention_head == 10
def test_vit_init_should_raise_error(self, mocker: MockerFixture):
embed_dim = 1000
num_heads = 7
projection_size = 100
quant_config = None
prefix = ""
with pytest.raises(AssertionError):
# projection_size must be divisible by num_heads
self.init_attention(
mocker=mocker,
embed_dim=embed_dim,
num_heads=num_heads,
projection_size=projection_size,
quant_config=quant_config,
prefix=prefix,
)
def test_vit_forward(self, mocker: MockerFixture):
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
attention = self.init_attention(mocker=mocker)
x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim
cu_seqlens = torch.tensor([10, 50, 100])
cos = torch.rand((1, 100, 1, 128))
sin = torch.rand((1, 100, 1, 128))
qkv = lambda x: (x, 0) # noqa
split_qkv = lambda x: [ #noqa
torch.rand((100, 3, 10, 128)) for i in range(3)
] # noqa
npu_rotary_mul = lambda q, cos, sin: q # noqa
_npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa
proj = lambda x: (x, 0) # noqa
mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv)
mocker_split_qkv = mocker.patch.object(
attention,
"split_qkv",
side_effect=split_qkv,
)
mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul",
side_effect=npu_rotary_mul)
mocker_npu_flash_attention_unpad = mocker.patch(
"torch_npu._npu_flash_attention_unpad",
side_effect=_npu_flash_attention_unpad,
)
mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj)
attention.__dict__["qkv"] = mocker_qkv
attention.__dict__["split_qkv"] = mocker_split_qkv
attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul
attention.__dict__["_npu_flash_attention_unpad"] = (
mocker_npu_flash_attention_unpad)
attention.__dict__["proj"] = mocker_proj
output = attention.forward(
x=x,
cu_seqlens=cu_seqlens,
cos=cos,
sin=sin,
)
qkv_args, qkv_kwargs = mocker_qkv.call_args
assert qkv_args == (x, )
assert not qkv_kwargs
split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args
assert split_qkv_args == (x, )
assert not split_qkv_kwargs
npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args
assert npu_rotary_mul_args[1:] == (cos, sin)
assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128])
assert not npu_rotary_mul_kwargs
assert output.shape == torch.Size([100, 3, 1280])
class TestAscendQwen2_5_VisionBlock_Without_Padding(PytestBase):
def init_vision_block(
self,
mocker,
dim=100,
num_heads=10,
mlp_hidden_dim=100,
):
mocker_vit = mocker.patch(
"vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__",
return_value=None,
)
mocker_attn = mocker.patch(
"vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionAttention_Without_Padding.__init__",
return_value=None,
)
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
vision_block = AscendQwen2_5_VisionBlock_Without_Padding(
dim=dim,
num_heads=num_heads,
mlp_hidden_dim=mlp_hidden_dim,
)
args, kwargs = mocker_vit.call_args
assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "")
assert not kwargs
args1, kwargs1 = mocker_attn.call_args
assert not args1
assert kwargs1 == {
"embed_dim": dim,
"num_heads": num_heads,
"projection_size": dim,
"quant_config": None,
"prefix": ".attn",
}
return vision_block
def test_init_vision_block_should_normal(
self,
mocker: MockerFixture,
):
vision_block = self.init_vision_block(mocker)
assert isinstance(vision_block,
AscendQwen2_5_VisionBlock_Without_Padding)
def test_vision_block_forward(self, mocker: MockerFixture):
x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d
cu_seqlens = torch.tensor([10, 50, 100])
cos = torch.rand((1, 100, 1, 128))
sin = torch.rand((1, 100, 1, 128))
vision_block = self.init_vision_block(mocker)
mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x)
mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x)
vision_block.__dict__["attn"] = mocker_attn
vision_block.__dict__["mlp"] = mocker_mlp
output = vision_block.forward(x.clone(), cu_seqlens, cos, sin)
_, attn_kwargs = mocker_attn.call_args
assert attn_kwargs == {
"cu_seqlens": cu_seqlens,
"cos": cos,
"sin": sin,
}
assert torch.all(x * 3 == output)
class TestAscendQwen2_5_VisionPatchEmbed_Without_Padding(PytestBase):
def test_forward(self):
patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding()
ret = patch_embed(torch.rand((120, 1176)))
assert ret.shape == (120, 1152)
class TestAscendQwen2_5_VisionTransformer_Without_Padding(PytestBase):
input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]])
def init_vision_transformer(
self,
mocker,
):
norm_eps = 1e-6
vision_config = mocker.MagicMock()
vision_config.patch_size = 16
vision_config.temporal_patch_size = 2
vision_config.in_channels = 3
vision_config.hidden_act = "gelu"
vision_config.depth = 0
vision_config.hidden_size = 1280
vision_config.num_heads = 16
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
mocker_vit = mocker.patch(
"vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionTransformer.__init__",
return_value=None,
)
mocker_vision_rotary_embedding = mocker.patch(
"vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionRotaryEmbedding.__init__",
return_value=None,
)
mocker.patch(
"vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionBlock_Without_Padding.__init__",
return_value=None,
)
mocker.patch(
"vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionPatchEmbed_Without_Padding.__init__",
return_value=None,
)
mocker.patch(
"vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_world_size",
return_value=1,
)
mocker.patch(
"vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_rank",
return_value=0,
)
mocker.patch("vllm.distributed.utils.divide", return_value=100)
vision_transformer = AscendQwen2_5_VisionTransformer_Without_Padding(
vision_config,
norm_eps,
)
args, kwargs = mocker_vit.call_args
assert args == (vision_config, norm_eps, None, "")
assert not kwargs
mocker_vision_rotary_embedding.assert_called_once()
return vision_transformer
def test_init_vision_transformer(self, mocker: MockerFixture):
vision_transformer = self.init_vision_transformer(mocker)
assert isinstance(vision_transformer,
AscendQwen2_5_VisionTransformer_Without_Padding)
@pytest.mark.parametrize(
"interleaved, expected",
[
(
False,
torch.tensor([
input_data[0, 0].cos(),
input_data[0, 1].cos(),
input_data[0, 0].cos(),
input_data[0, 1].cos(),
input_data[1, 0].cos(),
input_data[1, 1].cos(),
input_data[1, 0].cos(),
input_data[1, 1].cos(),
]),
),
(
True,
torch.tensor([
input_data[0, 0].cos(),
input_data[0, 0].cos(),
input_data[0, 1].cos(),
input_data[0, 1].cos(),
input_data[1, 0].cos(),
input_data[1, 0].cos(),
input_data[1, 1].cos(),
input_data[1, 1].cos(),
]),
),
],
)
def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture):
vision_transformer = self.init_vision_transformer(mocker)
vision_transformer.__dict__["interleaved"] = interleaved
vision_transformer.__dict__["hidden_size_per_attention_head"] = 2
vision_transformer.hidden_size_per_attention_head = 4
cos_new, _ = vision_transformer.cal_cos_sin(self.input_data)
assert cos_new.shape == (1, 4, 1, 2)
assert torch.allclose(cos_new.view(-1), expected)
def test_forward(self, mocker: MockerFixture):
vision_transformer = self.init_vision_transformer(mocker)
x = torch.randn(1, 3, 224, 224)
grid_thw = torch.tensor([[1, 4, 4]])
mocker_patch_embed = mocker.patch.object(
vision_transformer,
"patch_embed",
side_effect=lambda _: torch.randn(16, 512), # noqa
)
mocker_rot_pos_emb = mocker.patch.object(
vision_transformer,
"rot_pos_emb",
side_effect=lambda _: torch.randn(16, 64), # noqa
)
mocker_get_window_index = mocker.patch.object(
vision_transformer,
"get_window_index",
side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]), # noqa
)
mocker_cal_cos_sin = mocker.patch.object(
vision_transformer,
"cal_cos_sin",
side_effect=lambda _:
(torch.randn(16, 32), torch.randn(16, 32)), # noqa
)
mocker_merger = mocker.patch.object(
vision_transformer,
"merger",
side_effect=lambda _: torch.randn(16, 256), # noqa
)
vision_transformer.__dict__["vision_blocks"] = [
lambda *args, **kwargs: torch.randn(16, 1, 512) # noqa
]
vision_transformer.__dict__["patch_embed"] = mocker_patch_embed
vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb
vision_transformer.__dict__[
"get_window_index"] = mocker_get_window_index
vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin
vision_transformer.__dict__["merger"] = mocker_merger
vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2]
vision_transformer.__dict__["spatial_merge_unit"] = 2
ret = vision_transformer.forward(x, grid_thw)
assert ret.shape == (8, 256)
mocker_patch_embed.assert_called_with(x)
mocker_rot_pos_emb.assert_called_with(grid_thw)
mocker_get_window_index.assert_called_with(grid_thw)
mocker_cal_cos_sin.assert_called_once()
mocker_merger.assert_called_once()
class TestAscendQwen2_5_VLForConditionalGeneration_Without_Padding(PytestBase):
def test_init_vl_for_conditional_generation(self, mocker: MockerFixture):
vllm_config = mocker.MagicMock()
vllm_config.vision_config = "vision_config"
vllm_config.rms_norm_eps = 1e-5
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
mocker_vl = mocker.patch(
"vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__",
return_value=None,
)
mocker_vit = mocker.patch(
"vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionTransformer_Without_Padding.__init__",
return_value=None,
)
vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration_Without_Padding(
vllm_config=vllm_config)
args, kwargs = mocker_vl.call_args
assert not args
assert kwargs == {"vllm_config": vllm_config, "prefix": ""}
mocker_vit.assert_called_once()
assert isinstance(
vl_for_conditional_generation,
AscendQwen2_5_VLForConditionalGeneration_Without_Padding,
)
def test_overridden_methods(self):
self.assert_method_overridden(
AscendQwen2_5_VLForConditionalGeneration_Without_Padding,
Qwen2_5_VLForConditionalGeneration,
"_process_image_input",
)
self.assert_method_overridden(
AscendQwen2_5_VLForConditionalGeneration_Without_Padding,
Qwen2_5_VLForConditionalGeneration,
"_process_video_input",
)
@staticmethod
def assert_method_overridden(subclass, parent, method_name: str):
"""assert subclass override parent method"""
parent_func = parent.__dict__.get(method_name)
child_func = subclass.__dict__.get(method_name)
assert child_func is not None, f"{subclass.__name__} should define {method_name}"
assert child_func is not parent_func, f"{method_name} should be overridden in {subclass.__name__}"
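assert_method_overridden checks the class __dict__ rather than using getattr, which is what lets it tell a method defined on the subclass apart from one merely inherited. A quick illustration of the distinction:

class Base:
    def f(self):
        return "base"

class Overriding(Base):
    def f(self):
        return "child"

class Inheriting(Base):
    pass

assert Overriding.__dict__.get("f") is not Base.__dict__.get("f")
assert Inheriting.__dict__.get("f") is None  # inherited only, not overridden
assert Inheriting().f() == "base"  # getattr would still resolve it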

@@ -0,0 +1,200 @@
import pytest
import torch
from pytest_mock import MockerFixture
from vllm.model_executor.layers.activation import QuickGELU
from tests.ut.base import PytestBase
from vllm_ascend.models.qwen2_vl import (AscendQwen2VisionAttention,
AscendQwen2VisionBlock)
class TestAscendQwen2VisionAttention(PytestBase):
def init_attention(
self,
mocker,
embed_dim=1000,
num_heads=10,
projection_size=100,
quant_config=None,
prefix="",
):
mocker_attn = mocker.patch(
"vllm_ascend.models.qwen2_vl.Qwen2VisionAttention.__init__")
attention = AscendQwen2VisionAttention(
embed_dim=embed_dim,
num_heads=num_heads,
projection_size=projection_size,
quant_config=quant_config,
prefix=prefix,
)
args, kwargs = mocker_attn.call_args
assert args == (embed_dim, num_heads, projection_size, None, "")
assert not kwargs
attention.num_attention_heads_per_partition = num_heads
return attention
def test_attn_init_should_normal(self, mocker: MockerFixture):
embed_dim = 1000
num_heads = 10
projection_size = 100
quant_config = None
prefix = ""
vit = self.init_attention(
embed_dim=embed_dim,
num_heads=num_heads,
projection_size=projection_size,
quant_config=quant_config,
prefix=prefix,
mocker=mocker,
)
assert vit.hidden_size_per_attention_head == 10
def test_attn_init_should_raise_error(self, mocker: MockerFixture):
embed_dim = 1000
num_heads = 7
projection_size = 100
quant_config = None
prefix = ""
with pytest.raises(AssertionError):
# projection_size must be divisible by num_heads
self.init_attention(
mocker=mocker,
embed_dim=embed_dim,
num_heads=num_heads,
projection_size=projection_size,
quant_config=quant_config,
prefix=prefix,
)
def test_attn_forward(self, mocker: MockerFixture):
attention = self.init_attention(mocker=mocker)
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim
cu_seqlens = torch.tensor([10, 50, 100])
cos = torch.rand((1, 100, 1, 128))
sin = torch.rand((1, 100, 1, 128))
qkv = lambda x: (x, 0) # noqa
split_qkv = lambda x: [ #noqa
torch.rand((100, 3, 10, 128)) for i in range(3)
] # noqa
npu_rotary_mul = lambda q, cos, sin: q # noqa
_npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa
proj = lambda x: (x, 0) # noqa
mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv)
mocker_split_qkv = mocker.patch.object(
attention,
"split_qkv",
side_effect=split_qkv,
)
mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul",
side_effect=npu_rotary_mul)
mocker_npu_flash_attention_unpad = mocker.patch(
"torch_npu._npu_flash_attention_unpad",
side_effect=_npu_flash_attention_unpad,
)
mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj)
attention.__dict__["qkv"] = mocker_qkv
attention.__dict__["split_qkv"] = mocker_split_qkv
attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul
attention.__dict__["_npu_flash_attention_unpad"] = (
mocker_npu_flash_attention_unpad)
attention.__dict__["proj"] = mocker_proj
output = attention.forward(
x=x,
cu_seqlens=cu_seqlens,
cos=cos,
sin=sin,
)
qkv_args, qkv_kwargs = mocker_qkv.call_args
assert qkv_args == (x, )
assert not qkv_kwargs
split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args
assert split_qkv_args == (x, )
assert not split_qkv_kwargs
npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args
assert npu_rotary_mul_args[1:] == (cos, sin)
assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128])
assert not npu_rotary_mul_kwargs
assert output.shape == torch.Size([100, 3, 1280])
class TestAscendQwen2VisionBlock(PytestBase):
def init_vision_block(
self,
mocker,
dim=100,
num_heads=10,
mlp_ratio=0.5,
):
mocker_vit = mocker.patch(
"vllm.model_executor.models.qwen2_vl.Qwen2VisionBlock.__init__",
return_value=None,
)
mocker_attn = mocker.patch(
"vllm_ascend.models.qwen2_vl.AscendQwen2VisionAttention.__init__",
return_value=None,
)
mocker.patch("torch.nn.Module.__setattr__")
mocker.patch("torch.nn.Module.__getattr__")
mocker.patch("torch.nn.Module.__delattr__")
vision_block = AscendQwen2VisionBlock(
dim=dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
)
args, kwargs = mocker_vit.call_args
assert args == (dim, num_heads, mlp_ratio, QuickGELU, None, None, "")
assert not kwargs
args1, kwargs1 = mocker_attn.call_args
assert not args1
assert kwargs1 == {
"embed_dim": dim,
"num_heads": num_heads,
"projection_size": dim,
"quant_config": None,
"prefix": ".attn",
}
return vision_block
def test_init_vision_block_should_normal(
self,
mocker: MockerFixture,
):
vision_block = self.init_vision_block(mocker)
assert isinstance(vision_block, AscendQwen2VisionBlock)
def test_vision_block_forward(self, mocker: MockerFixture):
x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d
cu_seqlens = torch.tensor([10, 50, 100])
cos = torch.rand((1, 100, 1, 128))
sin = torch.rand((1, 100, 1, 128))
vision_block = self.init_vision_block(mocker)
mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x)
mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x)
vision_block.__dict__["attn"] = mocker_attn
vision_block.__dict__["mlp"] = mocker_mlp
output = vision_block.forward(x.clone(), cu_seqlens, cos, sin)
_, attn_kwargs = mocker_attn.call_args
assert attn_kwargs == {
"cu_seqlens": cu_seqlens,
"cos": cos,
"sin": sin,
}
assert torch.all(x * 3 == output)
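The torch.all(x * 3 == output) assertion follows from the block's two residual connections: with attn and mlp both mocked to return the original x, the forward pass computes x + x + x. A short worked check of that arithmetic:

import torch

x = torch.ones(2, 2)
attn = lambda _: x  # mocked: always returns the original input
mlp = lambda _: x
out = x + attn(x)     # first residual: x + x
out = out + mlp(out)  # second residual adds x again, giving 3 * x
assert torch.all(out == 3 * x)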

@@ -0,0 +1,98 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import math
import unittest
import pytest
import torch
from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM
from vllm_ascend.models.qwen3_moe import CustomQwen3MoeForCausalLM
from vllm_ascend.torchair.models.qwen3_moe import CustomQwen3MoeAttention
class TestCustomQwen3MoeForCausalLM:
def test_class_inheritance(self):
assert issubclass(CustomQwen3MoeForCausalLM, Qwen3MoeForCausalLM)
@pytest.mark.parametrize("key, expected", [
("qkv_proj", ["q_proj", "k_proj", "v_proj"]),
("gate_up_proj", ["gate_proj", "up_proj"]),
("experts",
["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]),
])
def test_packed_modules_mapping(self, key, expected):
assert CustomQwen3MoeForCausalLM.packed_modules_mapping[
key] == expected
def test_packed_modules_mapping_structure(self):
expected_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"],
"experts": [
"experts.0.gate_proj", "experts.0.up_proj",
"experts.0.down_proj"
]
}
assert CustomQwen3MoeForCausalLM.packed_modules_mapping == expected_mapping
class DummyRMSNorm:
def __init__(self, dim: int, eps: float = 1e-6):
self.dim = dim
self.eps = eps
def __call__(self, x):
mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
denom = (mean_sq + self.eps).sqrt()
return x / denom
class TestCustomQwen3MoeAttention(unittest.TestCase):
def setUp(self):
self.batch = 2
self.seq_len = 3
self.q_size = 8
self.kv_size = 8
self.head_dim = 4
self.rms_eps = 1e-6
total_dim = self.q_size + 2 * self.kv_size
self.qkv = torch.arange(self.batch * self.seq_len * total_dim,
dtype=torch.float32).reshape(
self.batch, self.seq_len, total_dim)
def test_constant_input_normalization(self):
ones_qkv = torch.ones((1, 1, self.q_size + 2 * self.kv_size),
dtype=torch.float32)
q_norm = DummyRMSNorm(self.head_dim, self.rms_eps)
k_norm = DummyRMSNorm(self.head_dim, self.rms_eps)
q, k, v = CustomQwen3MoeAttention.normalize_qkv(
ones_qkv, self.q_size, self.kv_size, self.head_dim, q_norm, k_norm)
norm_val = 1.0 / math.sqrt(1.0 + self.rms_eps)
expected_q = torch.full((1, 1, self.q_size), norm_val)
expected_k = torch.full((1, 1, self.kv_size), norm_val)
expected_v = torch.ones((1, 1, self.kv_size), dtype=torch.float32)
self.assertTrue(torch.allclose(q, expected_q, atol=1e-6))
self.assertTrue(torch.allclose(k, expected_k, atol=1e-6))
self.assertTrue(torch.equal(v, expected_v))
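The norm_val = 1.0 / math.sqrt(1.0 + self.rms_eps) expectation comes straight from the DummyRMSNorm definition: for an all-ones vector the mean square is 1, so every element is divided by sqrt(1 + eps). A standalone check of that arithmetic:

import math

import torch

eps = 1e-6
x = torch.ones(4)
rms = (x.pow(2).mean(dim=-1, keepdim=True) + eps).sqrt()  # sqrt(1 + eps)
assert torch.allclose(x / rms, torch.full((4,), 1.0 / math.sqrt(1.0 + eps)))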