diff --git a/tests/ut/models/test_qwen2_5_vl.py b/tests/ut/models/test_qwen2_5_vl.py deleted file mode 100644 index 7111aaed..00000000 --- a/tests/ut/models/test_qwen2_5_vl.py +++ /dev/null @@ -1,488 +0,0 @@ -import pytest -import torch -import torch.nn.functional as F -from pytest_mock import MockerFixture - -from tests.ut.base import PytestBase -from vllm_ascend.models.qwen2_5_vl import ( - AscendQwen2_5_VisionAttention, AscendQwen2_5_VisionBlock, - AscendQwen2_5_VisionPatchEmbed, AscendQwen2_5_VisionRotaryEmbedding, - AscendQwen2_5_VisionTransformer, AscendQwen2_5_VLForConditionalGeneration) - - -class TestAscendQwen2_5_VisionAttention(PytestBase): - - def init_attention( - self, - mocker, - embed_dim=1000, - num_heads=10, - projection_size=100, - quant_config=None, - prefix="", - ): - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionAttention.__init__") - - attention = AscendQwen2_5_VisionAttention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - args, kwargs = mocker_attn.call_args - assert args == (embed_dim, num_heads, projection_size, None, "") - assert not kwargs - attention.num_attention_heads_per_partition = num_heads - return attention - - def test_attn_init_should_normal(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 10 - projection_size = 100 - quant_config = None - prefix = "" - vit = self.init_attention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - mocker=mocker, - ) - assert vit.embed_dim == 1000 - assert vit.hidden_size_per_attention_head == 10 - - def test_attn_init_should_raise_error(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 7 - projection_size = 100 - quant_config = None - prefix = "" - with pytest.raises(AssertionError): - # projection_size should divided by num heads - self.init_attention( - mocker=mocker, - 
embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - - def test_split_qkv(self, mocker: MockerFixture): - attention = self.init_attention(mocker=mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - q, k, v = attention.split_qkv(torch.rand((100, 10, 300))) - assert q.shape == (100, 10, 10, 10) - assert k.shape == (100, 10, 10, 10) - assert v.shape == (100, 10, 10, 10) - - def test_attn_forward(self, mocker: MockerFixture): - attention = self.init_attention(mocker=mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - - qkv = lambda x: (x, 0) # noqa - split_qkv = lambda x: [ #noqa - torch.rand((100, 3, 10, 128)) for i in range(3) - ] # noqa - npu_rotary_mul = lambda q, cos, sin: q # noqa - _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa - proj = lambda x: (x, 0) # noqa - - mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv) - mocker_split_qkv = mocker.patch.object( - attention, - "split_qkv", - side_effect=split_qkv, - ) - mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul", - side_effect=npu_rotary_mul) - mocker_npu_flash_attention_unpad = mocker.patch( - "torch_npu._npu_flash_attention_unpad", - side_effect=_npu_flash_attention_unpad, - ) - mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj) - attention.__dict__["qkv"] = mocker_qkv - attention.__dict__["split_qkv"] = mocker_split_qkv - attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul - attention.__dict__["_npu_flash_attention_unpad"] = ( - mocker_npu_flash_attention_unpad) - 
attention.__dict__["proj"] = mocker_proj - - output = attention.forward( - x=x, - cu_seqlens=cu_seqlens, - cos=cos, - sin=sin, - ) - qkv_args, qkv_kwargs = mocker_qkv.call_args - assert qkv_args == (x, ) - assert not qkv_kwargs - - split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args - assert split_qkv_args == (x, ) - assert not split_qkv_kwargs - - npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args - assert npu_rotary_mul_args[1:] == (cos, sin) - assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128]) - assert not npu_rotary_mul_kwargs - - assert output.shape == torch.Size([100, 3, 1280]) - - -class TestAscendQwen2_5_VisionBlock(PytestBase): - - def init_vision_block( - self, - mocker, - dim=100, - num_heads=10, - mlp_hidden_dim=100, - ): - mocker_vit = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__", - return_value=None, - ) - - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionAttention.__init__", - return_value=None, - ) - - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - vision_block = AscendQwen2_5_VisionBlock( - dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - ) - args, kwargs = mocker_vit.call_args - assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "") - assert not kwargs - - args1, kwargs1 = mocker_attn.call_args - assert not args1 - assert kwargs1 == { - "embed_dim": dim, - "num_heads": num_heads, - "projection_size": dim, - "quant_config": None, - "prefix": ".attn", - } - return vision_block - - def test_init_vision_block_should_normal( - self, - mocker: MockerFixture, - ): - vision_block = self.init_vision_block(mocker) - assert isinstance(vision_block, AscendQwen2_5_VisionBlock) - - def test_vision_block_forward(self, mocker: MockerFixture): - x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d - cu_seqlens = 
torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - vision_block = self.init_vision_block(mocker) - mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x) - mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x) - vision_block.__dict__["attn"] = mocker_attn - vision_block.__dict__["mlp"] = mocker_mlp - - output = vision_block.forward(x.clone(), cu_seqlens, cos, sin) - - _, attn_kwargs = mocker_attn.call_args - assert attn_kwargs == { - "cu_seqlens": cu_seqlens, - "cos": cos, - "sin": sin, - } - - assert torch.all(x * 3 == output) - - -class TestAscendQwen2_5_VisionPatchEmbed(PytestBase): - - def test_forward(self): - patch_embed = AscendQwen2_5_VisionPatchEmbed() - - ret = patch_embed(torch.rand((120, 1176))) - assert ret.shape == (120, 1152) - - -class TestAscendQwen2_5_VisionRotaryEmbedding(PytestBase): - - def init_rotary_embedding( - self, - mocker, - dim=128, - ): - mocker_ebed = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionRotaryEmbedding.__init__", - return_value=None, - ) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - rotary_embedding = AscendQwen2_5_VisionRotaryEmbedding(dim=dim, ) - args, kwargs = mocker_ebed.call_args - assert args == (dim, 10000.0) - assert not kwargs - return rotary_embedding - - def test_init_rotary_embedding_should_normal(self, mocker: MockerFixture): - rotary_embedding = self.init_rotary_embedding(mocker) - assert isinstance(rotary_embedding, - AscendQwen2_5_VisionRotaryEmbedding) - - -class TestAscendQwen2_5_VisionTransformer(PytestBase): - - input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]]) - - def init_vision_transformer( - self, - mocker, - ): - norm_eps = 1e-6 - vision_config = mocker.MagicMock() - vision_config.patch_size = 16 - vision_config.temporal_patch_size = 2 - vision_config.in_channels = 3 - vision_config.hidden_act = "gelu" - 
vision_config.depth = 0 - vision_config.num_heads = 10 - vision_config.hidden_size = 300 - - mocker.patch( - "vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_rank", - return_value=0, - ) - mocker.patch("vllm.distributed.utils.divide", return_value=100) - mocker.patch( - "vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", - return_value=2, - ) - mocker.patch( - "vllm.model_executor.layers.linear.divide", - return_value=2, - ) - mocker.patch( - "vllm.model_executor.layers.linear.get_tensor_model_parallel_rank", - return_value=0) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_world_size", - return_value=2, - ) - mocker.patch( - "vllm_ascend.ops.linear.divide", - return_value=2, - ) - - mock_group = mocker.MagicMock() - mock_group.rank_in_group = 0 - mock_group.world_size = 2 - mocker.patch( - "vllm_ascend.ops.linear_op.get_tp_group", - return_value=mock_group, - ) - mocker.patch( - "vllm.distributed.parallel_state.get_tp_group", - return_value=mock_group, - ) - - vision_transformer = AscendQwen2_5_VisionTransformer( - vision_config, - norm_eps, - ) - - assert not vision_transformer.interleaved - return vision_transformer - - def test_init_vision_transformer(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - assert isinstance(vision_transformer, AscendQwen2_5_VisionTransformer) - - @pytest.mark.parametrize( - "interleaved, expected", - [ - ( - False, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - ]), - ), - ( - True, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 1].cos(), - ]), - ), - ], - ) - def 
test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - vision_transformer.__dict__["interleaved"] = interleaved - vision_transformer.__dict__["hidden_size_per_attention_head"] = 2 - vision_transformer.hidden_size_per_attention_head = 4 - cos_new, _ = vision_transformer.cal_cos_sin(self.input_data) - assert cos_new.shape == (1, 32, 1, 2) - - def test_pad_qkv_bias(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - res = attention.pad_qkv_bias(torch.rand((300))) - assert res.shape[0] == 384 - - def test_pad_qkv_weight(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker.patch( - "torch_npu.npu_format_cast", - return_value=torch.rand((384, 300)), - ) - res = attention.pad_qkv_weight(torch.rand((300, 300))) - assert res.shape == (384, 300) - - def test_pad_proj_weight(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker.patch( - "torch_npu.npu_format_cast", - return_value=torch.rand((300, 384)), - ) - res = attention.pad_proj_weight(torch.rand((300, 300))) - assert res.shape == (300, 384) - - def test_pad_qkv_weight_scale_offset(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - res 
= attention.pad_qkv_weight_scale_offset(torch.rand((300, 1))) - assert res.shape == (384, 1) - - def test_pad_qkv_deq_scale_quant_bias(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - res = attention.pad_qkv_deq_scale_quant_bias(torch.rand((300))) - assert res.shape[0] == 384 - - def test_forward(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - x = torch.randn(1, 3, 224, 224) - grid_thw = torch.tensor([[1, 4, 4]]) - mocker_patch_embed = mocker.patch.object( - vision_transformer, - "patch_embed", - side_effect=lambda _: torch.randn(16, 512), # noqa - ) - mocker_rot_pos_emb = mocker.patch.object( - vision_transformer, - "rot_pos_emb", - side_effect=lambda _: torch.randn(16, 64), # noqa - ) - mocker_get_window_index = mocker.patch.object( - vision_transformer, - "get_window_index", - side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]), # noqa - ) - mocker_cal_cos_sin = mocker.patch.object( - vision_transformer, - "cal_cos_sin", - side_effect=lambda _: - (torch.randn(16, 32), torch.randn(16, 32)), # noqa - ) - mocker_merger = mocker.patch.object( - vision_transformer, - "merger", - side_effect=lambda _: torch.randn(16, 256), # noqa - ) - vision_transformer.__dict__["vision_blocks"] = [ - lambda *args, **kwargs: torch.randn(16, 1, 512) # noqa - ] - vision_transformer.__dict__["patch_embed"] = mocker_patch_embed - vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb - vision_transformer.__dict__[ - "get_window_index"] = mocker_get_window_index - vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin - vision_transformer.__dict__["merger"] = mocker_merger - 
vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2] - vision_transformer.__dict__["spatial_merge_unit"] = 2 - ret = vision_transformer.forward(x, grid_thw) - assert ret.shape == (8, 256) - mocker_patch_embed.assert_called_with(x) - mocker_rot_pos_emb.assert_called_with(grid_thw) - mocker_get_window_index.assert_called_with(grid_thw) - mocker_cal_cos_sin.assert_called_once() - mocker_merger.assert_called_once() - - -class TestAscendQwen2_5_VLForConditionalGeneration(PytestBase): - - def test_init_vl_for_conditional_generation(self, mocker: MockerFixture): - vllm_config = mocker.MagicMock() - vllm_config.vision_config = "vision_config" - vllm_config.rms_norm_eps = 1e-5 - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker_vl = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__", - return_value=None, - ) - mocker_vit = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionTransformer.__init__", - return_value=None, - ) - - vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration( - vllm_config=vllm_config) - args, kwargs = mocker_vl.call_args - assert not args - assert kwargs == {"vllm_config": vllm_config, "prefix": ""} - mocker_vit.assert_called_once() - assert isinstance( - vl_for_conditional_generation, - AscendQwen2_5_VLForConditionalGeneration, - ) diff --git a/tests/ut/models/test_qwen2_5_vl_without_padding.py b/tests/ut/models/test_qwen2_5_vl_without_padding.py deleted file mode 100644 index 00caf810..00000000 --- a/tests/ut/models/test_qwen2_5_vl_without_padding.py +++ /dev/null @@ -1,422 +0,0 @@ -import pytest -import torch -import torch.nn.functional as F -from pytest_mock import MockerFixture -from vllm.model_executor.models.qwen2_5_vl import \ - Qwen2_5_VLForConditionalGeneration - -from tests.ut.base import PytestBase -from vllm_ascend.models.qwen2_5_vl_without_padding import ( 
- AscendQwen2_5_VisionAttention_Without_Padding, - AscendQwen2_5_VisionBlock_Without_Padding, - AscendQwen2_5_VisionPatchEmbed_Without_Padding, - AscendQwen2_5_VisionTransformer_Without_Padding, - AscendQwen2_5_VLForConditionalGeneration_Without_Padding) - - -class TestAscendQwen2_5_VisionAttention_Without_Padding(PytestBase): - - def init_attention( - self, - mocker, - embed_dim=1000, - num_heads=10, - projection_size=100, - quant_config=None, - prefix="", - ): - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.Qwen2_5_VisionAttention.__init__" - ) - - attention = AscendQwen2_5_VisionAttention_Without_Padding( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - args, kwargs = mocker_attn.call_args - assert args == (embed_dim, num_heads, projection_size, None, "") - assert not kwargs - attention.num_attention_heads_per_partition = num_heads - return attention - - def test_vit_init_should_normal(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 10 - projection_size = 100 - quant_config = None - prefix = "" - vit = self.init_attention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - mocker=mocker, - ) - assert vit.embed_dim == 1000 - assert vit.hidden_size_per_attention_head == 10 - - def test_vit_init_should_raise_error(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 7 - projection_size = 100 - quant_config = None - prefix = "" - with pytest.raises(AssertionError): - # projection_size should divided by num heads - self.init_attention( - mocker=mocker, - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - - def test_vit_forward(self, mocker: MockerFixture): - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - 
mocker.patch("torch.nn.Module.__delattr__") - attention = self.init_attention(mocker=mocker) - x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - - qkv = lambda x: (x, 0) # noqa - split_qkv = lambda x: [ #noqa - torch.rand((100, 3, 10, 128)) for i in range(3) - ] # noqa - npu_rotary_mul = lambda q, cos, sin: q # noqa - _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa - proj = lambda x: (x, 0) # noqa - - mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv) - mocker_split_qkv = mocker.patch.object( - attention, - "split_qkv", - side_effect=split_qkv, - ) - mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul", - side_effect=npu_rotary_mul) - mocker_npu_flash_attention_unpad = mocker.patch( - "torch_npu._npu_flash_attention_unpad", - side_effect=_npu_flash_attention_unpad, - ) - mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj) - attention.__dict__["qkv"] = mocker_qkv - attention.__dict__["split_qkv"] = mocker_split_qkv - attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul - attention.__dict__["_npu_flash_attention_unpad"] = ( - mocker_npu_flash_attention_unpad) - attention.__dict__["proj"] = mocker_proj - - output = attention.forward( - x=x, - cu_seqlens=cu_seqlens, - cos=cos, - sin=sin, - ) - qkv_args, qkv_kwargs = mocker_qkv.call_args - assert qkv_args == (x, ) - assert not qkv_kwargs - - split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args - assert split_qkv_args == (x, ) - assert not split_qkv_kwargs - - npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args - assert npu_rotary_mul_args[1:] == (cos, sin) - assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128]) - assert not npu_rotary_mul_kwargs - - assert output.shape == torch.Size([100, 3, 1280]) - - -class TestAscendQwen2_5_VisionBlock_Without_Padding(PytestBase): - - 
def init_vision_block( - self, - mocker, - dim=100, - num_heads=10, - mlp_hidden_dim=100, - ): - mocker_vit = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__", - return_value=None, - ) - - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionAttention_Without_Padding.__init__", - return_value=None, - ) - - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - vision_block = AscendQwen2_5_VisionBlock_Without_Padding( - dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - ) - args, kwargs = mocker_vit.call_args - assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "") - assert not kwargs - - args1, kwargs1 = mocker_attn.call_args - assert not args1 - assert kwargs1 == { - "embed_dim": dim, - "num_heads": num_heads, - "projection_size": dim, - "quant_config": None, - "prefix": ".attn", - } - return vision_block - - def test_init_vision_block_should_normal( - self, - mocker: MockerFixture, - ): - vision_block = self.init_vision_block(mocker) - assert isinstance(vision_block, - AscendQwen2_5_VisionBlock_Without_Padding) - - def test_vision_block_forward(self, mocker: MockerFixture): - x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - vision_block = self.init_vision_block(mocker) - mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x) - mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x) - vision_block.__dict__["attn"] = mocker_attn - vision_block.__dict__["mlp"] = mocker_mlp - - output = vision_block.forward(x.clone(), cu_seqlens, cos, sin) - - _, attn_kwargs = mocker_attn.call_args - assert attn_kwargs == { - "cu_seqlens": cu_seqlens, - "cos": cos, - "sin": sin, - } - - assert torch.all(x * 3 == output) - - -class 
TestAscendQwen2_5_VisionPatchEmbed_Without_Padding(PytestBase): - - def test_forward(self): - patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding() - - ret = patch_embed(torch.rand((120, 1176))) - assert ret.shape == (120, 1152) - - -class TestAscendQwen2_5_VisionTransformer_Without_Padding(PytestBase): - - input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]]) - - def init_vision_transformer( - self, - mocker, - ): - norm_eps = 1e-6 - vision_config = mocker.MagicMock() - vision_config.patch_size = 16 - vision_config.temporal_patch_size = 2 - vision_config.in_channels = 3 - vision_config.hidden_act = "gelu" - vision_config.depth = 0 - vision_config.hidden_size = 1280 - vision_config.num_heads = 16 - - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker_vit = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionTransformer.__init__", - return_value=None, - ) - mocker_vision_rotary_embedding = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionRotaryEmbedding.__init__", - return_value=None, - ) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionBlock_Without_Padding.__init__", - return_value=None, - ) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionPatchEmbed_Without_Padding.__init__", - return_value=None, - ) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_world_size", - return_value=1, - ) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_rank", - return_value=0, - ) - mocker.patch("vllm.distributed.utils.divide", return_value=100) - - vision_transformer = AscendQwen2_5_VisionTransformer_Without_Padding( - vision_config, - norm_eps, - ) - args, kwargs = mocker_vit.call_args - assert args == (vision_config, norm_eps, None, "") - assert not kwargs - 
mocker_vision_rotary_embedding.assert_called_once() - return vision_transformer - - def test_init_vision_transformer(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - assert isinstance(vision_transformer, - AscendQwen2_5_VisionTransformer_Without_Padding) - - @pytest.mark.parametrize( - "interleaved, expected", - [ - ( - False, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - ]), - ), - ( - True, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 1].cos(), - ]), - ), - ], - ) - def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - vision_transformer.__dict__["interleaved"] = interleaved - vision_transformer.__dict__["hidden_size_per_attention_head"] = 2 - vision_transformer.hidden_size_per_attention_head = 4 - cos_new, _ = vision_transformer.cal_cos_sin(self.input_data) - assert cos_new.shape == (1, 4, 1, 2) - assert torch.allclose(cos_new.view(-1), expected) - - def test_forward(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - x = torch.randn(1, 3, 224, 224) - grid_thw = torch.tensor([[1, 4, 4]]) - mocker_patch_embed = mocker.patch.object( - vision_transformer, - "patch_embed", - side_effect=lambda _: torch.randn(16, 512), # noqa - ) - mocker_rot_pos_emb = mocker.patch.object( - vision_transformer, - "rot_pos_emb", - side_effect=lambda _: torch.randn(16, 64), # noqa - ) - mocker_get_window_index = mocker.patch.object( - vision_transformer, - "get_window_index", - side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]), # noqa - ) - mocker_cal_cos_sin = 
mocker.patch.object( - vision_transformer, - "cal_cos_sin", - side_effect=lambda _: - (torch.randn(16, 32), torch.randn(16, 32)), # noqa - ) - mocker_merger = mocker.patch.object( - vision_transformer, - "merger", - side_effect=lambda _: torch.randn(16, 256), # noqa - ) - vision_transformer.__dict__["vision_blocks"] = [ - lambda *args, **kwargs: torch.randn(16, 1, 512) # noqa - ] - vision_transformer.__dict__["patch_embed"] = mocker_patch_embed - vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb - vision_transformer.__dict__[ - "get_window_index"] = mocker_get_window_index - vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin - vision_transformer.__dict__["merger"] = mocker_merger - vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2] - vision_transformer.__dict__["spatial_merge_unit"] = 2 - ret = vision_transformer.forward(x, grid_thw) - assert ret.shape == (8, 256) - mocker_patch_embed.assert_called_with(x) - mocker_rot_pos_emb.assert_called_with(grid_thw) - mocker_get_window_index.assert_called_with(grid_thw) - mocker_cal_cos_sin.assert_called_once() - mocker_merger.assert_called_once() - - -class TestAscendQwen2_5_VLForConditionalGeneration_Without_Padding(PytestBase): - - def test_init_vl_for_conditional_generation(self, mocker: MockerFixture): - vllm_config = mocker.MagicMock() - vllm_config.vision_config = "vision_config" - vllm_config.rms_norm_eps = 1e-5 - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker_vl = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__", - return_value=None, - ) - mocker_vit = mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionTransformer_Without_Padding.__init__", - return_value=None, - ) - - vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration_Without_Padding( - vllm_config=vllm_config) - args, kwargs = 
mocker_vl.call_args - assert not args - assert kwargs == {"vllm_config": vllm_config, "prefix": ""} - mocker_vit.assert_called_once() - assert isinstance( - vl_for_conditional_generation, - AscendQwen2_5_VLForConditionalGeneration_Without_Padding, - ) - - def test_overridden_methods(self): - self.assert_method_overridden( - AscendQwen2_5_VLForConditionalGeneration_Without_Padding, - Qwen2_5_VLForConditionalGeneration, - "_process_image_input", - ) - - self.assert_method_overridden( - AscendQwen2_5_VLForConditionalGeneration_Without_Padding, - Qwen2_5_VLForConditionalGeneration, - "_process_video_input", - ) - - @staticmethod - def assert_method_overridden(subclass, parent, method_name: str): - """assert subclass override parent method""" - parent_func = parent.__dict__.get(method_name) - child_func = subclass.__dict__.get(method_name) - - assert child_func is not None, f"{subclass.__name__} should defined {method_name}" - assert child_func is not parent_func, f"{method_name} should override in {subclass.__name__}" diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index 956df2eb..31eae8d7 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -1,7 +1,5 @@ from vllm import ModelRegistry -import vllm_ascend.envs as envs_ascend - def register_model(): ModelRegistry.register_model( @@ -10,24 +8,11 @@ def register_model(): ModelRegistry.register_model( "Qwen3VLMoeForConditionalGeneration", - "vllm_ascend.models.qwen2_5_vl_without_padding:AscendQwen3VLMoeForConditionalGeneration" - ) + "vllm_ascend.models.qwen3_vl:AscendQwen3VLMoeForConditionalGeneration") ModelRegistry.register_model( "Qwen3VLForConditionalGeneration", - "vllm_ascend.models.qwen2_5_vl_without_padding:AscendQwen3VLForConditionalGeneration" - ) - - if envs_ascend.USE_OPTIMIZED_MODEL: - ModelRegistry.register_model( - "Qwen2_5_VLForConditionalGeneration", - "vllm_ascend.models.qwen2_5_vl:AscendQwen2_5_VLForConditionalGeneration" - ) - else: - 
ModelRegistry.register_model( - "Qwen2_5_VLForConditionalGeneration", - "vllm_ascend.models.qwen2_5_vl_without_padding:AscendQwen2_5_VLForConditionalGeneration_Without_Padding" - ) + "vllm_ascend.models.qwen3_vl:AscendQwen3VLForConditionalGeneration") # There is no PanguProMoEForCausalLM in vLLM, so we should register it before vLLM config initialization # to make sure the model can be loaded correctly. This register step can be removed once vLLM support PanguProMoEForCausalLM. diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py deleted file mode 100644 index 0ff31712..00000000 --- a/vllm_ascend/models/qwen2_5_vl.py +++ /dev/null @@ -1,556 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Adapted from vllm/model_executor/models/qwen2_5_vl.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from functools import partial -from typing import Callable, Iterable, Optional, Set, Tuple, Union - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch_npu -from einops import rearrange -from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( - Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) -from vllm.config import VllmConfig -from vllm.distributed import parallel_state -from vllm.distributed import utils as dist_utils -from vllm.model_executor.layers.activation import get_act_and_mul_fn -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.qwen2_5_vl import ( - Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, - Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VisionTransformer, - Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration, - Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo) -from vllm.model_executor.models.utils import maybe_prefix -from vllm.multimodal import MULTIMODAL_REGISTRY - -from vllm_ascend.ascend_forward_context import set_ascend_forward_context -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz - -MIN_PAD_SIZE = 64 # min_size to pad weight -MAX_PAD_SIZE = 128 # max_size to pad weight - - -class AscendQwen2_5_VisionAttention(Qwen2_5_VisionAttention): - - def __init__( - self, - embed_dim: int, - num_heads: int, - projection_size: int, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__( - embed_dim, - num_heads, - projection_size, - quant_config, - prefix, - ) - self.embed_dim = embed_dim - self.hidden_size_per_attention_head = dist_utils.divide( - projection_size, num_heads) - self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head - if self.hidden_size_per_attention_head > MIN_PAD_SIZE and 
self.hidden_size_per_attention_head < MAX_PAD_SIZE: - self.hidden_size_per_attention_head = MAX_PAD_SIZE - - def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: - # [s, b, 3 * head * head_dim] - seq_len, bs, _ = qkv.shape - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] - q, k, v = qkv.chunk(3, dim=2) - - # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] - new_shape = (seq_len, bs, self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head) - q, k, v = (x.view(*new_shape) for x in (q, k, v)) - return q, k, v - - def forward( - self, - x: torch.Tensor, - cu_seqlens: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - ) -> torch.Tensor: - # [s, b, c] --> [s, b, head * 3 * head_dim] - x, _ = self.qkv(x) - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] - q, k, v = self.split_qkv(x) - batch_size = q.shape[1] - - q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() - for x in (q, k, v)) - q = torch_npu.npu_rotary_mul(q, cos, sin) - k = torch_npu.npu_rotary_mul(k, cos, sin) - - q, k, v = [ - rearrange(x, "b s h d -> (b s) h d").contiguous() - for x in (q, k, v) - ] - - context_layer = torch.empty_like(q) - - # operator requires pta version >= 2.5.1 - torch_npu._npu_flash_attention_unpad( - query=q, - key=k, - value=v, - seq_len=cu_seqlens, - scale_value=self.origin_hidden_size_per_attention_head**-0.5, - num_heads=self.num_attention_heads_per_partition, - num_kv_heads=self.num_attention_heads_per_partition, - out=context_layer) - - context_layer = rearrange(context_layer, - "(b s) h d -> s b (h d)", - b=batch_size).contiguous() - - output, _ = self.proj(context_layer) - return output - - -class AscendQwen2_5_VisionBlock(Qwen2_5_VisionBlock): - - def __init__( - self, - dim: int, - num_heads: int, - mlp_hidden_dim: int, - act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, - norm_layer: Optional[Callable[[int], nn.Module]] = None, - quant_config: Optional[QuantizationConfig] = 
None, - prefix: str = "", - ) -> None: - super().__init__(dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - act_fn=act_fn, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=prefix) - - self.attn = AscendQwen2_5_VisionAttention(embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, - cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: - x = x + self.attn( - self.norm1(x), cu_seqlens=cu_seqlens, cos=cos, sin=sin) - - x = x + self.mlp(self.norm2(x)) - return x - - -class AscendQwen2_5_VisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding): - - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__(dim, theta) - inv_freq = 1.0 / (theta - **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) - self.inv_freq = inv_freq - - -class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer): - - def __init__( - self, - vision_config: Qwen2_5_VLVisionConfig, - norm_eps: float = 1e-6, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - interleaved=False, - ) -> None: - super().__init__(vision_config, norm_eps, quant_config, prefix) - norm_layer = partial(RMSNorm, eps=norm_eps) - self.interleaved = interleaved - self.enable_pad = False - head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim // - 2) - self.patch_embed = Qwen2_5_VisionPatchEmbed( - patch_size=vision_config.patch_size, - temporal_patch_size=vision_config.temporal_patch_size, - in_channels=vision_config.in_channels, - hidden_size=self.hidden_size, - ) - - act_fn = get_act_and_mul_fn(vision_config.hidden_act) - self.blocks = nn.ModuleList([ - AscendQwen2_5_VisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=act_fn, - norm_layer=norm_layer, - quant_config=quant_config, - 
prefix=f"{prefix}.blocks.{layer_idx}") - for layer_idx in range(vision_config.depth) - ]) - self.tp_size = parallel_state.get_tensor_model_parallel_world_size() - self.tp_rank = parallel_state.get_tensor_model_parallel_rank() - self.hidden_size_per_attention_head = dist_utils.divide( - self.hidden_size, self.num_heads) - - if self.hidden_size_per_attention_head > MIN_PAD_SIZE and self.hidden_size_per_attention_head < MAX_PAD_SIZE: - self.enable_pad = True - self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head - self.half_origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head // 2 - self.half_pad_hidden_size_per_attention_head = ( - MAX_PAD_SIZE - self.hidden_size_per_attention_head) // 2 - self.hidden_size_per_attention_head = MAX_PAD_SIZE - - def cal_cos_sin(self, rotary_pos_emb): - cos = rotary_pos_emb.cos() # [seqlen, rotary_dim / 2] - sin = rotary_pos_emb.sin() - if self.enable_pad: - cos = torch.nn.functional.pad( - cos, (0, self.half_pad_hidden_size_per_attention_head)) - sin = torch.nn.functional.pad( - sin, (0, self.half_pad_hidden_size_per_attention_head)) - - if not self.interleaved: - cos_new = torch.cat((cos, cos), dim=-1) - sin_new = torch.cat((sin, sin), dim=-1) - else: - cos_new = rearrange(torch.stack((cos, cos), dim=-1), - "... d two -> ...(d two)", - two=2) - sin_new = rearrange(torch.stack((sin, sin), dim=-1), - "... 
d two -> ...(d two)", - two=2) - cos_new = cos_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - sin_new = sin_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - return cos_new, sin_new - - def pad_qkv_bias(self, bias): - first_half = bias.reshape( - -1, 3, self.origin_hidden_size_per_attention_head - )[:, :, :self.half_origin_hidden_size_per_attention_head] - second_half = bias.reshape( - -1, 3, self.origin_hidden_size_per_attention_head - )[:, :, self.half_origin_hidden_size_per_attention_head:] - first_half_padded = torch.nn.functional.pad( - first_half, (0, self.half_pad_hidden_size_per_attention_head)) - second_half_padded = torch.nn.functional.pad( - second_half, (0, self.half_pad_hidden_size_per_attention_head)) - bias_padded = torch.cat([first_half_padded, second_half_padded], dim=2) - bias_final = bias_padded.reshape(-1) - return bias_final - - def pad_qkv_weight(self, data): - qkv_weight_first_half = data.reshape( - -1, 3, self.origin_hidden_size_per_attention_head, self.hidden_size - )[:, :, :self.half_origin_hidden_size_per_attention_head, :] - qkv_weight_second_half = data.reshape( - -1, 3, self.origin_hidden_size_per_attention_head, self.hidden_size - )[:, :, self.half_origin_hidden_size_per_attention_head:, :] - - qkv_weight_first_half_padded = torch.nn.functional.pad( - qkv_weight_first_half, - (0, 0, 0, self.half_pad_hidden_size_per_attention_head)) - qkv_weight_second_half_padded = torch.nn.functional.pad( - qkv_weight_second_half, - (0, 0, 0, self.half_pad_hidden_size_per_attention_head)) - qkv_weight_padded = torch.cat( - [qkv_weight_first_half_padded, qkv_weight_second_half_padded], - dim=2) - qkv_weight_final = qkv_weight_padded.reshape(-1, self.hidden_size) - - if is_enable_nz(): - qkv_weight_final_copy = torch.empty_like(qkv_weight_final).copy_( - qkv_weight_final) - qkv_weight_final_copy = torch_npu.npu_format_cast( - qkv_weight_final_copy, ACL_FORMAT_FRACTAL_ND) - return qkv_weight_final_copy - - return 
qkv_weight_final - - def pad_proj_weight(self, data): - out_weight = torch.nn.functional.pad( - data.reshape(self.hidden_size, -1, - self.half_origin_hidden_size_per_attention_head), - (0, self.half_pad_hidden_size_per_attention_head, 0, 0)).reshape( - self.hidden_size, -1) - - if is_enable_nz(): - out_weight_copy = torch.empty_like(out_weight).copy_(out_weight) - out_weight_copy = torch_npu.npu_format_cast( - out_weight_copy, ACL_FORMAT_FRACTAL_ND) - return out_weight_copy - - return out_weight - - def pad_qkv_weight_scale_offset(self, data): - reshaped_data = data.reshape( - -1, 3, self.origin_hidden_size_per_attention_head, 1) - data1 = reshaped_data[:, :, :self. - half_origin_hidden_size_per_attention_head, :] - data2 = reshaped_data[:, :, self. - half_origin_hidden_size_per_attention_head:, :] - data1_paded = torch.nn.functional.pad( - data1, (0, 0, 0, self.half_pad_hidden_size_per_attention_head, 0, - 0, 0, 0)) - data2_paded = torch.nn.functional.pad( - data2, (0, 0, 0, self.half_pad_hidden_size_per_attention_head, 0, - 0, 0, 0)) - res = torch.cat([data1_paded, data2_paded], dim=2) - res = res.reshape(-1, 1) - return res - - def pad_qkv_deq_scale_quant_bias(self, data): - reshaped_data = data.reshape( - -1, 3, self.origin_hidden_size_per_attention_head) - data1 = reshaped_data[:, :, :self. 
- half_origin_hidden_size_per_attention_head] - data2 = reshaped_data[:, :, - self.half_origin_hidden_size_per_attention_head:] - - data1_paded = torch.nn.functional.pad( - data1, (0, self.half_pad_hidden_size_per_attention_head)) - data2_paded = torch.nn.functional.pad( - data2, (0, self.half_pad_hidden_size_per_attention_head)) - - res = torch.cat([data1_paded, data2_paded], dim=2) - res = res.reshape(-1) - return res - - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - stacked_params_mapping: list[tuple[str, str, Union[str, int]]] = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("mlp.gate_up_proj.", "mlp.gate_proj.", 0), - ("mlp.gate_up_proj.", "mlp.up_proj.", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - if ("attn.proj.weight_scale" in name or - "attn.proj.weight_offset" in name) and self.enable_pad: - continue - elif ("attn.proj.deq_scale" in name - or "attn.proj.quant_bias" in name) and self.enable_pad: - continue - elif ("attn.qkv.weight_scale" in name - or "attn.qkv.weight_offset" in name) and self.enable_pad: - param.data = self.pad_qkv_weight_scale_offset(param.data) - elif ("attn.qkv.deq_scale" in name - or "attn.qkv.quant_bias" in name) and self.enable_pad: - param.data = self.pad_qkv_deq_scale_quant_bias(param.data) - elif ("attn.proj.weight" in name) and self.enable_pad: - param.data = 
self.pad_proj_weight(param.data) - elif ("attn.qkv.weight" in name) and self.enable_pad: - param.data = self.pad_qkv_weight(param.data) - elif ("attn.qkv.bias" in name) and self.enable_pad: - param.data = self.pad_qkv_bias(param.data) - loaded_params.add(name) - return loaded_params - - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: - pos_ids = [] - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - wpos_ids = wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - pos_ids.append( - torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) - pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb - - def get_window_index(self, grid_thw): - window_index: list = [] - cu_window_seqlens: list = [0] - window_index_id = 0 - vit_merger_window_size = (self.window_size // - self.spatial_merge_size // self.patch_size) - - for grid_t, grid_h, grid_w in grid_thw: - llm_grid_h = grid_h // self.spatial_merge_size - llm_grid_w = grid_w // self.spatial_merge_size - index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( - grid_t, llm_grid_h, llm_grid_w) - pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size - pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size - num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size - num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size - index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) - index_padded = 
index_padded.reshape(grid_t, num_windows_h, - vit_merger_window_size, - num_windows_w, - vit_merger_window_size) - index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( - grid_t, num_windows_h * num_windows_w, vit_merger_window_size, - vit_merger_window_size) - seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) - index_padded = index_padded.reshape(-1) - index_new = index_padded[index_padded != -100] - window_index.append(index_new + window_index_id) - cu_seqlens_tmp = seqlens.cumsum( - 0) * self.spatial_merge_unit + cu_window_seqlens[-1] - cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) - window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() - window_index = torch.cat(window_index, dim=0) - return window_index, cu_window_seqlens - - def forward( - self, - x: torch.Tensor, - grid_thw: torch.Tensor, - ) -> torch.Tensor: - # compute cu_seqlens - cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], - grid_thw[:, - 0]).cpu().to(torch.int32) - - # patchify - x = self.patch_embed(x) - - # compute position embedding - rotary_pos_emb = self.rot_pos_emb(grid_thw) - - # windows attention - window_index, cu_window_seqlens = self.get_window_index(grid_thw) - cu_window_seqlens = torch.tensor( - cu_window_seqlens, - device=x.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) - cu_window_seqlens = torch.diff(cu_window_seqlens).cpu().to(torch.int32) - seq_len, _ = x.size() - x = x.reshape(seq_len // self.spatial_merge_unit, - self.spatial_merge_unit, -1) - x = x[window_index, :, :] - x = x.reshape(seq_len, -1) - rotary_pos_emb = rotary_pos_emb.reshape( - seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) - rotary_pos_emb = rotary_pos_emb[window_index, :, :] - rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) - - cos, sin = self.cal_cos_sin(rotary_pos_emb) - - # transformers - x = x.unsqueeze(1) - for layer_num, blk in 
enumerate(self.blocks): - if layer_num in self.fullatt_block_indexes: - cu_seqlens_now = cu_seqlens - else: - cu_seqlens_now = cu_window_seqlens - x = blk(x, cu_seqlens=cu_seqlens_now, cos=cos, sin=sin) - - # adapter - x = self.merger(x) - reverse_indices = torch.argsort(window_index) - x = x[reverse_indices, :] - return x - - -@MULTIMODAL_REGISTRY.register_processor( - Qwen2_5_VLMultiModalProcessor, - info=Qwen2_5_VLProcessingInfo, - dummy_inputs=Qwen2_5_VLDummyInputsBuilder) -class AscendQwen2_5_VLForConditionalGeneration( - Qwen2_5_VLForConditionalGeneration): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.visual = AscendQwen2_5_VisionTransformer( - vision_config=config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - ) - - def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: - - grid_thw = image_input["image_grid_thw"] - assert grid_thw.ndim == 2 - - if image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"].type(self.visual.dtype) - else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - with set_ascend_forward_context(None, self.vllm_config): - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - - # Split concatenated embeddings for each image item. 
- merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return image_embeds.split(sizes.tolist()) - - def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: - - grid_thw = video_input["video_grid_thw"] - assert grid_thw.ndim == 2 - - if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"].type(self.visual.dtype) - else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - with set_ascend_forward_context(None, self.vllm_config): - video_embeds = self.visual(pixel_values_videos, - grid_thw=grid_thw) - - # Split concatenated embeddings for each video item. - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return video_embeds.split(sizes.tolist()) diff --git a/vllm_ascend/models/qwen2_5_vl_without_padding.py b/vllm_ascend/models/qwen2_5_vl_without_padding.py deleted file mode 100644 index d51a5aca..00000000 --- a/vllm_ascend/models/qwen2_5_vl_without_padding.py +++ /dev/null @@ -1,617 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from functools import partial -from typing import Callable, Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch_npu -from einops import rearrange -from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( - Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) - -try: - from transformers.models.qwen3_vl.configuration_qwen3_vl import \ - Qwen3VLConfig - from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import \ - Qwen3VLMoeConfig -except ImportError: - pass -from vllm.config import VllmConfig -from vllm.distributed import parallel_state -from vllm.distributed import utils as dist_utils -from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY, - get_act_and_mul_fn) -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.models.qwen2_5_vl import ( - Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, - Qwen2_5_VisionTransformer, Qwen2_5_VLDummyInputsBuilder, - Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLMultiModalProcessor, - Qwen2_5_VLProcessingInfo) - -try: - from vllm.model_executor.models.qwen3_vl import ( - Qwen3_VisionBlock, Qwen3_VisionPatchEmbed, Qwen3_VisionTransformer, - Qwen3VLDummyInputsBuilder, Qwen3VLForConditionalGeneration, - Qwen3VLMultiModalProcessor, Qwen3VLProcessingInfo) - from vllm.model_executor.models.qwen3_vl_moe import ( - Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeProcessingInfo) -except ImportError: - Qwen3_VisionBlock = object - Qwen3_VisionPatchEmbed = object - Qwen3_VisionTransformer = object - Qwen3VLDummyInputsBuilder = object - Qwen3VLForConditionalGeneration = object - Qwen3VLMultiModalProcessor = object - Qwen3VLProcessingInfo = object - Qwen3VLMoeForConditionalGeneration = object - Qwen3VLMoeProcessingInfo = object -from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix -from vllm.multimodal import 
MULTIMODAL_REGISTRY - -from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding - - -class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention): - - def __init__( - self, - embed_dim: int, - num_heads: int, - projection_size: int, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__( - embed_dim, - num_heads, - projection_size, - quant_config, - prefix, - ) - self.embed_dim = embed_dim - self.hidden_size_per_attention_head = dist_utils.divide( - projection_size, num_heads) - - def forward( - self, - x: torch.Tensor, - cu_seqlens: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - ) -> torch.Tensor: - # [s, b, c] --> [s, b, head * 3 * head_dim] - x, _ = self.qkv(x) - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] - q, k, v = self.split_qkv(x) - batch_size = q.shape[1] - - q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() - for x in (q, k, v)) - q = torch_npu.npu_rotary_mul(q, cos, sin) - k = torch_npu.npu_rotary_mul(k, cos, sin) - - q, k, v = [ - rearrange(x, "b s h d -> (b s) h d").contiguous() - for x in (q, k, v) - ] - - context_layer = torch.empty_like(q) - - # operator requires pta version >= 2.5.1.dev20250226 - torch_npu._npu_flash_attention_unpad( - query=q, - key=k, - value=v, - seq_len=cu_seqlens, - scale_value=self.hidden_size_per_attention_head**-0.5, - num_heads=self.num_attention_heads_per_partition, - num_kv_heads=self.num_attention_heads_per_partition, - out=context_layer) - - context_layer = rearrange(context_layer, - "(b s) h d -> s b (h d)", - b=batch_size).contiguous() - - output, _ = self.proj(context_layer) - return output - - -class AscendQwen2_5_VisionBlock_Without_Padding(Qwen2_5_VisionBlock): - - def __init__(self, - dim: int, - num_heads: int, - mlp_hidden_dim: int, - act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, - norm_layer: Optional[Callable[[int], nn.Module]] = None, - quant_config: 
Optional[QuantizationConfig] = None, - prefix: str = "") -> None: - super().__init__(dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - act_fn=act_fn, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=prefix) - self.attn = AscendQwen2_5_VisionAttention_Without_Padding( - embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, - cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: - x = x + self.attn( - self.norm1(x), cu_seqlens=cu_seqlens, cos=cos, sin=sin) - - x = x + self.mlp(self.norm2(x)) - return x - - -class AscendQwen2_5_VisionPatchEmbed_Without_Padding(Qwen2_5_VisionPatchEmbed): - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x.matmul( - self.proj.weight.data.view(self.hidden_size, -1).transpose(0, 1)) - return x - - -class AscendQwen2_5_VisionTransformer_Without_Padding(Qwen2_5_VisionTransformer - ): - - def __init__( - self, - vision_config: Qwen2_5_VLVisionConfig, - norm_eps: float = 1e-6, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - interleaved=False, - ) -> None: - super().__init__(vision_config, norm_eps, quant_config, prefix) - norm_layer = partial(RMSNorm, eps=norm_eps) - self.interleaved = interleaved - head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim // - 2) - self.patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding( - patch_size=vision_config.patch_size, - temporal_patch_size=vision_config.temporal_patch_size, - in_channels=vision_config.in_channels, - hidden_size=self.hidden_size, - ) - - act_fn = get_act_and_mul_fn(vision_config.hidden_act) - self.blocks = nn.ModuleList([ - AscendQwen2_5_VisionBlock_Without_Padding( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=act_fn, - norm_layer=norm_layer, - 
quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") - for layer_idx in range(vision_config.depth) - ]) - self.tp_size = parallel_state.get_tensor_model_parallel_world_size() - self.tp_rank = parallel_state.get_tensor_model_parallel_rank() - self.hidden_size_per_attention_head = dist_utils.divide( - self.hidden_size, self.num_heads) - - def cal_cos_sin(self, rotary_pos_emb): - cos = rotary_pos_emb.cos() # [seqlen, rotary_dim / 2] - sin = rotary_pos_emb.sin() - - if not self.interleaved: - cos_new = torch.cat((cos, cos), dim=-1) - sin_new = torch.cat((sin, sin), dim=-1) - else: - cos_new = rearrange(torch.stack((cos, cos), dim=-1), - "... d two -> ...(d two)", - two=2) - sin_new = rearrange(torch.stack((sin, sin), dim=-1), - "... d two -> ...(d two)", - two=2) - cos_new = cos_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - sin_new = sin_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - return cos_new, sin_new - - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: - pos_ids = [] - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - wpos_ids = wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - pos_ids.append( - torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) - pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb - - def get_window_index(self, grid_thw): - window_index: list = [] - cu_window_seqlens: list = [0] - window_index_id = 0 - vit_merger_window_size = 
(self.window_size // - self.spatial_merge_size // self.patch_size) - - for grid_t, grid_h, grid_w in grid_thw: - llm_grid_h = grid_h // self.spatial_merge_size - llm_grid_w = grid_w // self.spatial_merge_size - index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( - grid_t, llm_grid_h, llm_grid_w) - pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size - pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size - num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size - num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size - index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) - index_padded = index_padded.reshape(grid_t, num_windows_h, - vit_merger_window_size, - num_windows_w, - vit_merger_window_size) - index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( - grid_t, num_windows_h * num_windows_w, vit_merger_window_size, - vit_merger_window_size) - seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) - index_padded = index_padded.reshape(-1) - index_new = index_padded[index_padded != -100] - window_index.append(index_new + window_index_id) - cu_seqlens_tmp = seqlens.cumsum( - 0) * self.spatial_merge_unit + cu_window_seqlens[-1] - cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) - window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() - window_index = torch.cat(window_index, dim=0) - return window_index, cu_window_seqlens - - def forward( - self, - x: torch.Tensor, - grid_thw: torch.Tensor, - ) -> torch.Tensor: - # compute cu_seqlens - cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], - grid_thw[:, - 0]).cpu().to(torch.int32) - - # patchify - x = self.patch_embed(x) - - # compute position embedding - rotary_pos_emb = self.rot_pos_emb(grid_thw) - - # windows attention - window_index, cu_window_seqlens = self.get_window_index(grid_thw) - cu_window_seqlens = torch.tensor( - cu_window_seqlens, - device=x.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else 
torch.int32) - cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) - cu_window_seqlens = torch.diff(cu_window_seqlens).cpu().to(torch.int32) - seq_len, _ = x.size() - x = x.reshape(seq_len // self.spatial_merge_unit, - self.spatial_merge_unit, -1) - x = x[window_index, :, :] - x = x.reshape(seq_len, -1) - rotary_pos_emb = rotary_pos_emb.reshape( - seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) - rotary_pos_emb = rotary_pos_emb[window_index, :, :] - rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) - - cos, sin = self.cal_cos_sin(rotary_pos_emb) - - # transformers - x = x.unsqueeze(1) - for layer_num, blk in enumerate(self.blocks): - if layer_num in self.fullatt_block_indexes: - cu_seqlens_now = cu_seqlens - else: - cu_seqlens_now = cu_window_seqlens - x = blk(x, cu_seqlens=cu_seqlens_now, cos=cos, sin=sin) - - # adapter - x = self.merger(x) - reverse_indices = torch.argsort(window_index) - x = x[reverse_indices, :] - return x - - -class AscendQwen3_VisionPatchEmbed(Qwen3_VisionPatchEmbed): - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x.matmul( - self.proj.weight.data.view(self.hidden_size, -1).transpose(0, 1)) - x = x + self.proj.bias - return x - - -class AscendQwen3_VisionBlock(Qwen3_VisionBlock): - - def __init__( - self, - dim: int, - num_heads: int, - mlp_hidden_dim: int, - act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, - norm_layer: Optional[Callable[[int], nn.Module]] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - use_data_parallel: bool = False, - ) -> None: - super().__init__(dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - act_fn=act_fn, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=prefix, - use_data_parallel=use_data_parallel) - - self.attn = AscendQwen2_5_VisionAttention_Without_Padding( - embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def 
forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, - cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: - x = x + self.attn( - self.norm1(x), cu_seqlens=cu_seqlens, cos=cos, sin=sin) - - x = x + self.mlp(self.norm2(x)) - return x - - -class AscendQwen3_VisionTransformer(Qwen3_VisionTransformer): - - def __init__( - self, - vision_config, - norm_eps: float = 1e-6, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - use_data_parallel: bool = False, - ) -> None: - super().__init__(vision_config, norm_eps, quant_config, prefix, - use_data_parallel) - norm_layer = partial(nn.LayerNorm, eps=norm_eps) - self.patch_embed = AscendQwen3_VisionPatchEmbed( - patch_size=self.patch_size, - temporal_patch_size=self.temporal_patch_size, - in_channels=vision_config.in_channels, - hidden_size=self.hidden_size, - ) - self.blocks = nn.ModuleList([ - AscendQwen3_VisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") - for layer_idx in range(vision_config.depth) - ]) - self.hidden_size_per_attention_head = dist_utils.divide( - self.hidden_size, self.num_heads) - - def cal_cos_sin(self, rotary_pos_emb): - cos = rotary_pos_emb.cos() # [seqlen, rotary_dim / 2] - sin = rotary_pos_emb.sin() - cos_new = torch.cat((cos, cos), dim=-1) - sin_new = torch.cat((sin, sin), dim=-1) - cos_new = cos_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - sin_new = sin_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - return cos_new, sin_new - - def forward( - self, - x: torch.Tensor, - grid_thw: list[list[int]], - ) -> torch.Tensor: - hidden_states = x.to(device=self.device, dtype=self.dtype) - hidden_states = self.patch_embed(hidden_states) - - pos_embeds = self.fast_pos_embed_interpolate(grid_thw) - hidden_states = hidden_states + 
pos_embeds - rotary_pos_emb = self.rot_pos_emb(grid_thw) - grid_thw_tensor = torch.tensor(grid_thw, - device=self.device, - dtype=torch.int32) - cu_seqlens = torch.repeat_interleave( - grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2], - grid_thw_tensor[:, 0]).cpu().to(torch.int32) - cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) - - hidden_states = hidden_states.unsqueeze(1) - rotary_pos_emb = rotary_pos_emb.to(hidden_states.device) - - cos, sin = self.cal_cos_sin(rotary_pos_emb) - - deepstack_feature_lists = [] - for layer_num, blk in enumerate(self.blocks): - hidden_states = blk(hidden_states, - cu_seqlens=cu_seqlens, - cos=cos, - sin=sin) - if layer_num in self.deepstack_visual_indexes: - deepstack_merger_idx = self.deepstack_visual_indexes.index( - layer_num) - deepstack_feature = self.deepstack_merger_list[ - deepstack_merger_idx](hidden_states) - deepstack_feature_lists.append(deepstack_feature) - hidden_states = self.merger(hidden_states) - hidden_states = torch.cat( - [hidden_states] + deepstack_feature_lists, - dim=1) # [seq_len, hidden_size * (1 + depth_of_deepstack)] - return hidden_states - - -@MULTIMODAL_REGISTRY.register_processor( - Qwen2_5_VLMultiModalProcessor, - info=Qwen2_5_VLProcessingInfo, - dummy_inputs=Qwen2_5_VLDummyInputsBuilder) -class AscendQwen2_5_VLForConditionalGeneration_Without_Padding( - Qwen2_5_VLForConditionalGeneration): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.visual = AscendQwen2_5_VisionTransformer_Without_Padding( - vision_config=config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - ) - - def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: - - grid_thw = image_input["image_grid_thw"] - assert grid_thw.ndim == 2 - - if 
image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"].type(self.visual.dtype) - else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - - # Split concatenated embeddings for each image item. - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return image_embeds.split(sizes.tolist()) - - def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: - - grid_thw = video_input["video_grid_thw"] - assert grid_thw.ndim == 2 - - if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"].type(self.visual.dtype) - else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) - - # Split concatenated embeddings for each video item. - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return video_embeds.split(sizes.tolist()) - - -@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, - info=Qwen3VLProcessingInfo, - dummy_inputs=Qwen3VLDummyInputsBuilder) -class AscendQwen3VLForConditionalGeneration(Qwen3VLForConditionalGeneration): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - supports_encoder_tp_data = True - - # To ensure correct weight loading and mapping. 
- hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "model.visual.": "visual.", - "lm_head.": "language_model.lm_head.", - "model.language_model.": "language_model.model.", - }) - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - config: Qwen3VLConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.visual = AscendQwen3_VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - use_data_parallel=self.use_data_parallel) - - -@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, - info=Qwen3VLMoeProcessingInfo, - dummy_inputs=Qwen3VLDummyInputsBuilder) -class AscendQwen3VLMoeForConditionalGeneration( - Qwen3VLMoeForConditionalGeneration): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - supports_encoder_tp_data = True - - # To ensure correct weight loading and mapping. 
- hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "model.visual.": "visual.", - "lm_head.": "language_model.lm_head.", - "model.language_model.": "language_model.model.", - }) - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - config: Qwen3VLMoeConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config - self.multimodal_config = multimodal_config - self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" - self.visual = AscendQwen3_VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - use_data_parallel=self.use_data_parallel, - ) diff --git a/vllm_ascend/models/qwen3_vl.py b/vllm_ascend/models/qwen3_vl.py new file mode 100644 index 00000000..c79e71e7 --- /dev/null +++ b/vllm_ascend/models/qwen3_vl.py @@ -0,0 +1,264 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import partial +from typing import Callable, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +try: + from transformers.models.qwen3_vl.configuration_qwen3_vl import \ + Qwen3VLConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import \ + Qwen3VLMoeConfig +except ImportError: + pass +from vllm.config import VllmConfig +from vllm.distributed import utils as dist_utils +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention + +try: + from vllm.model_executor.models.qwen3_vl import ( + Qwen3_VisionBlock, Qwen3_VisionPatchEmbed, Qwen3_VisionTransformer, + Qwen3VLDummyInputsBuilder, Qwen3VLForConditionalGeneration, + Qwen3VLMultiModalProcessor, Qwen3VLProcessingInfo) + from vllm.model_executor.models.qwen3_vl_moe import ( + Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeProcessingInfo) +except ImportError: + Qwen3_VisionBlock = object + Qwen3_VisionPatchEmbed = object + Qwen3_VisionTransformer = object + Qwen3VLDummyInputsBuilder = object + Qwen3VLForConditionalGeneration = object + Qwen3VLMultiModalProcessor = object + Qwen3VLProcessingInfo = object + Qwen3VLMoeForConditionalGeneration = object + Qwen3VLMoeProcessingInfo = object +from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix +from vllm.multimodal import MULTIMODAL_REGISTRY + + +class AscendQwen3_VisionPatchEmbed(Qwen3_VisionPatchEmbed): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.matmul( + self.proj.weight.data.view(self.hidden_size, -1).transpose(0, 1)) + x = x + self.proj.bias + return x + + +class AscendQwen3_VisionBlock(Qwen3_VisionBlock): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + 
quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + use_data_parallel: bool = False, + ) -> None: + super().__init__(dim, num_heads, mlp_hidden_dim, act_fn, norm_layer, + quant_config, prefix, use_data_parallel) + self.attn = Qwen2_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, + cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + x = x + self.attn( + self.norm1(x), cu_seqlens=cu_seqlens, cos=cos, sin=sin) + + x = x + self.mlp(self.norm2(x)) + return x + + +class AscendQwen3_VisionTransformer(Qwen3_VisionTransformer): + + def __init__( + self, + vision_config, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + use_data_parallel: bool = False, + ) -> None: + super().__init__(vision_config, norm_eps, quant_config, prefix, + use_data_parallel) + norm_layer = partial(nn.LayerNorm, eps=norm_eps) + self.patch_embed = AscendQwen3_VisionPatchEmbed( + patch_size=self.patch_size, + temporal_patch_size=self.temporal_patch_size, + in_channels=vision_config.in_channels, + hidden_size=self.hidden_size, + ) + self.blocks = nn.ModuleList([ + AscendQwen3_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(vision_config.depth) + ]) + self.hidden_size_per_attention_head = dist_utils.divide( + self.hidden_size, self.num_heads) + + def cal_cos_sin(self, rotary_pos_emb): + cos = rotary_pos_emb.cos() # [seqlen, rotary_dim / 2] + sin = rotary_pos_emb.sin() + cos_new = torch.cat((cos, cos), dim=-1) + sin_new = torch.cat((sin, sin), dim=-1) + cos_new = cos_new.reshape(1, -1, 1, + self.hidden_size_per_attention_head) + sin_new = 
sin_new.reshape(1, -1, 1, + self.hidden_size_per_attention_head) + return cos_new, sin_new + + def forward( + self, + x: torch.Tensor, + grid_thw: list[list[int]], + ) -> torch.Tensor: + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = self.patch_embed(hidden_states) + + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + hidden_states = hidden_states + pos_embeds + rotary_pos_emb = self.rot_pos_emb(grid_thw) + grid_thw_tensor = torch.tensor(grid_thw, + device=self.device, + dtype=torch.int32) + cu_seqlens = torch.repeat_interleave( + grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2], + grid_thw_tensor[:, 0]).cpu().to(torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + hidden_states = hidden_states.unsqueeze(1) + rotary_pos_emb = rotary_pos_emb.to(hidden_states.device) + + cos, sin = self.cal_cos_sin(rotary_pos_emb) + + deepstack_feature_lists = [] + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk(hidden_states, + cu_seqlens=cu_seqlens, + cos=cos, + sin=sin) + if layer_num in self.deepstack_visual_indexes: + deepstack_merger_idx = self.deepstack_visual_indexes.index( + layer_num) + deepstack_feature = self.deepstack_merger_list[ + deepstack_merger_idx](hidden_states) + deepstack_feature_lists.append(deepstack_feature) + hidden_states = self.merger(hidden_states) + hidden_states = torch.cat( + [hidden_states] + deepstack_feature_lists, + dim=1) # [seq_len, hidden_size * (1 + depth_of_deepstack)] + return hidden_states + + +@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, + info=Qwen3VLProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder) +class AscendQwen3VLForConditionalGeneration(Qwen3VLForConditionalGeneration): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + supports_encoder_tp_data = True + + # To ensure correct weight loading and mapping. 
+ hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.visual.": "visual.", + "lm_head.": "language_model.lm_head.", + "model.language_model.": "language_model.model.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + config: Qwen3VLConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.visual = AscendQwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel) + + +@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, + info=Qwen3VLMoeProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder) +class AscendQwen3VLMoeForConditionalGeneration( + Qwen3VLMoeForConditionalGeneration): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + supports_encoder_tp_data = True + + # To ensure correct weight loading and mapping. 
+ hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.visual.": "visual.", + "lm_head.": "language_model.lm_head.", + "model.language_model.": "language_model.model.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + config: Qwen3VLMoeConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + self.visual = AscendQwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, + ) diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index a361789f..faa57b61 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -27,3 +27,5 @@ import vllm_ascend.patch.worker.patch_roberta # noqa import vllm_ascend.patch.worker.patch_weight_loader # noqa import vllm_ascend.patch.worker.patch_multimodal_merge # noqa import vllm_ascend.patch.worker.patch_minicpm # noqa +import vllm_ascend.patch.worker.patch_qwen2_5_vl # noqa +import vllm_ascend.patch.worker.patch_rope # noqa diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py new file mode 100644 index 00000000..27f08751 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py @@ -0,0 +1,501 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from functools import lru_cache, partial + +import einops +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch_npu +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import \ + Qwen2_5_VLVisionConfig +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.layer import maybe_get_vit_flash_attn_backend +from vllm.model_executor.layers.activation import get_act_and_mul_fn +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.models.qwen2_5_vl import ( + Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, + Qwen2_5_VisionPatchMerger, Qwen2_5_VisionTransformer, + Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLImageInputs, + Qwen2_5_VLVideoInputs) +from vllm.model_executor.models.utils import cast_overflow_tensors +from vllm.model_executor.models.vision import ( + get_vit_attn_backend, run_dp_sharded_mrope_vision_model) + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_forward_context import set_ascend_forward_context + +MIN_PAD_SIZE = 64 # min_size to pad weight +MAX_PAD_SIZE = 128 # max_size to pad weight + + +class AscendQwen2_5_VisionAttention(nn.Module): + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, + max_seqlen: torch.Tensor, + seqlens: torch.Tensor, + ) -> torch.Tensor: + # [s, b, 
c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + seq_len, batch_size, _ = x.shape + + # Split q k v. + qkv = einops.rearrange( + x, + "s b (three head head_dim) -> b s three head head_dim", + three=3, + head=self.num_attention_heads_per_partition, + ) + q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2] + origin_shape = q.shape[-1] + + # Convert cumulative tensor to intervals and move it to cpu. + cu_seqlens = torch.diff(cu_seqlens).to("cpu") + + cos = rotary_pos_emb_cos + sin = rotary_pos_emb_sin + cos = einops.rearrange( + torch.stack((cos, cos), dim=-1), + "... d two -> ...(d two)", + two=2, + ) + sin = einops.rearrange( + torch.stack((sin, sin), dim=-1), + "... d two -> ...(d two)", + two=2, + ) + cos = cos.reshape(1, -1, 1, self.hidden_size_per_attention_head) + sin = sin.reshape(1, -1, 1, self.hidden_size_per_attention_head) + q = torch_npu.npu_rotary_mul(q, cos, sin) + k = torch_npu.npu_rotary_mul(k, cos, sin) + + q, k, v = [ + einops.rearrange(x, "b s h d -> (b s) h d").contiguous() + for x in (q, k, v) + ] + + enable_pad = (envs_ascend.USE_OPTIMIZED_MODEL + and self.hidden_size_per_attention_head > MIN_PAD_SIZE + and self.hidden_size_per_attention_head < MAX_PAD_SIZE) + + if enable_pad: + pad_len = MAX_PAD_SIZE - origin_shape + # q/k/v: [b * s, head, head_dim] -> [b * s, head, MAX_PAD_SIZE] + q = F.pad(q, (0, pad_len), mode="constant", value=0) + k = F.pad(k, (0, pad_len), mode="constant", value=0) + v = F.pad(v, (0, pad_len), mode="constant", value=0) + + context_layer = torch.empty_like(q) + + # operator requires pta version >= 2.5.1 + torch_npu._npu_flash_attention_unpad( + query=q, + key=k, + value=v, + seq_len=cu_seqlens, + scale_value=self.hidden_size_per_attention_head**-0.5, + num_heads=self.num_attention_heads_per_partition, + num_kv_heads=self.num_attention_heads_per_partition, + out=context_layer, + ) + + if enable_pad: + context_layer = context_layer[..., :origin_shape] + + context_layer = einops.rearrange(context_layer, + "(b s) h d 
-> s b (h d)", + b=batch_size).contiguous() + + output, _ = self.proj(context_layer) + return output + + +class AscendQwen2_5_VisionBlock(nn.Module): + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, + max_seqlen: torch.Tensor, # Only used for Flash Attention + seqlens: torch.Tensor, # Only used for xFormers + ) -> torch.Tensor: + x_attn = self.attn( + self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + x_fused_norm, residual = self.norm2(x, residual=x_attn) + x = residual + self.mlp(x_fused_norm) + return x + + +class AscendQwen2_5_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config: Qwen2_5_VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + use_data_parallel: bool = False, + attn_backend_override: AttentionBackendEnum | None = None, + ) -> None: + nn.Module.__init__(self) + + patch_size = vision_config.patch_size + temporal_patch_size = vision_config.temporal_patch_size + in_channels = vision_config.in_channels + depth = vision_config.depth + self.hidden_size = vision_config.hidden_size + self.num_heads = vision_config.num_heads + self.use_data_parallel = use_data_parallel + self.out_hidden_size = vision_config.out_hidden_size + + # args for get_window_index_thw + self.window_size = vision_config.window_size + self.patch_size = vision_config.patch_size + self.spatial_merge_size = vision_config.spatial_merge_size + self.fullatt_block_indexes = vision_config.fullatt_block_indexes + self.spatial_merge_unit = self.spatial_merge_size**2 + # TODO[@lucaskabela]: Investigate fixing this usage + # see https://github.com/vllm-project/vllm/issues/27044 + # DO NOT MOVE THIS IMPORT + from vllm.compilation.backends import set_model_tag + + with 
set_model_tag("Qwen2_5_VisionPatchEmbed"): + self.patch_embed = Qwen2_5_VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=in_channels, + hidden_size=self.hidden_size, + ) + + norm_layer = partial(RMSNorm, eps=norm_eps) + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + is_neox_style=True, + ) + + use_upstream_fa = False + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, + dtype=torch.get_default_dtype(), + attn_backend_override=attn_backend_override, + ) + + self.attn_backend, self.flash_attn_varlen_func = ( + maybe_get_vit_flash_attn_backend( + self.attn_backend, + use_upstream_fa, + attn_backend_override=attn_backend_override, + )) + + with set_model_tag("Qwen2_5_VisionBlock"): + self.blocks = nn.ModuleList([ + Qwen2_5_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=get_act_and_mul_fn(vision_config.hidden_act), + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}", + use_data_parallel=use_data_parallel, + attn_backend=self.attn_backend, + use_upstream_fa=use_upstream_fa, + attn_backend_override=attn_backend_override, + ) for layer_idx in range(depth) + ]) + + with set_model_tag("Qwen2_5_VisionPatchMerger"): + self.merger = Qwen2_5_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + use_data_parallel=use_data_parallel, + ) + + def rotary_pos_emb_thw(self, t, h, w): + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = (hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + 
self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten()) + wpos_ids = (wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten()) + pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1) + max_size = max(h, w) + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + + cos_combined = cos_combined.reshape( + cos_combined.shape[0] // self.spatial_merge_unit, + self.spatial_merge_unit, + -1, + ) + sin_combined = sin_combined.reshape( + sin_combined.shape[0] // self.spatial_merge_unit, + self.spatial_merge_unit, + -1, + ) + + return cos_combined, sin_combined + + @lru_cache(maxsize=1024) # noqa: B019 + def get_rope_by_thw(self, t, h, w): + window_index_thw, cu_seqlens_window_thw = self.get_window_index_thw( + t, h, w) + cos_thw, sin_thw = self.rotary_pos_emb_thw(t, h, w) + + cos_thw = cos_thw[window_index_thw, :, :] + cos_thw = cos_thw.flatten(start_dim=0, end_dim=1) + sin_thw = sin_thw[window_index_thw, :, :] + sin_thw = sin_thw.flatten(start_dim=0, end_dim=1) + + cu_seqlens_thw = torch.repeat_interleave( + torch.tensor([h * w], dtype=torch.int32), t) + return ( + cos_thw, + sin_thw, + window_index_thw, + cu_seqlens_window_thw, + cu_seqlens_thw, + ) + + def forward( + self, + x: torch.Tensor, + grid_thw: list[list[int]], + ) -> torch.Tensor: + # patchify + seq_len, _ = x.size() + rotary_pos_emb_cos: list = [] + rotary_pos_emb_sin: list = [] + window_index: list = [] + cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)] + cu_seqlens: list = [] + + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = 
self.patch_embed(hidden_states) + + window_index_id = 0 + cu_window_seqlens_last = 0 + for t, h, w in grid_thw: + t, h, w = int(t), int(h), int(w) + llm_h = h // self.spatial_merge_size + llm_w = w // self.spatial_merge_size + + ( + cos_thw, + sin_thw, + window_index_thw, + cu_seqlens_window_thw, + cu_seqlens_thw, + ) = self.get_rope_by_thw(t, h, w) + + window_index.append(window_index_thw + window_index_id) + window_index_id += t * llm_h * llm_w + + cu_seqlens_window_thw = cu_seqlens_window_thw + cu_window_seqlens_last + cu_window_seqlens_last = cu_seqlens_window_thw[-1] + cu_window_seqlens.append(cu_seqlens_window_thw) + + rotary_pos_emb_cos.append(cos_thw) + rotary_pos_emb_sin.append(sin_thw) + + cu_seqlens.append(cu_seqlens_thw) + + rotary_pos_emb_cos = torch.cat(rotary_pos_emb_cos) + rotary_pos_emb_sin = torch.cat(rotary_pos_emb_sin) + window_index = torch.cat(window_index) + # compute reverse indices + reverse_indices = self.invert_permutation(window_index) + cu_window_seqlens = torch.cat(cu_window_seqlens) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + cu_seqlens = torch.cat(cu_seqlens) + cu_seqlens = torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) + + # transformers + # pre-compute seqlens for window/full attn to reduce cuMemcpy operations + max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen( + cu_seqlens) + max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen( + cu_window_seqlens) + + cu_seqlens = cu_seqlens.to( # type: ignore[attr-defined] + device=self.device, + non_blocking=True) + cu_window_seqlens = cu_window_seqlens.to( # type: ignore[attr-defined] + device=self.device, + non_blocking=True) + rotary_pos_emb_cos = rotary_pos_emb_cos.to( # type: ignore[attr-defined] + device=self.device, + non_blocking=True) + rotary_pos_emb_sin = rotary_pos_emb_sin.to( # type: ignore[attr-defined] + device=self.device, + non_blocking=True) + window_index = 
window_index.to( # type: ignore[attr-defined] + device=hidden_states.device, + non_blocking=True) + reverse_indices = reverse_indices.to(device=hidden_states.device, + non_blocking=True) + + hidden_states = hidden_states.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + + hidden_states = hidden_states.unsqueeze(1) + + for layer_num, blk in enumerate(self.blocks): + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + max_seqlen_now = max_seqlen_full + seqlens_now = seqlens_full + else: + cu_seqlens_now = cu_window_seqlens + max_seqlen_now = max_seqlen_window + seqlens_now = seqlens_window + + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens_now, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, + max_seqlen=max_seqlen_now, + seqlens=seqlens_now, + ) + + # For Qwen2.5-VL-3B, float16 will overflow at last block + # for long visual tokens sequences. 
+ if hidden_states.dtype == torch.float16: + hidden_states = cast_overflow_tensors(hidden_states) + + # adapter + hidden_states = self.merger(hidden_states) + hidden_states = hidden_states[reverse_indices, :] + return hidden_states + + +class AscendQwen2_5_VLForConditionalGeneration(nn.Module): + + def _process_image_input( + self, + image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"] + with set_ascend_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + else: + image_embeds = self.visual(pixel_values, + grid_thw=grid_thw_list) + + # Split concatenated embeddings for each image item. + merge_size = self.visual.spatial_merge_size + sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist() + return image_embeds.split(sizes) + + def _process_video_input( + self, + video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]: + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if video_input["type"] == "video_embeds": + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"] + with set_ascend_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d", + ) + else: + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw_list) + + # Split concatenated embeddings for each video item. 
+ merge_size = self.visual.spatial_merge_size + sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist() + return video_embeds.split(sizes) + + +# NOTE: This will be removed after MMEncoderAttention has been extracted as a CustomOp in vllm. +Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward + +# NOTE: These will be removed after https://github.com/vllm-project/vllm/pull/29388 is merged. +Qwen2_5_VLForConditionalGeneration._process_image_input = AscendQwen2_5_VLForConditionalGeneration._process_image_input +Qwen2_5_VLForConditionalGeneration._process_video_input = AscendQwen2_5_VLForConditionalGeneration._process_video_input + +# NOTE: These will be removed after vllm-ascend is aligned with vllm latest main. +Qwen2_5_VisionBlock.forward = AscendQwen2_5_VisionBlock.forward +Qwen2_5_VisionTransformer.__init__ = AscendQwen2_5_VisionTransformer.__init__ +Qwen2_5_VisionTransformer.rotary_pos_emb_thw = AscendQwen2_5_VisionTransformer.rotary_pos_emb_thw +Qwen2_5_VisionTransformer.get_rope_by_thw = AscendQwen2_5_VisionTransformer.get_rope_by_thw +Qwen2_5_VisionTransformer.forward = AscendQwen2_5_VisionTransformer.forward diff --git a/vllm_ascend/patch/worker/patch_rope.py b/vllm_ascend/patch/worker/patch_rope.py new file mode 100644 index 00000000..cb40af86 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_rope.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import torch.nn as nn +from vllm.model_executor.layers.rotary_embedding.base import \ + RotaryEmbeddingBase + + +class AscendRotaryEmbeddingBase(nn.Module): + + def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]: + cos_sin = self.cos_sin_cache[:seqlen] + cos, sin = cos_sin.chunk(2, dim=-1) + return cos, sin + + +# NOTE: These will be removed after vllm-ascend is aligned with vllm latest main. +RotaryEmbeddingBase.get_cos_sin = AscendRotaryEmbeddingBase.get_cos_sin