diff --git a/tests/ut/models/test_qwen2_5_vl.py b/tests/ut/models/test_qwen2_5_vl.py
index 06fb07d..7111aae 100644
--- a/tests/ut/models/test_qwen2_5_vl.py
+++ b/tests/ut/models/test_qwen2_5_vl.py
@@ -370,6 +370,10 @@ class TestAscendQwen2_5_VisionTransformer(PytestBase):
         mocker.patch("torch.nn.Module.__setattr__")
         mocker.patch("torch.nn.Module.__getattr__")
         mocker.patch("torch.nn.Module.__delattr__")
+        mocker.patch(
+            "torch_npu.npu_format_cast",
+            return_value=torch.rand((384, 300)),
+        )
         res = attention.pad_qkv_weight(torch.rand((300, 300)))
         assert res.shape == (384, 300)
 
@@ -378,6 +382,10 @@ class TestAscendQwen2_5_VisionTransformer(PytestBase):
         mocker.patch("torch.nn.Module.__setattr__")
         mocker.patch("torch.nn.Module.__getattr__")
         mocker.patch("torch.nn.Module.__delattr__")
+        mocker.patch(
+            "torch_npu.npu_format_cast",
+            return_value=torch.rand((300, 384)),
+        )
         res = attention.pad_proj_weight(torch.rand((300, 300)))
         assert res.shape == (300, 384)
 
diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py
index 33b4be2..b19ed87 100644
--- a/vllm_ascend/models/qwen2_5_vl.py
+++ b/vllm_ascend/models/qwen2_5_vl.py
@@ -42,6 +42,8 @@ from vllm.model_executor.models.qwen2_5_vl import (
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
+
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -281,6 +283,14 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
             [qkv_weight_first_half_padded, qkv_weight_second_half_padded],
             dim=2)
         qkv_weight_final = qkv_weight_padded.reshape(-1, self.hidden_size)
+
+        if is_enable_nz():
+            qkv_weight_final_copy = torch.empty_like(qkv_weight_final).copy_(
+                qkv_weight_final)
+            qkv_weight_final_copy = torch_npu.npu_format_cast(
+                qkv_weight_final_copy, ACL_FORMAT_FRACTAL_ND)
+            return qkv_weight_final_copy
+
         return qkv_weight_final
 
     def pad_proj_weight(self, data):
@@ -289,6 +299,13 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
                          self.half_origin_hidden_size_per_attention_head),
             (0, self.half_pad_hidden_size_per_attention_head, 0, 0)).reshape(
                 self.hidden_size, -1)
+
+        if is_enable_nz():
+            out_weight_copy = torch.empty_like(out_weight).copy_(out_weight)
+            out_weight_copy = torch_npu.npu_format_cast(
+                out_weight_copy, ACL_FORMAT_FRACTAL_ND)
+            return out_weight_copy
+
         return out_weight
 
     def pad_qkv_weight_scale_offset(self, data):
diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py
index b601b16..ccd4616 100644
--- a/vllm_ascend/models/qwen2_vl.py
+++ b/vllm_ascend/models/qwen2_vl.py
@@ -40,6 +40,8 @@ from vllm.model_executor.models.qwen2_vl import (
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
+
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -265,6 +267,14 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer):
             [qkv_weight_first_half_padded, qkv_weight_second_half_padded],
             dim=2)
         qkv_weight_final = qkv_weight_padded.reshape(-1, self.hidden_size)
+
+        if is_enable_nz():
+            qkv_weight_final_copy = torch.empty_like(qkv_weight_final).copy_(
+                qkv_weight_final)
+            qkv_weight_final_copy = torch_npu.npu_format_cast(
+                qkv_weight_final_copy, ACL_FORMAT_FRACTAL_ND)
+            return qkv_weight_final_copy
+
         return qkv_weight_final
 
     def pad_proj_weight(self, data):
@@ -273,6 +283,13 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer):
                          self.half_origin_hidden_size_per_attention_head),
             (0, self.half_pad_hidden_size_per_attention_head, 0, 0)).reshape(
                 self.hidden_size, -1)
+
+        if is_enable_nz():
+            out_weight_copy = torch.empty_like(out_weight).copy_(out_weight)
+            out_weight_copy = torch_npu.npu_format_cast(
+                out_weight_copy, ACL_FORMAT_FRACTAL_ND)
+            return out_weight_copy
+
         return out_weight
 
     def load_weights(self, weights: Iterable[Tuple[str,
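
All four changed methods gain the same pattern, so here is a minimal standalone sketch of it, assuming an Ascend NPU with torch_npu installed and the `is_enable_nz` / `ACL_FORMAT_FRACTAL_ND` imports this diff adds from vllm_ascend.utils; the helper name `cast_weight_to_nd` is hypothetical and not part of the patch.

# Sketch of the format-cast pattern shared by pad_qkv_weight and
# pad_proj_weight in both model files (the helper name is hypothetical).
import torch
import torch_npu

from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz


def cast_weight_to_nd(weight: torch.Tensor) -> torch.Tensor:
    # When NZ mode is enabled, first copy the padded weight into freshly
    # allocated storage so the format cast does not touch the original
    # view-based tensor, then cast the copy to ND format.
    if not is_enable_nz():
        return weight
    weight_copy = torch.empty_like(weight).copy_(weight)
    return torch_npu.npu_format_cast(weight_copy, ACL_FORMAT_FRACTAL_ND)

The unit-test changes mirror this: torch_npu.npu_format_cast only runs on an NPU, so the tests mock it to return a tensor of the already-padded shape, keeping the shape assertions valid off-device.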