[Patch] Remove the patch of MiniCPM (#5975)
### What this PR does / why we need it?
Part of #5304.
After https://github.com/vllm-project/vllm/pull/32523 was merged, we can
remove the patch of `MiniCPMAttention`.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Tested locally.
- vLLM version: v0.13.0
- vLLM main: 2c24bc6996
---------
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
@@ -1,77 +0,0 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

from unittest.mock import MagicMock

import torch

from tests.ut.base import TestBase
from vllm_ascend.patch.worker.patch_minicpm import forward


class TestPatchMiniCPM(TestBase):

    def setUp(self):
        self.mock_self = MagicMock()

        self.mock_self.q_size = 128
        self.mock_self.kv_size = 128

        self.mock_self.qkv_proj = MagicMock()
        self.mock_self.rotary_emb = MagicMock()
        self.mock_self.attn = MagicMock()
        self.mock_self.o_proj = MagicMock()

        self.positions = torch.tensor([1, 2, 3])
        self.hidden_states = torch.randn(3, 256)

        self.mock_qkv = torch.randn(3, 384)
        self.mock_q = self.mock_qkv[:, :128]
        self.mock_k = self.mock_qkv[:, 128:256]
        self.mock_v = self.mock_qkv[:, 256:]

        self.mock_self.qkv_proj.return_value = (self.mock_qkv, None)
        self.mock_self.rotary_emb.return_value = (self.mock_q, self.mock_k)
        self.mock_self.attn.return_value = torch.randn(3, 256)
        self.mock_self.o_proj.return_value = (torch.randn(3, 256), None)

    def test_forward_patched(self):
        from vllm.model_executor.models.minicpm import MiniCPMAttention

        self.assertIs(MiniCPMAttention.forward, forward)

    def test_forward_function(self):
        result = forward(self.mock_self, self.positions, self.hidden_states)

        self.mock_self.qkv_proj.assert_called_once_with(self.hidden_states)

        args, _ = self.mock_self.rotary_emb.call_args
        self.assertEqual(len(args), 3)
        self.assertTrue(torch.equal(args[0], self.positions))
        self.assertTrue(torch.equal(args[1], self.mock_q))
        self.assertTrue(torch.equal(args[2], self.mock_k))

        args, _ = self.mock_self.attn.call_args
        self.assertEqual(len(args), 3)
        self.assertTrue(torch.equal(args[0], self.mock_q))
        self.assertTrue(torch.equal(args[1], self.mock_k))
        self.assertTrue(torch.equal(args[2], self.mock_v))

        self.mock_self.o_proj.assert_called_once_with(
            self.mock_self.attn.return_value)

        self.assertEqual(result.shape, (3, 256))
        self.assertTrue(
            torch.equal(result, self.mock_self.o_proj.return_value[0]))
@@ -112,20 +112,6 @@
#    Remove this patch when the refactor of all2all manager is done.
#    Remove this patch when vLLM supports all_reduce as a custom op.
#
# ** 2. File: worker/patch_minicpm.py **
#    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#    1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward`
#       Why:
#       The forward func of MiniCPMAttention in vLLM does a dtype conversion
#       (original dtype --> float32) to ensure precision on CUDA.
#       However, float32 is not supported by the CANN rope op, so we keep this patch.
#       How:
#       Removed the dtype conversion operations in forward.
#       Related PR (if no, explain why):
#       No, NPU-only due to the rope op.
#       Future Plan:
#       Keep this patch in vllm-ascend.
#
# ** 3. File: worker/patch_multimodal_merge.py **
#    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#    1. `vllm.model_executor.models.utils._merge_multimodal_embeddings`
@@ -26,7 +26,6 @@ import vllm_ascend.patch.worker.patch_unquantized_gemm # noqa
import vllm_ascend.patch.worker.patch_bert  # noqa
import vllm_ascend.patch.worker.patch_distributed  # noqa
import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
import vllm_ascend.patch.worker.patch_minicpm  # noqa
import vllm_ascend.patch.worker.patch_rope  # noqa
import vllm_ascend.patch.worker.patch_qwen3_next  # noqa
import vllm_ascend.patch.worker.patch_qwen3_next_mtp  # noqa
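For context, these worker patch modules take effect purely through import side effects: importing a patch module runs a module-level assignment that swaps in the replacement function, which is why the imports are kept with `# noqa` even though the names are never referenced. Removing the `patch_minicpm` import here, together with deleting the module below, fully disables the patch. A minimal, self-contained sketch of that pattern (`ToyAttention` and `patched_forward` are illustrative stand-ins, not vLLM or vllm-ascend symbols):

```python
# Minimal sketch of monkey-patching by import side effect.
# ToyAttention / patched_forward are hypothetical names for illustration only.


class ToyAttention:
    def forward(self, x):
        return x + 1  # original behaviour


def patched_forward(self, x):
    return x + 2  # replacement behaviour


# In vllm-ascend this assignment sits at module level, so simply importing
# the patch module (as in the __init__.py hunk above) applies the patch.
ToyAttention.forward = patched_forward

if __name__ == "__main__":
    assert ToyAttention().forward(1) == 3  # the patched method is in effect
```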
@@ -1,36 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import torch
from vllm.model_executor.models.minicpm import MiniCPMAttention


def forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
) -> torch.Tensor:
    qkv, _ = self.qkv_proj(hidden_states)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
    q, k = self.rotary_emb(positions, q, k)
    attn_output = self.attn(q, k, v)
    output, _ = self.o_proj(attn_output)
    return output


# The type conversion in the forward function is deleted to support the rope operator.
MiniCPMAttention.forward = forward
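For contrast with the deleted module above, the patch notes in the earlier hunk describe the behaviour this patch worked around: the upstream vLLM forward casts q and k to float32 around the rotary embedding for precision on CUDA, which the CANN rope op does not support. The sketch below is a rough paraphrase of that dtype round-trip under those assumptions, not the exact vLLM source:

```python
# Rough sketch of the dtype round-trip described in the patch notes above.
# Paraphrased for illustration; not the exact upstream vLLM implementation.
import torch


def upstream_style_forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
) -> torch.Tensor:
    qkv, _ = self.qkv_proj(hidden_states)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
    orig_dtype = q.dtype
    # Upstream casts to float32 before rope to preserve precision on CUDA;
    # the CANN rope op cannot take float32, so vllm-ascend's patch dropped this.
    q, k = q.float(), k.float()
    q, k = self.rotary_emb(positions, q, k)
    q, k = q.to(orig_dtype), k.to(orig_dtype)
    attn_output = self.attn(q, k, v)
    output, _ = self.o_proj(attn_output)
    return output
```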