From b7aa511daa57da2f8198ea2955113ceca428f7dc Mon Sep 17 00:00:00 2001
From: Canlin Guo <canlinguosdu@gmail.com>
Date: Mon, 9 Feb 2026 14:07:44 +0800
Subject: [PATCH] [Patch] Remove the patch of MiniCPM (#5975)

### What this PR does / why we need it?

Part of #5304.

Now that https://github.com/vllm-project/vllm/pull/32523 has been merged,
we can remove the `MiniCPMAttention` patch.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Tested locally.

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2c24bc6996cb165fce92f780b388a5e39b3f4060

---------

Signed-off-by: gcanlin <canlinguosdu@gmail.com>
---
 .../worker/patch_common/test_patch_minicpm.py | 77 -------------------
 vllm_ascend/patch/__init__.py                 | 14 ----
 vllm_ascend/patch/worker/__init__.py          |  1 -
 vllm_ascend/patch/worker/patch_minicpm.py     | 36 ---------
 4 files changed, 128 deletions(-)
 delete mode 100644 tests/ut/patch/worker/patch_common/test_patch_minicpm.py
 delete mode 100644 vllm_ascend/patch/worker/patch_minicpm.py

diff --git a/tests/ut/patch/worker/patch_common/test_patch_minicpm.py b/tests/ut/patch/worker/patch_common/test_patch_minicpm.py
deleted file mode 100644
index 9a63d0ea..00000000
--- a/tests/ut/patch/worker/patch_common/test_patch_minicpm.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-
-from unittest.mock import MagicMock
-
-import torch
-
-from tests.ut.base import TestBase
-from vllm_ascend.patch.worker.patch_minicpm import forward
-
-
-class TestPatchMiniCPM(TestBase):
-
-    def setUp(self):
-        self.mock_self = MagicMock()
-
-        self.mock_self.q_size = 128
-        self.mock_self.kv_size = 128
-
-        self.mock_self.qkv_proj = MagicMock()
-        self.mock_self.rotary_emb = MagicMock()
-        self.mock_self.attn = MagicMock()
-        self.mock_self.o_proj = MagicMock()
-
-        self.positions = torch.tensor([1, 2, 3])
-        self.hidden_states = torch.randn(3, 256)
-
-        self.mock_qkv = torch.randn(3, 384)
-        self.mock_q = self.mock_qkv[:, :128]
-        self.mock_k = self.mock_qkv[:, 128:256]
-        self.mock_v = self.mock_qkv[:, 256:]
-
-        self.mock_self.qkv_proj.return_value = (self.mock_qkv, None)
-        self.mock_self.rotary_emb.return_value = (self.mock_q, self.mock_k)
-        self.mock_self.attn.return_value = torch.randn(3, 256)
-        self.mock_self.o_proj.return_value = (torch.randn(3, 256), None)
-
-    def test_forward_patched(self):
-        from vllm.model_executor.models.minicpm import MiniCPMAttention
-
-        self.assertIs(MiniCPMAttention.forward, forward)
-
-    def test_forward_function(self):
-        result = forward(self.mock_self, self.positions, self.hidden_states)
-
-        self.mock_self.qkv_proj.assert_called_once_with(self.hidden_states)
-
-        args, _ = self.mock_self.rotary_emb.call_args
-        self.assertEqual(len(args), 3)
-        self.assertTrue(torch.equal(args[0], self.positions))
-        self.assertTrue(torch.equal(args[1], self.mock_q))
-        self.assertTrue(torch.equal(args[2], self.mock_k))
-
-        args, _ = self.mock_self.attn.call_args
-        self.assertEqual(len(args), 3)
-        self.assertTrue(torch.equal(args[0], self.mock_q))
-        self.assertTrue(torch.equal(args[1], self.mock_k))
-        self.assertTrue(torch.equal(args[2], self.mock_v))
-
-        self.mock_self.o_proj.assert_called_once_with(
-            self.mock_self.attn.return_value)
-
-        self.assertEqual(result.shape, (3, 256))
-        self.assertTrue(
-            torch.equal(result, self.mock_self.o_proj.return_value[0]))
diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index b7010b73..a5a32dd1 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -112,20 +112,6 @@
 #       Remove this patch when the refactor of all2all manager is done.
 #       Remove this patch when vLLM support all_reduce as customop.
 #
-# ** 2. File: worker/patch_minicpm.py **
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#   1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward`
-#    Why:
-#       The forward func of MiniCPMAttention in vllm do a datatype convert
-#       (original datatype --> float32) to ensure the precision on cuda.
-#       However float32 is not supported in cann rope op, thus we keep this patch
-#    How：
-#       Removed the dtype convert operations in forward
-#    Related PR (if no, explain why):
-#       NO, only for npu due to rope op.
-#    Future Plan:
-#       Keep this patch in vllm-ascend.
-#
 # ** 3. File: worker/patch_multimodal_merge.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.utils._merge_multimodal_embeddings`
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index 2fd0498f..1eac4d0c 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -26,7 +26,6 @@ import vllm_ascend.patch.worker.patch_unquantized_gemm  # noqa
 import vllm_ascend.patch.worker.patch_bert  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
-import vllm_ascend.patch.worker.patch_minicpm  # noqa
 import vllm_ascend.patch.worker.patch_rope  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp  # noqa
diff --git a/vllm_ascend/patch/worker/patch_minicpm.py b/vllm_ascend/patch/worker/patch_minicpm.py
deleted file mode 100644
index 663a08a5..00000000
--- a/vllm_ascend/patch/worker/patch_minicpm.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-from vllm.model_executor.models.minicpm import MiniCPMAttention
-
-
-def forward(
-    self,
-    positions: torch.Tensor,
-    hidden_states: torch.Tensor,
-) -> torch.Tensor:
-    qkv, _ = self.qkv_proj(hidden_states)
-    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-    q, k = self.rotary_emb(positions, q, k)
-    attn_output = self.attn(q, k, v)
-    output, _ = self.o_proj(attn_output)
-    return output
-
-
-# The type conversion in the forward function is deleted to support the rope operator.
-MiniCPMAttention.forward = forward