v0.10.1rc1

2025-09-09 09:40:35 +08:00
parent d6f6ef41fe
commit 9149384e03
432 changed files with 84698 additions and 1 deletion


@@ -0,0 +1,104 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ----------------------------------------------------------------------------------
# This module manages the patches for vllm. There are two folders in this module:
# - platform: contains the patches applied before the worker starts. They are applied by
#   `vllm_ascend.utils.adapt_patch(is_global_patch=True)` in the
#   `vllm_ascend.platform.NPUPlatform.pre_register_and_update()` function.
# - worker: contains the patches applied when the worker starts. They are applied by
#   `vllm_ascend.utils.adapt_patch(is_global_patch=False)` in
#   each worker's `__init__` function.
#
# Within each kind of patch, there are three folders:
# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0.
# - patch_main: contains the patches applied when vllm version is main branch.
# - patch_common: contains the patches applied in both 0.10.0 and main branch.
#
# Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
# ----------------------------------------------------------------------------------
# What's Patched and how it works:
# --------------------------------
# * Platform Patch:
# =================
# ** File: platform/patch_common/patch_distributed.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.config.ParallelConfig.get_next_dp_init_port`
#    Why:
#      vllm doesn't support reading the data parallel port from an environment variable.
#    How:
#      Add logic to read the port from the environment.
#    Related PR (if no, explain why):
#      A PR to vllm is needed to support reading the port from the environment.
#    Future Plan:
#      Remove this patch once vllm merges the change.
#
# * Worker Patch:
# ===============
# ** File: worker/patch_common/patch_minicpm.py **
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward`
#    Why:
#      The forward function of MiniCPMAttention in vllm performs a dtype conversion
#      (original dtype --> float32) to ensure precision on CUDA. However, float32 is not
#      supported by the CANN rope op, so we keep this patch.
#    How:
#      Remove the dtype conversion operations in forward.
#    Related PR (if no, explain why):
#      No. This patch is NPU-only, due to the rope op limitation.
#    Future Plan:
#      Keep this patch in vllm-ascend.
#
# ** File: worker/patch_common/patch_distributed.py **
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.distributed.parallel_state.GroupCoordinator`
#    Why:
#      vllm doesn't support all_to_all for GroupCoordinator.
#    How:
#      Add an all_to_all implementation to GroupCoordinator.
#    Related PR (if no, explain why):
#      A PR to vllm is needed to support all_to_all for GroupCoordinator.
#    Future Plan:
#      Remove this patch once vllm merges the change.
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.v1.sample.sampler.Sampler.gather_logprobs`
#    Why:
#      We need to patch gather_logprobs to make sure batched_count_greater_than is called
#      with backend=current_platform.simple_compile_backend.
#    How:
#      Patch gather_logprobs to call the new batched_count_greater_than.
#    Related PR (if no, explain why):
#      - https://github.com/vllm-project/vllm/pull/21591
#    Future Plan:
#      Revert once vLLM merges #21591 and releases a new version.
# ** File: worker/patch_common/patch_linear.py **
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.model_executor.layers.linear.RowParallelLinear`
#    Why:
#      We need to fuse matmul and allreduce in `RowParallelLinear`
#      to improve performance.
#    How:
#      Create a new class `AscendRowParallelLinear` that inherits from `RowParallelLinear`.
#      In this class, override the `forward` method to use
#      torch_npu.npu_mm_all_reduce_base in place of the separate matmul and allreduce.
#    Related PR (if no, explain why):
#      - https://github.com/vllm-project/vllm-ascend/pull/1926
#    Future Plan:
#      Validate more models in all kinds of scenarios; if performance consistently improves,
#      we can enable this patch by default and remove the env variable
#      `VLLM_ASCEND_ENABLE_FUSE_MATMUL_ALLREDUCE` in the future.
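
The dispatch that applies these patches is intentionally thin. As a rough, hedged sketch of what
`adapt_patch` amounts to (the actual implementation lives in `vllm_ascend/utils.py`; only the
package layout described above is taken as given):

def adapt_patch(is_global_patch: bool = False) -> None:
    # Platform patches are applied once, before any worker starts; worker patches are
    # applied inside each worker process. Importing the sub-package runs its patch modules.
    if is_global_patch:
        from vllm_ascend.patch import platform  # noqa: F401
    else:
        from vllm_ascend.patch import worker  # noqa: F401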


@@ -0,0 +1,18 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from vllm_ascend.patch.platform import patch_common # noqa: F401
from vllm_ascend.patch.platform import patch_main # noqa: F401


@@ -0,0 +1,18 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import vllm_ascend.patch.platform.patch_common.patch_distributed # noqa


@@ -0,0 +1,115 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from vllm/model_executor/models/qwen2_vl.py
# This file is a part of the vllm-ascend project.
import torch
import vllm.envs as envs_vllm
from vllm.config import ParallelConfig

from vllm_ascend.utils import is_310p


def parallel_config_get_dp_port(self) -> int:
    """
    We might need to initialize process groups in multiple
    processes that is related to data parallelism,
    e.g. both in the worker and in the engine, which
    can live in different processes. To avoid port conflicts, we
    increment the port number each time we need to initialize a
    new process group related to data parallelism.
    """
    answer = self.data_parallel_master_port
    self.data_parallel_master_port += 1

    # NOTE: Get port from envs directly when using torchrun
    port = envs_vllm.VLLM_DP_MASTER_PORT if envs_vllm.VLLM_DP_MASTER_PORT else answer
    return port


ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port


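# The helpers below adapt collective communication for Ascend 310P: broadcast of device
# tensors and int64 all_reduce are emulated with all_gather plus a local select/reduce,
# since the corresponding native HCCL ops are not supported on 310P. NullHandle stands in
# for the work handle an async collective would otherwise return.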
class NullHandle:

    def __init__(self):
        pass

    def wait(self):
        pass


def communication_adaptation_310p():

    def broadcast310p_wrapper(fn):

        def broadcast310p(tensor, src, group=None, async_op=False):
            if tensor.device == torch.device('cpu'):
                return fn(tensor, src, group, async_op)
            rank = torch.distributed.get_rank(group)
            world_size = torch.distributed.get_world_size(group)
            tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
            tensor_list[rank] = tensor
            torch.distributed.all_gather(tensor_list, tensor, group=group)
            tensor[...] = tensor_list[src]
            if async_op:
                return NullHandle()
            else:
                return None

        return broadcast310p

    torch.distributed.broadcast = broadcast310p_wrapper(
        torch.distributed.broadcast)
    torch.distributed.distributed_c10d.broadcast = broadcast310p_wrapper(
        torch.distributed.distributed_c10d.broadcast)

    def all_reduce_wrapper_310p(fn):

        def all_reduce(
            tensor,
            op=torch.distributed.ReduceOp.SUM,
            group=None,
            async_op=False,
        ):
            if tensor.dtype != torch.int64:
                return fn(tensor, op, group, async_op)
            rank = torch.distributed.get_rank(group)
            world_size = torch.distributed.get_world_size(group)
            tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
            tensor_list[rank] = tensor
            torch.distributed.all_gather(tensor_list, tensor, group=group)
            if op == torch.distributed.ReduceOp.SUM:
                return torch.stack(tensor_list).sum(0)
            elif op == torch.distributed.ReduceOp.MAX:
                return torch.tensor(
                    torch.stack(tensor_list).cpu().numpy().max(0),
                    device=tensor.device,
                )
            else:
                raise RuntimeError(f"not implement op {op}")

        return all_reduce

    torch.distributed.all_reduce = all_reduce_wrapper_310p(
        torch.distributed.all_reduce)
    torch.distributed.distributed_c10d.all_reduce = all_reduce_wrapper_310p(
        torch.distributed.distributed_c10d.all_reduce)


if is_310p():
    communication_adaptation_310p()
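
A quick way to observe the effect of the port patch above (a minimal sketch, assuming vllm and
vllm-ascend are installed in an Ascend environment; `VLLM_DP_MASTER_PORT` is vllm's own
environment variable):

import os

os.environ["VLLM_DP_MASTER_PORT"] = "29999"  # pin the port, e.g. when launching via torchrun

from vllm.config import ParallelConfig
import vllm_ascend.patch.platform.patch_common.patch_distributed  # noqa: F401  (applies the patch)

cfg = ParallelConfig()
# Unpatched, each call returns an incrementing port; with the env variable set, the
# patched method returns the same pinned value every time.
assert cfg.get_next_dp_init_port() == cfg.get_next_dp_init_port() == 29999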


@@ -0,0 +1,16 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


@@ -0,0 +1,19 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vllm_ascend.patch.worker import patch_common # noqa: F401
from vllm_ascend.patch.worker import patch_main # noqa: F401


@@ -0,0 +1,22 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa
import vllm_ascend.patch.worker.patch_common.patch_linear # noqa
import vllm_ascend.patch.worker.patch_common.patch_logits # noqa
import vllm_ascend.patch.worker.patch_common.patch_lora_embedding # noqa
import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa


@@ -0,0 +1,49 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Optional

import torch
import vllm
from vllm.distributed.parallel_state import GroupCoordinator


class GroupCoordinatorPatch(GroupCoordinator):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def all_to_all(self,
                   input_: torch.Tensor,
                   scatter_dim: int = 0,
                   gather_dim: int = -1,
                   scatter_sizes: Optional[List[int]] = None,
                   gather_sizes: Optional[List[int]] = None) -> torch.Tensor:
        if self.world_size == 1:
            return input_
        assert -input_.dim() <= scatter_dim < input_.dim(), (
            f"Invalid scatter dim ({scatter_dim}) for input tensor with shape {input_.size()}"
        )
        assert -input_.dim() <= gather_dim < input_.dim(), (
            f"Invalid gather dim ({gather_dim}) for input tensor with shape {input_.size()}"
        )
        return self.device_communicator.all_to_all(input_, scatter_dim,
                                                   gather_dim, scatter_sizes,
                                                   gather_sizes)


vllm.distributed.parallel_state.GroupCoordinator = GroupCoordinatorPatch  # Note: check the GroupCoordinator with online serving
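
For readers unfamiliar with the collective, the snippet below illustrates on a single process,
with plain tensors, what an all-to-all with `scatter_dim`/`gather_dim` produces on each rank.
It mirrors the usual all_to_all semantics only; the actual communication (including uneven
`scatter_sizes`/`gather_sizes`) is delegated to `self.device_communicator` on the NPU:

import torch

world_size = 4
scatter_dim, gather_dim = 0, 1
# one input tensor per simulated rank, shape [world_size, 2]
inputs = [torch.full((world_size, 2), float(rank)) for rank in range(world_size)]

outputs = []
for dst in range(world_size):
    # rank `dst` receives the dst-th chunk (along scatter_dim) from every source rank
    received = [inputs[src].chunk(world_size, dim=scatter_dim)[dst]
                for src in range(world_size)]
    # ...and concatenates the received chunks along gather_dim
    outputs.append(torch.cat(received, dim=gather_dim))

print(outputs[0].shape)  # torch.Size([1, 8])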


@@ -0,0 +1,147 @@
"""
Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
This file is a part of the vllm-ascend project.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from typing import Optional, Union

import torch
import torch_npu
import vllm
from torch.distributed import ProcessGroup
from torch.nn.parameter import Parameter
from vllm.distributed import (get_tensor_model_parallel_rank,
                              split_tensor_along_last_dim)
from vllm.distributed.parallel_state import get_tp_group
from vllm.logger import logger
from vllm.model_executor.layers.linear import RowParallelLinear

import vllm_ascend.envs as envs_ascend

_HCOMM_INFO = None


class AscendRowParallelLinear(RowParallelLinear):
    """
    AscendRowParallelLinear is a custom implementation of RowParallelLinear
    that overrides the forward method to handle Ascend-specific operations.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the AscendRowParallelLinear layer.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        """
        tp_group = get_tp_group().device_group
        hcomm_info = self.get_hcomm_info(tp_group)
        self.hcomm_info = hcomm_info
        super().__init__(*args, **kwargs)
        self.weight_t = self.weight.t()

    @staticmethod
    def get_hcomm_info(group: ProcessGroup) -> str:
        """Get the HCCL communication information for the given group.

        Args:
            group (ProcessGroup): The process group for which to get the HCCL communication info.

        Returns:
            str: The HCCL communication name for the given group.
        """
        global _HCOMM_INFO
        if _HCOMM_INFO is not None:
            return _HCOMM_INFO

        rank = torch.distributed.get_rank(group)
        if torch.__version__ > "2.0":
            global_rank = torch.distributed.get_global_rank(group, rank)
            _HCOMM_INFO = group._get_backend(
                torch.device("npu")).get_hccl_comm_name(global_rank)
        else:
            _HCOMM_INFO = group.get_hccl_comm_name(rank)
        return _HCOMM_INFO

    def forward(
        self, input_: torch.Tensor
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
        """Forward pass for the AscendRowParallelLinear layer.

        Args:
            input_ (torch.Tensor): the input tensor to the layer.

        Returns:
            Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
                The output tensor after applying the linear transformation,
                and optionally the bias if `return_bias` is True.
        """
        input_parallel = self.calc_input(input_)

        # Matrix multiply.
        assert self.quant_method is not None
        # Only fuse bias add into GEMM for rank 0 (this ensures that
        # bias will not get added more than once in TP>1 case)
        output = self.calc_output(input_parallel)

        output_bias = self.bias if self.skip_bias_add else None
        if not self.return_bias:
            return output
        return output, output_bias

    def calc_input(self, input_: torch.Tensor) -> torch.Tensor:
        """Calculate the input tensor for parallel processing.

        Args:
            input_ (torch.Tensor): the input tensor to be processed.

        Returns:
            torch.Tensor: The input tensor split along the last dimension
                for tensor model parallelism, or the original input if not parallel.
        """
        if self.input_is_parallel:
            return input_
        tp_rank = get_tensor_model_parallel_rank()
        splitted_input = split_tensor_along_last_dim(
            input_, num_partitions=self.tp_size)
        return splitted_input[tp_rank].contiguous()

    def calc_output(self, input_parallel: torch.Tensor) -> torch.Tensor:
        """Calculate the output tensor of forward by considering
        fusing communication and computation.

        Args:
            input_parallel (torch.Tensor): the input tensor to be processed in parallel.

        Returns:
            torch.Tensor: the output tensor after applying the linear transformation
                and optionally handling communication between tensor model parallel ranks.
        """
        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
        if self.reduce_results and self.tp_size > 1:
            output = torch_npu.npu_mm_all_reduce_base(input_parallel,
                                                      self.weight_t,
                                                      self.hcomm_info,
                                                      bias=bias_)
        else:
            output = self.quant_method.apply(self, input_parallel, bias=bias_)
        return output


if envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE:
    logger.info("AscendRowParallelLinear: Matmul all-reduce is enabled.")
    vllm.model_executor.layers.linear.RowParallelLinear = AscendRowParallelLinear
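
To exercise the fused path, the swap above must happen before vllm constructs its layers, and it
is gated by the flag read from `vllm_ascend.envs`. A minimal sketch, assuming the process
environment variable carries the same name as the attribute used above and that an Ascend NPU
with HCCL is available (torch_npu.npu_mm_all_reduce_base only exists there):

import os

# Must be set before the patch module reads vllm_ascend.envs.
os.environ["VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE"] = "1"

import vllm_ascend.patch.worker.patch_common.patch_linear  # noqa: F401

from vllm.model_executor.layers import linear

# RowParallelLinear now resolves to the Ascend subclass that fuses matmul + all-reduce.
print(linear.RowParallelLinear.__name__)  # AscendRowParallelLinear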


@@ -0,0 +1,26 @@
import torch
import vllm
from vllm._custom_ops import apply_repetition_penalties_torch


def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
                               output_mask: torch.Tensor,
                               repetition_penalties: torch.Tensor) -> None:
    """Apply repetition penalties to logits in-place.

    Args:
        logits: The logits tensor of shape [num_seqs, vocab_size].
        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
        output_mask: A boolean tensor indicating which tokens appear in the output.
        repetition_penalties: The repetition penalties of shape (num_seqs, ).
    """
    apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
                                     repetition_penalties)


# NPU tensors report both is_cuda=True and is_npu=True, per the implementation in
# https://github.com/Ascend/pytorch/blob/863b9071cbdf47023c12c246e3efa9c6e2285fc6/torch_npu/npu/_stream_check.py#L74
# As a result, vLLM's apply_repetition_penalties takes the "if logits.is_cuda" branch and
# calls the custom op implemented in CUDA, which is not compatible with NPU.
# Reference: https://github.com/vllm-project/vllm/blob/f66673a39d9f364194c249f28098cad8a5584ccb/vllm/_custom_ops.py#L314
vllm._custom_ops.apply_repetition_penalties = apply_repetition_penalties
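
The torch fallback keeps the standard repetition-penalty rule. The self-contained snippet below
illustrates that rule on plain tensors (an illustration of the math only, not the vllm code path):

import torch

logits = torch.tensor([[2.0, -1.0, 0.5]])    # one sequence, vocabulary of 3 tokens
seen = torch.tensor([[True, True, False]])   # prompt_mask | output_mask
penalty = torch.tensor([2.0]).unsqueeze(1)   # repetition penalty per sequence

# Penalize only tokens already seen: positive logits are divided, negative ones multiplied.
penalized = torch.where(logits > 0, logits / penalty, logits * penalty)
logits = torch.where(seen, penalized, logits)
print(logits)  # tensor([[ 1.0000, -2.0000,  0.5000]])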


@@ -0,0 +1,29 @@
from typing import Optional

import vllm
from torch import nn
from transformers import PretrainedConfig
from vllm.config import LoRAConfig
from vllm.lora.layers import VocabParallelEmbeddingWithLoRA
from vllm.lora.utils import _all_lora_classes

from vllm_ascend.ops.vocab_parallel_embedding import \
    AscendVocabParallelEmbedding


class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA):

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: Optional[PretrainedConfig],
    ) -> bool:
        return type(source_layer) is AscendVocabParallelEmbedding


# Patch for lora register_model issue after overriding VocabParallelEmbedding class (#2515)
_all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA)
vllm.lora.utils._all_lora_classes = _all_lora_classes


@@ -0,0 +1,36 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
from vllm.model_executor.models.minicpm import MiniCPMAttention


def forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
) -> torch.Tensor:
    qkv, _ = self.qkv_proj(hidden_states)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
    q, k = self.rotary_emb(positions, q, k)
    attn_output = self.attn(q, k, v)
    output, _ = self.o_proj(attn_output)
    return output


# The type conversion in the forward function is deleted to support the rope operator.
MiniCPMAttention.forward = forward


@@ -0,0 +1,16 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#