v0.10.1rc1
19  vllm_ascend/patch/worker/__init__.py  Normal file
@@ -0,0 +1,19 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from vllm_ascend.patch.worker import patch_common  # noqa: F401
from vllm_ascend.patch.worker import patch_main  # noqa: F401
22  vllm_ascend/patch/worker/patch_common/__init__.py  Normal file
@@ -0,0 +1,22 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
import vllm_ascend.patch.worker.patch_common.patch_linear  # noqa
import vllm_ascend.patch.worker.patch_common.patch_logits  # noqa
import vllm_ascend.patch.worker.patch_common.patch_lora_embedding  # noqa
import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa
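These patch modules take effect purely through import side effects: each one rebinds a vLLM symbol at import time, so importing the package is enough to activate every worker-side patch. A minimal sketch of how a worker process would pick them up (assuming torch_npu and the Ascend toolchain are available):

    # Importing the package applies all of the monkey patches listed above
    # as a side effect; nothing else needs to be called.
    import vllm_ascend.patch.worker  # noqa: F401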
49  vllm_ascend/patch/worker/patch_common/patch_distributed.py  Normal file
@@ -0,0 +1,49 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import List, Optional

import torch
import vllm
from vllm.distributed.parallel_state import GroupCoordinator


class GroupCoordinatorPatch(GroupCoordinator):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def all_to_all(self,
                   input_: torch.Tensor,
                   scatter_dim: int = 0,
                   gather_dim: int = -1,
                   scatter_sizes: Optional[List[int]] = None,
                   gather_sizes: Optional[List[int]] = None) -> torch.Tensor:
        if self.world_size == 1:
            return input_
        assert -input_.dim() <= scatter_dim < input_.dim(), (
            f"Invalid scatter dim ({scatter_dim}) for input tensor with shape {input_.size()}"
        )
        assert -input_.dim() <= gather_dim < input_.dim(), (
            f"Invalid gather dim ({gather_dim}) for input tensor with shape {input_.size()}"
        )
        return self.device_communicator.all_to_all(input_, scatter_dim,
                                                   gather_dim, scatter_sizes,
                                                   gather_sizes)


vllm.distributed.parallel_state.GroupCoordinator = GroupCoordinatorPatch  # Note: check the GroupCoordinator with online serving
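GroupCoordinatorPatch adds an all_to_all collective that short-circuits for single-rank groups and otherwise defers to the device communicator. A usage sketch, assuming an initialized tensor-parallel group on NPU (the tensor shape and dims here are illustrative only):

    import torch
    from vllm.distributed.parallel_state import get_tp_group

    tp_group = get_tp_group()  # an instance of the patched GroupCoordinator
    x = torch.randn(8, 16, device="npu")
    # Scatter x along dim 0 across the TP ranks and gather along the last dim.
    y = tp_group.all_to_all(x, scatter_dim=0, gather_dim=-1)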
147  vllm_ascend/patch/worker/patch_common/patch_linear.py  Normal file
@@ -0,0 +1,147 @@
"""
Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
This file is a part of the vllm-ascend project.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from typing import Optional, Union

import torch
import torch_npu
import vllm
from torch.distributed import ProcessGroup
from torch.nn.parameter import Parameter
from vllm.distributed import (get_tensor_model_parallel_rank,
                              split_tensor_along_last_dim)
from vllm.distributed.parallel_state import get_tp_group
from vllm.logger import logger
from vllm.model_executor.layers.linear import RowParallelLinear

import vllm_ascend.envs as envs_ascend

_HCOMM_INFO = None


class AscendRowParallelLinear(RowParallelLinear):
    """
    AscendRowParallelLinear is a custom implementation of RowParallelLinear
    that overrides the forward method to handle Ascend-specific operations.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the AscendRowParallelLinear layer.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        """
        tp_group = get_tp_group().device_group
        hcomm_info = self.get_hcomm_info(tp_group)
        self.hcomm_info = hcomm_info
        super().__init__(*args, **kwargs)
        self.weight_t = self.weight.t()

    @staticmethod
    def get_hcomm_info(group: ProcessGroup) -> str:
        """Get the HCCL communication information for the given group.

        Args:
            group (ProcessGroup): The process group for which to get the HCCL communication info.

        Returns:
            str: The HCCL communication name for the given group.
        """
        global _HCOMM_INFO
        if _HCOMM_INFO is not None:
            return _HCOMM_INFO

        rank = torch.distributed.get_rank(group)
        if torch.__version__ > "2.0":
            global_rank = torch.distributed.get_global_rank(group, rank)
            _HCOMM_INFO = group._get_backend(
                torch.device("npu")).get_hccl_comm_name(global_rank)
        else:
            _HCOMM_INFO = group.get_hccl_comm_name(rank)
        return _HCOMM_INFO

    def forward(
        self, input_: torch.Tensor
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
        """Forward pass for the AscendRowParallelLinear layer.

        Args:
            input_ (torch.Tensor): the input tensor to the layer.

        Returns:
            Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
                The output tensor after applying the linear transformation,
                and optionally the bias if `return_bias` is True.
        """
        input_parallel = self.calc_input(input_)

        # Matrix multiply.
        assert self.quant_method is not None
        # Only fuse bias add into GEMM for rank 0 (this ensures that
        # bias will not get added more than once in TP>1 case)
        output = self.calc_output(input_parallel)

        output_bias = self.bias if self.skip_bias_add else None

        if not self.return_bias:
            return output
        return output, output_bias

    def calc_input(self, input_: torch.Tensor) -> torch.Tensor:
        """Calculate the input tensor for parallel processing.

        Args:
            input_ (torch.Tensor): the input tensor to be processed.

        Returns:
            torch.Tensor: The input tensor split along the last dimension
            for tensor model parallelism, or the original input if already parallel.
        """
        if self.input_is_parallel:
            return input_
        tp_rank = get_tensor_model_parallel_rank()
        splitted_input = split_tensor_along_last_dim(
            input_, num_partitions=self.tp_size)
        return splitted_input[tp_rank].contiguous()

    def calc_output(self, input_parallel: torch.Tensor) -> torch.Tensor:
        """Calculate the output tensor of forward by considering
        fusing communication and computation.

        Args:
            input_parallel (torch.Tensor): the input tensor to be processed in parallel.

        Returns:
            torch.Tensor: the output tensor after applying the linear transformation
            and optionally handling communication between tensor model parallel ranks.
        """
        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
        if self.reduce_results and self.tp_size > 1:
            output = torch_npu.npu_mm_all_reduce_base(input_parallel,
                                                      self.weight_t,
                                                      self.hcomm_info,
                                                      bias=bias_)
        else:
            output = self.quant_method.apply(self, input_parallel, bias=bias_)
        return output


if envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE:
    logger.info("AscendRowParallelLinear: Matmul all-reduce is enabled.")
    vllm.model_executor.layers.linear.RowParallelLinear = AscendRowParallelLinear
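The swap of RowParallelLinear is gated on the VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE flag from vllm_ascend.envs, so the fused npu_mm_all_reduce_base path is strictly opt-in. A hedged example of enabling it; treating "1" as a truthy value is an assumption here, and the accepted values are defined in vllm_ascend/envs.py:

    import os

    # Must be set before vllm_ascend.patch.worker is imported, because the class
    # replacement happens at module import time.
    os.environ["VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE"] = "1"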
26  vllm_ascend/patch/worker/patch_common/patch_logits.py  Normal file
@@ -0,0 +1,26 @@
import torch
import vllm
from vllm._custom_ops import apply_repetition_penalties_torch


def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
                               output_mask: torch.Tensor,
                               repetition_penalties: torch.Tensor) -> None:
    """Apply repetition penalties to logits in-place.

    Args:
        logits: The logits tensor of shape [num_seqs, vocab_size].
        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
        output_mask: A boolean tensor indicating which tokens appear in the output.
        repetition_penalties: The repetition penalties of shape (num_seqs, ).
    """
    apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
                                     repetition_penalties)


# NPU device type tensors have the attributes is_cuda=True and is_npu=True, according to the implementation in
# https://github.com/Ascend/pytorch/blob/863b9071cbdf47023c12c246e3efa9c6e2285fc6/torch_npu/npu/_stream_check.py#L74
# This causes vLLM's apply_repetition_penalties function to take the "if logits.is_cuda" branch and
# call the custom op implemented in CUDA, which is not compatible with NPU.
# Reference: https://github.com/vllm-project/vllm/blob/f66673a39d9f364194c249f28098cad8a5584ccb/vllm/_custom_ops.py#L314
vllm._custom_ops.apply_repetition_penalties = apply_repetition_penalties
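For context, a rough reference of what the pure-torch fallback computes; this is an illustrative sketch of the standard repetition-penalty rule, not the actual body of apply_repetition_penalties_torch:

    import torch

    def repetition_penalty_reference(logits: torch.Tensor, prompt_mask: torch.Tensor,
                                     output_mask: torch.Tensor,
                                     penalties: torch.Tensor) -> None:
        # Tokens seen in the prompt or the output get their positive logits divided
        # and their non-positive logits multiplied by the per-sequence penalty.
        seen = prompt_mask | output_mask
        penalties = penalties.unsqueeze(1).expand_as(logits)
        pos = seen & (logits > 0)
        neg = seen & (logits <= 0)
        logits[pos] /= penalties[pos]
        logits[neg] *= penalties[neg]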
29  vllm_ascend/patch/worker/patch_common/patch_lora_embedding.py  Normal file
@@ -0,0 +1,29 @@
from typing import Optional

import vllm
from torch import nn
from transformers import PretrainedConfig
from vllm.config import LoRAConfig
from vllm.lora.layers import VocabParallelEmbeddingWithLoRA
from vllm.lora.utils import _all_lora_classes

from vllm_ascend.ops.vocab_parallel_embedding import \
    AscendVocabParallelEmbedding


class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA):

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: Optional[PretrainedConfig],
    ) -> bool:
        return type(source_layer) is AscendVocabParallelEmbedding


# Patch for lora register_model issue after overriding VocabParallelEmbedding class (#2515)
_all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA)
vllm.lora.utils._all_lora_classes = _all_lora_classes
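A quick sanity check (illustrative only) that the registration took effect, assuming the patch module can be imported in the current environment:

    from vllm.lora.utils import _all_lora_classes
    from vllm_ascend.patch.worker.patch_common.patch_lora_embedding import \
        AscendVocabParallelEmbeddingWithLoRA

    assert AscendVocabParallelEmbeddingWithLoRA in _all_lora_classes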
36  vllm_ascend/patch/worker/patch_common/patch_minicpm.py  Normal file
@@ -0,0 +1,36 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import torch
from vllm.model_executor.models.minicpm import MiniCPMAttention


def forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
) -> torch.Tensor:
    qkv, _ = self.qkv_proj(hidden_states)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
    q, k = self.rotary_emb(positions, q, k)
    attn_output = self.attn(q, k, v)
    output, _ = self.o_proj(attn_output)
    return output


# The type conversion in the forward function is deleted to support the rope operator.
MiniCPMAttention.forward = forward
16  vllm_ascend/patch/worker/patch_main/__init__.py  Normal file
@@ -0,0 +1,16 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#