[Feature]: implement the fusion of allreduce and matmul in prefill phase when tp is enabled (#1926)
### What this PR does / why we need it?
Currently, allreduce and matmul are executed separately in vLLM's `RowParallelLinear` forward function. This PR patches that forward path to use `torch_npu.npu_mm_all_reduce_base`, which executes the matmul and allreduce as a single fused kernel. This yields roughly a 20% performance improvement in eager mode.
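Conceptually, the fused path replaces the separate matmul and allreduce with one kernel call. The sketch below is a simplified illustration based on the patched `calc_output` later in this diff; the wrapper function name is invented for the example and is not part of the patch:

```python
from typing import Optional

import torch
import torch_npu  # Ascend extension; requires an NPU runtime


def fused_matmul_allreduce(input_parallel: torch.Tensor,
                           weight_t: torch.Tensor,
                           hcomm_info: str,
                           bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Unfused path (roughly what vLLM's RowParallelLinear.forward does today):
    #   out = quant_method.apply(layer, input_parallel, bias=bias)
    #   out = tensor_model_parallel_all_reduce(out)
    # Fused path: a single kernel performs the matmul and the HCCL allreduce.
    return torch_npu.npu_mm_all_reduce_base(input_parallel, weight_t,
                                            hcomm_info, bias=bias)
```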
### Does this PR introduce _any_ user-facing change?
This PR introduces a new environment variable, `VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE`, to control whether the feature is enabled.
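The flag is off by default. A minimal sketch of enabling it; note that `patch_linear.py` applies the patch at import time, so the variable must be set before vLLM Ascend's worker patches are imported:

```python
import os

# "1" enables the fused matmul + allreduce path, "0" (the default) disables it.
# Shell equivalent: export VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE=1
os.environ["VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE"] = "1"
```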
### How was this patch tested?
The patch is tested by adding a new unit-test file, `test_patch_linear.py`, to guard the patched behavior.
- vLLM version: v0.10.0
- vLLM main: 7728dd77bb
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
@@ -19,10 +19,6 @@ import pytest

from vllm_ascend.utils import adapt_patch, register_ascend_customop

# fused moe ops test will hit the infer_schema error, we need add the patch
# here to make the test pass.
import vllm_ascend.patch.worker.patch_common.patch_utils  # type: ignore[import]  # isort: skip # noqa


class TestBase(unittest.TestCase):

tests/ut/patch/worker/patch_common/test_patch_linear.py (new file, 167 lines)
@@ -0,0 +1,167 @@
from importlib import reload

import pytest
import torch
import vllm
from pytest_mock import MockerFixture

from tests.ut.base import PytestBase
from vllm_ascend import envs
from vllm_ascend.patch.worker.patch_common import patch_linear


class TestAscendRowParallelLinear(PytestBase):

    def init_row_parallel_linear(self, mocker: MockerFixture):
        mocker.patch(
            "vllm_ascend.patch.worker.patch_common.patch_linear.AscendRowParallelLinear.__init__",
            return_value=None,
        )
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
        return patch_linear.AscendRowParallelLinear(
            input_size=128,
            output_size=256,
        )

    @pytest.mark.parametrize(
        "version, expected",
        [
            ("1.0.0", 1),
            ("2.1.0", 1),
        ],
    )
    def test_get_hcomm_info(self, version, expected, mocker: MockerFixture):
        mock_group = mocker.MagicMock()
        backend = mocker.MagicMock()
        backend.get_hccl_comm_name = lambda x: x
        mock_group._get_backend = lambda x: backend
        mock_group.get_hccl_comm_name = lambda x: x
        mocker.patch("torch.distributed.get_rank", return_value=1)
        mocker.patch(
            "torch.distributed.get_global_rank",
            return_value=0,
        )
        mocker.patch("torch.__version__", new=version)
        hcomm_info = patch_linear.AscendRowParallelLinear.get_hcomm_info(
            mock_group)
        assert hcomm_info == expected

    @pytest.mark.parametrize(
        "skip_bias_add, return_bias, bias, expected",
        [
            (True, False, torch.tensor(1.0), torch.tensor(14.0)),
            (False, True, torch.tensor(1.0), (torch.tensor(14.0), None)),
            (
                True,
                True,
                torch.tensor(1.0),
                (torch.tensor(14.0), torch.tensor(1.0)),
            ),
        ],
    )
    def test_forward(
        self,
        skip_bias_add,
        return_bias,
        bias,
        expected,
        mocker: MockerFixture,
    ):
        mocker_tp_group = mocker.MagicMock()
        mocker_tp_group.device_group = mocker.MagicMock()
        row_parallel_linear = self.init_row_parallel_linear(mocker)
        row_parallel_linear.__dict__["tp_rank"] = 0
        row_parallel_linear.__dict__["skip_bias_add"] = skip_bias_add
        row_parallel_linear.__dict__["return_bias"] = return_bias
        row_parallel_linear.__dict__["bias"] = bias
        row_parallel_linear.__dict__["quant_method"] = mocker.MagicMock()
        row_parallel_linear.__dict__["calc_input"] = lambda x: x  # noqa
        row_parallel_linear.__dict__[
            "calc_output"] = lambda x: x.matmul(  # noqa
                torch.tensor([1.0, 2.0]))
        ret = row_parallel_linear.forward(torch.tensor([10.0, 2.0]))
        if isinstance(ret, tuple):
            assert torch.allclose(ret[0], expected[0])
            if ret[1] is None:
                assert ret[1] == expected[1]
            else:
                assert torch.allclose(ret[1], expected[1])
        else:
            assert torch.allclose(ret, expected)

    @pytest.mark.parametrize(
        "input_is_parallel, expected",
        [
            (True, torch.tensor([10.0, 2.0])),
            (False, torch.tensor([10.0])),
        ],
    )
    def test_calc_input(
        self,
        input_is_parallel,
        expected,
        mocker: MockerFixture,
    ):
        row_parallel_linear = self.init_row_parallel_linear(mocker)
        row_parallel_linear.__dict__["input_is_parallel"] = input_is_parallel
        input_tensor = torch.Tensor([10, 2])
        mocker.patch(
            "vllm_ascend.patch.worker.patch_common.patch_linear.get_tensor_model_parallel_rank",  # noqa
            return_value=0,
        )
        mocker.patch(
            "vllm_ascend.patch.worker.patch_common.patch_linear.split_tensor_along_last_dim",  # noqa
            return_value=[torch.Tensor([10]),
                          torch.Tensor([2])],
        )
        input_parallel = row_parallel_linear.calc_input(input_tensor)
        assert torch.allclose(input_parallel, expected)

    @pytest.mark.parametrize(
        "reduce_results, tp_size, expected",
        [
            (True, 2, torch.tensor(56.0)),
            (True, 1, torch.tensor(14.0)),
            (False, 2, torch.tensor(14.0)),
        ],
    )
    def test_calc_output(
        self,
        reduce_results,
        tp_size,
        expected,
        mocker: MockerFixture,
    ):
        quant_method = mocker.MagicMock()
        quant_method.apply = lambda self, x, bias=None: x.matmul(  # noqa
            torch.tensor([1.0, 2.0]))
        row_parallel_linear = self.init_row_parallel_linear(mocker)
        row_parallel_linear.__dict__["reduce_results"] = reduce_results
        row_parallel_linear.__dict__["tp_size"] = tp_size
        row_parallel_linear.__dict__["quant_method"] = quant_method
        row_parallel_linear.__dict__["tp_rank"] = 0
        row_parallel_linear.__dict__["get_hcomm_info"] = lambda x: None  # noqa

        mocker.patch(
            "vllm_ascend.patch.worker.patch_common.patch_linear.get_tp_group",
            return_value=mocker.MagicMock(device_group=mocker.MagicMock()),
        )
        mocker.patch(
            "torch_npu.npu_mm_all_reduce_base",
            side_effect=lambda input_, weight, hccl_info, bias: input_.matmul(  # noqa
                torch.tensor([4.0, 8.0])),
        )
        ret = row_parallel_linear.calc_output(torch.tensor([10.0, 2.0]))
        assert torch.allclose(ret, expected)

    def test_enable_allreduce_matmul(self, mocker: MockerFixture):
        mocker.patch.object(envs,
                            "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE",
                            new=True)
        reload(patch_linear)
        assert envs.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE
        assert id(vllm.model_executor.layers.linear.RowParallelLinear) == id(
            patch_linear.AscendRowParallelLinear)

@@ -154,7 +154,11 @@ env_variables: Dict[str, Callable[[], Any]] = {
     # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
     # and the mla_pa will be the default path of deepseek decode path.
     "VLLM_ASCEND_MLA_PA":
-    lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0))
+    lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0)),
+    # Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled.
+    # this feature is supported in A2, and eager mode will get better performance.
+    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
+    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))),
 }

 # end-env-vars-definition

@@ -114,3 +114,19 @@
#   - https://github.com/vllm-project/vllm/pull/21591
#   Future Plan:
#   Revert it when vLLM merge #21591 and release new version
# ** File: worker/patch_common/patch_linear.py **
#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#   1. `vllm.model_executor.layers.linear.RowParallelLinear`
#    Why:
#       We need to fuse matmul and allreduce in `RowParallelLinear`
#       to improve performance.
#    How:
#       Create a new class `AscendRowParallelLinear` that inherits from `RowParallelLinear`.
#       In this class, we override the `forward` method to use
#       torch_npu.npu_mm_all_reduce_base to replace matmul and allreduce.
#    Related PR (if no, explain why):
#       - https://github.com/vllm-project/vllm-ascend/pull/1926
#    Future Plan:
#       Validate more models in all kinds of scenarios; if performance is always
#       improved, we can enable this patch by default and remove the env variable
#       `VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE` in the future.

@@ -19,5 +19,6 @@
# patch files.
import vllm_ascend.patch.worker.patch_common.patch_utils  # noqa isort:skip
import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
import vllm_ascend.patch.worker.patch_common.patch_linear  # noqa
import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa
import vllm_ascend.patch.worker.patch_common.patch_sampler  # noqa

vllm_ascend/patch/worker/patch_common/patch_linear.py (new file, 145 lines)
@@ -0,0 +1,145 @@
"""
Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
This file is a part of the vllm-ascend project.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from typing import Optional, Union

import torch
import torch_npu
import vllm
from torch.distributed import ProcessGroup
from torch.nn.parameter import Parameter
from vllm.distributed import (get_tensor_model_parallel_rank,
                              split_tensor_along_last_dim)
from vllm.distributed.parallel_state import get_tp_group
from vllm.model_executor.layers.linear import RowParallelLinear

from vllm_ascend import envs

_HCOMM_INFO = None


class AscendRowParallelLinear(RowParallelLinear):
    """
    AscendRowParallelLinear is a custom implementation of RowParallelLinear
    that overrides the forward method to handle Ascend-specific operations.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the AscendRowParallelLinear layer.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        """
        tp_group = get_tp_group().device_group
        hcomm_info = self.get_hcomm_info(tp_group)
        self.hcomm_info = hcomm_info
        super().__init__(*args, **kwargs)
        self.weight_t = self.weight.t()

    @staticmethod
    def get_hcomm_info(group: ProcessGroup) -> str:
        """Get the HCCL communication information for the given group.

        Args:
            group (ProcessGroup): The process group for which to get the HCCL communication info.

        Returns:
            str: The HCCL communication name for the given group.
        """
        global _HCOMM_INFO
        if _HCOMM_INFO is not None:
            return _HCOMM_INFO

        rank = torch.distributed.get_rank(group)
        if torch.__version__ > "2.0":
            global_rank = torch.distributed.get_global_rank(group, rank)
            _HCOMM_INFO = group._get_backend(
                torch.device("npu")).get_hccl_comm_name(global_rank)

        else:
            _HCOMM_INFO = group.get_hccl_comm_name(rank)
        return _HCOMM_INFO

    def forward(
        self, input_: torch.Tensor
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
        """Forward pass for the AscendRowParallelLinear layer.

        Args:
            input_ (torch.Tensor): the input tensor to the layer.

        Returns:
            Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
                The output tensor after applying the linear transformation,
                and optionally the bias if `return_bias` is True.
        """
        input_parallel = self.calc_input(input_)

        # Matrix multiply.
        assert self.quant_method is not None
        # Only fuse bias add into GEMM for rank 0 (this ensures that
        # bias will not get added more than once in TP>1 case)
        output = self.calc_output(input_parallel)

        output_bias = self.bias if self.skip_bias_add else None

        if not self.return_bias:
            return output
        return output, output_bias

    def calc_input(self, input_: torch.Tensor) -> torch.Tensor:
        """Calculate the input tensor for parallel processing.

        Args:
            input_ (torch.Tensor): the input tensor to be processed.

        Returns:
            torch.Tensor: The input tensor split along the last dimension
            for tensor model parallelism, or the original input if not parallel.
        """
        if self.input_is_parallel:
            return input_
        tp_rank = get_tensor_model_parallel_rank()
        splitted_input = split_tensor_along_last_dim(
            input_, num_partitions=self.tp_size)
        return splitted_input[tp_rank].contiguous()

    def calc_output(self, input_parallel: torch.Tensor) -> torch.Tensor:
        """Calculate the output tensor of forward by considering
        fusing communication and computation.

        Args:
            input_parallel (torch.Tensor): the input tensor to be processed in parallel.

        Returns:
            torch.Tensor: the output tensor after applying the linear transformation
            and optionally handling communication between tensor model parallel ranks.
        """
        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
        if self.reduce_results and self.tp_size > 1:
            output = torch_npu.npu_mm_all_reduce_base(input_parallel,
                                                      self.weight_t,
                                                      self.hcomm_info,
                                                      bias=bias_)
        else:
            output = self.quant_method.apply(self, input_parallel, bias=bias_)
        return output


if envs.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE:
    vllm.model_executor.layers.linear.RowParallelLinear = AscendRowParallelLinear