Refactor tensor_parallel and comm_utils (#2814)

### What this PR does / why we need it? 1. Move ops/comm_utils to ops/moe/comm_utils 2. Move distributed/tensor_parallel/gather_from_sequence_parallel_region to ops/moe/comm_utils 3. Delete distributed/tensor_parallel ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? e2e & ut - vLLM version: main - vLLM main: a1213fae5f --------- Signed-off-by: wuweiqiang24 <1005334931@qq.com> Signed-off-by: wuweiqiang24 <wuweiqiang11@huawei.com>
2025-09-11 21:26:36 +08:00
parent 0005479b9c
commit 9615dea3a7
6 changed files with 153 additions and 392 deletions
--- a/tests/ut/distributed/test_distributed_tensor_parallel.py
+++ b/tests/ut/distributed/test_distributed_tensor_parallel.py
@@ -1,139 +0,0 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 import importlib
 import pytest
 import torch
 from pytest_mock import MockerFixture
 from tests.ut.base import PytestBase
 from vllm_ascend.distributed.tensor_parallel import (
    _gather_along_first_dim, _gather_along_last_dim,
    _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim,
    all_to_all_hp2sp, all_to_all_sp2hp)
 class TestDistributedCommunication(PytestBase):
    @pytest.fixture(autouse=True)
    def context(self, mocker: MockerFixture):
        mocker.patch("torch.npu.current_device", return_value="cpu")
        mocker.patch("torch.distributed.get_world_size", return_value=4)
        mocker.patch("torch.distributed.get_rank", return_value=0)
    @pytest.mark.parametrize("world_size, test_tensor, expected",
                             [(1, torch.randn(8, 16), (8, 16)),
                              (4, torch.randn(8, 16), (32, 16))])
    def test_gather_along_first_dim(self, test_tensor, expected, world_size,
                                    mocker: MockerFixture):
        """test _gather_along_first_dim"""
        mocker.patch("torch.distributed.get_world_size",
                     return_value=world_size)
        result = _gather_along_first_dim(test_tensor, mocker.MagicMock())
        assert result.shape == expected
    @pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [
        (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)),
    ])
    def test_gather_along_first_dim_unequal_split(self, test_tensor, expected,
                                                  output_split_sizes,
                                                  mocker: MockerFixture):
        """test _gather_along_first_dim"""
        result = _gather_along_first_dim(test_tensor, mocker.MagicMock(),
                                         output_split_sizes)
        assert result.shape == expected
    @pytest.mark.parametrize("world_size, test_tensor, expected",
                             [(1, torch.randn(8, 16, 32), (8, 16, 32)),
                              (4, torch.randn(8, 16, 32), (8, 16, 32 * 4))])
    def test_gather_along_last_dim(self, test_tensor, expected, world_size,
                                   mocker: MockerFixture):
        """test _gather_along_last_dim"""
        mocker.patch("torch.distributed.get_world_size",
                     return_value=world_size)
        result = _gather_along_last_dim(test_tensor, mocker.MagicMock())
        assert result.shape == expected
    @pytest.mark.parametrize("input_shape,expected_shape", [
        ((32, 16), (8, 16)),
        ((40, 10), (10, 10)),
    ])
    def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape,
                                            mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = _reduce_scatter_along_first_dim(input_tensor,
                                                 mocker.MagicMock())
        assert result.shape == expected_shape
    @pytest.mark.parametrize("input_shape,expected_shape", [
        ((8, 16, 32), (8, 16, 8)),
    ])
    def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape,
                                           mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = _reduce_scatter_along_last_dim(input_tensor,
                                                mocker.MagicMock())
        assert result.shape == expected_shape
    @pytest.mark.parametrize("func,input_shape,expected_shape", [
        ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32),
         (8, 16, 128)),
        ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)),
        ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32),
         (8, 16, 8)),
        ("gather_from_sequence_parallel_region", (8, 16), (32, 16)),
    ])
    def test_wrapper_functions(self, func, input_shape, expected_shape,
                               mocker: MockerFixture):
        """test wrapper funcs"""
        mod = importlib.import_module(
            'vllm_ascend.distributed.tensor_parallel')
        globals = mod.__dict__
        test_func = globals[func]
        input_tensor = torch.randn(*input_shape)
        result = test_func(input_tensor, mocker.MagicMock())
        assert result.shape == expected_shape
    @pytest.mark.parametrize(
        "input_shape,output_shape",
        [
            ((8, 16), (32, 4)),  # [num_tokens/TP, H] -> [num_tokens, H/TP]
        ])
    def test_all_to_all_sp2hp(self, input_shape, output_shape,
                              mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = all_to_all_sp2hp(input_tensor, mocker.MagicMock())
        assert result.shape == output_shape
    @pytest.mark.parametrize(
        "input_shape,output_shape",
        [
            ((32, 4), (8, 16)),  # [num_tokens, H/TP] -> [num_tokens/TP, H]
        ])
    def test_all_to_all_hp2sp(self, input_shape, output_shape,
                              mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = all_to_all_hp2sp(input_tensor, mocker.MagicMock())
        assert result.shape == output_shape
--- a/tests/ut/ops/test_comm_utils.py
+++ b/tests/ut/ops/test_comm_utils.py
@@ -0,0 +1,98 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 import pytest
 import torch
 from pytest_mock import MockerFixture
 from tests.ut.base import PytestBase
 from vllm_ascend.ops.moe.comm_utils import (
    _gather_along_first_dim, async_all_to_all,
    gather_from_sequence_parallel_region)
 class TestDistributedCommunication(PytestBase):
    @pytest.fixture(autouse=True)
    def context(self, mocker: MockerFixture):
        mocker.patch("torch.npu.current_device", return_value="cpu")
        mocker.patch("torch.distributed.get_world_size", return_value=4)
        mocker.patch("torch.distributed.get_rank", return_value=0)
    @pytest.mark.parametrize(
        "input_tensor, output_split_sizes, input_split_sizes",
        [(torch.randn(8, 16), [2, 2, 2, 2], [2, 2, 2, 2]),
         (torch.randn(16, 32), None, None)])
    def test_async_all_to_all(self, input_tensor, output_split_sizes,
                              input_split_sizes, mocker: MockerFixture):
        """Test async_all_to_all"""
        mock_group = mocker.MagicMock()
        mocker.patch("torch.distributed.all_to_all_single",
                     return_value=mocker.MagicMock())
        _, a2a_out, handle = async_all_to_all(input_tensor, output_split_sizes,
                                              input_split_sizes, mock_group)
        # Check if the output tensor is created properly
        if output_split_sizes is None:
            assert a2a_out.shape == input_tensor.shape
        else:
            total_output_size = sum(output_split_sizes)
            expected_shape = [total_output_size] + list(
                input_tensor.size())[1:]
            assert a2a_out.shape == torch.Size(expected_shape)
        # Ensure handle is returned from async operation
        assert handle is not None
        assert isinstance(handle, mocker.MagicMock)
    @pytest.mark.parametrize("world_size, test_tensor, expected",
                             [(1, torch.randn(8, 16), (8, 16)),
                              (4, torch.randn(8, 16), (32, 16))])
    def test_gather_along_first_dim(self, test_tensor, expected, world_size,
                                    mocker: MockerFixture):
        """Test _gather_along_first_dim"""
        mocker.patch("torch.distributed.get_world_size",
                     return_value=world_size)
        result = _gather_along_first_dim(test_tensor, mocker.MagicMock())
        assert result.shape == expected
    @pytest.mark.parametrize("input_tensor, output_split_sizes",
                             [(torch.randn(8, 16), None),
                              (torch.randn(8, 16), [2, 2, 2, 2])])
    def test_gather_from_sequence_parallel_region(self, input_tensor,
                                                  output_split_sizes,
                                                  mocker: MockerFixture):
        """Test gather_from_sequence_parallel_region"""
        mock_group = mocker.MagicMock()
        result = gather_from_sequence_parallel_region(input_tensor, mock_group,
                                                      output_split_sizes)
        # If output_split_sizes is not provided, result should have expanded first dimension by world size
        if output_split_sizes is None:
            expected_shape = [input_tensor.shape[0] * 4] + list(
                input_tensor.shape[1:])
            assert result.shape == torch.Size(expected_shape)
        else:
            # If output_split_sizes is provided, result shape is dictated by sum of output_split_sizes
            expected_shape = [sum(output_split_sizes)] + list(
                input_tensor.shape[1:])
            assert result.shape == torch.Size(expected_shape)
--- a/tests/ut/ops/test_token_dispatcher.py
+++ b/tests/ut/ops/test_token_dispatcher.py
@@ -348,7 +348,7 @@ class TestTokenDispatcherWithAll2AllV(TestBase):
        self.mock_npu_moe_token_unpermute.return_value = torch.randn(8, 16)
        # Mock async_all_to_all
-        patcher6 = patch('vllm_ascend.ops.comm_utils.async_all_to_all')
+        patcher6 = patch('vllm_ascend.ops.moe.comm_utils.async_all_to_all')
        self.mock_async_all_to_all = patcher6.start()
        self.addCleanup(patcher6.stop)
        self.mock_async_all_to_all.return_value = (None, torch.randn(16, 16),
--- a/vllm_ascend/distributed/tensor_parallel.py
+++ b/vllm_ascend/distributed/tensor_parallel.py
@@ -1,248 +0,0 @@
 # Copyright (c) 2024; NVIDIA CORPORATION. All rights reserved.
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Adapts from: Megatron/megatron/core/tensor_parallel/mappings.py.
 # This file is a part of the vllm-ascend project.
 import torch
 def _gather_along_first_dim(input_, group, output_split_sizes=None):
    """Gather tensors and concatenate along the first dimension.
    Args:
        input_tensor (torch.Tensor):
            A tensor to be gathered.
        output_split_sizes (List[int], optional):
            A list specifying the sizes of the output splits along the first dimension.
            If None, equal splitting is assumed. Default: None.
    Returns:
        torch.Tensor: Gathered tensor.
    """
    world_size = torch.distributed.get_world_size(group)
    # Bypass the function if we are using only 1 GPU.
    if world_size == 1:
        return input_
    dim_size = list(input_.size())
    if output_split_sizes is None:
        dim_size[0] = dim_size[0] * world_size
        output = torch.empty(dim_size,
                             dtype=input_.dtype,
                             device=torch.npu.current_device())
        torch.distributed.all_gather_into_tensor(output,
                                                 input_.contiguous(),
                                                 group=group)
    else:
        dim_size[0] = sum(output_split_sizes)
        output = torch.empty(dim_size,
                             dtype=input_.dtype,
                             device=torch.npu.current_device())
        output_tensor_list = list(
            torch.split(output, output_split_sizes, dim=0))
        torch.distributed.all_gather(output_tensor_list, input_, group=group)
    return output
 def _gather_along_last_dim(input_, group):
    """Gather tensors and concatenate along the last dimension."""
    world_size = torch.distributed.get_world_size(group)
    # Bypass the function if we are using only 1 GPU.
    if world_size == 1:
        return input_
    dim_size = list(input_.size())
    dim_size[0] = dim_size[0] * world_size
    output = torch.empty(dim_size,
                         dtype=input_.dtype,
                         device=torch.npu.current_device())
    torch.distributed.all_gather_into_tensor(output,
                                             input_.contiguous(),
                                             group=group)
    tensor_list = output.chunk(world_size, dim=0)
    output = torch.cat(tensor_list, dim=-1).contiguous()
    return output
 def _reduce_scatter_along_first_dim(input_,
                                    group,
                                    input_split_sizes=None,
                                    use_global_buffer=False):
    """Reduce-scatter the input tensor across model parallel group.
    Args:
        input_ (torch.Tensor): The input tensor to be reduce-scattered.
        input_split_sizes (List[int], optional): A list specifying the sizes of
            the input splits along the first dimension for each rank. If None,
            equal splitting is assumed. Default: None.
    """
    world_size = torch.distributed.get_world_size(group)
    # Bypass the function if we are using only 1 GPU.
    if world_size == 1:
        return input_
    if input_split_sizes is None:
        dim_size = list(input_.size())
        assert (
            dim_size[0] % world_size == 0
        ), "First dimension of the tensor should be divisible by tensor parallel size"
        dim_size[0] = dim_size[0] // world_size
        output = torch.empty(dim_size,
                             dtype=input_.dtype,
                             device=torch.npu.current_device())
        torch.distributed.reduce_scatter_tensor(output,
                                                input_.contiguous(),
                                                group=group)
    else:
        rank = torch.distributed.get_rank(group)
        input_tensor_list = list(torch.split(input_, input_split_sizes, dim=0))
        output = torch.empty_like(input_tensor_list[rank])
        torch.distributed.reduce_scatter(output,
                                         input_tensor_list,
                                         group=group)
    return output
 def _reduce_scatter_along_last_dim(input_, group):
    """Reduce-scatter tensors on the last dimension."""
    world_size = torch.distributed.get_world_size(group)
    target_shape = list(input_.size())
    target_shape[-1] = target_shape[-1] // world_size
    input_ = input_.reshape(-1, input_.shape[-1])
    split_tensors = torch.split(input_,
                                split_size_or_sections=input_.shape[-1] //
                                world_size,
                                dim=1)
    concat_tensor = torch.cat(split_tensors, dim=0)
    output = _reduce_scatter_along_first_dim(concat_tensor,
                                             group).reshape(target_shape)
    return output
 def all_gather_last_dim_from_tensor_parallel_region(input_, group):
    """Wrapper for autograd function: forward: AG, backward RS <last dim>"""
    return _gather_along_last_dim(input_, group)
 def reduce_scatter_to_sequence_parallel_region(input_,
                                               group,
                                               input_split_sizes=None):
    """Wrapper for autograd function: forward: RS, backward AG <first dim>"""
    return _reduce_scatter_along_first_dim(input_, group, input_split_sizes)
 def reduce_scatter_last_dim_to_tensor_parallel_region(input_, group):
    """Wrapper for autograd function: forward: RS, backward AG: AG <last dim>"""
    return _reduce_scatter_along_last_dim(input_, group)
 def gather_from_sequence_parallel_region(
    input_,
    group,
    output_split_sizes=None,
 ):
    """Wrapper for autograd function: forward: AG, backward: RS <first dim>"""
    return _gather_along_first_dim(input_, group, output_split_sizes)
 def all_to_all(group, input, output_split_sizes=None, input_split_sizes=None):
    world_size = torch.distributed.get_world_size(group=group)
    # Bypass the function if we are using only 1 GPU.
    if world_size == 1:
        return input
    input = input.contiguous()
    if output_split_sizes is None:
        # Equal split (all2all)
        output = torch.empty_like(input)
    else:
        # Unequal split (all2all-v)
        output = input.new_empty(
            size=[sum(output_split_sizes)] + list(input.size()[1:]),
            dtype=input.dtype,
            device=torch.npu.current_device(),
        )
    torch.distributed.all_to_all_single(
        output,
        input,
        output_split_sizes=output_split_sizes,
        input_split_sizes=input_split_sizes,
        group=group,
    )
    return output
 def all_to_all_sp2hp(input_, group):
    """
    Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape
    [num_tokens/TP, H] to [num_tokens, H/TP].
    Args:
        input_ (torch.Tensor):
            The input tensor which has been distributed along the sequence
            dimension.
    Returns:
        torch.Tensor: The output tensor with shape [num_tokens, H/TP].
    """
    if group is None:
        return input_
    world_size = torch.distributed.get_world_size(group=group)
    tp_group = group
    input_ = input_.reshape(-1, input_.shape[-1])
    split_tensors = torch.split(input_,
                                split_size_or_sections=input_.shape[-1] //
                                world_size,
                                dim=1)
    concat_tensor = torch.cat(split_tensors, dim=0)
    output = all_to_all(tp_group, concat_tensor)
    return output
 def all_to_all_hp2sp(input_, group):
    """
    Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape
    [num_tokens, H/TP] to [num_tokens/TP, H].
    Args:
        input_ (torch.Tensor):
            The input tensor which has been distributed along the hidden
            dimension.
    Returns:
        torch.Tensor: The output tensor with shape [num_tokens/TP, H].
    """
    if group is None:
        return input_
    world_size = torch.distributed.get_world_size(group=group)
    input_ = input_.reshape(-1, input_.shape[-1])
    tp_group = group
    input_exchanged = all_to_all(tp_group, input_)
    input_reshaped = input_exchanged.reshape(-1, input_exchanged.shape[-1])
    split_tensors = torch.split(
        input_reshaped,
        split_size_or_sections=input_reshaped.shape[0] // world_size,
        dim=0)
    output = torch.cat(split_tensors, dim=-1)
    return output
--- a/vllm_ascend/ops/moe/comm_utils.py
+++ b/vllm_ascend/ops/moe/comm_utils.py
@@ -1,5 +1,7 @@
 # Copyright (c) 2024; NVIDIA CORPORATION. All rights reserved.
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # Copyright 2023 The vLLM team.
 # This file is a part of the vllm-ascend project.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +14,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# This file is a part of the vllm-ascend project.
+#
 import torch
 import torch.distributed
 import torch.distributed as dist
@@ -60,3 +62,52 @@ def async_all_to_all(input_,
                                        group=group,
                                        async_op=True)
    return input_, a2a_out, handle
 def _gather_along_first_dim(input_, group, output_split_sizes=None):
    """Gather tensors and concatenate along the first dimension.
    Args:
        input_tensor (torch.Tensor):
            A tensor to be gathered.
        output_split_sizes (List[int], optional):
            A list specifying the sizes of the output splits along the first dimension.
            If None, equal splitting is assumed. Default: None.
    Returns:
        torch.Tensor: Gathered tensor.
    """
    world_size = torch.distributed.get_world_size(group)
    # Bypass the function if we are using only 1 GPU.
    if world_size == 1:
        return input_
    dim_size = list(input_.size())
    if output_split_sizes is None:
        dim_size[0] = dim_size[0] * world_size
        output = torch.empty(dim_size,
                             dtype=input_.dtype,
                             device=torch.npu.current_device())
        torch.distributed.all_gather_into_tensor(output,
                                                 input_.contiguous(),
                                                 group=group)
    else:
        dim_size[0] = sum(output_split_sizes)
        output = torch.empty(dim_size,
                             dtype=input_.dtype,
                             device=torch.npu.current_device())
        output_tensor_list = list(
            torch.split(output, output_split_sizes, dim=0))
        torch.distributed.all_gather(output_tensor_list, input_, group=group)
    return output
 def gather_from_sequence_parallel_region(
    input_,
    group,
    output_split_sizes=None,
 ):
    """Wrapper for autograd function: forward: AG, backward: RS <first dim>"""
    return _gather_along_first_dim(input_, group, output_split_sizes)
--- a/vllm_ascend/ops/moe/token_dispatcher.py
+++ b/vllm_ascend/ops/moe/token_dispatcher.py
@@ -30,9 +30,8 @@ from vllm.distributed.parallel_state import get_ep_group
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.distributed.tensor_parallel import \
+from vllm_ascend.ops.moe.comm_utils import (
-    gather_from_sequence_parallel_region
+    async_all_to_all, gather_from_sequence_parallel_region)
 from vllm_ascend.ops.comm_utils import async_all_to_all
 from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version
 _Dispatchers: Dict[str, Any] = {}