[main][Feature] MoE alltoallv communication optimization for unquantized RL training scenarios (#2088)

This change is ported from the 0.9.1-dev branch: [0.9.1][Feature] MoE alltoallv communication optimization for unquantized RL training scenarios & alltoallv support dpo (#1547) - vLLM version: v0.10.0 - vLLM main: 97608dc276 --------- Signed-off-by: weijinqian_v1 <weijinqian@huawei.com> Signed-off-by: whx-sjtu <2952154980@qq.com> Signed-off-by: curryliu <120010041@link.cuhk.edu.cn> Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: ChenTaoyu-SJTU <ctynb@qq.com> Signed-off-by: taoxudonghaha <justsheldon@163.com> Signed-off-by: shen-shanshan <467638484@qq.com> Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com> Signed-off-by: leo-pony <nengjunma@outlook.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: weijinqian_v1 <weijinqian@huawei.com> Co-authored-by: whx <56632993+whx-sjtu@users.noreply.github.com> Co-authored-by: curryliu <99582471+Irving11-BKN@users.noreply.github.com> Co-authored-by: Li Wang <wangli858794774@gmail.com> Co-authored-by: TaoYu Chen <ctynb@qq.com> Co-authored-by: taoxudonghaha <justsheldon@163.com> Co-authored-by: Shanshan Shen <467638484@qq.com> Co-authored-by: leo-pony <nengjunma@outlook.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-08-02 09:49:10 +08:00
parent f0c1f0c828
commit 6e00aed4d5
14 changed files with 1265 additions and 17 deletions
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -157,6 +157,28 @@ def test_models_distributed_topk() -> None:
        vllm_model.generate(example_prompts, sampling_params)


+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": "1"})
+def test_models_distributed_alltoallv() -> None:
+    """Smoke-test generation with VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ set to "1".
+
+    Loads DeepSeek-V2-Lite with tensor parallel size 2 on the "mp" executor
+    backend and generates five tokens per prompt. The test makes no
+    assertions on the generated text; it passes if generation completes.
+    """
+    decode_params = SamplingParams(max_tokens=5,
+                                   temperature=0.0,
+                                   top_k=50,
+                                   top_p=0.9)
+    prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    runner_kwargs = {
+        "dtype": "half",
+        "tensor_parallel_size": 2,
+        "distributed_executor_backend": "mp",
+    }
+
+    with VllmRunner("deepseek-ai/DeepSeek-V2-Lite",
+                    **runner_kwargs) as runner:
+        runner.generate(prompts, decode_params)
+
+
 def test_models_distributed_Qwen3_W8A8():
    example_prompts = [
        "Hello, my name is",
--- a/tests/ut/distributed/test_distributed_tensor_parallel.py
+++ b/tests/ut/distributed/test_distributed_tensor_parallel.py
@@ -0,0 +1,139 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+
+import importlib
+
+import pytest
+import torch
+from pytest_mock import MockerFixture
+
+from tests.ut.base import PytestBase
+from vllm_ascend.distributed.tensor_parallel import (
+    _gather_along_first_dim, _gather_along_last_dim,
+    _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim,
+    all_to_all_hp2sp, all_to_all_sp2hp)
+
+
+class TestDistributedCommunication(PytestBase):
+    """Shape-level unit tests for vllm_ascend.distributed.tensor_parallel.
+
+    Each test calls one helper with a random CPU tensor and a MagicMock as
+    the second argument (presumably the process group), then asserts only on
+    the shape of the returned tensor.
+    """
+
+    @pytest.fixture(autouse=True)
+    def context(self, mocker: MockerFixture):
+        """Patch device and distributed queries for every test in the class.
+
+        The world size defaults to 4 and the rank to 0; individual tests may
+        re-patch ``torch.distributed.get_world_size``.
+        """
+        mocker.patch("torch.npu.current_device", return_value="cpu")
+        mocker.patch("torch.distributed.get_world_size", return_value=4)
+
+        mocker.patch("torch.distributed.get_rank", return_value=0)
+
+    @pytest.mark.parametrize("world_size, test_tensor, expected",
+                             [(1, torch.randn(8, 16), (8, 16)),
+                              (4, torch.randn(8, 16), (32, 16))])
+    def test_gather_along_first_dim(self, test_tensor, expected, world_size,
+                                    mocker: MockerFixture):
+        """_gather_along_first_dim scales dim 0 by the patched world size."""
+        mocker.patch("torch.distributed.get_world_size",
+                     return_value=world_size)
+
+        result = _gather_along_first_dim(test_tensor, mocker.MagicMock())
+
+        assert result.shape == expected
+
+    @pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [
+        (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)),
+    ])
+    def test_gather_along_first_dim_unequal_split(self, test_tensor, expected,
+                                                  output_split_sizes,
+                                                  mocker: MockerFixture):
+        """_gather_along_first_dim with explicit, unequal output_split_sizes.
+
+        The expected first dimension (32) equals the sum of the split sizes.
+        """
+
+        result = _gather_along_first_dim(test_tensor, mocker.MagicMock(),
+                                         output_split_sizes)
+
+        assert result.shape == expected
+
+    @pytest.mark.parametrize("world_size, test_tensor, expected",
+                             [(1, torch.randn(8, 16, 32), (8, 16, 32)),
+                              (4, torch.randn(8, 16, 32), (8, 16, 32 * 4))])
+    def test_gather_along_last_dim(self, test_tensor, expected, world_size,
+                                   mocker: MockerFixture):
+        """_gather_along_last_dim scales the last dim by the world size."""
+        mocker.patch("torch.distributed.get_world_size",
+                     return_value=world_size)
+
+        result = _gather_along_last_dim(test_tensor, mocker.MagicMock())
+
+        assert result.shape == expected
+
+    @pytest.mark.parametrize("input_shape,expected_shape", [
+        ((32, 16), (8, 16)),
+        ((40, 10), (10, 10)),
+    ])
+    def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape,
+                                            mocker: MockerFixture):
+        """_reduce_scatter_along_first_dim divides dim 0 by the world size.
+
+        The world size is 4, as patched by the ``context`` fixture.
+        """
+        input_tensor = torch.randn(*input_shape)
+        result = _reduce_scatter_along_first_dim(input_tensor,
+                                                 mocker.MagicMock())
+        assert result.shape == expected_shape
+
+    @pytest.mark.parametrize("input_shape,expected_shape", [
+        ((8, 16, 32), (8, 16, 8)),
+    ])
+    def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape,
+                                           mocker: MockerFixture):
+        """_reduce_scatter_along_last_dim divides the last dim by the world size.
+
+        The world size is 4, as patched by the ``context`` fixture.
+        """
+        input_tensor = torch.randn(*input_shape)
+        result = _reduce_scatter_along_last_dim(input_tensor,
+                                                mocker.MagicMock())
+        assert result.shape == expected_shape
+
+    @pytest.mark.parametrize("func,input_shape,expected_shape", [
+        ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32),
+         (8, 16, 128)),
+        ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)),
+        ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32),
+         (8, 16, 8)),
+        ("gather_from_sequence_parallel_region", (8, 16), (32, 16)),
+    ])
+    def test_wrapper_functions(self, func, input_shape, expected_shape,
+                               mocker: MockerFixture):
+        """Public wrapper functions, looked up by name, return expected shapes."""
+        mod = importlib.import_module(
+            'vllm_ascend.distributed.tensor_parallel')
+        # Resolve the wrapper by attribute lookup; avoids shadowing the
+        # builtin ``globals`` with a local variable.
+        test_func = getattr(mod, func)
+        input_tensor = torch.randn(*input_shape)
+        result = test_func(input_tensor, mocker.MagicMock())
+        assert result.shape == expected_shape
+
+    @pytest.mark.parametrize(
+        "input_shape,output_shape",
+        [
+            ((8, 16), (32, 4)),  # [num_tokens/TP, H] -> [num_tokens, H/TP]
+        ])
+    def test_all_to_all_sp2hp(self, input_shape, output_shape,
+                              mocker: MockerFixture):
+        """all_to_all_sp2hp maps an (8, 16) input to a (32, 4) output."""
+        input_tensor = torch.randn(*input_shape)
+        result = all_to_all_sp2hp(input_tensor, mocker.MagicMock())
+        assert result.shape == output_shape
+
+    @pytest.mark.parametrize(
+        "input_shape,output_shape",
+        [
+            ((32, 4), (8, 16)),  # [num_tokens, H/TP] -> [num_tokens/TP, H]
+        ])
+    def test_all_to_all_hp2sp(self, input_shape, output_shape,
+                              mocker: MockerFixture):
+        """all_to_all_hp2sp maps a (32, 4) input to an (8, 16) output."""
+        input_tensor = torch.randn(*input_shape)
+        result = all_to_all_hp2sp(input_tensor, mocker.MagicMock())
+        assert result.shape == output_shape
--- a/tests/ut/ops/test_token_dispatcher.py
+++ b/tests/ut/ops/test_token_dispatcher.py
@@ -0,0 +1,65 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+
+import pytest
+from pytest_mock import MockerFixture
+
+from tests.ut.base import PytestBase
+from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
+    MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig)
+from vllm_ascend.utils import adapt_patch  # noqa E402
+
+
+class TestMoEAlltoAllSeqOverLapDispatcher(PytestBase):
+    """Construction test for MoEAlltoAllSeqOverLapDispatcher.
+
+    The expert-parallel group lookup and the NPU device/stream calls are
+    mocked before the dispatcher is constructed.
+    """
+
+    @pytest.fixture
+    def config(self):
+        """Build a dispatcher config: 2 local experts of 4, router top-k 2."""
+        config = MoEDispatcherConfig()
+        config.set_num_local_experts(2)
+        config.set_num_moe_experts(4)
+        config.set_moe_pad_expert_input_to_capacity(False)
+        config.set_moe_expert_capacity_factor(None)
+        config.set_moe_router_topk(2)
+        config.set_moe_grouped_gemm(False)
+        config.set_group_topk(0)
+        config.set_num_groups(1)
+        config.set_is_fused(False)
+        return config.build()
+
+    def mock_ep_group(self, mocker):
+        """Return a MagicMock EP group: rank 0 in a group of world size 2."""
+        mock_group = mocker.MagicMock()
+        mock_group.rank_in_group = 0
+        mock_group.world_size = 2
+        mock_group.device_group = "mock_group"
+        return mock_group
+
+    @pytest.fixture
+    def dispatcher(self, config, mocker: MockerFixture):
+        """Construct the dispatcher with get_ep_group and torch.npu patched."""
+        mocker.patch(
+            "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group",
+            return_value=self.mock_ep_group(mocker))
+        mocker.patch("torch.npu.current_device", return_value="cpu")
+        # Return a mock *instance* so that creating a stream yields a
+        # stream-like object rather than the MagicMock class itself.
+        mocker.patch("torch.npu.Stream", return_value=mocker.MagicMock())
+        return MoEAlltoAllSeqOverLapDispatcher(config)
+
+    def test_initialization(self, dispatcher, config):
+        """The dispatcher mirrors the config and the mocked EP group."""
+        assert dispatcher.num_local_experts == config.num_local_experts
+        assert dispatcher.num_experts == config.num_moe_experts
+        assert dispatcher.local_expert_indices == [0, 1]
+        assert dispatcher.ep_rank == 0
+        assert dispatcher.ep_size == 2
+        assert dispatcher.overlap_stream is not None