[Feat] enable hierarchical communication for mc2 ops on A2 (#3015)

Currently, when in A2, setting the environment variables `HCCL_INTRA_PCIE_ENABLE=1` and `HCCL_INTRA_ROCE_ENABLE=0` can reduce cross-machine communication traffic and significantly improve communication performance. For more details, please refer to [document](https://www.hiascend.com/document/detail/zh/Pytorch/710/apiref/torchnpuCustomsapi/context/torch_npu-npu_moe_distribute_dispatch_v2.md) - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2025-10-13 16:13:17 +08:00
parent 0563106477
commit 31682961af
6 changed files with 112 additions and 17 deletions
--- a/tests/ut/ops/test_token_dispatcher.py
+++ b/tests/ut/ops/test_token_dispatcher.py
@@ -92,7 +92,8 @@ class TestTokenDispatcherWithMC2(TestBase):
        expert_map = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7])

        with patch("torch_npu.npu_moe_distribute_dispatch_v2",
-                   return_value=(torch.randn(10, 128), ) * 5) as mock_dispatch:
+                   return_value=(torch.randn(10, 128), ) * 5 +
+                   (None, None)) as mock_dispatch:
            output = self.dispatcher.token_dispatch(hidden_states,
                                                    topk_weights, topk_ids,
                                                    self.row_idx, expert_map)
@@ -112,7 +113,7 @@ class TestTokenDispatcherWithMC2(TestBase):
        self.topk_weights = torch.randn(10, 1)

        with patch("torch_npu.npu_moe_distribute_dispatch_v2",
-                   return_value=(torch.randn(10, 128), ) * 5):
+                   return_value=(torch.randn(10, 128), ) * 5 + (None, None)):
            self.dispatcher.token_dispatch(self.hidden_states,
                                           self.topk_weights,
                                           torch.randint(0, 8, (10, 1)),