[Feat] enable hierarchical mc2 ops on A2 by default (#5545)

### What this PR does / why we need it?
Previously, enabling hierarchical MC2 communication on A2 required setting the
environment variables HCCL_INTRA_PCIE_ENABLE=1 and HCCL_INTRA_ROCE_ENABLE=0. This PR enables
hierarchical MC2 operations on A2 by default, so those variables no longer need to be set.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.13.0
- vLLM main:
7157596103

Signed-off-by: hwhaokun <haokun0405@163.com>
This commit is contained in:
hwhaokun
2026-01-04 14:44:20 +08:00
committed by GitHub
parent 363ac1b80f
commit fb9fdcdbe4
2 changed files with 10 additions and 20 deletions

View File

@@ -32,8 +32,7 @@ from vllm.distributed.parallel_state import get_ep_group
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.fused_moe.comm_utils import (
async_all_to_all, gather_from_sequence_parallel_region)
from vllm_ascend.utils import (AscendDeviceType, get_ascend_device_type,
is_hierarchical_communication_enabled)
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
@dataclass
@@ -117,10 +116,6 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
self.need_extra_args = (
get_ascend_device_type() == AscendDeviceType.A3)
# NOTE: When in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1 and
# HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and significantly
# improve communication performance.
self.need_expert_scale = is_hierarchical_communication_enabled()
self.with_quant = False
# Here we need to calculate the global_bs = max_bs_per_rank * ep_world_size to execute
@@ -158,6 +153,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
else:
quant_mode = 0
moe_expert_num = len(expert_map)
kwargs_mc2 = {
"x": hidden_states,
"expert_ids": topk_ids,
@@ -166,8 +162,12 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
"moe_expert_num": moe_expert_num,
"global_bs": self.global_bs,
"expert_token_nums_type": 0,
"expert_scales": topk_weights.to(torch.float32),
}
if get_ascend_device_type() == AscendDeviceType.A2:
kwargs_mc2["comm_alg"] = "hierarchy"
stage1_kwargs = {
"scales": None,
"quant_mode": quant_mode,
@@ -181,11 +181,6 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
"tp_world_size": 1,
"tp_rank_id": 0,
})
if self.need_expert_scale:
stage1_kwargs.update({
"expert_scales":
topk_weights.to(torch.float32),
})
kwargs_mc2.update(stage1_kwargs)
return kwargs_mc2
@@ -263,8 +258,12 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
"shared_expert_rank_num": 0,
"moe_expert_num": moe_expert_num,
"global_bs": self.global_bs,
"expand_scales": expand_scales,
}
if get_ascend_device_type() == AscendDeviceType.A2:
kwargs_mc2["comm_alg"] = "hierarchy"
if self.with_quant:
tp_recv_counts = torch.empty(1,
dtype=torch.int32,
@@ -275,7 +274,6 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
"group_ep": self.moe_all_to_all_group_name,
"ep_world_size": self.ep_world_size,
"ep_rank_id": self.ep_rank_id,
"expand_scales": expand_scales,
}
if self.enable_dispatch_v2:

View File

@@ -958,14 +958,6 @@ def calculate_dp_buffer_size() -> int:
return max(dp_buffer_size, _MIN_DP_BUFFER_SIZE)
# Currently, when in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1
# and HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and
# significantly improve communication performance of MC2 ops dispatch/combine.
def is_hierarchical_communication_enabled():
return (os.getenv("HCCL_INTRA_ROCE_ENABLE", "") == "0"
and os.getenv("HCCL_INTRA_PCIE_ENABLE", "") == "1")
def has_layer_idx(model_instance: torch.nn.Module) -> bool:
if model_instance is None:
return False