This reverts commit fb9fdcdbe4. ### What this PR does / why we need it? The reverted PR breaks the smoke test because it leads to the following aclnnNeScalar error: Kernel Run failed. opType: 25, NotEqual launch failed for NotEqual, errno:361001 <img width="1149" height="166" alt="A6C9453D-4F0B-4256-DD80-A9C181DAB2D9" src="https://github.com/user-attachments/assets/cab9c4b8-3fd1-4c6b-b424-474b46042726" /> ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.13.0 - vLLM main: 7157596103 Signed-off-by: zxwang <1476209578@qq.com>
This commit is contained in:
@@ -32,7 +32,8 @@ from vllm.distributed.parallel_state import get_ep_group
|
|||||||
from vllm_ascend.distributed.parallel_state import get_mc2_group
|
from vllm_ascend.distributed.parallel_state import get_mc2_group
|
||||||
from vllm_ascend.ops.fused_moe.comm_utils import (
|
from vllm_ascend.ops.fused_moe.comm_utils import (
|
||||||
async_all_to_all, gather_from_sequence_parallel_region)
|
async_all_to_all, gather_from_sequence_parallel_region)
|
||||||
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
from vllm_ascend.utils import (AscendDeviceType, get_ascend_device_type,
|
||||||
|
is_hierarchical_communication_enabled)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -116,6 +117,10 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
|
|||||||
self.need_extra_args = (
|
self.need_extra_args = (
|
||||||
get_ascend_device_type() == AscendDeviceType.A3)
|
get_ascend_device_type() == AscendDeviceType.A3)
|
||||||
|
|
||||||
|
# NOTE: When in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1 and
|
||||||
|
# HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and significantly
|
||||||
|
# improve communication performance.
|
||||||
|
self.need_expert_scale = is_hierarchical_communication_enabled()
|
||||||
self.with_quant = False
|
self.with_quant = False
|
||||||
|
|
||||||
# Here we need to calculate the global_bs = max_bs_per_rank * ep_world_size to execute
|
# Here we need to calculate the global_bs = max_bs_per_rank * ep_world_size to execute
|
||||||
@@ -153,7 +158,6 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
|
|||||||
else:
|
else:
|
||||||
quant_mode = 0
|
quant_mode = 0
|
||||||
moe_expert_num = len(expert_map)
|
moe_expert_num = len(expert_map)
|
||||||
|
|
||||||
kwargs_mc2 = {
|
kwargs_mc2 = {
|
||||||
"x": hidden_states,
|
"x": hidden_states,
|
||||||
"expert_ids": topk_ids,
|
"expert_ids": topk_ids,
|
||||||
@@ -162,12 +166,8 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
|
|||||||
"moe_expert_num": moe_expert_num,
|
"moe_expert_num": moe_expert_num,
|
||||||
"global_bs": self.global_bs,
|
"global_bs": self.global_bs,
|
||||||
"expert_token_nums_type": 0,
|
"expert_token_nums_type": 0,
|
||||||
"expert_scales": topk_weights.to(torch.float32),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if get_ascend_device_type() == AscendDeviceType.A2:
|
|
||||||
kwargs_mc2["comm_alg"] = "hierarchy"
|
|
||||||
|
|
||||||
stage1_kwargs = {
|
stage1_kwargs = {
|
||||||
"scales": None,
|
"scales": None,
|
||||||
"quant_mode": quant_mode,
|
"quant_mode": quant_mode,
|
||||||
@@ -181,6 +181,11 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
|
|||||||
"tp_world_size": 1,
|
"tp_world_size": 1,
|
||||||
"tp_rank_id": 0,
|
"tp_rank_id": 0,
|
||||||
})
|
})
|
||||||
|
if self.need_expert_scale:
|
||||||
|
stage1_kwargs.update({
|
||||||
|
"expert_scales":
|
||||||
|
topk_weights.to(torch.float32),
|
||||||
|
})
|
||||||
|
|
||||||
kwargs_mc2.update(stage1_kwargs)
|
kwargs_mc2.update(stage1_kwargs)
|
||||||
return kwargs_mc2
|
return kwargs_mc2
|
||||||
@@ -258,12 +263,8 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
|
|||||||
"shared_expert_rank_num": 0,
|
"shared_expert_rank_num": 0,
|
||||||
"moe_expert_num": moe_expert_num,
|
"moe_expert_num": moe_expert_num,
|
||||||
"global_bs": self.global_bs,
|
"global_bs": self.global_bs,
|
||||||
"expand_scales": expand_scales,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if get_ascend_device_type() == AscendDeviceType.A2:
|
|
||||||
kwargs_mc2["comm_alg"] = "hierarchy"
|
|
||||||
|
|
||||||
if self.with_quant:
|
if self.with_quant:
|
||||||
tp_recv_counts = torch.empty(1,
|
tp_recv_counts = torch.empty(1,
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
@@ -274,6 +275,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
|
|||||||
"group_ep": self.moe_all_to_all_group_name,
|
"group_ep": self.moe_all_to_all_group_name,
|
||||||
"ep_world_size": self.ep_world_size,
|
"ep_world_size": self.ep_world_size,
|
||||||
"ep_rank_id": self.ep_rank_id,
|
"ep_rank_id": self.ep_rank_id,
|
||||||
|
"expand_scales": expand_scales,
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.enable_dispatch_v2:
|
if self.enable_dispatch_v2:
|
||||||
|
|||||||
@@ -983,6 +983,14 @@ def calculate_dp_buffer_size() -> int:
|
|||||||
return max(dp_buffer_size, _MIN_DP_BUFFER_SIZE)
|
return max(dp_buffer_size, _MIN_DP_BUFFER_SIZE)
|
||||||
|
|
||||||
|
|
||||||
|
# Currently, when in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1
|
||||||
|
# and HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and
|
||||||
|
# significantly improve communication performance of MC2 ops dispatch/combine.
|
||||||
|
def is_hierarchical_communication_enabled():
|
||||||
|
return (os.getenv("HCCL_INTRA_ROCE_ENABLE", "") == "0"
|
||||||
|
and os.getenv("HCCL_INTRA_PCIE_ENABLE", "") == "1")
|
||||||
|
|
||||||
|
|
||||||
def has_layer_idx(model_instance: torch.nn.Module) -> bool:
|
def has_layer_idx(model_instance: torch.nn.Module) -> bool:
|
||||||
if model_instance is None:
|
if model_instance is None:
|
||||||
return False
|
return False
|
||||||
|
|||||||
Reference in New Issue
Block a user