Dynamic Expert Load Balance with Zero-like-overhead (#2956)
### Motivation
Currently dynamically experts balancing would stop-the-world.
Asynchronously expert load balancing would be better without flowing
problems:
Host-bound latency:
There are many cpu operations during EPLB such as
eplb-algorithm、creating p2p ops、and log2phy expert converting would
spend long cpu time, as ~1s.
Communication latency: The transfer time would cost much in the
situation without nvlink. As the weight of an expert maybe transfer to
multiple new positions, thus N times send/recv for one expert, with
result long latency. We had tested that batch_isend_irecv cost more
100ms for 16 experts weight transmission in A2 server of ascend.
SwiftBalancer would not stop-the-world anymore, in out test on NPU 1~2ms
cost for each layer while benefit 5ms-8ms decode latency with ep_size =
64.
The following updates have been made:
1、expert distribution recording with lower cost.
2、async cpu computing for eplb algo and other python operator.
3、new eplb algo with less expert rebalancing while almost the same
effect.
### Proposed Change
We will gradually migrate the EPLB logic to the VLLM community and
implement a generalized design. Relevant RFC:
https://github.com/vllm-project/vllm/issues/22246
The overall workflow involves:
<img width="801" height="302"
alt="474430541-23b06f58-23bc-44a3-a1be-00f268aeb15c"
src="https://github.com/user-attachments/assets/1d73a459-1b23-4b0a-812a-bf0a75debfed"
/>
1. Record experts distribution during forward. We using expert_token_num
after disptach instead of topk_ids, thus we got much smaller tensor
shape to reduce cost of hbm recording and add-operator.
2. Do all-gather for experts distribution. Using all-gather instead of
all-reduce as less traffic volume.
3. Wake up eplb worker process with experts distribution when
num_iterations comes. Run eplb algorithm in eplb worker.
4. Generate p2p send/recv ops and other operator such as log2phy would
cost long cpu time.
5. Lanch ibatch_send_recv in async_stream before forward.
6. After forward, wait for the ibatch_send_recv finish, then do uapte
expert map and expert weights.
### Co-author
Co-authored-by: raindaywhu raindaywhu@raindaywhu@ 163.con
Co-authored-by: njuyuan yuanjl19@smail.nju.edu.cn
Co-authored-by: qmkakaxi wjh1594260677@qq.com
Co-authored-by: Skywalker-EP 173723846@qq.com
- vLLM version: v0.10.2
- vLLM main:
567939953b
---------
Signed-off-by: offline0806 <z00858301@china.huawei.com>
Co-authored-by: offline0806 <z00858301@china.huawei.com>
This commit is contained in:
@@ -14,7 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os.path
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
@@ -26,10 +26,13 @@ from vllm.forward_context import get_forward_context
|
||||
from vllm.model_executor.layers.fused_moe.config import \
|
||||
FusedMoEParallelConfig # isort: skip
|
||||
from vllm.model_executor.layers.fused_moe.layer import (
|
||||
FusedMoE, UnquantizedFusedMoEMethod)
|
||||
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.distributed.parallel_state import get_mc2_group
|
||||
from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
|
||||
determine_default_log2phy_map)
|
||||
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
|
||||
from vllm_ascend.ops.moe.experts_selector import select_experts
|
||||
from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
|
||||
AlltoAllCommImpl, MC2CommImpl,
|
||||
@@ -226,14 +229,52 @@ def process_weights_after_loading(self, layer):
|
||||
|
||||
|
||||
class AscendFusedMoE(FusedMoE):
|
||||
moe_counter = -1
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
AscendFusedMoE.moe_counter += 1
|
||||
self.moe_instance_id = AscendFusedMoE.moe_counter
|
||||
self.moe_config.tp_group = get_tp_group()
|
||||
self.moe_config.dp_group = get_dp_group()
|
||||
self.moe_config.ep_group = get_ep_group()
|
||||
self.moe_config.mc2_group = get_mc2_group()
|
||||
ascend_config = get_ascend_config()
|
||||
self.dynamic_eplb = ascend_config.dynamic_eplb
|
||||
self.expert_map_path = ascend_config.expert_map_path
|
||||
self.global_redundant_expert_num = ascend_config.init_redundancy_expert
|
||||
# static eplb initializing with expert_map_path
|
||||
if self.expert_map_path and os.path.exists(
|
||||
self.expert_map_path) and os.access(self.expert_map_path,
|
||||
os.R_OK):
|
||||
self.expert_load_balancer = ExpertLoadBalancer(
|
||||
self.expert_map_path, self.global_num_experts)
|
||||
self.local_num_experts, self.expert_map = (
|
||||
self.expert_load_balancer.get_rank_placement_map(
|
||||
self.moe_instance_id, self.ep_rank))
|
||||
self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
|
||||
self.moe_instance_id, self.ep_rank).npu()
|
||||
self.global_redundant_expert_num = (
|
||||
self.expert_load_balancer.get_global_redundant_expert_num())
|
||||
else:
|
||||
# init moe.
|
||||
self.local_num_experts, self.expert_map = determine_expert_map(
|
||||
self.ep_size, self.ep_rank, self.global_num_experts)
|
||||
# dynamic eplb initializing with not expert_map_path
|
||||
if self.dynamic_eplb:
|
||||
self.global_redundant_expert_num = ascend_config.init_redundancy_expert
|
||||
self.local_num_experts, self.expert_map = determine_default_expert_map(
|
||||
self.global_num_experts, self.ep_size, self.ep_rank,
|
||||
self.global_redundant_expert_num)
|
||||
self.log2phy = determine_default_log2phy_map(
|
||||
self.global_num_experts, self.ep_size, self.ep_rank,
|
||||
self.global_redundant_expert_num)
|
||||
local_num_experts = (torch.sum(
|
||||
self.expert_map != -1) if self.expert_map is not None else
|
||||
self.global_num_experts)
|
||||
if self.dynamic_eplb:
|
||||
self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)
|
||||
|
||||
for method in {
|
||||
AllGatherCommImpl, AlltoAllCommImpl, MC2CommImpl,
|
||||
@@ -243,6 +284,19 @@ class AscendFusedMoE(FusedMoE):
|
||||
self, method.__name__.lower(),
|
||||
method(moe_config=self.moe_config)) # type: ignore[abstract]
|
||||
|
||||
def update_expert_map(self, new_expert_map):
|
||||
self.expert_map = new_expert_map
|
||||
|
||||
def get_map(self):
|
||||
return self.expert_map
|
||||
|
||||
def get_log2phy_map(self):
|
||||
return self.logical_to_physical_map
|
||||
|
||||
def clear_moe_load(self):
|
||||
if self.moe_load is not None:
|
||||
self.moe_load.zero_()
|
||||
|
||||
def maybe_all_reduce_tensor_model_parallel(
|
||||
self, final_hidden_states: torch.Tensor):
|
||||
"""NOTE(Yizhou): This is to override the parent class method. In `mc2commimpl`,
|
||||
@@ -292,6 +346,12 @@ class AscendFusedMoE(FusedMoE):
|
||||
logical_to_physical_map=self.logical_to_physical_map,
|
||||
logical_replica_count=self.logical_replica_count,
|
||||
)
|
||||
if isinstance(final_hidden_states, tuple):
|
||||
final_hidden_states, group_list_type, expert_tokens = final_hidden_states
|
||||
|
||||
if self.dynamic_eplb:
|
||||
self.moe_load += expert_tokens if group_list_type else \
|
||||
torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
|
||||
|
||||
final_hidden_states = forward_context.moe_comm_method.finalize(
|
||||
hidden_states=final_hidden_states,
|
||||
|
||||
@@ -37,6 +37,8 @@ from vllm.model_executor.layers.quantization.base_config import \
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.distributed.parallel_state import get_mc2_group
|
||||
from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
|
||||
determine_default_log2phy_map)
|
||||
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
|
||||
from vllm_ascend.ops.moe.experts_selector import select_experts
|
||||
from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
|
||||
@@ -58,6 +60,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
|
||||
self.global_batch_size = vllm_config.scheduler_config.max_num_seqs
|
||||
self.max_model_len = vllm_config.model_config.max_model_len
|
||||
get_ascend_config()
|
||||
self.dynamic_eplb = get_ascend_config().dynamic_eplb
|
||||
|
||||
try:
|
||||
device_group = get_mc2_group().device_group
|
||||
@@ -136,7 +139,8 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
|
||||
global_num_experts=global_num_experts,
|
||||
expert_map=expert_map,
|
||||
shared_experts=shared_experts,
|
||||
need_trans=True)
|
||||
need_trans=True,
|
||||
dynamic_eplb=self.dynamic_eplb)
|
||||
|
||||
|
||||
class AscendFusedMoE(FusedMoE):
|
||||
@@ -234,25 +238,40 @@ class AscendFusedMoE(FusedMoE):
|
||||
self.moe_parallel_config.ep_size, is_deepseek_v3_r1)
|
||||
|
||||
ascend_config = get_ascend_config()
|
||||
expert_map_path = ascend_config.expert_map_path
|
||||
if expert_map_path and os.path.exists(expert_map_path):
|
||||
# moe expert load balance
|
||||
expert_load_balancer = ExpertLoadBalancer(expert_map_path,
|
||||
self.global_num_experts)
|
||||
self.local_num_experts, self.expert_map = \
|
||||
expert_load_balancer.get_rank_placement_map(
|
||||
self.moe_instance_id,
|
||||
get_ep_group().rank_in_group)
|
||||
self.log2phy = expert_load_balancer.get_rank_log2phy_map(
|
||||
self.moe_instance_id,
|
||||
get_ep_group().rank_in_group)
|
||||
self.global_redundant_expert_num = \
|
||||
expert_load_balancer.get_global_redundant_expert_num()
|
||||
self.dynamic_eplb = ascend_config.dynamic_eplb
|
||||
self.expert_map_path = ascend_config.expert_map_path
|
||||
self.global_redundant_expert_num = ascend_config.init_redundancy_expert
|
||||
self.global_num_experts = num_experts + self.global_redundant_expert_num
|
||||
# static eplb initializing with expert_map_path
|
||||
if self.expert_map_path and os.path.exists(
|
||||
self.expert_map_path) and os.access(self.expert_map_path,
|
||||
os.R_OK):
|
||||
self.expert_load_balancer = ExpertLoadBalancer(
|
||||
self.expert_map_path, self.global_num_experts)
|
||||
self.local_num_experts, self.expert_map = (
|
||||
self.expert_load_balancer.get_rank_placement_map(
|
||||
self.moe_instance_id, self.ep_rank))
|
||||
self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
|
||||
self.moe_instance_id, self.ep_rank).npu()
|
||||
self.global_redundant_expert_num = (
|
||||
self.expert_load_balancer.get_global_redundant_expert_num())
|
||||
else:
|
||||
# Create a tensor of size num_experts filled with -1
|
||||
# init moe.
|
||||
self.local_num_experts, self.expert_map = determine_expert_map(
|
||||
self.ep_size,
|
||||
get_ep_group().rank_in_group, self.global_num_experts)
|
||||
self.ep_size, self.ep_rank, self.global_num_experts)
|
||||
# dynamic eplb initializing with not expert_map_path
|
||||
if self.dynamic_eplb:
|
||||
self.global_redundant_expert_num = ascend_config.init_redundancy_expert
|
||||
self.local_num_experts, self.expert_map = determine_default_expert_map(
|
||||
self.global_num_experts, self.ep_size, self.ep_rank,
|
||||
self.global_redundant_expert_num)
|
||||
self.log2phy = determine_default_log2phy_map(
|
||||
self.global_num_experts, self.ep_size, self.ep_rank,
|
||||
self.global_redundant_expert_num)
|
||||
local_num_experts = (torch.sum(self.expert_map != -1)
|
||||
if self.expert_map is not None else num_experts)
|
||||
if self.dynamic_eplb:
|
||||
self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)
|
||||
|
||||
self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
|
||||
|
||||
@@ -281,6 +300,11 @@ class AscendFusedMoE(FusedMoE):
|
||||
local_num_experts = torch.sum(self.expert_map != -1) \
|
||||
if self.expert_map is not None else num_experts
|
||||
|
||||
self.moe_load = None
|
||||
|
||||
if self.dynamic_eplb:
|
||||
self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)
|
||||
|
||||
moe_quant_params = {
|
||||
"num_experts": local_num_experts,
|
||||
"hidden_size": hidden_size,
|
||||
@@ -313,6 +337,19 @@ class AscendFusedMoE(FusedMoE):
|
||||
self, method.__name__.lower(),
|
||||
method(moe_config=self.moe_config)) # type: ignore[abstract]
|
||||
|
||||
def update_expert_map(self, new_expert_map):
|
||||
self.expert_map = new_expert_map
|
||||
|
||||
def get_map(self):
|
||||
return self.expert_map
|
||||
|
||||
def get_log2phy_map(self):
|
||||
return self.logical_to_physical_map
|
||||
|
||||
def clear_moe_load(self):
|
||||
if self.moe_load is not None:
|
||||
self.moe_load.zero_()
|
||||
|
||||
def naive_multicast(self, x: torch.Tensor,
|
||||
cu_tokens_across_dp_cpu: torch.Tensor):
|
||||
assert (len(x.shape) == 2)
|
||||
@@ -401,10 +438,20 @@ class AscendFusedMoE(FusedMoE):
|
||||
dynamic_scale_for_share=dynamic_scale_for_share,
|
||||
)
|
||||
|
||||
group_list_type = None
|
||||
|
||||
if shared_experts:
|
||||
if isinstance(e_hidden_states, tuple):
|
||||
if isinstance(e_hidden_states,
|
||||
tuple) and len(e_hidden_states) == 2:
|
||||
e_hidden_states, shared_hidden_states = e_hidden_states
|
||||
|
||||
if isinstance(e_hidden_states, tuple) and len(e_hidden_states) == 3:
|
||||
e_hidden_states, group_list_type, expert_tokens = e_hidden_states
|
||||
|
||||
if self.dynamic_eplb and group_list_type is not None:
|
||||
self.moe_load += expert_tokens if group_list_type else \
|
||||
torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
|
||||
|
||||
final_hidden_states = forward_context.moe_comm_method.finalize(
|
||||
hidden_states=e_hidden_states,
|
||||
reduce_results=(not self.all_reduce_merge))
|
||||
|
||||
@@ -88,7 +88,8 @@ class MoECommMethod(ABC):
|
||||
# For load balance
|
||||
log2phy: torch.Tensor = None,
|
||||
global_redundant_expert_num: int = 0,
|
||||
need_trans: bool = False) -> torch.Tensor:
|
||||
need_trans: bool = False,
|
||||
dynamic_eplb: bool = False):
|
||||
# Check constraints
|
||||
assert hidden_states.dtype in [
|
||||
torch.float32, torch.float16, torch.bfloat16
|
||||
@@ -133,6 +134,9 @@ class MoECommMethod(ABC):
|
||||
final_hidden_states = self.token_dispatcher.token_combine(
|
||||
hidden_states=mlp_output)
|
||||
|
||||
if dynamic_eplb:
|
||||
return (final_hidden_states, group_list_type, expert_tokens)
|
||||
|
||||
return final_hidden_states
|
||||
|
||||
@abstractmethod
|
||||
|
||||
Reference in New Issue
Block a user