eplb redundant expert bugfix (#4291)
### What this PR does / why we need it?
Fixes a bug in the handling of redundant experts for EPLB.
### Does this PR introduce _any_ user-facing change?
After configuring the expert map path (`expert_map_path`), users no longer need to
configure `init_redundancy_expert`.
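For illustration only, a minimal sketch of the intended usage. The `additional_config` keys and all paths below are assumptions for the example, not something this PR adds:

```python
from vllm import LLM

# Hedged sketch: with an expert placement map supplied via expert_map_path,
# the redundant-expert count is derived from the map file itself, so
# init_redundancy_expert does not have to be set alongside it.
llm = LLM(
    model="/path/to/moe-model",       # placeholder model path
    tensor_parallel_size=16,
    enable_expert_parallel=True,
    additional_config={
        "expert_map_path": "/path/to/expert_map.json",  # placeholder map file
        # "init_redundancy_expert": 16,  # no longer needed when a map path is given
    },
)
```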
### How was this patch tested?
The accuracy of EPLB was tested with and without the use of redundant
experts.
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
---------
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
@@ -43,8 +43,7 @@ from vllm.model_executor.layers.quantization.base_config import \
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import FusedMoEState
 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
-                                              determine_default_log2phy_map)
+from vllm_ascend.eplb.core.eplb_utils import determine_default_log2phy_map
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
 from vllm_ascend.torchair.ops.sequence_parallel import MetadataForPadding
@@ -1042,7 +1041,7 @@ class TorchairAscendFusedMoE(FusedMoE):
                     self.expert_map_path) and os.access(self.expert_map_path,
                                                         os.R_OK):
                 self.expert_load_balancer = ExpertLoadBalancer(
-                    self.expert_map_path, self.global_num_experts)
+                    self.expert_map_path, num_experts)
                 self.expert_load_balancer.check_expert_map_tensor()
                 self.global_redundant_expert_num = (
                     self.expert_load_balancer.get_global_redundant_expert_num())
@@ -1052,15 +1051,14 @@ class TorchairAscendFusedMoE(FusedMoE):
                         self.moe_instance_id, self.ep_rank))
                 self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
                     self.moe_instance_id, self.ep_rank).npu()
+                self.global_num_experts = num_experts + self.global_redundant_expert_num
             except Exception as e:
                 logger.warning(
                     f"Init expert map of mtp/eagle when using sample.{e}")
-                self.local_num_experts, self.expert_map = determine_default_expert_map(
-                    self.global_num_experts, self.ep_size, self.ep_rank,
-                    self.global_redundant_expert_num)
+                self.local_num_experts, self.expert_map = determine_expert_map(
+                    self.ep_size, self.ep_rank, self.global_num_experts)
                 self.log2phy = determine_default_log2phy_map(
-                    self.global_num_experts, self.ep_size, self.ep_rank,
-                    self.global_redundant_expert_num).npu()
+                    self.global_num_experts, self.ep_size, self.ep_rank).npu()
         if self.expert_map is not None and isinstance(
                 self.expert_map, torch.Tensor):
             logger.info_once(
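The hunk above changes the initialization order: the load balancer is constructed with the logical expert count, the redundant-expert count is then read from the map, and only afterwards is `global_num_experts` widened. A hedged, self-contained sketch of that ordering (the stand-in class and numbers below are hypothetical; only the call names mirror the diff):

```python
class _FakeLoadBalancer:
    """Hypothetical stand-in for ExpertLoadBalancer; pretends the map
    file declares 2 redundant physical expert slots."""

    def __init__(self, expert_map_path: str, num_experts: int):
        self.expert_map_path = expert_map_path
        self.num_experts = num_experts  # logical experts, as in the fixed call site

    def get_global_redundant_expert_num(self) -> int:
        return 2


num_experts = 64  # logical experts defined by the checkpoint
balancer = _FakeLoadBalancer("expert_map.json", num_experts)

# Redundancy now comes from the map file, not from init_redundancy_expert.
global_redundant_expert_num = balancer.get_global_redundant_expert_num()

# Only after the map is parsed is the global count widened to physical slots.
global_num_experts = num_experts + global_redundant_expert_num
assert global_num_experts == 66
```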
@@ -1079,13 +1077,8 @@ class TorchairAscendFusedMoE(FusedMoE):
                 self.ep_size, self.ep_rank, self.global_num_experts)
             # dynamic eplb initializing with not expert_map_path
             if self.dynamic_eplb:
-                self.global_redundant_expert_num = ascend_config.init_redundancy_expert
-                self.local_num_experts, self.expert_map = determine_default_expert_map(
-                    self.global_num_experts, self.ep_size, self.ep_rank,
-                    self.global_redundant_expert_num)
                 self.log2phy = determine_default_log2phy_map(
-                    self.global_num_experts, self.ep_size, self.ep_rank,
-                    self.global_redundant_expert_num).npu()
+                    self.global_num_experts, self.ep_size, self.ep_rank).npu()
         if self.expert_map is not None and isinstance(
                 self.expert_map, torch.Tensor):
             logger.info_once(

@@ -990,7 +990,9 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
         # to avoid accumulating too much tokens on a single rank.
         # currently it is only activated when doing profile runs.
         if enable_force_load_balance:
-            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
+            topk_ids = torch.randint_like(
+                topk_ids, 0,
+                global_num_experts - global_redundant_expert_num)

         topk_weights = topk_weights.to(x.dtype)
         if fused_moe_state == FusedMoEState.AllGatherEP:
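The last hunk narrows the fake routing used for force load balancing so it only draws logical expert ids. A small self-contained sketch with made-up sizes:

```python
import torch

global_num_experts = 66            # e.g. 64 logical experts + 2 redundant slots
global_redundant_expert_num = 2
topk_ids = torch.zeros((8, 4), dtype=torch.int64)  # dummy top-k routing table

# Profile-run traffic is now sampled only from the logical expert range
# [0, global_num_experts - global_redundant_expert_num), so the random
# routing never points at redundant physical slots.
topk_ids = torch.randint_like(
    topk_ids, 0, global_num_experts - global_redundant_expert_num)
assert int(topk_ids.max()) < global_num_experts - global_redundant_expert_num
```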