[EPLB] Record expert map without dynamic EPLB. (#3409)
What this PR does / why we need it? 1. Record the expert map without dynamic EPLB. 2. Add `export PYTHONOPTIMIZE=1` when using dynamic EPLB. 3. Update the EPLB documentation. Does this PR introduce any user-facing change? How was this patch tested? Tested with Qwen3_moe on A3. - vLLM version: v0.11.0 --------- Signed-off-by: offline0806 <3337230449@qq.com> Co-authored-by: offline0806 <3337230449@qq.com>
This commit is contained in:
@@ -26,7 +26,8 @@ from vllm.forward_context import get_forward_context
|
||||
from vllm.logger import logger
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
|
||||
from vllm.model_executor.layers.fused_moe.layer import (
|
||||
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
|
||||
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map,
|
||||
get_compressed_expert_map)
|
||||
from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
@@ -176,7 +177,7 @@ class AscendFusedMoE(FusedMoE):
|
||||
self.moe_config.ep_group = get_ep_group()
|
||||
self.moe_config.mc2_group = get_mc2_group()
|
||||
ascend_config = get_ascend_config()
|
||||
self.dynamic_eplb = ascend_config.dynamic_eplb
|
||||
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
|
||||
self.expert_map_path = ascend_config.expert_map_path
|
||||
self.global_redundant_expert_num = ascend_config.init_redundancy_expert
|
||||
self.global_num_experts = num_experts + self.global_redundant_expert_num
|
||||
@@ -203,6 +204,14 @@ class AscendFusedMoE(FusedMoE):
|
||||
self.log2phy = determine_default_log2phy_map(
|
||||
self.global_num_experts, self.ep_size, self.ep_rank,
|
||||
self.global_redundant_expert_num).npu()
|
||||
if self.expert_map is not None and isinstance(
|
||||
self.expert_map, torch.Tensor):
|
||||
logger.info_once(
|
||||
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
|
||||
" number of experts: %s/%s. Experts local to global index map:"
|
||||
" %s.", self.ep_rank, self.ep_size, self.local_num_experts,
|
||||
self.global_num_experts,
|
||||
get_compressed_expert_map(self.expert_map))
|
||||
else:
|
||||
# init moe.
|
||||
self.local_num_experts, self.expert_map = determine_expert_map(
|
||||
@@ -216,6 +225,14 @@ class AscendFusedMoE(FusedMoE):
|
||||
self.log2phy = determine_default_log2phy_map(
|
||||
self.global_num_experts, self.ep_size, self.ep_rank,
|
||||
self.global_redundant_expert_num).npu()
|
||||
if self.expert_map is not None and isinstance(
|
||||
self.expert_map, torch.Tensor):
|
||||
logger.info_once(
|
||||
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
|
||||
" number of experts: %s/%s. Experts local to global index map:"
|
||||
" %s.", self.ep_rank, self.ep_size, self.local_num_experts,
|
||||
self.global_num_experts,
|
||||
get_compressed_expert_map(self.expert_map))
|
||||
local_num_experts = (torch.sum(
|
||||
self.expert_map != -1) if self.expert_map is not None else
|
||||
self.global_num_experts)
|
||||
|
||||
Reference in New Issue
Block a user