[EPLB] Record expert map without dynamic EPLB. (#3409)

What this PR does / why we need it?
1. Record the expert map even when dynamic EPLB is not enabled.
2. Add "export PYTHONOPTIMIZE=1" when using dynamic EPLB (see the note after this list).
3. Update the EPLB documentation.
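Note: PYTHONOPTIMIZE=1 runs the interpreter in -O mode, which strips assert statements and sets __debug__ to False. A minimal, self-contained demonstration (not part of this PR's diff):

    # demo_optimize.py -- run as: PYTHONOPTIMIZE=1 python demo_optimize.py
    # Under -O the assert below is compiled away, so execution reaches print().
    assert False, "removed when PYTHONOPTIMIZE=1 is set"
    print("__debug__ =", __debug__)  # prints: __debug__ = False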

Does this PR introduce any user-facing change?

How was this patch tested?
Tested with Qwen3_moe on A3.

- vLLM version: v0.11.0

---------

Signed-off-by: offline0806 <3337230449@qq.com>
Co-authored-by: offline0806 <3337230449@qq.com>
offline893 authored 2025-10-15 14:21:15 +08:00; committed by GitHub
parent 4f937f561d
commit 5a3082cd15
9 changed files with 49 additions and 15 deletions


@@ -35,7 +35,8 @@ from vllm.model_executor.layers.fused_moe.config import \
 from vllm.model_executor.layers.fused_moe.config import \
     FusedMoEParallelConfig  # isort: skip
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
+    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map,
+    get_compressed_expert_map)
 from vllm.model_executor.layers.quantization.base_config import \
     QuantizationConfig
@@ -1028,7 +1029,7 @@ class TorchairAscendFusedMoE(FusedMoE):
             self.moe_parallel_config.ep_size, is_deepseek_v3_r1)
         ascend_config = get_ascend_config()
-        self.dynamic_eplb = ascend_config.dynamic_eplb
+        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
         self.expert_map_path = ascend_config.expert_map_path
         self.global_redundant_expert_num = ascend_config.init_redundancy_expert
         self.global_num_experts = num_experts + self.global_redundant_expert_num
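Reviewer note on the changed line above: Python's "or" returns its operand rather than a bool, so self.dynamic_eplb becomes the record path string whenever expert_map_record_path is set, keeping the dynamic-EPLB code path active for recording. A small sketch of the truthiness rule (field values hypothetical):

    # Recording an expert map enables the dynamic-EPLB path even when
    # dynamic_eplb itself is False; "or" propagates the path string.
    dynamic_eplb = False
    expert_map_record_path = "/tmp/expert_map.json"  # hypothetical value
    enabled = dynamic_eplb or expert_map_record_path
    print(bool(enabled), enabled)  # -> True /tmp/expert_map.json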
@@ -1055,6 +1056,14 @@ class TorchairAscendFusedMoE(FusedMoE):
             self.log2phy = determine_default_log2phy_map(
                 self.global_num_experts, self.ep_size, self.ep_rank,
                 self.global_redundant_expert_num).npu()
+            if self.expert_map is not None and isinstance(
+                    self.expert_map, torch.Tensor):
+                logger.info_once(
+                    "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
+                    " number of experts: %s/%s. Experts local to global index map:"
+                    " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
+                    self.global_num_experts,
+                    get_compressed_expert_map(self.expert_map))
         else:
             # init moe.
             self.local_num_experts, self.expert_map = determine_expert_map(
@@ -1068,6 +1077,14 @@ class TorchairAscendFusedMoE(FusedMoE):
             self.log2phy = determine_default_log2phy_map(
                 self.global_num_experts, self.ep_size, self.ep_rank,
                 self.global_redundant_expert_num).npu()
+            if self.expert_map is not None and isinstance(
+                    self.expert_map, torch.Tensor):
+                logger.info_once(
+                    "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
+                    " number of experts: %s/%s. Experts local to global index map:"
+                    " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
+                    self.global_num_experts,
+                    get_compressed_expert_map(self.expert_map))
         local_num_experts = (torch.sum(self.expert_map != -1)
                              if self.expert_map is not None else num_experts)
         if self.dynamic_eplb:
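For context on the new log line: self.expert_map maps each global expert id to a local id, with -1 for experts not hosted on this rank, and get_compressed_expert_map (imported from vLLM in the first hunk) renders only the hosted entries. A hedged sketch of the idea, not vLLM's actual implementation:

    import torch

    def compressed_expert_map(expert_map: torch.Tensor) -> str:
        # expert_map[global_id] == local_id, or -1 if not hosted here.
        hosted = (expert_map != -1).nonzero(as_tuple=True)[0]
        return ", ".join(
            f"{expert_map[g].item()}->{g.item()}" for g in hosted)

    # Example: a rank hosting global experts 4..7 as local experts 0..3.
    emap = torch.full((16,), -1, dtype=torch.int64)
    emap[4:8] = torch.arange(4)
    print(compressed_expert_map(emap))  # -> 0->4, 1->5, 2->6, 3->7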