diff --git a/vllm_ascend/eplb/core/eplb_utils.py b/vllm_ascend/eplb/core/eplb_utils.py
index db5e31b5..32f7d0be 100644
--- a/vllm_ascend/eplb/core/eplb_utils.py
+++ b/vllm_ascend/eplb/core/eplb_utils.py
@@ -22,6 +22,7 @@ from collections import defaultdict
 import numpy as np
 import torch
 from vllm.logger import logger
+from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
 
 
 def expert_file_to_tensor(expert_map_path, layer_id):
@@ -60,6 +61,11 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
     global_placement = None
     eplb_enable = eplb_config.dynamic_eplb
     n_redundant = eplb_config.num_redundant_experts if eplb_enable else 0
+
+    if ep_size == 1:
+        assert not eplb_enable, "EPLB must be used with expert parallelism."
+        return None, None, None, n_redundant
+
     if expert_map_path:
         if not (os.path.exists(expert_map_path) and os.access(expert_map_path, os.R_OK)):
             raise ValueError("Invalid EPLB path")
@@ -71,13 +77,13 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
                 raise ValueError("Eplb supports only w8a8_dynamic quantization.")
         else:
             eplb_enable = False
+    elif not eplb_enable:
+        _, expert_map, _ = determine_expert_map(ep_size, moe_config.ep_rank, n_experts)
+        return None, expert_map, None, 0
 
     if global_placement is None:
         global_placement = generate_global_placement(n_experts, ep_size, n_redundant)
-    if ep_size == 1:
-        assert not eplb_enable, "EPLB must used in expert parallelism."
-        return None, None, None, n_redundant
 
     global_expert_map = []
     for rankid in range(ep_size):
         expert_map = torch.full((n_experts,), -1, dtype=torch.int32)
@@ -85,7 +91,7 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
         expert_map[local_placement] = torch.arange(local_placement.shape[0], dtype=torch.int32)
         global_expert_map.append(expert_map)
         if rankid == moe_config.ep_rank:
-            local_expert_map = expert_map.npu()
+            local_expert_map = expert_map
 
     log2phy = generate_log2phy_map(global_expert_map, moe_config.ep_rank).npu() if eplb_enable else None
     return torch.stack(global_expert_map), local_expert_map, log2phy, n_redundant
diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 79ba6623..d5df0ad8 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -286,9 +286,7 @@ class AscendFusedMoE(FusedMoE):
         )
         self.global_num_experts = num_experts + self.global_redundant_expert_num
         self.dynamic_eplb = eplb_config.dynamic_eplb and (self.log2phy is not None)
-        self.local_num_experts = (
-            torch.sum(self._expert_map != -1).item() if self._expert_map is not None else self.global_num_experts
-        )
+        self.local_num_experts = self.global_num_experts // self.ep_size
         if self._expert_map is not None:
             logger.info_once(
                 "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"