diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index 3b780268..945ea197 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -184,6 +184,9 @@ class AscendFusedMoE(FusedMoE): # init moe. self.local_num_experts, self.expert_map, _ = determine_expert_map( self.ep_size, self.ep_rank, self.global_num_experts) + # TODO: Temporary flag to indicate if static EPLB is enabled. This is a + # workaround to bypass a quantization check that fails with float weights. + init_eplb_enable = False # static eplb initializing with expert_map_path if self.expert_map_path and os.path.exists( self.expert_map_path) and os.access(self.expert_map_path, @@ -200,6 +203,7 @@ class AscendFusedMoE(FusedMoE): self.moe_instance_id, self.ep_rank)) self.log2phy = self.expert_load_balancer.get_rank_log2phy_map( self.moe_instance_id, self.ep_rank).npu() + init_eplb_enable = True except Exception as e: logger.warning( f"Init expert map of mtp/eagle when using sample.{e}") @@ -225,10 +229,10 @@ class AscendFusedMoE(FusedMoE): self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64).npu() - eplb_enable = self.dynamic_eplb or (self.expert_map_path is not None) - if eplb_enable and (not hasattr(self.quant_method, "quant_method") or - not isinstance(self.quant_method.quant_method, - AscendW8A8DynamicFusedMoEMethod)): + if init_eplb_enable and ( + not hasattr(self.quant_method, "quant_method") + or not isinstance(self.quant_method.quant_method, + AscendW8A8DynamicFusedMoEMethod)): raise ValueError("Eplb supports only w8a8_dynamic quantization.") self.moe_config.num_experts = self.global_num_experts