From 2fa39451121d4ab002a8621fe4c4226abe0db378 Mon Sep 17 00:00:00 2001
From: offline893 <158537145+offline893@users.noreply.github.com>
Date: Tue, 2 Dec 2025 09:20:49 +0800
Subject: [PATCH] [Bugfix]Fix eplb enable when using mtp float weights. (#4571)

### What this PR does / why we need it?
Fix enabling EPLB when using MTP with float weights. This workaround will
be removed once EPLB supports MTP and float weights.

### How was this patch tested?
Tested with DeepSeek-V3 + MTP + EPLB on A3.

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

---------

Signed-off-by: offline0806 <3337230449@qq.com>
Signed-off-by: offline893 <158537145+offline893@users.noreply.github.com>
Co-authored-by: offline0806 <3337230449@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm_ascend/ops/fused_moe/fused_moe.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 3b780268..945ea197 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -184,6 +184,9 @@ class AscendFusedMoE(FusedMoE):
         # init moe.
         self.local_num_experts, self.expert_map, _ = determine_expert_map(
             self.ep_size, self.ep_rank, self.global_num_experts)
+        # TODO: Temporary flag to indicate if static EPLB is enabled. This is a
+        # workaround to bypass a quantization check that fails with float weights.
+        init_eplb_enable = False
         # static eplb initializing with expert_map_path
         if self.expert_map_path and os.path.exists(
                 self.expert_map_path) and os.access(self.expert_map_path,
@@ -200,6 +203,7 @@ class AscendFusedMoE(FusedMoE):
                         self.moe_instance_id, self.ep_rank))
                 self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
                     self.moe_instance_id, self.ep_rank).npu()
+                init_eplb_enable = True
             except Exception as e:
                 logger.warning(
                     f"Init expert map of mtp/eagle when using sample.{e}")
@@ -225,10 +229,10 @@ class AscendFusedMoE(FusedMoE):
             self.moe_load = torch.zeros(local_num_experts,
                                         dtype=torch.int64).npu()
 
-        eplb_enable = self.dynamic_eplb or (self.expert_map_path is not None)
-        if eplb_enable and (not hasattr(self.quant_method, "quant_method") or
-                            not isinstance(self.quant_method.quant_method,
-                                           AscendW8A8DynamicFusedMoEMethod)):
+        if init_eplb_enable and (
+                not hasattr(self.quant_method, "quant_method")
+                or not isinstance(self.quant_method.quant_method,
+                                  AscendW8A8DynamicFusedMoEMethod)):
             raise ValueError("Eplb supports only w8a8_dynamic quantization.")
 
         self.moe_config.num_experts = self.global_num_experts