[Bugfix]Fix eplb enable when using mtp float weights. (#4576)

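### What this PR does / why we need it? Fix enabling EPLB when using MTP with float weights. The "Eplb supports only w8a8_dynamic quantization" check is now enforced only when the static expert map from `expert_map_path` has been loaded successfully, instead of whenever dynamic EPLB is on or `expert_map_path` is set. This is a temporary workaround and will be removed once EPLB supports MTP and float weights. ### How was this patch tested? DeepSeek-V3 + MTP + EPLB on A3. --------- Signed-off-by: offline0806 <3337230449@qq.com> Signed-off-by: offline893 <158537145+offline893@users.noreply.github.com> Co-authored-by: offline0806 <3337230449@qq.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>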
2025-12-05 21:15:32 +08:00
parent c4a11a745a
commit 350999c4ef
1 changed files with 9 additions and 4 deletions
--- a/vllm_ascend/ops/common_fused_moe.py
+++ b/vllm_ascend/ops/common_fused_moe.py
@@ -169,11 +169,15 @@ class AscendFusedMoE(FusedMoE):
        self.expert_map_path = ascend_config.expert_map_path
        self.global_redundant_expert_num = ascend_config.init_redundancy_expert
        self.global_num_experts = num_experts + self.global_redundant_expert_num
+        # TODO: Flag for static expert placement. This is a temporary workaround
+        # to allow dynamic EPLB with float weights by skipping quantization checks.
+        self.static_eplb_enabled = False
        if self.custom_routing_function is None and self.e_score_correction_bias is not None:
            vllm_config = get_current_vllm_config()
            self.e_score_correction_bias.data = self.e_score_correction_bias.data.to(
                dtype=vllm_config.model_config.dtype)
        # static eplb initializing with expert_map_path
+        init_eplb_enable = False
        if self.expert_map_path and os.path.exists(
                self.expert_map_path) and os.access(self.expert_map_path,
                                                    os.R_OK):
@@ -189,6 +193,7 @@ class AscendFusedMoE(FusedMoE):
                        self.moe_instance_id, self.ep_rank))
                self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
                    self.moe_instance_id, self.ep_rank).npu()
+                init_eplb_enable = True
            except Exception as e:
                logger.warning(
                    f"Init expert map of mtp/eagle when using sample.{e}")
@@ -219,10 +224,10 @@ class AscendFusedMoE(FusedMoE):
            self.moe_load = torch.zeros(local_num_experts,
                                        dtype=torch.int64).npu()

-        eplb_enable = self.dynamic_eplb or (self.expert_map_path is not None)
-        if eplb_enable and (not hasattr(self.quant_method, "quant_method") or
-                            not isinstance(self.quant_method.quant_method,
-                                           AscendW8A8DynamicFusedMoEMethod)):
+        if init_eplb_enable and (
+                not hasattr(self.quant_method, "quant_method")
+                or not isinstance(self.quant_method.quant_method,
+                                  AscendW8A8DynamicFusedMoEMethod)):
            raise ValueError("Eplb supports only w8a8_dynamic quantization.")

        self.moe_config.num_experts = self.global_num_experts