From c94b38c82e9a055bfb0cf89da03f037301bba654 Mon Sep 17 00:00:00 2001 From: LI SHENGYONG <49200266+shenchuxiaofugui@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:25:39 +0800 Subject: [PATCH] [Readme] EPLB Support Scenarios (#4315) ### What this PR does / why we need it? Add information on the scope of EPLB support. --------- Signed-off-by: shenchuxiaofugui <1311027364@qq.com> --- .../source/user_guide/feature_guide/eplb_swift_balancer.md | 7 +++++++ vllm_ascend/ops/common_fused_moe.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/docs/source/user_guide/feature_guide/eplb_swift_balancer.md b/docs/source/user_guide/feature_guide/eplb_swift_balancer.md index 1b03ce7..fe24a30 100644 --- a/docs/source/user_guide/feature_guide/eplb_swift_balancer.md +++ b/docs/source/user_guide/feature_guide/eplb_swift_balancer.md @@ -12,6 +12,13 @@ Expert balancing for MoE models in LLM serving is essential for optimal performa - Adaptive Scaling: Automatically adjusts to workload fluctuations while maintaining stable performance. - Fault Tolerance: Redundant expert placement ensures system resilience during hardware failures. 
+## Support Scenarios + +### Models: +DeepseekV3/V3.1/R1, Qwen3-MoE +### MoE Quantization Type: +W8A8-dynamic + ## How to Use EPLB ### Dynamic EPLB diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py index aec0ffc..412569d 100644 --- a/vllm_ascend/ops/common_fused_moe.py +++ b/vllm_ascend/ops/common_fused_moe.py @@ -38,6 +38,8 @@ from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map, from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer from vllm_ascend.ops.moe.experts_selector import select_experts from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method +from vllm_ascend.quantization.w8a8_dynamic import \ + AscendW8A8DynamicFusedMoEMethod from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p, is_enable_nz, npu_stream_switch, shared_expert_dp_enabled, @@ -247,6 +249,11 @@ class AscendFusedMoE(FusedMoE): self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64).npu() + eplb_enable = self.dynamic_eplb or (self.expert_map_path is not None) + if eplb_enable and (not isinstance(self.quant_method, + AscendW8A8DynamicFusedMoEMethod)): + raise ValueError("Eplb supports only w8a8_dynamic quantization.") + self.moe_config.num_experts = self.global_num_experts self.moe_config.num_local_experts = self.local_num_experts self.moe_config.original_num_experts = num_experts