[Readme] EPLB Support Scenarios (#4315)
### What this PR does / why we need it?
Add information on the scope of EPLB support.

---------

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
@@ -12,6 +12,13 @@ Expert balancing for MoE models in LLM serving is essential for optimal performa
|
||||
- Adaptive Scaling: Automatically adjusts to workload fluctuations while maintaining stable performance.
|
||||
- Fault Tolerance: Redundant expert placement ensures system resilience during hardware failures.
|
||||
|
||||
## Support Scenarios
|
||||
|
||||
### Models:
|
||||
DeepseekV3/V3.1/R1, Qwen3-MOE
|
||||
### MoE Quantization Type:
|
||||
W8A8-dynamic
|
||||
|
||||
## How to Use EPLB
|
||||
|
||||
### Dynamic EPLB
|
||||
|
||||
@@ -38,6 +38,8 @@ from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
|
||||
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
|
||||
from vllm_ascend.ops.moe.experts_selector import select_experts
|
||||
from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
|
||||
from vllm_ascend.quantization.w8a8_dynamic import \
|
||||
AscendW8A8DynamicFusedMoEMethod
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p,
|
||||
is_enable_nz, npu_stream_switch,
|
||||
shared_expert_dp_enabled,
|
||||
@@ -247,6 +249,11 @@ class AscendFusedMoE(FusedMoE):
|
||||
self.moe_load = torch.zeros(local_num_experts,
|
||||
dtype=torch.int64).npu()
|
||||
|
||||
eplb_enable = self.dynamic_eplb or (self.expert_map_path is not None)
|
||||
if eplb_enable and (not isinstance(self.quant_method,
|
||||
AscendW8A8DynamicFusedMoEMethod)):
|
||||
raise ValueError("Eplb supports only w8a8_dynamic quantization.")
|
||||
|
||||
self.moe_config.num_experts = self.global_num_experts
|
||||
self.moe_config.num_local_experts = self.local_num_experts
|
||||
self.moe_config.original_num_experts = num_experts
|
||||
|
||||
Reference in New Issue
Block a user