[Perf][MoE] Improve MoE multistream parallel performace. (#1891)
This PR designs the shared expert multi-stream parallelism of
w8a8-dynamic-quantized MoE stage in more detail to achieve better
performance.
- vLLM version: v0.10.0
- vLLM main:
2cc571199b
Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
@@ -1334,6 +1334,21 @@ class AscendFusedMoE(FusedMoE):
|
||||
forward_context = get_forward_context()
|
||||
fused_moe_state = forward_context.fused_moe_state
|
||||
mc2_mask = forward_context.mc2_mask
|
||||
# For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel.
|
||||
quantized_x_for_share, dynamic_scale_for_share = None, None
|
||||
from vllm_ascend.quantization.w8a8_dynamic import \
|
||||
AscendW8A8DynamicFusedMoEMethod
|
||||
if self.enable_multistream_moe:
|
||||
if not self.rm_router_logits:
|
||||
router_logits, _ = gate(hidden_states)
|
||||
if hasattr(self.quant_method, "quant_method") and \
|
||||
isinstance(self.quant_method.quant_method,
|
||||
AscendW8A8DynamicFusedMoEMethod
|
||||
) and fused_moe_state == FusedMoEState.MC2:
|
||||
with npu_stream_switch("moe_secondary", 0):
|
||||
quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant(
|
||||
hidden_states)
|
||||
|
||||
if shared_experts:
|
||||
if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
|
||||
# When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
|
||||
@@ -1419,6 +1434,8 @@ class AscendFusedMoE(FusedMoE):
|
||||
shared_experts=shared_experts if self.torchair_graph_enabled
|
||||
and self.enable_multistream_moe and not is_prefill else None,
|
||||
mc2_mask=mc2_mask,
|
||||
quantized_x_for_share=quantized_x_for_share,
|
||||
dynamic_scale_for_share=dynamic_scale_for_share,
|
||||
)
|
||||
|
||||
if shared_experts:
|
||||
|
||||
Reference in New Issue
Block a user