[Perf][MoE] Improve MoE multistream parallel performace. (#1891)

This PR designs the shared expert multi-stream parallelism of w8a8-dynamic-quantized MoE stage in more detail to achieve better performance. - vLLM version: v0.10.0 - vLLM main: 2cc571199b Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-07-29 23:53:19 +08:00
parent 4df8e0027c
commit b6a7f07c70
3 changed files with 124 additions and 14 deletions
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1334,6 +1334,21 @@ class AscendFusedMoE(FusedMoE):
        forward_context = get_forward_context()
        fused_moe_state = forward_context.fused_moe_state
        mc2_mask = forward_context.mc2_mask
+        # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel.
+        quantized_x_for_share, dynamic_scale_for_share = None, None
+        from vllm_ascend.quantization.w8a8_dynamic import \
+            AscendW8A8DynamicFusedMoEMethod
+        if self.enable_multistream_moe:
+            if not self.rm_router_logits:
+                router_logits, _ = gate(hidden_states)
+            if hasattr(self.quant_method, "quant_method") and \
+               isinstance(self.quant_method.quant_method,
+                          AscendW8A8DynamicFusedMoEMethod
+                          ) and fused_moe_state == FusedMoEState.MC2:
+                with npu_stream_switch("moe_secondary", 0):
+                    quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant(
+                        hidden_states)
+
        if shared_experts:
            if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
                # When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
@@ -1419,6 +1434,8 @@ class AscendFusedMoE(FusedMoE):
            shared_experts=shared_experts if self.torchair_graph_enabled
            and self.enable_multistream_moe and not is_prefill else None,
            mc2_mask=mc2_mask,
+            quantized_x_for_share=quantized_x_for_share,
+            dynamic_scale_for_share=dynamic_scale_for_share,
        )

        if shared_experts: