[Perf] move quant before allgather in Allgather EP (#3420)
### What this PR does / why we need it?
Move quantization before the allgather in Allgather EP, so the quantized activations (rather than the full-precision ones) are communicated. Relies on
https://github.com/vllm-project/vllm-ascend/pull/3334.

DeepSeek R1 W8A8 performance on A2 with
`HCCL_ALGO="level0:NA;level1:pipeline"`:

| Seq length | Mean TTFT (ms) main | Mean TTFT (ms) this PR |
|----------|----------|----------|
| 4k | 375.21 | 364.99 |
| 16k | 1465.23 | 1421.75 |
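The win comes from communicating int8 data plus a small per-token scale instead of 2-byte activations. A minimal sketch of per-token symmetric int8 quantization and the payload arithmetic (the function name and shapes here are illustrative, not the actual vllm-ascend kernel):

```python
import numpy as np

def per_token_quant_int8(x: np.ndarray):
    """Quantize each token (row) of x to int8 with its own fp32 scale."""
    scale = np.abs(x).max(axis=-1, keepdims=True) / 127.0
    scale = np.maximum(scale, 1e-8)  # guard against all-zero rows
    q = np.clip(np.round(x / scale), -128, 127).astype(np.int8)
    return q, scale.astype(np.float32)

rng = np.random.default_rng(0)
tokens = rng.standard_normal((4, 7168)).astype(np.float32)  # hidden size is illustrative
q, scale = per_token_quant_int8(tokens)

# Per-token bytes on the wire: int8 data + one fp32 scale vs. bf16 data.
bf16_bytes = tokens.shape[-1] * 2          # 14336
int8_bytes = tokens.shape[-1] * 1 + 4      # 7172
```

Quantizing before the allgather roughly halves the communicated volume, while the dequantization error stays bounded by half a quantization step per element.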
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
---------
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
```diff
@@ -289,7 +289,7 @@ class AscendFusedMoE(FusedMoE):
         self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

-        setup_moe_comm_method(self.moe_config)
+        setup_moe_comm_method(self.moe_config, self.quant_method)

     def update_expert_map(self, new_expert_map):
         self.expert_map = new_expert_map
@@ -336,11 +336,17 @@ class AscendFusedMoE(FusedMoE):
             replace_allreduce=forward_context.sp_enabled,
             enable_shared_expert_dp=self.enable_shared_expert_dp)

+        if isinstance(hidden_states, tuple):
+            hidden_states, pertoken_scale = hidden_states
+        else:
+            pertoken_scale = None
+
         # Matrix multiply.
         final_hidden_states = self.quant_method.apply(
             layer=self,
             x=hidden_states,
             router_logits=router_logits,
+            pertoken_scale=pertoken_scale,
             top_k=self.top_k,
             renormalize=self.renormalize,
             use_grouped_topk=self.use_grouped_topk,
```
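The second hunk handles the new contract: after moving quantization earlier, the communication step may now hand back a `(hidden_states, pertoken_scale)` tuple instead of a plain tensor, and downstream code must accept both. A standalone sketch of that unpacking pattern (the helper name is hypothetical):

```python
def unpack_prepared(hidden_states):
    """Accept either a plain tensor or a (tensor, per-token scale) tuple.

    Mirrors the isinstance check added in the diff: when quantization ran
    before the allgather, the scale travels alongside the data; otherwise
    pertoken_scale is None and the quant method quantizes internally.
    """
    if isinstance(hidden_states, tuple):
        hidden_states, pertoken_scale = hidden_states
    else:
        pertoken_scale = None
    return hidden_states, pertoken_scale
```

Keeping `None` as the fallback lets `quant_method.apply` distinguish "already quantized upstream" from "quantize here", so unquantized comm paths keep working unchanged.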