[Perf] move quant before allgather in Allgather EP (#3420)

### What this PR does / why we need it? move quant before allgather in Allgather EP, rely on https://github.com/vllm-project/vllm-ascend/pull/3334 Deepseek R1 W8A8 performance on A2 with `HCCL_ALGO="level0:NA;level1:pipeline"`: | Seq length | Mean TTFT (ms) main | Mean TTFT (ms) this PR | |----------|----------|----------| | 4k | 375.21 | 364.99 | | 16k | 1465.23 | 1421.75 | ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0 - vLLM main: 83f478bb19 --------- Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2025-11-04 16:49:58 +08:00
parent 44b58b8665
commit bedf223771
10 changed files with 160 additions and 66 deletions
--- a/vllm_ascend/quantization/w4a8_dynamic.py
+++ b/vllm_ascend/quantization/w4a8_dynamic.py
@@ -386,7 +386,6 @@ class AscendW4A8DynamicFusedMoEMethod:
            w2_scale_bias=layer.w2_scale_bias,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
-            use_int4_w4a8=True,
            expert_map=expert_map,
            log2phy=log2phy,
            global_redundant_expert_num=global_redundant_expert_num,
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -143,6 +143,7 @@ class AscendW8A8DynamicFusedMoEMethod:
                and not ascend_config.torchair_graph_config.enabled)

        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
+        self.in_dtype = vllm_config.model_config.dtype

        try:
            device_group = get_mc2_group().device_group
@@ -218,6 +219,7 @@ class AscendW8A8DynamicFusedMoEMethod:
        shared_experts: Optional[Any] = None,
        quantized_x_for_share: Optional[Any] = None,
        dynamic_scale_for_share: Optional[Any] = None,
+        pertoken_scale: Optional[Any] = None,
        **kwargs,
    ) -> torch.Tensor:
        assert router_logits.shape[
@@ -242,18 +244,18 @@ class AscendW8A8DynamicFusedMoEMethod:
        if enable_force_load_balance:
            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)

-        topk_weights = topk_weights.to(x.dtype)
+        topk_weights = topk_weights.to(self.in_dtype)

        moe_comm_method = get_forward_context().moe_comm_method
        return moe_comm_method.fused_experts(
            hidden_states=x,
+            pertoken_scale=pertoken_scale,
            w1=layer.w13_weight,
            w1_scale=layer.w13_weight_scale_fp32,
            w2=layer.w2_weight,
            w2_scale=layer.w2_weight_scale,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
-            use_int8_w8a8=True,
            expert_map=expert_map,
            log2phy=log2phy,
            global_redundant_expert_num=global_redundant_expert_num,