[Perf] move quant before allgather in Allgather EP (#3420)
### What this PR does / why we need it?
Move quantization before the all-gather in the Allgather EP path. Relies on
https://github.com/vllm-project/vllm-ascend/pull/3334

DeepSeek R1 W8A8 performance on A2 with
`HCCL_ALGO="level0:NA;level1:pipeline"` (a sketch of the idea follows the table):
| Seq length | Mean TTFT (ms) main | Mean TTFT (ms) this PR |
|----------|----------|----------|
| 4k | 375.21 | 364.99 |
| 16k | 1465.23 | 1421.75 |
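The intent is for each rank to run per-token dynamic quantization locally before the all-gather, so the collective moves int8 activations plus one scale per token instead of bf16 activations that every rank re-quantizes afterwards. A minimal sketch of that data flow, using plain `torch.distributed` rather than the vllm-ascend collectives (the helper names here are illustrative, not from this repository):

```python
import torch
import torch.distributed as dist


def dynamic_per_token_quant(x: torch.Tensor):
    """Per-token symmetric int8 quantization: one scale per token (row)."""
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    x_q = torch.round(x / scale).clamp(-128, 127).to(torch.int8)
    return x_q, scale.squeeze(-1)


def gather_quant_first(hidden_states: torch.Tensor, group=None):
    """Quantize locally, then all-gather the int8 tokens and their scales.

    The collective now carries 1 byte per element instead of 2 (bf16),
    plus one scale per token, and no rank has to quantize the gathered
    activations again.
    """
    x_q, scale = dynamic_per_token_quant(hidden_states)
    world_size = dist.get_world_size(group)
    q_out = [torch.empty_like(x_q) for _ in range(world_size)]
    s_out = [torch.empty_like(scale) for _ in range(world_size)]
    dist.all_gather(q_out, x_q, group=group)
    dist.all_gather(s_out, scale, group=group)
    return torch.cat(q_out, dim=0), torch.cat(s_out, dim=0)
```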
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
---------
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
@@ -57,20 +57,23 @@ class MoETokenDispatcher(ABC):
         return get_ep_group().world_size
 
     @abstractmethod
-    def token_dispatch(self,
-                       hidden_states: torch.Tensor,
-                       topk_weights: torch.Tensor,
-                       topk_ids: torch.Tensor,
-                       expert_map: Optional[torch.Tensor] = None,
-                       log2phy: Optional[torch.Tensor] = None,
-                       global_redundant_expert_num: int = 0,
-                       shared_experts: Optional[Any] = None,
-                       quantized_x_for_share: Optional[Any] = None,
-                       dynamic_scale_for_share: Optional[Any] = None,
-                       mc2_mask: Optional[torch.Tensor] = None,
-                       apply_router_weight_on_input: bool = False,
-                       with_quant: bool = False,
-                       dynamic_eplb: bool = False):
+    def token_dispatch(
+            self,
+            hidden_states: torch.Tensor,
+            topk_weights: torch.Tensor,
+            topk_ids: torch.Tensor,
+            expert_map: Optional[torch.Tensor] = None,
+            log2phy: Optional[torch.Tensor] = None,
+            global_redundant_expert_num: int = 0,
+            shared_experts: Optional[Any] = None,
+            quantized_x_for_share: Optional[Any] = None,
+            dynamic_scale_for_share: Optional[Any] = None,
+            mc2_mask: Optional[torch.Tensor] = None,
+            apply_router_weight_on_input: bool = False,
+            with_quant: bool = False,
+            dynamic_eplb: bool = False,
+            pertoken_scale: Optional[torch.Tensor] = None,
+    ):
         raise NotImplementedError("Dispatch function not implemented.")
 
     @abstractmethod
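The abstract signature above gains an optional `pertoken_scale` argument so a caller that has already quantized the activations can hand the per-token scales down to the dispatcher instead of having quantization happen after the gather. A hedged sketch of such a caller (hypothetical; `dispatch_prequantized` and its inline quantization are illustrations, not the vllm-ascend quant method):

```python
from typing import Any, Optional

import torch


def dispatch_prequantized(dispatcher: Any,
                          hidden_states: torch.Tensor,
                          topk_weights: torch.Tensor,
                          topk_ids: torch.Tensor,
                          expert_map: Optional[torch.Tensor] = None):
    """Quantize once up front, then pass activations and scales to token_dispatch."""
    # Per-token symmetric int8 quantization (illustrative, not the vllm-ascend kernel).
    scale = hidden_states.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    hidden_q = torch.round(hidden_states / scale).clamp(-128, 127).to(torch.int8)
    return dispatcher.token_dispatch(
        hidden_q,
        topk_weights,
        topk_ids,
        expert_map=expert_map,
        with_quant=True,
        pertoken_scale=scale.squeeze(-1),  # new argument introduced by this PR
    )
```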
@@ -170,7 +173,8 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
                        mc2_mask: Optional[torch.Tensor] = None,
                        apply_router_weight_on_input: bool = False,
                        with_quant: bool = False,
-                       dynamic_eplb: bool = False):
+                       dynamic_eplb: bool = False,
+                       pertoken_scale: Optional[torch.Tensor] = None):
         self.with_quant = with_quant
 
         # Apply log2phy if needed
@@ -339,7 +343,8 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
                        mc2_mask: Optional[torch.Tensor] = None,
                        apply_router_weight_on_input: bool = False,
                        with_quant: bool = False,
-                       dynamic_eplb: bool = False):
+                       dynamic_eplb: bool = False,
+                       pertoken_scale: Optional[torch.Tensor] = None):
         self.with_quant = with_quant
         self.original_shape = hidden_states.shape
 
@@ -370,12 +375,14 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
             torch_npu.npu_moe_init_routing_v2(
                 hidden_states,
                 topk_ids,
+                scale=pertoken_scale,
                 active_num=num_tokens * self.top_k,
                 expert_num=global_num_experts,
                 expert_tokens_num_type=1,
                 expert_tokens_num_flag=True,
                 active_expert_range=[first_expert_idx, last_expert_idx],
-                quant_mode=1 if self.with_quant else -1,
+                quant_mode=1
+                if self.with_quant and pertoken_scale is None else -1,
             ))
         expert_tokens = expert_tokens.to(torch.int64)
         group_list_type = 1  # `count` mode
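In the hunk above, the pre-computed scales are now forwarded via `scale=pertoken_scale`, and `quant_mode` only requests in-routing quantization when no per-token scales were supplied; when the tokens were already quantized before the all-gather, routing leaves them as int8. A small sketch of the selection logic (it mirrors the diff; the interpretation of the two modes is an assumption drawn from this change, not from the `npu_moe_init_routing_v2` documentation):

```python
from typing import Optional

import torch


def pick_quant_mode(with_quant: bool,
                    pertoken_scale: Optional[torch.Tensor]) -> int:
    """Mirror of the updated quant_mode selection in token_dispatch.

    -1: skip quantization inside routing; activations arrive already
        int8 and their scales come in through `scale=pertoken_scale`.
     1: let the routing op quantize dynamically (previous behaviour
        when with_quant is set but no scales were pre-computed).
    """
    return 1 if with_quant and pertoken_scale is None else -1
```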
@@ -430,7 +437,8 @@ class TokenDispatcherWithMoge(MoETokenDispatcher):
                        mc2_mask: Optional[torch.Tensor] = None,
                        apply_router_weight_on_input: bool = False,
                        with_quant: bool = False,
-                       dynamic_eplb: bool = False):
+                       dynamic_eplb: bool = False,
+                       pertoken_scale: Optional[torch.Tensor] = None):
         self.bsz, _ = hidden_states.shape
         flatten_topk_ids = topk_ids.view(-1)
         self.sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
@@ -518,7 +526,8 @@ class TokenDispatcherWithAll2AllV(MoETokenDispatcher):
                        mc2_mask: Optional[torch.Tensor] = None,
                        apply_router_weight_on_input: bool = False,
                        with_quant: bool = False,
-                       dynamic_eplb: bool = False):
+                       dynamic_eplb: bool = False,
+                       pertoken_scale: Optional[torch.Tensor] = None):
         self.with_quant = with_quant
         self.hidden_shape = hidden_states.shape
 