[Feat] Support dynamic quantization in allgather (#2841)

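### What this PR does / why we need it?

[Feat] Support dynamic quantization in allgather. In `vllm_ascend/ops/moe/token_dispatcher.py`, when `self.with_quant` is set, `TokenDispatcherWithAllGather` calls `torch_npu.npu_moe_init_routing_v2` with `quant_mode=1` (instead of `-1`) and returns the resulting per-token scale as `dynamic_scale`; otherwise `dynamic_scale` is `None`.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: main
- vLLM main: 5931b7e5d9

Signed-off-by: withHades <244036962@qq.com>
Signed-off-by: WithHades <244036962@qq.com>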
2025-09-11 18:47:20 +08:00
parent 07c58669fd
commit c3c2221503
4 changed files with 112 additions and 4 deletions
--- a/vllm_ascend/ops/moe/token_dispatcher.py
+++ b/vllm_ascend/ops/moe/token_dispatcher.py
@@ -367,7 +367,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
            last_expert_idx = self.num_experts_local
            global_num_experts = self.num_experts_local

-        sorted_hidden_states, self.expanded_row_idx, expert_tokens, _ = (
+        sorted_hidden_states, self.expanded_row_idx, expert_tokens, pertoken_scale = (
            torch_npu.npu_moe_init_routing_v2(
                hidden_states,
                topk_ids,
@@ -376,7 +376,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
                expert_tokens_num_type=1,
                expert_tokens_num_flag=True,
                active_expert_range=[first_expert_idx, last_expert_idx],
-                quant_mode=-1,
+                quant_mode=1 if self.with_quant else -1,
            ))
        expert_tokens = expert_tokens.to(torch.int64)
        group_list_type = 1  # `count` mode
@@ -384,6 +384,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
            "group_list_type": group_list_type,
            "hidden_states": sorted_hidden_states,
            "group_list": expert_tokens,
+            "dynamic_scale": pertoken_scale if self.with_quant else None,
        }

    def token_combine(self,