[Feat]support dynamic quantization in allgather (#2841)

### What this PR does / why we need it?
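[Feat] Support dynamic quantization in allgather: when `self.with_quant` is set, `TokenDispatcherWithAllGather` calls `torch_npu.npu_moe_init_routing_v2` with `quant_mode=1` (instead of `-1`) and forwards the fourth return value, `pertoken_scale`, which was previously discarded, as `dynamic_scale` in the dispatch output; without quantization `dynamic_scale` is `None`.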
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: main
- vLLM main: 5931b7e5d9

Signed-off-by: withHades <244036962@qq.com>
Signed-off-by: WithHades <244036962@qq.com>
This commit is contained in:
无脸男
2025-09-11 18:47:20 +08:00
committed by GitHub
parent 07c58669fd
commit c3c2221503
4 changed files with 112 additions and 4 deletions

View File

@@ -367,7 +367,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
last_expert_idx = self.num_experts_local
global_num_experts = self.num_experts_local
sorted_hidden_states, self.expanded_row_idx, expert_tokens, _ = (
sorted_hidden_states, self.expanded_row_idx, expert_tokens, pertoken_scale = (
torch_npu.npu_moe_init_routing_v2(
hidden_states,
topk_ids,
@@ -376,7 +376,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
expert_tokens_num_type=1,
expert_tokens_num_flag=True,
active_expert_range=[first_expert_idx, last_expert_idx],
quant_mode=-1,
quant_mode=1 if self.with_quant else -1,
))
expert_tokens = expert_tokens.to(torch.int64)
group_list_type = 1 # `count` mode
@@ -384,6 +384,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
"group_list_type": group_list_type,
"hidden_states": sorted_hidden_states,
"group_list": expert_tokens,
"dynamic_scale": pertoken_scale if self.with_quant else None,
}
def token_combine(self,