[Perf] move quant before allgather in Allgather EP (#3420)

### What this PR does / why we need it?
Move quantization before the all-gather in AllGather EP. This change relies on
https://github.com/vllm-project/vllm-ascend/pull/3334

DeepSeek R1 W8A8 performance on A2 with
`HCCL_ALGO="level0:NA;level1:pipeline"`:
| Seq length | Mean TTFT (ms), main | Mean TTFT (ms), this PR |
|----------|----------|----------|
| 4k   |  375.21  | 364.99   |
| 16k  | 1465.23   | 1421.75  |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0
- vLLM main:
83f478bb19

---------

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
realliujiaxu
2025-11-04 16:49:58 +08:00
committed by GitHub
parent 44b58b8665
commit bedf223771
10 changed files with 160 additions and 66 deletions

View File

@@ -386,7 +386,6 @@ class AscendW4A8DynamicFusedMoEMethod:
w2_scale_bias=layer.w2_scale_bias,
topk_weights=topk_weights,
topk_ids=topk_ids,
use_int4_w4a8=True,
expert_map=expert_map,
log2phy=log2phy,
global_redundant_expert_num=global_redundant_expert_num,

View File

@@ -143,6 +143,7 @@ class AscendW8A8DynamicFusedMoEMethod:
and not ascend_config.torchair_graph_config.enabled)
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
self.in_dtype = vllm_config.model_config.dtype
try:
device_group = get_mc2_group().device_group
@@ -218,6 +219,7 @@ class AscendW8A8DynamicFusedMoEMethod:
shared_experts: Optional[Any] = None,
quantized_x_for_share: Optional[Any] = None,
dynamic_scale_for_share: Optional[Any] = None,
pertoken_scale: Optional[Any] = None,
**kwargs,
) -> torch.Tensor:
assert router_logits.shape[
@@ -242,18 +244,18 @@ class AscendW8A8DynamicFusedMoEMethod:
if enable_force_load_balance:
topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
topk_weights = topk_weights.to(x.dtype)
topk_weights = topk_weights.to(self.in_dtype)
moe_comm_method = get_forward_context().moe_comm_method
return moe_comm_method.fused_experts(
hidden_states=x,
pertoken_scale=pertoken_scale,
w1=layer.w13_weight,
w1_scale=layer.w13_weight_scale_fp32,
w2=layer.w2_weight,
w2_scale=layer.w2_weight_scale,
topk_weights=topk_weights,
topk_ids=topk_ids,
use_int8_w8a8=True,
expert_map=expert_map,
log2phy=log2phy,
global_redundant_expert_num=global_redundant_expert_num,