Reapply "[MoE] [Refactor] Remove manual memory cleanup (#3365)" (#3483) (#3512)

### What this PR does / why we need it?
1. Replace manual memory cleanup with parameter passing (a rough sketch follows this list).
2. FusedMoEPrepareAndFinalizeWithMC2 now inherits from All2All to avoid duplicated code.
3. Fix the MC2 bug introduced in https://github.com/vllm-project/vllm-ascend/pull/3365.
4. Unify the aclgraph & eager paths in W8A8_dynamic.
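A rough sketch of what items 1–2 describe; the class bodies and method signatures below are simplified assumptions for illustration, not the actual vllm-ascend implementation:

```python
# Hypothetical, simplified sketch -- not the actual vllm-ascend classes.
class FusedMoEPrepareAndFinalizeWithAll2All:
    def prepare(self, hidden_states, mc2_mask=None):
        # Common dispatch logic shared by both communication backends; the
        # mask arrives as an argument, so nothing needs manual cleanup later.
        return hidden_states, mc2_mask


class FusedMoEPrepareAndFinalizeWithMC2(FusedMoEPrepareAndFinalizeWithAll2All):
    def prepare(self, hidden_states, mc2_mask=None):
        # MC2 overrides only what actually differs and reuses the All2All
        # logic via inheritance, instead of duplicating it and then doing
        # manual cleanup such as `del self.mc2_mask`.
        return super().prepare(hidden_states, mc2_mask)
```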
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
e2e tests & unit tests

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Author: weichen
Date: 2025-10-22 11:41:30 +08:00
Committed by: GitHub
Parent: 6ef62cb427
Commit: 2f1b9a7a64
13 changed files with 608 additions and 522 deletions


@@ -227,22 +227,6 @@ class AscendW8A8DynamicFusedMoEMethod:
         if enable_force_load_balance:
             topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
-        if self.use_aclgraph:
-            moe_comm_method = get_forward_context().moe_comm_method
-            return moe_comm_method.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                use_int8_w8a8=True,
-                w1_scale=layer.w13_weight_scale,
-                w2_scale=layer.w2_weight_scale,
-                expert_map=expert_map,
-                dynamic_eplb=self.dynamic_eplb,
-                log2phy=log2phy,
-                global_redundant_expert_num=global_redundant_expert_num)
-
         topk_weights = topk_weights.to(x.dtype)
         moe_comm_method = get_forward_context().moe_comm_method
@@ -261,7 +245,8 @@ class AscendW8A8DynamicFusedMoEMethod:
             shared_experts=shared_experts,
             quantized_x_for_share=quantized_x_for_share,
             dynamic_scale_for_share=dynamic_scale_for_share,
-            dynamic_eplb=self.dynamic_eplb)
+            dynamic_eplb=self.dynamic_eplb,
+            mc2_mask=kwargs.get("mc2_mask", None))
 
     def process_weights_after_loading(self, layer):
         if self.transpose_weight:
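As an aside, a minimal, self-contained sketch of the kwargs-based parameter passing that the second hunk introduces; everything here other than mc2_mask and the fused_experts / apply names taken from the diff is an illustrative assumption:

```python
# Minimal, runnable sketch of the kwargs-based parameter passing; names other
# than mc2_mask, fused_experts and apply are illustrative assumptions.
from typing import Any, Optional


def fused_experts(hidden_states: Any, mc2_mask: Optional[Any] = None) -> Any:
    # The mask is consumed for this call only; no attribute is left on a
    # layer object that would need manual cleanup afterwards.
    if mc2_mask is not None:
        pass  # e.g. restrict the MC2 dispatch to the masked tokens
    return hidden_states


def apply(hidden_states: Any, **kwargs: Any) -> Any:
    # Mirrors `mc2_mask=kwargs.get("mc2_mask", None)` from the hunk above.
    return fused_experts(hidden_states, mc2_mask=kwargs.get("mc2_mask", None))


if __name__ == "__main__":
    print(apply("tokens"))                # no mask supplied
    print(apply("tokens", mc2_mask=[1]))  # mask threaded through **kwargs
```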