Reapply "[MoE] [Refactor] Remove manual memory cleanup (#3365)" (#3483) (#3512)

### What this PR does / why we need it?
1. Replace manual memory cleanup with parameter passing (a rough sketch follows this list).
2. FusedMoEPrepareAndFinalizeWithMC2 now inherits from All2All to avoid duplicated code.
3. Fix the MC2 bug introduced in https://github.com/vllm-project/vllm-ascend/pull/3365.
4. Unify the aclgraph & eager paths in W8A8_dynamic.
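A rough sketch of what items 1–2 describe; the class bodies and method signatures below are simplified assumptions for illustration, not the actual vllm-ascend implementation:

```python
# Hypothetical, simplified sketch -- not the actual vllm-ascend classes.
class FusedMoEPrepareAndFinalizeWithAll2All:
    def prepare(self, hidden_states, mc2_mask=None):
        # Common dispatch logic shared by both communication backends; the
        # mask arrives as an argument, so nothing needs manual cleanup later.
        return hidden_states, mc2_mask


class FusedMoEPrepareAndFinalizeWithMC2(FusedMoEPrepareAndFinalizeWithAll2All):
    def prepare(self, hidden_states, mc2_mask=None):
        # MC2 overrides only what actually differs and reuses the All2All
        # logic via inheritance, instead of duplicating it and then doing
        # manual cleanup such as `del self.mc2_mask`.
        return super().prepare(hidden_states, mc2_mask)
```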
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
e2e tests & unit tests

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Author: weichen
Date: 2025-10-22 11:41:30 +08:00
Committed by: GitHub
Parent: 6ef62cb427
Commit: 2f1b9a7a64
13 changed files with 608 additions and 522 deletions


@@ -227,22 +227,6 @@ class AscendW8A8DynamicFusedMoEMethod:
         if enable_force_load_balance:
             topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
-        if self.use_aclgraph:
-            moe_comm_method = get_forward_context().moe_comm_method
-            return moe_comm_method.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                use_int8_w8a8=True,
-                w1_scale=layer.w13_weight_scale,
-                w2_scale=layer.w2_weight_scale,
-                expert_map=expert_map,
-                dynamic_eplb=self.dynamic_eplb,
-                log2phy=log2phy,
-                global_redundant_expert_num=global_redundant_expert_num)
-
         topk_weights = topk_weights.to(x.dtype)
         moe_comm_method = get_forward_context().moe_comm_method
@@ -261,7 +245,8 @@ class AscendW8A8DynamicFusedMoEMethod:
             shared_experts=shared_experts,
             quantized_x_for_share=quantized_x_for_share,
             dynamic_scale_for_share=dynamic_scale_for_share,
-            dynamic_eplb=self.dynamic_eplb)
+            dynamic_eplb=self.dynamic_eplb,
+            mc2_mask=kwargs.get("mc2_mask", None))
 
     def process_weights_after_loading(self, layer):
         if self.transpose_weight:
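As an aside, a minimal, self-contained sketch of the kwargs-based parameter passing that the second hunk introduces; everything here other than mc2_mask and the fused_experts / apply names taken from the diff is an illustrative assumption:

```python
# Minimal, runnable sketch of the kwargs-based parameter passing; names other
# than mc2_mask, fused_experts and apply are illustrative assumptions.
from typing import Any, Optional


def fused_experts(hidden_states: Any, mc2_mask: Optional[Any] = None) -> Any:
    # The mask is consumed for this call only; no attribute is left on a
    # layer object that would need manual cleanup afterwards.
    if mc2_mask is not None:
        pass  # e.g. restrict the MC2 dispatch to the masked tokens
    return hidden_states


def apply(hidden_states: Any, **kwargs: Any) -> Any:
    # Mirrors `mc2_mask=kwargs.get("mc2_mask", None)` from the hunk above.
    return fused_experts(hidden_states, mc2_mask=kwargs.get("mc2_mask", None))


if __name__ == "__main__":
    print(apply("tokens"))                # no mask supplied
    print(apply("tokens", mc2_mask=[1]))  # mask threaded through **kwargs
```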