From 69509bcdd693dc6653db81648d4775af24ceb2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E8=84=B8=E7=94=B7?= <244036962@qq.com> Date: Fri, 26 Sep 2025 08:57:47 +0800 Subject: [PATCH] [bugfix] fix oom in aclgraph (#3158) ### What this PR does / why we need it? fix oom in aclgraph. 1. In the current token dispatch implementation, tensors are mounted on class instances to facilitate parameter passing between different methods. This approach prevents automatic recycling of these tensors. In some cases, it may lead to out-of-memory error. To address this issue, we manually set these tensors to None to release corresponding memory. 2. The `profile_run` method is designed to accurately estimate the maximum NPU memory usage during vLLM inference. However, in certain scenarios, MoE models perform inference via MC2, which includes communication and consumes additional NPU memory. This leads to inaccurate estimation by the profile run. We address this by actively triggering the MC2 during profile run for initialization. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/52d0cb845866869d587fc013a7c59e60a86ebcf2 Signed-off-by: WithHades <244036962@qq.com> --- vllm_ascend/ops/moe/token_dispatcher.py | 24 ++++++++++++++++++++++++ vllm_ascend/worker/model_runner_v1.py | 8 ++++++++ 2 files changed, 32 insertions(+) diff --git a/vllm_ascend/ops/moe/token_dispatcher.py b/vllm_ascend/ops/moe/token_dispatcher.py index 90c84d5..b36cc44 100644 --- a/vllm_ascend/ops/moe/token_dispatcher.py +++ b/vllm_ascend/ops/moe/token_dispatcher.py @@ -272,6 +272,16 @@ class TokenDispatcherWithMC2(MoETokenDispatcher): **kwargs_mc2 ) if self.enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine( **kwargs_mc2) + + # these values are no longer used, so they need to be set to None for memory release. 
+ self.output = None + self.assist_info_for_combine = None + self.ep_recv_counts = None + self.topk_ids = None + self.topk_weights = None + self.mc2_mask = None + self.expert_map = None + if self.shared_experts is None: return hidden_states else: @@ -281,6 +291,9 @@ class TokenDispatcherWithMC2(MoETokenDispatcher): else: shared_hidden_states, _ = self.shared_experts.down_proj( self.shared_act) + self.shared_act = None + self.shared_experts = None + self.swiglu_out_scale = None return hidden_states, shared_hidden_states @@ -374,6 +387,12 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher): probs=self.topk_weights) if len(self.original_shape) == 3: final_hidden_states = final_hidden_states.view(self.original_shape) + + # these values are no longer used, so they need to be set to None for memory release. + self.expert_map = None + self.topk_weights = None + self.topk_ids = None + self.expanded_row_idx = None return final_hidden_states @@ -564,9 +583,14 @@ class TokenDispatcherWithAll2AllV(MoETokenDispatcher): output = self._combine_postprocess(permutated_local_input_tokens) + # these values are no longer used, so they need to be set to None for memory release. self.input_splits = None self.output_splits = None self.num_global_tokens_per_local_expert = None + self.topk_weights = None + self.reversed_local_input_permutation_mapping = None + self.reversed_global_input_permutation_mapping = None + self.global_input_tokens_local_experts_indices = None return output diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 4d9e338..ff055e4 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2507,6 +2507,14 @@ class NPUModelRunner(LoRAModelRunnerMixin): with self.set_in_profile_run(): hidden_states = self._dummy_run(self.max_num_tokens, with_prefill=True) + # MC2 will consume additional NPU memory. 
+ # Therefore, we need to run the MC2 path once here to complete its initialization, + # allowing vLLM to correctly estimate the maximum memory required. + if self._select_moe_comm_method( + self.mc2_tokens_capacity, + with_prefill=True) == MoECommType.MC2: + self._dummy_run(self.mc2_tokens_capacity) + output = None if get_pp_group().is_last_rank: if self.is_pooling_model: