From 4536123341f46ffec59b58e6e1c102afaa973f82 Mon Sep 17 00:00:00 2001
From: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Date: Tue, 14 Oct 2025 10:56:12 +0800
Subject: [PATCH] [Fix] Fix mc2_tokens_capacity-related issues (#3411)
### What this PR does / why we need it?
Replaces the hardcoded `mc2_tokens_capacity` with the max graph capture
size for a more accurate allocation.
This change ensures the capacity is correctly sized relative to the
graph capture configuration, removing a magic number and making the
setup more robust.
This PR addresses two issues:
1. MC2 op restrictions differ between SoCs. @Angazenn: handling this
properly requires an overhaul, so it has been dropped from this PR —
please submit it as a separate PR.
2. The hardcoded value `512` allocates too large a buffer for large models.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Tested in daily checks.
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0
---------
Signed-off-by: Yizhou Liu
---
vllm_ascend/worker/model_runner_v1.py | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 7bddae0..5cfad82 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -458,15 +458,23 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.is_kv_producer = vllm_config.kv_transfer_config.is_kv_producer
self.is_kv_consumer = vllm_config.kv_transfer_config.is_kv_consumer
- # NOTE: Technically, MC2 can have 512 tokens each rank, but this will consume too much memory. The formula is:
- # ((maxBs * tokenNeedSizeDispatch * ep_worldsize * localMoeExpertNum) + (maxBs * tokenNeedSizeCombine * (k + sharedExpertNum))) * 2
- # so we have to limit the MC2 tokens to save memory, should fix this in the future.
- self.mc2_tokens_capacity = 512
+ # NOTE: To be clear, we need to make sure that during graph capture, the number of
+ # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
+ # the max number of tokens in graph is min(max_num_seqs * 2, 512).
+ if self.compilation_config.cudagraph_capture_sizes:
+ max_num_tokens = self.compilation_config.cudagraph_capture_sizes[0]
+ else:
+ max_num_tokens = self.max_num_reqs * self.uniform_decode_query_len
+ tp_size = self.parallel_config.tensor_parallel_size
+ # Use integer arithmetic for ceiling division.
+ num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size
+ self.mc2_tokens_capacity = num_tokens_per_tp_rank * tp_size
self.reserved_mc2_mask = torch.zeros(
self.mc2_tokens_capacity,
dtype=torch.bool,
device=self.device,
)
+
self.dynamic_eplb = self.ascend_config.dynamic_eplb
if self.dynamic_eplb:
self.is_eplb_warmuped = False