diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 7bddae0..5cfad82 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -458,15 +458,23 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.is_kv_producer = vllm_config.kv_transfer_config.is_kv_producer
         self.is_kv_consumer = vllm_config.kv_transfer_config.is_kv_consumer

-        # NOTE: Technically, MC2 can have 512 tokens each rank, but this will consume too much memory. The formula is:
-        # ((maxBs * tokenNeedSizeDispatch * ep_worldsize * localMoeExpertNum) + (maxBs * tokenNeedSizeCombine * (k + sharedExpertNum))) * 2
-        # so we have to limit the MC2 tokens to save memory, should fix this in the future.
-        self.mc2_tokens_capacity = 512
+        # NOTE: To be clear, we need to make sure that during graph capture, the number of
+        # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
+        # the max number of tokens in graph is min(max_num_seqs * 2, 512).
+        if self.compilation_config.cudagraph_capture_sizes:
+            max_num_tokens = self.compilation_config.cudagraph_capture_sizes[0]
+        else:
+            max_num_tokens = self.max_num_reqs * self.uniform_decode_query_len
+        tp_size = self.parallel_config.tensor_parallel_size
+        # Use integer arithmetic for ceiling division.
+        num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size
+        self.mc2_tokens_capacity = num_tokens_per_tp_rank * tp_size
         self.reserved_mc2_mask = torch.zeros(
             self.mc2_tokens_capacity,
             dtype=torch.bool,
             device=self.device,
         )

+        self.dynamic_eplb = self.ascend_config.dynamic_eplb
         if self.dynamic_eplb:
             self.is_eplb_warmuped = False
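
For reference, the new capacity computation reduces to rounding the maximum captured token count up to a multiple of the TP size. A minimal standalone sketch of that arithmetic, with `max_num_tokens` and `tp_size` standing in for the runner attributes used in the diff:

```python
def mc2_tokens_capacity(max_num_tokens: int, tp_size: int) -> int:
    # Ceiling-divide the token count across TP ranks, then multiply back,
    # giving the smallest multiple of tp_size that covers max_num_tokens.
    num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size
    return num_tokens_per_tp_rank * tp_size

# 512 captured tokens on 8 TP ranks -> capacity 512 (64 tokens per rank);
# 500 tokens on 8 ranks -> capacity 504 (63 tokens per rank).
assert mc2_tokens_capacity(512, 8) == 512
assert mc2_tokens_capacity(500, 8) == 504
```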