From dde547e900e5263e046be665c7664af13a5c7f75 Mon Sep 17 00:00:00 2001 From: LiuYi-Up <73060646+LiuYi-Up@users.noreply.github.com> Date: Tue, 13 Jan 2026 08:44:10 +0800 Subject: [PATCH] [Bugfix] bugfix for the order of dummy run pad and sync (#5777) ### What this PR does / why we need it? This PR addresses an issue in piecewise graph mode when Multi-Token Prediction (MTP) is enabled. Specifically, the original dummy run sequence performs the following steps in order: 1. Sync DP (input length = 1 + k) 2. Dispatch (input length = 1 + k, with padding==graph size) However, in the model execution phase, the sequence differs, resulting in: 1. Padding (input length = 1, with padding) 2. Sync DP (input length = 1 + k) 3. Dispatch (input length 1 + k, which no longer matches the graph size captured for 1 + k with padding) This discrepancy leads to a mismatch between the input sizes used in the model execution and those expected by the dispatch graph, causing an inconsistency in graph size. This PR ensures that the dispatch graph size aligns correctly by modifying the sequence of operations during model execution to match the dummy run sequence, resolving the mismatch issue. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? 
- vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d Signed-off-by: LiuYi-UP <1150854440@qq.com> --- vllm_ascend/worker/model_runner_v1.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 80148785..757da788 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2072,10 +2072,14 @@ class NPUModelRunner(GPUModelRunner): if self.is_kv_producer and not self.is_kv_consumer: with_prefill = True + has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False + _ag_mode, batch_descriptor = \ + self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora) + # Padding for DP (num_tokens, num_tokens_across_dp, - with_prefill) = self._sync_metadata_across_dp(num_tokens, - with_prefill) + with_prefill) = self._sync_metadata_across_dp( + batch_descriptor.num_tokens, with_prefill) # If cudagraph_mode.decode_mode() == FULL and # cudagraph_mode.seperate_routine(). This means that we are using @@ -2122,9 +2126,11 @@ class NPUModelRunner(GPUModelRunner): if not is_profile and self.dynamic_eplb: self.eplb_updator.forward_before() - has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False - _ag_mode, batch_descriptor = \ - self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora) + if num_tokens != batch_descriptor.num_tokens: + _ag_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch( + num_tokens=num_tokens, + uniform_decode=uniform_decode, + has_lora=has_lora) num_tokens_padded = batch_descriptor.num_tokens num_reqs_padded = (batch_descriptor.num_reqs if