From 99bf25af76a9f6759e262e351c7d41fed57f159f Mon Sep 17 00:00:00 2001 From: yiz-liu <136800916+yiz-liu@users.noreply.github.com> Date: Mon, 25 Aug 2025 19:56:02 +0800 Subject: [PATCH] [Fix] Add operations in `_dummy_run` to maintain synchronization with `_process_reqs`, resolving a service hang (#2454) ### What this PR does / why we need it? Fixes hang when batch size < DP size. ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? After this change, the function now works correctly in the DP case. - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/d9a55204bad7bde505624f4ffb0464d98c86914a Signed-off-by: Yizhou Liu --- vllm_ascend/worker/model_runner_v1.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 82b4996..3df2613 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1911,6 +1911,10 @@ class NPUModelRunner(LoRAModelRunnerMixin): ) # Padding for DP + num_pad, num_tokens_across_dp_native = self.get_dp_padding(num_tokens) + # num_tokens += num_pad ## Uncomment this after TorchAir is removed + + # Padding for DP (for TorchAir) (num_tokens, num_tokens_across_dp, with_prefill, _) = self._get_forward_metadata_across_dp_and_pad( num_tokens, with_prefill, False)