From 3a27b15ddc4522c045036cf2ed7c92abed6c8350 Mon Sep 17 00:00:00 2001
From: leo-pony
Date: Tue, 30 Sep 2025 15:30:01 +0800
Subject: [PATCH] [bugfix] Fix Qwen3-30B-A3B hang when running the data-parallel (DP) example (#3287)

### What this PR does / why we need it?
Fix the hang seen with Qwen3-30B-A3B when running the data-parallel (DP)
example.

For large models (Qwen3-30B and above), weight loading alone takes 4 to 5
minutes. The 5-minute join timeout in the current example code is therefore
too short: some DP instances are killed prematurely, and the run then gets
stuck in the DP synchronization all-reduce operation. This PR raises the
timeout from 300 seconds to 900 seconds (15 minutes).

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
NA

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: leo-pony
---
 examples/offline_data_parallel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/offline_data_parallel.py b/examples/offline_data_parallel.py
index c5d0b3e..63e0bf9 100644
--- a/examples/offline_data_parallel.py
+++ b/examples/offline_data_parallel.py
@@ -244,10 +244,10 @@ if __name__ == "__main__":
         procs.append(proc)
     exit_code = 0
     for proc in procs:
-        proc.join(timeout=300)
+        proc.join(timeout=900)
         if proc.exitcode is None:
             print(
-                f"Killing process {proc.pid} that didn't stop within 5 minutes."
+                f"Killing process {proc.pid} that didn't stop within 15 minutes."
             )
             proc.kill()
             exit_code = 1