From cab5d736334e7c46ef4ec0913cad83dcd8e0ff51 Mon Sep 17 00:00:00 2001 From: linfeng-yuan <1102311262@qq.com> Date: Mon, 30 Mar 2026 20:24:52 +0800 Subject: [PATCH] [releases/v0.18.0][BugFix] Fix server init error when set max_num_seqs not a multiple of tp while FLASHCOMM is on (#7832) ### What this PR does / why we need it? The current version runs into an initialization error when the user sets max_num_seqs to a number that is not a multiple of the tp size. The reason is that we first compute the valid sizes for sequence parallelism and then remove the ones that are not a multiple of the tp size. This can cause an error when max_num_seqs is set just above a multiple of 8 but below the next multiple of the tp size, e.g. when the tp size is 16 and max_num_seqs is 90. The system then just drops the calculated max graph capture size 88 from the valid size list but does not reset max_cudagraph_capture_size to the largest remaining valid size. Thus, we need to add a line to keep the two in sync. Cherry-picked from main PR #7801. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Full CI passed with this PR. Signed-off-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: limuyuan --- vllm_ascend/platform.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index f09c524e..85f19eeb 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -327,6 +327,10 @@ class NPUPlatform(Platform): f"{vllm_config.parallel_config.tensor_parallel_size}" ) if len(sp_aclgraph_sizes) != len(original_sizes): + # If user set the max_num_seqs miss fit the multiple of tp_size, + # we need to match the max_cudagraph_capture_size with the valid max size, + # so we can avoid initialization error of vllm server. + compilation_config.max_cudagraph_capture_size = sp_aclgraph_sizes[-1] compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes update_cudagraph_capture_sizes(vllm_config, sp_aclgraph_sizes)