From 2b8a9ce8bd420b14f63f0a794059f06cfea7af0d Mon Sep 17 00:00:00 2001 From: weiguihua2 Date: Wed, 7 Jan 2026 15:39:52 +0800 Subject: [PATCH] [Bugfix] fix resource are insufficient when pcp and piecewise (#5377) ### What this PR does / why we need it? This PR resolves the insufficient-resources failure that occurs while serving requests when PCP is enabled in piecewise mode. With PCP enabled and execution in piecewise mode, a curl request fails with the error message "The resources are insufficient." Profiling showed that the PCP communication domain also occupies streams and consumes resources. Therefore, when updating aclgraph sizes, the context-parallel communication domains must be taken into account: in the `else` branch of `update_aclgraph_sizes`, `MAX_CAPTURE_SIZE` is now reduced by `CP_ADDITIONAL_STREAM_NUM` (100) once if `prefill_context_parallel_size > 1` (PCP) and once more if `decode_context_parallel_size > 1` (DCP). ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: release/v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586 --------- Signed-off-by: weiguihua2 --- vllm_ascend/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 711fbbd1..3527b910 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -491,6 +491,9 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: # TODO: Find out whether we need to solve allreduce function MAX_CAPTURE_SIZE = 1800 + # enable pcp or dcp will add new communication and consume additional approximately less than 100 streams + CP_ADDITIONAL_STREAM_NUM = 100 + # Store original configuration and temporarily clear it compilation_config = vllm_config.compilation_config original_sizes, compilation_config.cudagraph_capture_sizes = \ @@ -547,6 +550,12 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: "Calculated maximum supported batch sizes for ACL graph: %s", max_num_batch_sizes) else: + # enable pcp or dcp will add new communication and consume additional approximately less than 100 streams + if 
parallel_config.prefill_context_parallel_size > 1: + MAX_CAPTURE_SIZE = MAX_CAPTURE_SIZE - CP_ADDITIONAL_STREAM_NUM + if parallel_config.decode_context_parallel_size > 1: + MAX_CAPTURE_SIZE = MAX_CAPTURE_SIZE - CP_ADDITIONAL_STREAM_NUM + # The above describes an empirical formula applicable to the A2 hardware. # Under this configuration, HCCL employs the FFTS+ method for execution unfolding, # which adds only 1 concurrent stream without consuming collective communication execution unfolding streams.