diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 711fbbd1..3527b910 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -491,6 +491,9 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: # TODO: Find out whether we need to solve allreduce function MAX_CAPTURE_SIZE = 1800 + # Enabling pcp or dcp adds extra communication and consumes approximately up to 100 additional streams + CP_ADDITIONAL_STREAM_NUM = 100 + # Store original configuration and temporarily clear it compilation_config = vllm_config.compilation_config original_sizes, compilation_config.cudagraph_capture_sizes = \ @@ -547,6 +550,12 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: "Calculated maximum supported batch sizes for ACL graph: %s", max_num_batch_sizes) else: + # Enabling pcp or dcp adds extra communication and consumes approximately up to 100 additional streams + if parallel_config.prefill_context_parallel_size > 1: + MAX_CAPTURE_SIZE = MAX_CAPTURE_SIZE - CP_ADDITIONAL_STREAM_NUM + if parallel_config.decode_context_parallel_size > 1: + MAX_CAPTURE_SIZE = MAX_CAPTURE_SIZE - CP_ADDITIONAL_STREAM_NUM + # The above describes an empirical formula applicable to the A2 hardware. # Under this configuration, HCCL employs the FFTS+ method for execution unfolding, # which adds only 1 concurrent stream without consuming collective communication execution unfolding streams.