diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index 4ab74cee..f7a6cbd1 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -330,5 +330,6 @@ class TestNPUPlatform(TestBase):
     )
     @patch("vllm_ascend.utils.is_310p", return_value=False)
+    @patch("vllm_ascend.utils.update_default_aclgraph_sizes")
     @patch("vllm_ascend.ascend_config.check_ascend_config")
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch(
@@ -337,7 +338,8 @@ class TestNPUPlatform(TestBase):
     )
     def test_check_and_update_config_unsupported_compilation_level(
             self, mock_init_recompute, mock_init_ascend, mock_check_ascend,
-            mock_is_310p):
+            mock_update_default, mock_is_310p):
+        mock_update_default.return_value = MagicMock()
         mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
         )
         vllm_config = TestNPUPlatform.mock_vllm_config()
@@ -410,5 +412,6 @@ class TestNPUPlatform(TestBase):
     )
     @patch("vllm_ascend.utils.is_310p", return_value=False)
+    @patch("vllm_ascend.utils.update_default_aclgraph_sizes")
     @patch("vllm_ascend.ascend_config.check_ascend_config")
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch(
@@ -417,7 +420,8 @@ class TestNPUPlatform(TestBase):
     )
     def test_check_and_update_config_torchair_enabled_compilation(
             self, mock_init_recompute, mock_init_ascend, mock_check_ascend,
-            mock_is_310p):
+            mock_update_default, mock_is_310p):
+        mock_update_default.return_value = MagicMock()
         mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config()
         mock_ascend_config.torchair_graph_config.enabled = True
         mock_init_ascend.return_value = mock_ascend_config
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index dd063e28..5559df8c 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -33,7 +33,8 @@ from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
                                prefill_context_parallel_enable,
                                update_aclgraph_sizes,
-                               update_cudagraph_capture_sizes, vllm_version_is)
+                               update_cudagraph_capture_sizes,
+                               update_default_aclgraph_sizes, vllm_version_is)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -193,5 +194,9 @@ class NPUPlatform(Platform):
         # set cudaprah sizes before extending `compilation_config.splitting_ops`
         vllm_config._set_cudagraph_sizes()
+        # There are cases where default cudagraph_capture_sizes are not friendly
+        # to ascend ops && hardwares. We update these sizes here to improve
+        # default performance.
+        update_default_aclgraph_sizes(vllm_config)
         # TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
         # is supported by vllm-ascend.
         if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 5cccdaf0..38151080 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -413,6 +413,78 @@ def update_cudagraph_capture_sizes(vllm_config: VllmConfig,
     vllm_config.compilation_config.post_init_cudagraph_sizes()
 
 
+def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
+    """
+    Check whether the currently configured capture sizes are vLLM's defaults.
+    """
+
+    if vllm_version_is("0.11.0"):
+        cuda_graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes
+        if len(cuda_graph_sizes) == 1:
+            cudagraph_capture_sizes = [1, 2, 4] + [
+                i for i in range(8, cuda_graph_sizes[0] + 1, 8)
+            ]
+        else:
+            # A user-specified list of sizes is never the default ramp.
+            return False
+    else:
+        max_cudagraph_capture_size = \
+            vllm_config.compilation_config.max_cudagraph_capture_size
+        cudagraph_capture_sizes = [
+            i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
+        ]
+        if max_cudagraph_capture_size >= 8:
+            # Step size 8 for small batch sizes, up to 256(not included)
+            cudagraph_capture_sizes += list(
+                range(8, min(max_cudagraph_capture_size + 1, 256), 8))
+        if max_cudagraph_capture_size >= 256:
+            # Step size 16 for larger batch sizes
+            cudagraph_capture_sizes += list(
+                range(256, max_cudagraph_capture_size + 1, 16))
+
+    if sorted(cudagraph_capture_sizes, reverse=True) == \
+        vllm_config.compilation_config.cudagraph_capture_sizes:
+        return True
+
+    return False
+
+
+def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
+    """
+    Update ACL graph default capture sizes, so that new sizes
+    are more friendly to ascend ops && hardware.
+    """
+
+    if vllm_config.model_config is None or \
+        vllm_config.model_config.enforce_eager or \
+        not _is_default_capture_sizes(vllm_config):
+        return
+
+    # modify the default capture_sizes for Qwen3-MoE models on dp settings.
+    # this is mainly because performance of _npu_paged_attention might degrades
+    # on special shapes.
+    # TODO(Angazenn): we will remove this once _npu_paged_attention is fully
+    # replaced by npu_fused_infer_attention_score which does not contain such bugs.
+    if vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
+        and vllm_config.parallel_config.tensor_parallel_size == 1 \
+        and vllm_config.parallel_config.data_parallel_size > 1:
+        if vllm_version_is("0.11.0"):
+            max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0]
+        else:
+            max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
+        new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [
+            i for i in range(24, max_capture_size + 1, 8)
+        ]
+
+        if vllm_version_is("0.11.0"):
+            vllm_config.compilation_config.cudagraph_capture_sizes = new_cudagraph_capture_sizes
+            vllm_config.compilation_config.init_with_cudagraph_sizes(
+                new_cudagraph_capture_sizes)
+        else:
+            update_cudagraph_capture_sizes(vllm_config,
+                                           new_cudagraph_capture_sizes)
+
+
 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     """Update ACL graph capture sizes based on hardware limitations"""
     # NOTE: Currently, we can only capture 1800 graphs at most,