From 5e34c70ffcd2adb4448e836415cf440089bdd039 Mon Sep 17 00:00:00 2001 From: Angazenn <92204292+Angazenn@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:38:07 +0800 Subject: [PATCH] [Misc] Removes unnecessary graph size re-initialization (#6280) ### What this PR does / why we need it? This PR removes `update_default_aclgraph_sizes`. In earlier versions, we added this function to change the default `cudagraph_capture_sizes` because `_npu_paged_attention` degrades significantly on certain shapes (which are included in the default `cudagraph_capture_sizes` of vLLM). Now since we use FIA as the default attention op (which does not have such performance degradation), there is no need for this default change. Otherwise, it could cause conflicts if we now set a small `cudagraph_capture_sizes` that is < 20. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60 --------- Signed-off-by: Angazenn --- tests/ut/test_platform.py | 4 +--- vllm_ascend/platform.py | 5 ----- vllm_ascend/utils.py | 47 --------------------------------------- 3 files changed, 1 insertion(+), 55 deletions(-) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index e17d85e8..ac62bf47 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -207,13 +207,11 @@ class TestNPUPlatform(TestBase): ) @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) - @patch("vllm_ascend.utils.update_default_aclgraph_sizes") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") def test_check_and_update_config_unsupported_compilation_level( - self, mock_init_recompute, mock_init_ascend, mock_update_default, mock_soc_version + self, mock_init_recompute, mock_init_ascend, mock_soc_version ): - 
mock_update_default.return_value = MagicMock() mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() vllm_config = TestNPUPlatform.mock_vllm_config() vllm_config.model_config.enforce_eager = False diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index a87c790f..ecd435df 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -47,7 +47,6 @@ from vllm_ascend.utils import ( refresh_block_size, update_aclgraph_sizes, update_cudagraph_capture_sizes, - update_default_aclgraph_sizes, is_310p, ) @@ -247,10 +246,6 @@ class NPUPlatform(Platform): # set cudaprah sizes before extending `compilation_config.splitting_ops` vllm_config._set_cudagraph_sizes() - # There are cases where default cudagraph_capture_sizes are not friendly - # to ascend ops && hardwares. We update these sizes here to improve - # default performance. - update_default_aclgraph_sizes(vllm_config) # TODO delete graph size update here when compilation_config.pass_config.enable_sp # is supported by vllm-ascend. if ( diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index eb88afdd..3d8f6abe 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -427,53 +427,6 @@ def update_cudagraph_capture_sizes(vllm_config: VllmConfig, cudagraph_capture_si vllm_config.compilation_config.post_init_cudagraph_sizes() -def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool: - """ - Check whether it is vLLM default capture sizes. 
- """ - - max_cudagraph_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size - cudagraph_capture_sizes = [i for i in [1, 2, 4] if i <= max_cudagraph_capture_size] - if max_cudagraph_capture_size >= 8: - # Step size 8 for small batch sizes, up to 256(not included) - cudagraph_capture_sizes += list(range(8, min(max_cudagraph_capture_size + 1, 256), 8)) - if max_cudagraph_capture_size >= 256: - # Step size 16 for larger batch sizes - cudagraph_capture_sizes += list(range(256, max_cudagraph_capture_size + 1, 16)) - # in newer version, vLLM use ascending order of cudagraph_capture_sizes. - target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes) - return target_cudagraph_capture_sizes == vllm_config.compilation_config.cudagraph_capture_sizes - - -def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None: - """ - Update ACL graph default capture sizes, so that new sizes - are more friendly to ascend ops && hardware. - """ - - if ( - vllm_config.model_config is None - or vllm_config.model_config.enforce_eager - or not _is_default_capture_sizes(vllm_config) - ): - return - - # modify the default capture_sizes for Qwen3-MoE models on dp settings. - # this is mainly because performance of _npu_paged_attention might degrades - # on special shapes. - # TODO(Angazenn): we will remove this once _npu_paged_attention is fully - # replaced by npu_fused_infer_attention_score which does not contain such bugs. 
- if ( - vllm_config.model_config - and vllm_config.model_config.hf_text_config.model_type == "qwen3_moe" - and vllm_config.parallel_config.tensor_parallel_size == 1 - and vllm_config.parallel_config.data_parallel_size > 1 - ): - max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size - new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [i for i in range(24, max_capture_size + 1, 8)] - update_cudagraph_capture_sizes(vllm_config, new_cudagraph_capture_sizes) - - def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: """Update ACL graph capture sizes based on hardware limitations""" # NOTE: Currently, we can only capture 1800 graphs at most,