From 05a561129e9b0ceda47093e3bd507f1f44179e86 Mon Sep 17 00:00:00 2001 From: linfeng-yuan <1102311262@qq.com> Date: Wed, 25 Mar 2026 17:57:19 +0800 Subject: [PATCH] [Graph][Bugfix] Set default cudagraph max capture size via platform defaults (#7572) ### What this PR does / why we need it? This PR lets the NPU platform provide its own default `max_cudagraph_capture_size` via `NPUPlatform.apply_config_platform_defaults()`. Previously, when cudagraph sizing was left unset, Ascend inherited vLLM's upstream default heuristic in `_set_cudagraph_sizes()`, which uses `max_num_seqs * decode_query_len * 2`. This PR changes Ascend's default to `min(max_num_seqs * decode_query_len, 512)` while keeping the rest of vLLM's cudagraph sizing logic unchanged. ### Does this PR introduce _any_ user-facing change? Yes, but only for Ascend when users do not explicitly configure cudagraph sizing. If `max_cudagraph_capture_size` and `cudagraph_capture_sizes` are both unset, we now use `max_num_seqs * decode_query_len` (capped at `512`) instead of the upstream `* 2` default. Explicit user settings are unchanged. ### How was this patch tested? 
Add unit tests to cover: - default max injection via `apply_config_platform_defaults()` - explicit `max_cudagraph_capture_size` is preserved - explicit `cudagraph_capture_sizes` are preserved - Ascend default max no longer uses the upstream `* 2` - late `_set_cudagraph_sizes()` recomputation reuses the current max input - vLLM version: v0.18.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ed359c497a728f08b5b41456c07a688ccd510fbc --------- Signed-off-by: linfeng-yuan <1102311262@qq.com> --- tests/ut/test_platform.py | 113 +++++++++++++++++++++++++++++++++++++- vllm_ascend/platform.py | 48 +++++++++++++++- 2 files changed, 159 insertions(+), 2 deletions(-) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 914cd7f4..4256e5eb 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -9,7 +9,11 @@ from vllm.v1.attention.selector import AttentionSelectorConfig # type: ignore from tests.ut.base import TestBase from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, AscendDeviceType +from vllm_ascend.utils import ( + ASCEND_QUANTIZATION_METHOD, + COMPRESSED_TENSORS_METHOD, + AscendDeviceType, +) class TestNPUPlatform(TestBase): @@ -21,7 +25,9 @@ class TestNPUPlatform(TestBase): mock_vllm_config.parallel_config = MagicMock() mock_vllm_config.cache_config = MagicMock() mock_vllm_config.scheduler_config = MagicMock() + mock_vllm_config.scheduler_config.max_num_seqs = None mock_vllm_config.speculative_config = None + mock_vllm_config.additional_config = {} mock_vllm_config.compilation_config.pass_config.enable_sp = False mock_vllm_config.compilation_config.cudagraph_mode = None return mock_vllm_config @@ -30,7 +36,13 @@ class TestNPUPlatform(TestBase): def mock_vllm_ascend_config(): mock_ascend_config = MagicMock() mock_ascend_config.xlite_graph_config.enabled = False + mock_ascend_config.xlite_graph_config.full_mode = False + 
mock_ascend_config.ascend_compilation_config.enable_npugraph_ex = False + mock_ascend_config.ascend_fusion_config = None + mock_ascend_config.recompute_scheduler_enable = False + mock_ascend_config.SLO_limits_for_dynamic_batch = -1 mock_ascend_config.enable_shared_expert_dp = False + mock_ascend_config.update_compile_ranges_split_points = MagicMock() return mock_ascend_config def setUp(self): @@ -99,6 +111,105 @@ class TestNPUPlatform(TestBase): mock_adapt_patch.assert_called_once_with(is_global_patch=True) self.assertEqual(len(mock_action.choices), 2) + def test_apply_config_platform_defaults_sets_ascend_default_max(self): + test_cases = [ + (40, 3, 160), + (200, 3, 512), + ] + + for max_num_seqs, num_speculative_tokens, expected_max in test_cases: + with self.subTest( + max_num_seqs=max_num_seqs, + num_speculative_tokens=num_speculative_tokens, + expected_max=expected_max, + ): + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.scheduler_config.max_num_seqs = max_num_seqs + vllm_config.speculative_config = MagicMock( + num_speculative_tokens=num_speculative_tokens + ) + vllm_config.compilation_config.max_cudagraph_capture_size = None + vllm_config.compilation_config.cudagraph_capture_sizes = None + + self.platform.apply_config_platform_defaults(vllm_config) + + self.assertEqual( + vllm_config.compilation_config.max_cudagraph_capture_size, + expected_max, + ) + + def test_apply_config_platform_defaults_respects_explicit_max(self): + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.compilation_config.max_cudagraph_capture_size = 456 + vllm_config.compilation_config.cudagraph_capture_sizes = None + + self.platform.apply_config_platform_defaults(vllm_config) + + self.assertEqual(vllm_config.compilation_config.max_cudagraph_capture_size, 456) + + def test_apply_config_platform_defaults_respects_explicit_sizes(self): + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.compilation_config.max_cudagraph_capture_size = None + 
vllm_config.compilation_config.cudagraph_capture_sizes = [1, 2, 4] + + self.platform.apply_config_platform_defaults(vllm_config) + + self.assertIsNone(vllm_config.compilation_config.max_cudagraph_capture_size) + self.assertEqual(vllm_config.compilation_config.cudagraph_capture_sizes, [1, 2, 4]) + + def test_apply_config_platform_defaults_skips_when_scheduler_max_num_seqs_is_missing(self): + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.compilation_config.max_cudagraph_capture_size = None + vllm_config.compilation_config.cudagraph_capture_sizes = None + + self.platform.apply_config_platform_defaults(vllm_config) + + self.assertIsNone(vllm_config.compilation_config.max_cudagraph_capture_size) + + @patch("vllm_ascend.platform.refresh_block_size") + @patch("vllm_ascend.platform.get_ascend_device_type", return_value=AscendDeviceType.A3) + @patch("vllm_ascend.platform.enable_sp", return_value=False) + @patch("vllm_ascend.ascend_config.init_ascend_config") + @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") + def test_check_and_update_config_preserves_platform_default_max_input( + self, + mock_auto_detect, + mock_init_ascend, + _mock_enable_sp, + _mock_device_type, + _mock_refresh_block_size, + ): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.scheduler_config.max_num_seqs = 77 + vllm_config.compilation_config.max_cudagraph_capture_size = None + vllm_config.compilation_config.cudagraph_capture_sizes = None + vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE + vllm_config.compilation_config.cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY + vllm_config.compilation_config.custom_ops = [] + vllm_config.model_config.enforce_eager = False + vllm_config.model_config.enable_sleep_mode = True + vllm_config.model_config.is_encoder_decoder = False + vllm_config.parallel_config.decode_context_parallel_size = 1 + 
vllm_config.parallel_config.prefill_context_parallel_size = 1 + vllm_config.parallel_config.tensor_parallel_size = 1 + vllm_config.parallel_config.worker_cls = "manual" + vllm_config.parallel_config.cp_kv_cache_interleave_size = 1 + vllm_config.cache_config.block_size = 1 + + self.platform.apply_config_platform_defaults(vllm_config) + + observed_inputs: list[int | None] = [] + vllm_config._set_cudagraph_sizes = MagicMock( + side_effect=lambda: observed_inputs.append( + vllm_config.compilation_config.max_cudagraph_capture_size + ) + ) + + self.platform.check_and_update_config(vllm_config) + + self.assertEqual(observed_inputs, [77]) + def test_get_device_capability(self): self.assertIsNone(self.platform.get_device_capability(device_id=0)) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 61463a21..324b693a 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -152,6 +152,49 @@ class NPUPlatform(Platform): config_deprecated_logging() + @classmethod + def _get_default_max_cudagraph_capture_size(cls, vllm_config: VllmConfig) -> int | None: + """Mirror the default-max branch in vLLM's `_set_cudagraph_sizes()`. + + This helper corresponds to the upstream block under + "determine the initial max_cudagraph_capture_size" when + `compilation_config.max_cudagraph_capture_size is None`. + + Ascend injects this default earlier via `apply_config_platform_defaults()` + so the rest of `_set_cudagraph_sizes()` can keep using upstream logic for + size-list generation, token-cap clipping, SP filtering, and later + post-processing. The only intentional difference from upstream is removing + the CUDA-oriented trailing `* 2`: Ascend wants the default capture upper + bound to track `max_num_seqs * decode_query_len`, capped at 512. + + Returning `None` means the platform should not inject a default. This + covers the cases where the user has already provided either + `max_cudagraph_capture_size` or `cudagraph_capture_sizes`. 
+ """ + compilation_config = vllm_config.compilation_config + if compilation_config.max_cudagraph_capture_size is not None: + return None + if compilation_config.cudagraph_capture_sizes is not None: + return None + + scheduler_config = getattr(vllm_config, "scheduler_config", None) + max_num_seqs = getattr(scheduler_config, "max_num_seqs", None) + if max_num_seqs is None: + return None + + decode_query_len = 1 + speculative_config = getattr(vllm_config, "speculative_config", None) + if speculative_config and speculative_config.num_speculative_tokens: + decode_query_len += speculative_config.num_speculative_tokens + + return min(max_num_seqs * decode_query_len, 512) + + @classmethod + def apply_config_platform_defaults(cls, vllm_config: VllmConfig) -> None: + default_max_cg_capture_size = cls._get_default_max_cudagraph_capture_size(vllm_config) + if default_max_cg_capture_size is not None: + vllm_config.compilation_config.max_cudagraph_capture_size = default_max_cg_capture_size + @classmethod def get_device_capability(cls, device_id: int = 0): return None @@ -273,7 +316,10 @@ class NPUPlatform(Platform): ) compilation_config.cudagraph_mode = CUDAGraphMode.NONE - # set cudaprah sizes before extending `compilation_config.splitting_ops` + # Recompute cudagraph sizes after Ascend-specific compatibility updates. + # The platform default max is injected earlier via + # `apply_config_platform_defaults`, so this late pass should only honor + # the current max / size inputs after the mode adjustments above. vllm_config._set_cudagraph_sizes() # TODO delete graph size update here when compilation_config.pass_config.enable_sp # is supported by vllm-ascend.