From 4af5b80606e6cffe440c27237655cd44c2e5bdaf Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Sat, 23 Aug 2025 19:39:44 +0800
Subject: [PATCH] [Scheduler] validate max_num_batched_tokens and
 max_model_len in AscendSchedulerConfig (#2434)

### What this PR does / why we need it?
Add a configuration check to the Ascend scheduler: if chunked prefill is
disabled, max_num_batched_tokens must not be less than max_model_len,
following vLLM's behavior.

### Does this PR introduce _any_ user-facing change?
Users can no longer set max_num_batched_tokens smaller than max_model_len
when the Ascend scheduler is enabled.

### How was this patch tested?
CI and vLLM serving passed.

- vLLM version: v0.10.0
- vLLM main: https://github.com/vllm-project/vllm/commit/f77a0802b758a32c5b9f7bc04e9498d77e8d99e0

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 tests/e2e/singlecard/test_ascend_scheduler.py |  4 +-
 tests/ut/core/test_schedule_config.py         | 58 +++++++++++++++++--
 vllm_ascend/core/schedule_config.py           | 10 ++++
 3 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/tests/e2e/singlecard/test_ascend_scheduler.py b/tests/e2e/singlecard/test_ascend_scheduler.py
index 2aab523..de7dd18 100644
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/test_ascend_scheduler.py
@@ -16,7 +16,7 @@ def test_concurrent_partial_prefill():
             },
         },
         max_num_seqs=3,
-        max_num_batched_tokens=200,
+        max_num_batched_tokens=2048,
         enforce_eager=True,
         max_model_len=2048,
         gpu_memory_utilization=0.7) as vllm_model:
@@ -35,7 +35,7 @@ def test_prefix_cache_stats_is_recorded():
             },
         },
         max_num_seqs=3,
-        max_num_batched_tokens=200,
+        max_num_batched_tokens=2048,
         enforce_eager=True,
         max_model_len=2048,
         gpu_memory_utilization=0.7) as vllm_model:
diff --git a/tests/ut/core/test_schedule_config.py b/tests/ut/core/test_schedule_config.py
index 5074a4b..df36b52 100644
--- a/tests/ut/core/test_schedule_config.py
+++ b/tests/ut/core/test_schedule_config.py
@@ -24,6 +24,7 @@ class TestAscendSchedulerConfig(TestBase):
     def setUp(self):
         self.basic_scheduler_config = SchedulerConfig(
             max_num_batched_tokens=8192,
+            max_model_len=8192,
             is_multimodal_model=False,
             send_delta_data=False,
             scheduler_delay_factor=0,
@@ -51,6 +52,7 @@ class TestAscendSchedulerConfig(TestBase):
                 num_scheduler_steps=1,
                 scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
                 max_num_batched_tokens=2048,
+                max_model_len=2048,
             ),
         )
         self.assertEqual(ascend_config.enable_chunked_prefill, False)
@@ -65,7 +67,11 @@
         with self.assertRaises(NotImplementedError) as context:
             AscendSchedulerConfig.initialize_from_config(
                 self.basic_scheduler_config,
-                AscendSchedulerConfig(policy="custom_policy", ),
+                AscendSchedulerConfig(
+                    policy="custom_policy",
+                    max_num_batched_tokens=2048,
+                    max_model_len=2048,
+                ),
             )
         self.assertIn(
             "currently AscendScheduler only supports fcfs policy",
@@ -83,7 +89,11 @@
         with self.assertRaises(NotImplementedError) as context:
             AscendSchedulerConfig.initialize_from_config(
                 self.basic_scheduler_config,
-                AscendSchedulerConfig(num_scheduler_steps=2),
+                AscendSchedulerConfig(
+                    num_scheduler_steps=2,
+                    max_num_batched_tokens=2048,
+                    max_model_len=2048,
+                ),
             )
         self.assertIn(
             "currently AscendScheduler doesn't support multi-step",
@@ -94,7 +104,12 @@
         with self.assertRaises(NotImplementedError) as context:
             AscendSchedulerConfig.initialize_from_config(
                 self.basic_scheduler_config,
-                AscendSchedulerConfig(send_delta_data=True))
+                AscendSchedulerConfig(
+                    send_delta_data=True,
+                    max_num_batched_tokens=2048,
+                    max_model_len=2048,
+                ),
+            )
         self.assertIn(
             "currently AscendScheduler doesn't support send_delta_data",
             str(context.exception),
@@ -104,7 +119,12 @@
         with self.assertRaises(NotImplementedError) as context:
             AscendSchedulerConfig.initialize_from_config(
                 self.basic_scheduler_config,
-                AscendSchedulerConfig(delay_factor=1))
+                AscendSchedulerConfig(
+                    delay_factor=1,
+                    max_num_batched_tokens=2048,
+                    max_model_len=2048,
+                ),
+            )
         self.assertIn(
             "currently AscendScheduler doesn't support scheduler_delay_factor",
             str(context.exception),
@@ -115,3 +135,33 @@
             self.basic_scheduler_config, {})
         self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
         self.assertEqual(ascend_config.encoder_cache_size, 8192)
+
+    def test_valid_config_with_chunked_prefill(self):
+        ascend_config = AscendSchedulerConfig.initialize_from_config(
+            self.basic_scheduler_config,
+            AscendSchedulerConfig(
+                enable_chunked_prefill=True,
+                max_num_batched_tokens=2048,
+                max_model_len=4096,
+            ),
+        )
+        self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
+        self.assertEqual(ascend_config.max_model_len, 4096)
+        self.assertTrue(ascend_config.enable_chunked_prefill)
+
+    def test_invalid_config_without_chunked_prefill(self):
+        with self.assertRaises(ValueError) as context:
+            AscendSchedulerConfig.initialize_from_config(
+                self.basic_scheduler_config,
+                AscendSchedulerConfig(
+                    enable_chunked_prefill=False,
+                    max_num_batched_tokens=2048,
+                    max_model_len=4096,
+                ),
+            )
+        self.assertIn(
+            "Ascend scheduler is enabled without chunked prefill feature",
+            str(context.exception),
+        )
+        self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
+        self.assertIn("max_model_len (4096)", str(context.exception))
diff --git a/vllm_ascend/core/schedule_config.py b/vllm_ascend/core/schedule_config.py
index 4a4131e..4ee02e7 100644
--- a/vllm_ascend/core/schedule_config.py
+++ b/vllm_ascend/core/schedule_config.py
@@ -55,6 +55,16 @@ class AscendSchedulerConfig(SchedulerConfig):
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
         self.chunked_prefill_enabled = self.enable_chunked_prefill
+        if (self.max_num_batched_tokens < self.max_model_len
+                and not self.chunked_prefill_enabled):
+            raise ValueError(
+                "Ascend scheduler is enabled without chunked prefill feature. "
+                f"Argument max_num_batched_tokens ({self.max_num_batched_tokens}) is "
+                f"smaller than max_model_len ({self.max_model_len}). "
+                "This effectively limits the maximum sequence length to "
+                "max_num_batched_tokens and makes vLLM reject longer "
+                "sequences. Please increase max_num_batched_tokens or "
+                "decrease max_model_len.")
         if self.policy != "fcfs":
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
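
For illustration, a minimal sketch of how the new check behaves once the patch is applied. It mirrors the unit tests above; the `SchedulerConfig` keyword arguments are copied from the test `setUp`, and the import paths are assumed to match this repository layout:

```python
from vllm.config import SchedulerConfig

from vllm_ascend.core.schedule_config import AscendSchedulerConfig

# Base config mirroring setUp() in tests/ut/core/test_schedule_config.py.
base = SchedulerConfig(
    max_num_batched_tokens=8192,
    max_model_len=8192,
    is_multimodal_model=False,
    send_delta_data=False,
    scheduler_delay_factor=0,
)

# Rejected: chunked prefill is disabled while the per-step token budget
# (2048) is smaller than max_model_len (4096).
try:
    AscendSchedulerConfig.initialize_from_config(
        base,
        AscendSchedulerConfig(
            enable_chunked_prefill=False,
            max_num_batched_tokens=2048,
            max_model_len=4096,
        ),
    )
except ValueError as err:
    # Message names both max_num_batched_tokens (2048) and max_model_len (4096).
    print(err)

# Accepted: with chunked prefill enabled, long prompts are split across
# scheduler steps, so a token budget below max_model_len is allowed.
ok = AscendSchedulerConfig.initialize_from_config(
    base,
    AscendSchedulerConfig(
        enable_chunked_prefill=True,
        max_num_batched_tokens=2048,
        max_model_len=4096,
    ),
)
assert ok.max_num_batched_tokens == 2048 and ok.max_model_len == 4096
```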