[Scheduler] validate max_num_batched_tokens and max_model_len in AscendSchedulerConfig (#2434)
### What this PR does / why we need it?
Add configuration check logic for the Ascend scheduler: if chunked prefill is disabled, max_num_batched_tokens must not be less than max_model_len, following vLLM's behavior.
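In outline, the check behaves like the sketch below (a minimal standalone illustration; validate_scheduler_limits is a hypothetical helper name, the actual check is added inside AscendSchedulerConfig, see the diff further down):

```python
# Minimal sketch of the new validation rule; validate_scheduler_limits is a
# hypothetical helper name, the real check lives in AscendSchedulerConfig.
def validate_scheduler_limits(max_num_batched_tokens: int, max_model_len: int,
                              enable_chunked_prefill: bool) -> None:
    # Without chunked prefill, a prompt must fit into a single batch, so the
    # batch-token budget has to cover the full model context length.
    if max_num_batched_tokens < max_model_len and not enable_chunked_prefill:
        raise ValueError(
            "Ascend scheduler is enabled without chunked prefill feature. "
            f"Argument max_num_batched_tokens ({max_num_batched_tokens}) is "
            f"smaller than max_model_len ({max_model_len}). "
            "Please increase max_num_batched_tokens or decrease max_model_len.")
```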
### Does this PR introduce _any_ user-facing change?
Yes. With the Ascend scheduler enabled and chunked prefill disabled, users can no longer set max_num_batched_tokens smaller than max_model_len; such configurations are now rejected with a ValueError.
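For illustration, a configuration that is now rejected (this mirrors the new unit test added below; the AscendSchedulerConfig import path is an assumption):

```python
# Usage sketch mirroring test_invalid_config_without_chunked_prefill below.
# The module path of AscendSchedulerConfig is assumed, not confirmed here.
from vllm.config import SchedulerConfig
from vllm_ascend.core.schedule_config import AscendSchedulerConfig  # assumed path

base = SchedulerConfig(max_num_batched_tokens=8192, max_model_len=8192)
try:
    AscendSchedulerConfig.initialize_from_config(
        base,
        AscendSchedulerConfig(
            enable_chunked_prefill=False,
            max_num_batched_tokens=2048,
            max_model_len=4096,  # exceeds the batch-token budget -> rejected
        ),
    )
except ValueError as e:
    print(e)  # "Ascend scheduler is enabled without chunked prefill feature. ..."
```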
### How was this patch tested?
CI and vLLM serving both passed.
- vLLM version: v0.10.0
- vLLM main: f77a0802b7
Signed-off-by: linfeng-yuan <1102311262@qq.com>
@@ -16,7 +16,7 @@ def test_concurrent_partial_prefill():
                 },
             },
             max_num_seqs=3,
-            max_num_batched_tokens=200,
+            max_num_batched_tokens=2048,
             enforce_eager=True,
             max_model_len=2048,
             gpu_memory_utilization=0.7) as vllm_model:
@@ -35,7 +35,7 @@ def test_prefix_cache_stats_is_recorded():
                 },
             },
             max_num_seqs=3,
-            max_num_batched_tokens=200,
+            max_num_batched_tokens=2048,
             enforce_eager=True,
             max_model_len=2048,
             gpu_memory_utilization=0.7) as vllm_model:
@@ -24,6 +24,7 @@ class TestAscendSchedulerConfig(TestBase):
     def setUp(self):
         self.basic_scheduler_config = SchedulerConfig(
             max_num_batched_tokens=8192,
+            max_model_len=8192,
             is_multimodal_model=False,
             send_delta_data=False,
             scheduler_delay_factor=0,
@@ -51,6 +52,7 @@ class TestAscendSchedulerConfig(TestBase):
                 num_scheduler_steps=1,
                 scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
                 max_num_batched_tokens=2048,
+                max_model_len=2048,
             ),
         )
         self.assertEqual(ascend_config.enable_chunked_prefill, False)
@@ -65,7 +67,11 @@ class TestAscendSchedulerConfig(TestBase):
         with self.assertRaises(NotImplementedError) as context:
             AscendSchedulerConfig.initialize_from_config(
                 self.basic_scheduler_config,
-                AscendSchedulerConfig(policy="custom_policy", ),
+                AscendSchedulerConfig(
+                    policy="custom_policy",
+                    max_num_batched_tokens=2048,
+                    max_model_len=2048,
+                ),
             )
         self.assertIn(
             "currently AscendScheduler only supports fcfs policy",
@@ -83,7 +89,11 @@ class TestAscendSchedulerConfig(TestBase):
         with self.assertRaises(NotImplementedError) as context:
             AscendSchedulerConfig.initialize_from_config(
                 self.basic_scheduler_config,
-                AscendSchedulerConfig(num_scheduler_steps=2),
+                AscendSchedulerConfig(
+                    num_scheduler_steps=2,
+                    max_num_batched_tokens=2048,
+                    max_model_len=2048,
+                ),
             )
         self.assertIn(
             "currently AscendScheduler doesn't support multi-step",
@@ -94,7 +104,12 @@ class TestAscendSchedulerConfig(TestBase):
         with self.assertRaises(NotImplementedError) as context:
             AscendSchedulerConfig.initialize_from_config(
                 self.basic_scheduler_config,
-                AscendSchedulerConfig(send_delta_data=True))
+                AscendSchedulerConfig(
+                    send_delta_data=True,
+                    max_num_batched_tokens=2048,
+                    max_model_len=2048,
+                ),
+            )
         self.assertIn(
             "currently AscendScheduler doesn't support send_delta_data",
             str(context.exception),
@@ -104,7 +119,12 @@ class TestAscendSchedulerConfig(TestBase):
         with self.assertRaises(NotImplementedError) as context:
             AscendSchedulerConfig.initialize_from_config(
                 self.basic_scheduler_config,
-                AscendSchedulerConfig(delay_factor=1))
+                AscendSchedulerConfig(
+                    delay_factor=1,
+                    max_num_batched_tokens=2048,
+                    max_model_len=2048,
+                ),
+            )
         self.assertIn(
             "currently AscendScheduler doesn't support scheduler_delay_factor",
             str(context.exception),
@@ -115,3 +135,33 @@ class TestAscendSchedulerConfig(TestBase):
             self.basic_scheduler_config, {})
         self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
         self.assertEqual(ascend_config.encoder_cache_size, 8192)
+
+    def test_valid_config_with_chunked_prefill(self):
+        ascend_config = AscendSchedulerConfig.initialize_from_config(
+            self.basic_scheduler_config,
+            AscendSchedulerConfig(
+                enable_chunked_prefill=True,
+                max_num_batched_tokens=2048,
+                max_model_len=4096,
+            ),
+        )
+        self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
+        self.assertEqual(ascend_config.max_model_len, 4096)
+        self.assertTrue(ascend_config.enable_chunked_prefill)
+
+    def test_invalid_config_without_chunked_prefill(self):
+        with self.assertRaises(ValueError) as context:
+            AscendSchedulerConfig.initialize_from_config(
+                self.basic_scheduler_config,
+                AscendSchedulerConfig(
+                    enable_chunked_prefill=False,
+                    max_num_batched_tokens=2048,
+                    max_model_len=4096,
+                ),
+            )
+        self.assertIn(
+            "Ascend scheduler is enabled without chunked prefill feature",
+            str(context.exception),
+        )
+        self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
+        self.assertIn("max_model_len (4096)", str(context.exception))
@@ -55,6 +55,16 @@ class AscendSchedulerConfig(SchedulerConfig):
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
         self.chunked_prefill_enabled = self.enable_chunked_prefill
+        if (self.max_num_batched_tokens < self.max_model_len
+                and not self.chunked_prefill_enabled):
+            raise ValueError(
+                "Ascend scheduler is enabled without chunked prefill feature. "
+                f"Argument max_num_batched_tokens ({self.max_num_batched_tokens}) is "
+                f"smaller than max_model_len ({self.max_model_len}). "
+                "This effectively limits the maximum sequence length to "
+                "max_num_batched_tokens and makes vLLM reject longer "
+                "sequences. Please increase max_num_batched_tokens or "
+                "decrease max_model_len.")
         if self.policy != "fcfs":
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"