[Core] Disable the chunked prefill feature in Non-MLA LLMs (#2894)
### What this PR does / why we need it?
This PR forcibly disables the chunked prefill feature in Non-MLA
models, as the performance of the operators supporting this
functionality is currently suboptimal. The feature is allowed only
when the user has explicitly enabled chunked prefill in the
ascend_scheduler_config.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
CI passed with newly added and existing tests.
Related: https://github.com/vllm-project/vllm-ascend/pull/2659
- vLLM version: main
- vLLM main:
d21a36f5f9
Signed-off-by: rjg-lyh <1318825571@qq.com>
This commit is contained in:
@@ -36,7 +36,6 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
self.basic_scheduler_config, {})
|
||||
self.assertEqual(ascend_config.enable_chunked_prefill, False)
|
||||
self.assertEqual(ascend_config.policy, "fcfs")
|
||||
self.assertEqual(ascend_config.num_scheduler_steps, 1)
|
||||
self.assertEqual(ascend_config.scheduler_cls,
|
||||
"vllm_ascend.core.scheduler.AscendScheduler")
|
||||
self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
|
||||
@@ -49,7 +48,6 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
AscendSchedulerConfig(
|
||||
enable_chunked_prefill=False,
|
||||
policy="fcfs",
|
||||
num_scheduler_steps=1,
|
||||
scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
|
||||
max_num_batched_tokens=2048,
|
||||
max_model_len=2048,
|
||||
@@ -57,7 +55,6 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
)
|
||||
self.assertEqual(ascend_config.enable_chunked_prefill, False)
|
||||
self.assertEqual(ascend_config.policy, "fcfs")
|
||||
self.assertEqual(ascend_config.num_scheduler_steps, 1)
|
||||
self.assertEqual(ascend_config.scheduler_cls,
|
||||
"vllm_ascend.core.scheduler.AscendScheduler")
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
|
||||
@@ -85,21 +82,6 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
self.assertIn("currently AscendScheduler only supports LLM models",
|
||||
str(context.exception))
|
||||
|
||||
def test_not_implemented_multi_step(self):
|
||||
with self.assertRaises(NotImplementedError) as context:
|
||||
AscendSchedulerConfig.initialize_from_config(
|
||||
self.basic_scheduler_config,
|
||||
AscendSchedulerConfig(
|
||||
num_scheduler_steps=2,
|
||||
max_num_batched_tokens=2048,
|
||||
max_model_len=2048,
|
||||
),
|
||||
)
|
||||
self.assertIn(
|
||||
"currently AscendScheduler doesn't support multi-step",
|
||||
str(context.exception),
|
||||
)
|
||||
|
||||
def test_not_implemented_send_delta_data(self):
|
||||
with self.assertRaises(NotImplementedError) as context:
|
||||
AscendSchedulerConfig.initialize_from_config(
|
||||
|
||||
Reference in New Issue
Block a user