[Core] Disable the chunked prefill feature in Non-MLA LLMs (#2894)

### What this PR does / why we need it?
This PR forcibly disables the chunked prefill feature for non-MLA models, because the operators that support this feature currently perform suboptimally. Chunked prefill remains available for these models only when the user has explicitly enabled it in the ascend_scheduler_config.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with newly added and existing tests.

Related: https://github.com/vllm-project/vllm-ascend/pull/2659

- vLLM version: main
- vLLM main: d21a36f5f9

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-09-12 23:17:09 +08:00
parent 756b8a1946
commit 585a494baa
3 changed files with 29 additions and 23 deletions
--- a/tests/ut/core/test_schedule_config.py
+++ b/tests/ut/core/test_schedule_config.py
@@ -36,7 +36,6 @@ class TestAscendSchedulerConfig(TestBase):
            self.basic_scheduler_config, {})
        self.assertEqual(ascend_config.enable_chunked_prefill, False)
        self.assertEqual(ascend_config.policy, "fcfs")
-        self.assertEqual(ascend_config.num_scheduler_steps, 1)
        self.assertEqual(ascend_config.scheduler_cls,
                         "vllm_ascend.core.scheduler.AscendScheduler")
        self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
@@ -49,7 +48,6 @@ class TestAscendSchedulerConfig(TestBase):
            AscendSchedulerConfig(
                enable_chunked_prefill=False,
                policy="fcfs",
-                num_scheduler_steps=1,
                scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
                max_num_batched_tokens=2048,
                max_model_len=2048,
@@ -57,7 +55,6 @@ class TestAscendSchedulerConfig(TestBase):
        )
        self.assertEqual(ascend_config.enable_chunked_prefill, False)
        self.assertEqual(ascend_config.policy, "fcfs")
-        self.assertEqual(ascend_config.num_scheduler_steps, 1)
        self.assertEqual(ascend_config.scheduler_cls,
                         "vllm_ascend.core.scheduler.AscendScheduler")
        self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
@@ -85,21 +82,6 @@ class TestAscendSchedulerConfig(TestBase):
        self.assertIn("currently AscendScheduler only supports LLM models",
                      str(context.exception))

-    def test_not_implemented_multi_step(self):
-        with self.assertRaises(NotImplementedError) as context:
-            AscendSchedulerConfig.initialize_from_config(
-                self.basic_scheduler_config,
-                AscendSchedulerConfig(
-                    num_scheduler_steps=2,
-                    max_num_batched_tokens=2048,
-                    max_model_len=2048,
-                ),
-            )
-        self.assertIn(
-            "currently AscendScheduler doesn't support multi-step",
-            str(context.exception),
-        )
-
    def test_not_implemented_send_delta_data(self):
        with self.assertRaises(NotImplementedError) as context:
            AscendSchedulerConfig.initialize_from_config(