[BugFix][Core] Fix a bug running multi-modal with ascend_scheduler (#3675)

This PR fixes a bug that occurs when running multi-modal models with
AscendScheduler. The bug was introduced by PR #2372, which reused the
same parameter names as vLLM but with different default values.

This change fixes the bug by aligning the default values of these two
parameters with vLLM's.
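
To illustrate the aligned defaults, here is a minimal self-contained sketch (not part of this PR) of the relevant behavior; the base SchedulerConfig and unrelated fields are omitted, and the class name AscendSchedulerConfigSketch is made up for this example:

from dataclasses import dataclass

MAX_INT = 2147483647  # sentinel meaning "no limit / no threshold"


@dataclass
class AscendSchedulerConfigSketch:
    # After this fix the defaults mirror vLLM: at most one concurrent
    # long partial prefill, and an effectively unlimited long-prefill
    # token threshold unless explicitly configured.
    max_long_partial_prefills: int = 1
    long_prefill_token_threshold: int = MAX_INT

    def __post_init__(self):
        # Treat None as "fall back to the vLLM-aligned default",
        # matching the post-init logic in the diff below.
        if self.max_long_partial_prefills is None:
            self.max_long_partial_prefills = 1
            self.long_prefill_token_threshold = MAX_INT


cfg = AscendSchedulerConfigSketch()
assert cfg.max_long_partial_prefills == 1
assert cfg.long_prefill_token_threshold == MAX_INT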

- vLLM version: v0.11.0rc3
- vLLM main:
17c540a993

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
Author: whx
Date: 2025-10-25 09:41:33 +08:00
Committed by: GitHub
Parent: 1a9feb3ba5
Commit: e33751ef8b
2 changed files with 36 additions and 4 deletions


@@ -26,7 +26,7 @@ MAX_INT = 2147483647
 @dataclass
 class AscendSchedulerConfig(SchedulerConfig):
     enable_chunked_prefill: bool = False
-    max_long_partial_prefills: int = MAX_INT
+    max_long_partial_prefills: int = 1
     long_prefill_token_threshold: int = MAX_INT
     policy: str = "fcfs"
     scheduler_cls: Union[str, Type[object]] = (
@@ -73,9 +73,9 @@ class AscendSchedulerConfig(SchedulerConfig):
                 "max_num_batched_tokens and makes vLLM reject longer "
                 "sequences. Please increase max_num_batched_tokens or "
                 "decrease max_model_len.")
-        # concurrent partial prefills. Default is inf
+        # concurrent partial prefills. Default is 1 meaning not enabled.
         if self.max_long_partial_prefills is None:
-            self.max_long_partial_prefills = MAX_INT
+            self.max_long_partial_prefills = 1
             self.long_prefill_token_threshold = MAX_INT
         if self.long_prefill_token_threshold is None or \
@@ -105,4 +105,4 @@ class AscendSchedulerConfig(SchedulerConfig):
         if getattr(self, "scheduler_delay_factor", 0) > 0:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support scheduler_delay_factor."
-            )
+            )
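
For reference, a hedged usage sketch of how a multi-modal model might be run with the Ascend scheduler enabled so that these defaults take effect; the additional_config keys and the model name below are assumptions for illustration, not part of this diff:

from vllm import LLM

# Assumed vllm-ascend usage pattern: enable AscendScheduler via
# additional_config. With this fix, max_long_partial_prefills defaults
# to 1 and long_prefill_token_threshold to MAX_INT, matching vLLM's
# parameter semantics unless the user overrides them.
llm = LLM(
    model="Qwen/Qwen2.5-VL-7B-Instruct",  # example multi-modal model (assumption)
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
        },
    },
)

outputs = llm.generate("Describe the image in one sentence.")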