[BugFix][Cherry-pick] Cherry-pick PR 3675 to v0.11.0-dev (#3732)

This PR cherry-picks the bugfix related to running multi-modal models
with AscendScheduler to v0.11.0-dev.

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
This commit is contained in:
whx
2025-10-25 09:41:51 +08:00
committed by GitHub
parent 12bc78d252
commit 5a2c5be229
2 changed files with 35 additions and 3 deletions

View File

@@ -26,7 +26,7 @@ MAX_INT = 2147483647
@dataclass
class AscendSchedulerConfig(SchedulerConfig):
enable_chunked_prefill: bool = False
max_long_partial_prefills: int = MAX_INT
max_long_partial_prefills: int = 1
long_prefill_token_threshold: int = MAX_INT
policy: str = "fcfs"
scheduler_cls: Union[str, Type[object]] = (
@@ -73,9 +73,9 @@ class AscendSchedulerConfig(SchedulerConfig):
"max_num_batched_tokens and makes vLLM reject longer "
"sequences. Please increase max_num_batched_tokens or "
"decrease max_model_len.")
# concurrent partial prefills. Default is inf
# concurrent partial prefills. Default is 1 meaning not enabled.
if self.max_long_partial_prefills is None:
self.max_long_partial_prefills = MAX_INT
self.max_long_partial_prefills = 1
self.long_prefill_token_threshold = MAX_INT
if self.long_prefill_token_threshold is None or \