From 5a2c5be2297a9af7260878ad3fc1d9ec0bbc6ac9 Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Sat, 25 Oct 2025 09:41:51 +0800 Subject: [PATCH] [BugFix][Cherry-pick] Cherry-pick PR 3675 to v0.11.0-dev (#3732) This PR cherry-picks the bugfix related to running multi-modal models with AscendScheduler to v0.11.0-dev Signed-off-by: hw_whx Co-authored-by: hw_whx --- tests/e2e/singlecard/test_vlm.py | 32 +++++++++++++++++++++++++++++ vllm_ascend/core/schedule_config.py | 6 +++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py index 654078e..8808d1e 100644 --- a/tests/e2e/singlecard/test_vlm.py +++ b/tests/e2e/singlecard/test_vlm.py @@ -55,6 +55,38 @@ def test_multimodal_vl(prompt_template): assert output_str, "Generated output should not be empty." +def test_multimodal_ascend_scheduler(prompt_template): + image = ImageAsset("cherry_blossom") \ + .pil_image.convert("RGB") + img_questions = [ + "What is the content of this image?", + "Describe the content of this image in detail.", + "What's in the image?", + "Where is this image taken?", + ] + images = [image] * len(img_questions) + prompts = prompt_template(img_questions) + with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct", + max_model_len=4096, + additional_config={ + 'ascend_scheduler_config': { + 'enabled': True, + }, + }, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": 1, + }, + enforce_eager=True) as vllm_model: + outputs = vllm_model.generate_greedy(prompts=prompts, + images=images, + max_tokens=64) + assert len(outputs) == len(prompts) + for _, output_str in outputs: + assert output_str, "Generated output should not be empty." 
+ + def test_multimodal_audio(): audio_prompt = "".join([ f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" diff --git a/vllm_ascend/core/schedule_config.py b/vllm_ascend/core/schedule_config.py index 3736534..c8695e6 100644 --- a/vllm_ascend/core/schedule_config.py +++ b/vllm_ascend/core/schedule_config.py @@ -26,7 +26,7 @@ MAX_INT = 2147483647 @dataclass class AscendSchedulerConfig(SchedulerConfig): enable_chunked_prefill: bool = False - max_long_partial_prefills: int = MAX_INT + max_long_partial_prefills: int = 1 long_prefill_token_threshold: int = MAX_INT policy: str = "fcfs" scheduler_cls: Union[str, Type[object]] = ( @@ -73,9 +73,9 @@ class AscendSchedulerConfig(SchedulerConfig): "max_num_batched_tokens and makes vLLM reject longer " "sequences. Please increase max_num_batched_tokens or " "decrease max_model_len.") - # concurrent partial prefills. Default is inf + # concurrent partial prefills. Default is 1 meaning not enabled. if self.max_long_partial_prefills is None: - self.max_long_partial_prefills = MAX_INT + self.max_long_partial_prefills = 1 self.long_prefill_token_threshold = MAX_INT if self.long_prefill_token_threshold is None or \