[BugFix][Cherry-pick] Cherry-pick PR 3675 to v0.11.0-dev (#3732)

This PR cherry-picks the bugfix for running multi-modal models with AscendScheduler to v0.11.0-dev. Signed-off-by: hw_whx <wanghexiang7@huawei.com> Co-authored-by: hw_whx <wanghexiang7@huawei.com>
2025-10-25 09:41:51 +08:00
parent 12bc78d252
commit 5a2c5be229
2 changed files with 35 additions and 3 deletions
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -55,6 +55,38 @@ def test_multimodal_vl(prompt_template):
            assert output_str, "Generated output should not be empty."
 def test_multimodal_ascend_scheduler(prompt_template):
    """Check image+text generation with the Ascend scheduler enabled.

    Runs Qwen2.5-VL-3B-Instruct through ``VllmRunner`` with
    ``ascend_scheduler_config.enabled`` set to ``True`` and verifies that
    greedy generation yields one non-empty output per prompt.

    Args:
        prompt_template: Callable that receives the list of question strings
            and returns the prompts passed to the model (presumably a pytest
            fixture/parameter defined elsewhere in this module -- confirm).
    """
    # The same RGB image is reused for every question below.
    image = ImageAsset("cherry_blossom") \
        .pil_image.convert("RGB")
    img_questions = [
        "What is the content of this image?",
        "Describe the content of this image in detail.",
        "What's in the image?",
        "Where is this image taken?",
    ]
    # One image per question so prompts and images have the same length.
    images = [image] * len(img_questions)
    prompts = prompt_template(img_questions)
    # additional_config switches on the Ascend scheduler, which is the code
    # path this test targets.
    # NOTE(review): min_pixels/max_pixels/fps are forwarded as multimodal
    # processor kwargs; presumably they bound the image resolution and video
    # sampling rate -- confirm against the Qwen2.5-VL processor docs.
    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
                    max_model_len=4096,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    mm_processor_kwargs={
                        "min_pixels": 28 * 28,
                        "max_pixels": 1280 * 28 * 28,
                        "fps": 1,
                    },
                    enforce_eager=True) as vllm_model:
        outputs = vllm_model.generate_greedy(prompts=prompts,
                                             images=images,
                                             max_tokens=64)
        # Every prompt must produce exactly one result ...
        assert len(outputs) == len(prompts)
        # ... and each result's generated text must be non-empty.
        for _, output_str in outputs:
            assert output_str, "Generated output should not be empty."
 def test_multimodal_audio():
    audio_prompt = "".join([
        f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
--- a/vllm_ascend/core/schedule_config.py
+++ b/vllm_ascend/core/schedule_config.py
@@ -26,7 +26,7 @@ MAX_INT = 2147483647
@dataclass
 class AscendSchedulerConfig(SchedulerConfig):
    enable_chunked_prefill: bool = False
-    max_long_partial_prefills: int = MAX_INT
+    max_long_partial_prefills: int = 1
    long_prefill_token_threshold: int = MAX_INT
    policy: str = "fcfs"
    scheduler_cls: Union[str, Type[object]] = (
@@ -73,9 +73,9 @@ class AscendSchedulerConfig(SchedulerConfig):
                "max_num_batched_tokens and makes vLLM reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len.")
-        # concurrent partial prefills. Default is inf
+        # concurrent partial prefills. Default is 1 meaning not enabled.
        if self.max_long_partial_prefills is None:
-            self.max_long_partial_prefills = MAX_INT
+            self.max_long_partial_prefills = 1
            self.long_prefill_token_threshold = MAX_INT
        if self.long_prefill_token_threshold is None or \