[BugFix][Core] Fix a bug running multi-modal with ascend_scheduler (#3675)

This PR fix the bug related with running multi-modal models with
AscendScheduler. This bug was introduced by PR #2372 by using the same
parameter names as vLLM with different default values. 

Currently I fix this bug by changing the default values of these two
parameters to align with vLLM. 

- vLLM version: v0.11.0rc3
- vLLM main:
17c540a993

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
This commit is contained in:
whx
2025-10-25 09:41:33 +08:00
committed by GitHub
parent 1a9feb3ba5
commit e33751ef8b
2 changed files with 36 additions and 4 deletions

View File

@@ -55,6 +55,38 @@ def test_multimodal_vl(prompt_template):
assert output_str, "Generated output should not be empty."
def test_multimodal_ascend_scheduler(prompt_template):
image = ImageAsset("cherry_blossom") \
.pil_image.convert("RGB")
img_questions = [
"What is the content of this image?",
"Describe the content of this image in detail.",
"What's in the image?",
"Where is this image taken?",
]
images = [image] * len(img_questions)
prompts = prompt_template(img_questions)
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
max_model_len=4096,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
enforce_eager=True) as vllm_model:
outputs = vllm_model.generate_greedy(prompts=prompts,
images=images,
max_tokens=64)
assert len(outputs) == len(prompts)
for _, output_str in outputs:
assert output_str, "Generated output should not be empty."
def test_multimodal_audio():
audio_prompt = "".join([
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"

View File

@@ -26,7 +26,7 @@ MAX_INT = 2147483647
@dataclass
class AscendSchedulerConfig(SchedulerConfig):
enable_chunked_prefill: bool = False
max_long_partial_prefills: int = MAX_INT
max_long_partial_prefills: int = 1
long_prefill_token_threshold: int = MAX_INT
policy: str = "fcfs"
scheduler_cls: Union[str, Type[object]] = (
@@ -73,9 +73,9 @@ class AscendSchedulerConfig(SchedulerConfig):
"max_num_batched_tokens and makes vLLM reject longer "
"sequences. Please increase max_num_batched_tokens or "
"decrease max_model_len.")
# concurrent partial prefills. Default is inf
# concurrent partial prefills. Default is 1 meaning not enabled.
if self.max_long_partial_prefills is None:
self.max_long_partial_prefills = MAX_INT
self.max_long_partial_prefills = 1
self.long_prefill_token_threshold = MAX_INT
if self.long_prefill_token_threshold is None or \
@@ -105,4 +105,4 @@ class AscendSchedulerConfig(SchedulerConfig):
if getattr(self, "scheduler_delay_factor", 0) > 0:
raise NotImplementedError(
"currently AscendScheduler doesn't support scheduler_delay_factor."
)
)