[BugFix][Cherry-pick] Cherry-pick PR 3675 to v0.11.0-dev (#3732)
This PR cherry-picks the bugfix for running multi-modal models with AscendScheduler to v0.11.0-dev.

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
This commit is contained in:
@@ -55,6 +55,38 @@ def test_multimodal_vl(prompt_template):
|
|||||||
assert output_str, "Generated output should not be empty."
|
assert output_str, "Generated output should not be empty."
|
||||||
|
|
||||||
|
|
||||||
|
def test_multimodal_ascend_scheduler(prompt_template):
|
||||||
|
image = ImageAsset("cherry_blossom") \
|
||||||
|
.pil_image.convert("RGB")
|
||||||
|
img_questions = [
|
||||||
|
"What is the content of this image?",
|
||||||
|
"Describe the content of this image in detail.",
|
||||||
|
"What's in the image?",
|
||||||
|
"Where is this image taken?",
|
||||||
|
]
|
||||||
|
images = [image] * len(img_questions)
|
||||||
|
prompts = prompt_template(img_questions)
|
||||||
|
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
|
max_model_len=4096,
|
||||||
|
additional_config={
|
||||||
|
'ascend_scheduler_config': {
|
||||||
|
'enabled': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
mm_processor_kwargs={
|
||||||
|
"min_pixels": 28 * 28,
|
||||||
|
"max_pixels": 1280 * 28 * 28,
|
||||||
|
"fps": 1,
|
||||||
|
},
|
||||||
|
enforce_eager=True) as vllm_model:
|
||||||
|
outputs = vllm_model.generate_greedy(prompts=prompts,
|
||||||
|
images=images,
|
||||||
|
max_tokens=64)
|
||||||
|
assert len(outputs) == len(prompts)
|
||||||
|
for _, output_str in outputs:
|
||||||
|
assert output_str, "Generated output should not be empty."
|
||||||
|
|
||||||
|
|
||||||
def test_multimodal_audio():
|
def test_multimodal_audio():
|
||||||
audio_prompt = "".join([
|
audio_prompt = "".join([
|
||||||
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
|
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ MAX_INT = 2147483647
|
|||||||
@dataclass
|
@dataclass
|
||||||
class AscendSchedulerConfig(SchedulerConfig):
|
class AscendSchedulerConfig(SchedulerConfig):
|
||||||
enable_chunked_prefill: bool = False
|
enable_chunked_prefill: bool = False
|
||||||
max_long_partial_prefills: int = MAX_INT
|
max_long_partial_prefills: int = 1
|
||||||
long_prefill_token_threshold: int = MAX_INT
|
long_prefill_token_threshold: int = MAX_INT
|
||||||
policy: str = "fcfs"
|
policy: str = "fcfs"
|
||||||
scheduler_cls: Union[str, Type[object]] = (
|
scheduler_cls: Union[str, Type[object]] = (
|
||||||
@@ -73,9 +73,9 @@ class AscendSchedulerConfig(SchedulerConfig):
|
|||||||
"max_num_batched_tokens and makes vLLM reject longer "
|
"max_num_batched_tokens and makes vLLM reject longer "
|
||||||
"sequences. Please increase max_num_batched_tokens or "
|
"sequences. Please increase max_num_batched_tokens or "
|
||||||
"decrease max_model_len.")
|
"decrease max_model_len.")
|
||||||
# concurrent partial prefills. Default is inf
|
# concurrent partial prefills. Default is 1 meaning not enabled.
|
||||||
if self.max_long_partial_prefills is None:
|
if self.max_long_partial_prefills is None:
|
||||||
self.max_long_partial_prefills = MAX_INT
|
self.max_long_partial_prefills = 1
|
||||||
self.long_prefill_token_threshold = MAX_INT
|
self.long_prefill_token_threshold = MAX_INT
|
||||||
|
|
||||||
if self.long_prefill_token_threshold is None or \
|
if self.long_prefill_token_threshold is None or \
|
||||||
|
|||||||
Reference in New Issue
Block a user