Sync from v0.13
This commit is contained in:
0
tests/models/multimodal/generation/__init__.py
Normal file
0
tests/models/multimodal/generation/__init__.py
Normal file
35
tests/models/multimodal/generation/conftest.py
Normal file
35
tests/models/multimodal/generation/conftest.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM tests."""
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def pytest_configure(config):
|
||||
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF
|
||||
Transformers accuracy issues.
|
||||
"""
|
||||
if not current_platform.is_rocm():
|
||||
return
|
||||
|
||||
skip_patterns = ["test_granite_speech.py"]
|
||||
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
|
||||
# Skip disabling SDP for Granite Speech tests on ROCm
|
||||
return
|
||||
|
||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||
# accuracy issues
|
||||
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||
torch.backends.cuda.enable_flash_sdp(False)
|
||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||
torch.backends.cuda.enable_math_sdp(True)
|
||||
warnings.warn(
|
||||
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
|
||||
"to avoid HuggingFace Transformers accuracy issues",
|
||||
UserWarning,
|
||||
stacklevel=1,
|
||||
)
|
||||
142
tests/models/multimodal/generation/test_audioflamingo3.py
Normal file
142
tests/models/multimodal/generation/test_audioflamingo3.py
Normal file
@@ -0,0 +1,142 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2025 The vLLM team.
|
||||
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
|
||||
# reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.registry import HF_EXAMPLE_MODELS
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
MODEL_NAME = "nvidia/audio-flamingo-3-hf"
|
||||
|
||||
|
||||
def get_fixture_path(filename):
|
||||
return os.path.join(
|
||||
os.path.dirname(__file__), "../../fixtures/audioflamingo3", filename
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# Check if the model is supported by the current transformers version
|
||||
model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
try:
|
||||
llm = LLM(
|
||||
model=MODEL_NAME,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
)
|
||||
return llm
|
||||
except Exception as e:
|
||||
pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
|
||||
|
||||
|
||||
def test_single_generation(llm):
|
||||
fixture_path = get_fixture_path("expected_results_single.json")
|
||||
if not os.path.exists(fixture_path):
|
||||
pytest.skip(f"Fixture not found: {fixture_path}")
|
||||
|
||||
with open(fixture_path) as f:
|
||||
expected = json.load(f)
|
||||
|
||||
audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "audio_url", "audio_url": {"url": audio_url}},
|
||||
{"type": "text", "text": "Transcribe the input speech."},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
|
||||
|
||||
outputs = llm.chat(
|
||||
messages=messages,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
generated_text = outputs[0].outputs[0].text.strip()
|
||||
|
||||
expected_text = expected["transcriptions"][0]
|
||||
|
||||
assert expected_text in generated_text or generated_text in expected_text
|
||||
|
||||
|
||||
def test_batched_generation(llm):
|
||||
fixture_path = get_fixture_path("expected_results_batched.json")
|
||||
if not os.path.exists(fixture_path):
|
||||
pytest.skip(f"Fixture not found: {fixture_path}")
|
||||
|
||||
with open(fixture_path) as f:
|
||||
expected = json.load(f)
|
||||
|
||||
items = [
|
||||
{
|
||||
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
|
||||
"question": "What is surprising about the relationship "
|
||||
"between the barking and the music?",
|
||||
"expected_idx": 0,
|
||||
},
|
||||
{
|
||||
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
|
||||
"question": (
|
||||
"Why is the philosopher's name mentioned in the lyrics? "
|
||||
"(A) To express a sense of nostalgia "
|
||||
"(B) To indicate that language cannot express clearly, "
|
||||
"satirizing the inversion of black and white in the world "
|
||||
"(C) To add depth and complexity to the lyrics "
|
||||
"(D) To showcase the wisdom and influence of the philosopher"
|
||||
),
|
||||
"expected_idx": 1,
|
||||
},
|
||||
]
|
||||
|
||||
conversations = []
|
||||
for item in items:
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
|
||||
{"type": "text", "text": item["question"]},
|
||||
],
|
||||
}
|
||||
]
|
||||
conversations.append(messages)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
|
||||
|
||||
outputs = llm.chat(
|
||||
messages=conversations,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
for i, output in enumerate(outputs):
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
expected_text = expected["transcriptions"][i]
|
||||
|
||||
assert expected_text in generated_text or generated_text in expected_text
|
||||
1263
tests/models/multimodal/generation/test_common.py
Normal file
1263
tests/models/multimodal/generation/test_common.py
Normal file
File diff suppressed because it is too large
Load Diff
160
tests/models/multimodal/generation/test_granite_speech.py
Normal file
160
tests/models/multimodal/generation/test_granite_speech.py
Normal file
@@ -0,0 +1,160 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSpeechSeq2Seq
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|><|audio|>can you transcribe the speech into a written format?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501
|
||||
|
||||
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: tuple[list[int], str, SampleLogprobs | None],
|
||||
) -> tuple[list[int], str, SampleLogprobs | None]:
|
||||
"""Sanitize hf output to be comparable with vllm output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "<|end_of_text|>"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
MODEL_NAME = "ibm-granite/granite-speech-3.3-2b"
|
||||
# Audio lora co-exists directly in the model directory, but
|
||||
# currently still needs to be passed directly to vLLM.
|
||||
audio_lora_path = MODEL_NAME
|
||||
models = [MODEL_NAME]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def set_attention_backend_for_rocm(monkeypatch):
|
||||
if current_platform.is_rocm():
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: Sequence[tuple[list[str], PromptAudioInput]],
|
||||
model: str,
|
||||
*,
|
||||
max_model_len: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: str | None = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the audio fixtures for the test are from AUDIO_ASSETS.
|
||||
For huggingface runner, we provide the audio as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=64,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("audio", 1, audio_lora_path)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=audios,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
for prompts, audios in inputs
|
||||
]
|
||||
|
||||
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=[audios],
|
||||
eos_token_id=eos_token_id,
|
||||
)
|
||||
for prompts, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"max_model_len", [512] if current_platform.is_rocm() else [2048]
|
||||
)
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model: str,
|
||||
audio_assets: AudioTestAssets,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
audio, sr = audio_assets[0].audio_and_sample_rate
|
||||
# This model expects 16k sample rate, which our test audio
|
||||
# already is; if this changes, it may break this test,
|
||||
# so we check it directly
|
||||
assert sr == 16000
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
[
|
||||
([HF_AUDIO_PROMPT], [audio]),
|
||||
],
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
81
tests/models/multimodal/generation/test_interleaved.py
Normal file
81
tests/models/multimodal/generation/test_interleaved.py
Normal file
@@ -0,0 +1,81 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
|
||||
models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
|
||||
|
||||
|
||||
def base_prompt(modalities_str: str) -> str:
|
||||
return f"<|im_start|>user {modalities_str}\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
|
||||
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["float16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
|
||||
"""
|
||||
This is a simple test to check if interleaved and non-interleaved prompts
|
||||
give the same result.
|
||||
"""
|
||||
|
||||
image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
|
||||
images = [image_cherry, image_stop]
|
||||
video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
|
||||
|
||||
inputs = [
|
||||
(
|
||||
[INTERLEAVED_PROMPT],
|
||||
[images],
|
||||
[video],
|
||||
),
|
||||
(
|
||||
[NONINTERLEAVED_PROMPT],
|
||||
[images],
|
||||
[video],
|
||||
),
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
max_model_len=32768,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=1,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy(
|
||||
prompts, max_tokens, images=images, videos=videos
|
||||
)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
all_results = [output[0][1] for output in vllm_outputs_per_case]
|
||||
outputs = [
|
||||
(total_str, total_str.find("assistant\n") + len("assistant\n"))
|
||||
for total_str in all_results
|
||||
]
|
||||
prompt_lengths = [prompt_len for _, prompt_len in outputs]
|
||||
generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
|
||||
interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
|
||||
interleaved_output_str, noninterleaved_output_str = generated_strs
|
||||
|
||||
# The two prompts are identical except for the order of modality tokens.
|
||||
assert interleaved_prompt_len == noninterleaved_prompt_len
|
||||
|
||||
# The two generated strings should be different because of the
|
||||
# interleaved modality tokens.
|
||||
assert interleaved_output_str != noninterleaved_output_str
|
||||
86
tests/models/multimodal/generation/test_keye.py
Normal file
86
tests/models/multimodal/generation/test_keye.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
from PIL.Image import Image
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
|
||||
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
|
||||
|
||||
QUESTION = "What is the content of each image?"
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompt: str
|
||||
image_data: list[Image]
|
||||
stop_token_ids: list[int] | None = None
|
||||
chat_template: str | None = None
|
||||
sampling_params: SamplingParams | None = None
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("question", [QUESTION])
|
||||
def test_keye_vl(
|
||||
image_assets,
|
||||
question: str,
|
||||
):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
image_urls = [
|
||||
f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
|
||||
]
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
||||
|
||||
prompt = processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
engine_args = asdict(engine_args) | {"seed": 42}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0, max_tokens=256, stop_token_ids=None
|
||||
)
|
||||
|
||||
outputs = llm.generate(
|
||||
{
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {"image": images},
|
||||
},
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
print("-" * 50)
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
assert len(generated_text) > 10, (
|
||||
f"Generated text is too short: {generated_text}"
|
||||
)
|
||||
print("-" * 50)
|
||||
723
tests/models/multimodal/generation/test_maverick.py
Normal file
723
tests/models/multimodal/generation/test_maverick.py
Normal file
@@ -0,0 +1,723 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Create a reduced-layer version of the Maverick model for testing purposes.
|
||||
|
||||
This script creates a new model with fewer layers by:
|
||||
1. Loading the original Maverick model configuration
|
||||
2. Creating a reduced configuration
|
||||
3. Generating compatible safetensors files with appropriate weights
|
||||
4. Creating the necessary index files for vLLM compatibility
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
# Sample prompts for testing
|
||||
PROMPTS: list[str] = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
|
||||
def run_maverick_serving(model: str):
|
||||
"""Test Llama-4-Maverick model with vLLM LLM class using CLI equivalent
|
||||
options with reduced layers.
|
||||
"""
|
||||
|
||||
try:
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
llm = LLM(
|
||||
model=model,
|
||||
max_model_len=2048,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=8,
|
||||
enable_expert_parallel=True,
|
||||
trust_remote_code=True,
|
||||
gpu_memory_utilization=0.4,
|
||||
kv_cache_dtype="fp8",
|
||||
)
|
||||
|
||||
outputs = llm.generate(PROMPTS, sampling_params)
|
||||
|
||||
# Print the outputs
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}")
|
||||
print(f"Output: {generated_text!r}")
|
||||
print("-" * 60)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error initializing or running model: {e}")
|
||||
raise
|
||||
|
||||
|
||||
def get_rope_layers_config(model_path: str) -> list[int]:
|
||||
"""
|
||||
Get the interleaved RoPE configuration from HuggingFace config
|
||||
|
||||
Args:
|
||||
model_path: Path to the local directory containing the reduced
|
||||
Maverick model checkpoint
|
||||
|
||||
Returns:
|
||||
List of 0 or 1 indicating whether each layer uses RoPE and local attn
|
||||
0 indicates that RoPE is not used while 1 indicates that RoPE is used.
|
||||
"""
|
||||
config_path = Path(model_path) / "config.json"
|
||||
model_config = json.loads(config_path.read_text())
|
||||
text_config = model_config["text_config"]
|
||||
no_rope_layers = text_config["no_rope_layers"]
|
||||
print(f"Found no_rope_layers: {no_rope_layers}")
|
||||
return no_rope_layers
|
||||
|
||||
|
||||
def create_reduced_maverick_model(
|
||||
original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
output_dir: str = "/tmp/reduced_maverick",
|
||||
text_layers: int = 4,
|
||||
num_experts: int = 4,
|
||||
vision_layers: int = 2,
|
||||
force_recreate: bool = False,
|
||||
) -> str:
|
||||
"""
|
||||
Create a reduced-layer version of the Maverick model.
|
||||
|
||||
Args:
|
||||
original_model_name: Name of the original Maverick model
|
||||
output_dir: Directory to save the reduced model
|
||||
text_layers: Number of text transformer layers
|
||||
num_experts: Number of experts per layer
|
||||
vision_layers: Number of vision transformer layers
|
||||
force_recreate: Whether to recreate if output_dir already exists
|
||||
|
||||
Returns:
|
||||
Path to the created reduced model directory
|
||||
"""
|
||||
|
||||
print(
|
||||
f"Creating reduced Maverick model with {text_layers} text layers and "
|
||||
f"{vision_layers} vision layers..."
|
||||
)
|
||||
|
||||
# Create output directory
|
||||
output_path = Path(output_dir)
|
||||
if output_path.exists():
|
||||
if force_recreate:
|
||||
shutil.rmtree(output_path)
|
||||
else:
|
||||
print(
|
||||
f"Output directory {output_dir} already exists. "
|
||||
"Use --force-recreate to overwrite."
|
||||
)
|
||||
return str(output_path)
|
||||
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
print("Loading original model configuration...")
|
||||
original_config = AutoConfig.from_pretrained(
|
||||
original_model_name, trust_remote_code=True
|
||||
)
|
||||
print("Creating reduced configuration...")
|
||||
reduced_config = create_reduced_config(
|
||||
original_config, text_layers, num_experts, vision_layers
|
||||
)
|
||||
|
||||
config_path = output_path / "config.json"
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(reduced_config, f, indent=2)
|
||||
print(f"Saved reduced config to {config_path}")
|
||||
|
||||
print("Copying tokenizer files...")
|
||||
copy_tokenizer_files(original_model_name, output_path)
|
||||
|
||||
print("Creating reduced safetensors files...")
|
||||
create_reduced_safetensors(original_config, reduced_config, output_path)
|
||||
|
||||
print("Creating preprocessor config...")
|
||||
create_preprocessor_config(original_config, output_path)
|
||||
|
||||
try:
|
||||
gen_config = GenerationConfig.from_pretrained(original_model_name)
|
||||
gen_config.save_pretrained(output_path)
|
||||
print("Copied generation config")
|
||||
except Exception as e:
|
||||
print(f"Could not copy generation config: {e}")
|
||||
|
||||
print(f"Successfully created reduced Maverick model at {output_path}")
|
||||
return str(output_path)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error creating reduced model: {e}")
|
||||
# Clean up on failure
|
||||
if output_path.exists():
|
||||
shutil.rmtree(output_path)
|
||||
raise
|
||||
|
||||
|
||||
def create_reduced_config(
|
||||
original_config: Any, text_layers: int, num_experts: int, vision_layers: int
|
||||
) -> dict[str, Any]:
|
||||
"""Create a reduced configuration based on the original."""
|
||||
|
||||
# Convert config to dictionary
|
||||
config_dict = original_config.to_dict()
|
||||
|
||||
# Reduce text layers
|
||||
if "text_config" in config_dict:
|
||||
original_text_layers = config_dict["text_config"]["num_hidden_layers"]
|
||||
config_dict["text_config"]["num_hidden_layers"] = text_layers
|
||||
original_layer_types = config_dict["text_config"]["layer_types"]
|
||||
config_dict["text_config"]["layer_types"] = original_layer_types[:text_layers]
|
||||
print(f"Reduced text layers from {original_text_layers} to {text_layers}")
|
||||
|
||||
original_num_experts = config_dict["text_config"]["num_local_experts"]
|
||||
config_dict["text_config"]["num_local_experts"] = num_experts
|
||||
print(f"Reduced num experts from {original_num_experts} to {num_experts}")
|
||||
|
||||
hidden_dim_divisor = 4
|
||||
|
||||
original_hidden_size = config_dict["text_config"]["hidden_size"]
|
||||
new_hidden_size = original_hidden_size // hidden_dim_divisor
|
||||
config_dict["text_config"]["hidden_size"] = new_hidden_size
|
||||
print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
|
||||
|
||||
original_head_dim = config_dict["text_config"]["head_dim"]
|
||||
new_head_dim = original_head_dim // hidden_dim_divisor
|
||||
config_dict["text_config"]["head_dim"] = new_head_dim
|
||||
print(f"Reduced head dim from {original_head_dim} to {new_head_dim}")
|
||||
|
||||
# Reduce vision layers
|
||||
if "vision_config" in config_dict:
|
||||
original_vision_layers = config_dict["vision_config"]["num_hidden_layers"]
|
||||
config_dict["vision_config"]["num_hidden_layers"] = vision_layers
|
||||
print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
|
||||
|
||||
# Update model name to indicate it's a reduced version
|
||||
config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
|
||||
|
||||
return config_dict
|
||||
|
||||
|
||||
def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
|
||||
"""Copy tokenizer files from the original model."""
|
||||
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
original_model_name, trust_remote_code=True
|
||||
)
|
||||
tokenizer.save_pretrained(output_path)
|
||||
print("Tokenizer files copied successfully")
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not copy tokenizer files: {e}")
|
||||
|
||||
|
||||
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
|
||||
"""Create preprocessor_config.json for multimodal model."""
|
||||
|
||||
# Try to load the original preprocessor config
|
||||
try:
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
original_config._name_or_path
|
||||
or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
processor.save_pretrained(output_path)
|
||||
print("Copied original preprocessor config")
|
||||
return
|
||||
except Exception as e:
|
||||
print(f"Could not copy original preprocessor config: {e}")
|
||||
raise
|
||||
|
||||
|
||||
def create_reduced_safetensors(
|
||||
original_config: Any, reduced_config: dict[str, Any], output_path: Path
|
||||
) -> None:
|
||||
"""Create safetensors files with weights for the reduced model."""
|
||||
|
||||
print("Generating synthetic weights for reduced model...")
|
||||
|
||||
text_config = reduced_config["text_config"]
|
||||
vision_config = reduced_config["vision_config"]
|
||||
|
||||
weights = {}
|
||||
|
||||
print("Creating text model weights...")
|
||||
weights.update(create_text_model_weights(text_config))
|
||||
|
||||
print("Creating vision model weights...")
|
||||
weights.update(create_vision_model_weights(vision_config))
|
||||
|
||||
print("Creating shared model weights...")
|
||||
weights.update(create_shared_weights(text_config, vision_config))
|
||||
|
||||
print("Saving weights to safetensors files...")
|
||||
save_weights_to_safetensors(weights, output_path)
|
||||
|
||||
|
||||
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
"""Create synthetic weights for the text model with MoE structure."""
|
||||
|
||||
weights = {}
|
||||
|
||||
vocab_size = text_config["vocab_size"]
|
||||
hidden_size = text_config["hidden_size"]
|
||||
intermediate_size = text_config["intermediate_size"]
|
||||
intermediate_size_mlp = text_config["intermediate_size_mlp"]
|
||||
num_layers = text_config["num_hidden_layers"]
|
||||
num_attention_heads = text_config["num_attention_heads"]
|
||||
num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
|
||||
|
||||
# MoE specific parameters
|
||||
num_experts = text_config.get("num_local_experts")
|
||||
assert num_experts is not None, "num_local_experts must be specified for MoE"
|
||||
|
||||
head_dim = hidden_size // num_attention_heads
|
||||
|
||||
# Embedding layers
|
||||
weights["language_model.model.embed_tokens.weight"] = torch.randn(
|
||||
vocab_size, hidden_size, dtype=torch.float16
|
||||
)
|
||||
|
||||
# Transformer layers
|
||||
for layer_idx in range(num_layers):
|
||||
layer_prefix = f"language_model.model.layers.{layer_idx}"
|
||||
print(f"Creating weights for layer {layer_prefix}...")
|
||||
|
||||
# Self-attention weights (separate q, k, v projections)
|
||||
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
|
||||
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
|
||||
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
|
||||
)
|
||||
print("Self-attention weights created.")
|
||||
|
||||
# Feed-forward weights - MoE pattern based on interleave_moe_layer_step
|
||||
# For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
|
||||
# 0,2,4,... are dense
|
||||
interleave_step = text_config.get("interleave_moe_layer_step", 1)
|
||||
is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
|
||||
|
||||
if is_moe_layer:
|
||||
# MoE layer structure
|
||||
# 1. Router weights
|
||||
weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
|
||||
num_experts, hidden_size, dtype=torch.float16
|
||||
)
|
||||
|
||||
# 2. Individual expert weights (not fused)
|
||||
for expert_idx in range(num_experts):
|
||||
expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
|
||||
|
||||
weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
# Expert weight scales (FP8 quantization)
|
||||
weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
|
||||
intermediate_size, 1, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
|
||||
intermediate_size, 1, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
|
||||
hidden_size, 1, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
# 3. Shared expert weights
|
||||
shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
|
||||
weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
print(f"MoE feed-forward weights created for layer {layer_idx}.")
|
||||
else:
|
||||
# Dense layer structure
|
||||
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
|
||||
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
|
||||
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
|
||||
)
|
||||
print(f"Dense feed-forward weights created for layer {layer_idx}.")
|
||||
|
||||
# Layer norms
|
||||
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
print("Layer norms created.")
|
||||
|
||||
# Final layer norm and output projection
|
||||
weights["language_model.model.norm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights["language_model.lm_head.weight"] = torch.randn(
|
||||
vocab_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def create_vision_model_weights(
|
||||
vision_config: dict[str, Any],
|
||||
) -> dict[str, torch.Tensor]:
|
||||
"""Create synthetic weights for the vision model."""
|
||||
|
||||
weights = {}
|
||||
|
||||
hidden_size = vision_config["hidden_size"]
|
||||
intermediate_size = vision_config["intermediate_size"]
|
||||
num_layers = vision_config["num_hidden_layers"]
|
||||
|
||||
# Vision transformer layers
|
||||
for layer_idx in range(num_layers):
|
||||
layer_prefix = f"vision_model.model.layers.{layer_idx}"
|
||||
|
||||
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
|
||||
intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def create_shared_weights(
|
||||
text_config: dict[str, Any], vision_config: dict[str, Any]
|
||||
) -> dict[str, torch.Tensor]:
|
||||
"""Create weights for shared components (vision-language connector)"""
|
||||
|
||||
weights = {}
|
||||
|
||||
text_hidden_size = text_config["hidden_size"]
|
||||
projector_input_dim = vision_config["projector_input_dim"]
|
||||
|
||||
# Vision-language connector (projects vision features to text space)
|
||||
weights["multi_modal_projector.linear_1.weight"] = torch.randn(
|
||||
text_hidden_size, projector_input_dim, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def save_weights_to_safetensors(
|
||||
weights: dict[str, torch.Tensor], output_path: Path
|
||||
) -> None:
|
||||
"""Save weights to safetensors files and create index."""
|
||||
|
||||
# Determine how to shard the weights
|
||||
max_shard_size = 5 * 1024 * 1024 * 1024 # 5GB per shard
|
||||
|
||||
# Calculate sizes and create shards
|
||||
shards = []
|
||||
current_shard: dict[str, torch.Tensor] = {}
|
||||
current_size = 0
|
||||
|
||||
for name, tensor in weights.items():
|
||||
tensor_size = tensor.numel() * tensor.element_size()
|
||||
|
||||
if current_size + tensor_size > max_shard_size and current_shard:
|
||||
shards.append(current_shard)
|
||||
current_shard = {}
|
||||
current_size = 0
|
||||
|
||||
current_shard[name] = tensor
|
||||
current_size += tensor_size
|
||||
|
||||
if current_shard:
|
||||
shards.append(current_shard)
|
||||
|
||||
# Save shards and create index
|
||||
weight_map = {}
|
||||
|
||||
if len(shards) == 1:
|
||||
# Single file
|
||||
filename = "model.safetensors"
|
||||
save_file(shards[0], output_path / filename)
|
||||
weight_map = {name: filename for name in shards[0]}
|
||||
print(f"Saved weights to single file: {filename}")
|
||||
else:
|
||||
# Multiple shards
|
||||
for i, shard in enumerate(shards):
|
||||
filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
|
||||
save_file(shard, output_path / filename)
|
||||
for name in shard:
|
||||
weight_map[name] = filename
|
||||
print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
|
||||
|
||||
# Create index file
|
||||
index_data = {
|
||||
"metadata": {
|
||||
"total_size": sum(
|
||||
tensor.numel() * tensor.element_size() for tensor in weights.values()
|
||||
)
|
||||
},
|
||||
"weight_map": weight_map,
|
||||
}
|
||||
|
||||
index_path = output_path / "model.safetensors.index.json"
|
||||
with open(index_path, "w") as f:
|
||||
json.dump(index_data, f, indent=2)
|
||||
|
||||
print(f"Created index file: {index_path}")
|
||||
print(
|
||||
f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
|
||||
)
|
||||
|
||||
|
||||
def check_attention_spec_interleaved_rope(
|
||||
llm: LLM,
|
||||
num_attention_layers: int,
|
||||
num_ranks: int,
|
||||
rope_layers: list[int],
|
||||
):
|
||||
"""Check that the attention spec is correct."""
|
||||
assert isinstance(llm.llm_engine.model_executor, Executor)
|
||||
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
|
||||
for rank in range(num_ranks):
|
||||
kv_cache_specs = kv_cache_specs_per_rank[rank]
|
||||
assert len(kv_cache_specs.keys()) == num_attention_layers
|
||||
for i in range(num_attention_layers):
|
||||
if rope_layers[i] == 0:
|
||||
expected_spec = FullAttentionSpec
|
||||
else:
|
||||
expected_spec = ChunkedLocalAttentionSpec
|
||||
assert isinstance(
|
||||
kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
|
||||
expected_spec,
|
||||
)
|
||||
|
||||
|
||||
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
|
||||
"""Test the created reduced model with vLLM."""
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
|
||||
|
||||
if should_profile:
|
||||
llm.start_profile()
|
||||
outputs = llm.generate(PROMPTS, sampling_params)
|
||||
if should_profile:
|
||||
llm.stop_profile()
|
||||
|
||||
print("Test generation successful!")
|
||||
for output in outputs:
|
||||
print(f"Prompt: {output.prompt}")
|
||||
print(f"Output: {output.outputs[0].text}")
|
||||
print("-" * 40)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"original_model_name,text_layers,num_experts,vision_layers,",
|
||||
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
|
||||
)
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@pytest.mark.parametrize("tp,ep", [(2, True)])
|
||||
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
|
||||
def test_dummy_maverick(
|
||||
monkeypatch,
|
||||
original_model_name: str,
|
||||
text_layers: int,
|
||||
num_experts: int,
|
||||
vision_layers: int,
|
||||
enforce_eager: bool,
|
||||
tp: int,
|
||||
ep: bool,
|
||||
output_dir: str = "/tmp/reduced_maverick",
|
||||
force_recreate: bool = True,
|
||||
profile: bool = False,
|
||||
) -> None:
|
||||
# Disable multiprocessing allows us to access model executor from LLM engine
|
||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
|
||||
model_path = create_reduced_maverick_model(
|
||||
original_model_name=original_model_name,
|
||||
output_dir=output_dir,
|
||||
text_layers=text_layers,
|
||||
num_experts=num_experts,
|
||||
vision_layers=vision_layers,
|
||||
force_recreate=force_recreate,
|
||||
)
|
||||
|
||||
print(f"\nReduced model created successfully at: {model_path}")
|
||||
|
||||
rope_layers = get_rope_layers_config(model_path)
|
||||
|
||||
llm = LLM(
|
||||
model=model_path,
|
||||
trust_remote_code=True,
|
||||
max_model_len=512, # Small context for testing
|
||||
gpu_memory_utilization=0.3, # Conservative memory usage
|
||||
enforce_eager=enforce_eager,
|
||||
tensor_parallel_size=tp,
|
||||
enable_expert_parallel=ep,
|
||||
)
|
||||
|
||||
check_attention_spec_interleaved_rope(
|
||||
llm,
|
||||
text_layers,
|
||||
tp,
|
||||
rope_layers,
|
||||
)
|
||||
|
||||
print(f"\nTesting reduced model at {model_path}...")
|
||||
run_reduced_model(llm=llm, should_profile=profile)
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to create and test the reduced model."""
|
||||
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Create a reduced-layer Maverick model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
default="/tmp/reduced_maverick",
|
||||
help="Output directory for the reduced model",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--text-layers",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Number of text transformer layers",
|
||||
)
|
||||
parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
|
||||
parser.add_argument(
|
||||
"--vision-layers",
|
||||
type=int,
|
||||
default=2,
|
||||
help="Number of vision transformer layers",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force-recreate",
|
||||
action="store_true",
|
||||
help="Force recreation if output directory exists",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--test", action="store_true", help="Test the created model with vLLM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile", action="store_true", help="Profile the created model with vLLM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--test-original",
|
||||
action="store_true",
|
||||
help="Test the original model with vLLM",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--original-model",
|
||||
default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
help="Original model name to base the reduction on",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.test:
|
||||
test_dummy_maverick(
|
||||
original_model_name=args.original_model,
|
||||
output_dir=args.output_dir,
|
||||
text_layers=args.text_layers,
|
||||
num_experts=args.num_experts,
|
||||
vision_layers=args.vision_layers,
|
||||
force_recreate=args.force_recreate,
|
||||
tp=2,
|
||||
ep=True,
|
||||
enforce_eager=True,
|
||||
profile=args.profile,
|
||||
)
|
||||
|
||||
if args.test_original:
|
||||
run_maverick_serving(args.original_model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit(main())
|
||||
180
tests/models/multimodal/generation/test_multimodal_gguf.py
Normal file
180
tests/models/multimodal/generation/test_multimodal_gguf.py
Normal file
@@ -0,0 +1,180 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
from pytest import MarkDecorator
|
||||
from transformers import AutoModelForImageTextToText
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
class GGUFMMTestConfig(NamedTuple):
|
||||
original_model: str
|
||||
gguf_repo: str
|
||||
gguf_backbone: str
|
||||
gguf_mmproj: str
|
||||
prompt: list[str]
|
||||
image_names: list[str] # Store names, load PIL images at runtime
|
||||
max_model_len: int = 4096
|
||||
marks: list[MarkDecorator] = []
|
||||
mm_processor_kwargs: dict[str, Any] = {}
|
||||
|
||||
@property
|
||||
def gguf_model(self):
|
||||
hf_hub_download(self.gguf_repo, filename=self.gguf_mmproj)
|
||||
return hf_hub_download(self.gguf_repo, filename=self.gguf_backbone)
|
||||
|
||||
|
||||
# Common prompts aligned with test_common.py "gemma3" entry format
|
||||
_GEMMA3_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": (
|
||||
"<bos><start_of_turn>user\n"
|
||||
"<start_of_image>What's the content in the center of the image?"
|
||||
"<end_of_turn>\n<start_of_turn>model\n"
|
||||
),
|
||||
"cherry_blossom": (
|
||||
"<bos><start_of_turn>user\n"
|
||||
"<start_of_image>What is the season?"
|
||||
"<end_of_turn>\n<start_of_turn>model\n"
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
# Image asset names - load at runtime to avoid pickle issues with subprocess
|
||||
_GEMMA3_IMAGE_NAMES = ["stop_sign", "cherry_blossom"]
|
||||
|
||||
# Regular multimodal (no pan-and-scan) - uses QAT Q4_0 GGUF
|
||||
GEMMA3_CONFIG = GGUFMMTestConfig(
|
||||
original_model="google/gemma-3-4b-it",
|
||||
gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf",
|
||||
gguf_backbone="gemma-3-4b-it-q4_0.gguf",
|
||||
gguf_mmproj="mmproj-model-f16-4B.gguf",
|
||||
prompt=_GEMMA3_PROMPTS,
|
||||
image_names=_GEMMA3_IMAGE_NAMES,
|
||||
max_model_len=4096,
|
||||
marks=[pytest.mark.core_model],
|
||||
mm_processor_kwargs={},
|
||||
)
|
||||
|
||||
# Pan-and-scan multimodal - uses unquantized BF16 GGUF
|
||||
GEMMA3_CONFIG_PAN_AND_SCAN = GGUFMMTestConfig(
|
||||
original_model="google/gemma-3-4b-it",
|
||||
gguf_repo="unsloth/gemma-3-4b-it-GGUF",
|
||||
gguf_backbone="gemma-3-4b-it-BF16.gguf",
|
||||
gguf_mmproj="mmproj-BF16.gguf",
|
||||
prompt=_GEMMA3_PROMPTS,
|
||||
image_names=_GEMMA3_IMAGE_NAMES,
|
||||
max_model_len=4096,
|
||||
marks=[pytest.mark.core_model],
|
||||
mm_processor_kwargs={"do_pan_and_scan": True},
|
||||
)
|
||||
|
||||
MODELS_TO_TEST = [GEMMA3_CONFIG, GEMMA3_CONFIG_PAN_AND_SCAN]
|
||||
|
||||
|
||||
def run_multimodal_gguf_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
model: GGUFMMTestConfig,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
):
|
||||
# Load images at runtime (inside subprocess) to avoid pickle issues
|
||||
images = [ImageAsset(name).pil_image for name in model.image_names]
|
||||
size_factors = [0.25, 0.5, 1.0]
|
||||
inputs_per_image = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
)
|
||||
for image, prompt in zip(images, model.prompt)
|
||||
]
|
||||
|
||||
# NOTE: Run vLLM first to avoid CUDA init issues with multiprocessing fork.
|
||||
# Run GGUF model via vLLM.
|
||||
with (
|
||||
set_default_torch_num_threads(1),
|
||||
vllm_runner(
|
||||
model_name=model.gguf_model,
|
||||
enforce_eager=True,
|
||||
tokenizer_name=model.original_model,
|
||||
dtype=dtype,
|
||||
max_model_len=model.max_model_len,
|
||||
mm_processor_kwargs=model.mm_processor_kwargs,
|
||||
) as gguf_model,
|
||||
):
|
||||
gguf_outputs_per_case = [
|
||||
gguf_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
# Then run HfRunner for HuggingFace baseline comparison.
|
||||
with hf_runner(
|
||||
model.original_model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
) as hf_model:
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
for hf_outputs, gguf_outputs in zip(hf_outputs_per_case, gguf_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=gguf_outputs,
|
||||
name_0="hf",
|
||||
name_1="gguf",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param(test_config, marks=test_config.marks)
|
||||
for test_config in MODELS_TO_TEST
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_gemma3_mm_gguf(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
model: GGUFMMTestConfig,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
run_multimodal_gguf_test(
|
||||
hf_runner, vllm_runner, model, dtype, max_tokens, num_logprobs
|
||||
)
|
||||
317
tests/models/multimodal/generation/test_phi4mm.py
Normal file
317
tests/models/multimodal/generation/test_phi4mm.py
Normal file
@@ -0,0 +1,317 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from collections.abc import Sequence
|
||||
|
||||
import librosa
|
||||
import pytest
|
||||
import regex as re
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import convert_image_mode, rescale_image_size
|
||||
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
}
|
||||
)
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = (
|
||||
"<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
)
|
||||
|
||||
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
speech_question = os.path.join(
|
||||
model_path, "examples", "what_is_shown_in_this_image.wav"
|
||||
)
|
||||
models = [model_path]
|
||||
|
||||
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
|
||||
):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
|
||||
assert output_str_without_image[0] == " "
|
||||
output_str_without_image = output_str_without_image[1:]
|
||||
|
||||
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
hf_output_ids = tokenizer.encode(output_str_without_image)
|
||||
assert hf_output_ids[0] == 1
|
||||
hf_output_ids = hf_output_ids[1:]
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
target_dtype = "half"
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
|
||||
model: str,
|
||||
*,
|
||||
max_model_len: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
mm_limit: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: str | None = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
# This error occurs inside `get_peft_model`
|
||||
# FIXME: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/75
|
||||
pytest.skip("HF impl is not compatible with current transformers")
|
||||
|
||||
hf_model_kwargs = {"_attn_implementation": "sdpa"}
|
||||
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
|
||||
def patch_hf_processor(
|
||||
*args, text="", images=None, audio=None, sampling_rate=None, **kwargs
|
||||
):
|
||||
audios = None
|
||||
if audio is not None and sampling_rate is not None:
|
||||
audios = [(audio, sampling_rate)]
|
||||
return hf_processor(
|
||||
*args, text=text, images=images, audios=audios, **kwargs
|
||||
)
|
||||
|
||||
hf_model.processor = patch_hf_processor
|
||||
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id,
|
||||
num_logits_to_keep=0,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
)
|
||||
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_image,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
# [],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [25600])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_multi_images_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
(
|
||||
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
None,
|
||||
),
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_vision_speech_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# use the example speech question so that the model outputs are reasonable
|
||||
audio = librosa.load(speech_question, sr=None)
|
||||
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
|
||||
inputs_vision_speech = [
|
||||
(
|
||||
["<|user|><|image_1|><|audio_1|><|end|><|assistant|>"],
|
||||
[image],
|
||||
[audio],
|
||||
),
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_vision_speech,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
211
tests/models/multimodal/generation/test_pixtral.py
Normal file
211
tests/models/multimodal/generation/test_pixtral.py
Normal file
@@ -0,0 +1,211 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
from dataclasses import asdict
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import pytest
|
||||
from mistral_common.multimodal import download_image
|
||||
from mistral_common.protocol.instruct.chunk import ImageURLChunk
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import SamplingParams, TextPrompt, TokensPrompt
|
||||
from vllm.logprobs import Logprob, SampleLogprobs
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import VLLM_PATH, large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import StrPath
|
||||
|
||||
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
|
||||
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
|
||||
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
|
||||
|
||||
IMG_URLS = [
|
||||
"237-400x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
|
||||
"231-200x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
|
||||
"27-500x500.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
|
||||
"17-150x600.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
|
||||
]
|
||||
PROMPT = "Describe each image in one short sentence."
|
||||
|
||||
|
||||
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": PROMPT,
|
||||
}
|
||||
]
|
||||
+ [{"type": "image_url", "image_url": {"url": url}} for url in urls],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"content": PROMPT,
|
||||
},
|
||||
*({"type": "image", "image": download_image(url)} for url in urls),
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
|
||||
msg = _create_msg_format(urls)
|
||||
|
||||
tokenizer = MistralTokenizer.from_model("pixtral")
|
||||
|
||||
request = ChatCompletionRequest(messages=msg) # type: ignore[type-var]
|
||||
tokenized = tokenizer.encode_chat_completion(request)
|
||||
|
||||
engine_inputs = TokensPrompt(prompt_token_ids=tokenized.tokens)
|
||||
|
||||
images = []
|
||||
for chunk in request.messages[0].content:
|
||||
if isinstance(chunk, ImageURLChunk):
|
||||
images.append(image_from_chunk(chunk))
|
||||
|
||||
mm_data = MultiModalDataBuiltins(image=images)
|
||||
engine_inputs["multi_modal_data"] = mm_data
|
||||
|
||||
return engine_inputs
|
||||
|
||||
|
||||
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
|
||||
msg = _create_msg_format_hf(urls)
|
||||
|
||||
tokenizer = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
|
||||
prompt = tokenizer.apply_chat_template(msg)
|
||||
|
||||
images = []
|
||||
for chunk in msg[0]["content"]:
|
||||
if chunk["type"] == "image":
|
||||
images.append(chunk["image"])
|
||||
|
||||
mm_data = MultiModalDataBuiltins(image=images)
|
||||
engine_inputs = TextPrompt(prompt=prompt, multi_modal_data=mm_data)
|
||||
|
||||
return engine_inputs
|
||||
|
||||
|
||||
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||
LIMIT_MM_PER_PROMPT = dict(image=4)
|
||||
|
||||
MAX_MODEL_LEN = [8192, 65536]
|
||||
|
||||
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
|
||||
assert FIXTURES_PATH.exists()
|
||||
|
||||
FIXTURE_LOGPROBS_CHAT = {
|
||||
PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
|
||||
MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
|
||||
}
|
||||
|
||||
OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]]
|
||||
|
||||
|
||||
# For the test author to store golden output in JSON
|
||||
def _dump_outputs_w_logprobs(
|
||||
outputs: OutputsLogprobs,
|
||||
filename: "StrPath",
|
||||
) -> None:
|
||||
json_data = [
|
||||
(
|
||||
tokens,
|
||||
text,
|
||||
[
|
||||
{k: asdict(v) for k, v in token_logprobs.items()}
|
||||
for token_logprobs in (logprobs or [])
|
||||
],
|
||||
)
|
||||
for tokens, text, logprobs in outputs
|
||||
]
|
||||
|
||||
with open(filename, "w") as f:
|
||||
json.dump(json_data, f)
|
||||
|
||||
|
||||
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
with open(filename, "rb") as f:
|
||||
json_data = json.load(f)
|
||||
|
||||
return [
|
||||
(
|
||||
tokens,
|
||||
text,
|
||||
[
|
||||
{int(k): Logprob(**v) for k, v in token_logprobs.items()}
|
||||
for token_logprobs in logprobs
|
||||
],
|
||||
)
|
||||
for tokens, text, logprobs in json_data
|
||||
]
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_chat(
|
||||
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
|
||||
) -> None:
|
||||
if (
|
||||
model == MISTRAL_SMALL_3_1_ID
|
||||
and max_model_len == 65536
|
||||
and current_platform.is_rocm()
|
||||
):
|
||||
pytest.skip(
|
||||
"OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
|
||||
)
|
||||
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
outputs = []
|
||||
|
||||
urls_all = [local_asset_server.url_for(u) for u in IMG_URLS]
|
||||
msgs = [
|
||||
_create_msg_format(urls_all[:1]),
|
||||
_create_msg_format(urls_all[:2]),
|
||||
_create_msg_format(urls_all),
|
||||
]
|
||||
for msg in msgs:
|
||||
output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
|
||||
|
||||
outputs.extend(output)
|
||||
|
||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||
# Remove last `None` prompt_logprobs to compare with fixture
|
||||
for i in range(len(logprobs)):
|
||||
assert logprobs[i][-1] is None
|
||||
logprobs[i] = logprobs[i][:-1]
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output",
|
||||
)
|
||||
148
tests/models/multimodal/generation/test_qwen2_5_vl.py
Normal file
148
tests/models/multimodal/generation/test_qwen2_5_vl.py
Normal file
@@ -0,0 +1,148 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
|
||||
from ....conftest import VIDEO_ASSETS
|
||||
|
||||
models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
|
||||
target_dtype = "bfloat16"
|
||||
|
||||
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
|
||||
|
||||
def qwen2_5_vl_chat_template(*query):
|
||||
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
|
||||
{
|
||||
"baby_reading": qwen2_5_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
def test_qwen2_5_vl_evs_functionality(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
use_bytecode_hook: bool,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
"""Test EVS (Efficient Video Sampling) functionality with different
|
||||
pruning rates.
|
||||
"""
|
||||
# Set the environment variable for this test
|
||||
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
|
||||
|
||||
# Sample frames from video assets
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
prompts = [VIDEO_PROMPTS[0]]
|
||||
videos = [sampled_vids[0]]
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
video_pruning_rate=video_pruning_rate,
|
||||
) as vllm_model:
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
|
||||
|
||||
# Basic validation that we got a response
|
||||
assert len(outputs) == 1
|
||||
output_ids, output_text = outputs[0]
|
||||
|
||||
# Ensure we got some output
|
||||
assert len(output_ids) > 0
|
||||
assert len(output_text) > 0
|
||||
|
||||
# Ensure the output is a string
|
||||
assert isinstance(output_text, str)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
def test_qwen2_5_vl_evs_batched_videos(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
use_bytecode_hook: bool,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
"""Test EVS functionality with batched videos.
|
||||
|
||||
This test validates that:
|
||||
1. The model handles batched video inputs correctly with EVS
|
||||
2. Both pruning configurations work with multiple videos
|
||||
3. The model doesn't crash when processing multiple videos simultaneously
|
||||
"""
|
||||
# Set the environment variable for this test
|
||||
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
|
||||
# Sample frames from video assets
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
# Test batched videos
|
||||
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
|
||||
videos = [sampled_vids[0], sampled_vids[0]] # Use same video twice for testing
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 2},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate,
|
||||
) as vllm_model:
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
|
||||
|
||||
# Basic validation that we got responses for both videos
|
||||
assert len(outputs) == 2
|
||||
|
||||
for output_ids, output_text in outputs:
|
||||
# Ensure we got some output for each video
|
||||
assert len(output_ids) > 0
|
||||
assert len(output_text) > 0
|
||||
|
||||
# Ensure the output is a string
|
||||
assert isinstance(output_text, str)
|
||||
473
tests/models/multimodal/generation/test_qwen2_vl.py
Normal file
473
tests/models/multimodal/generation/test_qwen2_vl.py
Normal file
@@ -0,0 +1,473 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any, TypedDict
|
||||
|
||||
import numpy.typing as npt
|
||||
import pytest
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
|
||||
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
VIDEO_ASSETS,
|
||||
PromptImageInput,
|
||||
PromptVideoInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def enable_pickle(monkeypatch):
|
||||
"""`LLM.apply_model` requires pickling a function."""
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
|
||||
models = ["Qwen/Qwen2-VL-2B-Instruct"]
|
||||
target_dtype = "half"
|
||||
|
||||
IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
MODEL_HIDDEN_SIZE = 1536
|
||||
|
||||
|
||||
def qwen2_vl_chat_template(*query):
|
||||
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the biggest text's content in this image?",
|
||||
),
|
||||
"cherry_blossom": qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the season shown in this image? ",
|
||||
"Reply with a short sentence (no more than 20 words)",
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
|
||||
{
|
||||
"baby_reading": qwen2_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
IMAGE_PLACEHOLDER,
|
||||
"Describe these two images separately. ",
|
||||
"For each image, reply with a short sentence ",
|
||||
"(no more than 10 words).",
|
||||
)
|
||||
|
||||
|
||||
class Qwen2VLPromptImageEmbeddingInput(TypedDict):
|
||||
image_embeds: torch.Tensor
|
||||
image_grid_thw: torch.Tensor
|
||||
|
||||
|
||||
class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
|
||||
video_embeds: torch.Tensor
|
||||
video_grid_thw: torch.Tensor
|
||||
|
||||
|
||||
def batch_make_image_embeddings(
|
||||
image_batches: list[Image.Image | list[Image.Image]],
|
||||
processor,
|
||||
llm: VllmRunner,
|
||||
) -> list[Qwen2VLPromptImageEmbeddingInput]:
|
||||
"""batched image embeddings for Qwen2-VL
|
||||
|
||||
This will infer all images' embeddings in a single batch,
|
||||
and split the result according to input batches.
|
||||
|
||||
image_batches:
|
||||
- Single-image batches: `list[Image.Image]`
|
||||
- Multiple-image batches: `list[list[Image.Image]]]`
|
||||
|
||||
returns: `list[Qwen2VLPromptImageEmbeddingInput]`
|
||||
"""
|
||||
|
||||
image_batches_: list[Any] = image_batches[:]
|
||||
|
||||
# convert single-image batches to multiple-image batches
|
||||
for idx in range(len(image_batches_)):
|
||||
if not isinstance(image_batches_[idx], list):
|
||||
image_batches_[idx] = [image_batches_[idx]]
|
||||
|
||||
assert isinstance(image_batches_[idx], list)
|
||||
|
||||
# append all images into a list (as a batch)
|
||||
images: list[Image.Image] = []
|
||||
for image_batch in image_batches_:
|
||||
images += image_batch
|
||||
|
||||
# image to pixel values
|
||||
image_processor = processor.image_processor
|
||||
|
||||
preprocess_result = image_processor.preprocess(
|
||||
images=images, return_tensors="pt"
|
||||
).data
|
||||
pixel_values = preprocess_result["pixel_values"]
|
||||
image_grid_thw = preprocess_result["image_grid_thw"]
|
||||
|
||||
# pixel values to embeddings & grid_thws
|
||||
def get_image_embeds(model):
|
||||
with torch.no_grad():
|
||||
visual = model.visual
|
||||
|
||||
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
|
||||
return visual(pixel_values_on_device, grid_thw=image_grid_thw).cpu()
|
||||
|
||||
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||
|
||||
# split into original batches
|
||||
result: list[Qwen2VLPromptImageEmbeddingInput] = []
|
||||
image_counter = 0
|
||||
embed_counter = 0
|
||||
for image_batch in image_batches_:
|
||||
cur_batch_image_count = len(image_batch)
|
||||
merge_size = image_processor.merge_size
|
||||
cur_batch_embed_len = sum(
|
||||
grid_thw.prod(-1) // merge_size // merge_size
|
||||
for grid_thw in image_grid_thw[
|
||||
image_counter : image_counter + cur_batch_image_count
|
||||
]
|
||||
)
|
||||
|
||||
result.append(
|
||||
{
|
||||
"image_embeds": image_embeds[
|
||||
embed_counter : embed_counter + cur_batch_embed_len
|
||||
],
|
||||
"image_grid_thw": image_grid_thw[
|
||||
image_counter : image_counter + cur_batch_image_count
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
embed_counter += cur_batch_embed_len
|
||||
image_counter += cur_batch_image_count
|
||||
|
||||
# ensure we don't lose any images or embeddings
|
||||
assert embed_counter == image_embeds.size(0)
|
||||
assert image_counter == image_grid_thw.size(0)
|
||||
assert len(image_batches) == len(result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def batch_make_video_embeddings(
|
||||
video_batches: PromptVideoInput, processor, llm: VllmRunner
|
||||
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
|
||||
"""batched video embeddings for Qwen2-VL
|
||||
|
||||
A NDArray represents a single video's all frames.
|
||||
|
||||
This will infer all videos' embeddings in a single batch,
|
||||
and split the result according to input batches.
|
||||
|
||||
video_batches:
|
||||
- Single-video batches: `list[NDArray]`
|
||||
- Multiple-video batches: `list[list[NDArray]]`
|
||||
"""
|
||||
|
||||
video_batches_: list[Any] = video_batches[:]
|
||||
|
||||
for idx in range(len(video_batches_)):
|
||||
if not isinstance(video_batches_[idx], list):
|
||||
single_video_batch: list[npt.NDArray] = [video_batches_[idx]]
|
||||
video_batches_[idx] = single_video_batch
|
||||
|
||||
assert isinstance(video_batches_[idx], list)
|
||||
|
||||
# append all videos into a list (as a batch)
|
||||
videos: list[npt.NDArray] = []
|
||||
for video_batch in video_batches_:
|
||||
videos += video_batch
|
||||
|
||||
# video to pixel values
|
||||
image_processor = processor.image_processor
|
||||
|
||||
preprocess_result = image_processor.preprocess(
|
||||
images=None, videos=videos, return_tensors="pt"
|
||||
).data
|
||||
pixel_values = preprocess_result["pixel_values_videos"]
|
||||
video_grid_thw = preprocess_result["video_grid_thw"]
|
||||
|
||||
# pixel values to embeddings & grid_thws
|
||||
def get_image_embeds(model):
|
||||
with torch.no_grad():
|
||||
visual = model.visual
|
||||
|
||||
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
|
||||
return visual(pixel_values_on_device, grid_thw=video_grid_thw).cpu()
|
||||
|
||||
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||
|
||||
# split into original batches
|
||||
result: list[Qwen2VLPromptVideoEmbeddingInput] = []
|
||||
video_counter = 0
|
||||
embed_counter = 0
|
||||
for video_batch in video_batches_:
|
||||
cur_batch_video_count = len(video_batch)
|
||||
merge_size = image_processor.merge_size
|
||||
cur_batch_embed_len = sum(
|
||||
grid_thw.prod(-1) // merge_size // merge_size
|
||||
for grid_thw in video_grid_thw[
|
||||
video_counter : video_counter + cur_batch_video_count
|
||||
]
|
||||
)
|
||||
|
||||
result.append(
|
||||
{
|
||||
"video_embeds": video_embeds[
|
||||
embed_counter : embed_counter + cur_batch_embed_len
|
||||
],
|
||||
"video_grid_thw": video_grid_thw[
|
||||
video_counter : video_counter + cur_batch_video_count
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
embed_counter += cur_batch_embed_len
|
||||
video_counter += cur_batch_video_count
|
||||
|
||||
# ensure we don't lose any videos or embeddings
|
||||
assert embed_counter == video_embeds.size(0)
|
||||
assert video_counter == video_grid_thw.size(0)
|
||||
assert len(video_batches) == len(result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def run_embedding_input_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
mm_limit: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: str | None = None,
|
||||
):
|
||||
"""Inference result should be the same between
|
||||
original image/video input and image/video embeddings input.
|
||||
"""
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model)
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=3,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
default_torch_num_threads=1,
|
||||
enable_mm_embeds=True,
|
||||
) as vllm_model:
|
||||
outputs_per_case_for_original_input = [
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images or None,
|
||||
videos=videos or None,
|
||||
)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
outputs_per_case_for_embeddings_input = [
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=batch_make_image_embeddings(images, processor, vllm_model)
|
||||
if images
|
||||
else None,
|
||||
videos=batch_make_video_embeddings(videos, processor, vllm_model)
|
||||
if videos
|
||||
else None,
|
||||
)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
for outputs_for_original_input, outputs_for_embeddings_input in zip(
|
||||
outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
|
||||
):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=outputs_for_original_input,
|
||||
outputs_1_lst=outputs_for_embeddings_input,
|
||||
name_0="original_input",
|
||||
name_1="embeddings_input",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# Single-scale
|
||||
[0.5],
|
||||
# Single-scale, batched
|
||||
[0.5, 0.5],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 0.5],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_image_embeddings_input(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
[],
|
||||
)
|
||||
for image, prompt in zip(images, IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
run_embedding_input_test(
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
[],
|
||||
# Single-scale
|
||||
[0.5],
|
||||
# Single-scale, batched
|
||||
[0.5, 0.5],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 0.5],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_multiple_image_embeddings_input(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
|
||||
(
|
||||
[MULTIIMAGE_PROMPT for _ in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
[],
|
||||
)
|
||||
]
|
||||
|
||||
run_embedding_input_test(
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# Single-scale
|
||||
[0.5],
|
||||
# Single-scale, batched
|
||||
[0.5, 0.5],
|
||||
# Multi-scale
|
||||
[0.25, 0.25, 0.5],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_video_embeddings_input(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
num_frames = 4
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[],
|
||||
[rescale_video_size(video, factor) for factor in size_factors],
|
||||
)
|
||||
for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
|
||||
]
|
||||
|
||||
run_embedding_input_test(
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
185
tests/models/multimodal/generation/test_ultravox.py
Normal file
185
tests/models/multimodal/generation/test_ultravox.py
Normal file
@@ -0,0 +1,185 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from ....conftest import AUDIO_ASSETS, AudioTestAssets, VllmRunner
|
||||
from ....utils import RemoteOpenAIServer
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
|
||||
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
|
||||
{
|
||||
"mary_had_lamb": "Transcribe this into English.",
|
||||
"winning_call": "What is happening in this audio clip?",
|
||||
}
|
||||
)
|
||||
|
||||
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
|
||||
|
||||
AudioTuple = tuple[np.ndarray, int]
|
||||
|
||||
VLLM_PLACEHOLDER = "<|audio|>"
|
||||
HF_PLACEHOLDER = "<|audio|>"
|
||||
|
||||
CHUNKED_PREFILL_KWARGS = {
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_seqs": 2,
|
||||
# Use a very small limit to exercise chunked prefill.
|
||||
"max_num_batched_tokens": 16,
|
||||
}
|
||||
|
||||
|
||||
def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
|
||||
"""Convert kwargs to CLI args."""
|
||||
args = []
|
||||
for key, value in params_kwargs.items():
|
||||
if isinstance(value, bool):
|
||||
if value:
|
||||
args.append(f"--{key.replace('_', '-')}")
|
||||
else:
|
||||
args.append(f"--{key.replace('_', '-')}={value}")
|
||||
return args
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
]
|
||||
)
|
||||
def server(request, audio_assets: AudioTestAssets):
|
||||
args = [
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"4096",
|
||||
"--enforce-eager",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"audio": len(audio_assets)}),
|
||||
"--trust-remote-code",
|
||||
] + params_kwargs_to_cli_args(request.param)
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
|
||||
) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
def _get_prompt(audio_count, question, placeholder):
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
placeholder = f"{placeholder}\n" * audio_count
|
||||
|
||||
return tokenizer.apply_chat_template(
|
||||
[{"role": "user", "content": f"{placeholder}{question}"}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
|
||||
def run_multi_audio_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
prompts_and_audios: list[tuple[str, list[AudioTuple]]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
**kwargs,
|
||||
):
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={
|
||||
"audio": max((len(audio) for _, audio in prompts_and_audios))
|
||||
},
|
||||
**kwargs,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
[prompt for prompt, _ in prompts_and_audios],
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=[audios for _, audios in prompts_and_audios],
|
||||
)
|
||||
|
||||
# The HuggingFace model doesn't support multiple audios yet, so
|
||||
# just assert that some tokens were generated.
|
||||
assert all(tokens for tokens, *_ in vllm_outputs)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize(
|
||||
"vllm_kwargs",
|
||||
[
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
],
|
||||
)
|
||||
def test_models_with_multiple_audios(
|
||||
vllm_runner,
|
||||
audio_assets: AudioTestAssets,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
vllm_kwargs: dict,
|
||||
) -> None:
|
||||
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
|
||||
run_multi_audio_test(
|
||||
vllm_runner,
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
|
||||
MODEL_NAME,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
**vllm_kwargs,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
"""Exercises online serving with/without chunked prefill enabled."""
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*[
|
||||
{"type": "audio_url", "audio_url": {"url": audio.url}}
|
||||
for audio in audio_assets
|
||||
],
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"What's happening in these {len(audio_assets)} audio clips?", # noqa: E501
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME, messages=messages, max_tokens=10
|
||||
)
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
@@ -0,0 +1,435 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Consolidated test for ViT attention backend functionality across multiple models.
|
||||
|
||||
This test validates that each multimodal model can successfully generate outputs
|
||||
using different ViT attention backends. Tests are parametrized by model and backend.
|
||||
"""
|
||||
|
||||
from dataclasses import asdict
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import create_new_process_for_each_test
|
||||
from ...utils import dummy_hf_overrides
|
||||
|
||||
# Dots.OCR prompt from official repository
|
||||
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
|
||||
# ruff: noqa: E501
|
||||
DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
|
||||
|
||||
1. Bbox format: [x1, y1, x2, y2]
|
||||
|
||||
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
|
||||
|
||||
3. Text Extraction & Formatting Rules:
|
||||
- Picture: For the 'Picture' category, the text field should be omitted.
|
||||
- Formula: Format its text as LaTeX.
|
||||
- Table: Format its text as HTML.
|
||||
- All Others (Text, Title, etc.): Format their text as Markdown.
|
||||
|
||||
4. Constraints:
|
||||
- The output text must be the original text from the image, with no translation.
|
||||
- All layout elements must be sorted according to human reading order.
|
||||
|
||||
5. Final Output: The entire output must be a single JSON object.
|
||||
"""
|
||||
|
||||
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
|
||||
|
||||
# Model configurations
|
||||
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
|
||||
"dots_ocr": {
|
||||
"model_name": "rednote-hilab/dots.ocr",
|
||||
"interface": "llm_chat",
|
||||
"max_model_len": 32768,
|
||||
"max_num_seqs": 1,
|
||||
"limit_mm_per_prompt": {"image": 1},
|
||||
"sampling_params": {
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 16384,
|
||||
"top_p": 0.9,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"use_specific_image": "stop_sign",
|
||||
"prompt_builder": "build_dots_ocr_prompt",
|
||||
"output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
|
||||
},
|
||||
"ernie45_vl": {
|
||||
"model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 16384,
|
||||
"max_num_seqs": 2,
|
||||
"sampling_params": {
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 256,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"glm4_1v": {
|
||||
"model_name": "zai-org/GLM-4.1V-9B-Thinking",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 32768,
|
||||
"max_num_seqs": 2,
|
||||
"sampling_params": {
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 256,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"keye_vl": {
|
||||
"model_name": "Kwai-Keye/Keye-VL-8B-Preview",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 8192,
|
||||
"max_num_seqs": 5,
|
||||
"sampling_params": {
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 256,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"supported_backends": {
|
||||
AttentionBackendEnum.FLASH_ATTN,
|
||||
AttentionBackendEnum.ROCM_AITER_FA,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"ovis2_5": {
|
||||
"model_name": "AIDC-AI/Ovis2.5-2B",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 8192,
|
||||
"max_num_seqs": 2,
|
||||
"sampling_params": {
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 256,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"prompt_builder": "build_ovis_prompt",
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"qwen2_5_vl": {
|
||||
"model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
"interface": "vllm_runner",
|
||||
"media_type": "video",
|
||||
"max_model_len": 4000,
|
||||
"max_num_seqs": 1,
|
||||
"limit_mm_per_prompt": {"video": 1},
|
||||
"sampling_params": {
|
||||
"max_tokens": 128,
|
||||
},
|
||||
"runner_kwargs": {
|
||||
"runner": "generate",
|
||||
"dtype": "bfloat16",
|
||||
},
|
||||
"video_params": {
|
||||
"num_frames": 16,
|
||||
"pruning_rates": [0.0, 0.75],
|
||||
},
|
||||
},
|
||||
"qwen2_5_omni": {
|
||||
"model_name": "Qwen/Qwen2.5-Omni-3B",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 32768,
|
||||
"max_num_seqs": 2,
|
||||
"limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
|
||||
"sampling_params": {
|
||||
"temperature": 0.6,
|
||||
"top_p": 0.95,
|
||||
"top_k": 20,
|
||||
"max_tokens": 16384,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"qwen3_omni": {
|
||||
"model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 32768,
|
||||
"max_num_seqs": 2,
|
||||
"limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
|
||||
"sampling_params": {
|
||||
"temperature": 0.6,
|
||||
"top_p": 0.95,
|
||||
"top_k": 20,
|
||||
"max_tokens": 16384,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# Prompt builder functions
|
||||
def build_dots_ocr_prompt(images, config):
|
||||
"""Build Dots.OCR specific prompt with OCR instructions."""
|
||||
# Use only stop_sign image for Dots.OCR
|
||||
image = images[0] # Already filtered to stop_sign
|
||||
|
||||
image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
|
||||
|
||||
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
return messages
|
||||
|
||||
|
||||
def build_processor_prompt(images, config):
|
||||
"""Build prompt using AutoProcessor.apply_chat_template()."""
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
config["model_name"], trust_remote_code=True
|
||||
)
|
||||
|
||||
image_urls = [
|
||||
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
|
||||
]
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": config["question"]},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
return processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
|
||||
def build_ovis_prompt(images, config):
|
||||
"""Build Ovis2.5 specific prompt with custom format."""
|
||||
image_urls = [
|
||||
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
|
||||
]
|
||||
|
||||
placeholders = "\n".join(
|
||||
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
|
||||
)
|
||||
|
||||
return (
|
||||
f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
|
||||
def build_qwen2_5_video_prompt():
|
||||
"""Build Qwen2.5-VL video prompt with EVS placeholder."""
|
||||
return (
|
||||
f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||
f"<|im_start|>user\n{VIDEO_PLACEHOLDER}"
|
||||
"Describe this video with a short sentence (no more than 20 words)"
|
||||
"<|im_end|><|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
|
||||
# Handler functions
|
||||
def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
|
||||
"""Standard LLM.generate() interface handler."""
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
# Build prompt
|
||||
if config.get("use_processor"):
|
||||
prompt = build_processor_prompt(images, config)
|
||||
else:
|
||||
prompt_builder_name = config.get("prompt_builder", "build_ovis_prompt")
|
||||
prompt_builder = globals()[prompt_builder_name]
|
||||
prompt = prompt_builder(images, config)
|
||||
|
||||
# Determine limit_mm_per_prompt
|
||||
limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})
|
||||
|
||||
# Create engine
|
||||
engine_args = EngineArgs(
|
||||
model=config["model_name"],
|
||||
trust_remote_code=True,
|
||||
max_model_len=config["max_model_len"],
|
||||
max_num_seqs=config["max_num_seqs"],
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
mm_encoder_attn_backend=mm_encoder_attn_backend,
|
||||
hf_overrides=dummy_hf_overrides,
|
||||
load_format="dummy",
|
||||
)
|
||||
|
||||
engine_dict = asdict(engine_args) | {"seed": 42}
|
||||
llm = LLM(**engine_dict)
|
||||
|
||||
# Generate
|
||||
sampling_params = SamplingParams(**config["sampling_params"])
|
||||
outputs = llm.generate(
|
||||
{
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {"image": images},
|
||||
},
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
# Validate
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
validator = config.get("output_validator", lambda x: len(x) > 10)
|
||||
assert validator(generated_text), (
|
||||
f"Validation failed for {config['model_name']}: {generated_text}"
|
||||
)
|
||||
|
||||
|
||||
def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
|
||||
"""LLM.chat() interface handler for Dots.OCR."""
|
||||
# Filter to stop_sign image only
|
||||
stop_sign_image = [
|
||||
asset.pil_image for asset in image_assets if asset.name == "stop_sign"
|
||||
][0]
|
||||
|
||||
# Build messages
|
||||
messages = build_dots_ocr_prompt([stop_sign_image], config)
|
||||
|
||||
# Create engine
|
||||
engine_args = EngineArgs(
|
||||
model=config["model_name"],
|
||||
trust_remote_code=True,
|
||||
max_model_len=config["max_model_len"],
|
||||
max_num_seqs=config["max_num_seqs"],
|
||||
limit_mm_per_prompt=config["limit_mm_per_prompt"],
|
||||
mm_encoder_attn_backend=mm_encoder_attn_backend,
|
||||
hf_overrides=dummy_hf_overrides,
|
||||
load_format="dummy",
|
||||
)
|
||||
|
||||
engine_dict = asdict(engine_args) | {"seed": 42}
|
||||
llm = LLM(**engine_dict)
|
||||
|
||||
# Generate using chat
|
||||
sampling_params = SamplingParams(**config["sampling_params"])
|
||||
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
|
||||
|
||||
# Validate
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
validator = config.get("output_validator", lambda x: len(x) > 10)
|
||||
assert validator(generated_text), (
|
||||
f"Validation failed for {config['model_name']}: {generated_text}"
|
||||
)
|
||||
|
||||
|
||||
def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
|
||||
"""Video test with EVS (Efficient Video Sampling) handler."""
|
||||
for pruning_rate in config["video_params"]["pruning_rates"]:
|
||||
num_frames = config["video_params"]["num_frames"]
|
||||
|
||||
# Sample frames from video
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
# Build prompt and prepare video
|
||||
prompt = build_qwen2_5_video_prompt()
|
||||
prompts = [prompt]
|
||||
videos = [sampled_vids[0]]
|
||||
|
||||
# Run with vllm_runner context manager
|
||||
with vllm_runner(
|
||||
config["model_name"],
|
||||
max_model_len=config["max_model_len"],
|
||||
max_num_seqs=config["max_num_seqs"],
|
||||
limit_mm_per_prompt=config["limit_mm_per_prompt"],
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=pruning_rate,
|
||||
mm_encoder_attn_backend=mm_encoder_attn_backend,
|
||||
hf_overrides=dummy_hf_overrides,
|
||||
load_format="dummy",
|
||||
**config["runner_kwargs"],
|
||||
) as vllm_model:
|
||||
outputs = vllm_model.generate_greedy(
|
||||
prompts,
|
||||
config["sampling_params"]["max_tokens"],
|
||||
videos=videos,
|
||||
)
|
||||
|
||||
# Validate output
|
||||
assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
|
||||
output_ids, output_text = outputs[0]
|
||||
assert len(output_ids) > 0, "Generated no output IDs"
|
||||
assert len(output_text) > 0, "Generated empty text"
|
||||
assert isinstance(output_text, str), (
|
||||
f"Output is not string: {type(output_text)}"
|
||||
)
|
||||
|
||||
|
||||
# Main test function
|
||||
@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
|
||||
@pytest.mark.parametrize(
|
||||
"mm_encoder_attn_backend",
|
||||
[None] + current_platform.get_supported_vit_attn_backends(),
|
||||
)
|
||||
@pytest.mark.skip(reason="Broken test due to memory segmentation fault")
|
||||
@create_new_process_for_each_test()
|
||||
def test_vit_backend_functionality(
|
||||
model_key: str,
|
||||
mm_encoder_attn_backend: AttentionBackendEnum | None,
|
||||
image_assets,
|
||||
video_assets,
|
||||
vllm_runner,
|
||||
request,
|
||||
):
|
||||
"""Test ViT attention backend functionality for multimodal models.
|
||||
|
||||
This test validates that each model can successfully generate outputs
|
||||
using different ViT attention backends. The test:
|
||||
1. Filters unsupported backends per model
|
||||
2. Applies appropriate GPU marks
|
||||
3. Routes to the correct test handler based on interface
|
||||
4. Validates output meets minimum requirements
|
||||
"""
|
||||
config = MODEL_CONFIGS[model_key]
|
||||
|
||||
# Step 1: Backend filtering
|
||||
if (
|
||||
"supported_backends" in config
|
||||
and mm_encoder_attn_backend is not None
|
||||
and mm_encoder_attn_backend not in config["supported_backends"]
|
||||
):
|
||||
pytest.skip(
|
||||
f"{model_key} does not support {mm_encoder_attn_backend} backend now."
|
||||
)
|
||||
|
||||
# Step 2: Apply GPU marks dynamically
|
||||
if "gpu_marks" in config:
|
||||
for mark in config["gpu_marks"]:
|
||||
request.applymarker(mark)
|
||||
|
||||
# Step 3: Route to appropriate handler
|
||||
if config.get("media_type") == "video":
|
||||
run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
|
||||
elif config["interface"] == "llm_chat":
|
||||
run_llm_chat_test(config, mm_encoder_attn_backend, image_assets)
|
||||
elif config["interface"] == "llm_generate":
|
||||
run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
|
||||
else:
|
||||
raise ValueError(f"Unknown interface: {config['interface']}")
|
||||
114
tests/models/multimodal/generation/test_voxtral.py
Normal file
114
tests/models/multimodal/generation/test_voxtral.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
|
||||
from mistral_common.protocol.instruct.messages import UserMessage
|
||||
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
|
||||
from ....conftest import AudioTestAssets
|
||||
from ....utils import RemoteOpenAIServer
|
||||
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
|
||||
|
||||
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode",
|
||||
"mistral",
|
||||
"--config_format",
|
||||
"mistral",
|
||||
"--load_format",
|
||||
"mistral",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def server(request, audio_assets: AudioTestAssets):
|
||||
args = [
|
||||
"--enforce-eager",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"audio": len(audio_assets)}),
|
||||
] + MISTRAL_FORMAT_ARGS
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
|
||||
) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
def _get_prompt(audio_assets, question):
|
||||
tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
|
||||
|
||||
audios = [
|
||||
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
|
||||
for i in range(len(audio_assets))
|
||||
]
|
||||
audio_chunks = [
|
||||
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
|
||||
]
|
||||
|
||||
text_chunk = TextChunk(text=question)
|
||||
messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
|
||||
|
||||
return tokenizer.apply_chat_template(messages=messages)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models_with_multiple_audios(
|
||||
vllm_runner,
|
||||
audio_assets: AudioTestAssets,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
|
||||
run_multi_audio_test(
|
||||
vllm_runner,
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
|
||||
MODEL_NAME,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tokenizer_mode="mistral",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
"""Exercises online serving with/without chunked prefill enabled."""
|
||||
|
||||
def asset_to_chunk(asset):
|
||||
audio = Audio.from_file(str(asset.get_local_path()), strict=False)
|
||||
audio.format = "wav"
|
||||
audio_dict = AudioChunk.from_audio(audio).to_openai()
|
||||
return audio_dict
|
||||
|
||||
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
|
||||
text = f"What's happening in these {len(audio_assets)} audio clips?"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [*audio_chunks, {"type": "text", "text": text}],
|
||||
}
|
||||
]
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME, messages=messages, max_tokens=10
|
||||
)
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
178
tests/models/multimodal/generation/test_whisper.py
Normal file
178
tests/models/multimodal/generation/test_whisper.py
Normal file
@@ -0,0 +1,178 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
from typing import Any
|
||||
|
||||
import librosa
|
||||
import pytest
|
||||
from transformers import AutoModelForSpeechSeq2Seq
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import HfRunner, PromptAudioInput, VllmRunner
|
||||
from ....utils import create_new_process_for_each_test, multi_gpu_test
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
|
||||
HF_PROMPT = ""
|
||||
# Whisper expects 16kHz audio
|
||||
WHISPER_SAMPLE_RATE = 16000
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def use_spawn_for_whisper(monkeypatch):
|
||||
"""Whisper has issues with forked workers, use spawn instead."""
|
||||
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
|
||||
model: str,
|
||||
*,
|
||||
max_model_len: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: str | None = None,
|
||||
enforce_eager: bool = True,
|
||||
) -> None:
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the audio fixtures for the test are from AudioAsset.
|
||||
For huggingface runner, we provide the audio as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
"""
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
limit_mm_per_prompt={"audio": 2},
|
||||
enforce_eager=enforce_eager,
|
||||
disable_custom_all_reduce=True,
|
||||
) as vllm_model:
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
vllm_prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=audios,
|
||||
)
|
||||
for vllm_prompts, _, audios in inputs
|
||||
]
|
||||
|
||||
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
hf_prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=audios,
|
||||
)
|
||||
for _, hf_prompts, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
|
||||
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
|
||||
inputs = []
|
||||
for asset in audio_assets:
|
||||
audio, orig_sr = asset.audio_and_sample_rate
|
||||
# Resample to Whisper's expected sample rate (16kHz)
|
||||
if orig_sr != WHISPER_SAMPLE_RATE:
|
||||
audio = librosa.resample(
|
||||
audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
|
||||
)
|
||||
# vLLM prompts, HF prompts, audio inputs
|
||||
inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
|
||||
return inputs
|
||||
|
||||
|
||||
def check_model_available(model: str) -> None:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.cpu_model
|
||||
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model: str,
|
||||
dtype: str,
|
||||
num_logprobs: int,
|
||||
input_audios,
|
||||
enforce_eager: bool,
|
||||
) -> None:
|
||||
check_model_available(model)
|
||||
if current_platform.is_cpu() and not enforce_eager:
|
||||
pytest.skip("Skipping test for CPU with non-eager mode")
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
input_audios,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=448,
|
||||
max_tokens=200,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
enforce_eager=enforce_eager,
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [200])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
input_audios,
|
||||
) -> None:
|
||||
check_model_available(model)
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
input_audios,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=448,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=False,
|
||||
)
|
||||
347
tests/models/multimodal/generation/vlm_utils/builders.py
Normal file
347
tests/models/multimodal/generation/vlm_utils/builders.py
Normal file
@@ -0,0 +1,347 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Helpers for building inputs that can be leveraged for different test types."""
|
||||
|
||||
from collections.abc import Callable, Iterable
|
||||
from pathlib import PosixPath
|
||||
from typing import Any
|
||||
|
||||
import numpy.typing as npt
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.audio import AudioResampler
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
|
||||
from .types import (
|
||||
SINGLE_AUDIO_BASE_PROMPT,
|
||||
SINGLE_IMAGE_BASE_PROMPTS,
|
||||
TEST_AUDIO_PLACEHOLDER,
|
||||
TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER,
|
||||
VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper,
|
||||
PromptWithMultiModalInput,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
)
|
||||
|
||||
|
||||
def replace_test_placeholder(
|
||||
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
|
||||
) -> str:
|
||||
"""Given a prompt, replaces each test placeholder with the
|
||||
model-specific tag.
|
||||
"""
|
||||
prompt_segments = prompt.split(test_placeholder)
|
||||
img_prompt = prompt_segments[0]
|
||||
for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1):
|
||||
img_prompt += mm_idx_to_prompt(placeholder_idx)
|
||||
img_prompt += next_seg
|
||||
return img_prompt
|
||||
|
||||
|
||||
def get_model_prompts(
|
||||
base_prompts: Iterable[str],
|
||||
img_idx_to_prompt: Callable[[int], str] | None,
|
||||
video_idx_to_prompt: Callable[[int], str] | None,
|
||||
audio_idx_to_prompt: Callable[[int], str] | None,
|
||||
prompt_formatter: Callable[[str], str],
|
||||
) -> list[str]:
|
||||
"""Given a model-agnostic base prompt and test configuration for a model(s)
|
||||
to be tested, update the media placeholders and apply the prompt formatting
|
||||
to get the test prompt string for this model.
|
||||
|
||||
Example for phi3v, given the base_prompt: "<image>What is the season?"
|
||||
1. Replace img placeholder(s)
|
||||
-> "<|image_1|>\nWhat is the season?"
|
||||
2. Apply prompt formatter:
|
||||
-> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
|
||||
"""
|
||||
assert isinstance(base_prompts, (list, tuple))
|
||||
model_prompts = []
|
||||
for base_prompt in base_prompts:
|
||||
# Replace the multimodal placeholders in the base prompt with
|
||||
# the correct ones for the model that we are testing
|
||||
if img_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
|
||||
)
|
||||
|
||||
if video_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
|
||||
)
|
||||
|
||||
if audio_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
|
||||
)
|
||||
|
||||
# Apply the prompt formatter to wrap the base prompt with
|
||||
# the correct media placeholders to get the model test prompt
|
||||
model_prompt = prompt_formatter(base_prompt)
|
||||
model_prompts.append(model_prompt)
|
||||
return model_prompts
|
||||
|
||||
|
||||
def build_single_image_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
image_assets: ImageTestAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
tmp_path: PosixPath | None = None,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError("Prompt formatter must be set to build single image inputs")
|
||||
|
||||
model_prompts = get_model_prompts(
|
||||
test_info.single_image_prompts,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
# For models that require a local path / URL encoded in the image; export
|
||||
# assets and encode into tmp_path for this test. This should be avoided
|
||||
# where possible (currently needed for Qwen-VL).
|
||||
if test_info.prompt_path_encoder is not None:
|
||||
if tmp_path is None:
|
||||
raise ValueError("Prompt path encoder requires setting local path")
|
||||
model_prompts = [
|
||||
test_info.prompt_path_encoder(tmp_path, prompt, [asset])
|
||||
for prompt, asset in zip(model_prompts, image_assets)
|
||||
]
|
||||
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
assert len(images) == len(model_prompts)
|
||||
return build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
|
||||
|
||||
def build_single_image_inputs(
|
||||
images, model_prompts, size_wrapper: ImageSizeWrapper
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
# For every image / prompt pair, get a pair containing two lists of
|
||||
# length size_factors, where the first contains duplicates of the model
|
||||
# prompt [str], and the second contains copies of the image after being
|
||||
# scaled by one of the size factors.
|
||||
#
|
||||
# NOTE: rescaling preserves the image aspect ratio.
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=[prompt for _ in size_wrapper.data],
|
||||
image_data=[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
)
|
||||
for image, prompt in zip(images, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
def build_multi_image_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
image_assets: ImageTestAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
tmp_path: PosixPath | None = None,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError("Prompt formatter must be set to build multi image inputs")
|
||||
|
||||
model_prompts = get_model_prompts(
|
||||
[test_info.multi_image_prompt],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
if test_info.prompt_path_encoder is not None:
|
||||
if tmp_path is None:
|
||||
raise ValueError("Prompt path encoder requires setting local path")
|
||||
model_prompts = [
|
||||
test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
|
||||
for model_prompt in model_prompts
|
||||
]
|
||||
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
# Currently, we only have one multi-image list & one multi-image prompt
|
||||
return build_multi_image_inputs(
|
||||
image_lists=[images],
|
||||
model_prompts=model_prompts,
|
||||
size_wrapper=size_wrapper,
|
||||
)
|
||||
|
||||
|
||||
def build_multi_image_inputs(
|
||||
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=[prompt for _ in size_wrapper.data],
|
||||
image_data=[
|
||||
[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for image in images
|
||||
]
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
)
|
||||
for images, prompt in zip(image_lists, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
def build_embedding_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
image_assets: ImageTestAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
):
|
||||
# These conditions will always be true if invoked through filtering,
|
||||
# but we still check them in case this is ever called directly
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError("Prompt formatter must be set to build image embedding inputs")
|
||||
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
|
||||
factor == 1.0 for factor in size_wrapper.data
|
||||
):
|
||||
raise ValueError("Embedding tests require constant (1.0) size factors")
|
||||
if test_info.convert_assets_to_embeddings is None:
|
||||
raise ValueError("No conversion func for getting embeddings found")
|
||||
|
||||
model_prompts = get_model_prompts(
|
||||
SINGLE_IMAGE_BASE_PROMPTS,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
embeds = test_info.convert_assets_to_embeddings(image_assets)
|
||||
if test_info.dtype != "auto":
|
||||
dtype = getattr(torch, test_info.dtype) # type: ignore
|
||||
embeds = [e.to(dtype=dtype) for e in embeds]
|
||||
assert len(images) == len(model_prompts)
|
||||
|
||||
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
|
||||
return inputs, vllm_embeddings
|
||||
|
||||
|
||||
def build_video_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
video_assets: VideoTestAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
num_frames: int,
|
||||
needs_video_metadata: bool,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError("Prompt formatter must be set to build video inputs")
|
||||
model_prompts = get_model_prompts(
|
||||
[VIDEO_BASE_PROMPT],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
sampled_vids = [
|
||||
sample_frames_with_video_metadata(
|
||||
(asset.np_ndarrays, asset.metadata),
|
||||
num_frames,
|
||||
)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
video_scaler = (
|
||||
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
|
||||
)
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=[prompt for _ in size_wrapper.data],
|
||||
video_data=[
|
||||
(
|
||||
video_scaler(video, size)
|
||||
if not needs_video_metadata
|
||||
else (video_scaler(video, size), meta)
|
||||
)
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
)
|
||||
for (video, meta), prompt in zip(sampled_vids, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
def sample_frames_with_video_metadata(
|
||||
video_with_meta: tuple[npt.NDArray, dict[str, Any]],
|
||||
num_frames: int,
|
||||
) -> tuple[npt.NDArray, dict[str, Any]]:
|
||||
video, meta = video_with_meta
|
||||
video = sample_frames_from_video(video, num_frames)
|
||||
|
||||
meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
|
||||
meta["total_num_frames"] = num_frames
|
||||
meta["fps"] = meta["duration"] / num_frames
|
||||
meta["frames_indices"] = list(range(num_frames))
|
||||
return video, meta
|
||||
|
||||
|
||||
def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
|
||||
"""Applies a size scaler to one image; this can be an image size factor,
|
||||
which scales the image while maintaining the aspect ratio"""
|
||||
# Special case for embeddings; if it's a tensor, it's only valid if we
|
||||
# are considering size factors at constant scale, i.e., we just clone
|
||||
# the tensor
|
||||
if isinstance(image, torch.Tensor):
|
||||
assert size_type == SizeType.SIZE_FACTOR and size == 1
|
||||
return image
|
||||
if size_type == SizeType.SIZE_FACTOR:
|
||||
# We have a list of image size factors
|
||||
return rescale_image_size(image, size)
|
||||
elif size_type == SizeType.FIXED_SIZE:
|
||||
# We have a list of fixed sizes
|
||||
return image.resize(size)
|
||||
raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
|
||||
|
||||
|
||||
def build_audio_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
audio_assets: AudioTestAssets,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError("Prompt formatter must be set to build audio inputs")
|
||||
model_prompts = get_model_prompts(
|
||||
SINGLE_AUDIO_BASE_PROMPT,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
resampler = AudioResampler(
|
||||
target_sr=16000,
|
||||
method="librosa",
|
||||
)
|
||||
audios = [asset.audio_and_sample_rate for asset in audio_assets]
|
||||
resampled_audios = [
|
||||
(
|
||||
resampler.resample(
|
||||
audio,
|
||||
orig_sr=sr,
|
||||
),
|
||||
int(resampler.target_sr),
|
||||
)
|
||||
for audio, sr in audios
|
||||
]
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=model_prompts,
|
||||
audio_data=resampled_audios,
|
||||
)
|
||||
]
|
||||
183
tests/models/multimodal/generation/vlm_utils/case_filtering.py
Normal file
183
tests/models/multimodal/generation/vlm_utils/case_filtering.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Utils for determining which subset of model tests belong to a specific
|
||||
modality, getting all combinations (similar to pytest's parametrization),
|
||||
handling multimodal placeholder substitution, and so on.
|
||||
"""
|
||||
|
||||
import itertools
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Iterable
|
||||
|
||||
import pytest
|
||||
|
||||
from .types import (
|
||||
EMBEDDING_SIZE_FACTORS,
|
||||
ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
VLMTestType,
|
||||
)
|
||||
|
||||
|
||||
def get_filtered_test_settings(
|
||||
test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
new_proc_per_test: bool,
|
||||
) -> dict[str, VLMTestInfo]:
|
||||
"""Given the dict of potential test settings to run, return a subdict
|
||||
of tests who have the current test type enabled with the matching val for
|
||||
fork_per_test.
|
||||
"""
|
||||
|
||||
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
|
||||
return test_info.test_type == test_type or (
|
||||
isinstance(test_info.test_type, Iterable)
|
||||
and test_type in test_info.test_type
|
||||
)
|
||||
|
||||
matching_tests = {}
|
||||
for test_name, test_info in test_settings.items():
|
||||
# Otherwise check if the test has the right type & keep if it does
|
||||
if matches_test_type(test_info, test_type):
|
||||
# Embedding tests need to have a conversion func in their test info
|
||||
if matches_test_type(test_info, VLMTestType.EMBEDDING):
|
||||
assert test_info.convert_assets_to_embeddings is not None
|
||||
# Custom test inputs need to explicitly define the mm limit/inputs
|
||||
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
|
||||
assert test_info.custom_test_opts is not None and isinstance(
|
||||
test_info.custom_test_opts, Iterable
|
||||
)
|
||||
# For all types besides custom inputs, we need a prompt formatter
|
||||
else:
|
||||
assert test_info.prompt_formatter is not None
|
||||
|
||||
# Everything looks okay; keep if this is correct proc handling
|
||||
if (
|
||||
test_info.distributed_executor_backend is not None
|
||||
) == new_proc_per_test:
|
||||
matching_tests[test_name] = test_info
|
||||
|
||||
return matching_tests
|
||||
|
||||
|
||||
def get_model_type_cases(
|
||||
model_type: str,
|
||||
test_info: VLMTestInfo,
|
||||
test_type: VLMTestType,
|
||||
):
|
||||
# Ensure that something is wrapped as an iterable it's not already
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
|
||||
|
||||
# This is essentially the same as nesting a bunch of mark.parametrize
|
||||
# decorators, but we do it programmatically to allow overrides for on
|
||||
# a per-model basis, while still being able to execute each of these
|
||||
# as individual test cases in pytest.
|
||||
iter_kwargs = OrderedDict(
|
||||
[
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
(
|
||||
"distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# num_frames is video only
|
||||
if test_type == VLMTestType.VIDEO:
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
|
||||
iter_kwargs["needs_video_metadata"] = ensure_wrapped(
|
||||
test_info.needs_video_metadata
|
||||
)
|
||||
|
||||
# No sizes passed for custom inputs, since inputs are directly provided
|
||||
if test_type not in (
|
||||
VLMTestType.CUSTOM_INPUTS,
|
||||
VLMTestType.AUDIO,
|
||||
):
|
||||
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
|
||||
if wrapped_sizes is None:
|
||||
raise ValueError(f"Sizes must be set for test type {test_type}")
|
||||
iter_kwargs["size_wrapper"] = wrapped_sizes
|
||||
|
||||
# Otherwise expand the custom test options instead
|
||||
elif test_type == VLMTestType.CUSTOM_INPUTS:
|
||||
if test_info.custom_test_opts is None:
|
||||
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
||||
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
|
||||
|
||||
# Wrap all model cases in a pytest parameter & pass marks through
|
||||
return [
|
||||
pytest.param(
|
||||
model_type,
|
||||
ExpandableVLMTestArgs(**{k: v for k, v in zip(iter_kwargs.keys(), case)}),
|
||||
marks=test_info.marks if test_info.marks is not None else [],
|
||||
)
|
||||
for case in list(itertools.product(*iter_kwargs.values()))
|
||||
]
|
||||
|
||||
|
||||
def get_parametrized_options(
|
||||
test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
create_new_process_for_each_test: bool,
|
||||
):
|
||||
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
||||
This is similar to nesting pytest parametrize calls, but done directly
|
||||
through an itertools product so that each test can set things like
|
||||
size factors etc, while still running in isolated test cases.
|
||||
"""
|
||||
matching_tests = get_filtered_test_settings(
|
||||
test_settings, test_type, create_new_process_for_each_test
|
||||
)
|
||||
|
||||
# Get a list per model type, where each entry contains a tuple of all of
|
||||
# that model type's cases, then flatten them into the top level so that
|
||||
# we can consume them in one mark.parametrize call.
|
||||
cases_by_model_type = [
|
||||
get_model_type_cases(model_type, test_info, test_type)
|
||||
for model_type, test_info in matching_tests.items()
|
||||
]
|
||||
return list(itertools.chain(*cases_by_model_type))
|
||||
|
||||
|
||||
def get_wrapped_test_sizes(
|
||||
test_info: VLMTestInfo, test_type: VLMTestType
|
||||
) -> tuple[ImageSizeWrapper, ...]:
|
||||
"""Given a test info which may have size factors or fixed sizes, wrap them
|
||||
and combine them into an iterable, each of which will be used in parameter
|
||||
expansion.
|
||||
|
||||
Args:
|
||||
test_info: Test configuration to be expanded.
|
||||
test_type: The type of test being filtered for.
|
||||
"""
|
||||
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
|
||||
if test_type == VLMTestType.EMBEDDING:
|
||||
return tuple(
|
||||
[
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in EMBEDDING_SIZE_FACTORS
|
||||
]
|
||||
)
|
||||
# Audio and Custom inputs have preprocessed inputs
|
||||
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
|
||||
return tuple()
|
||||
|
||||
size_factors = test_info.image_size_factors if test_info.image_size_factors else []
|
||||
fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
|
||||
|
||||
wrapped_factors = [
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in size_factors
|
||||
]
|
||||
|
||||
wrapped_sizes = [
|
||||
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
|
||||
]
|
||||
|
||||
return tuple(wrapped_factors + wrapped_sizes)
|
||||
189
tests/models/multimodal/generation/vlm_utils/core.py
Normal file
189
tests/models/multimodal/generation/vlm_utils/core.py
Normal file
@@ -0,0 +1,189 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Core test implementation to be shared across modalities."""
|
||||
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
from .....conftest import HfRunner, VllmRunner
|
||||
from ....registry import HF_EXAMPLE_MODELS
|
||||
from .types import PromptWithMultiModalInput, RunnerOutput
|
||||
|
||||
|
||||
def run_test(
|
||||
*,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: list[PromptWithMultiModalInput],
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
enforce_eager: bool,
|
||||
max_model_len: int,
|
||||
max_num_seqs: int,
|
||||
hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
|
||||
vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
|
||||
auto_cls: type[_BaseAutoModelClass],
|
||||
use_tokenizer_eos: bool,
|
||||
comparator: Callable[..., None],
|
||||
get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None,
|
||||
stop_str: list[str] | None,
|
||||
limit_mm_per_prompt: dict[str, int],
|
||||
vllm_runner_kwargs: dict[str, Any] | None,
|
||||
hf_model_kwargs: dict[str, Any] | None,
|
||||
patch_hf_runner: Callable[[HfRunner], HfRunner] | None,
|
||||
runner: RunnerOption = "auto",
|
||||
distributed_executor_backend: str | None = None,
|
||||
tensor_parallel_size: int = 1,
|
||||
vllm_embeddings: torch.Tensor | None = None,
|
||||
):
|
||||
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
|
||||
# In the case of embeddings, vLLM takes separate input tensors
|
||||
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
# Disable other modalities to save memory
|
||||
default_limits = {"image": 0, "video": 0, "audio": 0}
|
||||
limit_mm_per_prompt = default_limits | limit_mm_per_prompt
|
||||
|
||||
vllm_outputs_per_mm = []
|
||||
hf_outputs_per_mm = []
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0}
|
||||
if model_info.tokenizer:
|
||||
vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
|
||||
if model_info.tokenizer_mode:
|
||||
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
|
||||
if model_info.hf_overrides:
|
||||
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
|
||||
if model_info.require_embed_inputs:
|
||||
for k in ("skip_tokenizer_init", "enable_prompt_embeds", "enable_mm_embeds"):
|
||||
vllm_runner_kwargs_[k] = model_info.require_embed_inputs
|
||||
|
||||
if vllm_runner_kwargs:
|
||||
vllm_runner_kwargs_.update(vllm_runner_kwargs)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
runner=runner,
|
||||
**vllm_runner_kwargs_,
|
||||
) as vllm_model:
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
|
||||
vllm_kwargs: dict[str, Any] = {}
|
||||
if get_stop_token_ids is not None:
|
||||
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
|
||||
if stop_str:
|
||||
vllm_kwargs["stop"] = stop_str
|
||||
|
||||
for prompts, image_data, video_data, audio_data in vllm_inputs:
|
||||
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
|
||||
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
|
||||
vllm_output = vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
**vllm_kwargs_with_mm_data,
|
||||
)
|
||||
vllm_outputs_per_mm.append(vllm_output)
|
||||
|
||||
hf_model = hf_runner(
|
||||
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
|
||||
)
|
||||
|
||||
# Some models need to patch things like the model processor, e.g., internvl
|
||||
if patch_hf_runner is not None:
|
||||
hf_model = patch_hf_runner(hf_model)
|
||||
|
||||
with hf_model, torch.no_grad():
|
||||
tokenizer = hf_model.tokenizer
|
||||
|
||||
# Some models need to explicitly pass the eos_token_id off the tokenizer
|
||||
# or processor for a good comparison;
|
||||
# currently assume processor/tokenizer agree on the EOS, and pull it off
|
||||
# the tokenizer if requested.
|
||||
hf_kwargs = {}
|
||||
if use_tokenizer_eos:
|
||||
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
|
||||
if stop_str:
|
||||
hf_kwargs["stop_strings"] = stop_str
|
||||
|
||||
for prompts, image_data, video_data, audio_data in inputs:
|
||||
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
|
||||
hf_kwargs_with_mm_data = hf_kwargs | mm_data
|
||||
hf_output = hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tokenizer=tokenizer,
|
||||
**hf_kwargs_with_mm_data,
|
||||
)
|
||||
hf_outputs_per_mm.append(hf_output)
|
||||
|
||||
# Apply output processing / sanitation to the vLLM and HF runner results
|
||||
hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
|
||||
model,
|
||||
first_runner_outputs=hf_outputs_per_mm,
|
||||
second_runner_outputs=vllm_outputs_per_mm,
|
||||
first_runner_processor=hf_output_post_proc,
|
||||
second_runner_processor=vllm_output_post_proc,
|
||||
)
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
|
||||
# This is usually check_logprobs_close, but it's passed through to
|
||||
# allow things like check_outputs_equal where needed
|
||||
comparator(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
def process_runner_outputs(
|
||||
model,
|
||||
first_runner_outputs,
|
||||
second_runner_outputs,
|
||||
first_runner_processor=None,
|
||||
second_runner_processor=None,
|
||||
):
|
||||
"""Applies the runner processor(s) to the runner outputs, if any."""
|
||||
if first_runner_processor is not None:
|
||||
first_runner_outputs = process_outputs(
|
||||
first_runner_processor, model, first_runner_outputs
|
||||
)
|
||||
if second_runner_processor is not None:
|
||||
second_runner_outputs = process_outputs(
|
||||
second_runner_processor, model, second_runner_outputs
|
||||
)
|
||||
return first_runner_outputs, second_runner_outputs
|
||||
|
||||
|
||||
def process_outputs(output_processor, model, outputs_per_image):
|
||||
"""Applies a model specific post-processor function to a runner's output"""
|
||||
return [
|
||||
[output_processor(res, model) for res in outputs]
|
||||
for outputs in outputs_per_image
|
||||
]
|
||||
156
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
Normal file
156
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
|
||||
from collections.abc import Callable
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
||||
from .builders import build_multi_image_inputs, build_single_image_inputs
|
||||
from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
|
||||
|
||||
|
||||
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
stop_sign = IMAGE_ASSETS[0].pil_image
|
||||
cherry_blossom = IMAGE_ASSETS[1].pil_image
|
||||
|
||||
# Apply the selected formatter to the base prompts
|
||||
img_prompts = [
|
||||
"<image><image>\nDescribe 2 images.",
|
||||
"<image><image>\nDescribe 2 images.",
|
||||
"<image><image><image><image>\nDescribe 4 images.",
|
||||
"<image>\nWhat is the season?",
|
||||
]
|
||||
formatted_prompts = [formatter(prompt) for prompt in img_prompts]
|
||||
aspect_ratio_images = [
|
||||
[stop_sign, cherry_blossom],
|
||||
# Images with different sizes and aspect-ratios
|
||||
[
|
||||
rescale_image_size(stop_sign, 0.1),
|
||||
stop_sign,
|
||||
],
|
||||
[
|
||||
stop_sign,
|
||||
rescale_image_size(stop_sign, 0.25),
|
||||
cherry_blossom.resize((183, 488)),
|
||||
cherry_blossom.resize((488, 183)),
|
||||
],
|
||||
cherry_blossom,
|
||||
]
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=formatted_prompts,
|
||||
image_data=aspect_ratio_images,
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
def multi_video_multi_aspect_ratio_inputs(
|
||||
formatter: Callable[[str], str], num_frames: int = 16
|
||||
):
|
||||
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
|
||||
# Apply the selected formatter to the base prompts
|
||||
video_prompts = [
|
||||
"<video><video>\nDescribe 2 videos.",
|
||||
"<video><video>\nDescribe 2 videos.",
|
||||
"<video><video><video><video>\nDescribe 4 videos.",
|
||||
"<video>\nWhy is this video funny?",
|
||||
]
|
||||
formatted_prompts = [formatter(prompt) for prompt in video_prompts]
|
||||
aspect_ratio_videos = [
|
||||
[video, video],
|
||||
# Videos with different sizes and aspect-ratios
|
||||
[
|
||||
rescale_video_size(video, 0.1),
|
||||
video,
|
||||
],
|
||||
[
|
||||
video,
|
||||
rescale_video_size(video, 0.25),
|
||||
resize_video(video, (183, 488)),
|
||||
resize_video(video, (488, 183)),
|
||||
],
|
||||
video,
|
||||
]
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=formatted_prompts,
|
||||
video_data=aspect_ratio_videos,
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
def different_patch_input_cases_internvl():
|
||||
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
|
||||
formatter = (
|
||||
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||
)
|
||||
single_img_prompts = [
|
||||
"<image>\nWhat's the content in the center of the image?",
|
||||
"<image>\nWhat is the season?",
|
||||
]
|
||||
multi_img_prompts = [
|
||||
"Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n", # noqa: E501
|
||||
]
|
||||
formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
|
||||
formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
|
||||
return [
|
||||
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
|
||||
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
|
||||
]
|
||||
|
||||
|
||||
def windows_attention_image_qwen2_5_vl():
|
||||
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
|
||||
image = ImageAsset("hato").pil_image
|
||||
|
||||
question = "Describe the image."
|
||||
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
prompt = (
|
||||
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
|
||||
return build_single_image_inputs([image], [prompt], wrapped_sf)
|
||||
|
||||
|
||||
def video_with_metadata_glm4_1v():
|
||||
video_array = VIDEO_ASSETS[0].np_ndarrays
|
||||
metadata = VIDEO_ASSETS[0].metadata
|
||||
question = "Describe the video."
|
||||
video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
|
||||
formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
|
||||
|
||||
scales = [0.1, 0.2, 0.25]
|
||||
video_input = [
|
||||
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
|
||||
]
|
||||
prompts = [formatted_prompt] * len(video_input)
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=prompts,
|
||||
video_data=video_input,
|
||||
)
|
||||
]
|
||||
1008
tests/models/multimodal/generation/vlm_utils/model_utils.py
Normal file
1008
tests/models/multimodal/generation/vlm_utils/model_utils.py
Normal file
File diff suppressed because it is too large
Load Diff
190
tests/models/multimodal/generation/vlm_utils/runners.py
Normal file
190
tests/models/multimodal/generation/vlm_utils/runners.py
Normal file
@@ -0,0 +1,190 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Entrypoints for wrapping the core run_test implementation for specific test
|
||||
types / modalities.
|
||||
"""
|
||||
|
||||
from pathlib import PosixPath
|
||||
|
||||
from .....conftest import (
|
||||
AudioTestAssets,
|
||||
HfRunner,
|
||||
ImageTestAssets,
|
||||
VideoTestAssets,
|
||||
VllmRunner,
|
||||
)
|
||||
from . import builders, core
|
||||
from .types import ExpandableVLMTestArgs, VLMTestInfo
|
||||
|
||||
|
||||
####### Entrypoints for running different test types
|
||||
def run_single_image_test(
|
||||
*,
|
||||
tmp_path: PosixPath,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_single_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_multi_image_test(
|
||||
*,
|
||||
tmp_path: PosixPath,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_multi_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": len(image_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_embedding_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
vllm_embeddings=vllm_embeddings,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_video_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
video_assets: VideoTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
assert test_case.num_video_frames is not None
|
||||
inputs = builders.build_video_inputs_from_test_info(
|
||||
model_test_info,
|
||||
video_assets,
|
||||
test_case.size_wrapper,
|
||||
test_case.num_video_frames,
|
||||
test_case.needs_video_metadata,
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"video": len(video_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_audio_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets,
|
||||
):
|
||||
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_custom_inputs_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
):
|
||||
# Custom test cases can provide inputs directly, but they need to
|
||||
# explicitly provided a CustomTestConfig, which wraps the inputs and
|
||||
# the limit_mm_per_prompt
|
||||
assert test_case.custom_test_opts is not None
|
||||
|
||||
inputs = test_case.custom_test_opts.inputs
|
||||
limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
|
||||
# Inputs and limit_mm_per_prompt should all be set
|
||||
assert inputs is not None
|
||||
assert limit_mm_per_prompt is not None
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
218
tests/models/multimodal/generation/vlm_utils/types.py
Normal file
218
tests/models/multimodal/generation/vlm_utils/types.py
Normal file
@@ -0,0 +1,218 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Types for writing multimodal model tests."""
|
||||
|
||||
from collections.abc import Callable, Iterable
|
||||
from enum import Enum
|
||||
from pathlib import PosixPath
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import torch
|
||||
from pytest import MarkDecorator
|
||||
from transformers import AutoModelForCausalLM
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
from .....conftest import (
|
||||
AUDIO_ASSETS,
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
ImageAsset,
|
||||
ImageTestAssets,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
PromptVideoInput,
|
||||
)
|
||||
from ....utils import check_logprobs_close
|
||||
|
||||
# meta image tag; will be replaced by the appropriate tag for the model
|
||||
TEST_IMG_PLACEHOLDER = "<vlm_image>"
|
||||
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
|
||||
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
|
||||
|
||||
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
|
||||
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
|
||||
}
|
||||
)
|
||||
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
|
||||
{
|
||||
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
|
||||
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
|
||||
}
|
||||
)
|
||||
|
||||
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
|
||||
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
|
||||
|
||||
|
||||
IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
||||
EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
|
||||
RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
|
||||
|
||||
|
||||
class PromptWithMultiModalInput(NamedTuple):
|
||||
"""Holds the multimodal input for a single test case."""
|
||||
|
||||
prompts: list[str]
|
||||
image_data: PromptImageInput | None = None
|
||||
video_data: PromptVideoInput | None = None
|
||||
audio_data: PromptAudioInput | None = None
|
||||
|
||||
|
||||
class VLMTestType(Enum):
|
||||
IMAGE = 1
|
||||
MULTI_IMAGE = 2
|
||||
EMBEDDING = 3
|
||||
VIDEO = 4
|
||||
AUDIO = 5
|
||||
CUSTOM_INPUTS = 6
|
||||
|
||||
|
||||
class SizeType(Enum):
|
||||
SIZE_FACTOR = 1
|
||||
FIXED_SIZE = 2
|
||||
|
||||
|
||||
class CustomTestOptions(NamedTuple):
|
||||
inputs: list[PromptWithMultiModalInput]
|
||||
limit_mm_per_prompt: dict[str, int]
|
||||
|
||||
|
||||
class ImageSizeWrapper(NamedTuple):
|
||||
type: SizeType
|
||||
# A size factor is a wrapper of 0+ floats,
|
||||
# while a fixed size contains an iterable of integer pairs
|
||||
data: Iterable[float] | Iterable[tuple[int, int]]
|
||||
|
||||
|
||||
class VLMTestInfo(NamedTuple):
|
||||
"""Holds the configuration for 1+ tests for one model architecture."""
|
||||
|
||||
models: list[str]
|
||||
test_type: VLMTestType | Iterable[VLMTestType]
|
||||
|
||||
# Should be None only if this is a CUSTOM_INPUTS test
|
||||
prompt_formatter: Callable[[str], str] | None = None
|
||||
img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
|
||||
video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
|
||||
audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"
|
||||
|
||||
# Most models work on the single / multi-image prompts above, but in some
|
||||
# cases the log prob check fails, e.g., for paligemma. We allow passing
|
||||
# an override for the single image prompts / multi-image prompt for this
|
||||
# reason.
|
||||
single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
|
||||
multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT
|
||||
|
||||
# Function for converting ImageAssets to image embeddings;
|
||||
# We need to define this explicitly for embedding tests
|
||||
convert_assets_to_embeddings: (
|
||||
Callable[[ImageTestAssets], list[torch.Tensor]] | None
|
||||
) = None
|
||||
|
||||
# Exposed options for vLLM runner; we change these in a several tests,
|
||||
# but the defaults are derived from VllmRunner & the engine defaults
|
||||
# These settings are chosen to avoid OOMs when running in the CI
|
||||
enforce_eager: bool = True
|
||||
max_model_len: int = 1024
|
||||
max_num_seqs: int = 256
|
||||
runner: RunnerOption = "auto"
|
||||
tensor_parallel_size: int = 1
|
||||
vllm_runner_kwargs: dict[str, Any] | None = None
|
||||
|
||||
# Optional callable which gets a list of token IDs from the model tokenizer
|
||||
get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None = None
|
||||
# Optional list of strings to stop generation, useful when stop tokens are
|
||||
# not special tokens in the tokenizer
|
||||
stop_str: list[str] | None = None
|
||||
|
||||
# Exposed options for HF runner
|
||||
hf_model_kwargs: dict[str, Any] | None = None
|
||||
# Indicates we should explicitly pass the EOS from the tokenizer
|
||||
use_tokenizer_eos: bool = False
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
|
||||
patch_hf_runner: Callable[[HfRunner], HfRunner] | None = None
|
||||
|
||||
# Post processors that if defined, will run oun the outputs of the
|
||||
# vLLM and HF runner, respectively (useful for sanitization, etc).
|
||||
vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
|
||||
hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
|
||||
|
||||
# Consumes the output of the callables above and checks if they're equal
|
||||
comparator: Callable[..., None] = check_logprobs_close
|
||||
|
||||
# Default expandable params per test; these defaults can be overridden in
|
||||
# instances of this object; the complete set of test cases for the model
|
||||
# is all combinations of .models + all fields below
|
||||
max_tokens: int = 128
|
||||
num_logprobs: int = 5
|
||||
dtype: str = "auto"
|
||||
distributed_executor_backend: str | None = None
|
||||
# Only expanded in video tests
|
||||
num_video_frames: int | tuple[int] = 16
|
||||
needs_video_metadata: bool = False
|
||||
|
||||
# Fixed image sizes / image size factors; most tests use image_size_factors
|
||||
# The values provided for these two fields will be stacked and expanded
|
||||
# such that each model will consider each image size factor / image size
|
||||
# once per tests (much like concatenating and wrapping in one parametrize
|
||||
# call)
|
||||
image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
|
||||
image_sizes: Iterable[Iterable[tuple[int, int]]] | None = None
|
||||
|
||||
# Hack for updating a prompt to take into a local path; currently only used
|
||||
# for Qwen-VL, which requires encoding the image path / url into the prompt
|
||||
# for HF runner
|
||||
prompt_path_encoder: (
|
||||
Callable[[PosixPath, str, list[ImageAsset] | ImageTestAssets], str] | None
|
||||
) = None # noqa: E501
|
||||
|
||||
# Allows configuring a test to run with custom inputs
|
||||
custom_test_opts: list[CustomTestOptions] | None = None
|
||||
|
||||
marks: list[MarkDecorator] | None = None
|
||||
|
||||
def get_non_parametrized_runner_kwargs(self):
|
||||
"""Returns a dictionary of expandable kwargs for items that are used
|
||||
in all test types, which are NOT used when creating the parametrized
|
||||
test cases.
|
||||
"""
|
||||
return {
|
||||
"enforce_eager": self.enforce_eager,
|
||||
"max_model_len": self.max_model_len,
|
||||
"max_num_seqs": self.max_num_seqs,
|
||||
"runner": self.runner,
|
||||
"tensor_parallel_size": self.tensor_parallel_size,
|
||||
"vllm_runner_kwargs": self.vllm_runner_kwargs,
|
||||
"hf_output_post_proc": self.hf_output_post_proc,
|
||||
"vllm_output_post_proc": self.vllm_output_post_proc,
|
||||
"auto_cls": self.auto_cls,
|
||||
"use_tokenizer_eos": self.use_tokenizer_eos,
|
||||
"comparator": self.comparator,
|
||||
"get_stop_token_ids": self.get_stop_token_ids,
|
||||
"hf_model_kwargs": self.hf_model_kwargs,
|
||||
"stop_str": self.stop_str,
|
||||
"patch_hf_runner": self.patch_hf_runner,
|
||||
}
|
||||
|
||||
|
||||
class ExpandableVLMTestArgs(NamedTuple):
|
||||
"""The expanded kwargs which correspond to a single test case."""
|
||||
|
||||
model: str
|
||||
max_tokens: int
|
||||
num_logprobs: int
|
||||
dtype: str
|
||||
distributed_executor_backend: str | None
|
||||
# Sizes are used for everything except for custom input tests
|
||||
size_wrapper: ImageSizeWrapper | None = None
|
||||
# Video only
|
||||
num_video_frames: int | None = None
|
||||
needs_video_metadata: bool = False
|
||||
# Custom inputs only
|
||||
custom_test_opts: CustomTestOptions | None = None
|
||||
Reference in New Issue
Block a user