Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,35 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_configure(config):
    """Disable Flash/MemEfficient SDP on ROCm to avoid HF
    Transformers accuracy issues.
    """
    if not current_platform.is_rocm():
        return
    # Granite Speech tests need the default SDP backends on ROCm, so
    # leave them untouched when those tests are being collected.
    exempt_patterns = ("test_granite_speech.py",)
    for arg in config.args:
        if any(pattern in str(arg) for pattern in exempt_patterns):
            return
    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
    # accuracy issues
    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
    warnings.warn(
        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
        "to avoid HuggingFace Transformers accuracy issues",
        UserWarning,
        stacklevel=1,
    )

View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import pytest
from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams
MODEL_NAME = "nvidia/audio-flamingo-3-hf"
def get_fixture_path(filename):
    """Return the absolute path of a fixture file under fixtures/audioflamingo3."""
    fixtures_dir = os.path.join(os.path.dirname(__file__), "../../fixtures/audioflamingo3")
    return os.path.join(fixtures_dir, filename)
@pytest.fixture(scope="module")
def llm():
    """Module-scoped Audio Flamingo 3 engine.

    Skips the module when the installed transformers version is too old
    or the model cannot be loaded in this environment.
    """
    # Check if the model is supported by the current transformers version
    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
    try:
        return LLM(
            model=MODEL_NAME,
            trust_remote_code=True,
            dtype="bfloat16",
            enforce_eager=True,
            limit_mm_per_prompt={"audio": 1},
        )
    except Exception as e:
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
def test_single_generation(llm):
    """Transcribe one audio clip and compare against the recorded fixture."""
    fixture_path = get_fixture_path("expected_results_single.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")
    with open(fixture_path) as f:
        expected = json.load(f)
    audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "audio_url", "audio_url": {"url": audio_url}},
                {"type": "text", "text": "Transcribe the input speech."},
            ],
        }
    ]
    outputs = llm.chat(
        messages=conversation,
        sampling_params=SamplingParams(temperature=0.0, max_tokens=128),
    )
    generated = outputs[0].outputs[0].text.strip()
    reference = expected["transcriptions"][0]
    # Substring check in either direction tolerates minor punctuation or
    # truncation differences between runs.
    assert reference in generated or generated in reference
def test_batched_generation(llm):
    """Run two audio QA prompts in one batch and compare each answer
    against the recorded fixture.
    """
    fixture_path = get_fixture_path("expected_results_batched.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")
    with open(fixture_path) as f:
        expected = json.load(f)
    # Each item pairs a remote audio asset with a question about it.
    # NOTE(review): answers are read from expected["transcriptions"] below,
    # although these are QA responses rather than transcriptions — confirm
    # the fixture's key naming.
    items = [
        {
            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
            "question": "What is surprising about the relationship "
            "between the barking and the music?",
            "expected_idx": 0,
        },
        {
            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
            "question": (
                "Why is the philosopher's name mentioned in the lyrics? "
                "(A) To express a sense of nostalgia "
                "(B) To indicate that language cannot express clearly, "
                "satirizing the inversion of black and white in the world "
                "(C) To add depth and complexity to the lyrics "
                "(D) To showcase the wisdom and influence of the philosopher"
            ),
            "expected_idx": 1,
        },
    ]
    # Build one single-turn conversation per item; llm.chat accepts a list
    # of conversations and generates for them as a batch.
    conversations = []
    for item in items:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
                    {"type": "text", "text": item["question"]},
                ],
            }
        ]
        conversations.append(messages)
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
    outputs = llm.chat(
        messages=conversations,
        sampling_params=sampling_params,
    )
    # Outputs come back in input order; compare each against its fixture
    # entry, allowing containment in either direction to tolerate minor
    # wording drift.
    for i, output in enumerate(outputs):
        generated_text = output.outputs[0].text.strip()
        expected_text = expected["transcriptions"][i]
        assert expected_text in generated_text or generated_text in expected_text

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import pytest
from transformers import AutoModelForSpeechSeq2Seq
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|><|audio|>can you transcribe the speech into a written format?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, SampleLogprobs | None],
) -> tuple[list[int], str, SampleLogprobs | None]:
    """Sanitize hf output to be comparable with vllm output."""
    token_ids, text, logprobs = vllm_output
    # HF keeps the EOS marker in its decoded text; append it so the
    # string comparison lines up.
    return token_ids, text + "<|end_of_text|>", logprobs
MODEL_NAME = "ibm-granite/granite-speech-3.3-2b"
# Audio lora co-exists directly in the model directory, but
# currently still needs to be passed directly to vLLM.
audio_lora_path = MODEL_NAME
models = [MODEL_NAME]
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
    """Force the Triton attention backend for every test when on ROCm."""
    if not current_platform.is_rocm():
        return
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptAudioInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Inference result should be the same between hf and vllm.
    All the audio fixtures for the test are from AUDIO_ASSETS.
    For huggingface runner, we provide the audio as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=max_model_len,
        max_num_seqs=1,
        dtype=dtype,
        limit_mm_per_prompt={"audio": 1},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=64,
        enforce_eager=True,
    ) as vllm_model:
        # The audio LoRA lives inside the model repo itself (see
        # audio_lora_path above) but must be registered explicitly.
        lora_request = LoRARequest("audio", 1, audio_lora_path)
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
                lora_request=lora_request,
            )
            for prompts, audios in inputs
        ]
    # HF reference run; pass the tokenizer's EOS id so both sides stop
    # at the same token.
    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
        hf_processor = hf_model.processor
        eos_token_id = hf_processor.tokenizer.eos_token_id
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=[audios],
                eos_token_id=eos_token_id,
            )
            for prompts, audios in inputs
        ]
    # vLLM outputs get the HF-style <|end_of_text|> suffix appended before
    # the logprob comparison (see vllm_to_hf_output).
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
)
@pytest.mark.parametrize(
    "max_model_len", [512] if current_platform.is_rocm() else [2048]
)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    hf_runner,
    vllm_runner,
    model: str,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare HF and vLLM greedy outputs for Granite Speech on one
    audio asset (ROCm uses smaller float16/512 settings, see parametrize).
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
    audio, sr = audio_assets[0].audio_and_sample_rate
    # This model expects 16k sample rate, which our test audio
    # already is; if this changes, it may break this test,
    # so we check it directly
    assert sr == 16000
    run_test(
        hf_runner,
        vllm_runner,
        [
            ([HF_AUDIO_PROMPT], [audio]),
        ],
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )

View File

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
def base_prompt(modalities_str: str) -> str:
    """Build the chat-formatted user prompt embedding the given modality tokens."""
    prefix = "<|im_start|>user "
    suffix = "\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n"
    return prefix + modalities_str + suffix
INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
    """
    This is a simple test to check if interleaved and non-interleaved prompts
    give the same result.
    """
    image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
    images = [image_cherry, image_stop]
    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
    # Same two images + one video for both cases; only the placeholder
    # order in the prompt differs.
    inputs = [
        (
            [INTERLEAVED_PROMPT],
            [images],
            [video],
        ),
        (
            [NONINTERLEAVED_PROMPT],
            [images],
            [video],
        ),
    ]
    with vllm_runner(
        model,
        runner="generate",
        dtype=dtype,
        limit_mm_per_prompt={"image": 2},
        max_model_len=32768,
        max_num_seqs=2,
        tensor_parallel_size=1,
        enforce_eager=True,
    ) as vllm_model:
        vllm_outputs_per_case = [
            vllm_model.generate_greedy(
                prompts, max_tokens, images=images, videos=videos
            )
            for prompts, images, videos in inputs
        ]
    # Each result string is the full decoded text (prompt + completion);
    # find the end of the "assistant\n" marker to locate where generation
    # starts.
    all_results = [output[0][1] for output in vllm_outputs_per_case]
    outputs = [
        (total_str, total_str.find("assistant\n") + len("assistant\n"))
        for total_str in all_results
    ]
    prompt_lengths = [prompt_len for _, prompt_len in outputs]
    generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
    interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
    interleaved_output_str, noninterleaved_output_str = generated_strs
    # The two prompts are identical except for the order of modality tokens.
    assert interleaved_prompt_len == noninterleaved_prompt_len
    # The two generated strings should be different because of the
    # interleaved modality tokens.
    assert interleaved_output_str != noninterleaved_output_str

View File

@@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple
import pytest
from PIL.Image import Image
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.multimodal.utils import encode_image_base64
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
QUESTION = "What is the content of each image?"
class ModelRequestData(NamedTuple):
    """Everything needed to issue one multimodal generation request."""

    # Engine construction arguments for the target model.
    engine_args: EngineArgs
    # Fully rendered chat prompt (template already applied).
    prompt: str
    # Images referenced by the prompt, in placeholder order.
    image_data: list[Image]
    # Optional extra stop-token ids for generation.
    stop_token_ids: list[int] | None = None
    # Optional custom chat template (None = model default).
    chat_template: str | None = None
    # Optional sampling overrides.
    sampling_params: SamplingParams | None = None
@pytest.mark.core_model
@pytest.mark.parametrize("question", [QUESTION])
def test_keye_vl(
    image_assets,
    question: str,
):
    """Smoke test: Keye-VL answers a multi-image question with a
    non-trivial amount of text.
    """
    images = [asset.pil_image for asset in image_assets]
    # Images are embedded as base64 data URLs in the chat template, but the
    # raw PIL images are passed to the engine as multi_modal_data below.
    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
    ]
    engine_args = EngineArgs(
        model=MODEL_NAME,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]
    # Render the prompt with the model's own chat template.
    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Pin the seed on top of the dataclass args for reproducibility.
    engine_args = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_args)
    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=None
    )
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=sampling_params,
    )
    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        # Loose sanity bound: any real answer is longer than 10 chars.
        assert len(generated_text) > 10, (
            f"Generated text is too short: {generated_text}"
        )
    print("-" * 50)

View File

@@ -0,0 +1,723 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Create a reduced-layer version of the Maverick model for testing purposes.
This script creates a new model with fewer layers by:
1. Loading the original Maverick model configuration
2. Creating a reduced configuration
3. Generating compatible safetensors files with appropriate weights
4. Creating the necessary index files for vLLM compatibility
"""
import json
import shutil
from pathlib import Path
from typing import Any
import pytest
import torch
from safetensors.torch import save_file
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
from ....utils import multi_gpu_test
# Sample prompts for testing
PROMPTS: list[str] = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
def run_maverick_serving(model: str):
    """Test Llama-4-Maverick model with vLLM LLM class using CLI equivalent
    options with reduced layers.
    """
    try:
        params = SamplingParams(temperature=0.8, top_p=0.95)
        engine = LLM(
            model=model,
            max_model_len=2048,
            enforce_eager=True,
            tensor_parallel_size=8,
            enable_expert_parallel=True,
            trust_remote_code=True,
            gpu_memory_utilization=0.4,
            kv_cache_dtype="fp8",
        )
        results = engine.generate(PROMPTS, params)
    except Exception as e:
        print(f"Error initializing or running model: {e}")
        raise
    # Print the outputs
    print("\nGenerated Outputs:\n" + "-" * 60)
    for result in results:
        print(f"Prompt: {result.prompt!r}")
        print(f"Output: {result.outputs[0].text!r}")
        print("-" * 60)
def get_rope_layers_config(model_path: str) -> list[int]:
    """
    Get the interleaved RoPE configuration from HuggingFace config
    Args:
        model_path: Path to the local directory containing the reduced
            Maverick model checkpoint
    Returns:
        List of 0 or 1 indicating whether each layer uses RoPE and local attn
        0 indicates that RoPE is not used while 1 indicates that RoPE is used.
    """
    config_file = Path(model_path) / "config.json"
    full_config = json.loads(config_file.read_text())
    no_rope_layers = full_config["text_config"]["no_rope_layers"]
    print(f"Found no_rope_layers: {no_rope_layers}")
    return no_rope_layers
def create_reduced_maverick_model(
    original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    output_dir: str = "/tmp/reduced_maverick",
    text_layers: int = 4,
    num_experts: int = 4,
    vision_layers: int = 2,
    force_recreate: bool = False,
) -> str:
    """
    Create a reduced-layer version of the Maverick model.
    Args:
        original_model_name: Name of the original Maverick model
        output_dir: Directory to save the reduced model
        text_layers: Number of text transformer layers
        num_experts: Number of experts per layer
        vision_layers: Number of vision transformer layers
        force_recreate: Whether to recreate if output_dir already exists
    Returns:
        Path to the created reduced model directory
    """
    print(
        f"Creating reduced Maverick model with {text_layers} text layers and "
        f"{vision_layers} vision layers..."
    )
    # Create output directory
    output_path = Path(output_dir)
    if output_path.exists():
        if force_recreate:
            shutil.rmtree(output_path)
        else:
            # Reuse the cached reduction; nothing is validated here, so a
            # stale directory must be cleared with force_recreate.
            print(
                f"Output directory {output_dir} already exists. "
                "Use --force-recreate to overwrite."
            )
            return str(output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    try:
        # Pipeline: config -> tokenizer -> synthetic weights -> processor,
        # each helper writing into output_path.
        print("Loading original model configuration...")
        original_config = AutoConfig.from_pretrained(
            original_model_name, trust_remote_code=True
        )
        print("Creating reduced configuration...")
        reduced_config = create_reduced_config(
            original_config, text_layers, num_experts, vision_layers
        )
        config_path = output_path / "config.json"
        with open(config_path, "w") as f:
            json.dump(reduced_config, f, indent=2)
        print(f"Saved reduced config to {config_path}")
        print("Copying tokenizer files...")
        copy_tokenizer_files(original_model_name, output_path)
        print("Creating reduced safetensors files...")
        create_reduced_safetensors(original_config, reduced_config, output_path)
        print("Creating preprocessor config...")
        create_preprocessor_config(original_config, output_path)
        # Generation config is optional; a failure here is non-fatal.
        try:
            gen_config = GenerationConfig.from_pretrained(original_model_name)
            gen_config.save_pretrained(output_path)
            print("Copied generation config")
        except Exception as e:
            print(f"Could not copy generation config: {e}")
        print(f"Successfully created reduced Maverick model at {output_path}")
        return str(output_path)
    except Exception as e:
        print(f"Error creating reduced model: {e}")
        # Clean up on failure
        if output_path.exists():
            shutil.rmtree(output_path)
        raise
def create_reduced_config(
    original_config: Any, text_layers: int, num_experts: int, vision_layers: int
) -> dict[str, Any]:
    """Create a reduced configuration based on the original.

    Shrinks layer counts, expert count, and hidden dimensions of the
    text config, and the layer count of the vision config.
    """
    config_dict = original_config.to_dict()
    text_cfg = config_dict.get("text_config")
    if text_cfg is not None:
        # Fewer transformer layers (and matching layer_types prefix).
        original_text_layers = text_cfg["num_hidden_layers"]
        text_cfg["num_hidden_layers"] = text_layers
        text_cfg["layer_types"] = text_cfg["layer_types"][:text_layers]
        print(f"Reduced text layers from {original_text_layers} to {text_layers}")
        # Fewer MoE experts per layer.
        original_num_experts = text_cfg["num_local_experts"]
        text_cfg["num_local_experts"] = num_experts
        print(f"Reduced num experts from {original_num_experts} to {num_experts}")
        # Shrink hidden size and head dim by the same factor so the
        # head count stays consistent.
        hidden_dim_divisor = 4
        original_hidden_size = text_cfg["hidden_size"]
        new_hidden_size = original_hidden_size // hidden_dim_divisor
        text_cfg["hidden_size"] = new_hidden_size
        print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
        original_head_dim = text_cfg["head_dim"]
        new_head_dim = original_head_dim // hidden_dim_divisor
        text_cfg["head_dim"] = new_head_dim
        print(f"Reduced head dim from {original_head_dim} to {new_head_dim}")
    vision_cfg = config_dict.get("vision_config")
    if vision_cfg is not None:
        original_vision_layers = vision_cfg["num_hidden_layers"]
        vision_cfg["num_hidden_layers"] = vision_layers
        print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
    # Update model name to indicate it's a reduced version
    config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
    return config_dict
def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
    """Copy tokenizer files from the original model.

    Best-effort: a failure only prints a warning so model creation can
    continue.
    """
    try:
        AutoTokenizer.from_pretrained(
            original_model_name, trust_remote_code=True
        ).save_pretrained(output_path)
        print("Tokenizer files copied successfully")
    except Exception as e:
        print(f"Warning: Could not copy tokenizer files: {e}")
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
    """Create preprocessor_config.json for multimodal model.

    Unlike the tokenizer copy, a failure here is fatal and re-raised.
    """
    # Fall back to the canonical model name when the config carries none.
    source = (
        original_config._name_or_path
        or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
    )
    try:
        processor = AutoProcessor.from_pretrained(source, trust_remote_code=True)
        processor.save_pretrained(output_path)
        print("Copied original preprocessor config")
    except Exception as e:
        print(f"Could not copy original preprocessor config: {e}")
        raise
def create_reduced_safetensors(
    original_config: Any, reduced_config: dict[str, Any], output_path: Path
) -> None:
    """Create safetensors files with weights for the reduced model.

    Generates random/constant tensors shaped by the reduced config for the
    text tower, vision tower, and the vision-language connector, then
    shards and writes them with an index file.
    """
    print("Generating synthetic weights for reduced model...")
    text_config = reduced_config["text_config"]
    vision_config = reduced_config["vision_config"]
    weights = {}
    print("Creating text model weights...")
    weights.update(create_text_model_weights(text_config))
    print("Creating vision model weights...")
    weights.update(create_vision_model_weights(vision_config))
    print("Creating shared model weights...")
    weights.update(create_shared_weights(text_config, vision_config))
    print("Saving weights to safetensors files...")
    save_weights_to_safetensors(weights, output_path)
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the text model with MoE structure.

    All projection weights use the ``nn.Linear`` layout of
    ``(out_features, in_features)``.

    Args:
        text_config: The ``text_config`` section of the reduced model config.

    Returns:
        Mapping from checkpoint parameter name to synthetic tensor.
    """
    weights = {}
    vocab_size = text_config["vocab_size"]
    hidden_size = text_config["hidden_size"]
    intermediate_size = text_config["intermediate_size"]
    intermediate_size_mlp = text_config["intermediate_size_mlp"]
    num_layers = text_config["num_hidden_layers"]
    num_attention_heads = text_config["num_attention_heads"]
    num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
    # MoE specific parameters
    num_experts = text_config.get("num_local_experts")
    assert num_experts is not None, "num_local_experts must be specified for MoE"
    # NOTE(review): derived from hidden_size rather than the config's explicit
    # "head_dim" entry; the two coincide for configs produced by
    # create_reduced_config — confirm if used with other configs.
    head_dim = hidden_size // num_attention_heads
    # Feed-forward pattern based on interleave_moe_layer_step. For
    # interleave_moe_layer_step=2: layers 1,3,5,... are MoE and layers
    # 0,2,4,... are dense. Loop-invariant, so computed once up front.
    interleave_step = text_config.get("interleave_moe_layer_step", 1)
    # Embedding layers
    weights["language_model.model.embed_tokens.weight"] = torch.randn(
        vocab_size, hidden_size, dtype=torch.float16
    )
    # Transformer layers
    for layer_idx in range(num_layers):
        layer_prefix = f"language_model.model.layers.{layer_idx}"
        print(f"Creating weights for layer {layer_prefix}...")
        # Self-attention weights (separate q, k, v projections), all shaped
        # (out_features, in_features). Fixed: q_proj/k_proj were previously
        # generated transposed; with grouped KV heads
        # (num_key_value_heads < num_attention_heads) k_proj ended up with
        # the wrong shape entirely.
        weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
            num_attention_heads * head_dim, hidden_size, dtype=torch.bfloat16
        )
        weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
            num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
        )
        weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
            num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
        )
        weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
        )
        print("Self-attention weights created.")
        is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
        if is_moe_layer:
            # MoE layer structure
            # 1. Router weights
            weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
                num_experts, hidden_size, dtype=torch.float16
            )
            # 2. Individual expert weights (not fused)
            for expert_idx in range(num_experts):
                expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
                weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
                    intermediate_size, hidden_size, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
                    intermediate_size, hidden_size, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
                    hidden_size, intermediate_size, dtype=torch.bfloat16
                )
                # Expert weight scales (FP8 quantization)
                weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
                    intermediate_size, 1, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
                    intermediate_size, 1, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
                    hidden_size, 1, dtype=torch.bfloat16
                )
            # 3. Shared expert weights
            shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
            weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
                intermediate_size, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
                intermediate_size, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
                hidden_size, intermediate_size, dtype=torch.bfloat16
            )
            print(f"MoE feed-forward weights created for layer {layer_idx}.")
        else:
            # Dense layer structure (uses the wider MLP intermediate size)
            weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
                hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
            )
            print(f"Dense feed-forward weights created for layer {layer_idx}.")
        # Layer norms
        weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
            hidden_size, dtype=torch.bfloat16
        )
        weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
            hidden_size, dtype=torch.bfloat16
        )
        print("Layer norms created.")
    # Final layer norm and output projection
    weights["language_model.model.norm.weight"] = torch.ones(
        hidden_size, dtype=torch.bfloat16
    )
    weights["language_model.lm_head.weight"] = torch.randn(
        vocab_size, hidden_size, dtype=torch.bfloat16
    )
    return weights
def create_vision_model_weights(
    vision_config: dict[str, Any],
) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the vision model."""
    hidden = vision_config["hidden_size"]
    inter = vision_config["intermediate_size"]
    weights: dict[str, torch.Tensor] = {}
    for idx in range(vision_config["num_hidden_layers"]):
        prefix = f"vision_model.model.layers.{idx}"
        # Attention projections: square weight + zero bias for each of
        # q/k/v/o.
        for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
            weights[f"{prefix}.self_attn.{proj}.weight"] = torch.randn(
                hidden, hidden, dtype=torch.bfloat16
            )
            weights[f"{prefix}.self_attn.{proj}.bias"] = torch.zeros(
                hidden, dtype=torch.bfloat16
            )
        # Two-layer MLP: fc1 expands, fc2 contracts.
        weights[f"{prefix}.mlp.fc1.weight"] = torch.randn(
            inter, hidden, dtype=torch.bfloat16
        )
        weights[f"{prefix}.mlp.fc1.bias"] = torch.zeros(inter, dtype=torch.bfloat16)
        weights[f"{prefix}.mlp.fc2.weight"] = torch.randn(
            hidden, inter, dtype=torch.bfloat16
        )
        weights[f"{prefix}.mlp.fc2.bias"] = torch.zeros(hidden, dtype=torch.bfloat16)
        # Layer norms: identity weight, zero bias.
        for norm in ("input_layernorm", "post_attention_layernorm"):
            weights[f"{prefix}.{norm}.weight"] = torch.ones(
                hidden, dtype=torch.bfloat16
            )
            weights[f"{prefix}.{norm}.bias"] = torch.zeros(
                hidden, dtype=torch.bfloat16
            )
    return weights
def create_shared_weights(
    text_config: dict[str, Any], vision_config: dict[str, Any]
) -> dict[str, torch.Tensor]:
    """Create weights for shared components (vision-language connector)"""
    # Single linear projection from vision features into the text
    # embedding space.
    return {
        "multi_modal_projector.linear_1.weight": torch.randn(
            text_config["hidden_size"],
            vision_config["projector_input_dim"],
            dtype=torch.bfloat16,
        )
    }
def save_weights_to_safetensors(
    weights: dict[str, torch.Tensor], output_path: Path
) -> None:
    """Save weights to safetensors files and create index.

    Tensors are greedily packed into shards of at most 5GB, written either
    as a single ``model.safetensors`` or as numbered shards, and a
    ``model.safetensors.index.json`` mapping each weight to its file is
    emitted alongside them.
    """
    # Determine how to shard the weights
    max_shard_size = 5 * 1024 * 1024 * 1024  # 5GB per shard
    # Calculate sizes and create shards
    shards = []
    current_shard: dict[str, torch.Tensor] = {}
    current_size = 0
    for name, tensor in weights.items():
        tensor_size = tensor.numel() * tensor.element_size()
        if current_size + tensor_size > max_shard_size and current_shard:
            shards.append(current_shard)
            current_shard = {}
            current_size = 0
        current_shard[name] = tensor
        current_size += tensor_size
    if current_shard:
        shards.append(current_shard)
    # Save shards and create index
    weight_map = {}
    if len(shards) == 1:
        # Single file
        filename = "model.safetensors"
        save_file(shards[0], output_path / filename)
        weight_map = {name: filename for name in shards[0]}
        # Fixed: previously printed a literal "(unknown)" placeholder
        # instead of the written file.
        print(f"Saved weights to single file: {output_path / filename}")
    else:
        # Multiple shards
        for i, shard in enumerate(shards):
            filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
            save_file(shard, output_path / filename)
            for name in shard:
                weight_map[name] = filename
            print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
    # Create index file
    index_data = {
        "metadata": {
            "total_size": sum(
                tensor.numel() * tensor.element_size() for tensor in weights.values()
            )
        },
        "weight_map": weight_map,
    }
    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as f:
        json.dump(index_data, f, indent=2)
    print(f"Created index file: {index_path}")
    print(
        f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
    )
def check_attention_spec_interleaved_rope(
    llm: LLM,
    num_attention_layers: int,
    num_ranks: int,
    rope_layers: list[int],
):
    """Check that the attention spec is correct.

    Args:
        llm: Engine under test; must be run with V1 multiprocessing
            disabled so the in-process model executor is reachable.
        num_attention_layers: Expected number of attention layers per rank.
        num_ranks: Number of ranks whose KV-cache specs are inspected.
        rope_layers: Per-layer flags from the config's ``no_rope_layers``
            (see get_rope_layers_config): 0 = no RoPE, nonzero = RoPE.
    """
    assert isinstance(llm.llm_engine.model_executor, Executor)
    kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
    for rank in range(num_ranks):
        kv_cache_specs = kv_cache_specs_per_rank[rank]
        assert len(kv_cache_specs.keys()) == num_attention_layers
        for i in range(num_attention_layers):
            # Layers without RoPE must get a full-attention KV spec; RoPE
            # layers must get chunked local attention.
            if rope_layers[i] == 0:
                expected_spec = FullAttentionSpec
            else:
                expected_spec = ChunkedLocalAttentionSpec
            assert isinstance(
                kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
                expected_spec,
            )
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
    """Smoke-test generation on the reduced model, optionally under profiling."""
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
    if should_profile:
        llm.start_profile()
    outputs = llm.generate(PROMPTS, params)
    if should_profile:
        llm.stop_profile()
    print("Test generation successful!")
    # Echo each prompt/completion pair for debugging CI logs.
    for completion in outputs:
        print(f"Prompt: {completion.prompt}")
        print(f"Output: {completion.outputs[0].text}")
        print("-" * 40)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "original_model_name,text_layers,num_experts,vision_layers,",
    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
    monkeypatch,
    original_model_name: str,
    text_layers: int,
    num_experts: int,
    vision_layers: int,
    enforce_eager: bool,
    tp: int,
    ep: bool,
    output_dir: str = "/tmp/reduced_maverick",
    force_recreate: bool = True,
    profile: bool = False,
) -> None:
    """Create a shrunken Maverick checkpoint, load it with TP/EP enabled, and
    verify both the per-layer KV-cache specs and basic generation.
    """
    # Disable multiprocessing allows us to access model executor from LLM engine
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    model_path = create_reduced_maverick_model(
        original_model_name=original_model_name,
        output_dir=output_dir,
        text_layers=text_layers,
        num_experts=num_experts,
        vision_layers=vision_layers,
        force_recreate=force_recreate,
    )
    print(f"\nReduced model created successfully at: {model_path}")
    # Per-layer RoPE flags: 0 => full attention, else chunked local attention
    # (see check_attention_spec_interleaved_rope).
    rope_layers = get_rope_layers_config(model_path)
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        max_model_len=512,  # Small context for testing
        gpu_memory_utilization=0.3,  # Conservative memory usage
        enforce_eager=enforce_eager,
        tensor_parallel_size=tp,
        enable_expert_parallel=ep,
    )
    check_attention_spec_interleaved_rope(
        llm,
        text_layers,
        tp,
        rope_layers,
    )
    print(f"\nTesting reduced model at {model_path}...")
    run_reduced_model(llm=llm, should_profile=profile)
def main():
    """CLI entry point: create the reduced Maverick model and optionally test it."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Create a reduced-layer Maverick model"
    )
    parser.add_argument(
        "--output-dir",
        default="/tmp/reduced_maverick",
        help="Output directory for the reduced model",
    )
    parser.add_argument(
        "--text-layers",
        type=int,
        default=4,
        help="Number of text transformer layers",
    )
    parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
    parser.add_argument(
        "--vision-layers",
        type=int,
        default=2,
        help="Number of vision transformer layers",
    )
    parser.add_argument(
        "--force-recreate",
        action="store_true",
        help="Force recreation if output directory exists",
    )
    parser.add_argument(
        "--test", action="store_true", help="Test the created model with vLLM"
    )
    parser.add_argument(
        "--profile", action="store_true", help="Profile the created model with vLLM"
    )
    parser.add_argument(
        "--test-original",
        action="store_true",
        help="Test the original model with vLLM",
    )
    parser.add_argument(
        "--original-model",
        default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        help="Original model name to base the reduction on",
    )
    args = parser.parse_args()
    if args.test:
        # BUG FIX: test_dummy_maverick requires a positional `monkeypatch`
        # argument; the previous call omitted it and raised TypeError when the
        # script was run with --test. Build a standalone MonkeyPatch so the
        # env-var change is undone afterwards.
        monkeypatch = pytest.MonkeyPatch()
        try:
            test_dummy_maverick(
                monkeypatch,
                original_model_name=args.original_model,
                output_dir=args.output_dir,
                text_layers=args.text_layers,
                num_experts=args.num_experts,
                vision_layers=args.vision_layers,
                force_recreate=args.force_recreate,
                tp=2,
                ep=True,
                enforce_eager=True,
                profile=args.profile,
            )
        finally:
            monkeypatch.undo()
    if args.test_original:
        run_maverick_serving(args.original_model)


if __name__ == "__main__":
    exit(main())

View File

@@ -0,0 +1,180 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
from typing import Any, NamedTuple
import pytest
from huggingface_hub import hf_hub_download
from pytest import MarkDecorator
from transformers import AutoModelForImageTextToText
from tests.quantization.utils import is_quant_method_supported
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.utils.torch_utils import set_default_torch_num_threads
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
from ...utils import check_logprobs_close
class GGUFMMTestConfig(NamedTuple):
    """Configuration for one GGUF multimodal model comparison test."""

    original_model: str  # HF repo of the reference (non-GGUF) model
    gguf_repo: str  # HF repo hosting the GGUF artifacts
    gguf_backbone: str  # filename of the language-model GGUF file
    gguf_mmproj: str  # filename of the multimodal projector GGUF file
    prompt: list[str]
    image_names: list[str]  # Store names, load PIL images at runtime
    max_model_len: int = 4096
    # NOTE(review): mutable defaults are shared across instances; appears
    # harmless here since these fields are never mutated in the tests below.
    marks: list[MarkDecorator] = []
    mm_processor_kwargs: dict[str, Any] = {}

    @property
    def gguf_model(self):
        """Download mmproj and backbone from the hub; return the backbone path."""
        # Download the projector first so it is cached next to the backbone.
        hf_hub_download(self.gguf_repo, filename=self.gguf_mmproj)
        return hf_hub_download(self.gguf_repo, filename=self.gguf_backbone)
# Common prompts aligned with test_common.py "gemma3" entry format
_GEMMA3_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": (
            "<bos><start_of_turn>user\n"
            "<start_of_image>What's the content in the center of the image?"
            "<end_of_turn>\n<start_of_turn>model\n"
        ),
        "cherry_blossom": (
            "<bos><start_of_turn>user\n"
            "<start_of_image>What is the season?"
            "<end_of_turn>\n<start_of_turn>model\n"
        ),
    }
)
# Image asset names - load at runtime to avoid pickle issues with subprocess
_GEMMA3_IMAGE_NAMES = ["stop_sign", "cherry_blossom"]
# Regular multimodal (no pan-and-scan) - uses QAT Q4_0 GGUF
GEMMA3_CONFIG = GGUFMMTestConfig(
    original_model="google/gemma-3-4b-it",
    gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf",
    gguf_backbone="gemma-3-4b-it-q4_0.gguf",
    gguf_mmproj="mmproj-model-f16-4B.gguf",
    prompt=_GEMMA3_PROMPTS,
    image_names=_GEMMA3_IMAGE_NAMES,
    max_model_len=4096,
    marks=[pytest.mark.core_model],
    mm_processor_kwargs={},
)
# Pan-and-scan multimodal - uses unquantized BF16 GGUF
GEMMA3_CONFIG_PAN_AND_SCAN = GGUFMMTestConfig(
    original_model="google/gemma-3-4b-it",
    gguf_repo="unsloth/gemma-3-4b-it-GGUF",
    gguf_backbone="gemma-3-4b-it-BF16.gguf",
    gguf_mmproj="mmproj-BF16.gguf",
    prompt=_GEMMA3_PROMPTS,
    image_names=_GEMMA3_IMAGE_NAMES,
    max_model_len=4096,
    marks=[pytest.mark.core_model],
    mm_processor_kwargs={"do_pan_and_scan": True},
)
# All configs exercised by test_gemma3_mm_gguf below.
MODELS_TO_TEST = [GEMMA3_CONFIG, GEMMA3_CONFIG_PAN_AND_SCAN]
def run_multimodal_gguf_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    """Compare greedy logprobs of a GGUF checkpoint (vLLM) against HF."""
    # Load images at runtime (inside subprocess) to avoid pickle issues.
    assets = [ImageAsset(name).pil_image for name in model.image_names]
    scale_factors = [0.25, 0.5, 1.0]
    # One case per image: the same prompt repeated at several image scales.
    test_cases = []
    for asset, prompt in zip(assets, model.prompt):
        scaled = [rescale_image_size(asset, factor) for factor in scale_factors]
        test_cases.append(([prompt] * len(scale_factors), scaled))
    # NOTE: Run vLLM first to avoid CUDA init issues with multiprocessing fork.
    with (
        set_default_torch_num_threads(1),
        vllm_runner(
            model_name=model.gguf_model,
            enforce_eager=True,
            tokenizer_name=model.original_model,
            dtype=dtype,
            max_model_len=model.max_model_len,
            mm_processor_kwargs=model.mm_processor_kwargs,
        ) as gguf_model,
    ):
        gguf_outputs_per_case = []
        for prompts, case_images in test_cases:
            gguf_outputs_per_case.append(
                gguf_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                )
            )
    # Then run HfRunner for the HuggingFace baseline comparison.
    with hf_runner(
        model.original_model,
        dtype=dtype,
        auto_cls=AutoModelForImageTextToText,
    ) as hf_model:
        hf_outputs_per_case = []
        for prompts, case_images in test_cases:
            hf_outputs_per_case.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                )
            )
    for hf_outputs, gguf_outputs in zip(hf_outputs_per_case, gguf_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=gguf_outputs,
            name_0="hf",
            name_1="gguf",
        )
@pytest.mark.skipif(
    not is_quant_method_supported("gguf"),
    reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
    "model",
    [
        pytest.param(test_config, marks=test_config.marks)
        for test_config in MODELS_TO_TEST
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
def test_gemma3_mm_gguf(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Gemma-3 GGUF multimodal checkpoints should match the HF baseline."""
    run_multimodal_gguf_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        model=model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
    )

View File

@@ -0,0 +1,317 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Sequence
import librosa
import pytest
import regex as re
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from vllm.assets.image import ImageAsset
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode, rescale_image_size
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
        "cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
    }
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
    "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
)
# NOTE: downloads the full checkpoint at import time (requires network access).
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
# Example audio clip shipped inside the checkpoint, used as the spoken question.
speech_question = os.path.join(
    model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
):
    """Sanitize vllm output to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output
    # Strip the image placeholder tokens vLLM echoes back in the text.
    stripped = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
    assert stripped[0] == " "
    stripped = stripped[1:]
    hf_output_str = stripped + "<|end|><|endoftext|>"
    tokenizer = AutoTokenizer.from_pretrained(model)
    token_ids = tokenizer.encode(stripped)
    # HF prepends a BOS token (id 1); drop it before comparison.
    assert token_ids[0] == 1
    return token_ids[1:], hf_output_str, out_logprobs
# dtype used for both the HF baseline and the vLLM run in these tests
target_dtype = "half"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Inference result should be the same between hf and vllm.
    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=max_model_len,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=320,
        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
        enforce_eager=True,
    ) as vllm_model:
        # The vision adapter ships as a separate LoRA inside the checkpoint.
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                lora_request=lora_request,
            )
            for prompts, images, audios in inputs
        ]
    # This error occurs inside `get_peft_model`
    # FIXME: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/75
    # NOTE(review): this unconditional skip makes the HF comparison below
    # unreachable until the upstream issue is resolved.
    pytest.skip("HF impl is not compatible with current transformers")
    hf_model_kwargs = {"_attn_implementation": "sdpa"}
    with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
        hf_processor = hf_model.processor
        eos_token_id = hf_processor.tokenizer.eos_token_id

        def patch_hf_processor(
            *args, text="", images=None, audio=None, sampling_rate=None, **kwargs
        ):
            # Adapt HfRunner's (audio, sampling_rate) kwargs to the processor's
            # expected `audios` list-of-pairs format.
            audios = None
            if audio is not None and sampling_rate is not None:
                audios = [(audio, sampling_rate)]
            return hf_processor(
                *args, text=text, images=images, audios=audios, **kwargs
            )

        hf_model.processor = patch_hf_processor
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                eos_token_id=eos_token_id,
                num_logits_to_keep=0,
            )
            for prompts, images, audios in inputs
        ]
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Single-image Phi-4-MM: compare vLLM against the HF baseline."""
    pil_images = [asset.pil_image for asset in image_assets]
    # One case per asset: the same prompt repeated once per size factor.
    cases = []
    for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        cases.append(
            (
                [prompt] * len(size_factors),
                [rescale_image_size(image, factor) for factor in size_factors],
                None,  # no audio input for this test
            )
        )
    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        # [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Multi-image Phi-4-MM: both assets in one prompt, at several scales."""
    pil_images = [asset.pil_image for asset in image_assets]
    # For each scale factor, rescale *all* assets and feed them together.
    image_sets = []
    for factor in size_factors:
        image_sets.append(
            [rescale_image_size(image, factor) for image in pil_images]
        )
    cases = [
        (
            [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors),
            image_sets,
            None,  # no audio input for this test
        ),
    ]
    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(
    hf_runner,
    vllm_runner,
    model,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Combined image + speech input: compare vLLM against the HF baseline."""
    # use the example speech question so that the model outputs are reasonable
    audio = librosa.load(speech_question, sr=None)
    image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    prompt = "<|user|><|image_1|><|audio_1|><|end|><|assistant|>"
    cases = [([prompt], [image], [audio])]
    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )

View File

@@ -0,0 +1,211 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from dataclasses import asdict
from typing import TYPE_CHECKING, Any
import pytest
from mistral_common.multimodal import download_image
from mistral_common.protocol.instruct.chunk import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
from transformers import AutoProcessor
from vllm import SamplingParams, TextPrompt, TokensPrompt
from vllm.logprobs import Logprob, SampleLogprobs
from vllm.multimodal import MultiModalDataBuiltins
from vllm.platforms import current_platform
from ....utils import VLLM_PATH, large_gpu_test
from ...utils import check_logprobs_close
if TYPE_CHECKING:
from _typeshed import StrPath
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
# Asset filenames served by the local_asset_server fixture; the originals live
# under https://huggingface.co/datasets/Isotr0py/mistral-test-images
IMG_URLS = [
    "237-400x300.jpg",
    "231-200x300.jpg",
    "27-500x500.jpg",
    "17-150x600.jpg",
]
PROMPT = "Describe each image in one short sentence."


def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    """Build a mistral-style user message: one text chunk then one image_url
    chunk per URL."""
    content: list[dict[str, Any]] = [{"type": "text", "text": PROMPT}]
    content.extend(
        {"type": "image_url", "image_url": {"url": url}} for url in urls
    )
    return [{"role": "user", "content": content}]
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    """Build an HF-style user message with pre-downloaded PIL images."""
    content: list[dict[str, Any]] = [{"type": "text", "content": PROMPT}]
    for url in urls:
        content.append({"type": "image", "image": download_image(url)})
    return [{"role": "user", "content": content}]
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    """Tokenize a Pixtral chat request and attach its images as MM data."""
    request = ChatCompletionRequest(messages=_create_msg_format(urls))  # type: ignore[type-var]
    tokenizer = MistralTokenizer.from_model("pixtral")
    encoded = tokenizer.encode_chat_completion(request)
    # Collect the PIL images back out of the parsed request chunks.
    images = [
        image_from_chunk(chunk)
        for chunk in request.messages[0].content
        if isinstance(chunk, ImageURLChunk)
    ]
    prompt = TokensPrompt(prompt_token_ids=encoded.tokens)
    prompt["multi_modal_data"] = MultiModalDataBuiltins(image=images)
    return prompt
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    """Render an HF chat template and attach the downloaded images as MM data."""
    msg = _create_msg_format_hf(urls)
    processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
    rendered = processor.apply_chat_template(msg)
    images = [
        chunk["image"] for chunk in msg[0]["content"] if chunk["type"] == "image"
    ]
    return TextPrompt(
        prompt=rendered, multi_modal_data=MultiModalDataBuiltins(image=images)
    )
# Greedy decoding with logprobs so outputs can be compared against fixtures.
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4)
MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
# Fail fast at import time if the golden-output fixtures are missing.
assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = {
    PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
    MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}
# (token_ids, text, per-step logprobs) tuples as produced by the runners.
OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]]
# For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
) -> None:
    """Serialize (token_ids, text, logprobs) tuples to *filename* as JSON."""
    serializable = []
    for tokens, text, logprobs in outputs:
        # A missing logprobs entry (None) is stored as an empty list.
        step_dicts = [
            {token_id: asdict(lp) for token_id, lp in step.items()}
            for step in (logprobs or [])
        ]
        serializable.append((tokens, text, step_dicts))
    with open(filename, "w") as f:
        json.dump(serializable, f)
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    """Load golden outputs written by `_dump_outputs_w_logprobs`."""
    with open(filename, "rb") as f:
        raw = json.load(f)
    loaded: OutputsLogprobs = []
    for tokens, text, logprobs in raw:
        # JSON stringifies dict keys; restore int token ids and Logprob objects.
        restored = [
            {int(token_id): Logprob(**fields) for token_id, fields in step.items()}
            for step in logprobs
        ]
        loaded.append((tokens, text, restored))
    return loaded
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
    vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
) -> None:
    """Chat outputs must match the stored golden logprobs fixture."""
    large_mistral_small = model == MISTRAL_SMALL_3_1_ID and max_model_len == 65536
    if large_mistral_small and current_platform.is_rocm():
        pytest.skip(
            "OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
        )
    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
    with vllm_runner(
        model,
        dtype=dtype,
        tokenizer_mode="mistral",
        load_format="mistral",
        config_format="mistral",
        max_model_len=max_model_len,
        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        urls_all = [local_asset_server.url_for(u) for u in IMG_URLS]
        # One, two, then all four images in a single chat request.
        outputs = []
        for msg in (
            _create_msg_format(urls_all[:1]),
            _create_msg_format(urls_all[:2]),
            _create_msg_format(urls_all),
        ):
            outputs.extend(vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS))
        logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
    # Remove last `None` prompt_logprobs to compare with fixture
    trimmed = []
    for entry in logprobs:
        assert entry[-1] is None
        trimmed.append(entry[:-1])
    check_logprobs_close(
        outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
        outputs_1_lst=trimmed,
        name_0="h100_ref",
        name_1="output",
    )

View File

@@ -0,0 +1,148 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.multimodal.video import sample_frames_from_video
from ....conftest import VIDEO_ASSETS
models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
target_dtype = "bfloat16"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"


def qwen2_5_vl_chat_template(*query):
    """Wrap the given query fragments in the Qwen2.5-VL chat template."""
    system = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user = "<|im_start|>user\n" + "".join(query) + "<|im_end|>"
    return system + user + "<|im_start|>assistant\n"
# Prompt per video asset, rendered with the Qwen2.5-VL chat template above.
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
    {
        "baby_reading": qwen2_5_vl_chat_template(
            VIDEO_PLACEHOLDER,
            "Describe this video with a short sentence ",
            "(no more than 20 words)",
        ),
    }
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_functionality(
    vllm_runner,
    video_assets,
    model,
    video_pruning_rate: float,
    num_frames: int,
    dtype: str,
    max_tokens: int,
    use_bytecode_hook: bool,
    monkeypatch,
) -> None:
    """Test EVS (Efficient Video Sampling) functionality with different
    pruning rates.
    """
    # Set the environment variable for this test
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
    # Sample a fixed number of frames from each video asset.
    frame_sets = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
    prompts = [VIDEO_PROMPTS[0]]
    videos = [frame_sets[0]]
    # Initialize model with EVS configuration
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        dtype=dtype,
        limit_mm_per_prompt={"video": 1},
        video_pruning_rate=video_pruning_rate,
    ) as vllm_model:
        # Generation must complete without crashing.
        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
    # Basic sanity checks on the single response.
    assert len(outputs) == 1
    token_ids, completion = outputs[0]
    assert len(token_ids) > 0
    assert len(completion) > 0
    assert isinstance(completion, str)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_batched_videos(
    vllm_runner,
    video_assets,
    model,
    video_pruning_rate: float,
    num_frames: int,
    dtype: str,
    max_tokens: int,
    use_bytecode_hook: bool,
    monkeypatch,
) -> None:
    """Test EVS functionality with batched videos.

    This test validates that:
    1. The model handles batched video inputs correctly with EVS
    2. Both pruning configurations work with multiple videos
    3. The model doesn't crash when processing multiple videos simultaneously
    """
    # Set the environment variable for this test
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
    # Sample a fixed number of frames from each video asset.
    frame_sets = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
    # Use the same prompt/video twice to form a batch of two requests.
    prompts = [VIDEO_PROMPTS[0]] * 2
    videos = [frame_sets[0]] * 2
    # Initialize model with EVS configuration
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"video": 2},
        tensor_parallel_size=1,
        video_pruning_rate=video_pruning_rate,
    ) as vllm_model:
        # Generation must complete without crashing.
        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
    # Both requests must produce non-empty string output.
    assert len(outputs) == 2
    for token_ids, completion in outputs:
        assert len(token_ids) > 0
        assert len(completion) > 0
        assert isinstance(completion, str)

View File

@@ -0,0 +1,473 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, TypedDict
import numpy.typing as npt
import pytest
import torch
from PIL import Image
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
from ....conftest import (
IMAGE_ASSETS,
VIDEO_ASSETS,
PromptImageInput,
PromptVideoInput,
VllmRunner,
)
from ...utils import check_logprobs_close
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""
    # Opt in to pickle-based serialization for every test in this module.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
models = ["Qwen/Qwen2-VL-2B-Instruct"]
target_dtype = "half"
IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
MODEL_HIDDEN_SIZE = 1536


def qwen2_vl_chat_template(*query):
    """Wrap the given query fragments in the Qwen2-VL chat template."""
    system = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user = "<|im_start|>user\n" + "".join(query) + "<|im_end|>"
    return system + user + "<|im_start|>assistant\n"
# Per-asset prompts rendered with the Qwen2-VL chat template above.
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": qwen2_vl_chat_template(
            IMAGE_PLACEHOLDER,
            "What is the biggest text's content in this image?",
        ),
        "cherry_blossom": qwen2_vl_chat_template(
            IMAGE_PLACEHOLDER,
            "What is the season shown in this image? ",
            "Reply with a short sentence (no more than 20 words)",
        ),
    }
)
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
    {
        "baby_reading": qwen2_vl_chat_template(
            VIDEO_PLACEHOLDER,
            "Describe this video with a short sentence ",
            "(no more than 20 words)",
        ),
    }
)
# Two image placeholders: both assets are fed in a single prompt.
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
    IMAGE_PLACEHOLDER,
    IMAGE_PLACEHOLDER,
    "Describe these two images separately. ",
    "For each image, reply with a short sentence ",
    "(no more than 10 words).",
)
class Qwen2VLPromptImageEmbeddingInput(TypedDict):
    """Pre-computed image embedding input for a Qwen2-VL prompt."""

    # flattened visual embeddings for all images in the prompt
    image_embeds: torch.Tensor
    # one (t, h, w) patch-grid row per image
    image_grid_thw: torch.Tensor
class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
    """Pre-computed video embedding input for a Qwen2-VL prompt."""

    # flattened visual embeddings for all videos in the prompt
    video_embeds: torch.Tensor
    # one (t, h, w) patch-grid row per video
    video_grid_thw: torch.Tensor
def batch_make_image_embeddings(
    image_batches: list[Image.Image | list[Image.Image]],
    processor,
    llm: VllmRunner,
) -> list[Qwen2VLPromptImageEmbeddingInput]:
    """batched image embeddings for Qwen2-VL

    This will infer all images' embeddings in a single batch,
    and split the result according to input batches.

    image_batches:
    - Single-image batches: `list[Image.Image]`
    - Multiple-image batches: `list[list[Image.Image]]]`

    returns: `list[Qwen2VLPromptImageEmbeddingInput]`
    """
    image_batches_: list[Any] = image_batches[:]
    # convert single-image batches to multiple-image batches
    for idx in range(len(image_batches_)):
        if not isinstance(image_batches_[idx], list):
            image_batches_[idx] = [image_batches_[idx]]

        assert isinstance(image_batches_[idx], list)

    # append all images into a list (as a batch)
    images: list[Image.Image] = []
    for image_batch in image_batches_:
        images += image_batch

    # image to pixel values
    image_processor = processor.image_processor

    preprocess_result = image_processor.preprocess(
        images=images, return_tensors="pt"
    ).data
    pixel_values = preprocess_result["pixel_values"]
    image_grid_thw = preprocess_result["image_grid_thw"]

    # pixel values to embeddings & grid_thws
    def get_image_embeds(model):
        # Executed via llm.apply_model on the worker holding the real model.
        with torch.no_grad():
            visual = model.visual

            pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
            return visual(pixel_values_on_device, grid_thw=image_grid_thw).cpu()

    image_embeds = torch.concat(llm.apply_model(get_image_embeds))

    # split into original batches
    result: list[Qwen2VLPromptImageEmbeddingInput] = []
    image_counter = 0
    embed_counter = 0
    for image_batch in image_batches_:
        cur_batch_image_count = len(image_batch)
        merge_size = image_processor.merge_size
        # Each image yields prod(grid_thw) patches, merged merge_size**2
        # patches per embedding row; sum gives this batch's embedding span.
        cur_batch_embed_len = sum(
            grid_thw.prod(-1) // merge_size // merge_size
            for grid_thw in image_grid_thw[
                image_counter : image_counter + cur_batch_image_count
            ]
        )

        result.append(
            {
                "image_embeds": image_embeds[
                    embed_counter : embed_counter + cur_batch_embed_len
                ],
                "image_grid_thw": image_grid_thw[
                    image_counter : image_counter + cur_batch_image_count
                ],
            }
        )

        embed_counter += cur_batch_embed_len
        image_counter += cur_batch_image_count

    # ensure we don't lose any images or embeddings
    assert embed_counter == image_embeds.size(0)
    assert image_counter == image_grid_thw.size(0)
    assert len(image_batches) == len(result)

    return result
def batch_make_video_embeddings(
    video_batches: PromptVideoInput, processor, llm: VllmRunner
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
    """batched video embeddings for Qwen2-VL

    A NDArray represents a single video's all frames.

    This will infer all videos' embeddings in a single batch,
    and split the result according to input batches.

    video_batches:
    - Single-video batches: `list[NDArray]`
    - Multiple-video batches: `list[list[NDArray]]`
    """
    video_batches_: list[Any] = video_batches[:]

    # convert single-video batches to multiple-video batches
    for idx in range(len(video_batches_)):
        if not isinstance(video_batches_[idx], list):
            single_video_batch: list[npt.NDArray] = [video_batches_[idx]]
            video_batches_[idx] = single_video_batch

        assert isinstance(video_batches_[idx], list)

    # append all videos into a list (as a batch)
    videos: list[npt.NDArray] = []
    for video_batch in video_batches_:
        videos += video_batch

    # video to pixel values
    image_processor = processor.image_processor

    preprocess_result = image_processor.preprocess(
        images=None, videos=videos, return_tensors="pt"
    ).data
    pixel_values = preprocess_result["pixel_values_videos"]
    video_grid_thw = preprocess_result["video_grid_thw"]

    # pixel values to embeddings & grid_thws
    def get_image_embeds(model):
        # Executed via llm.apply_model on the worker holding the real model.
        with torch.no_grad():
            visual = model.visual

            pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
            return visual(pixel_values_on_device, grid_thw=video_grid_thw).cpu()

    video_embeds = torch.concat(llm.apply_model(get_image_embeds))

    # split into original batches
    result: list[Qwen2VLPromptVideoEmbeddingInput] = []
    video_counter = 0
    embed_counter = 0
    for video_batch in video_batches_:
        cur_batch_video_count = len(video_batch)
        merge_size = image_processor.merge_size
        # Each video yields prod(grid_thw) patches, merged merge_size**2
        # patches per embedding row; sum gives this batch's embedding span.
        cur_batch_embed_len = sum(
            grid_thw.prod(-1) // merge_size // merge_size
            for grid_thw in video_grid_thw[
                video_counter : video_counter + cur_batch_video_count
            ]
        )

        result.append(
            {
                "video_embeds": video_embeds[
                    embed_counter : embed_counter + cur_batch_embed_len
                ],
                "video_grid_thw": video_grid_thw[
                    video_counter : video_counter + cur_batch_video_count
                ],
            }
        )

        embed_counter += cur_batch_embed_len
        video_counter += cur_batch_video_count

    # ensure we don't lose any videos or embeddings
    assert embed_counter == video_embeds.size(0)
    assert video_counter == video_grid_thw.size(0)
    assert len(video_batches) == len(result)

    return result
def run_embedding_input_test(
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Inference result should be the same between
    original image/video input and image/video embeddings input.

    Runs greedy decoding twice on the same engine instance — once with raw
    images/videos and once with precomputed embeddings — and asserts the
    logprobs are close.
    """
    # NOTE(review): AutoProcessor IS used below; the noqa F401 looks stale.
    from transformers import AutoProcessor  # noqa: F401
    processor = AutoProcessor.from_pretrained(model)
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        max_num_seqs=3,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        default_torch_num_threads=1,
        enable_mm_embeds=True,
    ) as vllm_model:
        # Baseline: feed the raw images/videos directly.
        outputs_per_case_for_original_input = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images or None,
                videos=videos or None,
            )
            for prompts, images, videos in inputs
        ]
        # Same prompts, but with precomputed multimodal embeddings.
        outputs_per_case_for_embeddings_input = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=batch_make_image_embeddings(images, processor, vllm_model)
                if images
                else None,
                videos=batch_make_video_embeddings(videos, processor, vllm_model)
                if videos
                else None,
            )
            for prompts, images, videos in inputs
        ]
    # Compare the two runs case by case.
    for outputs_for_original_input, outputs_for_embeddings_input in zip(
        outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
    ):
        check_logprobs_close(
            outputs_0_lst=outputs_for_original_input,
            outputs_1_lst=outputs_for_embeddings_input,
            name_0="original_input",
            name_1="embeddings_input",
        )
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [0.5],
        # Single-scale, batched
        [0.5, 0.5],
        # Multi-scale
        [0.25, 0.5, 0.5],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_image_embeddings_input(
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype,
    max_tokens,
    num_logprobs,
    monkeypatch,  # NOTE(review): not used in the body — confirm it can be dropped
) -> None:
    """Single-image-per-prompt: raw image vs. image-embedding input parity."""
    images = [asset.pil_image for asset in image_assets]
    # One case per asset: each prompt repeated per size factor, each with a
    # rescaled copy of the same image; no videos.
    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
        (
            [prompt for _ in size_factors],
            [rescale_image_size(image, factor) for factor in size_factors],
            [],
        )
        for image, prompt in zip(images, IMAGE_PROMPTS)
    ]
    run_embedding_input_test(
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        [],
        # Single-scale
        [0.5],
        # Single-scale, batched
        [0.5, 0.5],
        # Multi-scale
        [0.25, 0.5, 0.5],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_multiple_image_embeddings_input(
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Multi-image-per-prompt: raw images vs. image-embedding input parity."""
    images = [asset.pil_image for asset in image_assets]
    # Single case: for each size factor, one prompt carrying ALL assets
    # rescaled by that factor; no videos.
    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
        (
            [MULTIIMAGE_PROMPT for _ in size_factors],
            [
                [rescale_image_size(image, factor) for image in images]
                for factor in size_factors
            ],
            [],
        )
    ]
    run_embedding_input_test(
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [0.5],
        # Single-scale, batched
        [0.5, 0.5],
        # Multi-scale
        [0.25, 0.25, 0.5],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_video_embeddings_input(
    vllm_runner,
    video_assets,
    model,
    size_factors,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Video prompts: raw video frames vs. video-embedding input parity."""
    # Subsample each asset to a fixed number of frames before rescaling.
    num_frames = 4
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
    # One case per asset: each prompt repeated per size factor with the
    # video rescaled accordingly; no images.
    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
        (
            [prompt for _ in size_factors],
            [],
            [rescale_video_size(video, factor) for factor in size_factors],
        )
        for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
    ]
    run_embedding_input_test(
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )

View File

@@ -0,0 +1,185 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from typing import Any
import numpy as np
import pytest
import pytest_asyncio
from transformers import AutoTokenizer
from ....conftest import AUDIO_ASSETS, AudioTestAssets, VllmRunner
from ....utils import RemoteOpenAIServer
from ...registry import HF_EXAMPLE_MODELS
# Model under test and the shared prompt/placeholder constants.
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
    {
        "mary_had_lamb": "Transcribe this into English.",
        "winning_call": "What is happening in this audio clip?",
    }
)
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
# (audio samples, sample rate) pair as produced by the audio assets.
AudioTuple = tuple[np.ndarray, int]
VLLM_PLACEHOLDER = "<|audio|>"
HF_PLACEHOLDER = "<|audio|>"
# Engine kwargs that force chunked prefill with a tiny token budget.
CHUNKED_PREFILL_KWARGS = {
    "enable_chunked_prefill": True,
    "max_num_seqs": 2,
    # Use a very small limit to exercise chunked prefill.
    "max_num_batched_tokens": 16,
}
def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
    """Translate a kwargs mapping into an equivalent list of CLI flags.

    Boolean values become bare flags (emitted only when True); every other
    value becomes a ``--key=value`` pair. Underscores in keys are rewritten
    as dashes to match CLI conventions.
    """
    cli_args: list[str] = []
    for name, val in params_kwargs.items():
        flag = f"--{name.replace('_', '-')}"
        if isinstance(val, bool):
            if val:
                cli_args.append(flag)
        else:
            cli_args.append(f"{flag}={val}")
    return cli_args
@pytest.fixture(
    params=[
        # Default engine config vs. aggressive chunked prefill.
        pytest.param({}, marks=pytest.mark.cpu_model),
        pytest.param(CHUNKED_PREFILL_KWARGS),
    ]
)
def server(request, audio_assets: AudioTestAssets):
    """Spin up a remote OpenAI-compatible server for the Ultravox model."""
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "4096",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": len(audio_assets)}),
        "--trust-remote-code",
    ] + params_kwargs_to_cli_args(request.param)
    # Bound audio fetch time so a stalled download fails fast.
    with RemoteOpenAIServer(
        MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
    ) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the running test server."""
    async with server.get_async_client() as async_client:
        yield async_client
def _get_prompt(audio_count, question, placeholder):
    """Build a chat-templated prompt with `audio_count` audio placeholders
    followed by the question text."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # One placeholder line per audio clip.
    placeholder = f"{placeholder}\n" * audio_count
    return tokenizer.apply_chat_template(
        [{"role": "user", "content": f"{placeholder}{question}"}],
        tokenize=False,
        add_generation_prompt=True,
    )
def run_multi_audio_test(
    vllm_runner: type[VllmRunner],
    prompts_and_audios: list[tuple[str, list[AudioTuple]]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    **kwargs,
):
    """Run greedy decoding on prompts that each carry multiple audios and
    sanity-check that the model produces tokens.

    Extra **kwargs are forwarded to the vLLM engine constructor.
    """
    # Skip early if the model isn't available/compatible in this env.
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
    with vllm_runner(
        model,
        dtype=dtype,
        enforce_eager=True,
        # Allow as many audios per prompt as the largest case needs.
        limit_mm_per_prompt={
            "audio": max((len(audio) for _, audio in prompts_and_audios))
        },
        **kwargs,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            [prompt for prompt, _ in prompts_and_audios],
            max_tokens,
            num_logprobs=num_logprobs,
            audios=[audios for _, audios in prompts_and_audios],
        )
    # The HuggingFace model doesn't support multiple audios yet, so
    # just assert that some tokens were generated.
    assert all(tokens for tokens, *_ in vllm_outputs)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
    "vllm_kwargs",
    [
        # Default config vs. chunked prefill with a tiny token budget.
        pytest.param({}, marks=pytest.mark.cpu_model),
        pytest.param(CHUNKED_PREFILL_KWARGS),
    ],
)
def test_models_with_multiple_audios(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    vllm_kwargs: dict,
) -> None:
    """Single prompt referencing every audio asset at once."""
    vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
    run_multi_audio_test(
        vllm_runner,
        [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **vllm_kwargs,
    )
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Exercises online serving with/without chunked prefill enabled."""
    # One audio_url part per asset, followed by a text question.
    messages = [
        {
            "role": "user",
            "content": [
                *[
                    {"type": "audio_url", "audio_url": {"url": audio.url}}
                    for audio in audio_assets
                ],
                {
                    "type": "text",
                    "text": f"What's happening in these {len(audio_assets)} audio clips?",  # noqa: E501
                },
            ],
        }
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME, messages=messages, max_tokens=10
    )
    # With max_tokens=10 the generation must stop on length.
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"

View File

@@ -0,0 +1,435 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Consolidated test for ViT attention backend functionality across multiple models.
This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend.
"""
from dataclasses import asdict
from typing import Any
import pytest
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform
from ....utils import create_new_process_for_each_test
from ...utils import dummy_hf_overrides
# Dots.OCR prompt from official repository
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
# ruff: noqa: E501
DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
1. Bbox format: [x1, y1, x2, y2]
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
3. Text Extraction & Formatting Rules:
- Picture: For the 'Picture' category, the text field should be omitted.
- Formula: Format its text as LaTeX.
- Table: Format its text as HTML.
- All Others (Text, Title, etc.): Format their text as Markdown.
4. Constraints:
- The output text must be the original text from the image, with no translation.
- All layout elements must be sorted according to human reading order.
5. Final Output: The entire output must be a single JSON object.
"""
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
# Model configurations
# Per-model test configuration. Keys used by the handlers below:
#   interface         -> which handler runs the model (llm_generate / llm_chat
#                        / vllm_runner)
#   sampling_params   -> kwargs for SamplingParams
#   use_processor     -> build the prompt via AutoProcessor.apply_chat_template
#   prompt_builder    -> name of a builder function in this module
#   supported_backends-> optional allowlist of ViT attention backends
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
    "dots_ocr": {
        "model_name": "rednote-hilab/dots.ocr",
        "interface": "llm_chat",
        "max_model_len": 32768,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"image": 1},
        "sampling_params": {
            "temperature": 0.1,
            "max_tokens": 16384,
            "top_p": 0.9,
            "stop_token_ids": None,
        },
        "use_specific_image": "stop_sign",
        "prompt_builder": "build_dots_ocr_prompt",
        "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
    },
    "ernie45_vl": {
        "model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
        "interface": "llm_generate",
        "max_model_len": 16384,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "glm4_1v": {
        "model_name": "zai-org/GLM-4.1V-9B-Thinking",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "keye_vl": {
        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 5,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        # Keye-VL only runs with these ViT attention backends.
        "supported_backends": {
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.ROCM_AITER_FA,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "ovis2_5": {
        "model_name": "AIDC-AI/Ovis2.5-2B",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "prompt_builder": "build_ovis_prompt",
        "question": "What is the content of each image?",
    },
    "qwen2_5_vl": {
        "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
        "interface": "vllm_runner",
        # Video path exercises EVS pruning via video_params below.
        "media_type": "video",
        "max_model_len": 4000,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"video": 1},
        "sampling_params": {
            "max_tokens": 128,
        },
        "runner_kwargs": {
            "runner": "generate",
            "dtype": "bfloat16",
        },
        "video_params": {
            "num_frames": 16,
            "pruning_rates": [0.0, 0.75],
        },
    },
    "qwen2_5_omni": {
        "model_name": "Qwen/Qwen2.5-Omni-3B",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "qwen3_omni": {
        "model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
}
# Prompt builder functions
def build_dots_ocr_prompt(images, config):
    """Build the Dots.OCR chat messages carrying its layout-extraction prompt."""
    # Callers pass a pre-filtered single-image list (stop_sign only).
    stop_sign = images[0]
    encoded = encode_image_base64(stop_sign)
    content = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        },
        {
            "type": "text",
            "text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
        },
    ]
    return [{"role": "user", "content": content}]
def build_processor_prompt(images, config):
    """Build prompt using AutoProcessor.apply_chat_template().

    Encodes every image as a base64 data URL and wraps them, plus the
    configured question, in the model's own chat template.
    """
    processor = AutoProcessor.from_pretrained(
        config["model_name"], trust_remote_code=True
    )
    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
    ]
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": config["question"]},
            ],
        },
    ]
    # Returns the rendered prompt string (untokenized).
    return processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
def build_ovis_prompt(images, config):
    """Build the Ovis2.5 chat-format prompt with numbered <image> slots."""
    # The data URLs are computed for parity with the other builders; only
    # their count feeds the placeholder lines.
    urls = [f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images]
    slot_lines = [f"Image-{n}: <image>\n" for n in range(1, len(urls) + 1)]
    placeholders = "\n".join(slot_lines)
    header = f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
    return header + "<|im_start|>assistant\n"
def build_qwen2_5_video_prompt():
    """Build the Qwen2.5-VL chat prompt asking for a short video description."""
    parts = [
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
        f"<|im_start|>user\n{VIDEO_PLACEHOLDER}",
        "Describe this video with a short sentence (no more than 20 words)",
        "<|im_end|><|im_start|>assistant\n",
    ]
    return "".join(parts)
# Handler functions
def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
    """Standard LLM.generate() interface handler.

    Builds a prompt from the config (processor template or a named builder),
    constructs an LLM with the requested ViT attention backend and dummy
    weights, generates, and validates the output text.
    """
    images = [asset.pil_image for asset in image_assets]
    # Build prompt
    if config.get("use_processor"):
        prompt = build_processor_prompt(images, config)
    else:
        # Resolve the builder function by name from this module.
        prompt_builder_name = config.get("prompt_builder", "build_ovis_prompt")
        prompt_builder = globals()[prompt_builder_name]
        prompt = prompt_builder(images, config)
    # Determine limit_mm_per_prompt
    limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})
    # Create engine
    engine_args = EngineArgs(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=limit_mm_per_prompt,
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
    )
    # Fixed seed for reproducibility across backends.
    engine_dict = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_dict)
    # Generate
    sampling_params = SamplingParams(**config["sampling_params"])
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=sampling_params,
    )
    # Validate
    for o in outputs:
        generated_text = o.outputs[0].text
        # Fall back to a minimal length check when no validator is set.
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(generated_text), (
            f"Validation failed for {config['model_name']}: {generated_text}"
        )
def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
    """LLM.chat() interface handler for Dots.OCR.

    Uses only the stop_sign asset, builds OCR chat messages, and validates
    the chat output against the config's validator.
    """
    # Filter to stop_sign image only
    stop_sign_image = [
        asset.pil_image for asset in image_assets if asset.name == "stop_sign"
    ][0]
    # Build messages
    messages = build_dots_ocr_prompt([stop_sign_image], config)
    # Create engine
    engine_args = EngineArgs(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=config["limit_mm_per_prompt"],
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
    )
    # Fixed seed for reproducibility across backends.
    engine_dict = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_dict)
    # Generate using chat
    sampling_params = SamplingParams(**config["sampling_params"])
    outputs = llm.chat(messages=messages, sampling_params=sampling_params)
    # Validate
    for o in outputs:
        generated_text = o.outputs[0].text
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(generated_text), (
            f"Validation failed for {config['model_name']}: {generated_text}"
        )
def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
    """Video test with EVS (Efficient Video Sampling) handler.

    Runs the model once per configured pruning rate on a frame-sampled
    video and checks that non-empty text is generated.
    """
    for pruning_rate in config["video_params"]["pruning_rates"]:
        num_frames = config["video_params"]["num_frames"]
        # Sample frames from video
        sampled_vids = [
            sample_frames_from_video(asset.np_ndarrays, num_frames)
            for asset in video_assets
        ]
        # Build prompt and prepare video (first asset only)
        prompt = build_qwen2_5_video_prompt()
        prompts = [prompt]
        videos = [sampled_vids[0]]
        # Run with vllm_runner context manager
        with vllm_runner(
            config["model_name"],
            max_model_len=config["max_model_len"],
            max_num_seqs=config["max_num_seqs"],
            limit_mm_per_prompt=config["limit_mm_per_prompt"],
            tensor_parallel_size=1,
            video_pruning_rate=pruning_rate,
            mm_encoder_attn_backend=mm_encoder_attn_backend,
            hf_overrides=dummy_hf_overrides,
            load_format="dummy",
            **config["runner_kwargs"],
        ) as vllm_model:
            outputs = vllm_model.generate_greedy(
                prompts,
                config["sampling_params"]["max_tokens"],
                videos=videos,
            )
            # Validate output
            assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
            output_ids, output_text = outputs[0]
            assert len(output_ids) > 0, "Generated no output IDs"
            assert len(output_text) > 0, "Generated empty text"
            assert isinstance(output_text, str), (
                f"Output is not string: {type(output_text)}"
            )
# Main test function
@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    # None means "let the platform pick its default ViT backend".
    [None] + current_platform.get_supported_vit_attn_backends(),
)
# NOTE: the whole matrix is currently disabled pending a segfault fix.
@pytest.mark.skip(reason="Broken test due to memory segmentation fault")
@create_new_process_for_each_test()
def test_vit_backend_functionality(
    model_key: str,
    mm_encoder_attn_backend: AttentionBackendEnum | None,
    image_assets,
    video_assets,
    vllm_runner,
    request,
):
    """Test ViT attention backend functionality for multimodal models.
    This test validates that each model can successfully generate outputs
    using different ViT attention backends. The test:
    1. Filters unsupported backends per model
    2. Applies appropriate GPU marks
    3. Routes to the correct test handler based on interface
    4. Validates output meets minimum requirements
    """
    config = MODEL_CONFIGS[model_key]
    # Step 1: Backend filtering
    if (
        "supported_backends" in config
        and mm_encoder_attn_backend is not None
        and mm_encoder_attn_backend not in config["supported_backends"]
    ):
        pytest.skip(
            f"{model_key} does not support {mm_encoder_attn_backend} backend now."
        )
    # Step 2: Apply GPU marks dynamically
    if "gpu_marks" in config:
        for mark in config["gpu_marks"]:
            request.applymarker(mark)
    # Step 3: Route to appropriate handler
    if config.get("media_type") == "video":
        run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
    elif config["interface"] == "llm_chat":
        run_llm_chat_test(config, mm_encoder_attn_backend, image_assets)
    elif config["interface"] == "llm_generate":
        run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
    else:
        raise ValueError(f"Unknown interface: {config['interface']}")

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import pytest_asyncio
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage
from vllm.tokenizers.mistral import MistralTokenizer
from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
# CLI args forcing the mistral tokenizer/config/weight formats.
MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
    "mistral",
    "--config_format",
    "mistral",
    "--load_format",
    "mistral",
]
@pytest.fixture()
def server(request, audio_assets: AudioTestAssets):
    """Spin up a remote OpenAI-compatible server for the Voxtral model."""
    args = [
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": len(audio_assets)}),
    ] + MISTRAL_FORMAT_ARGS
    # Bound audio fetch time so a stalled download fails fast.
    with RemoteOpenAIServer(
        MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
    ) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the running test server."""
    async with server.get_async_client() as async_client:
        yield async_client
def _get_prompt(audio_assets, question):
    """Build a Mistral chat-templated prompt carrying all audio assets
    followed by the question text."""
    tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
    # Load each asset from its local path; strict=False tolerates
    # non-canonical audio files.
    audios = [
        Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
        for i in range(len(audio_assets))
    ]
    audio_chunks = [
        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
    ]
    text_chunk = TextChunk(text=question)
    messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
    return tokenizer.apply_chat_template(messages=messages)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Single prompt referencing every audio asset, via the shared
    ultravox multi-audio harness."""
    vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
    run_multi_audio_test(
        vllm_runner,
        [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tokenizer_mode="mistral",
    )
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Exercises online serving with multiple audio inputs in a single
    chat request."""
    def asset_to_chunk(asset):
        # Wrap each asset as a wav-format OpenAI audio chunk.
        audio = Audio.from_file(str(asset.get_local_path()), strict=False)
        audio.format = "wav"
        audio_dict = AudioChunk.from_audio(audio).to_openai()
        return audio_dict
    audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
    text = f"What's happening in these {len(audio_assets)} audio clips?"
    messages = [
        {
            "role": "user",
            "content": [*audio_chunks, {"type": "text", "text": text}],
        }
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME, messages=messages, max_tokens=10
    )
    # With max_tokens=10 the generation must stop on length.
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"

View File

@@ -0,0 +1,178 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import Any
import librosa
import pytest
from transformers import AutoModelForSpeechSeq2Seq
from vllm.assets.audio import AudioAsset
from vllm.platforms import current_platform
from ....conftest import HfRunner, PromptAudioInput, VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
# Whisper decoder prompt for vLLM; HF runner takes an empty prompt.
VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
HF_PROMPT = ""
# Whisper expects 16kHz audio
WHISPER_SAMPLE_RATE = 16000
# autouse: applies to every test in this module.
@pytest.fixture(autouse=True)
def use_spawn_for_whisper(monkeypatch):
    """Whisper has issues with forked workers, use spawn instead."""
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
    enforce_eager: bool = True,
) -> None:
    """Inference result should be the same between hf and vllm.
    All the audio fixtures for the test are from AudioAsset.
    For huggingface runner, we provide the audio as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    """
    # vLLM run: each inputs tuple is (vllm_prompts, hf_prompts, audios).
    with vllm_runner(
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        limit_mm_per_prompt={"audio": 2},
        enforce_eager=enforce_eager,
        disable_custom_all_reduce=True,
    ) as vllm_model:
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                vllm_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            for vllm_prompts, _, audios in inputs
        ]
    # HF reference run on the same audios with the HF-style prompts.
    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                hf_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            for _, hf_prompts, audios in inputs
        ]
    # Compare HF vs. vLLM logprobs case by case.
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.fixture
def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
    """Audio test cases resampled to Whisper's 16kHz sample rate.

    Each entry is (vllm_prompts, hf_prompts, [(audio, sample_rate)]).
    """
    audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
    inputs = []
    for asset in audio_assets:
        audio, orig_sr = asset.audio_and_sample_rate
        # Resample to Whisper's expected sample rate (16kHz)
        if orig_sr != WHISPER_SAMPLE_RATE:
            audio = librosa.resample(
                audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
            )
        # vLLM prompts, HF prompts, audio inputs
        inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
    return inputs
def check_model_available(model: str) -> None:
    """Skip the current test if the model is unavailable online or
    incompatible with the installed transformers version."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
@pytest.mark.core_model
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("enforce_eager", [True, False])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
    num_logprobs: int,
    input_audios,
    enforce_eager: bool,
) -> None:
    """Single-GPU Whisper HF-vs-vLLM parity test."""
    check_model_available(model)
    # Compiled (non-eager) mode is not exercised on CPU.
    if current_platform.is_cpu() and not enforce_eager:
        pytest.skip("Skipping test for CPU with non-eager mode")
    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=200,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
        enforce_eager=enforce_eager,
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models_distributed(
    hf_runner,
    vllm_runner,
    model: str,
    distributed_executor_backend: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    input_audios,
) -> None:
    """Tensor-parallel (TP=2) Whisper HF-vs-vLLM parity test over both
    the ray and multiprocessing executor backends."""
    check_model_available(model)
    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=False,
    )

View File

@@ -0,0 +1,347 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Helpers for building inputs that can be leveraged for different test types."""
from collections.abc import Callable, Iterable
from pathlib import PosixPath
from typing import Any
import numpy.typing as npt
import torch
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
from .types import (
SINGLE_AUDIO_BASE_PROMPT,
SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER,
TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER,
VIDEO_BASE_PROMPT,
ImageSizeWrapper,
PromptWithMultiModalInput,
SizeType,
VLMTestInfo,
)
def replace_test_placeholder(
    prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
) -> str:
    """Substitute each occurrence of ``test_placeholder`` in ``prompt`` with
    the model-specific tag produced by ``mm_idx_to_prompt``.

    The callback receives the 1-based index of the occurrence, so models
    with positional placeholders (e.g. ``<|image_1|>``) are supported.
    """
    segments = prompt.split(test_placeholder)
    pieces = [segments[0]]
    for idx, segment in enumerate(segments[1:], start=1):
        pieces.append(mm_idx_to_prompt(idx))
        pieces.append(segment)
    return "".join(pieces)
def get_model_prompts(
base_prompts: Iterable[str],
img_idx_to_prompt: Callable[[int], str] | None,
video_idx_to_prompt: Callable[[int], str] | None,
audio_idx_to_prompt: Callable[[int], str] | None,
prompt_formatter: Callable[[str], str],
) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
Example for phi3v, given the base_prompt: "<image>What is the season?"
1. Replace img placeholder(s)
-> "<|image_1|>\nWhat is the season?"
2. Apply prompt formatter:
-> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
"""
assert isinstance(base_prompts, (list, tuple))
model_prompts = []
for base_prompt in base_prompts:
# Replace the multimodal placeholders in the base prompt with
# the correct ones for the model that we are testing
if img_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
)
if video_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
)
if audio_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
model_prompt = prompt_formatter(base_prompt)
model_prompts.append(model_prompt)
return model_prompts
def build_single_image_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
    tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
    """Build single-image test inputs (one prompt per image asset) from a
    model's test configuration.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is unset, or if a
            ``prompt_path_encoder`` is configured but ``tmp_path`` is None.
    """
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build single image inputs")
    model_prompts = get_model_prompts(
        test_info.single_image_prompts,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    # For models that require a local path / URL encoded in the image; export
    # assets and encode into tmp_path for this test. This should be avoided
    # where possible (currently needed for Qwen-VL).
    if test_info.prompt_path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        model_prompts = [
            test_info.prompt_path_encoder(tmp_path, prompt, [asset])
            for prompt, asset in zip(model_prompts, image_assets)
        ]
    images = [asset.pil_image for asset in image_assets]
    assert len(images) == len(model_prompts)
    return build_single_image_inputs(images, model_prompts, size_wrapper)
def build_single_image_inputs(
    images, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
    """Pair each image/prompt with one input per entry in
    ``size_wrapper.data``: the prompt is duplicated and the image is scaled
    by each size factor / resized to each fixed size.

    NOTE: rescaling preserves the image aspect ratio.
    """
    inputs = []
    for image, prompt in zip(images, model_prompts):
        scaled_images = [
            apply_image_size_scaling(image, size, size_wrapper.type)
            for size in size_wrapper.data
        ]
        inputs.append(
            PromptWithMultiModalInput(
                prompts=[prompt] * len(scaled_images),
                image_data=scaled_images,
            )
        )
    return inputs
def build_multi_image_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
    tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
    """Build multi-image test inputs (all image assets paired with the
    single multi-image prompt) from a model's test configuration.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is unset, or if a
            ``prompt_path_encoder`` is configured but ``tmp_path`` is None.
    """
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build multi image inputs")
    model_prompts = get_model_prompts(
        [test_info.multi_image_prompt],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    # Encode local asset paths into the prompt when the model requires it
    # (e.g. Qwen-VL); see build_single_image_inputs_from_test_info.
    if test_info.prompt_path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        model_prompts = [
            test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
            for model_prompt in model_prompts
        ]
    images = [asset.pil_image for asset in image_assets]
    # Currently, we only have one multi-image list & one multi-image prompt
    return build_multi_image_inputs(
        image_lists=[images],
        model_prompts=model_prompts,
        size_wrapper=size_wrapper,
    )
def build_multi_image_inputs(
    image_lists, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
    """Pair each image list/prompt with one input per entry in
    ``size_wrapper.data``, scaling every image in the list by that size.
    """
    inputs = []
    for images, prompt in zip(image_lists, model_prompts):
        image_data = []
        for size in size_wrapper.data:
            image_data.append(
                [
                    apply_image_size_scaling(image, size, size_wrapper.type)
                    for image in images
                ]
            )
        inputs.append(
            PromptWithMultiModalInput(
                prompts=[prompt] * len(image_data),
                image_data=image_data,
            )
        )
    return inputs
def build_embedding_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
):
    """Build paired (PIL image, image embedding) inputs for embedding tests.

    Returns:
        Tuple of (image inputs for HF, embedding inputs for vLLM), both
        built over the same prompts.
    """
    # These conditions will always be true if invoked through filtering,
    # but we still check them in case this is ever called directly
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build image embedding inputs")
    if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
        factor == 1.0 for factor in size_wrapper.data
    ):
        raise ValueError("Embedding tests require constant (1.0) size factors")
    if test_info.convert_assets_to_embeddings is None:
        raise ValueError("No conversion func for getting embeddings found")
    model_prompts = get_model_prompts(
        SINGLE_IMAGE_BASE_PROMPTS,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    images = [asset.pil_image for asset in image_assets]
    embeds = test_info.convert_assets_to_embeddings(image_assets)
    # Cast embeddings to the test dtype so they match the model weights.
    if test_info.dtype != "auto":
        dtype = getattr(torch, test_info.dtype)  # type: ignore
        embeds = [e.to(dtype=dtype) for e in embeds]
    assert len(images) == len(model_prompts)
    inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
    vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
    return inputs, vllm_embeddings
def build_video_inputs_from_test_info(
    test_info: VLMTestInfo,
    video_assets: VideoTestAssets,
    size_wrapper: ImageSizeWrapper,
    num_frames: int,
    needs_video_metadata: bool,
) -> list[PromptWithMultiModalInput]:
    """Build video test inputs from a model's test configuration.

    Each video asset is subsampled to ``num_frames`` frames, then scaled per
    ``size_wrapper``. When ``needs_video_metadata`` is True, each video entry
    is a (frames, metadata) tuple instead of a bare frames array.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is unset.
    """
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build video inputs")
    model_prompts = get_model_prompts(
        [VIDEO_BASE_PROMPT],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    sampled_vids = [
        sample_frames_with_video_metadata(
            (asset.np_ndarrays, asset.metadata),
            num_frames,
        )
        for asset in video_assets
    ]
    # Fixed sizes use an exact resize; size factors rescale proportionally.
    video_scaler = (
        resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
    )
    return [
        PromptWithMultiModalInput(
            prompts=[prompt for _ in size_wrapper.data],
            video_data=[
                (
                    video_scaler(video, size)
                    if not needs_video_metadata
                    else (video_scaler(video, size), meta)
                )
                for size in size_wrapper.data
            ],
        )
        for (video, meta), prompt in zip(sampled_vids, model_prompts)
    ]
def sample_frames_with_video_metadata(
    video_with_meta: tuple[npt.NDArray, dict[str, Any]],
    num_frames: int,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """Subsample a video to ``num_frames`` frames and update its metadata.

    Args:
        video_with_meta: Tuple of (frame array, metadata dict); the metadata
            must contain "total_num_frames" and "duration" (in seconds).
        num_frames: Number of frames to sample from the video.

    Returns:
        Tuple of the subsampled frames and the updated metadata (the input
        dict is mutated in place).
    """
    video, meta = video_with_meta
    video = sample_frames_from_video(video, num_frames)
    # NOTE(review): flag is True when the clip already has exactly
    # num_frames — confirm the consumer's expected semantics.
    meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
    meta["total_num_frames"] = num_frames
    # fps is frames per second, i.e. frame count divided by clip duration.
    # The previous expression (duration / num_frames) computed seconds per
    # frame, the reciprocal of fps.
    meta["fps"] = num_frames / meta["duration"]
    meta["frames_indices"] = list(range(num_frames))
    return video, meta
def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
# are considering size factors at constant scale, i.e., we just clone
# the tensor
if isinstance(image, torch.Tensor):
assert size_type == SizeType.SIZE_FACTOR and size == 1
return image
if size_type == SizeType.SIZE_FACTOR:
# We have a list of image size factors
return rescale_image_size(image, size)
elif size_type == SizeType.FIXED_SIZE:
# We have a list of fixed sizes
return image.resize(size)
raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
def build_audio_inputs_from_test_info(
    test_info: VLMTestInfo,
    audio_assets: AudioTestAssets,
) -> list[PromptWithMultiModalInput]:
    """Build audio test inputs from a model's test configuration, resampling
    every audio asset to 16 kHz.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is unset.
    """
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build audio inputs")
    model_prompts = get_model_prompts(
        SINGLE_AUDIO_BASE_PROMPT,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    # Normalize all assets to a common 16 kHz sample rate.
    resampler = AudioResampler(
        target_sr=16000,
        method="librosa",
    )
    audios = [asset.audio_and_sample_rate for asset in audio_assets]
    resampled_audios = [
        (
            resampler.resample(
                audio,
                orig_sr=sr,
            ),
            int(resampler.target_sr),
        )
        for audio, sr in audios
    ]
    return [
        PromptWithMultiModalInput(
            prompts=model_prompts,
            audio_data=resampled_audios,
        )
    ]

View File

@@ -0,0 +1,183 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utils for determining which subset of model tests belong to a specific
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import itertools
from collections import OrderedDict
from collections.abc import Iterable
import pytest
from .types import (
EMBEDDING_SIZE_FACTORS,
ExpandableVLMTestArgs,
ImageSizeWrapper,
SizeType,
VLMTestInfo,
VLMTestType,
)
def get_filtered_test_settings(
    test_settings: dict[str, VLMTestInfo],
    test_type: VLMTestType,
    new_proc_per_test: bool,
) -> dict[str, VLMTestInfo]:
    """Given the dict of potential test settings to run, return a subdict
    of tests who have the current test type enabled with the matching val for
    fork_per_test.
    """
    def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
        # test_info.test_type may be a single enum value or an iterable.
        return test_info.test_type == test_type or (
            isinstance(test_info.test_type, Iterable)
            and test_type in test_info.test_type
        )
    matching_tests = {}
    for test_name, test_info in test_settings.items():
        # Check if the test has the right type & keep it if it does
        if matches_test_type(test_info, test_type):
            # Embedding tests need to have a conversion func in their test info
            if matches_test_type(test_info, VLMTestType.EMBEDDING):
                assert test_info.convert_assets_to_embeddings is not None
            # Custom test inputs need to explicitly define the mm limit/inputs
            if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
                assert test_info.custom_test_opts is not None and isinstance(
                    test_info.custom_test_opts, Iterable
                )
            # For all types besides custom inputs, we need a prompt formatter
            else:
                assert test_info.prompt_formatter is not None
            # Everything looks okay; keep if this is correct proc handling
            # (a configured distributed backend implies a fresh process).
            if (
                test_info.distributed_executor_backend is not None
            ) == new_proc_per_test:
                matching_tests[test_name] = test_info
    return matching_tests
def get_model_type_cases(
    model_type: str,
    test_info: VLMTestInfo,
    test_type: VLMTestType,
):
    """Expand one model type's ``VLMTestInfo`` into individual pytest params.

    This is essentially the same as nesting a bunch of mark.parametrize
    decorators, but we do it programmatically to allow overrides on a
    per-model basis, while still being able to execute each of these as
    individual test cases in pytest.
    """

    # Wrap a scalar into a 1-tuple so everything iterates uniformly.
    # (def instead of an assigned lambda, per PEP 8 / E731.)
    def ensure_wrapped(e):
        return e if isinstance(e, (list, tuple)) else (e,)

    iter_kwargs = OrderedDict(
        [
            ("model", ensure_wrapped(test_info.models)),
            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
            ("dtype", ensure_wrapped(test_info.dtype)),
            (
                "distributed_executor_backend",
                ensure_wrapped(test_info.distributed_executor_backend),
            ),
        ]
    )
    # num_frames is video only
    if test_type == VLMTestType.VIDEO:
        iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
        iter_kwargs["needs_video_metadata"] = ensure_wrapped(
            test_info.needs_video_metadata
        )
    # No sizes passed for custom inputs, since inputs are directly provided
    if test_type not in (
        VLMTestType.CUSTOM_INPUTS,
        VLMTestType.AUDIO,
    ):
        wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
        if wrapped_sizes is None:
            raise ValueError(f"Sizes must be set for test type {test_type}")
        iter_kwargs["size_wrapper"] = wrapped_sizes
    # Otherwise expand the custom test options instead
    elif test_type == VLMTestType.CUSTOM_INPUTS:
        if test_info.custom_test_opts is None:
            raise ValueError("Test has type CUSTOM_INPUTS, but none given")
        iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
    # Wrap all model cases in a pytest parameter & pass marks through.
    # itertools.product is iterated directly (no intermediate list), and
    # dict(zip(...)) replaces the equivalent dict comprehension.
    return [
        pytest.param(
            model_type,
            ExpandableVLMTestArgs(**dict(zip(iter_kwargs.keys(), case))),
            marks=test_info.marks if test_info.marks is not None else [],
        )
        for case in itertools.product(*iter_kwargs.values())
    ]
def get_parametrized_options(
    test_settings: dict[str, VLMTestInfo],
    test_type: VLMTestType,
    create_new_process_for_each_test: bool,
):
    """Converts all of our VLMTestInfo into an expanded list of parameters.

    This is similar to nesting pytest parametrize calls, but done directly
    through an itertools product so that each test can set things like
    size factors etc, while still running in isolated test cases.
    """
    matching_tests = get_filtered_test_settings(
        test_settings, test_type, create_new_process_for_each_test
    )
    # Expand each matching model type into its individual cases and flatten
    # them into one list so a single mark.parametrize call can consume them.
    expanded_cases = []
    for model_type, test_info in matching_tests.items():
        expanded_cases.extend(get_model_type_cases(model_type, test_info, test_type))
    return expanded_cases
def get_wrapped_test_sizes(
    test_info: VLMTestInfo, test_type: VLMTestType
) -> tuple[ImageSizeWrapper, ...]:
    """Given a test info which may have size factors or fixed sizes, wrap them
    and combine them into an iterable, each of which will be used in parameter
    expansion.

    Args:
        test_info: Test configuration to be expanded.
        test_type: The type of test being filtered for.
    """
    # Embedding tests always run at the constant embedding size factors.
    if test_type == VLMTestType.EMBEDDING:
        return tuple(
            ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
            for factor in EMBEDDING_SIZE_FACTORS
        )
    # Audio and custom-input tests come with preprocessed inputs; no sizes.
    if test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
        return ()
    size_factors = test_info.image_size_factors or []
    fixed_sizes = test_info.image_sizes or []
    wrapped = [
        ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
        for factor in size_factors
    ]
    wrapped += [
        ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
    ]
    return tuple(wrapped)

View File

@@ -0,0 +1,189 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Core test implementation to be shared across modalities."""
from collections.abc import Callable
from typing import Any
import torch
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
from vllm.tokenizers import TokenizerLike
from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS
from .types import PromptWithMultiModalInput, RunnerOutput
def run_test(
    *,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[PromptWithMultiModalInput],
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    enforce_eager: bool,
    max_model_len: int,
    max_num_seqs: int,
    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
    auto_cls: type[_BaseAutoModelClass],
    use_tokenizer_eos: bool,
    comparator: Callable[..., None],
    get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None,
    stop_str: list[str] | None,
    limit_mm_per_prompt: dict[str, int],
    vllm_runner_kwargs: dict[str, Any] | None,
    hf_model_kwargs: dict[str, Any] | None,
    patch_hf_runner: Callable[[HfRunner], HfRunner] | None,
    runner: RunnerOption = "auto",
    distributed_executor_backend: str | None = None,
    tensor_parallel_size: int = 1,
    vllm_embeddings: torch.Tensor | None = None,
):
    """Modality agnostic test executor for comparing HF/vLLM outputs.

    Generates greedy outputs with logprobs from vLLM and then from HF
    Transformers for the same inputs, applies the optional per-runner
    post-processors, and compares each batch via ``comparator``.

    Raises:
        pytest skip: If the model is unavailable online or the installed
            transformers version does not support it.
    """
    # In the case of embeddings, vLLM takes separate input tensors
    vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    limit_mm_per_prompt = default_limits | limit_mm_per_prompt
    vllm_outputs_per_mm = []
    hf_outputs_per_mm = []
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    # mm_processor_cache_gb=0 presumably disables the multimodal processor
    # cache so each test processes inputs fresh — confirm if changed.
    vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0}
    if model_info.tokenizer:
        vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
    if model_info.tokenizer_mode:
        vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
    if model_info.hf_overrides:
        vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
    if model_info.require_embed_inputs:
        for k in ("skip_tokenizer_init", "enable_prompt_embeds", "enable_mm_embeds"):
            vllm_runner_kwargs_[k] = model_info.require_embed_inputs
    # Per-test overrides take precedence over the registry-derived kwargs.
    if vllm_runner_kwargs:
        vllm_runner_kwargs_.update(vllm_runner_kwargs)
    with vllm_runner(
        model,
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        dtype=dtype,
        limit_mm_per_prompt=limit_mm_per_prompt,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=enforce_eager,
        runner=runner,
        **vllm_runner_kwargs_,
    ) as vllm_model:
        tokenizer = vllm_model.llm.get_tokenizer()
        vllm_kwargs: dict[str, Any] = {}
        if get_stop_token_ids is not None:
            vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
        if stop_str:
            vllm_kwargs["stop"] = stop_str
        for prompts, image_data, video_data, audio_data in vllm_inputs:
            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
            vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
            vllm_output = vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                **vllm_kwargs_with_mm_data,
            )
            vllm_outputs_per_mm.append(vllm_output)
    hf_model = hf_runner(
        model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
    )
    # Some models need to patch things like the model processor, e.g., internvl
    if patch_hf_runner is not None:
        hf_model = patch_hf_runner(hf_model)
    with hf_model, torch.no_grad():
        tokenizer = hf_model.tokenizer
        # Some models need to explicitly pass the eos_token_id off the tokenizer
        # or processor for a good comparison;
        # currently assume processor/tokenizer agree on the EOS, and pull it off
        # the tokenizer if requested.
        hf_kwargs = {}
        if use_tokenizer_eos:
            hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
        if stop_str:
            hf_kwargs["stop_strings"] = stop_str
        # HF always consumes the raw (non-embedding) inputs.
        for prompts, image_data, video_data, audio_data in inputs:
            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
            hf_kwargs_with_mm_data = hf_kwargs | mm_data
            hf_output = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                tokenizer=tokenizer,
                **hf_kwargs_with_mm_data,
            )
            hf_outputs_per_mm.append(hf_output)
    # Apply output processing / sanitation to the vLLM and HF runner results
    hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
        model,
        first_runner_outputs=hf_outputs_per_mm,
        second_runner_outputs=vllm_outputs_per_mm,
        first_runner_processor=hf_output_post_proc,
        second_runner_processor=vllm_output_post_proc,
    )
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
        # This is usually check_logprobs_close, but it's passed through to
        # allow things like check_outputs_equal where needed
        comparator(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
def process_runner_outputs(
    model,
    first_runner_outputs,
    second_runner_outputs,
    first_runner_processor=None,
    second_runner_processor=None,
):
    """Applies the runner processor(s) to the runner outputs, if any.

    Outputs whose processor is None are passed through untouched.
    """
    processed = []
    for processor, outputs in (
        (first_runner_processor, first_runner_outputs),
        (second_runner_processor, second_runner_outputs),
    ):
        if processor is None:
            processed.append(outputs)
        else:
            processed.append(process_outputs(processor, model, outputs))
    return tuple(processed)
def process_outputs(output_processor, model, outputs_per_image):
    """Applies a model specific post-processor function to a runner's output"""
    processed = []
    for outputs in outputs_per_image:
        processed.append([output_processor(res, model) for res in outputs])
    return processed

View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
from collections.abc import Callable
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs
from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
    """Builds inputs for multi-image (varied sizes/aspect ratio) testing.

    Args:
        formatter: model-specific prompt formatter.
    """
    stop_sign = IMAGE_ASSETS[0].pil_image
    cherry_blossom = IMAGE_ASSETS[1].pil_image
    # Apply the selected formatter to the base prompts
    img_prompts = [
        "<image><image>\nDescribe 2 images.",
        "<image><image>\nDescribe 2 images.",
        "<image><image><image><image>\nDescribe 4 images.",
        "<image>\nWhat is the season?",
    ]
    formatted_prompts = [formatter(prompt) for prompt in img_prompts]
    aspect_ratio_images = [
        [stop_sign, cherry_blossom],
        # Images with different sizes and aspect-ratios
        [
            rescale_image_size(stop_sign, 0.1),
            stop_sign,
        ],
        [
            stop_sign,
            rescale_image_size(stop_sign, 0.25),
            cherry_blossom.resize((183, 488)),
            cherry_blossom.resize((488, 183)),
        ],
        # Final case: a single image (not a list) for the single-image prompt
        cherry_blossom,
    ]
    return [
        PromptWithMultiModalInput(
            prompts=formatted_prompts,
            image_data=aspect_ratio_images,
        )
    ]
def multi_video_multi_aspect_ratio_inputs(
    formatter: Callable[[str], str], num_frames: int = 16
):
    """Builds inputs for multi-video (varied sizes/aspect ratio) testing.

    Args:
        formatter: model-specific prompt formatter.
        num_frames: number of frames to sample from the source video.
    """
    video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
    # Apply the selected formatter to the base prompts
    video_prompts = [
        "<video><video>\nDescribe 2 videos.",
        "<video><video>\nDescribe 2 videos.",
        "<video><video><video><video>\nDescribe 4 videos.",
        "<video>\nWhy is this video funny?",
    ]
    formatted_prompts = [formatter(prompt) for prompt in video_prompts]
    aspect_ratio_videos = [
        [video, video],
        # Videos with different sizes and aspect-ratios
        [
            rescale_video_size(video, 0.1),
            video,
        ],
        [
            video,
            rescale_video_size(video, 0.25),
            resize_video(video, (183, 488)),
            resize_video(video, (488, 183)),
        ],
        # Final case: a single video (not a list) for the single-video prompt
        video,
    ]
    return [
        PromptWithMultiModalInput(
            prompts=formatted_prompts,
            video_data=aspect_ratio_videos,
        )
    ]
def different_patch_input_cases_internvl():
    """Builds InternVL inputs that exercise different image patch counts by
    scaling 896x896 images with 0.5x / 1.0x size factors, in both single-
    and multi-image form."""
    images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
    formatter = (
        lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
    )
    single_img_prompts = [
        "<image>\nWhat's the content in the center of the image?",
        "<image>\nWhat is the season?",
    ]
    multi_img_prompts = [
        "Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n",  # noqa: E501
    ]
    formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
    formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]
    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
    return [
        build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
        build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
    ]
def windows_attention_image_qwen2_5_vl():
    """Builds a single-image Qwen2.5-VL input (0.5x scale) reproducing a
    window-attention regression case."""
    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
    image = ImageAsset("hato").pil_image
    question = "Describe the image."
    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
    prompt = (
        f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
    )
    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
    return build_single_image_inputs([image], [prompt], wrapped_sf)
def video_with_metadata_glm4_1v():
    """Builds GLM-4.1V video inputs where each entry carries the video's
    metadata alongside the (rescaled) frames."""
    video_array = VIDEO_ASSETS[0].np_ndarrays
    metadata = VIDEO_ASSETS[0].metadata
    question = "Describe the video."
    video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
    formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
    scales = [0.1, 0.2, 0.25]
    # NOTE(review): the same metadata dict object is shared across all scaled
    # entries — confirm downstream consumers do not mutate it.
    video_input = [
        [(rescale_video_size(video_array, scale), metadata)] for scale in scales
    ]
    prompts = [formatted_prompt] * len(video_input)
    return [
        PromptWithMultiModalInput(
            prompts=prompts,
            video_data=video_input,
        )
    ]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from pathlib import PosixPath
from .....conftest import (
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(
    *,
    tmp_path: PosixPath,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Builds single-image inputs for this test case and runs the core
    HF-vs-vLLM comparison with a limit of 1 image per prompt."""
    assert test_case.size_wrapper is not None
    inputs = builders.build_single_image_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper, tmp_path
    )
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_multi_image_test(
    *,
    tmp_path: PosixPath,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Builds multi-image inputs for this test case and runs the core
    HF-vs-vLLM comparison with one image slot per asset."""
    assert test_case.size_wrapper is not None
    inputs = builders.build_multi_image_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper, tmp_path
    )
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": len(image_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_embedding_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Builds paired image/embedding inputs and runs the core comparison,
    feeding vLLM the precomputed image embeddings."""
    assert test_case.size_wrapper is not None
    inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper
    )
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        vllm_embeddings=vllm_embeddings,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_video_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    video_assets: VideoTestAssets,
):
    """Builds video inputs (sampled to the requested frame count) and runs
    the core HF-vs-vLLM comparison with one video slot per asset."""
    assert test_case.size_wrapper is not None
    assert test_case.num_video_frames is not None
    inputs = builders.build_video_inputs_from_test_info(
        model_test_info,
        video_assets,
        test_case.size_wrapper,
        test_case.num_video_frames,
        test_case.needs_video_metadata,
    )
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"video": len(video_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_audio_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Builds audio inputs (resampled to 16 kHz by the builder) and runs
    the core HF-vs-vLLM comparison with a limit of 1 audio per prompt."""
    inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"audio": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_custom_inputs_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
):
    """Runs the core HF-vs-vLLM comparison on directly-provided inputs."""
    # Custom test cases can provide inputs directly, but they need to
    # explicitly provided a CustomTestConfig, which wraps the inputs and
    # the limit_mm_per_prompt
    assert test_case.custom_test_opts is not None
    inputs = test_case.custom_test_opts.inputs
    limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
    # Inputs and limit_mm_per_prompt should all be set
    assert inputs is not None
    assert limit_mm_per_prompt is not None
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt=limit_mm_per_prompt,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )

View File

@@ -0,0 +1,218 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Types for writing multimodal model tests."""
from collections.abc import Callable, Iterable
from enum import Enum
from pathlib import PosixPath
from typing import Any, NamedTuple
import torch
from pytest import MarkDecorator
from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
from vllm.logprobs import SampleLogprobs
from vllm.tokenizers import TokenizerLike
from .....conftest import (
AUDIO_ASSETS,
IMAGE_ASSETS,
HfRunner,
ImageAsset,
ImageTestAssets,
PromptAudioInput,
PromptImageInput,
PromptVideoInput,
)
from ....utils import check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model
TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
# Default per-asset base prompts; still carry the generic placeholders above.
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
        "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
    }
)
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
    {
        "mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
        "winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
    }
)
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
# Size-factor sets used for test expansion; each inner tuple is one case.
IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
# Embedding tests only support constant (1.0) scaling.
EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
# (token_ids, generated_text, optional per-token logprobs) from a runner.
RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
class PromptWithMultiModalInput(NamedTuple):
    """Holds the multimodal input for a single test case."""

    # Text prompts containing the model-specific multimodal placeholders.
    prompts: list[str]
    # Optional image/video/audio inputs; None for modalities the test
    # does not exercise.
    image_data: PromptImageInput | None = None
    video_data: PromptVideoInput | None = None
    audio_data: PromptAudioInput | None = None
class VLMTestType(Enum):
    """Category of multimodal test a model configuration can be run with."""

    IMAGE = 1
    MULTI_IMAGE = 2
    EMBEDDING = 3
    VIDEO = 4
    AUDIO = 5
    CUSTOM_INPUTS = 6
class SizeType(Enum):
    """How image sizes are parameterized: scale factors or fixed sizes."""

    SIZE_FACTOR = 1
    FIXED_SIZE = 2
class CustomTestOptions(NamedTuple):
    """Inputs and multimodal limits for a single CUSTOM_INPUTS test."""

    # Prompt/multimodal-data bundles to run for this custom test.
    inputs: list[PromptWithMultiModalInput]
    # Per-modality cap on multimodal items per prompt, forwarded to the
    # runner as limit_mm_per_prompt.
    limit_mm_per_prompt: dict[str, int]
class ImageSizeWrapper(NamedTuple):
    """Pairs an image-size parametrization type with its concrete values."""

    type: SizeType
    # A size factor is a wrapper of 0+ floats,
    # while a fixed size contains an iterable of integer pairs
    data: Iterable[float] | Iterable[tuple[int, int]]
class VLMTestInfo(NamedTuple):
    """Holds the configuration for 1+ tests for one model architecture."""

    models: list[str]
    test_type: VLMTestType | Iterable[VLMTestType]

    # Should be None only if this is a CUSTOM_INPUTS test
    prompt_formatter: Callable[[str], str] | None = None

    # Map a 0-based multimodal item index to its placeholder tag.
    img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
    video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
    audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"

    # The default single/multi-image prompts work for most models, but some
    # (e.g., paligemma) fail the log prob check with them; these overrides
    # exist so such models can supply their own prompts.
    single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
    multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT

    # Converts ImageAssets to image embeddings; must be set explicitly
    # for embedding tests.
    convert_assets_to_embeddings: (
        Callable[[ImageTestAssets], list[torch.Tensor]] | None
    ) = None

    # Options forwarded to the vLLM runner; several tests override these.
    # Defaults mirror VllmRunner / engine defaults and are picked to keep
    # CI runs from going OOM.
    enforce_eager: bool = True
    max_model_len: int = 1024
    max_num_seqs: int = 256
    runner: RunnerOption = "auto"
    tensor_parallel_size: int = 1
    vllm_runner_kwargs: dict[str, Any] | None = None

    # Optional callable which gets a list of token IDs from the model tokenizer
    get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None = None
    # Optional stop strings, for models whose stop tokens are not special
    # tokens in the tokenizer.
    stop_str: list[str] | None = None

    # Options forwarded to the HF runner.
    hf_model_kwargs: dict[str, Any] | None = None
    # When True, explicitly pass the tokenizer's EOS token through.
    use_tokenizer_eos: bool = False
    auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
    patch_hf_runner: Callable[[HfRunner], HfRunner] | None = None

    # Post processors that, if defined, run on the outputs of the vLLM and
    # HF runner respectively (useful for sanitization, etc.).
    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
    # Consumes both post-processed outputs and checks that they agree.
    comparator: Callable[..., None] = check_logprobs_close

    # Expandable params: the full test matrix for one entry is
    # .models x all of the fields below; instances may override the defaults.
    max_tokens: int = 128
    num_logprobs: int = 5
    dtype: str = "auto"
    distributed_executor_backend: str | None = None
    # Only expanded in video tests
    num_video_frames: int | tuple[int] = 16
    needs_video_metadata: bool = False

    # Fixed image sizes / image size factors; most tests use
    # image_size_factors. Values from both fields are stacked and expanded
    # so each model sees every size factor / fixed size once per test
    # (like concatenating them inside one parametrize call).
    image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
    image_sizes: Iterable[Iterable[tuple[int, int]]] | None = None

    # Hack for updating a prompt to take into a local path; currently only used
    # for Qwen-VL, which requires encoding the image path / url into the prompt
    # for HF runner
    prompt_path_encoder: (
        Callable[[PosixPath, str, list[ImageAsset] | ImageTestAssets], str] | None
    ) = None  # noqa: E501

    # Allows configuring a test to run with custom inputs
    custom_test_opts: list[CustomTestOptions] | None = None

    marks: list[MarkDecorator] | None = None

    def get_non_parametrized_runner_kwargs(self):
        """Returns a dictionary of expandable kwargs for items that are used
        in all test types, which are NOT used when creating the parametrized
        test cases.
        """
        # Field names shared by every test type; pulled off self by name so
        # the mapping stays in one obvious place.
        shared_fields = (
            "enforce_eager",
            "max_model_len",
            "max_num_seqs",
            "runner",
            "tensor_parallel_size",
            "vllm_runner_kwargs",
            "hf_output_post_proc",
            "vllm_output_post_proc",
            "auto_cls",
            "use_tokenizer_eos",
            "comparator",
            "get_stop_token_ids",
            "hf_model_kwargs",
            "stop_str",
            "patch_hf_runner",
        )
        return {name: getattr(self, name) for name in shared_fields}
class ExpandableVLMTestArgs(NamedTuple):
    """The expanded kwargs which correspond to a single test case."""

    # One entry from VLMTestInfo.models plus its expanded parameters.
    model: str
    max_tokens: int
    num_logprobs: int
    dtype: str
    distributed_executor_backend: str | None
    # Sizes are used for everything except for custom input tests
    size_wrapper: ImageSizeWrapper | None = None
    # Video only
    num_video_frames: int | None = None
    needs_video_metadata: bool = False
    # Custom inputs only
    custom_test_opts: CustomTestOptions | None = None