Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,35 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_configure(config):
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF
Transformers accuracy issues.
"""
if not current_platform.is_rocm():
return
skip_patterns = ["test_granite_speech.py"]
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
# Skip disabling SDP for Granite Speech tests on ROCm
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
warnings.warn(
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
"to avoid HuggingFace Transformers accuracy issues",
UserWarning,
stacklevel=1,
)

View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import pytest
from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams
MODEL_NAME = "nvidia/audio-flamingo-3-hf"
def get_fixture_path(filename):
return os.path.join(
os.path.dirname(__file__), "../../fixtures/audioflamingo3", filename
)
@pytest.fixture(scope="module")
def llm():
# Check if the model is supported by the current transformers version
model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
model_info.check_transformers_version(on_fail="skip")
try:
llm = LLM(
model=MODEL_NAME,
trust_remote_code=True,
dtype="bfloat16",
enforce_eager=True,
limit_mm_per_prompt={"audio": 1},
)
return llm
except Exception as e:
pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
def test_single_generation(llm):
fixture_path = get_fixture_path("expected_results_single.json")
if not os.path.exists(fixture_path):
pytest.skip(f"Fixture not found: {fixture_path}")
with open(fixture_path) as f:
expected = json.load(f)
audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
messages = [
{
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": audio_url}},
{"type": "text", "text": "Transcribe the input speech."},
],
}
]
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
outputs = llm.chat(
messages=messages,
sampling_params=sampling_params,
)
generated_text = outputs[0].outputs[0].text.strip()
expected_text = expected["transcriptions"][0]
assert expected_text in generated_text or generated_text in expected_text
def test_batched_generation(llm):
fixture_path = get_fixture_path("expected_results_batched.json")
if not os.path.exists(fixture_path):
pytest.skip(f"Fixture not found: {fixture_path}")
with open(fixture_path) as f:
expected = json.load(f)
items = [
{
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
"question": "What is surprising about the relationship "
"between the barking and the music?",
"expected_idx": 0,
},
{
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
"question": (
"Why is the philosopher's name mentioned in the lyrics? "
"(A) To express a sense of nostalgia "
"(B) To indicate that language cannot express clearly, "
"satirizing the inversion of black and white in the world "
"(C) To add depth and complexity to the lyrics "
"(D) To showcase the wisdom and influence of the philosopher"
),
"expected_idx": 1,
},
]
conversations = []
for item in items:
messages = [
{
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
{"type": "text", "text": item["question"]},
],
}
]
conversations.append(messages)
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
outputs = llm.chat(
messages=conversations,
sampling_params=sampling_params,
)
for i, output in enumerate(outputs):
generated_text = output.outputs[0].text.strip()
expected_text = expected["transcriptions"][i]
assert expected_text in generated_text or generated_text in expected_text

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import pytest
from transformers import AutoModelForSpeechSeq2Seq
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|><|audio|>can you transcribe the speech into a written format?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501
def vllm_to_hf_output(
vllm_output: tuple[list[int], str, SampleLogprobs | None],
) -> tuple[list[int], str, SampleLogprobs | None]:
"""Sanitize hf output to be comparable with vllm output."""
output_ids, output_str, out_logprobs = vllm_output
hf_output_str = output_str + "<|end_of_text|>"
return output_ids, hf_output_str, out_logprobs
MODEL_NAME = "ibm-granite/granite-speech-3.3-2b"
# Audio lora co-exists directly in the model directory, but
# currently still needs to be passed directly to vLLM.
audio_lora_path = MODEL_NAME
models = [MODEL_NAME]
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
if current_platform.is_rocm():
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], PromptAudioInput]],
model: str,
*,
max_model_len: int,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
):
"""Inference result should be the same between hf and vllm.
All the audio fixtures for the test are from AUDIO_ASSETS.
For huggingface runner, we provide the audio as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"audio": 1},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
lora_request=lora_request,
)
for prompts, audios in inputs
]
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=[audios],
eos_token_id=eos_token_id,
)
for prompts, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
)
@pytest.mark.parametrize(
"max_model_len", [512] if current_platform.is_rocm() else [2048]
)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
hf_runner,
vllm_runner,
model: str,
audio_assets: AudioTestAssets,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
audio, sr = audio_assets[0].audio_and_sample_rate
# This model expects 16k sample rate, which our test audio
# already is; if this changes, it may break this test,
# so we check it directly
assert sr == 16000
run_test(
hf_runner,
vllm_runner,
[
([HF_AUDIO_PROMPT], [audio]),
],
model,
dtype=dtype,
max_model_len=max_model_len,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)

View File

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
def base_prompt(modalities_str: str) -> str:
return f"<|im_start|>user {modalities_str}\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n" # noqa: E501
INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
"""
This is a simple test to check if interleaved and non-interleaved prompts
give the same result.
"""
image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
images = [image_cherry, image_stop]
video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
inputs = [
(
[INTERLEAVED_PROMPT],
[images],
[video],
),
(
[NONINTERLEAVED_PROMPT],
[images],
[video],
),
]
with vllm_runner(
model,
runner="generate",
dtype=dtype,
limit_mm_per_prompt={"image": 2},
max_model_len=32768,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True,
) as vllm_model:
vllm_outputs_per_case = [
vllm_model.generate_greedy(
prompts, max_tokens, images=images, videos=videos
)
for prompts, images, videos in inputs
]
all_results = [output[0][1] for output in vllm_outputs_per_case]
outputs = [
(total_str, total_str.find("assistant\n") + len("assistant\n"))
for total_str in all_results
]
prompt_lengths = [prompt_len for _, prompt_len in outputs]
generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
interleaved_output_str, noninterleaved_output_str = generated_strs
# The two prompts are identical except for the order of modality tokens.
assert interleaved_prompt_len == noninterleaved_prompt_len
# The two generated strings should be different because of the
# interleaved modality tokens.
assert interleaved_output_str != noninterleaved_output_str

View File

@@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple
import pytest
from PIL.Image import Image
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.multimodal.utils import encode_image_base64
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
QUESTION = "What is the content of each image?"
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: str
image_data: list[Image]
stop_token_ids: list[int] | None = None
chat_template: str | None = None
sampling_params: SamplingParams | None = None
@pytest.mark.core_model
@pytest.mark.parametrize("question", [QUESTION])
def test_keye_vl(
image_assets,
question: str,
):
images = [asset.pil_image for asset in image_assets]
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
]
engine_args = EngineArgs(
model=MODEL_NAME,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
},
]
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
engine_args = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_args)
sampling_params = SamplingParams(
temperature=0.0, max_tokens=256, stop_token_ids=None
)
outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": {"image": images},
},
sampling_params=sampling_params,
)
print("-" * 50)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
assert len(generated_text) > 10, (
f"Generated text is too short: {generated_text}"
)
print("-" * 50)

View File

@@ -0,0 +1,723 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Create a reduced-layer version of the Maverick model for testing purposes.
This script creates a new model with fewer layers by:
1. Loading the original Maverick model configuration
2. Creating a reduced configuration
3. Generating compatible safetensors files with appropriate weights
4. Creating the necessary index files for vLLM compatibility
"""
import json
import shutil
from pathlib import Path
from typing import Any
import pytest
import torch
from safetensors.torch import save_file
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
from ....utils import multi_gpu_test
# Sample prompts for testing
PROMPTS: list[str] = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
def run_maverick_serving(model: str):
"""Test Llama-4-Maverick model with vLLM LLM class using CLI equivalent
options with reduced layers.
"""
try:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model=model,
max_model_len=2048,
enforce_eager=True,
tensor_parallel_size=8,
enable_expert_parallel=True,
trust_remote_code=True,
gpu_memory_utilization=0.4,
kv_cache_dtype="fp8",
)
outputs = llm.generate(PROMPTS, sampling_params)
# Print the outputs
print("\nGenerated Outputs:\n" + "-" * 60)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}")
print(f"Output: {generated_text!r}")
print("-" * 60)
except Exception as e:
print(f"Error initializing or running model: {e}")
raise
def get_rope_layers_config(model_path: str) -> list[int]:
"""
Get the interleaved RoPE configuration from HuggingFace config
Args:
model_path: Path to the local directory containing the reduced
Maverick model checkpoint
Returns:
List of 0 or 1 indicating whether each layer uses RoPE and local attn
0 indicates that RoPE is not used while 1 indicates that RoPE is used.
"""
config_path = Path(model_path) / "config.json"
model_config = json.loads(config_path.read_text())
text_config = model_config["text_config"]
no_rope_layers = text_config["no_rope_layers"]
print(f"Found no_rope_layers: {no_rope_layers}")
return no_rope_layers
def create_reduced_maverick_model(
original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
output_dir: str = "/tmp/reduced_maverick",
text_layers: int = 4,
num_experts: int = 4,
vision_layers: int = 2,
force_recreate: bool = False,
) -> str:
"""
Create a reduced-layer version of the Maverick model.
Args:
original_model_name: Name of the original Maverick model
output_dir: Directory to save the reduced model
text_layers: Number of text transformer layers
num_experts: Number of experts per layer
vision_layers: Number of vision transformer layers
force_recreate: Whether to recreate if output_dir already exists
Returns:
Path to the created reduced model directory
"""
print(
f"Creating reduced Maverick model with {text_layers} text layers and "
f"{vision_layers} vision layers..."
)
# Create output directory
output_path = Path(output_dir)
if output_path.exists():
if force_recreate:
shutil.rmtree(output_path)
else:
print(
f"Output directory {output_dir} already exists. "
"Use --force-recreate to overwrite."
)
return str(output_path)
output_path.mkdir(parents=True, exist_ok=True)
try:
print("Loading original model configuration...")
original_config = AutoConfig.from_pretrained(
original_model_name, trust_remote_code=True
)
print("Creating reduced configuration...")
reduced_config = create_reduced_config(
original_config, text_layers, num_experts, vision_layers
)
config_path = output_path / "config.json"
with open(config_path, "w") as f:
json.dump(reduced_config, f, indent=2)
print(f"Saved reduced config to {config_path}")
print("Copying tokenizer files...")
copy_tokenizer_files(original_model_name, output_path)
print("Creating reduced safetensors files...")
create_reduced_safetensors(original_config, reduced_config, output_path)
print("Creating preprocessor config...")
create_preprocessor_config(original_config, output_path)
try:
gen_config = GenerationConfig.from_pretrained(original_model_name)
gen_config.save_pretrained(output_path)
print("Copied generation config")
except Exception as e:
print(f"Could not copy generation config: {e}")
print(f"Successfully created reduced Maverick model at {output_path}")
return str(output_path)
except Exception as e:
print(f"Error creating reduced model: {e}")
# Clean up on failure
if output_path.exists():
shutil.rmtree(output_path)
raise
def create_reduced_config(
original_config: Any, text_layers: int, num_experts: int, vision_layers: int
) -> dict[str, Any]:
"""Create a reduced configuration based on the original."""
# Convert config to dictionary
config_dict = original_config.to_dict()
# Reduce text layers
if "text_config" in config_dict:
original_text_layers = config_dict["text_config"]["num_hidden_layers"]
config_dict["text_config"]["num_hidden_layers"] = text_layers
original_layer_types = config_dict["text_config"]["layer_types"]
config_dict["text_config"]["layer_types"] = original_layer_types[:text_layers]
print(f"Reduced text layers from {original_text_layers} to {text_layers}")
original_num_experts = config_dict["text_config"]["num_local_experts"]
config_dict["text_config"]["num_local_experts"] = num_experts
print(f"Reduced num experts from {original_num_experts} to {num_experts}")
hidden_dim_divisor = 4
original_hidden_size = config_dict["text_config"]["hidden_size"]
new_hidden_size = original_hidden_size // hidden_dim_divisor
config_dict["text_config"]["hidden_size"] = new_hidden_size
print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
original_head_dim = config_dict["text_config"]["head_dim"]
new_head_dim = original_head_dim // hidden_dim_divisor
config_dict["text_config"]["head_dim"] = new_head_dim
print(f"Reduced head dim from {original_head_dim} to {new_head_dim}")
# Reduce vision layers
if "vision_config" in config_dict:
original_vision_layers = config_dict["vision_config"]["num_hidden_layers"]
config_dict["vision_config"]["num_hidden_layers"] = vision_layers
print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
# Update model name to indicate it's a reduced version
config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
return config_dict
def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
"""Copy tokenizer files from the original model."""
try:
tokenizer = AutoTokenizer.from_pretrained(
original_model_name, trust_remote_code=True
)
tokenizer.save_pretrained(output_path)
print("Tokenizer files copied successfully")
except Exception as e:
print(f"Warning: Could not copy tokenizer files: {e}")
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
"""Create preprocessor_config.json for multimodal model."""
# Try to load the original preprocessor config
try:
processor = AutoProcessor.from_pretrained(
original_config._name_or_path
or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
trust_remote_code=True,
)
processor.save_pretrained(output_path)
print("Copied original preprocessor config")
return
except Exception as e:
print(f"Could not copy original preprocessor config: {e}")
raise
def create_reduced_safetensors(
original_config: Any, reduced_config: dict[str, Any], output_path: Path
) -> None:
"""Create safetensors files with weights for the reduced model."""
print("Generating synthetic weights for reduced model...")
text_config = reduced_config["text_config"]
vision_config = reduced_config["vision_config"]
weights = {}
print("Creating text model weights...")
weights.update(create_text_model_weights(text_config))
print("Creating vision model weights...")
weights.update(create_vision_model_weights(vision_config))
print("Creating shared model weights...")
weights.update(create_shared_weights(text_config, vision_config))
print("Saving weights to safetensors files...")
save_weights_to_safetensors(weights, output_path)
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the text model with MoE structure."""
weights = {}
vocab_size = text_config["vocab_size"]
hidden_size = text_config["hidden_size"]
intermediate_size = text_config["intermediate_size"]
intermediate_size_mlp = text_config["intermediate_size_mlp"]
num_layers = text_config["num_hidden_layers"]
num_attention_heads = text_config["num_attention_heads"]
num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
# MoE specific parameters
num_experts = text_config.get("num_local_experts")
assert num_experts is not None, "num_local_experts must be specified for MoE"
head_dim = hidden_size // num_attention_heads
# Embedding layers
weights["language_model.model.embed_tokens.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.float16
)
# Transformer layers
for layer_idx in range(num_layers):
layer_prefix = f"language_model.model.layers.{layer_idx}"
print(f"Creating weights for layer {layer_prefix}...")
# Self-attention weights (separate q, k, v projections)
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
)
print("Self-attention weights created.")
# Feed-forward weights - MoE pattern based on interleave_moe_layer_step
# For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
# 0,2,4,... are dense
interleave_step = text_config.get("interleave_moe_layer_step", 1)
is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
if is_moe_layer:
# MoE layer structure
# 1. Router weights
weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
num_experts, hidden_size, dtype=torch.float16
)
# 2. Individual expert weights (not fused)
for expert_idx in range(num_experts):
expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16
)
# Expert weight scales (FP8 quantization)
weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
hidden_size, 1, dtype=torch.bfloat16
)
# 3. Shared expert weights
shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16
)
print(f"MoE feed-forward weights created for layer {layer_idx}.")
else:
# Dense layer structure
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
)
print(f"Dense feed-forward weights created for layer {layer_idx}.")
# Layer norms
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
print("Layer norms created.")
# Final layer norm and output projection
weights["language_model.model.norm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
weights["language_model.lm_head.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.bfloat16
)
return weights
def create_vision_model_weights(
vision_config: dict[str, Any],
) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the vision model."""
weights = {}
hidden_size = vision_config["hidden_size"]
intermediate_size = vision_config["intermediate_size"]
num_layers = vision_config["num_hidden_layers"]
# Vision transformer layers
for layer_idx in range(num_layers):
layer_prefix = f"vision_model.model.layers.{layer_idx}"
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
intermediate_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16
)
return weights
def create_shared_weights(
text_config: dict[str, Any], vision_config: dict[str, Any]
) -> dict[str, torch.Tensor]:
"""Create weights for shared components (vision-language connector)"""
weights = {}
text_hidden_size = text_config["hidden_size"]
projector_input_dim = vision_config["projector_input_dim"]
# Vision-language connector (projects vision features to text space)
weights["multi_modal_projector.linear_1.weight"] = torch.randn(
text_hidden_size, projector_input_dim, dtype=torch.bfloat16
)
return weights
def save_weights_to_safetensors(
weights: dict[str, torch.Tensor], output_path: Path
) -> None:
"""Save weights to safetensors files and create index."""
# Determine how to shard the weights
max_shard_size = 5 * 1024 * 1024 * 1024 # 5GB per shard
# Calculate sizes and create shards
shards = []
current_shard: dict[str, torch.Tensor] = {}
current_size = 0
for name, tensor in weights.items():
tensor_size = tensor.numel() * tensor.element_size()
if current_size + tensor_size > max_shard_size and current_shard:
shards.append(current_shard)
current_shard = {}
current_size = 0
current_shard[name] = tensor
current_size += tensor_size
if current_shard:
shards.append(current_shard)
# Save shards and create index
weight_map = {}
if len(shards) == 1:
# Single file
filename = "model.safetensors"
save_file(shards[0], output_path / filename)
weight_map = {name: filename for name in shards[0]}
print(f"Saved weights to single file: {filename}")
else:
# Multiple shards
for i, shard in enumerate(shards):
filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
save_file(shard, output_path / filename)
for name in shard:
weight_map[name] = filename
print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
# Create index file
index_data = {
"metadata": {
"total_size": sum(
tensor.numel() * tensor.element_size() for tensor in weights.values()
)
},
"weight_map": weight_map,
}
index_path = output_path / "model.safetensors.index.json"
with open(index_path, "w") as f:
json.dump(index_data, f, indent=2)
print(f"Created index file: {index_path}")
print(
f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
)
def check_attention_spec_interleaved_rope(
llm: LLM,
num_attention_layers: int,
num_ranks: int,
rope_layers: list[int],
):
"""Check that the attention spec is correct."""
assert isinstance(llm.llm_engine.model_executor, Executor)
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
for rank in range(num_ranks):
kv_cache_specs = kv_cache_specs_per_rank[rank]
assert len(kv_cache_specs.keys()) == num_attention_layers
for i in range(num_attention_layers):
if rope_layers[i] == 0:
expected_spec = FullAttentionSpec
else:
expected_spec = ChunkedLocalAttentionSpec
assert isinstance(
kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
expected_spec,
)
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
"""Test the created reduced model with vLLM."""
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
if should_profile:
llm.start_profile()
outputs = llm.generate(PROMPTS, sampling_params)
if should_profile:
llm.stop_profile()
print("Test generation successful!")
for output in outputs:
print(f"Prompt: {output.prompt}")
print(f"Output: {output.outputs[0].text}")
print("-" * 40)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"original_model_name,text_layers,num_experts,vision_layers,",
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
monkeypatch,
original_model_name: str,
text_layers: int,
num_experts: int,
vision_layers: int,
enforce_eager: bool,
tp: int,
ep: bool,
output_dir: str = "/tmp/reduced_maverick",
force_recreate: bool = True,
profile: bool = False,
) -> None:
# Disable multiprocessing allows us to access model executor from LLM engine
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
model_path = create_reduced_maverick_model(
original_model_name=original_model_name,
output_dir=output_dir,
text_layers=text_layers,
num_experts=num_experts,
vision_layers=vision_layers,
force_recreate=force_recreate,
)
print(f"\nReduced model created successfully at: {model_path}")
rope_layers = get_rope_layers_config(model_path)
llm = LLM(
model=model_path,
trust_remote_code=True,
max_model_len=512, # Small context for testing
gpu_memory_utilization=0.3, # Conservative memory usage
enforce_eager=enforce_eager,
tensor_parallel_size=tp,
enable_expert_parallel=ep,
)
check_attention_spec_interleaved_rope(
llm,
text_layers,
tp,
rope_layers,
)
print(f"\nTesting reduced model at {model_path}...")
run_reduced_model(llm=llm, should_profile=profile)
def main():
"""Main function to create and test the reduced model."""
import argparse
parser = argparse.ArgumentParser(
description="Create a reduced-layer Maverick model"
)
parser.add_argument(
"--output-dir",
default="/tmp/reduced_maverick",
help="Output directory for the reduced model",
)
parser.add_argument(
"--text-layers",
type=int,
default=4,
help="Number of text transformer layers",
)
parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
parser.add_argument(
"--vision-layers",
type=int,
default=2,
help="Number of vision transformer layers",
)
parser.add_argument(
"--force-recreate",
action="store_true",
help="Force recreation if output directory exists",
)
parser.add_argument(
"--test", action="store_true", help="Test the created model with vLLM"
)
parser.add_argument(
"--profile", action="store_true", help="Profile the created model with vLLM"
)
parser.add_argument(
"--test-original",
action="store_true",
help="Test the original model with vLLM",
)
parser.add_argument(
"--original-model",
default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
help="Original model name to base the reduction on",
)
args = parser.parse_args()
if args.test:
test_dummy_maverick(
original_model_name=args.original_model,
output_dir=args.output_dir,
text_layers=args.text_layers,
num_experts=args.num_experts,
vision_layers=args.vision_layers,
force_recreate=args.force_recreate,
tp=2,
ep=True,
enforce_eager=True,
profile=args.profile,
)
if args.test_original:
run_maverick_serving(args.original_model)
if __name__ == "__main__":
exit(main())

View File

@@ -0,0 +1,180 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
from typing import Any, NamedTuple
import pytest
from huggingface_hub import hf_hub_download
from pytest import MarkDecorator
from transformers import AutoModelForImageTextToText
from tests.quantization.utils import is_quant_method_supported
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.utils.torch_utils import set_default_torch_num_threads
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
from ...utils import check_logprobs_close
class GGUFMMTestConfig(NamedTuple):
original_model: str
gguf_repo: str
gguf_backbone: str
gguf_mmproj: str
prompt: list[str]
image_names: list[str] # Store names, load PIL images at runtime
max_model_len: int = 4096
marks: list[MarkDecorator] = []
mm_processor_kwargs: dict[str, Any] = {}
@property
def gguf_model(self):
hf_hub_download(self.gguf_repo, filename=self.gguf_mmproj)
return hf_hub_download(self.gguf_repo, filename=self.gguf_backbone)
# Common prompts aligned with test_common.py "gemma3" entry format
_GEMMA3_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": (
"<bos><start_of_turn>user\n"
"<start_of_image>What's the content in the center of the image?"
"<end_of_turn>\n<start_of_turn>model\n"
),
"cherry_blossom": (
"<bos><start_of_turn>user\n"
"<start_of_image>What is the season?"
"<end_of_turn>\n<start_of_turn>model\n"
),
}
)
# Image asset names - load at runtime to avoid pickle issues with subprocess
_GEMMA3_IMAGE_NAMES = ["stop_sign", "cherry_blossom"]
# Regular multimodal (no pan-and-scan) - uses QAT Q4_0 GGUF
GEMMA3_CONFIG = GGUFMMTestConfig(
original_model="google/gemma-3-4b-it",
gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf",
gguf_backbone="gemma-3-4b-it-q4_0.gguf",
gguf_mmproj="mmproj-model-f16-4B.gguf",
prompt=_GEMMA3_PROMPTS,
image_names=_GEMMA3_IMAGE_NAMES,
max_model_len=4096,
marks=[pytest.mark.core_model],
mm_processor_kwargs={},
)
# Pan-and-scan multimodal - uses unquantized BF16 GGUF
GEMMA3_CONFIG_PAN_AND_SCAN = GGUFMMTestConfig(
original_model="google/gemma-3-4b-it",
gguf_repo="unsloth/gemma-3-4b-it-GGUF",
gguf_backbone="gemma-3-4b-it-BF16.gguf",
gguf_mmproj="mmproj-BF16.gguf",
prompt=_GEMMA3_PROMPTS,
image_names=_GEMMA3_IMAGE_NAMES,
max_model_len=4096,
marks=[pytest.mark.core_model],
mm_processor_kwargs={"do_pan_and_scan": True},
)
MODELS_TO_TEST = [GEMMA3_CONFIG, GEMMA3_CONFIG_PAN_AND_SCAN]
def run_multimodal_gguf_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
model: GGUFMMTestConfig,
dtype: str,
max_tokens: int,
num_logprobs: int,
):
# Load images at runtime (inside subprocess) to avoid pickle issues
images = [ImageAsset(name).pil_image for name in model.image_names]
size_factors = [0.25, 0.5, 1.0]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
)
for image, prompt in zip(images, model.prompt)
]
# NOTE: Run vLLM first to avoid CUDA init issues with multiprocessing fork.
# Run GGUF model via vLLM.
with (
set_default_torch_num_threads(1),
vllm_runner(
model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=model.max_model_len,
mm_processor_kwargs=model.mm_processor_kwargs,
) as gguf_model,
):
gguf_outputs_per_case = [
gguf_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
)
for prompts, images in inputs_per_image
]
# Then run HfRunner for HuggingFace baseline comparison.
with hf_runner(
model.original_model,
dtype=dtype,
auto_cls=AutoModelForImageTextToText,
) as hf_model:
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
)
for prompts, images in inputs_per_image
]
for hf_outputs, gguf_outputs in zip(hf_outputs_per_case, gguf_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=gguf_outputs,
name_0="hf",
name_1="gguf",
)
@pytest.mark.skipif(
not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
"model",
[
pytest.param(test_config, marks=test_config.marks)
for test_config in MODELS_TO_TEST
],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
def test_gemma3_mm_gguf(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
model: GGUFMMTestConfig,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
run_multimodal_gguf_test(
hf_runner, vllm_runner, model, dtype, max_tokens, num_logprobs
)

View File

@@ -0,0 +1,317 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Sequence
import librosa
import pytest
import regex as re
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from vllm.assets.image import ImageAsset
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode, rescale_image_size
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(
model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
def vllm_to_hf_output(
vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
):
"""Sanitize vllm output to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
assert output_str_without_image[0] == " "
output_str_without_image = output_str_without_image[1:]
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
tokenizer = AutoTokenizer.from_pretrained(model)
hf_output_ids = tokenizer.encode(output_str_without_image)
assert hf_output_ids[0] == 1
hf_output_ids = hf_output_ids[1:]
return hf_output_ids, hf_output_str, out_logprobs
target_dtype = "half"
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
model: str,
*,
max_model_len: int,
dtype: str,
max_tokens: int,
num_logprobs: int,
mm_limit: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request,
)
for prompts, images, audios in inputs
]
# This error occurs inside `get_peft_model`
# FIXME: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/75
pytest.skip("HF impl is not compatible with current transformers")
hf_model_kwargs = {"_attn_implementation": "sdpa"}
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
def patch_hf_processor(
*args, text="", images=None, audio=None, sampling_rate=None, **kwargs
):
audios = None
if audio is not None and sampling_rate is not None:
audios = [(audio, sampling_rate)]
return hf_processor(
*args, text=text, images=images, audios=audios, **kwargs
)
hf_model.processor = patch_hf_processor
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
num_logits_to_keep=0,
)
for prompts, images, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
run_test(
hf_runner,
vllm_runner,
inputs_per_image,
model,
dtype=dtype,
max_model_len=max_model_len,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
# [],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
(
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
None,
),
]
run_test(
hf_runner,
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_model_len=max_model_len,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=2,
tensor_parallel_size=1,
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(
hf_runner,
vllm_runner,
model,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
# use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=None)
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
inputs_vision_speech = [
(
["<|user|><|image_1|><|audio_1|><|end|><|assistant|>"],
[image],
[audio],
),
]
run_test(
hf_runner,
vllm_runner,
inputs_vision_speech,
model,
dtype=dtype,
max_model_len=max_model_len,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)

View File

@@ -0,0 +1,211 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from dataclasses import asdict
from typing import TYPE_CHECKING, Any
import pytest
from mistral_common.multimodal import download_image
from mistral_common.protocol.instruct.chunk import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
from transformers import AutoProcessor
from vllm import SamplingParams, TextPrompt, TokensPrompt
from vllm.logprobs import Logprob, SampleLogprobs
from vllm.multimodal import MultiModalDataBuiltins
from vllm.platforms import current_platform
from ....utils import VLLM_PATH, large_gpu_test
from ...utils import check_logprobs_close
if TYPE_CHECKING:
from _typeshed import StrPath
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
IMG_URLS = [
"237-400x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
"231-200x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
"27-500x500.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
"17-150x600.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
]
PROMPT = "Describe each image in one short sentence."
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
return [
{
"role": "user",
"content": [
{
"type": "text",
"text": PROMPT,
}
]
+ [{"type": "image_url", "image_url": {"url": url}} for url in urls],
}
]
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
return [
{
"role": "user",
"content": [
{
"type": "text",
"content": PROMPT,
},
*({"type": "image", "image": download_image(url)} for url in urls),
],
}
]
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
msg = _create_msg_format(urls)
tokenizer = MistralTokenizer.from_model("pixtral")
request = ChatCompletionRequest(messages=msg) # type: ignore[type-var]
tokenized = tokenizer.encode_chat_completion(request)
engine_inputs = TokensPrompt(prompt_token_ids=tokenized.tokens)
images = []
for chunk in request.messages[0].content:
if isinstance(chunk, ImageURLChunk):
images.append(image_from_chunk(chunk))
mm_data = MultiModalDataBuiltins(image=images)
engine_inputs["multi_modal_data"] = mm_data
return engine_inputs
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
msg = _create_msg_format_hf(urls)
tokenizer = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
prompt = tokenizer.apply_chat_template(msg)
images = []
for chunk in msg[0]["content"]:
if chunk["type"] == "image":
images.append(chunk["image"])
mm_data = MultiModalDataBuiltins(image=images)
engine_inputs = TextPrompt(prompt=prompt, multi_modal_data=mm_data)
return engine_inputs
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4)
MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = {
PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}
OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]]
# For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs,
filename: "StrPath",
) -> None:
json_data = [
(
tokens,
text,
[
{k: asdict(v) for k, v in token_logprobs.items()}
for token_logprobs in (logprobs or [])
],
)
for tokens, text, logprobs in outputs
]
with open(filename, "w") as f:
json.dump(json_data, f)
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f:
json_data = json.load(f)
return [
(
tokens,
text,
[
{int(k): Logprob(**v) for k, v in token_logprobs.items()}
for token_logprobs in logprobs
],
)
for tokens, text, logprobs in json_data
]
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
) -> None:
if (
model == MISTRAL_SMALL_3_1_ID
and max_model_len == 65536
and current_platform.is_rocm()
):
pytest.skip(
"OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
)
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = []
urls_all = [local_asset_server.url_for(u) for u in IMG_URLS]
msgs = [
_create_msg_format(urls_all[:1]),
_create_msg_format(urls_all[:2]),
_create_msg_format(urls_all),
]
for msg in msgs:
output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
outputs.extend(output)
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
# Remove last `None` prompt_logprobs to compare with fixture
for i in range(len(logprobs)):
assert logprobs[i][-1] is None
logprobs[i] = logprobs[i][:-1]
check_logprobs_close(
outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output",
)

View File

@@ -0,0 +1,148 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.multimodal.video import sample_frames_from_video
from ....conftest import VIDEO_ASSETS
models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
target_dtype = "bfloat16"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
def qwen2_5_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
{
"baby_reading": qwen2_5_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
}
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_functionality(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
use_bytecode_hook: bool,
monkeypatch,
) -> None:
"""Test EVS (Efficient Video Sampling) functionality with different
pruning rates.
"""
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
# Sample frames from video assets
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
prompts = [VIDEO_PROMPTS[0]]
videos = [sampled_vids[0]]
# Initialize model with EVS configuration
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
dtype=dtype,
limit_mm_per_prompt={"video": 1},
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got a response
assert len(outputs) == 1
output_ids, output_text = outputs[0]
# Ensure we got some output
assert len(output_ids) > 0
assert len(output_text) > 0
# Ensure the output is a string
assert isinstance(output_text, str)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_batched_videos(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
use_bytecode_hook: bool,
monkeypatch,
) -> None:
"""Test EVS functionality with batched videos.
This test validates that:
1. The model handles batched video inputs correctly with EVS
2. Both pruning configurations work with multiple videos
3. The model doesn't crash when processing multiple videos simultaneously
"""
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
# Sample frames from video assets
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
# Test batched videos
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
videos = [sampled_vids[0], sampled_vids[0]] # Use same video twice for testing
# Initialize model with EVS configuration
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"video": 2},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got responses for both videos
assert len(outputs) == 2
for output_ids, output_text in outputs:
# Ensure we got some output for each video
assert len(output_ids) > 0
assert len(output_text) > 0
# Ensure the output is a string
assert isinstance(output_text, str)

View File

@@ -0,0 +1,473 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, TypedDict
import numpy.typing as npt
import pytest
import torch
from PIL import Image
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
from ....conftest import (
IMAGE_ASSETS,
VIDEO_ASSETS,
PromptImageInput,
PromptVideoInput,
VllmRunner,
)
from ...utils import check_logprobs_close
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
"""`LLM.apply_model` requires pickling a function."""
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
models = ["Qwen/Qwen2-VL-2B-Instruct"]
target_dtype = "half"
IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
MODEL_HIDDEN_SIZE = 1536
def qwen2_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the biggest text's content in this image?",
),
"cherry_blossom": qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the season shown in this image? ",
"Reply with a short sentence (no more than 20 words)",
),
}
)
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
{
"baby_reading": qwen2_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
}
)
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
IMAGE_PLACEHOLDER,
"Describe these two images separately. ",
"For each image, reply with a short sentence ",
"(no more than 10 words).",
)
class Qwen2VLPromptImageEmbeddingInput(TypedDict):
image_embeds: torch.Tensor
image_grid_thw: torch.Tensor
class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
video_embeds: torch.Tensor
video_grid_thw: torch.Tensor
def batch_make_image_embeddings(
image_batches: list[Image.Image | list[Image.Image]],
processor,
llm: VllmRunner,
) -> list[Qwen2VLPromptImageEmbeddingInput]:
"""batched image embeddings for Qwen2-VL
This will infer all images' embeddings in a single batch,
and split the result according to input batches.
image_batches:
- Single-image batches: `list[Image.Image]`
- Multiple-image batches: `list[list[Image.Image]]]`
returns: `list[Qwen2VLPromptImageEmbeddingInput]`
"""
image_batches_: list[Any] = image_batches[:]
# convert single-image batches to multiple-image batches
for idx in range(len(image_batches_)):
if not isinstance(image_batches_[idx], list):
image_batches_[idx] = [image_batches_[idx]]
assert isinstance(image_batches_[idx], list)
# append all images into a list (as a batch)
images: list[Image.Image] = []
for image_batch in image_batches_:
images += image_batch
# image to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor.preprocess(
images=images, return_tensors="pt"
).data
pixel_values = preprocess_result["pixel_values"]
image_grid_thw = preprocess_result["image_grid_thw"]
# pixel values to embeddings & grid_thws
def get_image_embeds(model):
with torch.no_grad():
visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
return visual(pixel_values_on_device, grid_thw=image_grid_thw).cpu()
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
# split into original batches
result: list[Qwen2VLPromptImageEmbeddingInput] = []
image_counter = 0
embed_counter = 0
for image_batch in image_batches_:
cur_batch_image_count = len(image_batch)
merge_size = image_processor.merge_size
cur_batch_embed_len = sum(
grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in image_grid_thw[
image_counter : image_counter + cur_batch_image_count
]
)
result.append(
{
"image_embeds": image_embeds[
embed_counter : embed_counter + cur_batch_embed_len
],
"image_grid_thw": image_grid_thw[
image_counter : image_counter + cur_batch_image_count
],
}
)
embed_counter += cur_batch_embed_len
image_counter += cur_batch_image_count
# ensure we don't lose any images or embeddings
assert embed_counter == image_embeds.size(0)
assert image_counter == image_grid_thw.size(0)
assert len(image_batches) == len(result)
return result
def batch_make_video_embeddings(
video_batches: PromptVideoInput, processor, llm: VllmRunner
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
"""batched video embeddings for Qwen2-VL
A NDArray represents a single video's all frames.
This will infer all videos' embeddings in a single batch,
and split the result according to input batches.
video_batches:
- Single-video batches: `list[NDArray]`
- Multiple-video batches: `list[list[NDArray]]`
"""
video_batches_: list[Any] = video_batches[:]
for idx in range(len(video_batches_)):
if not isinstance(video_batches_[idx], list):
single_video_batch: list[npt.NDArray] = [video_batches_[idx]]
video_batches_[idx] = single_video_batch
assert isinstance(video_batches_[idx], list)
# append all videos into a list (as a batch)
videos: list[npt.NDArray] = []
for video_batch in video_batches_:
videos += video_batch
# video to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor.preprocess(
images=None, videos=videos, return_tensors="pt"
).data
pixel_values = preprocess_result["pixel_values_videos"]
video_grid_thw = preprocess_result["video_grid_thw"]
# pixel values to embeddings & grid_thws
def get_image_embeds(model):
with torch.no_grad():
visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
return visual(pixel_values_on_device, grid_thw=video_grid_thw).cpu()
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
# split into original batches
result: list[Qwen2VLPromptVideoEmbeddingInput] = []
video_counter = 0
embed_counter = 0
for video_batch in video_batches_:
cur_batch_video_count = len(video_batch)
merge_size = image_processor.merge_size
cur_batch_embed_len = sum(
grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in video_grid_thw[
video_counter : video_counter + cur_batch_video_count
]
)
result.append(
{
"video_embeds": video_embeds[
embed_counter : embed_counter + cur_batch_embed_len
],
"video_grid_thw": video_grid_thw[
video_counter : video_counter + cur_batch_video_count
],
}
)
embed_counter += cur_batch_embed_len
video_counter += cur_batch_video_count
# ensure we don't lose any videos or embeddings
assert embed_counter == video_embeds.size(0)
assert video_counter == video_grid_thw.size(0)
assert len(video_batches) == len(result)
return result
def run_embedding_input_test(
vllm_runner: type[VllmRunner],
inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]],
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
mm_limit: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
):
"""Inference result should be the same between
original image/video input and image/video embeddings input.
"""
from transformers import AutoProcessor # noqa: F401
processor = AutoProcessor.from_pretrained(model)
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=3,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
default_torch_num_threads=1,
enable_mm_embeds=True,
) as vllm_model:
outputs_per_case_for_original_input = [
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images or None,
videos=videos or None,
)
for prompts, images, videos in inputs
]
outputs_per_case_for_embeddings_input = [
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=batch_make_image_embeddings(images, processor, vllm_model)
if images
else None,
videos=batch_make_video_embeddings(videos, processor, vllm_model)
if videos
else None,
)
for prompts, images, videos in inputs
]
for outputs_for_original_input, outputs_for_embeddings_input in zip(
outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
):
check_logprobs_close(
outputs_0_lst=outputs_for_original_input,
outputs_1_lst=outputs_for_embeddings_input,
name_0="original_input",
name_1="embeddings_input",
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[0.5],
# Single-scale, batched
[0.5, 0.5],
# Multi-scale
[0.25, 0.5, 0.5],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_image_embeddings_input(
vllm_runner,
image_assets,
model,
size_factors,
dtype,
max_tokens,
num_logprobs,
monkeypatch,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
[],
)
for image, prompt in zip(images, IMAGE_PROMPTS)
]
run_embedding_input_test(
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
[],
# Single-scale
[0.5],
# Single-scale, batched
[0.5, 0.5],
# Multi-scale
[0.25, 0.5, 0.5],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_multiple_image_embeddings_input(
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[MULTIIMAGE_PROMPT for _ in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
[],
)
]
run_embedding_input_test(
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=2,
tensor_parallel_size=1,
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[0.5],
# Single-scale, batched
[0.5, 0.5],
# Multi-scale
[0.25, 0.25, 0.5],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_video_embeddings_input(
vllm_runner,
video_assets,
model,
size_factors,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
num_frames = 4
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[prompt for _ in size_factors],
[],
[rescale_video_size(video, factor) for factor in size_factors],
)
for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
]
run_embedding_input_test(
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)

View File

@@ -0,0 +1,185 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from typing import Any
import numpy as np
import pytest
import pytest_asyncio
from transformers import AutoTokenizer
from ....conftest import AUDIO_ASSETS, AudioTestAssets, VllmRunner
from ....utils import RemoteOpenAIServer
from ...registry import HF_EXAMPLE_MODELS
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
{
"mary_had_lamb": "Transcribe this into English.",
"winning_call": "What is happening in this audio clip?",
}
)
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
AudioTuple = tuple[np.ndarray, int]
VLLM_PLACEHOLDER = "<|audio|>"
HF_PLACEHOLDER = "<|audio|>"
CHUNKED_PREFILL_KWARGS = {
"enable_chunked_prefill": True,
"max_num_seqs": 2,
# Use a very small limit to exercise chunked prefill.
"max_num_batched_tokens": 16,
}
def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
"""Convert kwargs to CLI args."""
args = []
for key, value in params_kwargs.items():
if isinstance(value, bool):
if value:
args.append(f"--{key.replace('_', '-')}")
else:
args.append(f"--{key.replace('_', '-')}={value}")
return args
@pytest.fixture(
params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
]
)
def server(request, audio_assets: AudioTestAssets):
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": len(audio_assets)}),
"--trust-remote-code",
] + params_kwargs_to_cli_args(request.param)
with RemoteOpenAIServer(
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
def _get_prompt(audio_count, question, placeholder):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
placeholder = f"{placeholder}\n" * audio_count
return tokenizer.apply_chat_template(
[{"role": "user", "content": f"{placeholder}{question}"}],
tokenize=False,
add_generation_prompt=True,
)
def run_multi_audio_test(
vllm_runner: type[VllmRunner],
prompts_and_audios: list[tuple[str, list[AudioTuple]]],
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
**kwargs,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(
model,
dtype=dtype,
enforce_eager=True,
limit_mm_per_prompt={
"audio": max((len(audio) for _, audio in prompts_and_audios))
},
**kwargs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
[prompt for prompt, _ in prompts_and_audios],
max_tokens,
num_logprobs=num_logprobs,
audios=[audios for _, audios in prompts_and_audios],
)
# The HuggingFace model doesn't support multiple audios yet, so
# just assert that some tokens were generated.
assert all(tokens for tokens, *_ in vllm_outputs)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
"vllm_kwargs",
[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
],
)
def test_models_with_multiple_audios(
vllm_runner,
audio_assets: AudioTestAssets,
dtype: str,
max_tokens: int,
num_logprobs: int,
vllm_kwargs: dict,
) -> None:
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
**vllm_kwargs,
)
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
"""Exercises online serving with/without chunked prefill enabled."""
messages = [
{
"role": "user",
"content": [
*[
{"type": "audio_url", "audio_url": {"url": audio.url}}
for audio in audio_assets
],
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?", # noqa: E501
},
],
}
]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME, messages=messages, max_tokens=10
)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"

View File

@@ -0,0 +1,435 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Consolidated test for ViT attention backend functionality across multiple models.
This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend.
"""
from dataclasses import asdict
from typing import Any
import pytest
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform
from ....utils import create_new_process_for_each_test
from ...utils import dummy_hf_overrides
# Dots.OCR prompt from official repository
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
# ruff: noqa: E501
DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
1. Bbox format: [x1, y1, x2, y2]
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
3. Text Extraction & Formatting Rules:
- Picture: For the 'Picture' category, the text field should be omitted.
- Formula: Format its text as LaTeX.
- Table: Format its text as HTML.
- All Others (Text, Title, etc.): Format their text as Markdown.
4. Constraints:
- The output text must be the original text from the image, with no translation.
- All layout elements must be sorted according to human reading order.
5. Final Output: The entire output must be a single JSON object.
"""
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
# Model configurations
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"dots_ocr": {
"model_name": "rednote-hilab/dots.ocr",
"interface": "llm_chat",
"max_model_len": 32768,
"max_num_seqs": 1,
"limit_mm_per_prompt": {"image": 1},
"sampling_params": {
"temperature": 0.1,
"max_tokens": 16384,
"top_p": 0.9,
"stop_token_ids": None,
},
"use_specific_image": "stop_sign",
"prompt_builder": "build_dots_ocr_prompt",
"output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
},
"ernie45_vl": {
"model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
"interface": "llm_generate",
"max_model_len": 16384,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"use_processor": True,
"question": "What is the content of each image?",
},
"glm4_1v": {
"model_name": "zai-org/GLM-4.1V-9B-Thinking",
"interface": "llm_generate",
"max_model_len": 32768,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"use_processor": True,
"question": "What is the content of each image?",
},
"keye_vl": {
"model_name": "Kwai-Keye/Keye-VL-8B-Preview",
"interface": "llm_generate",
"max_model_len": 8192,
"max_num_seqs": 5,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"supported_backends": {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
},
"use_processor": True,
"question": "What is the content of each image?",
},
"ovis2_5": {
"model_name": "AIDC-AI/Ovis2.5-2B",
"interface": "llm_generate",
"max_model_len": 8192,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"prompt_builder": "build_ovis_prompt",
"question": "What is the content of each image?",
},
"qwen2_5_vl": {
"model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
"interface": "vllm_runner",
"media_type": "video",
"max_model_len": 4000,
"max_num_seqs": 1,
"limit_mm_per_prompt": {"video": 1},
"sampling_params": {
"max_tokens": 128,
},
"runner_kwargs": {
"runner": "generate",
"dtype": "bfloat16",
},
"video_params": {
"num_frames": 16,
"pruning_rates": [0.0, 0.75],
},
},
"qwen2_5_omni": {
"model_name": "Qwen/Qwen2.5-Omni-3B",
"interface": "llm_generate",
"max_model_len": 32768,
"max_num_seqs": 2,
"limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
"sampling_params": {
"temperature": 0.6,
"top_p": 0.95,
"top_k": 20,
"max_tokens": 16384,
},
"use_processor": True,
"question": "What is the content of each image?",
},
"qwen3_omni": {
"model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"interface": "llm_generate",
"max_model_len": 32768,
"max_num_seqs": 2,
"limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
"sampling_params": {
"temperature": 0.6,
"top_p": 0.95,
"top_k": 20,
"max_tokens": 16384,
},
"use_processor": True,
"question": "What is the content of each image?",
},
}
# Prompt builder functions
def build_dots_ocr_prompt(images, config):
"""Build Dots.OCR specific prompt with OCR instructions."""
# Use only stop_sign image for Dots.OCR
image = images[0] # Already filtered to stop_sign
image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
messages = [
{
"role": "user",
"content": [
*placeholders,
{
"type": "text",
"text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
},
],
},
]
return messages
def build_processor_prompt(images, config):
"""Build prompt using AutoProcessor.apply_chat_template()."""
processor = AutoProcessor.from_pretrained(
config["model_name"], trust_remote_code=True
)
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": config["question"]},
],
},
]
return processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
def build_ovis_prompt(images, config):
"""Build Ovis2.5 specific prompt with custom format."""
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
)
return (
f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
"<|im_start|>assistant\n"
)
def build_qwen2_5_video_prompt():
"""Build Qwen2.5-VL video prompt with EVS placeholder."""
return (
f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n{VIDEO_PLACEHOLDER}"
"Describe this video with a short sentence (no more than 20 words)"
"<|im_end|><|im_start|>assistant\n"
)
# Handler functions
def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
"""Standard LLM.generate() interface handler."""
images = [asset.pil_image for asset in image_assets]
# Build prompt
if config.get("use_processor"):
prompt = build_processor_prompt(images, config)
else:
prompt_builder_name = config.get("prompt_builder", "build_ovis_prompt")
prompt_builder = globals()[prompt_builder_name]
prompt = prompt_builder(images, config)
# Determine limit_mm_per_prompt
limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})
# Create engine
engine_args = EngineArgs(
model=config["model_name"],
trust_remote_code=True,
max_model_len=config["max_model_len"],
max_num_seqs=config["max_num_seqs"],
limit_mm_per_prompt=limit_mm_per_prompt,
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
)
engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)
# Generate
sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": {"image": images},
},
sampling_params=sampling_params,
)
# Validate
for o in outputs:
generated_text = o.outputs[0].text
validator = config.get("output_validator", lambda x: len(x) > 10)
assert validator(generated_text), (
f"Validation failed for {config['model_name']}: {generated_text}"
)
def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
"""LLM.chat() interface handler for Dots.OCR."""
# Filter to stop_sign image only
stop_sign_image = [
asset.pil_image for asset in image_assets if asset.name == "stop_sign"
][0]
# Build messages
messages = build_dots_ocr_prompt([stop_sign_image], config)
# Create engine
engine_args = EngineArgs(
model=config["model_name"],
trust_remote_code=True,
max_model_len=config["max_model_len"],
max_num_seqs=config["max_num_seqs"],
limit_mm_per_prompt=config["limit_mm_per_prompt"],
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
)
engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)
# Generate using chat
sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
# Validate
for o in outputs:
generated_text = o.outputs[0].text
validator = config.get("output_validator", lambda x: len(x) > 10)
assert validator(generated_text), (
f"Validation failed for {config['model_name']}: {generated_text}"
)
def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
"""Video test with EVS (Efficient Video Sampling) handler."""
for pruning_rate in config["video_params"]["pruning_rates"]:
num_frames = config["video_params"]["num_frames"]
# Sample frames from video
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
# Build prompt and prepare video
prompt = build_qwen2_5_video_prompt()
prompts = [prompt]
videos = [sampled_vids[0]]
# Run with vllm_runner context manager
with vllm_runner(
config["model_name"],
max_model_len=config["max_model_len"],
max_num_seqs=config["max_num_seqs"],
limit_mm_per_prompt=config["limit_mm_per_prompt"],
tensor_parallel_size=1,
video_pruning_rate=pruning_rate,
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
**config["runner_kwargs"],
) as vllm_model:
outputs = vllm_model.generate_greedy(
prompts,
config["sampling_params"]["max_tokens"],
videos=videos,
)
# Validate output
assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
output_ids, output_text = outputs[0]
assert len(output_ids) > 0, "Generated no output IDs"
assert len(output_text) > 0, "Generated empty text"
assert isinstance(output_text, str), (
f"Output is not string: {type(output_text)}"
)
# Main test function
@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
@pytest.mark.parametrize(
"mm_encoder_attn_backend",
[None] + current_platform.get_supported_vit_attn_backends(),
)
@pytest.mark.skip(reason="Broken test due to memory segmentation fault")
@create_new_process_for_each_test()
def test_vit_backend_functionality(
model_key: str,
mm_encoder_attn_backend: AttentionBackendEnum | None,
image_assets,
video_assets,
vllm_runner,
request,
):
"""Test ViT attention backend functionality for multimodal models.
This test validates that each model can successfully generate outputs
using different ViT attention backends. The test:
1. Filters unsupported backends per model
2. Applies appropriate GPU marks
3. Routes to the correct test handler based on interface
4. Validates output meets minimum requirements
"""
config = MODEL_CONFIGS[model_key]
# Step 1: Backend filtering
if (
"supported_backends" in config
and mm_encoder_attn_backend is not None
and mm_encoder_attn_backend not in config["supported_backends"]
):
pytest.skip(
f"{model_key} does not support {mm_encoder_attn_backend} backend now."
)
# Step 2: Apply GPU marks dynamically
if "gpu_marks" in config:
for mark in config["gpu_marks"]:
request.applymarker(mark)
# Step 3: Route to appropriate handler
if config.get("media_type") == "video":
run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
elif config["interface"] == "llm_chat":
run_llm_chat_test(config, mm_encoder_attn_backend, image_assets)
elif config["interface"] == "llm_generate":
run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
else:
raise ValueError(f"Unknown interface: {config['interface']}")

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import pytest_asyncio
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage
from vllm.tokenizers.mistral import MistralTokenizer
from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
"mistral",
"--config_format",
"mistral",
"--load_format",
"mistral",
]
@pytest.fixture()
def server(request, audio_assets: AudioTestAssets):
args = [
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": len(audio_assets)}),
] + MISTRAL_FORMAT_ARGS
with RemoteOpenAIServer(
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
def _get_prompt(audio_assets, question):
tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
audios = [
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
for i in range(len(audio_assets))
]
audio_chunks = [
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
]
text_chunk = TextChunk(text=question)
messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
return tokenizer.apply_chat_template(messages=messages)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(
vllm_runner,
audio_assets: AudioTestAssets,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tokenizer_mode="mistral",
)
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
"""Exercises online serving with/without chunked prefill enabled."""
def asset_to_chunk(asset):
audio = Audio.from_file(str(asset.get_local_path()), strict=False)
audio.format = "wav"
audio_dict = AudioChunk.from_audio(audio).to_openai()
return audio_dict
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
text = f"What's happening in these {len(audio_assets)} audio clips?"
messages = [
{
"role": "user",
"content": [*audio_chunks, {"type": "text", "text": text}],
}
]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME, messages=messages, max_tokens=10
)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"

View File

@@ -0,0 +1,178 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import Any
import librosa
import pytest
from transformers import AutoModelForSpeechSeq2Seq
from vllm.assets.audio import AudioAsset
from vllm.platforms import current_platform
from ....conftest import HfRunner, PromptAudioInput, VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
HF_PROMPT = ""
# Whisper expects 16kHz audio
WHISPER_SAMPLE_RATE = 16000
@pytest.fixture(autouse=True)
def use_spawn_for_whisper(monkeypatch):
"""Whisper has issues with forked workers, use spawn instead."""
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
model: str,
*,
max_model_len: int,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
enforce_eager: bool = True,
) -> None:
"""Inference result should be the same between hf and vllm.
All the audio fixtures for the test are from AudioAsset.
For huggingface runner, we provide the audio as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
"""
with vllm_runner(
model,
dtype=dtype,
max_model_len=max_model_len,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
limit_mm_per_prompt={"audio": 2},
enforce_eager=enforce_eager,
disable_custom_all_reduce=True,
) as vllm_model:
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(
vllm_prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
)
for vllm_prompts, _, audios in inputs
]
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(
hf_prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
)
for _, hf_prompts, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.fixture
def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
inputs = []
for asset in audio_assets:
audio, orig_sr = asset.audio_and_sample_rate
# Resample to Whisper's expected sample rate (16kHz)
if orig_sr != WHISPER_SAMPLE_RATE:
audio = librosa.resample(
audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
)
# vLLM prompts, HF prompts, audio inputs
inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
return inputs
def check_model_available(model: str) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
@pytest.mark.core_model
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("enforce_eager", [True, False])
@create_new_process_for_each_test("spawn")
def test_models(
hf_runner,
vllm_runner,
model: str,
dtype: str,
num_logprobs: int,
input_audios,
enforce_eager: bool,
) -> None:
check_model_available(model)
if current_platform.is_cpu() and not enforce_eager:
pytest.skip("Skipping test for CPU with non-eager mode")
run_test(
hf_runner,
vllm_runner,
input_audios,
model,
dtype=dtype,
max_model_len=448,
max_tokens=200,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
enforce_eager=enforce_eager,
)
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models_distributed(
hf_runner,
vllm_runner,
model: str,
distributed_executor_backend: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
input_audios,
) -> None:
check_model_available(model)
run_test(
hf_runner,
vllm_runner,
input_audios,
model,
dtype=dtype,
max_model_len=448,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=False,
)

View File

@@ -0,0 +1,347 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Helpers for building inputs that can be leveraged for different test types."""
from collections.abc import Callable, Iterable
from pathlib import PosixPath
from typing import Any
import numpy.typing as npt
import torch
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
from .types import (
SINGLE_AUDIO_BASE_PROMPT,
SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER,
TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER,
VIDEO_BASE_PROMPT,
ImageSizeWrapper,
PromptWithMultiModalInput,
SizeType,
VLMTestInfo,
)
def replace_test_placeholder(
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
) -> str:
"""Given a prompt, replaces each test placeholder with the
model-specific tag.
"""
prompt_segments = prompt.split(test_placeholder)
img_prompt = prompt_segments[0]
for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1):
img_prompt += mm_idx_to_prompt(placeholder_idx)
img_prompt += next_seg
return img_prompt
def get_model_prompts(
base_prompts: Iterable[str],
img_idx_to_prompt: Callable[[int], str] | None,
video_idx_to_prompt: Callable[[int], str] | None,
audio_idx_to_prompt: Callable[[int], str] | None,
prompt_formatter: Callable[[str], str],
) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
Example for phi3v, given the base_prompt: "<image>What is the season?"
1. Replace img placeholder(s)
-> "<|image_1|>\nWhat is the season?"
2. Apply prompt formatter:
-> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
"""
assert isinstance(base_prompts, (list, tuple))
model_prompts = []
for base_prompt in base_prompts:
# Replace the multimodal placeholders in the base prompt with
# the correct ones for the model that we are testing
if img_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
)
if video_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
)
if audio_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
model_prompt = prompt_formatter(base_prompt)
model_prompts.append(model_prompt)
return model_prompts
def build_single_image_inputs_from_test_info(
test_info: VLMTestInfo,
image_assets: ImageTestAssets,
size_wrapper: ImageSizeWrapper,
tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build single image inputs")
model_prompts = get_model_prompts(
test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
# For models that require a local path / URL encoded in the image; export
# assets and encode into tmp_path for this test. This should be avoided
# where possible (currently needed for Qwen-VL).
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
raise ValueError("Prompt path encoder requires setting local path")
model_prompts = [
test_info.prompt_path_encoder(tmp_path, prompt, [asset])
for prompt, asset in zip(model_prompts, image_assets)
]
images = [asset.pil_image for asset in image_assets]
assert len(images) == len(model_prompts)
return build_single_image_inputs(images, model_prompts, size_wrapper)
def build_single_image_inputs(
images, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
# For every image / prompt pair, get a pair containing two lists of
# length size_factors, where the first contains duplicates of the model
# prompt [str], and the second contains copies of the image after being
# scaled by one of the size factors.
#
# NOTE: rescaling preserves the image aspect ratio.
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
image_data=[
apply_image_size_scaling(image, size, size_wrapper.type)
for size in size_wrapper.data
],
)
for image, prompt in zip(images, model_prompts)
]
def build_multi_image_inputs_from_test_info(
test_info: VLMTestInfo,
image_assets: ImageTestAssets,
size_wrapper: ImageSizeWrapper,
tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build multi image inputs")
model_prompts = get_model_prompts(
[test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
raise ValueError("Prompt path encoder requires setting local path")
model_prompts = [
test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
for model_prompt in model_prompts
]
images = [asset.pil_image for asset in image_assets]
# Currently, we only have one multi-image list & one multi-image prompt
return build_multi_image_inputs(
image_lists=[images],
model_prompts=model_prompts,
size_wrapper=size_wrapper,
)
def build_multi_image_inputs(
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
image_data=[
[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
]
for size in size_wrapper.data
],
)
for images, prompt in zip(image_lists, model_prompts)
]
def build_embedding_inputs_from_test_info(
test_info: VLMTestInfo,
image_assets: ImageTestAssets,
size_wrapper: ImageSizeWrapper,
):
# These conditions will always be true if invoked through filtering,
# but we still check them in case this is ever called directly
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
factor == 1.0 for factor in size_wrapper.data
):
raise ValueError("Embedding tests require constant (1.0) size factors")
if test_info.convert_assets_to_embeddings is None:
raise ValueError("No conversion func for getting embeddings found")
model_prompts = get_model_prompts(
SINGLE_IMAGE_BASE_PROMPTS,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
images = [asset.pil_image for asset in image_assets]
embeds = test_info.convert_assets_to_embeddings(image_assets)
if test_info.dtype != "auto":
dtype = getattr(torch, test_info.dtype) # type: ignore
embeds = [e.to(dtype=dtype) for e in embeds]
assert len(images) == len(model_prompts)
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
return inputs, vllm_embeddings
def build_video_inputs_from_test_info(
test_info: VLMTestInfo,
video_assets: VideoTestAssets,
size_wrapper: ImageSizeWrapper,
num_frames: int,
needs_video_metadata: bool,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build video inputs")
model_prompts = get_model_prompts(
[VIDEO_BASE_PROMPT],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
sampled_vids = [
sample_frames_with_video_metadata(
(asset.np_ndarrays, asset.metadata),
num_frames,
)
for asset in video_assets
]
video_scaler = (
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
)
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
video_data=[
(
video_scaler(video, size)
if not needs_video_metadata
else (video_scaler(video, size), meta)
)
for size in size_wrapper.data
],
)
for (video, meta), prompt in zip(sampled_vids, model_prompts)
]
def sample_frames_with_video_metadata(
video_with_meta: tuple[npt.NDArray, dict[str, Any]],
num_frames: int,
) -> tuple[npt.NDArray, dict[str, Any]]:
video, meta = video_with_meta
video = sample_frames_from_video(video, num_frames)
meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
meta["total_num_frames"] = num_frames
meta["fps"] = meta["duration"] / num_frames
meta["frames_indices"] = list(range(num_frames))
return video, meta
def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
# are considering size factors at constant scale, i.e., we just clone
# the tensor
if isinstance(image, torch.Tensor):
assert size_type == SizeType.SIZE_FACTOR and size == 1
return image
if size_type == SizeType.SIZE_FACTOR:
# We have a list of image size factors
return rescale_image_size(image, size)
elif size_type == SizeType.FIXED_SIZE:
# We have a list of fixed sizes
return image.resize(size)
raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
def build_audio_inputs_from_test_info(
test_info: VLMTestInfo,
audio_assets: AudioTestAssets,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build audio inputs")
model_prompts = get_model_prompts(
SINGLE_AUDIO_BASE_PROMPT,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
resampler = AudioResampler(
target_sr=16000,
method="librosa",
)
audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [
(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
)
for audio, sr in audios
]
return [
PromptWithMultiModalInput(
prompts=model_prompts,
audio_data=resampled_audios,
)
]

View File

@@ -0,0 +1,183 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utils for determining which subset of model tests belong to a specific
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import itertools
from collections import OrderedDict
from collections.abc import Iterable
import pytest
from .types import (
EMBEDDING_SIZE_FACTORS,
ExpandableVLMTestArgs,
ImageSizeWrapper,
SizeType,
VLMTestInfo,
VLMTestType,
)
def get_filtered_test_settings(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
new_proc_per_test: bool,
) -> dict[str, VLMTestInfo]:
"""Given the dict of potential test settings to run, return a subdict
of tests who have the current test type enabled with the matching val for
fork_per_test.
"""
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
return test_info.test_type == test_type or (
isinstance(test_info.test_type, Iterable)
and test_type in test_info.test_type
)
matching_tests = {}
for test_name, test_info in test_settings.items():
# Otherwise check if the test has the right type & keep if it does
if matches_test_type(test_info, test_type):
# Embedding tests need to have a conversion func in their test info
if matches_test_type(test_info, VLMTestType.EMBEDDING):
assert test_info.convert_assets_to_embeddings is not None
# Custom test inputs need to explicitly define the mm limit/inputs
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
assert test_info.custom_test_opts is not None and isinstance(
test_info.custom_test_opts, Iterable
)
# For all types besides custom inputs, we need a prompt formatter
else:
assert test_info.prompt_formatter is not None
# Everything looks okay; keep if this is correct proc handling
if (
test_info.distributed_executor_backend is not None
) == new_proc_per_test:
matching_tests[test_name] = test_info
return matching_tests
def get_model_type_cases(
model_type: str,
test_info: VLMTestInfo,
test_type: VLMTestType,
):
# Ensure that something is wrapped as an iterable it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides for on
# a per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs = OrderedDict(
[
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
(
"distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend),
),
]
)
# num_frames is video only
if test_type == VLMTestType.VIDEO:
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
iter_kwargs["needs_video_metadata"] = ensure_wrapped(
test_info.needs_video_metadata
)
# No sizes passed for custom inputs, since inputs are directly provided
if test_type not in (
VLMTestType.CUSTOM_INPUTS,
VLMTestType.AUDIO,
):
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None:
raise ValueError(f"Sizes must be set for test type {test_type}")
iter_kwargs["size_wrapper"] = wrapped_sizes
# Otherwise expand the custom test options instead
elif test_type == VLMTestType.CUSTOM_INPUTS:
if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
# Wrap all model cases in a pytest parameter & pass marks through
return [
pytest.param(
model_type,
ExpandableVLMTestArgs(**{k: v for k, v in zip(iter_kwargs.keys(), case)}),
marks=test_info.marks if test_info.marks is not None else [],
)
for case in list(itertools.product(*iter_kwargs.values()))
]
def get_parametrized_options(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool,
):
"""Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like
size factors etc, while still running in isolated test cases.
"""
matching_tests = get_filtered_test_settings(
test_settings, test_type, create_new_process_for_each_test
)
# Get a list per model type, where each entry contains a tuple of all of
# that model type's cases, then flatten them into the top level so that
# we can consume them in one mark.parametrize call.
cases_by_model_type = [
get_model_type_cases(model_type, test_info, test_type)
for model_type, test_info in matching_tests.items()
]
return list(itertools.chain(*cases_by_model_type))
def get_wrapped_test_sizes(
test_info: VLMTestInfo, test_type: VLMTestType
) -> tuple[ImageSizeWrapper, ...]:
"""Given a test info which may have size factors or fixed sizes, wrap them
and combine them into an iterable, each of which will be used in parameter
expansion.
Args:
test_info: Test configuration to be expanded.
test_type: The type of test being filtered for.
"""
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
if test_type == VLMTestType.EMBEDDING:
return tuple(
[
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
]
)
# Audio and Custom inputs have preprocessed inputs
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
return tuple()
size_factors = test_info.image_size_factors if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
wrapped_factors = [
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in size_factors
]
wrapped_sizes = [
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
]
return tuple(wrapped_factors + wrapped_sizes)

View File

@@ -0,0 +1,189 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Core test implementation to be shared across modalities."""
from collections.abc import Callable
from typing import Any
import torch
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
from vllm.tokenizers import TokenizerLike
from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS
from .types import PromptWithMultiModalInput, RunnerOutput
def run_test(
*,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: list[PromptWithMultiModalInput],
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
enforce_eager: bool,
max_model_len: int,
max_num_seqs: int,
hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
auto_cls: type[_BaseAutoModelClass],
use_tokenizer_eos: bool,
comparator: Callable[..., None],
get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None,
stop_str: list[str] | None,
limit_mm_per_prompt: dict[str, int],
vllm_runner_kwargs: dict[str, Any] | None,
hf_model_kwargs: dict[str, Any] | None,
patch_hf_runner: Callable[[HfRunner], HfRunner] | None,
runner: RunnerOption = "auto",
distributed_executor_backend: str | None = None,
tensor_parallel_size: int = 1,
vllm_embeddings: torch.Tensor | None = None,
):
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
limit_mm_per_prompt = default_limits | limit_mm_per_prompt
vllm_outputs_per_mm = []
hf_outputs_per_mm = []
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0}
if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
if model_info.tokenizer_mode:
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if model_info.require_embed_inputs:
for k in ("skip_tokenizer_init", "enable_prompt_embeds", "enable_mm_embeds"):
vllm_runner_kwargs_[k] = model_info.require_embed_inputs
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(
model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_,
) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
for prompts, image_data, video_data, audio_data in vllm_inputs:
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
vllm_output = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
**vllm_kwargs_with_mm_data,
)
vllm_outputs_per_mm.append(vllm_output)
hf_model = hf_runner(
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
)
# Some models need to patch things like the model processor, e.g., internvl
if patch_hf_runner is not None:
hf_model = patch_hf_runner(hf_model)
with hf_model, torch.no_grad():
tokenizer = hf_model.tokenizer
# Some models need to explicitly pass the eos_token_id off the tokenizer
# or processor for a good comparison;
# currently assume processor/tokenizer agree on the EOS, and pull it off
# the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
if stop_str:
hf_kwargs["stop_strings"] = stop_str
for prompts, image_data, video_data, audio_data in inputs:
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
hf_kwargs_with_mm_data = hf_kwargs | mm_data
hf_output = hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
tokenizer=tokenizer,
**hf_kwargs_with_mm_data,
)
hf_outputs_per_mm.append(hf_output)
# Apply output processing / sanitation to the vLLM and HF runner results
hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
model,
first_runner_outputs=hf_outputs_per_mm,
second_runner_outputs=vllm_outputs_per_mm,
first_runner_processor=hf_output_post_proc,
second_runner_processor=vllm_output_post_proc,
)
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
# This is usually check_logprobs_close, but it's passed through to
# allow things like check_outputs_equal where needed
comparator(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
def process_runner_outputs(
model,
first_runner_outputs,
second_runner_outputs,
first_runner_processor=None,
second_runner_processor=None,
):
"""Applies the runner processor(s) to the runner outputs, if any."""
if first_runner_processor is not None:
first_runner_outputs = process_outputs(
first_runner_processor, model, first_runner_outputs
)
if second_runner_processor is not None:
second_runner_outputs = process_outputs(
second_runner_processor, model, second_runner_outputs
)
return first_runner_outputs, second_runner_outputs
def process_outputs(output_processor, model, outputs_per_image):
"""Applies a model specific post-processor function to a runner's output"""
return [
[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image
]

View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
from collections.abc import Callable
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs
from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
stop_sign = IMAGE_ASSETS[0].pil_image
cherry_blossom = IMAGE_ASSETS[1].pil_image
# Apply the selected formatter to the base prompts
img_prompts = [
"<image><image>\nDescribe 2 images.",
"<image><image>\nDescribe 2 images.",
"<image><image><image><image>\nDescribe 4 images.",
"<image>\nWhat is the season?",
]
formatted_prompts = [formatter(prompt) for prompt in img_prompts]
aspect_ratio_images = [
[stop_sign, cherry_blossom],
# Images with different sizes and aspect-ratios
[
rescale_image_size(stop_sign, 0.1),
stop_sign,
],
[
stop_sign,
rescale_image_size(stop_sign, 0.25),
cherry_blossom.resize((183, 488)),
cherry_blossom.resize((488, 183)),
],
cherry_blossom,
]
return [
PromptWithMultiModalInput(
prompts=formatted_prompts,
image_data=aspect_ratio_images,
)
]
def multi_video_multi_aspect_ratio_inputs(
formatter: Callable[[str], str], num_frames: int = 16
):
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
# Apply the selected formatter to the base prompts
video_prompts = [
"<video><video>\nDescribe 2 videos.",
"<video><video>\nDescribe 2 videos.",
"<video><video><video><video>\nDescribe 4 videos.",
"<video>\nWhy is this video funny?",
]
formatted_prompts = [formatter(prompt) for prompt in video_prompts]
aspect_ratio_videos = [
[video, video],
# Videos with different sizes and aspect-ratios
[
rescale_video_size(video, 0.1),
video,
],
[
video,
rescale_video_size(video, 0.25),
resize_video(video, (183, 488)),
resize_video(video, (488, 183)),
],
video,
]
return [
PromptWithMultiModalInput(
prompts=formatted_prompts,
video_data=aspect_ratio_videos,
)
]
def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = (
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
)
single_img_prompts = [
"<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?",
]
multi_img_prompts = [
"Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n", # noqa: E501
]
formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
return [
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
]
def windows_attention_image_qwen2_5_vl():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
image = ImageAsset("hato").pil_image
question = "Describe the image."
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
prompt = (
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
)
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
return build_single_image_inputs([image], [prompt], wrapped_sf)
def video_with_metadata_glm4_1v():
video_array = VIDEO_ASSETS[0].np_ndarrays
metadata = VIDEO_ASSETS[0].metadata
question = "Describe the video."
video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
scales = [0.1, 0.2, 0.25]
video_input = [
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
]
prompts = [formatted_prompt] * len(video_input)
return [
PromptWithMultiModalInput(
prompts=prompts,
video_data=video_input,
)
]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from pathlib import PosixPath
from .....conftest import (
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_single_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_multi_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_multi_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": len(image_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_embedding_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper
)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
vllm_embeddings=vllm_embeddings,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_video_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
video_assets: VideoTestAssets,
):
assert test_case.size_wrapper is not None
assert test_case.num_video_frames is not None
inputs = builders.build_video_inputs_from_test_info(
model_test_info,
video_assets,
test_case.size_wrapper,
test_case.num_video_frames,
test_case.needs_video_metadata,
)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"video": len(video_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_audio_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"audio": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_custom_inputs_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
):
# Custom test cases can provide inputs directly, but they need to
# explicitly provided a CustomTestConfig, which wraps the inputs and
# the limit_mm_per_prompt
assert test_case.custom_test_opts is not None
inputs = test_case.custom_test_opts.inputs
limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
# Inputs and limit_mm_per_prompt should all be set
assert inputs is not None
assert limit_mm_per_prompt is not None
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt=limit_mm_per_prompt,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)

View File

@@ -0,0 +1,218 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Types for writing multimodal model tests."""
from collections.abc import Callable, Iterable
from enum import Enum
from pathlib import PosixPath
from typing import Any, NamedTuple
import torch
from pytest import MarkDecorator
from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
from vllm.logprobs import SampleLogprobs
from vllm.tokenizers import TokenizerLike
from .....conftest import (
AUDIO_ASSETS,
IMAGE_ASSETS,
HfRunner,
ImageAsset,
ImageTestAssets,
PromptAudioInput,
PromptImageInput,
PromptVideoInput,
)
from ....utils import check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model
TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
}
)
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
{
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
}
)
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
class PromptWithMultiModalInput(NamedTuple):
"""Holds the multimodal input for a single test case."""
prompts: list[str]
image_data: PromptImageInput | None = None
video_data: PromptVideoInput | None = None
audio_data: PromptAudioInput | None = None
class VLMTestType(Enum):
IMAGE = 1
MULTI_IMAGE = 2
EMBEDDING = 3
VIDEO = 4
AUDIO = 5
CUSTOM_INPUTS = 6
class SizeType(Enum):
SIZE_FACTOR = 1
FIXED_SIZE = 2
class CustomTestOptions(NamedTuple):
inputs: list[PromptWithMultiModalInput]
limit_mm_per_prompt: dict[str, int]
class ImageSizeWrapper(NamedTuple):
type: SizeType
# A size factor is a wrapper of 0+ floats,
# while a fixed size contains an iterable of integer pairs
data: Iterable[float] | Iterable[tuple[int, int]]
class VLMTestInfo(NamedTuple):
"""Holds the configuration for 1+ tests for one model architecture."""
models: list[str]
test_type: VLMTestType | Iterable[VLMTestType]
# Should be None only if this is a CUSTOM_INPUTS test
prompt_formatter: Callable[[str], str] | None = None
img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"
# Most models work on the single / multi-image prompts above, but in some
# cases the log prob check fails, e.g., for paligemma. We allow passing
# an override for the single image prompts / multi-image prompt for this
# reason.
single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT
# Function for converting ImageAssets to image embeddings;
# We need to define this explicitly for embedding tests
convert_assets_to_embeddings: (
Callable[[ImageTestAssets], list[torch.Tensor]] | None
) = None
# Exposed options for vLLM runner; we change these in a several tests,
# but the defaults are derived from VllmRunner & the engine defaults
# These settings are chosen to avoid OOMs when running in the CI
enforce_eager: bool = True
max_model_len: int = 1024
max_num_seqs: int = 256
runner: RunnerOption = "auto"
tensor_parallel_size: int = 1
vllm_runner_kwargs: dict[str, Any] | None = None
# Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None = None
# Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer
stop_str: list[str] | None = None
# Exposed options for HF runner
hf_model_kwargs: dict[str, Any] | None = None
# Indicates we should explicitly pass the EOS from the tokenizer
use_tokenizer_eos: bool = False
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
patch_hf_runner: Callable[[HfRunner], HfRunner] | None = None
# Post processors that if defined, will run oun the outputs of the
# vLLM and HF runner, respectively (useful for sanitization, etc).
vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
# Consumes the output of the callables above and checks if they're equal
comparator: Callable[..., None] = check_logprobs_close
# Default expandable params per test; these defaults can be overridden in
# instances of this object; the complete set of test cases for the model
# is all combinations of .models + all fields below
max_tokens: int = 128
num_logprobs: int = 5
dtype: str = "auto"
distributed_executor_backend: str | None = None
# Only expanded in video tests
num_video_frames: int | tuple[int] = 16
needs_video_metadata: bool = False
# Fixed image sizes / image size factors; most tests use image_size_factors
# The values provided for these two fields will be stacked and expanded
# such that each model will consider each image size factor / image size
# once per tests (much like concatenating and wrapping in one parametrize
# call)
image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
image_sizes: Iterable[Iterable[tuple[int, int]]] | None = None
# Hack for updating a prompt to take into a local path; currently only used
# for Qwen-VL, which requires encoding the image path / url into the prompt
# for HF runner
prompt_path_encoder: (
Callable[[PosixPath, str, list[ImageAsset] | ImageTestAssets], str] | None
) = None # noqa: E501
# Allows configuring a test to run with custom inputs
custom_test_opts: list[CustomTestOptions] | None = None
marks: list[MarkDecorator] | None = None
def get_non_parametrized_runner_kwargs(self):
"""Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized
test cases.
"""
return {
"enforce_eager": self.enforce_eager,
"max_model_len": self.max_model_len,
"max_num_seqs": self.max_num_seqs,
"runner": self.runner,
"tensor_parallel_size": self.tensor_parallel_size,
"vllm_runner_kwargs": self.vllm_runner_kwargs,
"hf_output_post_proc": self.hf_output_post_proc,
"vllm_output_post_proc": self.vllm_output_post_proc,
"auto_cls": self.auto_cls,
"use_tokenizer_eos": self.use_tokenizer_eos,
"comparator": self.comparator,
"get_stop_token_ids": self.get_stop_token_ids,
"hf_model_kwargs": self.hf_model_kwargs,
"stop_str": self.stop_str,
"patch_hf_runner": self.patch_hf_runner,
}
class ExpandableVLMTestArgs(NamedTuple):
"""The expanded kwargs which correspond to a single test case."""
model: str
max_tokens: int
num_logprobs: int
dtype: str
distributed_executor_backend: str | None
# Sizes are used for everything except for custom input tests
size_wrapper: ImageSizeWrapper | None = None
# Video only
num_video_frames: int | None = None
needs_video_metadata: bool = False
# Custom inputs only
custom_test_opts: CustomTestOptions | None = None

View File

@@ -0,0 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import os
import warnings
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
if not current_platform.is_rocm():
return
siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
if siglip_tests:
os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
warnings.warn(
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
UserWarning,
stacklevel=1,
)

View File

@@ -0,0 +1,139 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import CLIPModel
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_embeddings_close
HF_TEXT_PROMPTS = [
"a photo of a stop sign",
"a photo of a cherry blossom",
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "",
"cherry_blossom": "",
}
)
MODELS = ["openai/clip-vit-base-patch32"]
def _run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
input_texts: list[str],
input_images: PromptImageInput,
model: str,
*,
dtype: str,
) -> None:
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
with hf_runner(model, dtype=dtype, auto_cls=CLIPModel) as hf_model:
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
all_outputs = []
for inputs in all_inputs:
inputs = hf_model.wrap_device(inputs)
if "pixel_values" in inputs:
pooled_output = hf_model.model.get_image_features(
pixel_values=inputs.pixel_values,
).squeeze(0)
else:
pooled_output = hf_model.model.get_text_features(
input_ids=inputs.input_ids,
attention_mask=inputs.attention_mask,
).squeeze(0)
all_outputs.append(pooled_output.tolist())
hf_outputs = all_outputs
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images, # type: ignore
model,
dtype=dtype,
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_image(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images,
model,
dtype=dtype,
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text_image_no_crash(
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
texts = [HF_TEXT_PROMPTS[0]]
images = [image_assets[0].pil_image]
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
) as vllm_model:
with pytest.raises(ValueError, match="not both"):
vllm_model.embed(texts, images=images)
# Should still be able to run subsequent requests
vllm_model.embed(texts)
vllm_model.embed([""], images=images)

View File

@@ -0,0 +1,215 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable
import pytest
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ...utils import check_embeddings_close
HF_TEXT_PROMPTS = [
# T -> X
(
"Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501,
Image.new("RGB", (56, 56)),
),
# T -> X
(
"Query: Retrieve an image of this caption: cherry blossom",
Image.new("RGB", (56, 56)),
),
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "What is shown in this image?",
"cherry_blossom": "What is shown in this image?",
}
)
MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
def get_messages(image: Image.Image, text: str, embed_text: bool):
# assert False, 'remember to use outer [] as required'
if embed_text:
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": Image.new("RGB", (56, 56)),
"resized_height": 1,
"resized_width": 1,
}, # need a dummy image here for an easier process.
{"type": "text", "text": text},
],
}
]
else:
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": text},
],
}
]
return messages
def apply_chat_template_and_add_eos(
messages: list[dict],
apply_chat_template_fn: Callable,
):
prompt = (
apply_chat_template_fn(messages, tokenize=False, add_generation_prompt=True)
+ "<|endoftext|>"
)
return prompt
def _run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
input_texts: list[str],
input_images: PromptImageInput,
embed_texts: list[bool],
model: str,
*,
dtype: str,
) -> None:
"""SET PYTHONPATH"""
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=8192
) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
texts = [
# this is necessary because vllm_model.embed will not apply any
# templating to the prompt, and therefore lacks an image_pad
# token unless one is inserted beforehand (the (28,28) image
# above is converted to an image pad token by the chat template).
apply_chat_template_and_add_eos(
get_messages(image, text, False),
apply_chat_template_fn=tokenizer.apply_chat_template,
)
for text, image in zip(input_texts, input_images)
# vllm will replace the pad token with the actual image,
# which may be a placeholder image, later.
]
vllm_outputs = vllm_model.embed(texts, images=input_images)
hf_outputs = []
with hf_runner(
model, dtype=dtype, auto_cls=Qwen2VLForConditionalGeneration
) as hf_model:
prompts = []
for text, image, embed_text in zip(input_texts, input_images, embed_texts):
# dse requires non-standard input processing
# because it needs an image_pad token
messages = get_messages(image, text, embed_text)
prompt = apply_chat_template_and_add_eos(
messages, hf_model.processor.apply_chat_template
)
prompts.append(prompt)
all_inputs = hf_model.get_inputs(
prompts=prompts,
images=input_images,
)
with torch.no_grad():
all_outputs = []
for inputs in all_inputs:
inputs = hf_model.model.prepare_inputs_for_generation(
**inputs,
cache_position=torch.arange(1), # 1 for batch size
use_cache=False,
)
outputs = hf_model.model(
**hf_model.wrap_device(inputs),
return_dict=True,
output_hidden_states=True,
)
pooled_output = F.normalize(
outputs.hidden_states[-1][0, -1], p=2, dim=-1
)
all_outputs.append(pooled_output.tolist())
hf_outputs = all_outputs
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_text(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [
(text, image_placeholder) for text, image_placeholder in HF_TEXT_PROMPTS
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
embed_texts = [True] * len(input_texts)
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images, # type: ignore
embed_texts,
model,
dtype=dtype,
)
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_image(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
embed_texts = [False] * len(input_texts)
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images,
embed_texts,
model,
dtype=dtype,
)

View File

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import ImageTestAssets
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
@torch.inference_mode()
def run_intern_vit_test(
image_assets: ImageTestAssets,
model_id: str,
*,
dtype: str,
):
model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
img_processor = CLIPImageProcessor.from_pretrained(model)
images = [asset.pil_image for asset in image_assets]
pixel_values = [
img_processor(images, return_tensors="pt").pixel_values.to(torch_dtype)
for images in images
]
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
if not getattr(config, "norm_type", None):
config.norm_type = "rms_norm"
hf_model = AutoModel.from_pretrained(
model, dtype=torch_dtype, trust_remote_code=True
).to("cuda")
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).last_hidden_state
for pixel_value in pixel_values
]
from vllm.model_executor.models.intern_vit import InternVisionModel
vllm_model = InternVisionModel(config)
vllm_model.load_weights(hf_model.state_dict().items())
del hf_model
cleanup_dist_env_and_memory()
vllm_model = vllm_model.to("cuda", torch_dtype)
vllm_outputs_per_image = [
vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
]
del vllm_model
cleanup_dist_env_and_memory()
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
@pytest.mark.parametrize(
"model_id",
[
"OpenGVLab/InternViT-300M-448px",
"OpenGVLab/InternViT-6B-448px-V1-5",
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
run_intern_vit_test(
image_assets,
model_id,
dtype=dtype,
)

View File

@@ -0,0 +1,194 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoModel
from vllm.entrypoints.chat_utils import ChatCompletionContentPartImageParam
from vllm.entrypoints.score_utils import ScoreMultiModalParam
from ....conftest import HfRunner, VllmRunner
model_name = "jinaai/jina-reranker-m0"
mm_processor_kwargs = {
"min_pixels": 3136,
"max_pixels": 602112,
}
limit_mm_per_prompt = {"image": 2}
def vllm_reranker(
vllm_runner: type[VllmRunner],
model_name: str,
dtype: str,
query_strs: list[str],
document_strs: list[str],
query_type: str = "text",
doc_type: str = "text",
):
def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
return {"type": "image_url", "image_url": {"url": f"{url}"}}
query: list[str] | ScoreMultiModalParam
if query_type == "text":
query = query_strs
elif query_type == "image":
query = ScoreMultiModalParam(
content=[create_image_param(url) for url in query_strs]
)
documents: list[str] | ScoreMultiModalParam
if doc_type == "text":
documents = document_strs
elif doc_type == "image":
documents = ScoreMultiModalParam(
content=[create_image_param(url) for url in document_strs]
)
with vllm_runner(
model_name,
runner="pooling",
dtype=dtype,
max_num_seqs=2,
max_model_len=2048,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
) as vllm_model:
outputs = vllm_model.llm.score(query, documents)
return [output.outputs.score for output in outputs]
def hf_reranker(
hf_runner: type[HfRunner],
model_name: str,
dtype: str,
query_strs: list[str],
document_strs: list[str],
query_type: str = "text",
doc_type: str = "text",
):
checkpoint_to_hf_mapper = {
"visual.": "model.visual.",
"model.": "model.language_model.",
}
data_pairs = [[query_strs[0], d] for d in document_strs]
with hf_runner(
model_name,
dtype=dtype,
trust_remote_code=True,
auto_cls=AutoModel,
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
) as hf_model:
return hf_model.model.compute_score(
data_pairs, max_length=2048, query_type=query_type, doc_type=doc_type
)
# Visual Documents Reranking
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
query = ["slm markdown"]
documents = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "image"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "image"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
# Textual Documents Reranking
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
query = ["slm markdown"]
documents = [
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "text"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
# Image Querying for Textual Documents
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
query = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
]
documents = [
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "image", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "text"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
# Image Querying for Image Documents
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
query = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
]
documents = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "image", "image"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "image"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)

View File

@@ -0,0 +1,159 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch.nn.functional as F
from transformers import AutoModelForImageTextToText
from vllm.platforms import current_platform
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ...utils import check_embeddings_close
# Llava Next embedding implementation is only supported by CUDA.
# If run on ROCm, hf_model.model.resize_token_embeddings will
# cause the following error:
# RuntimeError: Calling torch.linalg.cholesky on a CUDA tensor
# requires compiling PyTorch with MAGMA. Please use PyTorch
# built with MAGMA support.
# If run on CPU, hf_model.model.resize_token_embeddings will
# cause the following error:
# RuntimeError: Calling torch.linalg.cholesky on a CPU tensor
# requires compiling PyTorch with LAPACK. Please use PyTorch
# built with LAPACK support.
pytestmark = pytest.mark.skipif(
not current_platform.is_cuda(),
reason="Llava Next model uses op that is only supported in CUDA",
)
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
HF_TEXT_PROMPTS = [
# T -> X
llama3_template.format(
"The label of the object is stop sign\nSummary above sentence in one word: " # noqa: E501
),
# T -> X
llama3_template.format("cherry blossom\nSummary above sentence in one word: "),
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
# I -> X
"stop_sign": llama3_template.format(
"<image>\nSummary above image in one word: "
),
# I -> X
"cherry_blossom": llama3_template.format(
"<image>\nSummary above image in one word: "
),
}
)
MODELS = ["royokong/e5-v"]
def _run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
input_texts: list[str],
input_images: PromptImageInput,
model: str,
*,
dtype: str,
) -> None:
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(
model, runner="pooling", dtype=dtype, max_model_len=4096, enforce_eager=True
) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForImageTextToText
) as hf_model:
# Patch the issue where generation_config.json is missing
hf_model.processor.patch_size = hf_model.model.config.vision_config.patch_size
# Patch the issue where image_token_id
# exceeds the maximum allowed vocab size
hf_model.model.resize_token_embeddings(
hf_model.model.language_model.vocab_size + 1
)
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
all_outputs = []
for inputs in all_inputs:
# Based on: https://huggingface.co/royokong/e5-v
outputs = hf_model.model(
**hf_model.wrap_device(inputs),
return_dict=True,
output_hidden_states=True,
)
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :], dim=-1)
all_outputs.append(pooled_output.tolist())
hf_outputs = all_outputs
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images, # type: ignore
model,
dtype=dtype,
)
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images,
model,
dtype=dtype,
)

View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch.nn.functional as F
from PIL import Image
from vllm.assets.base import get_vllm_public_assets
from vllm.assets.image import VLM_IMAGES_DIR
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ...utils import check_embeddings_close
HF_TEXT_PROMPTS = [
# T -> X
"Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501
# T -> X
"Retrieve an image of this caption: cherry blossom",
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
# T + I -> X
"stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
# I -> X
"cherry_blossom": "<|image_1|> Represent the given image for classification", # noqa: E501
}
)
MODELS = ["TIGER-Lab/VLM2Vec-Full"]
def _run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
input_texts: list[str],
input_images: PromptImageInput,
model: str,
*,
dtype: str,
) -> None:
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True
) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs = {"_attn_implementation": "eager"}
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
all_outputs = []
for inputs in all_inputs:
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
outputs = hf_model.model(
**hf_model.wrap_device(inputs),
return_dict=True,
output_hidden_states=True,
)
last_hidden_state = outputs.hidden_states[-1][0]
reps = last_hidden_state[inputs.attention_mask[0].sum() - 1]
pooled_output = F.normalize(reps, p=2, dim=-1)
all_outputs.append(pooled_output.tolist())
hf_outputs = all_outputs
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images, # type: ignore
model,
dtype=dtype,
)
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
# add cases for special_tokens
input_texts_images.append(
(
"\n<s><|user|>\n <|image_1|>\n\t <s>"
"Represent the given image for classification<|end|>"
"\n<|assistant|>\n",
Image.open(
get_vllm_public_assets(
filename="cherry_blossom.jpg", s3_prefix=VLM_IMAGES_DIR
)
),
)
)
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images,
model,
dtype=dtype,
)

View File

@@ -0,0 +1,60 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from ....conftest import VllmRunner
def generate_test_mm_data():
mm_data = {
"pixel_values": torch.full((6, 512, 512), 1.0, dtype=torch.float16),
"location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
}
return mm_data
def _run_test(
vllm_runner: type[VllmRunner],
model: str,
) -> None:
prompt = [
{
# This model deals with no text input
"prompt_token_ids": [1],
"multi_modal_data": generate_test_mm_data(),
}
for _ in range(10)
]
with vllm_runner(
model,
runner="pooling",
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
enable_mm_embeds=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
default_torch_num_threads=1,
) as vllm_model:
vllm_model.llm.encode(prompt, pooling_task="plugin")
MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
def test_models_image(
hf_runner,
vllm_runner,
image_assets,
model: str,
) -> None:
_run_test(
vllm_runner,
model,
)

View File

@@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.models.radio import RadioModel
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import ImageTestAssets
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
@torch.inference_mode()
def run_radio_test(
image_assets: ImageTestAssets,
model_id: str,
*,
dtype: str,
):
model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
img_processor = CLIPImageProcessor.from_pretrained(model)
images = [asset.pil_image for asset in image_assets]
# Input resolution must be a multiple of `self.min_resolution_step`.
# Using `self.get_nearest_supported_resolution`, for assets 432x642 the
# nearest supported resolution is 432x640.
pixel_values = [
img_processor(image, return_tensors="pt").pixel_values.to(torch_dtype)[
:, :, :, :640
]
for image in images
]
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
# RADIO model on HF does not properly handle torch_dtype argument
# And relies on args["dtype"] which we have to patch manually:
config.args["dtype"] = torch_dtype
hf_model = AutoModel.from_pretrained(
model_id,
config=config,
dtype=torch_dtype,
trust_remote_code=True,
).to("cuda")
hf_model.eval()
# A HF model has image normalization as a part of model's forward
# However in vLLM we don't make normalization a part of the model
# forward step since mean/std stored as model's parameters and
# subject to precision loss (when using fp16/bf16) which negatively
# affects evaluation benchmarks.
hf_model.make_preprocessor_external()
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
]
radio_config = RadioConfig(
model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
)
vllm_model = RadioModel(radio_config)
vllm_model.load_weights(hf_model.state_dict())
vllm_model = vllm_model.to("cuda", torch_dtype)
vllm_outputs_per_image = [
vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
]
del vllm_model, hf_model
cleanup_dist_env_and_memory()
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
@pytest.mark.parametrize(
"model_id",
[
"nvidia/C-RADIOv2-H",
],
)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
run_radio_test(
image_assets,
model_id,
dtype=dtype,
)

View File

@@ -0,0 +1,162 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
from transformers import SiglipModel
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_embeddings_close
HF_TEXT_PROMPTS = [
"a photo of a stop sign",
"a photo of a cherry blossom",
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "",
"cherry_blossom": "",
}
)
MODELS = [
"google/siglip-base-patch16-224",
"google/siglip2-base-patch16-224",
# Different image embedding dim than text_config.hidden_size
"google/siglip2-giant-opt-patch16-384",
]
def _run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
input_texts: list[str],
input_images: PromptImageInput,
model: str,
*,
dtype: str,
tokenization_kwargs: dict[str, Any] | None = None,
) -> None:
if tokenization_kwargs is None:
tokenization_kwargs = {}
with vllm_runner(
model,
runner="pooling",
dtype=dtype,
enforce_eager=True,
max_model_len=64,
gpu_memory_utilization=0.7,
) as vllm_model:
vllm_outputs = vllm_model.embed(
input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
)
with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model:
all_inputs = hf_model.get_inputs(
input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
)
all_outputs = []
for inputs in all_inputs:
inputs = hf_model.wrap_device(inputs)
if "pixel_values" in inputs:
pooled_output = hf_model.model.get_image_features(
pixel_values=inputs.pixel_values,
).squeeze(0)
else:
pooled_output = hf_model.model.get_text_features(
input_ids=inputs.input_ids,
).squeeze(0)
all_outputs.append(pooled_output.tolist())
hf_outputs = all_outputs
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images, # type: ignore
model,
dtype=dtype,
tokenization_kwargs={
"padding": "max_length",
"max_length": 64,
}, # siglip2 was trained with this padding setting.
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_image(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images,
model,
dtype=dtype,
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text_image_no_crash(
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
texts = [HF_TEXT_PROMPTS[0]]
images = [image_assets[0].pil_image]
with vllm_runner(
model,
runner="pooling",
dtype=dtype,
enforce_eager=True,
max_model_len=64,
gpu_memory_utilization=0.7,
) as vllm_model:
with pytest.raises(ValueError, match="not both"):
vllm_model.embed(texts, images=images)
vllm_model.embed(texts)
vllm_model.embed([""], images=images)

View File

@@ -0,0 +1,125 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest.mock import MagicMock
import numpy as np
import pytest
import torch
from transformers import PretrainedConfig
from tests.models.registry import HF_EXAMPLE_MODELS
class MockAudioFlamingo3Config(PretrainedConfig):
model_type = "audioflamingo3"
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.audio_config = PretrainedConfig()
self.text_config = PretrainedConfig()
class MockAudioFlamingo3Processor:
def __init__(self):
self.audio_token = "<sound>"
self.audio_token_id = 12345
self.feature_extractor = MockFeatureExtractor()
def __call__(self, text=None, audios=None, **kwargs):
return {"input_ids": [1, 2, 3], "input_features": [np.zeros((3000, 80))]}
class MockFeatureExtractor:
def __init__(self):
self.sampling_rate = 16000
self.chunk_length = 30
@pytest.fixture
def mock_ctx():
config = MockAudioFlamingo3Config()
ctx = MagicMock()
ctx.get_hf_config.return_value = config
ctx.get_hf_processor.return_value = MockAudioFlamingo3Processor()
ctx.model_config.hf_config = config
return ctx
@pytest.fixture(autouse=True)
def check_transformers_version():
# Check if the model is supported by the current transformers version
model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
model_info.check_transformers_version(on_fail="skip")
def test_audio_chunk_counting(mock_ctx):
from vllm.model_executor.models.audioflamingo3 import (
AudioFlamingo3DummyInputsBuilder,
AudioFlamingo3MultiModalProcessor,
AudioFlamingo3ProcessingInfo,
)
info = AudioFlamingo3ProcessingInfo(mock_ctx)
processor = AudioFlamingo3MultiModalProcessor(
info, AudioFlamingo3DummyInputsBuilder(info)
)
sr = 16000
audio_1 = np.zeros(30 * sr)
audio_2 = np.zeros(45 * sr)
mm_data = {"audio": [audio_1, audio_2]}
prompt = "<|user|>Listen.<|end|>"
from vllm.multimodal.processing import BaseMultiModalProcessor
def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
return {"input_ids": [1, 2, 3], "input_features": torch.randn(1, 80, 3000)}
with pytest.MonkeyPatch.context() as mp:
mp.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)
processed = processor._call_hf_processor(prompt, mm_data, {}, {})
chunk_counts = processed["chunk_counts"]
assert chunk_counts[0].item() == 1
assert chunk_counts[1].item() == 2
assert len(chunk_counts) == 2
def test_dummy_data_generation(mock_ctx):
from vllm.model_executor.models.audioflamingo3 import (
AudioFlamingo3DummyInputsBuilder,
AudioFlamingo3ProcessingInfo,
)
info = AudioFlamingo3ProcessingInfo(mock_ctx)
builder = AudioFlamingo3DummyInputsBuilder(info)
mm_counts = {"audio": 2}
dummy_data = builder.get_dummy_mm_data(100, mm_counts, None)
assert "audio" in dummy_data
assert len(dummy_data["audio"]) == 2
expected_len = 600 * 16000
assert len(dummy_data["audio"][0]) == expected_len

View File

@@ -0,0 +1,418 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Set as AbstractSet
from functools import partial
import numpy as np
import pytest
from mistral_common.protocol.instruct.chunk import ImageChunk, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tokenizers.mistral import MistralTokenizer
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import (
_MULTIMODAL_EXAMPLE_MODELS,
_TRANSFORMERS_BACKEND_MODELS,
HF_EXAMPLE_MODELS,
)
def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
"""
Patch the multimodal data for GLM4.1V model.
"""
# Ensure video metadata is included
if "video" in mm_data:
# GLM4.1V doesn't support multiple videos
video = mm_data["video"]
num_frames = len(video)
mm_data["video"] = (
video,
{
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
},
)
return mm_data
def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
"""
Patch the multimodal data for Qwen3-VL model.
"""
def create_metadata(frames: np.ndarray):
num_frames = len(frames)
return {
"total_num_frames": num_frames,
"fps": 2.0,
"duration": num_frames / 2.0,
"video_backend": "opencv",
"frames_indices": list(range(num_frames)),
"do_sample_frames": True,
}
# Ensure video metadata is included
if "video" in mm_data:
video = mm_data["video"]
if isinstance(video, list):
# multiple videos
mm_data["video"] = [(vid, create_metadata(vid)) for vid in video]
else:
# single video
mm_data["video"] = (video, create_metadata(video))
return mm_data
# For some multimodal models, tokenizer will always add bos_token
# at the beginning of prompt by default, causing hf_processor outputs
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES = {
"ovis": False,
"ovis2_5": False,
"paligemma": False,
"ultravox": False,
"whisper": False,
}
_IGNORE_MM_KEYS = {
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
"ultravox": {"audio_features"},
}
MM_DATA_PATCHES = {
# GLM4.1V and Qwen3-VL requires video metadata to be included in the input
"glm4v": glm4_1v_patch_mm_data,
"glm4v_moe": glm4_1v_patch_mm_data,
"qwen3_vl": qwen3_vl_patch_mm_data,
"qwen3_vl_moe": qwen3_vl_patch_mm_data,
}
def _iter_model_ids_to_test(model_arch_list: AbstractSet[str]):
for model_arch in model_arch_list:
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
yield model_info.default
for extra_type, extra_model_id in model_info.extras.items():
if "fp" in extra_type:
continue # Redundant to test quantized models
yield extra_model_id
def _get_model_ids_to_test(model_arch_list: AbstractSet[str]):
return list(_iter_model_ids_to_test(model_arch_list))
def get_model_ids_to_test():
transformers_arch_ids = {
model_id
for info in _TRANSFORMERS_BACKEND_MODELS.values()
for model_id in (info.default, *info.extras.values())
}
vllm_only_archs = {
arch
for arch, info in _MULTIMODAL_EXAMPLE_MODELS.items()
if not any(
model_id in transformers_arch_ids
for model_id in (info.default, *info.extras.values())
)
}
return _get_model_ids_to_test(vllm_only_archs)
def get_text_token_prompts(
processor: BaseMultiModalProcessor,
mm_data: MultiModalDataDict,
):
dummy_inputs = processor.dummy_inputs
tokenizer: TokenizerLike = processor.info.get_tokenizer()
model_config = processor.info.ctx.model_config
model_type = model_config.hf_config.model_type
if model_type in MM_DATA_PATCHES:
mm_data = MM_DATA_PATCHES[model_type](mm_data)
parsed_data = processor.data_parser.parse_mm_data(mm_data)
mm_counts = {k: len(vs) for k, vs in parsed_data.items()}
text_prompt: str | None
token_prompt: list[int]
if isinstance(tokenizer, MistralTokenizer):
images = parsed_data.get("image", [])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
res = tokenizer.mistral.encode_chat_completion(request)
# Mistral does not support decode_tokens with skip_special_tokens=False
text_prompt = None
token_prompt = res.tokens
else:
inputs = dummy_inputs.get_dummy_processor_inputs(
model_config.max_model_len,
mm_counts,
)
assert isinstance(inputs.prompt, str)
text_prompt = inputs.prompt
token_prompt = tokenizer.encode(
text_prompt,
add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
)
return text_prompt, token_prompt
def _test_processing_correctness(
model_id_or_arch: str,
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
if model_id_or_arch in HF_EXAMPLE_MODELS.get_supported_archs():
# Use model architecture to get the default model id
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_id_or_arch)
model_id = model_info.default
else:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
model_id = model_id_or_arch
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_config = ModelConfig(
model_id,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
# Ensure that the cache can fit all of the data
mm_processor_cache_gb=2048,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = model_cls._processor_factory
ctx = InputProcessingContext(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
cache = MultiModalProcessorOnlyCache(model_config)
processing_info = factories.info(ctx)
supported_mm_limits = processing_info.get_supported_mm_limits()
# Keep integer limits for local data generation
limit_mm_per_prompt_ints = {
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
if modality == "video":
return VideoDummyOptions(count=count)
if modality == "image":
return ImageDummyOptions(count=count)
if modality == "audio":
return AudioDummyOptions(count=count)
return BaseDummyOptions(count=count)
# Assign normalized DummyOptions to the model config
model_config.get_multimodal_config().limit_per_prompt = {
modality: _to_dummy_options(modality, count)
for modality, count in limit_mm_per_prompt_ints.items()
}
baseline_processor = factories.build_processor(ctx, cache=None)
cached_processor = factories.build_processor(ctx, cache=cache)
rng = np.random.RandomState(0)
input_to_hit = {
"image": Image.new("RGB", size=(128, 128)),
"video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
"audio": (np.zeros((512,)), 16000),
}
input_factory = {
"image": partial(random_image, rng, min_wh=128, max_wh=256),
"video": partial(
random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
),
"audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
}
for batch_idx in range(num_batches):
mm_data = {
k: [
(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))
]
for k, limit in limit_mm_per_prompt_ints.items()
}
# Drop unnecessary keys and test single -> multi conversion
if rng.rand() < simplify_rate:
for k in list(mm_data.keys()):
if not mm_data[k]:
del mm_data[k]
elif len(mm_data[k]) == 1:
mm_data[k] = mm_data[k][0]
_test_processing_correctness_one(
model_config,
mm_data,
baseline_processor,
cached_processor,
batch_idx,
)
def _test_processing_correctness_one(
model_config: ModelConfig,
mm_data: MultiModalDataDict,
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
):
model_type = model_config.hf_config.model_type
text_prompt, token_prompt = get_text_token_prompts(baseline_processor, mm_data)
ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
baseline_tokenized_result = baseline_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_tokenized_result = cached_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
_assert_inputs_equal(
baseline_tokenized_result,
cached_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {token_prompt=}, {mm_data=})",
)
if text_prompt is not None:
baseline_text_result = baseline_processor.apply(
text_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_text_result = cached_processor.apply(
text_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
_assert_inputs_equal(
baseline_text_result,
cached_text_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, {mm_data=})",
)
_assert_inputs_equal(
baseline_text_result,
baseline_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)
_assert_inputs_equal(
cached_text_result,
cached_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
def test_processing_correctness(
model_id: str,
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
if model_id == "google/gemma-3n-E2B-it":
pytest.skip("Fix later")
if model_id == "OpenGVLab/InternVL2-2B":
pytest.skip("Fix later")
if model_id == "jinaai/jina-reranker-m0":
pytest.skip("Fix later")
_test_processing_correctness(
model_id,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
)
def _assert_inputs_equal(
a: MultiModalInputs,
b: MultiModalInputs,
*,
ignore_mm_keys: set[str] | None = None,
msg: str = "",
):
if ignore_mm_keys is None:
ignore_mm_keys = set()
a_rest = {k: v for k, v in a.items() if k != "mm_kwargs"}
b_rest = {k: v for k, v in b.items() if k != "mm_kwargs"}
assert a_rest == b_rest, msg
a_data = a["mm_kwargs"].get_data()
b_data = b["mm_kwargs"].get_data()
for key in ignore_mm_keys:
a_data.pop(key, None)
b_data.pop(key, None)
assert batched_tensors_equal(a_data, b_data), msg

View File

@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"])
def test_get_image_size_with_most_features(
image_assets: ImageTestAssets, model_id: str
):
ctx = build_model_context(
model_id,
mm_processor_kwargs={"do_pan_and_scan": True},
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs: dict[str, object] = {}
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
max_image_size = processor.info.get_image_size_with_most_features()
max_tokens = processor.info.get_num_image_tokens(
image_width=max_image_size.width,
image_height=max_image_size.height,
processor=hf_processor,
)
prompt = "<start_of_image>"
image_seq_length = hf_processor.image_seq_length
for asset in image_assets:
mm_data = {"image": [asset.pil_image]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
mm_kwargs_data = processed_inputs["mm_kwargs"].get_data()
num_patches_tensor = mm_kwargs_data["num_patches"]
tokens = int(num_patches_tensor.item()) * image_seq_length
assert tokens <= max_tokens

View File

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import batched_tensors_equal
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
@pytest.mark.parametrize(
"num_frames, fps, expected_grid_t",
[
# pre-sampled fixed frames (unexpected behavior,
# but we still expect it to work without errors)
(32, 1, 16),
(32, 2, 16),
(128, 1, 64),
(128, 2, 64),
# post-sampled frames (expected behavior)
(-1, 1, 5),
(-1, 2, 10),
],
)
def test_processor_override(
model_id: str,
expected_toks_per_frame: int,
expected_grid_t: int,
fps: int,
num_frames: int,
):
"""Ensure GLM4vMultiModalProcessor can handle video frames properly."""
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
tokenizer = processor.info.get_tokenizer()
hf_processor_mm_kwargs = {"fps": fps}
# Build the image str / prompt based on the number of images we pass
video_assets = VideoAsset(name="baby_reading", num_frames=num_frames)
prompt = "<|begin_of_video|><|video|><|end_of_video|>"
video, metadata = video_assets.np_ndarrays, video_assets.metadata
metadata["fps"] = fps
mm_data = {"video": [(video, metadata)]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]
assert grid_t == expected_grid_t
assert video_tok_count == expected_toks_per_frame * grid_t
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
model_id: str,
fps: int,
):
"""
Ensure dynamic video loader (pre-sampled by loader) and normal video
loader (post-sampled by processor) produce same video processing outputs.
"""
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {"fps": fps}
# Build the image str / prompt based on the number of images we pass
prompt = "<|begin_of_video|><|video|><|end_of_video|>"
video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
with open(video_path, "rb") as f:
video_bytes = f.read()
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
video_bytes, fps=fps
)
# pre-sampled loader shouldn't read all frames
assert len(dynamic_video) < len(static_video)
static_mm_data = {"video": [(static_video, static_metadata)]}
dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
assert batched_tensors_equal(
static_outputs["mm_kwargs"].get_data(),
dynamic_outputs["mm_kwargs"].get_data(),
)

View File

@@ -0,0 +1,177 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.h2ovl import (
calculate_h2ovl_targets,
get_h2ovl_target_ratios,
)
width, height = image.size
# Calculate the expected number of blocks
if num_imgs == 1 and config.use_msac:
# First pass
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num=1,
max_num=max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False, # Thumbnail is handled separately
)
# Second pass
blocks2, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num=3,
max_num=max_num,
prior_aspect_ratio=aspect_ratio,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if config.use_thumbnail:
blocks1 += 1 if blocks1 > 1 else 0
blocks2 += 1 if blocks2 > 1 else 0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks = blocks1 + blocks2 - 1
return total_blocks
blocks, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
prompt = "<image>" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize(
"model_id",
[
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
],
)
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: ImageTestAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool | None,
kwargs_on_init: bool,
):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
_run_check(
processor,
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,
)

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Idefics3's multimodal preprocessing kwargs."""
import pytest
from transformers import Idefics3Config
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"size": {"longest_edge": 364}}, 169),
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Idefics3MultiModalProcessor handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
image_size = ctx.get_hf_config(Idefics3Config).vision_config.image_size
dummy_image_size = (image_size * 4, image_size * 4)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
assert img_tok_count == expected_toks_per_img * num_imgs

View File

@@ -0,0 +1,131 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for InternVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.internvl import (
calculate_internvl_targets,
get_internvl_target_ratios,
)
width, height = image.size
blocks, _, _ = calculate_internvl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_internvl_target_ratios(
min_num,
max_num,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
prompt = "<image>" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"])
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: ImageTestAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool | None,
kwargs_on_init: bool,
):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
_run_check(
processor,
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,
)

View File

@@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Llama4's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
@pytest.mark.parametrize("num_imgs", [1, 5])
@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
@pytest.mark.parametrize("tokenized_prompt", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict,
num_imgs: int,
mm_processor_cache_gb: int,
tokenized_prompt: bool,
):
"""Ensure llama4 processor works properly."""
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt={"image": num_imgs},
mm_processor_cache_gb=mm_processor_cache_gb,
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
config = processor.info.get_hf_config()
tokenizer = processor.info.get_tokenizer()
hf_processor = processor.info.get_hf_processor()
vocab = tokenizer.get_vocab()
prompt = (
"<|begin_of_text|><|header_start|>user<|header_end|>"
+ "<|image|>" * num_imgs
+ "<|eot|><|header_start|>assistant<|header_end|>"
)
mm_data = {
"image": [
image_assets[(i % len(image_assets))].pil_image for i in range(num_imgs)
]
}
if tokenized_prompt:
prompt = tokenizer.encode(prompt)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
mm_data = processed_inputs["mm_kwargs"].get_data()
# place holder replacements
prompt_token_ids = processed_inputs["prompt_token_ids"]
assert prompt_token_ids.count(config.boi_token_index) == num_imgs
assert prompt_token_ids.count(config.eoi_token_index) == num_imgs
assert prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs
aspect_ratios = mm_data["aspect_ratios"]
num_x_separators = num_y_separators = 0
for tiles_y, tiles_x in aspect_ratios:
if tiles_x * tiles_y > 1:
num_x_separators += (tiles_x - 1) * tiles_y
num_y_separators += tiles_y
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) == num_x_separators
assert (
prompt_token_ids.count(vocab[hf_processor.tile_global_token])
== num_y_separators
)
# image token offsets
img_locs = processed_inputs["mm_placeholders"].get("image", [])
assert len(img_locs) == num_imgs
assert [img_loc.offset for img_loc in img_locs] == [
i for i, v in enumerate(prompt_token_ids) if v == config.boi_token_index
]
# patch sizes and masks
num_patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
assert (
prompt_token_ids.count(config.image_token_index)
== sum(mm_data["patches_per_image"]) * num_patches_per_chunk
)
assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])

View File

@@ -0,0 +1,194 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from functools import partial
import pytest
from PIL import Image
from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from ...utils import build_model_context
def _validate_image_max_tokens_one(
processor: BaseMultiModalProcessor,
max_tokens: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
except Exception as exc:
failed_size_excs.append((image_size, exc))
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
info = processor.info
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()
# The aspect ratio of the grid layout is between 1 and 2
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(32, 4096), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
failed_size_excs = list[tuple[ImageSize, Exception]]()
validate_one = partial(
_validate_image_max_tokens_one,
processor,
info.get_max_image_tokens(), # type: ignore
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
prompt = "<image>" * num_imgs
image = Image.new("RGB", size=image_size)
mm_data = {"image": [image] * num_imgs}
try:
# The processor will throw an error if there is a mismatch
# in the prompt replacements
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs
first_placeholder = image_placeholders[0]
# NOTE: There is a BOS token
assert first_placeholder.offset == 1
assert (
first_placeholder.length
== (len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
processor,
*,
num_imgs: int,
image_sizes: list[ImageSize],
) -> None:
"""
Ensure LlavaNextMultiModalProcessor
handles prompt replacement properly for input images.
"""
failed_size_excs = list[tuple[ImageSize, Exception]]()
validate_one = partial(
_validate_image_prompt_replacements_one,
processor,
num_imgs,
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()
# The aspect ratio of the grid layout is between 1 and 2
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(64, 1024), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)

View File

@@ -0,0 +1,192 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from functools import partial
import pytest
from PIL import Image
from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from ...utils import build_model_context
def _validate_image_max_tokens_one(
processor: BaseMultiModalProcessor,
max_tokens: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
except Exception as exc:
failed_size_excs.append((image_size, exc))
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
info = processor.info
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()
# The aspect ratio of the grid layout is between 1 and 6
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(32, 4096), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
failed_size_excs = list[tuple[ImageSize, Exception]]()
validate_one = partial(
_validate_image_max_tokens_one,
processor,
info.get_max_image_tokens(), # type: ignore
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
prompt = "<image>" * num_imgs
image = Image.new("RGB", size=image_size)
mm_data = {"image": [image] * num_imgs}
try:
# The processor will throw an error if there is a mismatch
# in the prompt replacements
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs
first_placeholder = image_placeholders[0]
assert first_placeholder.offset == 0
assert (
first_placeholder.length
== len(processed_inputs["prompt_token_ids"]) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
processor,
*,
num_imgs: int,
image_sizes: list[ImageSize],
) -> None:
"""
Ensure LlavaOnevisionMultiModalProcessor
handles prompt replacement properly for input images.
"""
failed_size_excs = list[tuple[ImageSize, Exception]]()
validate_one = partial(
_validate_image_prompt_replacements_one,
processor,
num_imgs,
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()
# The aspect ratio of the grid layout is between 1 and 6
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(64, 1024), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)

View File

@@ -0,0 +1,105 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from PIL import Image
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
num_imgs: int,
):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
prompt = "<image>" * num_imgs
image = Image.new("RGB", size=(364, 364))
mm_data = {"image": [image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs
def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
prompt = "<image>" * num_imgs
image = Image.new("RGB", size=image_size)
mm_data = {"image": [image] * num_imgs}
try:
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs
except Exception as exc:
failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
processor,
*,
num_imgs: int,
image_sizes: list[ImageSize],
) -> None:
failed_size_excs = list[tuple[ImageSize, Exception]]()
for size in image_sizes:
_validate_image_prompt_replacements_one(
processor, num_imgs, failed_size_excs, size
)
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)

View File

@@ -0,0 +1,72 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for mllama's multimodal preprocessing and profiling."""
import pytest
from torch import prod
from transformers import Llama4Config
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.profiling import MultiModalProfiler
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-Guard-4-12B"])
@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
def test_profiling(model_id: str, max_model_len: int):
model_config_kwargs = {
"max_model_len": max_model_len,
}
mm_counts = {"image": 1}
ctx = build_model_context(
model_id,
model_config_kwargs=model_config_kwargs,
limit_mm_per_prompt=mm_counts,
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
profiler = MultiModalProfiler(processor)
decoder_dummy_data = profiler.get_decoder_dummy_data(
max_model_len,
mm_counts=mm_counts,
)
dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
max_model_len,
mm_counts=mm_counts,
)
hf_config = ctx.get_hf_config(Llama4Config)
mm_data = processor.apply(
prompt=dummy_mm_data.prompt,
mm_data=dummy_mm_data.mm_data,
hf_processor_mm_kwargs=dict(),
)["mm_kwargs"].get_data()
image_size = hf_config.vision_config.image_size
patch_size = hf_config.vision_config.patch_size
downsample_ratio = int(
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
)
tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio
chunks_per_image = prod(mm_data["patches_per_image"])
total_num_patches = chunks_per_image * tokens_per_patch
num_tiles = (
mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
) # x-y separator tokens
total_tokens = (
total_num_patches.item() + num_tiles.item() + 3
) # image start, image, image end
profiled_tokens = profiler.get_mm_max_tokens(
max_model_len,
mm_counts=mm_counts,
)
assert total_num_patches == profiled_tokens["image"]
assert total_tokens == sum(
placeholder.length
for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
)

View File

@@ -0,0 +1,133 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.nemotron_vl import (
calculate_nemotron_vl_targets,
get_nemotron_vl_target_ratios,
)
width, height = image.size
blocks, _, _ = calculate_nemotron_vl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_nemotron_vl_target_ratios(
min_num,
max_num,
),
image_size=config.force_image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
image_processor = processor.info.get_image_processor()
config.use_thumbnail = image_processor.use_thumbnail
prompt = "<image>" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images
)
print(total_expected_num_patches)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<image>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: ImageTestAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool | None,
kwargs_on_init: bool,
):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
_run_check(
processor,
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,
)

View File

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi3v's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"num_crops": 4}, 757),
({"num_crops": 16}, 1921),
# the default num_crops of phi-3.5-vision is 4
({}, 757),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Phi3VMultiModalProcessor handles num_crops properly."""
# Avoid initializing CUDA early
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
assert img_tok_count == expected_toks_per_img * num_imgs

View File

@@ -0,0 +1,60 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi4mm's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"dynamic_hd": 4}, 1329),
({"dynamic_hd": 16}, 4433),
# the default num_crops of phi-4-multimodal is 36
({}, 9585),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Phi4MMMultiModalProcessor handles dynamic_hd properly."""
# Avoid initializing CUDA early
from vllm.model_executor.models.phi4mm import _IMAGE_PLACEHOLDER_TOKEN_ID
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
image_size = ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
dummy_image_size = (image_size * 7, image_size * 7)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(
_IMAGE_PLACEHOLDER_TOKEN_ID
)
assert img_tok_count == expected_toks_per_img * num_imgs

View File

@@ -0,0 +1,90 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
[
({}, 1426, (5704, 1176)),
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
expected_pixels_shape: tuple[int, int],
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
tokenizer = processor.info.get_tokenizer()
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape
assert img_tok_count == expected_toks_per_img * num_imgs
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
assert pixel_shape[1] == expected_pixels_shape[1]
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
@pytest.mark.parametrize("max_pixels", [1280 * 28 * 28, 1283 * 28 * 28])
def test_get_image_size_with_most_features(
image_assets: ImageTestAssets,
model_id: str,
max_pixels: int,
):
ctx = build_model_context(
model_id,
mm_processor_kwargs={"max_pixels": max_pixels},
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs: dict[str, object] = {}
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
merge_size = processor.info.get_hf_config().vision_config.spatial_merge_size
max_image_size = processor.info.get_image_size_with_most_features()
max_tokens = processor.info.get_num_image_tokens(
image_width=max_image_size.width,
image_height=max_image_size.height,
image_processor=hf_processor.image_processor,
)
prompt = "<|vision_start|><|image_pad|><|vision_end|>"
for asset in image_assets:
mm_data = {"image": [asset.pil_image]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
grid_thw = processed_inputs["mm_kwargs"].get_data()["image_grid_thw"].tolist()
t, h, w = grid_thw[0]
tokens = (t * h * w) // (merge_size**2)
assert tokens < max_tokens

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for smolvlm's multimodal preprocessing kwargs."""
import pytest
from transformers import SmolVLMConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"max_image_size": {"longest_edge": 384}}, 1377),
({"max_image_size": {"longest_edge": 768}}, 405),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Idefics3MultiModalProcessor handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
image_size = ctx.get_hf_config(SmolVLMConfig).vision_config.image_size
dummy_image_size = (image_size * 4, image_size * 4)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
assert img_tok_count == expected_toks_per_img * num_imgs

View File

@@ -0,0 +1,258 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from collections.abc import Iterable
from contextlib import contextmanager
from functools import partial
from typing import Any, TypeAlias
import numpy as np
import pytest
import torch
import torch.nn as nn
from PIL import Image
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.distributed import (
cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.models.interfaces import (
SupportsMultiModal,
supports_multimodal,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.platforms import current_platform
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype
from ....utils import create_new_process_for_each_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import dummy_hf_overrides
from .test_common import get_model_ids_to_test, get_text_token_prompts
ImageInput = list[Image.Image]
VideoInput: TypeAlias = (
list[Image.Image] | list[np.ndarray] | list[tuple[np.ndarray, dict[str, Any]]]
)
AudioInput = list[tuple[np.ndarray, int]]
MM_OPTIONS_OVERRIDES = {
# Qwen3-VL's default profiling video size (64x64) can cause trouble
# after resizing, so we override it here for testing.
"qwen3_vl": dict(
video=VideoDummyOptions(num_frames=128, width=256, height=256),
),
"qwen3_vl_moe": dict(
video=VideoDummyOptions(num_frames=128, width=256, height=256),
),
}
def _resize_data(
_data: Image.Image | np.ndarray, size_factor: float
) -> Image.Image | np.ndarray:
assert size_factor <= 1, "Size factor must be less than 1"
# Image input
if isinstance(_data, Image.Image):
W, H = _data.width, _data.height
W, H = map(lambda x: int(x * size_factor), (W, H))
return _data.resize((W, H))
# Video input with PIL Images
elif is_list_of(_data, Image.Image):
W, H = next(iter(_data)).width, next(iter(_data)).height
T = len(_data)
T, W, H = map(lambda x: max(int(x * size_factor), 1), (T, W, H))
return [d.resize((W, H)) for d in _data[:T]]
# Video input with numpy arrays
elif isinstance(_data, np.ndarray) and _data.ndim >= 4:
T, H, W, C = _data.shape[-4:]
T, H, W = map(lambda x: max(int(x * size_factor), 1), (T, H, W))
return _data[..., :T, :H, :W, :C]
# Audio input
elif isinstance(_data, np.ndarray) and _data.ndim == 1:
return _data[: int(len(_data) * size_factor)]
raise AssertionError("This line should be unreachable.")
def resize_mm_data(
data: ImageInput | VideoInput | AudioInput, size_factors: tuple[float, ...]
) -> ImageInput | VideoInput | AudioInput:
size_factors = size_factors[: len(data)]
if is_list_of(data, (Image.Image, np.ndarray, list)):
return [_resize_data(d, s) for d, s in zip(data, size_factors)]
elif is_list_of(data, tuple):
return [_resize_data(d, s) for (d, _), s in zip(data, size_factors)]
raise ValueError("Unsupported multimodal data type.")
def create_batched_mm_kwargs(
model_cls: type[SupportsMultiModal],
model_config: ModelConfig,
processor: BaseMultiModalProcessor,
size_factors: tuple[float, ...] = (1.0, 0.5, 0.25),
) -> Iterable[tuple[str, int, BatchedTensorInputs]]:
model_type = model_config.hf_config.model_type
processing_info = processor.info
dummy_inputs = processor.dummy_inputs
supported_mm_limits = processing_info.get_supported_mm_limits()
mm_counts = {
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
processor_inputs = dummy_inputs.get_dummy_processor_inputs(
seq_len=model_config.max_model_len,
mm_counts=mm_counts,
mm_options=MM_OPTIONS_OVERRIDES.get(model_type),
)
mm_data = processor_inputs.mm_data
resized_mm_data = {
modality: resize_mm_data(data, size_factors)
for modality, data in mm_data.items()
}
# video metadata will be added back to the resized video data here.
text_prompt, token_prompt = get_text_token_prompts(processor, resized_mm_data)
mm_kwargs = processor.apply(
prompt=token_prompt if text_prompt is None else text_prompt,
mm_data=resized_mm_data,
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
tokenization_kwargs=processor_inputs.tokenization_kwargs,
)["mm_kwargs"].require_data()
return group_mm_kwargs_by_modality(
[item for modality in supported_mm_limits for item in mm_kwargs[modality]]
)
# TODO(Isotr0py): Don't initalize model during test
@contextmanager
def initialize_dummy_model(
model_cls: type[nn.Module],
model_config: ModelConfig,
):
temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)
initialize_model_parallel(tensor_model_parallel_size=1)
current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config)
with set_current_vllm_config(vllm_config=vllm_config):
with set_default_torch_dtype(model_config.dtype):
torch.set_default_device(current_platform.device_type)
model = model_cls(vllm_config=vllm_config)
torch.set_default_device(current_device)
yield model
del model
cleanup_dist_env_and_memory()
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
def test_model_tensor_schema(model_id: str):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_arch = next(
arch for arch, info in HF_EXAMPLE_MODELS.hf_models.items() if info == model_info
)
hf_overrides_fn = partial(
dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides,
)
# ROCm: Detect if model uses AWQ quantization and set appropriate dtype
if "awq" in model_id.lower() and current_platform.is_rocm():
dtype = "float16"
else:
dtype = model_info.dtype
model_config = ModelConfig(
model_id,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=hf_overrides_fn,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
assert supports_multimodal(model_cls)
factories = model_cls._processor_factory
inputs_parse_methods = []
for attr_name in dir(model_cls):
attr = getattr(model_cls, attr_name)
if hasattr(attr, "__annotations__"):
return_type = attr.__annotations__.get("return", None)
if return_type is not None and "Input" in str(return_type):
inputs_parse_methods.append(attr_name)
if not any(inputs_parse_methods):
pytest.skip(f"{model_arch} does not support tensor schema validation.")
ctx = InputProcessingContext(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
processing_info = factories.info(ctx)
supported_mm_limits = processing_info.get_supported_mm_limits()
limit_mm_per_prompt = {
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
if modality == "video":
return VideoDummyOptions(count=count)
if modality == "image":
return ImageDummyOptions(count=count)
if modality == "audio":
return AudioDummyOptions(count=count)
return BaseDummyOptions(count=count)
model_config.get_multimodal_config().limit_per_prompt = {
modality: _to_dummy_options(modality, count)
for modality, count in limit_mm_per_prompt.items()
}
processor = factories.build_processor(ctx, cache=None)
with initialize_dummy_model(model_cls, model_config) as model:
for modality, _, mm_kwargs in create_batched_mm_kwargs(
model_cls, model_config, processor
):
for method_name in inputs_parse_methods:
print(
f"Testing `{method_name}` with modality={modality} "
f"and mm_kwargs{list(mm_kwargs.keys())}"
)
getattr(model, method_name)(modality=modality, **mm_kwargs)

View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_multimodal_processor(model_id):
model_config = ModelConfig(
model=model_id,
model_impl="transformers",
)
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
image_pil = ImageAsset("cherry_blossom").pil_image
mm_data = {"image": image_pil}
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
str_processed_inputs = mm_processor.apply(
prompt=str_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
ids_prompt = [
151644,
872,
220,
151646,
198,
3838,
374,
279,
2213,
315,
419,
2168,
30,
151645,
151644,
77091,
198,
]
ids_processed_inputs = mm_processor.apply(
prompt=ids_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
assert (
str_processed_inputs["prompt_token_ids"]
== ids_processed_inputs["prompt_token_ids"]
)

View File

@@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
import pytest
import torch
import transformers
from transformers import AutoConfig, PreTrainedModel
from vllm.config import ModelConfig
from vllm.model_executor.models.utils import WeightsMapper
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.transformers_utils.config import try_get_safetensors_metadata
from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
"""Create weights from safetensors checkpoint metadata"""
metadata = try_get_safetensors_metadata(repo)
weight_names = list(metadata.weight_map.keys())
with torch.device("meta"):
return ((name, torch.empty(0)) for name in weight_names)
def create_dummy_model(repo: str, model_arch: str) -> PreTrainedModel:
"""
Create weights from a dummy meta deserialized hf model with name conversion
"""
model_cls: PreTrainedModel = getattr(transformers, model_arch)
config = AutoConfig.from_pretrained(repo)
with torch.device("meta"):
return model_cls._from_config(config)
def model_architectures_for_test() -> list[str]:
arch_to_test = list[str]()
for model_arch, info in _MULTIMODAL_EXAMPLE_MODELS.items():
if not info.trust_remote_code and hasattr(transformers, model_arch):
model_cls: PreTrainedModel = getattr(transformers, model_arch)
if getattr(model_cls, "_checkpoint_conversion_mapping", None):
arch_to_test.append(model_arch)
return arch_to_test
@pytest.mark.core_model
@pytest.mark.parametrize("model_arch", model_architectures_for_test())
def test_hf_model_weights_mapper(model_arch: str):
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
is_mistral_model = model_arch in [
"Mistral3ForConditionalGeneration",
"PixtralForConditionalGeneration",
"VoxtralForConditionalGeneration",
]
if not is_mistral_model or model_info.tokenizer_mode == "mistral":
tokenizer_mode = model_info.tokenizer_mode
else:
tokenizer_mode = "hf"
model_id = model_info.default
model_config = ModelConfig(
model_id,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=tokenizer_mode,
config_format="hf",
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
original_weights = create_repo_dummy_weights(model_id)
hf_dummy_model = create_dummy_model(model_id, model_arch)
hf_converted_weights = hf_dummy_model.named_parameters()
hf_converted_buffers = hf_dummy_model.named_buffers()
mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
mapped_original_weights = mapper.apply(original_weights)
mapped_hf_converted_weights = mapper.apply(hf_converted_weights)
mapped_hf_converted_buffers = mapper.apply(hf_converted_buffers)
ref_weight_names = set(map(lambda x: x[0], mapped_original_weights))
weight_names = set(map(lambda x: x[0], mapped_hf_converted_weights))
buffer_names = set(map(lambda x: x[0], mapped_hf_converted_buffers))
# Some checkpoints may have buffers, we ignore them for this test
ref_weight_names -= buffer_names
weights_missing = ref_weight_names - weight_names
weights_unmapped = weight_names - ref_weight_names
assert not weights_missing and not weights_unmapped, (
f"Following weights are not mapped correctly: {weights_unmapped}, "
f"Missing expected weights: {weights_missing}."
)