Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions


@@ -0,0 +1,347 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Helpers for building inputs that can be leveraged for different test types."""
from collections.abc import Callable, Iterable
from pathlib import PosixPath
from typing import Any
import numpy.typing as npt
import torch
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
from .types import (
SINGLE_AUDIO_BASE_PROMPT,
SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER,
TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER,
VIDEO_BASE_PROMPT,
ImageSizeWrapper,
PromptWithMultiModalInput,
SizeType,
VLMTestInfo,
)
def replace_test_placeholder(
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
) -> str:
"""Given a prompt, replaces each test placeholder with the
model-specific tag.
"""
prompt_segments = prompt.split(test_placeholder)
img_prompt = prompt_segments[0]
for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1):
img_prompt += mm_idx_to_prompt(placeholder_idx)
img_prompt += next_seg
return img_prompt
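# A minimal, runnable sketch of the helper above; the index-to-tag mapper
# below is a hypothetical stand-in for a model's real img_idx_to_prompt.
def _example_replace_test_placeholder() -> None:
    result = replace_test_placeholder(
        f"{TEST_IMG_PLACEHOLDER}What is the season?",
        lambda idx: f"<|image_{idx}|>\n",
        TEST_IMG_PLACEHOLDER,
    )
    assert result == "<|image_1|>\nWhat is the season?"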
def get_model_prompts(
base_prompts: Iterable[str],
img_idx_to_prompt: Callable[[int], str] | None,
video_idx_to_prompt: Callable[[int], str] | None,
audio_idx_to_prompt: Callable[[int], str] | None,
prompt_formatter: Callable[[str], str],
) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
Example for phi3v, given the base_prompt: "<image>What is the season?"
1. Replace img placeholder(s)
-> "<|image_1|>\nWhat is the season?"
2. Apply prompt formatter:
-> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
"""
assert isinstance(base_prompts, (list, tuple))
model_prompts = []
for base_prompt in base_prompts:
# Replace the multimodal placeholders in the base prompt with
# the correct ones for the model that we are testing
if img_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
)
if video_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
)
if audio_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
model_prompt = prompt_formatter(base_prompt)
model_prompts.append(model_prompt)
return model_prompts
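# A runnable version of the phi3v example in the docstring above; the mapper
# and formatter are illustrative stand-ins for a real test configuration.
def _example_get_model_prompts() -> None:
    prompts = get_model_prompts(
        [f"{TEST_IMG_PLACEHOLDER}What is the season?"],
        img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
        video_idx_to_prompt=None,
        audio_idx_to_prompt=None,
        prompt_formatter=lambda p: f"<|user|>\n{p}<|end|>\n<|assistant|>\n",
    )
    assert prompts == [
        "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n"
    ]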
def build_single_image_inputs_from_test_info(
test_info: VLMTestInfo,
image_assets: ImageTestAssets,
size_wrapper: ImageSizeWrapper,
tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build single image inputs")
model_prompts = get_model_prompts(
test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
    # For models that require a local path / URL encoded in the prompt, export
    # the assets and encode their paths into the prompts under tmp_path for
    # this test. This should be avoided where possible (currently needed for
    # Qwen-VL).
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
raise ValueError("Prompt path encoder requires setting local path")
model_prompts = [
test_info.prompt_path_encoder(tmp_path, prompt, [asset])
for prompt, asset in zip(model_prompts, image_assets)
]
images = [asset.pil_image for asset in image_assets]
assert len(images) == len(model_prompts)
return build_single_image_inputs(images, model_prompts, size_wrapper)
def build_single_image_inputs(
images, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
    # For every image / prompt pair, build a pair of two lists, each of
    # length len(size_wrapper.data): the first contains duplicates of the
    # model prompt [str], and the second contains copies of the image after
    # being scaled by one of the size factors.
    #
    # NOTE: rescaling preserves the image aspect ratio.
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
image_data=[
apply_image_size_scaling(image, size, size_wrapper.type)
for size in size_wrapper.data
],
)
for image, prompt in zip(images, model_prompts)
]
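# A minimal sketch of the expansion above: one image / prompt pair fans out
# into len(size_wrapper.data) prompt copies and scaled image copies. The
# image size and factors are illustrative.
def _example_build_single_image_inputs() -> None:
    from PIL import Image  # local import; only needed for this sketch

    image = Image.new("RGB", (64, 48))
    wrapper = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=(0.5, 1.0))
    (case,) = build_single_image_inputs([image], ["prompt"], wrapper)
    assert len(case.prompts) == len(case.image_data) == 2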
def build_multi_image_inputs_from_test_info(
test_info: VLMTestInfo,
image_assets: ImageTestAssets,
size_wrapper: ImageSizeWrapper,
tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build multi image inputs")
model_prompts = get_model_prompts(
[test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
raise ValueError("Prompt path encoder requires setting local path")
model_prompts = [
test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
for model_prompt in model_prompts
]
images = [asset.pil_image for asset in image_assets]
# Currently, we only have one multi-image list & one multi-image prompt
return build_multi_image_inputs(
image_lists=[images],
model_prompts=model_prompts,
size_wrapper=size_wrapper,
)
def build_multi_image_inputs(
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
image_data=[
[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
]
for size in size_wrapper.data
],
)
for images, prompt in zip(image_lists, model_prompts)
]
def build_embedding_inputs_from_test_info(
test_info: VLMTestInfo,
image_assets: ImageTestAssets,
size_wrapper: ImageSizeWrapper,
):
# These conditions will always be true if invoked through filtering,
# but we still check them in case this is ever called directly
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
factor == 1.0 for factor in size_wrapper.data
):
raise ValueError("Embedding tests require constant (1.0) size factors")
if test_info.convert_assets_to_embeddings is None:
raise ValueError("No conversion func for getting embeddings found")
model_prompts = get_model_prompts(
SINGLE_IMAGE_BASE_PROMPTS,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
images = [asset.pil_image for asset in image_assets]
embeds = test_info.convert_assets_to_embeddings(image_assets)
if test_info.dtype != "auto":
dtype = getattr(torch, test_info.dtype) # type: ignore
embeds = [e.to(dtype=dtype) for e in embeds]
assert len(images) == len(model_prompts)
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
return inputs, vllm_embeddings
def build_video_inputs_from_test_info(
test_info: VLMTestInfo,
video_assets: VideoTestAssets,
size_wrapper: ImageSizeWrapper,
num_frames: int,
needs_video_metadata: bool,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build video inputs")
model_prompts = get_model_prompts(
[VIDEO_BASE_PROMPT],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
sampled_vids = [
sample_frames_with_video_metadata(
(asset.np_ndarrays, asset.metadata),
num_frames,
)
for asset in video_assets
]
video_scaler = (
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
)
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
video_data=[
(
video_scaler(video, size)
if not needs_video_metadata
else (video_scaler(video, size), meta)
)
for size in size_wrapper.data
],
)
for (video, meta), prompt in zip(sampled_vids, model_prompts)
]
def sample_frames_with_video_metadata(
video_with_meta: tuple[npt.NDArray, dict[str, Any]],
num_frames: int,
) -> tuple[npt.NDArray, dict[str, Any]]:
video, meta = video_with_meta
video = sample_frames_from_video(video, num_frames)
meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
meta["total_num_frames"] = num_frames
meta["fps"] = meta["duration"] / num_frames
meta["frames_indices"] = list(range(num_frames))
return video, meta
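# A minimal sketch of the metadata rewrite above; the array shape and
# metadata values are illustrative.
def _example_sample_frames_with_video_metadata() -> None:
    import numpy as np

    video = np.zeros((32, 8, 8, 3), dtype=np.uint8)
    meta = {"total_num_frames": 32, "duration": 4.0}
    sampled, new_meta = sample_frames_with_video_metadata((video, meta), 16)
    assert sampled.shape[0] == 16
    assert new_meta["total_num_frames"] == 16
    assert new_meta["frames_indices"] == list(range(16))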
def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
# are considering size factors at constant scale, i.e., we just clone
# the tensor
if isinstance(image, torch.Tensor):
assert size_type == SizeType.SIZE_FACTOR and size == 1
return image
if size_type == SizeType.SIZE_FACTOR:
# We have a list of image size factors
return rescale_image_size(image, size)
elif size_type == SizeType.FIXED_SIZE:
# We have a list of fixed sizes
return image.resize(size)
raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
def build_audio_inputs_from_test_info(
test_info: VLMTestInfo,
audio_assets: AudioTestAssets,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build audio inputs")
model_prompts = get_model_prompts(
SINGLE_AUDIO_BASE_PROMPT,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
resampler = AudioResampler(
target_sr=16000,
method="librosa",
)
audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [
(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
)
for audio, sr in audios
]
return [
PromptWithMultiModalInput(
prompts=model_prompts,
audio_data=resampled_audios,
)
]
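# A minimal sketch of the resampling step above; the synthetic waveform and
# sample rates are illustrative.
def _example_audio_resampling() -> None:
    import numpy as np

    resampler = AudioResampler(target_sr=16000, method="librosa")
    one_second_at_8k = np.zeros(8000, dtype=np.float32)
    resampled = resampler.resample(one_second_at_8k, orig_sr=8000)
    assert len(resampled) == 16000  # one second at the target rate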


@@ -0,0 +1,183 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utils for determining which subset of model tests belong to a specific
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import itertools
from collections import OrderedDict
from collections.abc import Iterable
import pytest
from .types import (
EMBEDDING_SIZE_FACTORS,
ExpandableVLMTestArgs,
ImageSizeWrapper,
SizeType,
VLMTestInfo,
VLMTestType,
)
def get_filtered_test_settings(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
new_proc_per_test: bool,
) -> dict[str, VLMTestInfo]:
"""Given the dict of potential test settings to run, return a subdict
of tests who have the current test type enabled with the matching val for
fork_per_test.
"""
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
return test_info.test_type == test_type or (
isinstance(test_info.test_type, Iterable)
and test_type in test_info.test_type
)
matching_tests = {}
for test_name, test_info in test_settings.items():
        # Check if the test has the right type & keep it if it does
if matches_test_type(test_info, test_type):
# Embedding tests need to have a conversion func in their test info
if matches_test_type(test_info, VLMTestType.EMBEDDING):
assert test_info.convert_assets_to_embeddings is not None
# Custom test inputs need to explicitly define the mm limit/inputs
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
assert test_info.custom_test_opts is not None and isinstance(
test_info.custom_test_opts, Iterable
)
# For all types besides custom inputs, we need a prompt formatter
else:
assert test_info.prompt_formatter is not None
# Everything looks okay; keep if this is correct proc handling
if (
test_info.distributed_executor_backend is not None
) == new_proc_per_test:
matching_tests[test_name] = test_info
return matching_tests
def get_model_type_cases(
model_type: str,
test_info: VLMTestInfo,
test_type: VLMTestType,
):
    # Ensure that something is wrapped as an iterable if it's not already
    def ensure_wrapped(e):
        return e if isinstance(e, (list, tuple)) else (e,)
    # This is essentially the same as nesting a bunch of mark.parametrize
    # decorators, but we do it programmatically to allow overrides on a
    # per-model basis, while still being able to execute each of these
    # as individual test cases in pytest.
iter_kwargs = OrderedDict(
[
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
(
"distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend),
),
]
)
# num_frames is video only
if test_type == VLMTestType.VIDEO:
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
iter_kwargs["needs_video_metadata"] = ensure_wrapped(
test_info.needs_video_metadata
)
# No sizes passed for custom inputs, since inputs are directly provided
if test_type not in (
VLMTestType.CUSTOM_INPUTS,
VLMTestType.AUDIO,
):
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None:
raise ValueError(f"Sizes must be set for test type {test_type}")
iter_kwargs["size_wrapper"] = wrapped_sizes
# Otherwise expand the custom test options instead
elif test_type == VLMTestType.CUSTOM_INPUTS:
if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
# Wrap all model cases in a pytest parameter & pass marks through
return [
pytest.param(
model_type,
            ExpandableVLMTestArgs(**dict(zip(iter_kwargs, case))),
marks=test_info.marks if test_info.marks is not None else [],
)
        for case in itertools.product(*iter_kwargs.values())
]
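# A minimal sketch of the cartesian expansion above, using plain dicts
# instead of pytest params; the field names and values are illustrative.
def _example_case_expansion() -> None:
    iter_kwargs = OrderedDict(
        [("model", ("m1", "m2")), ("dtype", ("auto",)), ("max_tokens", (32, 128))]
    )
    cases = [
        dict(zip(iter_kwargs, case))
        for case in itertools.product(*iter_kwargs.values())
    ]
    assert len(cases) == 2 * 1 * 2  # product of each field's option count
    assert cases[0] == {"model": "m1", "dtype": "auto", "max_tokens": 32}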
def get_parametrized_options(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool,
):
"""Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like
size factors etc, while still running in isolated test cases.
"""
matching_tests = get_filtered_test_settings(
test_settings, test_type, create_new_process_for_each_test
)
# Get a list per model type, where each entry contains a tuple of all of
# that model type's cases, then flatten them into the top level so that
# we can consume them in one mark.parametrize call.
cases_by_model_type = [
get_model_type_cases(model_type, test_info, test_type)
for model_type, test_info in matching_tests.items()
]
return list(itertools.chain(*cases_by_model_type))
def get_wrapped_test_sizes(
test_info: VLMTestInfo, test_type: VLMTestType
) -> tuple[ImageSizeWrapper, ...]:
"""Given a test info which may have size factors or fixed sizes, wrap them
and combine them into an iterable, each of which will be used in parameter
expansion.
Args:
test_info: Test configuration to be expanded.
test_type: The type of test being filtered for.
"""
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
if test_type == VLMTestType.EMBEDDING:
        return tuple(
            ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
            for factor in EMBEDDING_SIZE_FACTORS
        )
# Audio and Custom inputs have preprocessed inputs
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
return tuple()
    size_factors = test_info.image_size_factors or []
    fixed_sizes = test_info.image_sizes or []
wrapped_factors = [
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in size_factors
]
wrapped_sizes = [
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
]
return tuple(wrapped_factors + wrapped_sizes)
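# A minimal sketch of the wrapping above; the VLMTestInfo fields are
# illustrative, not a real registered test entry.
def _example_get_wrapped_test_sizes() -> None:
    info = VLMTestInfo(
        models=["some-model"],
        test_type=VLMTestType.IMAGE,
        image_size_factors=[(0.5, 1.0)],
        image_sizes=[[(64, 48)]],
    )
    wrapped = get_wrapped_test_sizes(info, VLMTestType.IMAGE)
    assert len(wrapped) == 2  # one factor wrapper + one fixed-size wrapper
    assert wrapped[0].type == SizeType.SIZE_FACTOR
    assert wrapped[1].type == SizeType.FIXED_SIZE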


@@ -0,0 +1,189 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Core test implementation to be shared across modalities."""
from collections.abc import Callable
from typing import Any
import torch
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
from vllm.tokenizers import TokenizerLike
from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS
from .types import PromptWithMultiModalInput, RunnerOutput
def run_test(
*,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: list[PromptWithMultiModalInput],
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
enforce_eager: bool,
max_model_len: int,
max_num_seqs: int,
hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
auto_cls: type[_BaseAutoModelClass],
use_tokenizer_eos: bool,
comparator: Callable[..., None],
get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None,
stop_str: list[str] | None,
limit_mm_per_prompt: dict[str, int],
vllm_runner_kwargs: dict[str, Any] | None,
hf_model_kwargs: dict[str, Any] | None,
patch_hf_runner: Callable[[HfRunner], HfRunner] | None,
runner: RunnerOption = "auto",
distributed_executor_backend: str | None = None,
tensor_parallel_size: int = 1,
vllm_embeddings: torch.Tensor | None = None,
):
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
limit_mm_per_prompt = default_limits | limit_mm_per_prompt
vllm_outputs_per_mm = []
hf_outputs_per_mm = []
    # NOTE: take care of the order: run vLLM first, and then run HF.
    # vLLM needs a fresh process without CUDA initialization;
    # if we run HF first, CUDA will already be initialized, which
    # breaks the multiprocessing backend with the fork method (the default).
vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0}
if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
if model_info.tokenizer_mode:
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if model_info.require_embed_inputs:
for k in ("skip_tokenizer_init", "enable_prompt_embeds", "enable_mm_embeds"):
vllm_runner_kwargs_[k] = model_info.require_embed_inputs
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(
model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_,
) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
for prompts, image_data, video_data, audio_data in vllm_inputs:
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
vllm_output = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
**vllm_kwargs_with_mm_data,
)
vllm_outputs_per_mm.append(vllm_output)
hf_model = hf_runner(
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
)
# Some models need to patch things like the model processor, e.g., internvl
if patch_hf_runner is not None:
hf_model = patch_hf_runner(hf_model)
with hf_model, torch.no_grad():
tokenizer = hf_model.tokenizer
        # Some models need the eos_token_id passed explicitly from the
        # tokenizer or processor for a good comparison; we currently assume
        # the processor and tokenizer agree on the EOS, and pull it from
        # the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
if stop_str:
hf_kwargs["stop_strings"] = stop_str
for prompts, image_data, video_data, audio_data in inputs:
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
hf_kwargs_with_mm_data = hf_kwargs | mm_data
hf_output = hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
tokenizer=tokenizer,
**hf_kwargs_with_mm_data,
)
hf_outputs_per_mm.append(hf_output)
    # Apply output processing / sanitization to the vLLM and HF runner results
hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
model,
first_runner_outputs=hf_outputs_per_mm,
second_runner_outputs=vllm_outputs_per_mm,
first_runner_processor=hf_output_post_proc,
second_runner_processor=vllm_output_post_proc,
)
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
# This is usually check_logprobs_close, but it's passed through to
# allow things like check_outputs_equal where needed
comparator(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
def process_runner_outputs(
model,
first_runner_outputs,
second_runner_outputs,
first_runner_processor=None,
second_runner_processor=None,
):
"""Applies the runner processor(s) to the runner outputs, if any."""
if first_runner_processor is not None:
first_runner_outputs = process_outputs(
first_runner_processor, model, first_runner_outputs
)
if second_runner_processor is not None:
second_runner_outputs = process_outputs(
second_runner_processor, model, second_runner_outputs
)
return first_runner_outputs, second_runner_outputs
def process_outputs(output_processor, model, outputs_per_image):
"""Applies a model specific post-processor function to a runner's output"""
return [
[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image
]
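# A minimal sketch of the post-processing pass above; the processor and
# outputs are illustrative.
def _example_process_outputs() -> None:
    def upper(res, model):
        return res.upper()

    out = process_outputs(upper, "some-model", [["a", "b"], ["c"]])
    assert out == [["A", "B"], ["C"]]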


@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
from collections.abc import Callable
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs
from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
stop_sign = IMAGE_ASSETS[0].pil_image
cherry_blossom = IMAGE_ASSETS[1].pil_image
# Apply the selected formatter to the base prompts
img_prompts = [
"<image><image>\nDescribe 2 images.",
"<image><image>\nDescribe 2 images.",
"<image><image><image><image>\nDescribe 4 images.",
"<image>\nWhat is the season?",
]
formatted_prompts = [formatter(prompt) for prompt in img_prompts]
aspect_ratio_images = [
[stop_sign, cherry_blossom],
# Images with different sizes and aspect-ratios
[
rescale_image_size(stop_sign, 0.1),
stop_sign,
],
[
stop_sign,
rescale_image_size(stop_sign, 0.25),
cherry_blossom.resize((183, 488)),
cherry_blossom.resize((488, 183)),
],
cherry_blossom,
]
return [
PromptWithMultiModalInput(
prompts=formatted_prompts,
image_data=aspect_ratio_images,
)
]
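# A minimal usage sketch of the builder above; the identity formatter is an
# illustrative stand-in for a model-specific one (running this loads the
# test image assets).
def _example_multi_image_multi_aspect_ratio_inputs() -> None:
    (case,) = multi_image_multi_aspect_ratio_inputs(lambda prompt: prompt)
    assert len(case.prompts) == len(case.image_data) == 4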
def multi_video_multi_aspect_ratio_inputs(
formatter: Callable[[str], str], num_frames: int = 16
):
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
    Args:
        formatter: model-specific prompt formatter.
        num_frames: number of frames to sample from the test video.
    """
video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
# Apply the selected formatter to the base prompts
video_prompts = [
"<video><video>\nDescribe 2 videos.",
"<video><video>\nDescribe 2 videos.",
"<video><video><video><video>\nDescribe 4 videos.",
"<video>\nWhy is this video funny?",
]
formatted_prompts = [formatter(prompt) for prompt in video_prompts]
aspect_ratio_videos = [
[video, video],
# Videos with different sizes and aspect-ratios
[
rescale_video_size(video, 0.1),
video,
],
[
video,
rescale_video_size(video, 0.25),
resize_video(video, (183, 488)),
resize_video(video, (488, 183)),
],
video,
]
return [
PromptWithMultiModalInput(
prompts=formatted_prompts,
video_data=aspect_ratio_videos,
)
]
def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = (
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
)
single_img_prompts = [
"<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?",
]
multi_img_prompts = [
"Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n", # noqa: E501
]
formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
return [
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
]
def windows_attention_image_qwen2_5_vl():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
image = ImageAsset("hato").pil_image
question = "Describe the image."
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
prompt = (
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
)
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
return build_single_image_inputs([image], [prompt], wrapped_sf)
def video_with_metadata_glm4_1v():
video_array = VIDEO_ASSETS[0].np_ndarrays
metadata = VIDEO_ASSETS[0].metadata
question = "Describe the video."
video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
scales = [0.1, 0.2, 0.25]
video_input = [
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
]
prompts = [formatted_prompt] * len(video_input)
return [
PromptWithMultiModalInput(
prompts=prompts,
video_data=video_input,
)
]

File diff suppressed because it is too large


@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from pathlib import PosixPath
from .....conftest import (
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_single_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_multi_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_multi_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": len(image_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_embedding_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper
)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
vllm_embeddings=vllm_embeddings,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_video_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
video_assets: VideoTestAssets,
):
assert test_case.size_wrapper is not None
assert test_case.num_video_frames is not None
inputs = builders.build_video_inputs_from_test_info(
model_test_info,
video_assets,
test_case.size_wrapper,
test_case.num_video_frames,
test_case.needs_video_metadata,
)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"video": len(video_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_audio_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"audio": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_custom_inputs_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
):
    # Custom test cases can provide inputs directly, but they need to
    # explicitly provide CustomTestOptions, which wrap the inputs and
    # the limit_mm_per_prompt
assert test_case.custom_test_opts is not None
inputs = test_case.custom_test_opts.inputs
limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
    # Inputs and limit_mm_per_prompt should both be set
assert inputs is not None
assert limit_mm_per_prompt is not None
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt=limit_mm_per_prompt,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs(),
)


@@ -0,0 +1,218 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Types for writing multimodal model tests."""
from collections.abc import Callable, Iterable
from enum import Enum
from pathlib import PosixPath
from typing import Any, NamedTuple
import torch
from pytest import MarkDecorator
from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
from vllm.logprobs import SampleLogprobs
from vllm.tokenizers import TokenizerLike
from .....conftest import (
AUDIO_ASSETS,
IMAGE_ASSETS,
HfRunner,
ImageAsset,
ImageTestAssets,
PromptAudioInput,
PromptImageInput,
PromptVideoInput,
)
from ....utils import check_logprobs_close
# Meta multimodal tags; each placeholder below is replaced by the
# appropriate tag for the model under test
TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
}
)
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
{
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
}
)
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
class PromptWithMultiModalInput(NamedTuple):
"""Holds the multimodal input for a single test case."""
prompts: list[str]
image_data: PromptImageInput | None = None
video_data: PromptVideoInput | None = None
audio_data: PromptAudioInput | None = None
class VLMTestType(Enum):
IMAGE = 1
MULTI_IMAGE = 2
EMBEDDING = 3
VIDEO = 4
AUDIO = 5
CUSTOM_INPUTS = 6
class SizeType(Enum):
SIZE_FACTOR = 1
FIXED_SIZE = 2
class CustomTestOptions(NamedTuple):
inputs: list[PromptWithMultiModalInput]
limit_mm_per_prompt: dict[str, int]
class ImageSizeWrapper(NamedTuple):
type: SizeType
    # A size-factor wrapper holds zero or more floats, while a fixed-size
    # wrapper holds an iterable of (width, height) integer pairs
data: Iterable[float] | Iterable[tuple[int, int]]
class VLMTestInfo(NamedTuple):
"""Holds the configuration for 1+ tests for one model architecture."""
models: list[str]
test_type: VLMTestType | Iterable[VLMTestType]
# Should be None only if this is a CUSTOM_INPUTS test
prompt_formatter: Callable[[str], str] | None = None
img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"
# Most models work on the single / multi-image prompts above, but in some
# cases the log prob check fails, e.g., for paligemma. We allow passing
# an override for the single image prompts / multi-image prompt for this
# reason.
single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT
# Function for converting ImageAssets to image embeddings;
# We need to define this explicitly for embedding tests
convert_assets_to_embeddings: (
Callable[[ImageTestAssets], list[torch.Tensor]] | None
) = None
    # Exposed options for the vLLM runner; we change these in several tests,
# but the defaults are derived from VllmRunner & the engine defaults
# These settings are chosen to avoid OOMs when running in the CI
enforce_eager: bool = True
max_model_len: int = 1024
max_num_seqs: int = 256
runner: RunnerOption = "auto"
tensor_parallel_size: int = 1
vllm_runner_kwargs: dict[str, Any] | None = None
# Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None = None
# Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer
stop_str: list[str] | None = None
# Exposed options for HF runner
hf_model_kwargs: dict[str, Any] | None = None
# Indicates we should explicitly pass the EOS from the tokenizer
use_tokenizer_eos: bool = False
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
patch_hf_runner: Callable[[HfRunner], HfRunner] | None = None
    # Post processors that, if defined, will run on the outputs of the
    # vLLM and HF runner, respectively (useful for sanitization, etc.)
vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
# Consumes the output of the callables above and checks if they're equal
comparator: Callable[..., None] = check_logprobs_close
# Default expandable params per test; these defaults can be overridden in
# instances of this object; the complete set of test cases for the model
# is all combinations of .models + all fields below
max_tokens: int = 128
num_logprobs: int = 5
dtype: str = "auto"
distributed_executor_backend: str | None = None
# Only expanded in video tests
num_video_frames: int | tuple[int] = 16
needs_video_metadata: bool = False
# Fixed image sizes / image size factors; most tests use image_size_factors
# The values provided for these two fields will be stacked and expanded
# such that each model will consider each image size factor / image size
    # once per test (much like concatenating and wrapping in one parametrize
# call)
image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
image_sizes: Iterable[Iterable[tuple[int, int]]] | None = None
    # Hack for updating a prompt to encode a local path; currently only used
    # for Qwen-VL, which requires encoding the image path / URL into the
    # prompt for the HF runner
prompt_path_encoder: (
Callable[[PosixPath, str, list[ImageAsset] | ImageTestAssets], str] | None
) = None # noqa: E501
# Allows configuring a test to run with custom inputs
custom_test_opts: list[CustomTestOptions] | None = None
marks: list[MarkDecorator] | None = None
def get_non_parametrized_runner_kwargs(self):
"""Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized
test cases.
"""
return {
"enforce_eager": self.enforce_eager,
"max_model_len": self.max_model_len,
"max_num_seqs": self.max_num_seqs,
"runner": self.runner,
"tensor_parallel_size": self.tensor_parallel_size,
"vllm_runner_kwargs": self.vllm_runner_kwargs,
"hf_output_post_proc": self.hf_output_post_proc,
"vllm_output_post_proc": self.vllm_output_post_proc,
"auto_cls": self.auto_cls,
"use_tokenizer_eos": self.use_tokenizer_eos,
"comparator": self.comparator,
"get_stop_token_ids": self.get_stop_token_ids,
"hf_model_kwargs": self.hf_model_kwargs,
"stop_str": self.stop_str,
"patch_hf_runner": self.patch_hf_runner,
}
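# A minimal sketch of a single-image test configuration; the model name and
# formatter are illustrative, not a real registered test entry.
def _example_vlm_test_info() -> None:
    info = VLMTestInfo(
        models=["some-org/some-vlm"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
    )
    kwargs = info.get_non_parametrized_runner_kwargs()
    assert kwargs["max_model_len"] == 1024  # CI-safe default from above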
class ExpandableVLMTestArgs(NamedTuple):
"""The expanded kwargs which correspond to a single test case."""
model: str
max_tokens: int
num_logprobs: int
dtype: str
distributed_executor_backend: str | None
# Sizes are used for everything except for custom input tests
size_wrapper: ImageSizeWrapper | None = None
# Video only
num_video_frames: int | None = None
needs_video_metadata: bool = False
# Custom inputs only
custom_test_opts: CustomTestOptions | None = None