Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,35 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_configure(config):
    """Disable Flash/MemEfficient SDP on ROCm to avoid HF
    Transformers accuracy issues.
    """
    if not current_platform.is_rocm():
        return
    # Granite Speech tests need the default SDP backends on ROCm, so
    # leave them untouched when those tests are being collected.
    exempt_patterns = ("test_granite_speech.py",)
    for arg in config.args:
        if any(pattern in str(arg) for pattern in exempt_patterns):
            return
    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
    # accuracy issues
    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
    warnings.warn(
        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
        "to avoid HuggingFace Transformers accuracy issues",
        UserWarning,
        stacklevel=1,
    )

View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import pytest
from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams
MODEL_NAME = "nvidia/audio-flamingo-3-hf"
def get_fixture_path(filename):
    """Return the absolute path of a fixture file under fixtures/audioflamingo3."""
    fixtures_dir = os.path.join(os.path.dirname(__file__), "../../fixtures/audioflamingo3")
    return os.path.join(fixtures_dir, filename)
@pytest.fixture(scope="module")
def llm():
    """Module-scoped Audio Flamingo 3 engine.

    Skips the module when the installed transformers version is too old
    or the model cannot be loaded in this environment.
    """
    # Check if the model is supported by the current transformers version
    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
    try:
        return LLM(
            model=MODEL_NAME,
            trust_remote_code=True,
            dtype="bfloat16",
            enforce_eager=True,
            limit_mm_per_prompt={"audio": 1},
        )
    except Exception as e:
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
def test_single_generation(llm):
    """Transcribe one audio clip and compare against the recorded fixture."""
    fixture_path = get_fixture_path("expected_results_single.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")
    with open(fixture_path) as f:
        expected = json.load(f)
    audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "audio_url", "audio_url": {"url": audio_url}},
                {"type": "text", "text": "Transcribe the input speech."},
            ],
        }
    ]
    outputs = llm.chat(
        messages=conversation,
        sampling_params=SamplingParams(temperature=0.0, max_tokens=128),
    )
    generated = outputs[0].outputs[0].text.strip()
    reference = expected["transcriptions"][0]
    # Substring check in either direction tolerates minor punctuation or
    # truncation differences between runs.
    assert reference in generated or generated in reference
def test_batched_generation(llm):
    """Run two audio QA prompts in one batch and compare each answer
    against the recorded fixture.
    """
    fixture_path = get_fixture_path("expected_results_batched.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")
    with open(fixture_path) as f:
        expected = json.load(f)
    # Each item pairs a remote audio asset with a question about it.
    # NOTE(review): answers are read from expected["transcriptions"] below,
    # although these are QA responses rather than transcriptions — confirm
    # the fixture's key naming.
    items = [
        {
            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
            "question": "What is surprising about the relationship "
            "between the barking and the music?",
            "expected_idx": 0,
        },
        {
            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
            "question": (
                "Why is the philosopher's name mentioned in the lyrics? "
                "(A) To express a sense of nostalgia "
                "(B) To indicate that language cannot express clearly, "
                "satirizing the inversion of black and white in the world "
                "(C) To add depth and complexity to the lyrics "
                "(D) To showcase the wisdom and influence of the philosopher"
            ),
            "expected_idx": 1,
        },
    ]
    # Build one single-turn conversation per item; llm.chat accepts a list
    # of conversations and generates for them as a batch.
    conversations = []
    for item in items:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
                    {"type": "text", "text": item["question"]},
                ],
            }
        ]
        conversations.append(messages)
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
    outputs = llm.chat(
        messages=conversations,
        sampling_params=sampling_params,
    )
    # Outputs come back in input order; compare each against its fixture
    # entry, allowing containment in either direction to tolerate minor
    # wording drift.
    for i, output in enumerate(outputs):
        generated_text = output.outputs[0].text.strip()
        expected_text = expected["transcriptions"][i]
        assert expected_text in generated_text or generated_text in expected_text

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import pytest
from transformers import AutoModelForSpeechSeq2Seq
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|><|audio|>can you transcribe the speech into a written format?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, SampleLogprobs | None],
) -> tuple[list[int], str, SampleLogprobs | None]:
    """Sanitize hf output to be comparable with vllm output."""
    token_ids, text, logprobs = vllm_output
    # HF keeps the EOS marker in its decoded text; append it so the
    # string comparison lines up.
    return token_ids, text + "<|end_of_text|>", logprobs
MODEL_NAME = "ibm-granite/granite-speech-3.3-2b"
# Audio lora co-exists directly in the model directory, but
# currently still needs to be passed directly to vLLM.
audio_lora_path = MODEL_NAME
models = [MODEL_NAME]
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
    """Force the Triton attention backend for every test when on ROCm."""
    if not current_platform.is_rocm():
        return
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptAudioInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Inference result should be the same between hf and vllm.
    All the audio fixtures for the test are from AUDIO_ASSETS.
    For huggingface runner, we provide the audio as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=max_model_len,
        max_num_seqs=1,
        dtype=dtype,
        limit_mm_per_prompt={"audio": 1},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=64,
        enforce_eager=True,
    ) as vllm_model:
        # The audio LoRA lives inside the model repo itself (see
        # audio_lora_path above) but must be registered explicitly.
        lora_request = LoRARequest("audio", 1, audio_lora_path)
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
                lora_request=lora_request,
            )
            for prompts, audios in inputs
        ]
    # HF reference run; pass the tokenizer's EOS id so both sides stop
    # at the same token.
    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
        hf_processor = hf_model.processor
        eos_token_id = hf_processor.tokenizer.eos_token_id
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=[audios],
                eos_token_id=eos_token_id,
            )
            for prompts, audios in inputs
        ]
    # vLLM outputs get the HF-style <|end_of_text|> suffix appended before
    # the logprob comparison (see vllm_to_hf_output).
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
)
@pytest.mark.parametrize(
    "max_model_len", [512] if current_platform.is_rocm() else [2048]
)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    hf_runner,
    vllm_runner,
    model: str,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare HF and vLLM greedy outputs for Granite Speech on one
    audio asset (ROCm uses smaller float16/512 settings, see parametrize).
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
    audio, sr = audio_assets[0].audio_and_sample_rate
    # This model expects 16k sample rate, which our test audio
    # already is; if this changes, it may break this test,
    # so we check it directly
    assert sr == 16000
    run_test(
        hf_runner,
        vllm_runner,
        [
            ([HF_AUDIO_PROMPT], [audio]),
        ],
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )

View File

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
def base_prompt(modalities_str: str) -> str:
    """Build the chat-formatted user prompt embedding the given modality tokens."""
    prefix = "<|im_start|>user "
    suffix = "\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n"
    return prefix + modalities_str + suffix
INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
    """
    This is a simple test to check if interleaved and non-interleaved prompts
    give the same result.
    """
    image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
    images = [image_cherry, image_stop]
    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
    # Same two images + one video for both cases; only the placeholder
    # order in the prompt differs.
    inputs = [
        (
            [INTERLEAVED_PROMPT],
            [images],
            [video],
        ),
        (
            [NONINTERLEAVED_PROMPT],
            [images],
            [video],
        ),
    ]
    with vllm_runner(
        model,
        runner="generate",
        dtype=dtype,
        limit_mm_per_prompt={"image": 2},
        max_model_len=32768,
        max_num_seqs=2,
        tensor_parallel_size=1,
        enforce_eager=True,
    ) as vllm_model:
        vllm_outputs_per_case = [
            vllm_model.generate_greedy(
                prompts, max_tokens, images=images, videos=videos
            )
            for prompts, images, videos in inputs
        ]
    # Each result string is the full decoded text (prompt + completion);
    # find the end of the "assistant\n" marker to locate where generation
    # starts.
    all_results = [output[0][1] for output in vllm_outputs_per_case]
    outputs = [
        (total_str, total_str.find("assistant\n") + len("assistant\n"))
        for total_str in all_results
    ]
    prompt_lengths = [prompt_len for _, prompt_len in outputs]
    generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
    interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
    interleaved_output_str, noninterleaved_output_str = generated_strs
    # The two prompts are identical except for the order of modality tokens.
    assert interleaved_prompt_len == noninterleaved_prompt_len
    # The two generated strings should be different because of the
    # interleaved modality tokens.
    assert interleaved_output_str != noninterleaved_output_str

View File

@@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple
import pytest
from PIL.Image import Image
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.multimodal.utils import encode_image_base64
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
QUESTION = "What is the content of each image?"
class ModelRequestData(NamedTuple):
    """Everything needed to issue one multimodal generation request."""

    # Engine construction arguments for the target model.
    engine_args: EngineArgs
    # Fully rendered chat prompt (template already applied).
    prompt: str
    # Images referenced by the prompt, in placeholder order.
    image_data: list[Image]
    # Optional extra stop-token ids for generation.
    stop_token_ids: list[int] | None = None
    # Optional custom chat template (None = model default).
    chat_template: str | None = None
    # Optional sampling overrides.
    sampling_params: SamplingParams | None = None
@pytest.mark.core_model
@pytest.mark.parametrize("question", [QUESTION])
def test_keye_vl(
    image_assets,
    question: str,
):
    """Smoke test: Keye-VL answers a multi-image question with a
    non-trivial amount of text.
    """
    images = [asset.pil_image for asset in image_assets]
    # Images are embedded as base64 data URLs in the chat template, but the
    # raw PIL images are passed to the engine as multi_modal_data below.
    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
    ]
    engine_args = EngineArgs(
        model=MODEL_NAME,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]
    # Render the prompt with the model's own chat template.
    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Pin the seed on top of the dataclass args for reproducibility.
    engine_args = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_args)
    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=None
    )
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=sampling_params,
    )
    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        # Loose sanity bound: any real answer is longer than 10 chars.
        assert len(generated_text) > 10, (
            f"Generated text is too short: {generated_text}"
        )
    print("-" * 50)

View File

@@ -0,0 +1,723 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Create a reduced-layer version of the Maverick model for testing purposes.
This script creates a new model with fewer layers by:
1. Loading the original Maverick model configuration
2. Creating a reduced configuration
3. Generating compatible safetensors files with appropriate weights
4. Creating the necessary index files for vLLM compatibility
"""
import json
import shutil
from pathlib import Path
from typing import Any
import pytest
import torch
from safetensors.torch import save_file
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
from ....utils import multi_gpu_test
# Sample prompts for testing
PROMPTS: list[str] = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
def run_maverick_serving(model: str):
    """Test Llama-4-Maverick model with vLLM LLM class using CLI equivalent
    options with reduced layers.
    """
    try:
        params = SamplingParams(temperature=0.8, top_p=0.95)
        engine = LLM(
            model=model,
            max_model_len=2048,
            enforce_eager=True,
            tensor_parallel_size=8,
            enable_expert_parallel=True,
            trust_remote_code=True,
            gpu_memory_utilization=0.4,
            kv_cache_dtype="fp8",
        )
        results = engine.generate(PROMPTS, params)
    except Exception as e:
        print(f"Error initializing or running model: {e}")
        raise
    # Print the outputs
    print("\nGenerated Outputs:\n" + "-" * 60)
    for result in results:
        print(f"Prompt: {result.prompt!r}")
        print(f"Output: {result.outputs[0].text!r}")
        print("-" * 60)
def get_rope_layers_config(model_path: str) -> list[int]:
    """
    Get the interleaved RoPE configuration from HuggingFace config
    Args:
        model_path: Path to the local directory containing the reduced
            Maverick model checkpoint
    Returns:
        List of 0 or 1 indicating whether each layer uses RoPE and local attn
        0 indicates that RoPE is not used while 1 indicates that RoPE is used.
    """
    config_file = Path(model_path) / "config.json"
    full_config = json.loads(config_file.read_text())
    no_rope_layers = full_config["text_config"]["no_rope_layers"]
    print(f"Found no_rope_layers: {no_rope_layers}")
    return no_rope_layers
def create_reduced_maverick_model(
    original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    output_dir: str = "/tmp/reduced_maverick",
    text_layers: int = 4,
    num_experts: int = 4,
    vision_layers: int = 2,
    force_recreate: bool = False,
) -> str:
    """
    Create a reduced-layer version of the Maverick model.
    Args:
        original_model_name: Name of the original Maverick model
        output_dir: Directory to save the reduced model
        text_layers: Number of text transformer layers
        num_experts: Number of experts per layer
        vision_layers: Number of vision transformer layers
        force_recreate: Whether to recreate if output_dir already exists
    Returns:
        Path to the created reduced model directory
    """
    print(
        f"Creating reduced Maverick model with {text_layers} text layers and "
        f"{vision_layers} vision layers..."
    )
    # Create output directory
    output_path = Path(output_dir)
    if output_path.exists():
        if force_recreate:
            shutil.rmtree(output_path)
        else:
            # Reuse the cached reduction; nothing is validated here, so a
            # stale directory must be cleared with force_recreate.
            print(
                f"Output directory {output_dir} already exists. "
                "Use --force-recreate to overwrite."
            )
            return str(output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    try:
        # Pipeline: config -> tokenizer -> synthetic weights -> processor,
        # each helper writing into output_path.
        print("Loading original model configuration...")
        original_config = AutoConfig.from_pretrained(
            original_model_name, trust_remote_code=True
        )
        print("Creating reduced configuration...")
        reduced_config = create_reduced_config(
            original_config, text_layers, num_experts, vision_layers
        )
        config_path = output_path / "config.json"
        with open(config_path, "w") as f:
            json.dump(reduced_config, f, indent=2)
        print(f"Saved reduced config to {config_path}")
        print("Copying tokenizer files...")
        copy_tokenizer_files(original_model_name, output_path)
        print("Creating reduced safetensors files...")
        create_reduced_safetensors(original_config, reduced_config, output_path)
        print("Creating preprocessor config...")
        create_preprocessor_config(original_config, output_path)
        # Generation config is optional; a failure here is non-fatal.
        try:
            gen_config = GenerationConfig.from_pretrained(original_model_name)
            gen_config.save_pretrained(output_path)
            print("Copied generation config")
        except Exception as e:
            print(f"Could not copy generation config: {e}")
        print(f"Successfully created reduced Maverick model at {output_path}")
        return str(output_path)
    except Exception as e:
        print(f"Error creating reduced model: {e}")
        # Clean up on failure
        if output_path.exists():
            shutil.rmtree(output_path)
        raise
def create_reduced_config(
    original_config: Any, text_layers: int, num_experts: int, vision_layers: int
) -> dict[str, Any]:
    """Create a reduced configuration based on the original.

    Shrinks layer counts, expert count, and hidden dimensions of the
    text config, and the layer count of the vision config.
    """
    config_dict = original_config.to_dict()
    text_cfg = config_dict.get("text_config")
    if text_cfg is not None:
        # Fewer transformer layers (and matching layer_types prefix).
        original_text_layers = text_cfg["num_hidden_layers"]
        text_cfg["num_hidden_layers"] = text_layers
        text_cfg["layer_types"] = text_cfg["layer_types"][:text_layers]
        print(f"Reduced text layers from {original_text_layers} to {text_layers}")
        # Fewer MoE experts per layer.
        original_num_experts = text_cfg["num_local_experts"]
        text_cfg["num_local_experts"] = num_experts
        print(f"Reduced num experts from {original_num_experts} to {num_experts}")
        # Shrink hidden size and head dim by the same factor so the
        # head count stays consistent.
        hidden_dim_divisor = 4
        original_hidden_size = text_cfg["hidden_size"]
        new_hidden_size = original_hidden_size // hidden_dim_divisor
        text_cfg["hidden_size"] = new_hidden_size
        print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
        original_head_dim = text_cfg["head_dim"]
        new_head_dim = original_head_dim // hidden_dim_divisor
        text_cfg["head_dim"] = new_head_dim
        print(f"Reduced head dim from {original_head_dim} to {new_head_dim}")
    vision_cfg = config_dict.get("vision_config")
    if vision_cfg is not None:
        original_vision_layers = vision_cfg["num_hidden_layers"]
        vision_cfg["num_hidden_layers"] = vision_layers
        print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
    # Update model name to indicate it's a reduced version
    config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
    return config_dict
def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
    """Copy tokenizer files from the original model.

    Best-effort: a failure only prints a warning so model creation can
    continue.
    """
    try:
        AutoTokenizer.from_pretrained(
            original_model_name, trust_remote_code=True
        ).save_pretrained(output_path)
        print("Tokenizer files copied successfully")
    except Exception as e:
        print(f"Warning: Could not copy tokenizer files: {e}")
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
    """Create preprocessor_config.json for multimodal model.

    Unlike the tokenizer copy, a failure here is fatal and re-raised.
    """
    # Fall back to the canonical model name when the config carries none.
    source = (
        original_config._name_or_path
        or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
    )
    try:
        processor = AutoProcessor.from_pretrained(source, trust_remote_code=True)
        processor.save_pretrained(output_path)
        print("Copied original preprocessor config")
    except Exception as e:
        print(f"Could not copy original preprocessor config: {e}")
        raise
def create_reduced_safetensors(
    original_config: Any, reduced_config: dict[str, Any], output_path: Path
) -> None:
    """Create safetensors files with weights for the reduced model.

    Generates random/constant tensors shaped by the reduced config for the
    text tower, vision tower, and the vision-language connector, then
    shards and writes them with an index file.
    """
    print("Generating synthetic weights for reduced model...")
    text_config = reduced_config["text_config"]
    vision_config = reduced_config["vision_config"]
    weights = {}
    print("Creating text model weights...")
    weights.update(create_text_model_weights(text_config))
    print("Creating vision model weights...")
    weights.update(create_vision_model_weights(vision_config))
    print("Creating shared model weights...")
    weights.update(create_shared_weights(text_config, vision_config))
    print("Saving weights to safetensors files...")
    save_weights_to_safetensors(weights, output_path)
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the text model with MoE structure.

    All projection weights use the ``nn.Linear`` layout of
    ``(out_features, in_features)``.

    Args:
        text_config: The ``text_config`` section of the reduced model config.

    Returns:
        Mapping from checkpoint parameter name to synthetic tensor.
    """
    weights = {}
    vocab_size = text_config["vocab_size"]
    hidden_size = text_config["hidden_size"]
    intermediate_size = text_config["intermediate_size"]
    intermediate_size_mlp = text_config["intermediate_size_mlp"]
    num_layers = text_config["num_hidden_layers"]
    num_attention_heads = text_config["num_attention_heads"]
    num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
    # MoE specific parameters
    num_experts = text_config.get("num_local_experts")
    assert num_experts is not None, "num_local_experts must be specified for MoE"
    # NOTE(review): derived from hidden_size rather than the config's explicit
    # "head_dim" entry; the two coincide for configs produced by
    # create_reduced_config — confirm if used with other configs.
    head_dim = hidden_size // num_attention_heads
    # Feed-forward pattern based on interleave_moe_layer_step. For
    # interleave_moe_layer_step=2: layers 1,3,5,... are MoE and layers
    # 0,2,4,... are dense. Loop-invariant, so computed once up front.
    interleave_step = text_config.get("interleave_moe_layer_step", 1)
    # Embedding layers
    weights["language_model.model.embed_tokens.weight"] = torch.randn(
        vocab_size, hidden_size, dtype=torch.float16
    )
    # Transformer layers
    for layer_idx in range(num_layers):
        layer_prefix = f"language_model.model.layers.{layer_idx}"
        print(f"Creating weights for layer {layer_prefix}...")
        # Self-attention weights (separate q, k, v projections), all shaped
        # (out_features, in_features). Fixed: q_proj/k_proj were previously
        # generated transposed; with grouped KV heads
        # (num_key_value_heads < num_attention_heads) k_proj ended up with
        # the wrong shape entirely.
        weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
            num_attention_heads * head_dim, hidden_size, dtype=torch.bfloat16
        )
        weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
            num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
        )
        weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
            num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
        )
        weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
        )
        print("Self-attention weights created.")
        is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
        if is_moe_layer:
            # MoE layer structure
            # 1. Router weights
            weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
                num_experts, hidden_size, dtype=torch.float16
            )
            # 2. Individual expert weights (not fused)
            for expert_idx in range(num_experts):
                expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
                weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
                    intermediate_size, hidden_size, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
                    intermediate_size, hidden_size, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
                    hidden_size, intermediate_size, dtype=torch.bfloat16
                )
                # Expert weight scales (FP8 quantization)
                weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
                    intermediate_size, 1, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
                    intermediate_size, 1, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
                    hidden_size, 1, dtype=torch.bfloat16
                )
            # 3. Shared expert weights
            shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
            weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
                intermediate_size, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
                intermediate_size, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
                hidden_size, intermediate_size, dtype=torch.bfloat16
            )
            print(f"MoE feed-forward weights created for layer {layer_idx}.")
        else:
            # Dense layer structure (uses the wider MLP intermediate size)
            weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
                hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
            )
            print(f"Dense feed-forward weights created for layer {layer_idx}.")
        # Layer norms
        weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
            hidden_size, dtype=torch.bfloat16
        )
        weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
            hidden_size, dtype=torch.bfloat16
        )
        print("Layer norms created.")
    # Final layer norm and output projection
    weights["language_model.model.norm.weight"] = torch.ones(
        hidden_size, dtype=torch.bfloat16
    )
    weights["language_model.lm_head.weight"] = torch.randn(
        vocab_size, hidden_size, dtype=torch.bfloat16
    )
    return weights
def create_vision_model_weights(
    vision_config: dict[str, Any],
) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the vision model."""
    hidden = vision_config["hidden_size"]
    inter = vision_config["intermediate_size"]
    weights: dict[str, torch.Tensor] = {}
    for idx in range(vision_config["num_hidden_layers"]):
        prefix = f"vision_model.model.layers.{idx}"
        # Attention projections: square weight + zero bias for each of
        # q/k/v/o.
        for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
            weights[f"{prefix}.self_attn.{proj}.weight"] = torch.randn(
                hidden, hidden, dtype=torch.bfloat16
            )
            weights[f"{prefix}.self_attn.{proj}.bias"] = torch.zeros(
                hidden, dtype=torch.bfloat16
            )
        # Two-layer MLP: fc1 expands, fc2 contracts.
        weights[f"{prefix}.mlp.fc1.weight"] = torch.randn(
            inter, hidden, dtype=torch.bfloat16
        )
        weights[f"{prefix}.mlp.fc1.bias"] = torch.zeros(inter, dtype=torch.bfloat16)
        weights[f"{prefix}.mlp.fc2.weight"] = torch.randn(
            hidden, inter, dtype=torch.bfloat16
        )
        weights[f"{prefix}.mlp.fc2.bias"] = torch.zeros(hidden, dtype=torch.bfloat16)
        # Layer norms: identity weight, zero bias.
        for norm in ("input_layernorm", "post_attention_layernorm"):
            weights[f"{prefix}.{norm}.weight"] = torch.ones(
                hidden, dtype=torch.bfloat16
            )
            weights[f"{prefix}.{norm}.bias"] = torch.zeros(
                hidden, dtype=torch.bfloat16
            )
    return weights
def create_shared_weights(
    text_config: dict[str, Any], vision_config: dict[str, Any]
) -> dict[str, torch.Tensor]:
    """Create weights for shared components (vision-language connector)"""
    # Single linear projection from vision features into the text
    # embedding space.
    return {
        "multi_modal_projector.linear_1.weight": torch.randn(
            text_config["hidden_size"],
            vision_config["projector_input_dim"],
            dtype=torch.bfloat16,
        )
    }
def save_weights_to_safetensors(
    weights: dict[str, torch.Tensor], output_path: Path
) -> None:
    """Save weights to safetensors files and create index.

    Tensors are greedily packed into shards of at most 5GB, written either
    as a single ``model.safetensors`` or as numbered shards, and a
    ``model.safetensors.index.json`` mapping each weight to its file is
    emitted alongside them.
    """
    # Determine how to shard the weights
    max_shard_size = 5 * 1024 * 1024 * 1024  # 5GB per shard
    # Calculate sizes and create shards
    shards = []
    current_shard: dict[str, torch.Tensor] = {}
    current_size = 0
    for name, tensor in weights.items():
        tensor_size = tensor.numel() * tensor.element_size()
        if current_size + tensor_size > max_shard_size and current_shard:
            shards.append(current_shard)
            current_shard = {}
            current_size = 0
        current_shard[name] = tensor
        current_size += tensor_size
    if current_shard:
        shards.append(current_shard)
    # Save shards and create index
    weight_map = {}
    if len(shards) == 1:
        # Single file
        filename = "model.safetensors"
        save_file(shards[0], output_path / filename)
        weight_map = {name: filename for name in shards[0]}
        # Fixed: previously printed a literal "(unknown)" placeholder
        # instead of the written file.
        print(f"Saved weights to single file: {output_path / filename}")
    else:
        # Multiple shards
        for i, shard in enumerate(shards):
            filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
            save_file(shard, output_path / filename)
            for name in shard:
                weight_map[name] = filename
            print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
    # Create index file
    index_data = {
        "metadata": {
            "total_size": sum(
                tensor.numel() * tensor.element_size() for tensor in weights.values()
            )
        },
        "weight_map": weight_map,
    }
    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as f:
        json.dump(index_data, f, indent=2)
    print(f"Created index file: {index_path}")
    print(
        f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
    )
def check_attention_spec_interleaved_rope(
    llm: LLM,
    num_attention_layers: int,
    num_ranks: int,
    rope_layers: list[int],
):
    """Check that the attention spec is correct.

    Args:
        llm: Engine under test; must be run with V1 multiprocessing
            disabled so the in-process model executor is reachable.
        num_attention_layers: Expected number of attention layers per rank.
        num_ranks: Number of ranks whose KV-cache specs are inspected.
        rope_layers: Per-layer flags from the config's ``no_rope_layers``
            (see get_rope_layers_config): 0 = no RoPE, nonzero = RoPE.
    """
    assert isinstance(llm.llm_engine.model_executor, Executor)
    kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
    for rank in range(num_ranks):
        kv_cache_specs = kv_cache_specs_per_rank[rank]
        assert len(kv_cache_specs.keys()) == num_attention_layers
        for i in range(num_attention_layers):
            # Layers without RoPE must get a full-attention KV spec; RoPE
            # layers must get chunked local attention.
            if rope_layers[i] == 0:
                expected_spec = FullAttentionSpec
            else:
                expected_spec = ChunkedLocalAttentionSpec
            assert isinstance(
                kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
                expected_spec,
            )
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
    """Smoke-test generation on the reduced model, optionally under profiling."""
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
    if should_profile:
        llm.start_profile()
    outputs = llm.generate(PROMPTS, params)
    if should_profile:
        llm.stop_profile()
    print("Test generation successful!")
    # Echo each prompt/completion pair for debugging CI logs.
    for completion in outputs:
        print(f"Prompt: {completion.prompt}")
        print(f"Output: {completion.outputs[0].text}")
        print("-" * 40)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "original_model_name,text_layers,num_experts,vision_layers,",
    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
    monkeypatch,
    original_model_name: str,
    text_layers: int,
    num_experts: int,
    vision_layers: int,
    enforce_eager: bool,
    tp: int,
    ep: bool,
    output_dir: str = "/tmp/reduced_maverick",
    force_recreate: bool = True,
    profile: bool = False,
) -> None:
    """Create a shrunken Maverick checkpoint, load it with TP/EP enabled, and
    verify both the per-layer KV-cache specs and basic generation.
    """
    # Disable multiprocessing allows us to access model executor from LLM engine
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    model_path = create_reduced_maverick_model(
        original_model_name=original_model_name,
        output_dir=output_dir,
        text_layers=text_layers,
        num_experts=num_experts,
        vision_layers=vision_layers,
        force_recreate=force_recreate,
    )
    print(f"\nReduced model created successfully at: {model_path}")
    # Per-layer RoPE flags: 0 => full attention, else chunked local attention
    # (see check_attention_spec_interleaved_rope).
    rope_layers = get_rope_layers_config(model_path)
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        max_model_len=512,  # Small context for testing
        gpu_memory_utilization=0.3,  # Conservative memory usage
        enforce_eager=enforce_eager,
        tensor_parallel_size=tp,
        enable_expert_parallel=ep,
    )
    check_attention_spec_interleaved_rope(
        llm,
        text_layers,
        tp,
        rope_layers,
    )
    print(f"\nTesting reduced model at {model_path}...")
    run_reduced_model(llm=llm, should_profile=profile)
def main():
    """CLI entry point: create the reduced Maverick model and optionally test it."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Create a reduced-layer Maverick model"
    )
    parser.add_argument(
        "--output-dir",
        default="/tmp/reduced_maverick",
        help="Output directory for the reduced model",
    )
    parser.add_argument(
        "--text-layers",
        type=int,
        default=4,
        help="Number of text transformer layers",
    )
    parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
    parser.add_argument(
        "--vision-layers",
        type=int,
        default=2,
        help="Number of vision transformer layers",
    )
    parser.add_argument(
        "--force-recreate",
        action="store_true",
        help="Force recreation if output directory exists",
    )
    parser.add_argument(
        "--test", action="store_true", help="Test the created model with vLLM"
    )
    parser.add_argument(
        "--profile", action="store_true", help="Profile the created model with vLLM"
    )
    parser.add_argument(
        "--test-original",
        action="store_true",
        help="Test the original model with vLLM",
    )
    parser.add_argument(
        "--original-model",
        default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        help="Original model name to base the reduction on",
    )
    args = parser.parse_args()
    if args.test:
        # BUG FIX: test_dummy_maverick requires a positional `monkeypatch`
        # argument; the previous call omitted it and raised TypeError when the
        # script was run with --test. Build a standalone MonkeyPatch so the
        # env-var change is undone afterwards.
        monkeypatch = pytest.MonkeyPatch()
        try:
            test_dummy_maverick(
                monkeypatch,
                original_model_name=args.original_model,
                output_dir=args.output_dir,
                text_layers=args.text_layers,
                num_experts=args.num_experts,
                vision_layers=args.vision_layers,
                force_recreate=args.force_recreate,
                tp=2,
                ep=True,
                enforce_eager=True,
                profile=args.profile,
            )
        finally:
            monkeypatch.undo()
    if args.test_original:
        run_maverick_serving(args.original_model)


if __name__ == "__main__":
    exit(main())

View File

@@ -0,0 +1,180 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
from typing import Any, NamedTuple
import pytest
from huggingface_hub import hf_hub_download
from pytest import MarkDecorator
from transformers import AutoModelForImageTextToText
from tests.quantization.utils import is_quant_method_supported
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.utils.torch_utils import set_default_torch_num_threads
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
from ...utils import check_logprobs_close
class GGUFMMTestConfig(NamedTuple):
    """Configuration for one GGUF multimodal model comparison test."""

    original_model: str  # HF repo of the reference (non-GGUF) model
    gguf_repo: str  # HF repo hosting the GGUF artifacts
    gguf_backbone: str  # filename of the language-model GGUF file
    gguf_mmproj: str  # filename of the multimodal projector GGUF file
    prompt: list[str]
    image_names: list[str]  # Store names, load PIL images at runtime
    max_model_len: int = 4096
    # NOTE(review): mutable defaults are shared across instances; appears
    # harmless here since these fields are never mutated in the tests below.
    marks: list[MarkDecorator] = []
    mm_processor_kwargs: dict[str, Any] = {}

    @property
    def gguf_model(self):
        """Download mmproj and backbone from the hub; return the backbone path."""
        # Download the projector first so it is cached next to the backbone.
        hf_hub_download(self.gguf_repo, filename=self.gguf_mmproj)
        return hf_hub_download(self.gguf_repo, filename=self.gguf_backbone)
# Common prompts aligned with test_common.py "gemma3" entry format
_GEMMA3_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": (
            "<bos><start_of_turn>user\n"
            "<start_of_image>What's the content in the center of the image?"
            "<end_of_turn>\n<start_of_turn>model\n"
        ),
        "cherry_blossom": (
            "<bos><start_of_turn>user\n"
            "<start_of_image>What is the season?"
            "<end_of_turn>\n<start_of_turn>model\n"
        ),
    }
)
# Image asset names - load at runtime to avoid pickle issues with subprocess
_GEMMA3_IMAGE_NAMES = ["stop_sign", "cherry_blossom"]
# Regular multimodal (no pan-and-scan) - uses QAT Q4_0 GGUF
GEMMA3_CONFIG = GGUFMMTestConfig(
    original_model="google/gemma-3-4b-it",
    gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf",
    gguf_backbone="gemma-3-4b-it-q4_0.gguf",
    gguf_mmproj="mmproj-model-f16-4B.gguf",
    prompt=_GEMMA3_PROMPTS,
    image_names=_GEMMA3_IMAGE_NAMES,
    max_model_len=4096,
    marks=[pytest.mark.core_model],
    mm_processor_kwargs={},
)
# Pan-and-scan multimodal - uses unquantized BF16 GGUF
GEMMA3_CONFIG_PAN_AND_SCAN = GGUFMMTestConfig(
    original_model="google/gemma-3-4b-it",
    gguf_repo="unsloth/gemma-3-4b-it-GGUF",
    gguf_backbone="gemma-3-4b-it-BF16.gguf",
    gguf_mmproj="mmproj-BF16.gguf",
    prompt=_GEMMA3_PROMPTS,
    image_names=_GEMMA3_IMAGE_NAMES,
    max_model_len=4096,
    marks=[pytest.mark.core_model],
    mm_processor_kwargs={"do_pan_and_scan": True},
)
# All configs exercised by test_gemma3_mm_gguf below.
MODELS_TO_TEST = [GEMMA3_CONFIG, GEMMA3_CONFIG_PAN_AND_SCAN]
def run_multimodal_gguf_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    """Compare greedy logprobs of a GGUF checkpoint (vLLM) against HF."""
    # Load images at runtime (inside subprocess) to avoid pickle issues.
    assets = [ImageAsset(name).pil_image for name in model.image_names]
    scale_factors = [0.25, 0.5, 1.0]
    # One case per image: the same prompt repeated at several image scales.
    test_cases = []
    for asset, prompt in zip(assets, model.prompt):
        scaled = [rescale_image_size(asset, factor) for factor in scale_factors]
        test_cases.append(([prompt] * len(scale_factors), scaled))
    # NOTE: Run vLLM first to avoid CUDA init issues with multiprocessing fork.
    with (
        set_default_torch_num_threads(1),
        vllm_runner(
            model_name=model.gguf_model,
            enforce_eager=True,
            tokenizer_name=model.original_model,
            dtype=dtype,
            max_model_len=model.max_model_len,
            mm_processor_kwargs=model.mm_processor_kwargs,
        ) as gguf_model,
    ):
        gguf_outputs_per_case = []
        for prompts, case_images in test_cases:
            gguf_outputs_per_case.append(
                gguf_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                )
            )
    # Then run HfRunner for the HuggingFace baseline comparison.
    with hf_runner(
        model.original_model,
        dtype=dtype,
        auto_cls=AutoModelForImageTextToText,
    ) as hf_model:
        hf_outputs_per_case = []
        for prompts, case_images in test_cases:
            hf_outputs_per_case.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                )
            )
    for hf_outputs, gguf_outputs in zip(hf_outputs_per_case, gguf_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=gguf_outputs,
            name_0="hf",
            name_1="gguf",
        )
@pytest.mark.skipif(
    not is_quant_method_supported("gguf"),
    reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
    "model",
    [
        pytest.param(test_config, marks=test_config.marks)
        for test_config in MODELS_TO_TEST
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
def test_gemma3_mm_gguf(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Gemma-3 GGUF multimodal checkpoints should match the HF baseline."""
    run_multimodal_gguf_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        model=model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
    )

View File

@@ -0,0 +1,317 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Sequence
import librosa
import pytest
import regex as re
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from vllm.assets.image import ImageAsset
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode, rescale_image_size
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
        "cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
    }
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
    "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
)
# NOTE: downloads the full checkpoint at import time (requires network access).
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
# Example audio clip shipped inside the checkpoint, used as the spoken question.
speech_question = os.path.join(
    model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
):
    """Sanitize vllm output to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output
    # Strip the image placeholder tokens vLLM echoes back in the text.
    stripped = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
    assert stripped[0] == " "
    stripped = stripped[1:]
    hf_output_str = stripped + "<|end|><|endoftext|>"
    tokenizer = AutoTokenizer.from_pretrained(model)
    token_ids = tokenizer.encode(stripped)
    # HF prepends a BOS token (id 1); drop it before comparison.
    assert token_ids[0] == 1
    return token_ids[1:], hf_output_str, out_logprobs
# dtype used for both the HF baseline and the vLLM run in these tests
target_dtype = "half"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Inference result should be the same between hf and vllm.
    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=max_model_len,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=320,
        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
        enforce_eager=True,
    ) as vllm_model:
        # The vision adapter ships as a separate LoRA inside the checkpoint.
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                lora_request=lora_request,
            )
            for prompts, images, audios in inputs
        ]
    # This error occurs inside `get_peft_model`
    # FIXME: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/75
    # NOTE(review): this unconditional skip makes the HF comparison below
    # unreachable until the upstream issue is resolved.
    pytest.skip("HF impl is not compatible with current transformers")
    hf_model_kwargs = {"_attn_implementation": "sdpa"}
    with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
        hf_processor = hf_model.processor
        eos_token_id = hf_processor.tokenizer.eos_token_id

        def patch_hf_processor(
            *args, text="", images=None, audio=None, sampling_rate=None, **kwargs
        ):
            # Adapt HfRunner's (audio, sampling_rate) kwargs to the processor's
            # expected `audios` list-of-pairs format.
            audios = None
            if audio is not None and sampling_rate is not None:
                audios = [(audio, sampling_rate)]
            return hf_processor(
                *args, text=text, images=images, audios=audios, **kwargs
            )

        hf_model.processor = patch_hf_processor
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                eos_token_id=eos_token_id,
                num_logits_to_keep=0,
            )
            for prompts, images, audios in inputs
        ]
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Single-image Phi-4-MM: compare vLLM against the HF baseline."""
    pil_images = [asset.pil_image for asset in image_assets]
    # One case per asset: the same prompt repeated once per size factor.
    cases = []
    for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        cases.append(
            (
                [prompt] * len(size_factors),
                [rescale_image_size(image, factor) for factor in size_factors],
                None,  # no audio input for this test
            )
        )
    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        # [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Multi-image Phi-4-MM: both assets in one prompt, at several scales."""
    pil_images = [asset.pil_image for asset in image_assets]
    # For each scale factor, rescale *all* assets and feed them together.
    image_sets = []
    for factor in size_factors:
        image_sets.append(
            [rescale_image_size(image, factor) for image in pil_images]
        )
    cases = [
        (
            [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors),
            image_sets,
            None,  # no audio input for this test
        ),
    ]
    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(
    hf_runner,
    vllm_runner,
    model,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Combined image + speech input: compare vLLM against the HF baseline."""
    # use the example speech question so that the model outputs are reasonable
    audio = librosa.load(speech_question, sr=None)
    image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    prompt = "<|user|><|image_1|><|audio_1|><|end|><|assistant|>"
    cases = [([prompt], [image], [audio])]
    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )

View File

@@ -0,0 +1,211 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from dataclasses import asdict
from typing import TYPE_CHECKING, Any
import pytest
from mistral_common.multimodal import download_image
from mistral_common.protocol.instruct.chunk import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
from transformers import AutoProcessor
from vllm import SamplingParams, TextPrompt, TokensPrompt
from vllm.logprobs import Logprob, SampleLogprobs
from vllm.multimodal import MultiModalDataBuiltins
from vllm.platforms import current_platform
from ....utils import VLLM_PATH, large_gpu_test
from ...utils import check_logprobs_close
if TYPE_CHECKING:
from _typeshed import StrPath
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
# Asset filenames served by the local_asset_server fixture; the originals live
# under https://huggingface.co/datasets/Isotr0py/mistral-test-images
IMG_URLS = [
    "237-400x300.jpg",
    "231-200x300.jpg",
    "27-500x500.jpg",
    "17-150x600.jpg",
]
PROMPT = "Describe each image in one short sentence."


def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    """Build a mistral-style user message: one text chunk then one image_url
    chunk per URL."""
    content: list[dict[str, Any]] = [{"type": "text", "text": PROMPT}]
    content.extend(
        {"type": "image_url", "image_url": {"url": url}} for url in urls
    )
    return [{"role": "user", "content": content}]
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    """Build an HF-style user message with pre-downloaded PIL images."""
    content: list[dict[str, Any]] = [{"type": "text", "content": PROMPT}]
    for url in urls:
        content.append({"type": "image", "image": download_image(url)})
    return [{"role": "user", "content": content}]
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    """Tokenize a Pixtral chat request and attach its images as MM data."""
    request = ChatCompletionRequest(messages=_create_msg_format(urls))  # type: ignore[type-var]
    tokenizer = MistralTokenizer.from_model("pixtral")
    encoded = tokenizer.encode_chat_completion(request)
    # Collect the PIL images back out of the parsed request chunks.
    images = [
        image_from_chunk(chunk)
        for chunk in request.messages[0].content
        if isinstance(chunk, ImageURLChunk)
    ]
    prompt = TokensPrompt(prompt_token_ids=encoded.tokens)
    prompt["multi_modal_data"] = MultiModalDataBuiltins(image=images)
    return prompt
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    """Render an HF chat template and attach the downloaded images as MM data."""
    msg = _create_msg_format_hf(urls)
    processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
    rendered = processor.apply_chat_template(msg)
    images = [
        chunk["image"] for chunk in msg[0]["content"] if chunk["type"] == "image"
    ]
    return TextPrompt(
        prompt=rendered, multi_modal_data=MultiModalDataBuiltins(image=images)
    )
# Greedy decoding with logprobs so outputs can be compared against fixtures.
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4)
MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
# Fail fast at import time if the golden-output fixtures are missing.
assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = {
    PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
    MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}
# (token_ids, text, per-step logprobs) tuples as produced by the runners.
OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]]
# For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
) -> None:
    """Serialize (token_ids, text, logprobs) tuples to *filename* as JSON."""
    serializable = []
    for tokens, text, logprobs in outputs:
        # A missing logprobs entry (None) is stored as an empty list.
        step_dicts = [
            {token_id: asdict(lp) for token_id, lp in step.items()}
            for step in (logprobs or [])
        ]
        serializable.append((tokens, text, step_dicts))
    with open(filename, "w") as f:
        json.dump(serializable, f)
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    """Load golden outputs written by `_dump_outputs_w_logprobs`."""
    with open(filename, "rb") as f:
        raw = json.load(f)
    loaded: OutputsLogprobs = []
    for tokens, text, logprobs in raw:
        # JSON stringifies dict keys; restore int token ids and Logprob objects.
        restored = [
            {int(token_id): Logprob(**fields) for token_id, fields in step.items()}
            for step in logprobs
        ]
        loaded.append((tokens, text, restored))
    return loaded
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
    vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
) -> None:
    """Chat outputs must match the stored golden logprobs fixture."""
    large_mistral_small = model == MISTRAL_SMALL_3_1_ID and max_model_len == 65536
    if large_mistral_small and current_platform.is_rocm():
        pytest.skip(
            "OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
        )
    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
    with vllm_runner(
        model,
        dtype=dtype,
        tokenizer_mode="mistral",
        load_format="mistral",
        config_format="mistral",
        max_model_len=max_model_len,
        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        urls_all = [local_asset_server.url_for(u) for u in IMG_URLS]
        # One, two, then all four images in a single chat request.
        outputs = []
        for msg in (
            _create_msg_format(urls_all[:1]),
            _create_msg_format(urls_all[:2]),
            _create_msg_format(urls_all),
        ):
            outputs.extend(vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS))
        logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
    # Remove last `None` prompt_logprobs to compare with fixture
    trimmed = []
    for entry in logprobs:
        assert entry[-1] is None
        trimmed.append(entry[:-1])
    check_logprobs_close(
        outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
        outputs_1_lst=trimmed,
        name_0="h100_ref",
        name_1="output",
    )

View File

@@ -0,0 +1,148 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.multimodal.video import sample_frames_from_video
from ....conftest import VIDEO_ASSETS
models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
target_dtype = "bfloat16"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"


def qwen2_5_vl_chat_template(*query):
    """Wrap the given query fragments in the Qwen2.5-VL chat template."""
    system = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user = "<|im_start|>user\n" + "".join(query) + "<|im_end|>"
    return system + user + "<|im_start|>assistant\n"
# Prompt per video asset, rendered with the Qwen2.5-VL chat template above.
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
    {
        "baby_reading": qwen2_5_vl_chat_template(
            VIDEO_PLACEHOLDER,
            "Describe this video with a short sentence ",
            "(no more than 20 words)",
        ),
    }
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_functionality(
    vllm_runner,
    video_assets,
    model,
    video_pruning_rate: float,
    num_frames: int,
    dtype: str,
    max_tokens: int,
    use_bytecode_hook: bool,
    monkeypatch,
) -> None:
    """Test EVS (Efficient Video Sampling) functionality with different
    pruning rates.
    """
    # Set the environment variable for this test
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
    # Sample a fixed number of frames from each video asset.
    frame_sets = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
    prompts = [VIDEO_PROMPTS[0]]
    videos = [frame_sets[0]]
    # Initialize model with EVS configuration
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        dtype=dtype,
        limit_mm_per_prompt={"video": 1},
        video_pruning_rate=video_pruning_rate,
    ) as vllm_model:
        # Generation must complete without crashing.
        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
    # Basic sanity checks on the single response.
    assert len(outputs) == 1
    token_ids, completion = outputs[0]
    assert len(token_ids) > 0
    assert len(completion) > 0
    assert isinstance(completion, str)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_batched_videos(
    vllm_runner,
    video_assets,
    model,
    video_pruning_rate: float,
    num_frames: int,
    dtype: str,
    max_tokens: int,
    use_bytecode_hook: bool,
    monkeypatch,
) -> None:
    """Test EVS functionality with batched videos.

    This test validates that:
    1. The model handles batched video inputs correctly with EVS
    2. Both pruning configurations work with multiple videos
    3. The model doesn't crash when processing multiple videos simultaneously
    """
    # Set the environment variable for this test
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
    # Sample a fixed number of frames from each video asset.
    frame_sets = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
    # Use the same prompt/video twice to form a batch of two requests.
    prompts = [VIDEO_PROMPTS[0]] * 2
    videos = [frame_sets[0]] * 2
    # Initialize model with EVS configuration
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"video": 2},
        tensor_parallel_size=1,
        video_pruning_rate=video_pruning_rate,
    ) as vllm_model:
        # Generation must complete without crashing.
        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
    # Both requests must produce non-empty string output.
    assert len(outputs) == 2
    for token_ids, completion in outputs:
        assert len(token_ids) > 0
        assert len(completion) > 0
        assert isinstance(completion, str)

View File

@@ -0,0 +1,473 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, TypedDict
import numpy.typing as npt
import pytest
import torch
from PIL import Image
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
from ....conftest import (
IMAGE_ASSETS,
VIDEO_ASSETS,
PromptImageInput,
PromptVideoInput,
VllmRunner,
)
from ...utils import check_logprobs_close
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""
    # Opt in to pickle-based serialization for every test in this module.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
models = ["Qwen/Qwen2-VL-2B-Instruct"]
target_dtype = "half"
IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
MODEL_HIDDEN_SIZE = 1536


def qwen2_vl_chat_template(*query):
    """Wrap the given query fragments in the Qwen2-VL chat template."""
    system = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user = "<|im_start|>user\n" + "".join(query) + "<|im_end|>"
    return system + user + "<|im_start|>assistant\n"
# Per-asset prompts rendered with the Qwen2-VL chat template above.
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": qwen2_vl_chat_template(
            IMAGE_PLACEHOLDER,
            "What is the biggest text's content in this image?",
        ),
        "cherry_blossom": qwen2_vl_chat_template(
            IMAGE_PLACEHOLDER,
            "What is the season shown in this image? ",
            "Reply with a short sentence (no more than 20 words)",
        ),
    }
)
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
    {
        "baby_reading": qwen2_vl_chat_template(
            VIDEO_PLACEHOLDER,
            "Describe this video with a short sentence ",
            "(no more than 20 words)",
        ),
    }
)
# Two image placeholders: both assets are fed in a single prompt.
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
    IMAGE_PLACEHOLDER,
    IMAGE_PLACEHOLDER,
    "Describe these two images separately. ",
    "For each image, reply with a short sentence ",
    "(no more than 10 words).",
)
class Qwen2VLPromptImageEmbeddingInput(TypedDict):
    """Pre-computed image embedding input for a Qwen2-VL prompt."""

    # flattened visual embeddings for all images in the prompt
    image_embeds: torch.Tensor
    # one (t, h, w) patch-grid row per image
    image_grid_thw: torch.Tensor
class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
    """Pre-computed video embedding input for a Qwen2-VL prompt."""

    # flattened visual embeddings for all videos in the prompt
    video_embeds: torch.Tensor
    # one (t, h, w) patch-grid row per video
    video_grid_thw: torch.Tensor
def batch_make_image_embeddings(
    image_batches: list[Image.Image | list[Image.Image]],
    processor,
    llm: VllmRunner,
) -> list[Qwen2VLPromptImageEmbeddingInput]:
    """batched image embeddings for Qwen2-VL

    This will infer all images' embeddings in a single batch,
    and split the result according to input batches.

    image_batches:
    - Single-image batches: `list[Image.Image]`
    - Multiple-image batches: `list[list[Image.Image]]]`

    returns: `list[Qwen2VLPromptImageEmbeddingInput]`
    """
    image_batches_: list[Any] = image_batches[:]
    # convert single-image batches to multiple-image batches
    for idx in range(len(image_batches_)):
        if not isinstance(image_batches_[idx], list):
            image_batches_[idx] = [image_batches_[idx]]

        assert isinstance(image_batches_[idx], list)

    # append all images into a list (as a batch)
    images: list[Image.Image] = []
    for image_batch in image_batches_:
        images += image_batch

    # image to pixel values
    image_processor = processor.image_processor

    preprocess_result = image_processor.preprocess(
        images=images, return_tensors="pt"
    ).data
    pixel_values = preprocess_result["pixel_values"]
    image_grid_thw = preprocess_result["image_grid_thw"]

    # pixel values to embeddings & grid_thws
    def get_image_embeds(model):
        # Executed via llm.apply_model on the worker holding the real model.
        with torch.no_grad():
            visual = model.visual

            pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
            return visual(pixel_values_on_device, grid_thw=image_grid_thw).cpu()

    image_embeds = torch.concat(llm.apply_model(get_image_embeds))

    # split into original batches
    result: list[Qwen2VLPromptImageEmbeddingInput] = []
    image_counter = 0
    embed_counter = 0
    for image_batch in image_batches_:
        cur_batch_image_count = len(image_batch)
        merge_size = image_processor.merge_size
        # Each image yields prod(grid_thw) patches, merged merge_size**2
        # patches per embedding row; sum gives this batch's embedding span.
        cur_batch_embed_len = sum(
            grid_thw.prod(-1) // merge_size // merge_size
            for grid_thw in image_grid_thw[
                image_counter : image_counter + cur_batch_image_count
            ]
        )

        result.append(
            {
                "image_embeds": image_embeds[
                    embed_counter : embed_counter + cur_batch_embed_len
                ],
                "image_grid_thw": image_grid_thw[
                    image_counter : image_counter + cur_batch_image_count
                ],
            }
        )

        embed_counter += cur_batch_embed_len
        image_counter += cur_batch_image_count

    # ensure we don't lose any images or embeddings
    assert embed_counter == image_embeds.size(0)
    assert image_counter == image_grid_thw.size(0)
    assert len(image_batches) == len(result)

    return result
def batch_make_video_embeddings(
    video_batches: PromptVideoInput, processor, llm: VllmRunner
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
    """batched video embeddings for Qwen2-VL

    A NDArray represents a single video's all frames.

    This will infer all videos' embeddings in a single batch,
    and split the result according to input batches.

    video_batches:
    - Single-video batches: `list[NDArray]`
    - Multiple-video batches: `list[list[NDArray]]`
    """
    video_batches_: list[Any] = video_batches[:]

    # convert single-video batches to multiple-video batches
    for idx in range(len(video_batches_)):
        if not isinstance(video_batches_[idx], list):
            single_video_batch: list[npt.NDArray] = [video_batches_[idx]]
            video_batches_[idx] = single_video_batch

        assert isinstance(video_batches_[idx], list)

    # append all videos into a list (as a batch)
    videos: list[npt.NDArray] = []
    for video_batch in video_batches_:
        videos += video_batch

    # video to pixel values
    image_processor = processor.image_processor

    preprocess_result = image_processor.preprocess(
        images=None, videos=videos, return_tensors="pt"
    ).data
    pixel_values = preprocess_result["pixel_values_videos"]
    video_grid_thw = preprocess_result["video_grid_thw"]

    # pixel values to embeddings & grid_thws
    def get_image_embeds(model):
        # Executed via llm.apply_model on the worker holding the real model.
        with torch.no_grad():
            visual = model.visual

            pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
            return visual(pixel_values_on_device, grid_thw=video_grid_thw).cpu()

    video_embeds = torch.concat(llm.apply_model(get_image_embeds))

    # split into original batches
    result: list[Qwen2VLPromptVideoEmbeddingInput] = []
    video_counter = 0
    embed_counter = 0
    for video_batch in video_batches_:
        cur_batch_video_count = len(video_batch)
        merge_size = image_processor.merge_size
        # Each video yields prod(grid_thw) patches, merged merge_size**2
        # patches per embedding row; sum gives this batch's embedding span.
        cur_batch_embed_len = sum(
            grid_thw.prod(-1) // merge_size // merge_size
            for grid_thw in video_grid_thw[
                video_counter : video_counter + cur_batch_video_count
            ]
        )

        result.append(
            {
                "video_embeds": video_embeds[
                    embed_counter : embed_counter + cur_batch_embed_len
                ],
                "video_grid_thw": video_grid_thw[
                    video_counter : video_counter + cur_batch_video_count
                ],
            }
        )

        embed_counter += cur_batch_embed_len
        video_counter += cur_batch_video_count

    # ensure we don't lose any videos or embeddings
    assert embed_counter == video_embeds.size(0)
    assert video_counter == video_grid_thw.size(0)
    assert len(video_batches) == len(result)

    return result
def run_embedding_input_test(
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Inference result should be the same between
    original image/video input and image/video embeddings input.

    Runs greedy decoding twice on the same engine instance — once with raw
    images/videos and once with precomputed embeddings — and asserts the
    logprobs are close.
    """
    # NOTE(review): AutoProcessor IS used below; the noqa F401 looks stale.
    from transformers import AutoProcessor  # noqa: F401
    processor = AutoProcessor.from_pretrained(model)
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        max_num_seqs=3,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        default_torch_num_threads=1,
        enable_mm_embeds=True,
    ) as vllm_model:
        # Baseline: feed the raw images/videos directly.
        outputs_per_case_for_original_input = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images or None,
                videos=videos or None,
            )
            for prompts, images, videos in inputs
        ]
        # Same prompts, but with precomputed multimodal embeddings.
        outputs_per_case_for_embeddings_input = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=batch_make_image_embeddings(images, processor, vllm_model)
                if images
                else None,
                videos=batch_make_video_embeddings(videos, processor, vllm_model)
                if videos
                else None,
            )
            for prompts, images, videos in inputs
        ]
    # Compare the two runs case by case.
    for outputs_for_original_input, outputs_for_embeddings_input in zip(
        outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
    ):
        check_logprobs_close(
            outputs_0_lst=outputs_for_original_input,
            outputs_1_lst=outputs_for_embeddings_input,
            name_0="original_input",
            name_1="embeddings_input",
        )
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [0.5],
        # Single-scale, batched
        [0.5, 0.5],
        # Multi-scale
        [0.25, 0.5, 0.5],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_image_embeddings_input(
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype,
    max_tokens,
    num_logprobs,
    monkeypatch,  # NOTE(review): not used in the body — confirm it can be dropped
) -> None:
    """Single-image-per-prompt: raw image vs. image-embedding input parity."""
    images = [asset.pil_image for asset in image_assets]
    # One case per asset: each prompt repeated per size factor, each with a
    # rescaled copy of the same image; no videos.
    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
        (
            [prompt for _ in size_factors],
            [rescale_image_size(image, factor) for factor in size_factors],
            [],
        )
        for image, prompt in zip(images, IMAGE_PROMPTS)
    ]
    run_embedding_input_test(
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        [],
        # Single-scale
        [0.5],
        # Single-scale, batched
        [0.5, 0.5],
        # Multi-scale
        [0.25, 0.5, 0.5],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_multiple_image_embeddings_input(
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Multi-image-per-prompt: raw images vs. image-embedding input parity."""
    images = [asset.pil_image for asset in image_assets]
    # Single case: for each size factor, one prompt carrying ALL assets
    # rescaled by that factor; no videos.
    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
        (
            [MULTIIMAGE_PROMPT for _ in size_factors],
            [
                [rescale_image_size(image, factor) for image in images]
                for factor in size_factors
            ],
            [],
        )
    ]
    run_embedding_input_test(
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [0.5],
        # Single-scale, batched
        [0.5, 0.5],
        # Multi-scale
        [0.25, 0.25, 0.5],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_video_embeddings_input(
    vllm_runner,
    video_assets,
    model,
    size_factors,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Video prompts: raw video frames vs. video-embedding input parity."""
    # Subsample each asset to a fixed number of frames before rescaling.
    num_frames = 4
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
    # One case per asset: each prompt repeated per size factor with the
    # video rescaled accordingly; no images.
    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
        (
            [prompt for _ in size_factors],
            [],
            [rescale_video_size(video, factor) for factor in size_factors],
        )
        for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
    ]
    run_embedding_input_test(
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )

View File

@@ -0,0 +1,185 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from typing import Any
import numpy as np
import pytest
import pytest_asyncio
from transformers import AutoTokenizer
from ....conftest import AUDIO_ASSETS, AudioTestAssets, VllmRunner
from ....utils import RemoteOpenAIServer
from ...registry import HF_EXAMPLE_MODELS
# Model under test and the shared prompt/placeholder constants.
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
    {
        "mary_had_lamb": "Transcribe this into English.",
        "winning_call": "What is happening in this audio clip?",
    }
)
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
# (audio samples, sample rate) pair as produced by the audio assets.
AudioTuple = tuple[np.ndarray, int]
VLLM_PLACEHOLDER = "<|audio|>"
HF_PLACEHOLDER = "<|audio|>"
# Engine kwargs that force chunked prefill with a tiny token budget.
CHUNKED_PREFILL_KWARGS = {
    "enable_chunked_prefill": True,
    "max_num_seqs": 2,
    # Use a very small limit to exercise chunked prefill.
    "max_num_batched_tokens": 16,
}
def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
    """Translate a kwargs mapping into an equivalent list of CLI flags.

    Boolean values become bare flags (emitted only when True); every other
    value becomes a ``--key=value`` pair. Underscores in keys are rewritten
    as dashes to match CLI conventions.
    """
    cli_args: list[str] = []
    for name, val in params_kwargs.items():
        flag = f"--{name.replace('_', '-')}"
        if isinstance(val, bool):
            if val:
                cli_args.append(flag)
        else:
            cli_args.append(f"{flag}={val}")
    return cli_args
@pytest.fixture(
    params=[
        # Default engine config vs. aggressive chunked prefill.
        pytest.param({}, marks=pytest.mark.cpu_model),
        pytest.param(CHUNKED_PREFILL_KWARGS),
    ]
)
def server(request, audio_assets: AudioTestAssets):
    """Spin up a remote OpenAI-compatible server for the Ultravox model."""
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "4096",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": len(audio_assets)}),
        "--trust-remote-code",
    ] + params_kwargs_to_cli_args(request.param)
    # Bound audio fetch time so a stalled download fails fast.
    with RemoteOpenAIServer(
        MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
    ) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the running test server."""
    async with server.get_async_client() as async_client:
        yield async_client
def _get_prompt(audio_count, question, placeholder):
    """Build a chat-templated prompt with `audio_count` audio placeholders
    followed by the question text."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # One placeholder line per audio clip.
    placeholder = f"{placeholder}\n" * audio_count
    return tokenizer.apply_chat_template(
        [{"role": "user", "content": f"{placeholder}{question}"}],
        tokenize=False,
        add_generation_prompt=True,
    )
def run_multi_audio_test(
    vllm_runner: type[VllmRunner],
    prompts_and_audios: list[tuple[str, list[AudioTuple]]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    **kwargs,
):
    """Run greedy decoding on prompts that each carry multiple audios and
    sanity-check that the model produces tokens.

    Extra **kwargs are forwarded to the vLLM engine constructor.
    """
    # Skip early if the model isn't available/compatible in this env.
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
    with vllm_runner(
        model,
        dtype=dtype,
        enforce_eager=True,
        # Allow as many audios per prompt as the largest case needs.
        limit_mm_per_prompt={
            "audio": max((len(audio) for _, audio in prompts_and_audios))
        },
        **kwargs,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            [prompt for prompt, _ in prompts_and_audios],
            max_tokens,
            num_logprobs=num_logprobs,
            audios=[audios for _, audios in prompts_and_audios],
        )
    # The HuggingFace model doesn't support multiple audios yet, so
    # just assert that some tokens were generated.
    assert all(tokens for tokens, *_ in vllm_outputs)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
    "vllm_kwargs",
    [
        # Default config vs. chunked prefill with a tiny token budget.
        pytest.param({}, marks=pytest.mark.cpu_model),
        pytest.param(CHUNKED_PREFILL_KWARGS),
    ],
)
def test_models_with_multiple_audios(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    vllm_kwargs: dict,
) -> None:
    """Single prompt referencing every audio asset at once."""
    vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
    run_multi_audio_test(
        vllm_runner,
        [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **vllm_kwargs,
    )
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Exercises online serving with/without chunked prefill enabled."""
    # One audio_url part per asset, followed by a text question.
    messages = [
        {
            "role": "user",
            "content": [
                *[
                    {"type": "audio_url", "audio_url": {"url": audio.url}}
                    for audio in audio_assets
                ],
                {
                    "type": "text",
                    "text": f"What's happening in these {len(audio_assets)} audio clips?",  # noqa: E501
                },
            ],
        }
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME, messages=messages, max_tokens=10
    )
    # With max_tokens=10 the generation must stop on length.
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"

View File

@@ -0,0 +1,435 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Consolidated test for ViT attention backend functionality across multiple models.
This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend.
"""
from dataclasses import asdict
from typing import Any
import pytest
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform
from ....utils import create_new_process_for_each_test
from ...utils import dummy_hf_overrides
# Dots.OCR prompt from official repository
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
# ruff: noqa: E501
DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
1. Bbox format: [x1, y1, x2, y2]
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
3. Text Extraction & Formatting Rules:
- Picture: For the 'Picture' category, the text field should be omitted.
- Formula: Format its text as LaTeX.
- Table: Format its text as HTML.
- All Others (Text, Title, etc.): Format their text as Markdown.
4. Constraints:
- The output text must be the original text from the image, with no translation.
- All layout elements must be sorted according to human reading order.
5. Final Output: The entire output must be a single JSON object.
"""
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
# Model configurations
# Per-model test configuration. Keys used by the handlers below:
#   interface         -> which handler runs the model (llm_generate / llm_chat
#                        / vllm_runner)
#   sampling_params   -> kwargs for SamplingParams
#   use_processor     -> build the prompt via AutoProcessor.apply_chat_template
#   prompt_builder    -> name of a builder function in this module
#   supported_backends-> optional allowlist of ViT attention backends
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
    "dots_ocr": {
        "model_name": "rednote-hilab/dots.ocr",
        "interface": "llm_chat",
        "max_model_len": 32768,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"image": 1},
        "sampling_params": {
            "temperature": 0.1,
            "max_tokens": 16384,
            "top_p": 0.9,
            "stop_token_ids": None,
        },
        "use_specific_image": "stop_sign",
        "prompt_builder": "build_dots_ocr_prompt",
        "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
    },
    "ernie45_vl": {
        "model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
        "interface": "llm_generate",
        "max_model_len": 16384,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "glm4_1v": {
        "model_name": "zai-org/GLM-4.1V-9B-Thinking",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "keye_vl": {
        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 5,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        # Keye-VL only runs with these ViT attention backends.
        "supported_backends": {
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.ROCM_AITER_FA,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "ovis2_5": {
        "model_name": "AIDC-AI/Ovis2.5-2B",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "prompt_builder": "build_ovis_prompt",
        "question": "What is the content of each image?",
    },
    "qwen2_5_vl": {
        "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
        "interface": "vllm_runner",
        # Video path exercises EVS pruning via video_params below.
        "media_type": "video",
        "max_model_len": 4000,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"video": 1},
        "sampling_params": {
            "max_tokens": 128,
        },
        "runner_kwargs": {
            "runner": "generate",
            "dtype": "bfloat16",
        },
        "video_params": {
            "num_frames": 16,
            "pruning_rates": [0.0, 0.75],
        },
    },
    "qwen2_5_omni": {
        "model_name": "Qwen/Qwen2.5-Omni-3B",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "qwen3_omni": {
        "model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
}
# Prompt builder functions
def build_dots_ocr_prompt(images, config):
    """Build the Dots.OCR chat messages carrying its layout-extraction prompt."""
    # Callers pass a pre-filtered single-image list (stop_sign only).
    stop_sign = images[0]
    encoded = encode_image_base64(stop_sign)
    content = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        },
        {
            "type": "text",
            "text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
        },
    ]
    return [{"role": "user", "content": content}]
def build_processor_prompt(images, config):
    """Build prompt using AutoProcessor.apply_chat_template().

    Encodes every image as a base64 data URL and wraps them, plus the
    configured question, in the model's own chat template.
    """
    processor = AutoProcessor.from_pretrained(
        config["model_name"], trust_remote_code=True
    )
    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
    ]
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": config["question"]},
            ],
        },
    ]
    # Returns the rendered prompt string (untokenized).
    return processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
def build_ovis_prompt(images, config):
    """Build the Ovis2.5 chat-format prompt with numbered <image> slots."""
    # The data URLs are computed for parity with the other builders; only
    # their count feeds the placeholder lines.
    urls = [f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images]
    slot_lines = [f"Image-{n}: <image>\n" for n in range(1, len(urls) + 1)]
    placeholders = "\n".join(slot_lines)
    header = f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
    return header + "<|im_start|>assistant\n"
def build_qwen2_5_video_prompt():
    """Build the Qwen2.5-VL chat prompt asking for a short video description."""
    parts = [
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
        f"<|im_start|>user\n{VIDEO_PLACEHOLDER}",
        "Describe this video with a short sentence (no more than 20 words)",
        "<|im_end|><|im_start|>assistant\n",
    ]
    return "".join(parts)
# Handler functions
def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
    """Standard LLM.generate() interface handler.

    Builds a prompt from the config (processor template or a named builder),
    constructs an LLM with the requested ViT attention backend and dummy
    weights, generates, and validates the output text.
    """
    images = [asset.pil_image for asset in image_assets]
    # Build prompt
    if config.get("use_processor"):
        prompt = build_processor_prompt(images, config)
    else:
        # Resolve the builder function by name from this module.
        prompt_builder_name = config.get("prompt_builder", "build_ovis_prompt")
        prompt_builder = globals()[prompt_builder_name]
        prompt = prompt_builder(images, config)
    # Determine limit_mm_per_prompt
    limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})
    # Create engine
    engine_args = EngineArgs(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=limit_mm_per_prompt,
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
    )
    # Fixed seed for reproducibility across backends.
    engine_dict = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_dict)
    # Generate
    sampling_params = SamplingParams(**config["sampling_params"])
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=sampling_params,
    )
    # Validate
    for o in outputs:
        generated_text = o.outputs[0].text
        # Fall back to a minimal length check when no validator is set.
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(generated_text), (
            f"Validation failed for {config['model_name']}: {generated_text}"
        )
def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
    """LLM.chat() interface handler for Dots.OCR.

    Uses only the stop_sign asset, builds OCR chat messages, and validates
    the chat output against the config's validator.
    """
    # Filter to stop_sign image only
    stop_sign_image = [
        asset.pil_image for asset in image_assets if asset.name == "stop_sign"
    ][0]
    # Build messages
    messages = build_dots_ocr_prompt([stop_sign_image], config)
    # Create engine
    engine_args = EngineArgs(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=config["limit_mm_per_prompt"],
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
    )
    # Fixed seed for reproducibility across backends.
    engine_dict = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_dict)
    # Generate using chat
    sampling_params = SamplingParams(**config["sampling_params"])
    outputs = llm.chat(messages=messages, sampling_params=sampling_params)
    # Validate
    for o in outputs:
        generated_text = o.outputs[0].text
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(generated_text), (
            f"Validation failed for {config['model_name']}: {generated_text}"
        )
def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
    """Video test with EVS (Efficient Video Sampling) handler.

    Runs the model once per configured pruning rate on a frame-sampled
    video and checks that non-empty text is generated.
    """
    for pruning_rate in config["video_params"]["pruning_rates"]:
        num_frames = config["video_params"]["num_frames"]
        # Sample frames from video
        sampled_vids = [
            sample_frames_from_video(asset.np_ndarrays, num_frames)
            for asset in video_assets
        ]
        # Build prompt and prepare video (first asset only)
        prompt = build_qwen2_5_video_prompt()
        prompts = [prompt]
        videos = [sampled_vids[0]]
        # Run with vllm_runner context manager
        with vllm_runner(
            config["model_name"],
            max_model_len=config["max_model_len"],
            max_num_seqs=config["max_num_seqs"],
            limit_mm_per_prompt=config["limit_mm_per_prompt"],
            tensor_parallel_size=1,
            video_pruning_rate=pruning_rate,
            mm_encoder_attn_backend=mm_encoder_attn_backend,
            hf_overrides=dummy_hf_overrides,
            load_format="dummy",
            **config["runner_kwargs"],
        ) as vllm_model:
            outputs = vllm_model.generate_greedy(
                prompts,
                config["sampling_params"]["max_tokens"],
                videos=videos,
            )
            # Validate output
            assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
            output_ids, output_text = outputs[0]
            assert len(output_ids) > 0, "Generated no output IDs"
            assert len(output_text) > 0, "Generated empty text"
            assert isinstance(output_text, str), (
                f"Output is not string: {type(output_text)}"
            )
# Main test function
@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    # None means "let the platform pick its default ViT backend".
    [None] + current_platform.get_supported_vit_attn_backends(),
)
# NOTE: the whole matrix is currently disabled pending a segfault fix.
@pytest.mark.skip(reason="Broken test due to memory segmentation fault")
@create_new_process_for_each_test()
def test_vit_backend_functionality(
    model_key: str,
    mm_encoder_attn_backend: AttentionBackendEnum | None,
    image_assets,
    video_assets,
    vllm_runner,
    request,
):
    """Test ViT attention backend functionality for multimodal models.
    This test validates that each model can successfully generate outputs
    using different ViT attention backends. The test:
    1. Filters unsupported backends per model
    2. Applies appropriate GPU marks
    3. Routes to the correct test handler based on interface
    4. Validates output meets minimum requirements
    """
    config = MODEL_CONFIGS[model_key]
    # Step 1: Backend filtering
    if (
        "supported_backends" in config
        and mm_encoder_attn_backend is not None
        and mm_encoder_attn_backend not in config["supported_backends"]
    ):
        pytest.skip(
            f"{model_key} does not support {mm_encoder_attn_backend} backend now."
        )
    # Step 2: Apply GPU marks dynamically
    if "gpu_marks" in config:
        for mark in config["gpu_marks"]:
            request.applymarker(mark)
    # Step 3: Route to appropriate handler
    if config.get("media_type") == "video":
        run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
    elif config["interface"] == "llm_chat":
        run_llm_chat_test(config, mm_encoder_attn_backend, image_assets)
    elif config["interface"] == "llm_generate":
        run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
    else:
        raise ValueError(f"Unknown interface: {config['interface']}")

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import pytest_asyncio
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage
from vllm.tokenizers.mistral import MistralTokenizer
from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
# CLI args forcing the mistral tokenizer/config/weight formats.
MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
    "mistral",
    "--config_format",
    "mistral",
    "--load_format",
    "mistral",
]
@pytest.fixture()
def server(request, audio_assets: AudioTestAssets):
    """Spin up a remote OpenAI-compatible server for the Voxtral model."""
    args = [
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": len(audio_assets)}),
    ] + MISTRAL_FORMAT_ARGS
    # Bound audio fetch time so a stalled download fails fast.
    with RemoteOpenAIServer(
        MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
    ) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the running test server."""
    async with server.get_async_client() as async_client:
        yield async_client
def _get_prompt(audio_assets, question):
    """Build a Mistral chat-templated prompt carrying all audio assets
    followed by the question text."""
    tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
    # Load each asset from its local path; strict=False tolerates
    # non-canonical audio files.
    audios = [
        Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
        for i in range(len(audio_assets))
    ]
    audio_chunks = [
        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
    ]
    text_chunk = TextChunk(text=question)
    messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
    return tokenizer.apply_chat_template(messages=messages)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Single prompt referencing every audio asset, via the shared
    ultravox multi-audio harness."""
    vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
    run_multi_audio_test(
        vllm_runner,
        [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tokenizer_mode="mistral",
    )
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Exercises online serving with multiple audio inputs in a single
    chat request."""
    def asset_to_chunk(asset):
        # Wrap each asset as a wav-format OpenAI audio chunk.
        audio = Audio.from_file(str(asset.get_local_path()), strict=False)
        audio.format = "wav"
        audio_dict = AudioChunk.from_audio(audio).to_openai()
        return audio_dict
    audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
    text = f"What's happening in these {len(audio_assets)} audio clips?"
    messages = [
        {
            "role": "user",
            "content": [*audio_chunks, {"type": "text", "text": text}],
        }
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME, messages=messages, max_tokens=10
    )
    # With max_tokens=10 the generation must stop on length.
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"

View File

@@ -0,0 +1,178 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import Any
import librosa
import pytest
from transformers import AutoModelForSpeechSeq2Seq
from vllm.assets.audio import AudioAsset
from vllm.platforms import current_platform
from ....conftest import HfRunner, PromptAudioInput, VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
# Whisper decoder prompt for vLLM; HF runner takes an empty prompt.
VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
HF_PROMPT = ""
# Whisper expects 16kHz audio
WHISPER_SAMPLE_RATE = 16000
# autouse: applies to every test in this module.
@pytest.fixture(autouse=True)
def use_spawn_for_whisper(monkeypatch):
    """Whisper has issues with forked workers, use spawn instead."""
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
    enforce_eager: bool = True,
) -> None:
    """Inference result should be the same between hf and vllm.
    All the audio fixtures for the test are from AudioAsset.
    For huggingface runner, we provide the audio as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    """
    # vLLM run: each inputs tuple is (vllm_prompts, hf_prompts, audios).
    with vllm_runner(
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        limit_mm_per_prompt={"audio": 2},
        enforce_eager=enforce_eager,
        disable_custom_all_reduce=True,
    ) as vllm_model:
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                vllm_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            for vllm_prompts, _, audios in inputs
        ]
    # HF reference run on the same audios with the HF-style prompts.
    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                hf_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            for _, hf_prompts, audios in inputs
        ]
    # Compare HF vs. vLLM logprobs case by case.
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.fixture
def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
    """Audio test cases resampled to Whisper's 16kHz sample rate.

    Each entry is (vllm_prompts, hf_prompts, [(audio, sample_rate)]).
    """
    audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
    inputs = []
    for asset in audio_assets:
        audio, orig_sr = asset.audio_and_sample_rate
        # Resample to Whisper's expected sample rate (16kHz)
        if orig_sr != WHISPER_SAMPLE_RATE:
            audio = librosa.resample(
                audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
            )
        # vLLM prompts, HF prompts, audio inputs
        inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
    return inputs
def check_model_available(model: str) -> None:
    """Skip the current test if the model is unavailable online or
    incompatible with the installed transformers version."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
@pytest.mark.core_model
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("enforce_eager", [True, False])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
    num_logprobs: int,
    input_audios,
    enforce_eager: bool,
) -> None:
    """Single-GPU Whisper HF-vs-vLLM parity test."""
    check_model_available(model)
    # Compiled (non-eager) mode is not exercised on CPU.
    if current_platform.is_cpu() and not enforce_eager:
        pytest.skip("Skipping test for CPU with non-eager mode")
    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=200,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
        enforce_eager=enforce_eager,
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models_distributed(
    hf_runner,
    vllm_runner,
    model: str,
    distributed_executor_backend: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    input_audios,
) -> None:
    """Tensor-parallel (TP=2) Whisper HF-vs-vLLM parity test over both
    the ray and multiprocessing executor backends."""
    check_model_available(model)
    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=False,
    )

View File

@@ -0,0 +1,347 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Helpers for building inputs that can be leveraged for different test types."""
from collections.abc import Callable, Iterable
from pathlib import PosixPath
from typing import Any
import numpy.typing as npt
import torch
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
from .types import (
SINGLE_AUDIO_BASE_PROMPT,
SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER,
TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER,
VIDEO_BASE_PROMPT,
ImageSizeWrapper,
PromptWithMultiModalInput,
SizeType,
VLMTestInfo,
)
def replace_test_placeholder(
    prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
) -> str:
    """Substitute each occurrence of ``test_placeholder`` in ``prompt`` with
    the model-specific tag produced by ``mm_idx_to_prompt``.

    The callback receives the 1-based index of the occurrence, so models
    with positional placeholders (e.g. ``<|image_1|>``) are supported.
    """
    segments = prompt.split(test_placeholder)
    pieces = [segments[0]]
    for idx, segment in enumerate(segments[1:], start=1):
        pieces.append(mm_idx_to_prompt(idx))
        pieces.append(segment)
    return "".join(pieces)
def get_model_prompts(
base_prompts: Iterable[str],
img_idx_to_prompt: Callable[[int], str] | None,
video_idx_to_prompt: Callable[[int], str] | None,
audio_idx_to_prompt: Callable[[int], str] | None,
prompt_formatter: Callable[[str], str],
) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
Example for phi3v, given the base_prompt: "<image>What is the season?"
1. Replace img placeholder(s)
-> "<|image_1|>\nWhat is the season?"
2. Apply prompt formatter:
-> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
"""
assert isinstance(base_prompts, (list, tuple))
model_prompts = []
for base_prompt in base_prompts:
# Replace the multimodal placeholders in the base prompt with
# the correct ones for the model that we are testing
if img_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
)
if video_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
)
if audio_idx_to_prompt:
base_prompt = replace_test_placeholder(
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
model_prompt = prompt_formatter(base_prompt)
model_prompts.append(model_prompt)
return model_prompts
def build_single_image_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
    tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
    """Build single-image test inputs (one prompt per image asset) from a
    model's test configuration.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is unset, or if a
            ``prompt_path_encoder`` is configured but ``tmp_path`` is None.
    """
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build single image inputs")
    model_prompts = get_model_prompts(
        test_info.single_image_prompts,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    # For models that require a local path / URL encoded in the image; export
    # assets and encode into tmp_path for this test. This should be avoided
    # where possible (currently needed for Qwen-VL).
    if test_info.prompt_path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        model_prompts = [
            test_info.prompt_path_encoder(tmp_path, prompt, [asset])
            for prompt, asset in zip(model_prompts, image_assets)
        ]
    images = [asset.pil_image for asset in image_assets]
    assert len(images) == len(model_prompts)
    return build_single_image_inputs(images, model_prompts, size_wrapper)
def build_single_image_inputs(
    images, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
    """Pair each image/prompt with one input per entry in
    ``size_wrapper.data``: the prompt is duplicated and the image is scaled
    by each size factor / resized to each fixed size.

    NOTE: rescaling preserves the image aspect ratio.
    """
    inputs = []
    for image, prompt in zip(images, model_prompts):
        scaled_images = [
            apply_image_size_scaling(image, size, size_wrapper.type)
            for size in size_wrapper.data
        ]
        inputs.append(
            PromptWithMultiModalInput(
                prompts=[prompt] * len(scaled_images),
                image_data=scaled_images,
            )
        )
    return inputs
def build_multi_image_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
    tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
    """Build multi-image test inputs (all image assets paired with the
    single multi-image prompt) from a model's test configuration.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is unset, or if a
            ``prompt_path_encoder`` is configured but ``tmp_path`` is None.
    """
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build multi image inputs")
    model_prompts = get_model_prompts(
        [test_info.multi_image_prompt],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    # Encode local asset paths into the prompt when the model requires it
    # (e.g. Qwen-VL); see build_single_image_inputs_from_test_info.
    if test_info.prompt_path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        model_prompts = [
            test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
            for model_prompt in model_prompts
        ]
    images = [asset.pil_image for asset in image_assets]
    # Currently, we only have one multi-image list & one multi-image prompt
    return build_multi_image_inputs(
        image_lists=[images],
        model_prompts=model_prompts,
        size_wrapper=size_wrapper,
    )
def build_multi_image_inputs(
    image_lists, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
    """Pair each image list/prompt with one input per entry in
    ``size_wrapper.data``, scaling every image in the list by that size.
    """
    inputs = []
    for images, prompt in zip(image_lists, model_prompts):
        image_data = []
        for size in size_wrapper.data:
            image_data.append(
                [
                    apply_image_size_scaling(image, size, size_wrapper.type)
                    for image in images
                ]
            )
        inputs.append(
            PromptWithMultiModalInput(
                prompts=[prompt] * len(image_data),
                image_data=image_data,
            )
        )
    return inputs
def build_embedding_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
):
    """Build paired (PIL image, image embedding) inputs for embedding tests.

    Returns:
        Tuple of (image inputs for HF, embedding inputs for vLLM), both
        built over the same prompts.
    """
    # These conditions will always be true if invoked through filtering,
    # but we still check them in case this is ever called directly
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build image embedding inputs")
    if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
        factor == 1.0 for factor in size_wrapper.data
    ):
        raise ValueError("Embedding tests require constant (1.0) size factors")
    if test_info.convert_assets_to_embeddings is None:
        raise ValueError("No conversion func for getting embeddings found")
    model_prompts = get_model_prompts(
        SINGLE_IMAGE_BASE_PROMPTS,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    images = [asset.pil_image for asset in image_assets]
    embeds = test_info.convert_assets_to_embeddings(image_assets)
    # Cast embeddings to the test dtype so they match the model weights.
    if test_info.dtype != "auto":
        dtype = getattr(torch, test_info.dtype)  # type: ignore
        embeds = [e.to(dtype=dtype) for e in embeds]
    assert len(images) == len(model_prompts)
    inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
    vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
    return inputs, vllm_embeddings
def build_video_inputs_from_test_info(
    test_info: VLMTestInfo,
    video_assets: VideoTestAssets,
    size_wrapper: ImageSizeWrapper,
    num_frames: int,
    needs_video_metadata: bool,
) -> list[PromptWithMultiModalInput]:
    """Build video test inputs from a model's test configuration.

    Each video asset is subsampled to ``num_frames`` frames, then scaled per
    ``size_wrapper``. When ``needs_video_metadata`` is True, each video entry
    is a (frames, metadata) tuple instead of a bare frames array.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is unset.
    """
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build video inputs")
    model_prompts = get_model_prompts(
        [VIDEO_BASE_PROMPT],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    sampled_vids = [
        sample_frames_with_video_metadata(
            (asset.np_ndarrays, asset.metadata),
            num_frames,
        )
        for asset in video_assets
    ]
    # Fixed sizes use an exact resize; size factors rescale proportionally.
    video_scaler = (
        resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
    )
    return [
        PromptWithMultiModalInput(
            prompts=[prompt for _ in size_wrapper.data],
            video_data=[
                (
                    video_scaler(video, size)
                    if not needs_video_metadata
                    else (video_scaler(video, size), meta)
                )
                for size in size_wrapper.data
            ],
        )
        for (video, meta), prompt in zip(sampled_vids, model_prompts)
    ]
def sample_frames_with_video_metadata(
    video_with_meta: tuple[npt.NDArray, dict[str, Any]],
    num_frames: int,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """Subsample a video to ``num_frames`` frames and update its metadata.

    Args:
        video_with_meta: Tuple of (frame array, metadata dict); the metadata
            must contain "total_num_frames" and "duration" (in seconds).
        num_frames: Number of frames to sample from the video.

    Returns:
        Tuple of the subsampled frames and the updated metadata (the input
        dict is mutated in place).
    """
    video, meta = video_with_meta
    video = sample_frames_from_video(video, num_frames)
    # NOTE(review): flag is True when the clip already has exactly
    # num_frames — confirm the consumer's expected semantics.
    meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
    meta["total_num_frames"] = num_frames
    # fps is frames per second, i.e. frame count divided by clip duration.
    # The previous expression (duration / num_frames) computed seconds per
    # frame, the reciprocal of fps.
    meta["fps"] = num_frames / meta["duration"]
    meta["frames_indices"] = list(range(num_frames))
    return video, meta
def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
# are considering size factors at constant scale, i.e., we just clone
# the tensor
if isinstance(image, torch.Tensor):
assert size_type == SizeType.SIZE_FACTOR and size == 1
return image
if size_type == SizeType.SIZE_FACTOR:
# We have a list of image size factors
return rescale_image_size(image, size)
elif size_type == SizeType.FIXED_SIZE:
# We have a list of fixed sizes
return image.resize(size)
raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
def build_audio_inputs_from_test_info(
    test_info: VLMTestInfo,
    audio_assets: AudioTestAssets,
) -> list[PromptWithMultiModalInput]:
    """Build audio test inputs from a model's test configuration, resampling
    every audio asset to 16 kHz.

    Raises:
        ValueError: If ``test_info.prompt_formatter`` is unset.
    """
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build audio inputs")
    model_prompts = get_model_prompts(
        SINGLE_AUDIO_BASE_PROMPT,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
    # Normalize all assets to a common 16 kHz sample rate.
    resampler = AudioResampler(
        target_sr=16000,
        method="librosa",
    )
    audios = [asset.audio_and_sample_rate for asset in audio_assets]
    resampled_audios = [
        (
            resampler.resample(
                audio,
                orig_sr=sr,
            ),
            int(resampler.target_sr),
        )
        for audio, sr in audios
    ]
    return [
        PromptWithMultiModalInput(
            prompts=model_prompts,
            audio_data=resampled_audios,
        )
    ]

View File

@@ -0,0 +1,183 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utils for determining which subset of model tests belong to a specific
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import itertools
from collections import OrderedDict
from collections.abc import Iterable
import pytest
from .types import (
EMBEDDING_SIZE_FACTORS,
ExpandableVLMTestArgs,
ImageSizeWrapper,
SizeType,
VLMTestInfo,
VLMTestType,
)
def get_filtered_test_settings(
    test_settings: dict[str, VLMTestInfo],
    test_type: VLMTestType,
    new_proc_per_test: bool,
) -> dict[str, VLMTestInfo]:
    """Given the dict of potential test settings to run, return a subdict
    of tests who have the current test type enabled with the matching val for
    fork_per_test.
    """
    def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
        # test_info.test_type may be a single enum value or an iterable.
        return test_info.test_type == test_type or (
            isinstance(test_info.test_type, Iterable)
            and test_type in test_info.test_type
        )
    matching_tests = {}
    for test_name, test_info in test_settings.items():
        # Check if the test has the right type & keep it if it does
        if matches_test_type(test_info, test_type):
            # Embedding tests need to have a conversion func in their test info
            if matches_test_type(test_info, VLMTestType.EMBEDDING):
                assert test_info.convert_assets_to_embeddings is not None
            # Custom test inputs need to explicitly define the mm limit/inputs
            if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
                assert test_info.custom_test_opts is not None and isinstance(
                    test_info.custom_test_opts, Iterable
                )
            # For all types besides custom inputs, we need a prompt formatter
            else:
                assert test_info.prompt_formatter is not None
            # Everything looks okay; keep if this is correct proc handling
            # (a configured distributed backend implies a fresh process).
            if (
                test_info.distributed_executor_backend is not None
            ) == new_proc_per_test:
                matching_tests[test_name] = test_info
    return matching_tests
def get_model_type_cases(
    model_type: str,
    test_info: VLMTestInfo,
    test_type: VLMTestType,
):
    """Expand one model type's ``VLMTestInfo`` into individual pytest params.

    This is essentially the same as nesting a bunch of mark.parametrize
    decorators, but we do it programmatically to allow overrides on a
    per-model basis, while still being able to execute each of these as
    individual test cases in pytest.
    """

    # Wrap a scalar into a 1-tuple so everything iterates uniformly.
    # (def instead of an assigned lambda, per PEP 8 / E731.)
    def ensure_wrapped(e):
        return e if isinstance(e, (list, tuple)) else (e,)

    iter_kwargs = OrderedDict(
        [
            ("model", ensure_wrapped(test_info.models)),
            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
            ("dtype", ensure_wrapped(test_info.dtype)),
            (
                "distributed_executor_backend",
                ensure_wrapped(test_info.distributed_executor_backend),
            ),
        ]
    )
    # num_frames is video only
    if test_type == VLMTestType.VIDEO:
        iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
        iter_kwargs["needs_video_metadata"] = ensure_wrapped(
            test_info.needs_video_metadata
        )
    # No sizes passed for custom inputs, since inputs are directly provided
    if test_type not in (
        VLMTestType.CUSTOM_INPUTS,
        VLMTestType.AUDIO,
    ):
        wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
        if wrapped_sizes is None:
            raise ValueError(f"Sizes must be set for test type {test_type}")
        iter_kwargs["size_wrapper"] = wrapped_sizes
    # Otherwise expand the custom test options instead
    elif test_type == VLMTestType.CUSTOM_INPUTS:
        if test_info.custom_test_opts is None:
            raise ValueError("Test has type CUSTOM_INPUTS, but none given")
        iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
    # Wrap all model cases in a pytest parameter & pass marks through.
    # itertools.product is iterated directly (no intermediate list), and
    # dict(zip(...)) replaces the equivalent dict comprehension.
    return [
        pytest.param(
            model_type,
            ExpandableVLMTestArgs(**dict(zip(iter_kwargs.keys(), case))),
            marks=test_info.marks if test_info.marks is not None else [],
        )
        for case in itertools.product(*iter_kwargs.values())
    ]
def get_parametrized_options(
    test_settings: dict[str, VLMTestInfo],
    test_type: VLMTestType,
    create_new_process_for_each_test: bool,
):
    """Converts all of our VLMTestInfo into an expanded list of parameters.

    This is similar to nesting pytest parametrize calls, but done directly
    through an itertools product so that each test can set things like
    size factors etc, while still running in isolated test cases.
    """
    matching_tests = get_filtered_test_settings(
        test_settings, test_type, create_new_process_for_each_test
    )
    # Expand each matching model type into its individual cases and flatten
    # them into one list so a single mark.parametrize call can consume them.
    expanded_cases = []
    for model_type, test_info in matching_tests.items():
        expanded_cases.extend(get_model_type_cases(model_type, test_info, test_type))
    return expanded_cases
def get_wrapped_test_sizes(
    test_info: VLMTestInfo, test_type: VLMTestType
) -> tuple[ImageSizeWrapper, ...]:
    """Given a test info which may have size factors or fixed sizes, wrap them
    and combine them into an iterable, each of which will be used in parameter
    expansion.

    Args:
        test_info: Test configuration to be expanded.
        test_type: The type of test being filtered for.
    """
    # Embedding tests always run at the constant embedding size factors.
    if test_type == VLMTestType.EMBEDDING:
        return tuple(
            ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
            for factor in EMBEDDING_SIZE_FACTORS
        )
    # Audio and custom-input tests come with preprocessed inputs; no sizes.
    if test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
        return ()
    size_factors = test_info.image_size_factors or []
    fixed_sizes = test_info.image_sizes or []
    wrapped = [
        ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
        for factor in size_factors
    ]
    wrapped += [
        ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
    ]
    return tuple(wrapped)

View File

@@ -0,0 +1,189 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Core test implementation to be shared across modalities."""
from collections.abc import Callable
from typing import Any
import torch
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
from vllm.tokenizers import TokenizerLike
from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS
from .types import PromptWithMultiModalInput, RunnerOutput
def run_test(
    *,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[PromptWithMultiModalInput],
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    enforce_eager: bool,
    max_model_len: int,
    max_num_seqs: int,
    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
    auto_cls: type[_BaseAutoModelClass],
    use_tokenizer_eos: bool,
    comparator: Callable[..., None],
    get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None,
    stop_str: list[str] | None,
    limit_mm_per_prompt: dict[str, int],
    vllm_runner_kwargs: dict[str, Any] | None,
    hf_model_kwargs: dict[str, Any] | None,
    patch_hf_runner: Callable[[HfRunner], HfRunner] | None,
    runner: RunnerOption = "auto",
    distributed_executor_backend: str | None = None,
    tensor_parallel_size: int = 1,
    vllm_embeddings: torch.Tensor | None = None,
):
    """Modality agnostic test executor for comparing HF/vLLM outputs.

    Generates greedy outputs with logprobs from vLLM and then from HF
    Transformers for the same inputs, applies the optional per-runner
    post-processors, and compares each batch via ``comparator``.

    Raises:
        pytest skip: If the model is unavailable online or the installed
            transformers version does not support it.
    """
    # In the case of embeddings, vLLM takes separate input tensors
    vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    limit_mm_per_prompt = default_limits | limit_mm_per_prompt
    vllm_outputs_per_mm = []
    hf_outputs_per_mm = []
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    # mm_processor_cache_gb=0 presumably disables the multimodal processor
    # cache so each test processes inputs fresh — confirm if changed.
    vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0}
    if model_info.tokenizer:
        vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
    if model_info.tokenizer_mode:
        vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
    if model_info.hf_overrides:
        vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
    if model_info.require_embed_inputs:
        for k in ("skip_tokenizer_init", "enable_prompt_embeds", "enable_mm_embeds"):
            vllm_runner_kwargs_[k] = model_info.require_embed_inputs
    # Per-test overrides take precedence over the registry-derived kwargs.
    if vllm_runner_kwargs:
        vllm_runner_kwargs_.update(vllm_runner_kwargs)
    with vllm_runner(
        model,
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        dtype=dtype,
        limit_mm_per_prompt=limit_mm_per_prompt,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=enforce_eager,
        runner=runner,
        **vllm_runner_kwargs_,
    ) as vllm_model:
        tokenizer = vllm_model.llm.get_tokenizer()
        vllm_kwargs: dict[str, Any] = {}
        if get_stop_token_ids is not None:
            vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
        if stop_str:
            vllm_kwargs["stop"] = stop_str
        for prompts, image_data, video_data, audio_data in vllm_inputs:
            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
            vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
            vllm_output = vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                **vllm_kwargs_with_mm_data,
            )
            vllm_outputs_per_mm.append(vllm_output)
    hf_model = hf_runner(
        model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
    )
    # Some models need to patch things like the model processor, e.g., internvl
    if patch_hf_runner is not None:
        hf_model = patch_hf_runner(hf_model)
    with hf_model, torch.no_grad():
        tokenizer = hf_model.tokenizer
        # Some models need to explicitly pass the eos_token_id off the tokenizer
        # or processor for a good comparison;
        # currently assume processor/tokenizer agree on the EOS, and pull it off
        # the tokenizer if requested.
        hf_kwargs = {}
        if use_tokenizer_eos:
            hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
        if stop_str:
            hf_kwargs["stop_strings"] = stop_str
        # HF always consumes the raw (non-embedding) inputs.
        for prompts, image_data, video_data, audio_data in inputs:
            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
            hf_kwargs_with_mm_data = hf_kwargs | mm_data
            hf_output = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                tokenizer=tokenizer,
                **hf_kwargs_with_mm_data,
            )
            hf_outputs_per_mm.append(hf_output)
    # Apply output processing / sanitation to the vLLM and HF runner results
    hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
        model,
        first_runner_outputs=hf_outputs_per_mm,
        second_runner_outputs=vllm_outputs_per_mm,
        first_runner_processor=hf_output_post_proc,
        second_runner_processor=vllm_output_post_proc,
    )
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
        # This is usually check_logprobs_close, but it's passed through to
        # allow things like check_outputs_equal where needed
        comparator(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
def process_runner_outputs(
    model,
    first_runner_outputs,
    second_runner_outputs,
    first_runner_processor=None,
    second_runner_processor=None,
):
    """Applies the runner processor(s) to the runner outputs, if any.

    Outputs whose processor is None are passed through untouched.
    """
    processed = []
    for processor, outputs in (
        (first_runner_processor, first_runner_outputs),
        (second_runner_processor, second_runner_outputs),
    ):
        if processor is None:
            processed.append(outputs)
        else:
            processed.append(process_outputs(processor, model, outputs))
    return tuple(processed)
def process_outputs(output_processor, model, outputs_per_image):
    """Applies a model specific post-processor function to a runner's output"""
    processed = []
    for outputs in outputs_per_image:
        processed.append([output_processor(res, model) for res in outputs])
    return processed

View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
from collections.abc import Callable
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs
from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
    """Builds inputs for multi-image (varied sizes/aspect ratio) testing.

    Args:
        formatter: model-specific prompt formatter.
    """
    stop_sign = IMAGE_ASSETS[0].pil_image
    cherry_blossom = IMAGE_ASSETS[1].pil_image
    # Apply the selected formatter to the base prompts
    img_prompts = [
        "<image><image>\nDescribe 2 images.",
        "<image><image>\nDescribe 2 images.",
        "<image><image><image><image>\nDescribe 4 images.",
        "<image>\nWhat is the season?",
    ]
    formatted_prompts = [formatter(prompt) for prompt in img_prompts]
    aspect_ratio_images = [
        [stop_sign, cherry_blossom],
        # Images with different sizes and aspect-ratios
        [
            rescale_image_size(stop_sign, 0.1),
            stop_sign,
        ],
        [
            stop_sign,
            rescale_image_size(stop_sign, 0.25),
            cherry_blossom.resize((183, 488)),
            cherry_blossom.resize((488, 183)),
        ],
        # Final case: a single image (not a list) for the single-image prompt
        cherry_blossom,
    ]
    return [
        PromptWithMultiModalInput(
            prompts=formatted_prompts,
            image_data=aspect_ratio_images,
        )
    ]
def multi_video_multi_aspect_ratio_inputs(
    formatter: Callable[[str], str], num_frames: int = 16
):
    """Builds inputs for multi-video (varied sizes/aspect ratio) testing.

    Args:
        formatter: model-specific prompt formatter.
        num_frames: number of frames to sample from the source video.
    """
    video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
    # Apply the selected formatter to the base prompts
    video_prompts = [
        "<video><video>\nDescribe 2 videos.",
        "<video><video>\nDescribe 2 videos.",
        "<video><video><video><video>\nDescribe 4 videos.",
        "<video>\nWhy is this video funny?",
    ]
    formatted_prompts = [formatter(prompt) for prompt in video_prompts]
    aspect_ratio_videos = [
        [video, video],
        # Videos with different sizes and aspect-ratios
        [
            rescale_video_size(video, 0.1),
            video,
        ],
        [
            video,
            rescale_video_size(video, 0.25),
            resize_video(video, (183, 488)),
            resize_video(video, (488, 183)),
        ],
        # Final case: a single video (not a list) for the single-video prompt
        video,
    ]
    return [
        PromptWithMultiModalInput(
            prompts=formatted_prompts,
            video_data=aspect_ratio_videos,
        )
    ]
def different_patch_input_cases_internvl():
    """Builds InternVL inputs that exercise different image patch counts by
    scaling 896x896 images with 0.5x / 1.0x size factors, in both single-
    and multi-image form."""
    images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
    formatter = (
        lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
    )
    single_img_prompts = [
        "<image>\nWhat's the content in the center of the image?",
        "<image>\nWhat is the season?",
    ]
    multi_img_prompts = [
        "Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n",  # noqa: E501
    ]
    formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
    formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]
    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
    return [
        build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
        build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
    ]
def windows_attention_image_qwen2_5_vl():
    """Builds a single-image Qwen2.5-VL input (0.5x scale) reproducing a
    window-attention regression case."""
    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
    image = ImageAsset("hato").pil_image
    question = "Describe the image."
    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
    prompt = (
        f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
    )
    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
    return build_single_image_inputs([image], [prompt], wrapped_sf)
def video_with_metadata_glm4_1v():
    """Builds GLM-4.1V video inputs where each entry carries the video's
    metadata alongside the (rescaled) frames."""
    video_array = VIDEO_ASSETS[0].np_ndarrays
    metadata = VIDEO_ASSETS[0].metadata
    question = "Describe the video."
    video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
    formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
    scales = [0.1, 0.2, 0.25]
    # NOTE(review): the same metadata dict object is shared across all scaled
    # entries — confirm downstream consumers do not mutate it.
    video_input = [
        [(rescale_video_size(video_array, scale), metadata)] for scale in scales
    ]
    prompts = [formatted_prompt] * len(video_input)
    return [
        PromptWithMultiModalInput(
            prompts=prompts,
            video_data=video_input,
        )
    ]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from pathlib import PosixPath
from .....conftest import (
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(
    *,
    tmp_path: PosixPath,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Builds single-image inputs for this test case and runs the core
    HF-vs-vLLM comparison with a limit of 1 image per prompt."""
    assert test_case.size_wrapper is not None
    inputs = builders.build_single_image_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper, tmp_path
    )
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_multi_image_test(
    *,
    tmp_path: PosixPath,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Builds multi-image inputs for this test case and runs the core
    HF-vs-vLLM comparison with one image slot per asset."""
    assert test_case.size_wrapper is not None
    inputs = builders.build_multi_image_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper, tmp_path
    )
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": len(image_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_embedding_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Builds paired image/embedding inputs and runs the core comparison,
    feeding vLLM the precomputed image embeddings."""
    assert test_case.size_wrapper is not None
    inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper
    )
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        vllm_embeddings=vllm_embeddings,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_video_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    video_assets: VideoTestAssets,
):
    """Builds video inputs (sampled to the requested frame count) and runs
    the core HF-vs-vLLM comparison with one video slot per asset."""
    assert test_case.size_wrapper is not None
    assert test_case.num_video_frames is not None
    inputs = builders.build_video_inputs_from_test_info(
        model_test_info,
        video_assets,
        test_case.size_wrapper,
        test_case.num_video_frames,
        test_case.needs_video_metadata,
    )
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"video": len(video_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_audio_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Builds audio inputs (resampled to 16 kHz by the builder) and runs
    the core HF-vs-vLLM comparison with a limit of 1 audio per prompt."""
    inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"audio": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )
def run_custom_inputs_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
):
    """Runs the core HF-vs-vLLM comparison on directly-provided inputs."""
    # Custom test cases can provide inputs directly, but they need to
    # explicitly provided a CustomTestConfig, which wraps the inputs and
    # the limit_mm_per_prompt
    assert test_case.custom_test_opts is not None
    inputs = test_case.custom_test_opts.inputs
    limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
    # Inputs and limit_mm_per_prompt should all be set
    assert inputs is not None
    assert limit_mm_per_prompt is not None
    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt=limit_mm_per_prompt,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs(),
    )

View File

@@ -0,0 +1,218 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Types for writing multimodal model tests."""
from collections.abc import Callable, Iterable
from enum import Enum
from pathlib import PosixPath
from typing import Any, NamedTuple
import torch
from pytest import MarkDecorator
from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
from vllm.logprobs import SampleLogprobs
from vllm.tokenizers import TokenizerLike
from .....conftest import (
AUDIO_ASSETS,
IMAGE_ASSETS,
HfRunner,
ImageAsset,
ImageTestAssets,
PromptAudioInput,
PromptImageInput,
PromptVideoInput,
)
from ....utils import check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model
TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
# Default per-asset base prompts; still carry the generic placeholders above.
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
        "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
    }
)
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
    {
        "mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
        "winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
    }
)
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
# Size-factor sets used for test expansion; each inner tuple is one case.
IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
# Embedding tests only support constant (1.0) scaling.
EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
# (token_ids, generated_text, optional per-token logprobs) from a runner.
RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
class PromptWithMultiModalInput(NamedTuple):
    """Holds the multimodal input for a single test case."""

    # Text prompts containing the model-specific multimodal placeholders.
    prompts: list[str]
    # Optional image/video/audio inputs; None for modalities the test
    # does not exercise.
    image_data: PromptImageInput | None = None
    video_data: PromptVideoInput | None = None
    audio_data: PromptAudioInput | None = None
class VLMTestType(Enum):
    """Category of multimodal test a model configuration can be run with."""

    IMAGE = 1
    MULTI_IMAGE = 2
    EMBEDDING = 3
    VIDEO = 4
    AUDIO = 5
    CUSTOM_INPUTS = 6
class SizeType(Enum):
    """How image sizes are parameterized: scale factors or fixed sizes."""

    SIZE_FACTOR = 1
    FIXED_SIZE = 2
class CustomTestOptions(NamedTuple):
    """Inputs and multimodal limits for a single CUSTOM_INPUTS test."""

    # Prompt/multimodal-data bundles to run for this custom test.
    inputs: list[PromptWithMultiModalInput]
    # Per-modality cap on multimodal items per prompt, forwarded to the
    # runner as limit_mm_per_prompt.
    limit_mm_per_prompt: dict[str, int]
class ImageSizeWrapper(NamedTuple):
    """Pairs an image-size parametrization type with its concrete values."""

    type: SizeType
    # A size factor is a wrapper of 0+ floats,
    # while a fixed size contains an iterable of integer pairs
    data: Iterable[float] | Iterable[tuple[int, int]]
class VLMTestInfo(NamedTuple):
    """Holds the configuration for 1+ tests for one model architecture."""

    models: list[str]
    test_type: VLMTestType | Iterable[VLMTestType]

    # Should be None only if this is a CUSTOM_INPUTS test
    prompt_formatter: Callable[[str], str] | None = None

    # Map a 0-based multimodal item index to its placeholder tag.
    img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
    video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
    audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"

    # The default single/multi-image prompts work for most models, but some
    # (e.g., paligemma) fail the log prob check with them; these overrides
    # exist so such models can supply their own prompts.
    single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
    multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT

    # Converts ImageAssets to image embeddings; must be set explicitly
    # for embedding tests.
    convert_assets_to_embeddings: (
        Callable[[ImageTestAssets], list[torch.Tensor]] | None
    ) = None

    # Options forwarded to the vLLM runner; several tests override these.
    # Defaults mirror VllmRunner / engine defaults and are picked to keep
    # CI runs from going OOM.
    enforce_eager: bool = True
    max_model_len: int = 1024
    max_num_seqs: int = 256
    runner: RunnerOption = "auto"
    tensor_parallel_size: int = 1
    vllm_runner_kwargs: dict[str, Any] | None = None

    # Optional callable which gets a list of token IDs from the model tokenizer
    get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None = None
    # Optional stop strings, for models whose stop tokens are not special
    # tokens in the tokenizer.
    stop_str: list[str] | None = None

    # Options forwarded to the HF runner.
    hf_model_kwargs: dict[str, Any] | None = None
    # When True, explicitly pass the tokenizer's EOS token through.
    use_tokenizer_eos: bool = False
    auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
    patch_hf_runner: Callable[[HfRunner], HfRunner] | None = None

    # Post processors that, if defined, run on the outputs of the vLLM and
    # HF runner respectively (useful for sanitization, etc.).
    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
    # Consumes both post-processed outputs and checks that they agree.
    comparator: Callable[..., None] = check_logprobs_close

    # Expandable params: the full test matrix for one entry is
    # .models x all of the fields below; instances may override the defaults.
    max_tokens: int = 128
    num_logprobs: int = 5
    dtype: str = "auto"
    distributed_executor_backend: str | None = None
    # Only expanded in video tests
    num_video_frames: int | tuple[int] = 16
    needs_video_metadata: bool = False

    # Fixed image sizes / image size factors; most tests use
    # image_size_factors. Values from both fields are stacked and expanded
    # so each model sees every size factor / fixed size once per test
    # (like concatenating them inside one parametrize call).
    image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
    image_sizes: Iterable[Iterable[tuple[int, int]]] | None = None

    # Hack for updating a prompt to take into a local path; currently only used
    # for Qwen-VL, which requires encoding the image path / url into the prompt
    # for HF runner
    prompt_path_encoder: (
        Callable[[PosixPath, str, list[ImageAsset] | ImageTestAssets], str] | None
    ) = None  # noqa: E501

    # Allows configuring a test to run with custom inputs
    custom_test_opts: list[CustomTestOptions] | None = None

    marks: list[MarkDecorator] | None = None

    def get_non_parametrized_runner_kwargs(self):
        """Returns a dictionary of expandable kwargs for items that are used
        in all test types, which are NOT used when creating the parametrized
        test cases.
        """
        # Field names shared by every test type; pulled off self by name so
        # the mapping stays in one obvious place.
        shared_fields = (
            "enforce_eager",
            "max_model_len",
            "max_num_seqs",
            "runner",
            "tensor_parallel_size",
            "vllm_runner_kwargs",
            "hf_output_post_proc",
            "vllm_output_post_proc",
            "auto_cls",
            "use_tokenizer_eos",
            "comparator",
            "get_stop_token_ids",
            "hf_model_kwargs",
            "stop_str",
            "patch_hf_runner",
        )
        return {name: getattr(self, name) for name in shared_fields}
class ExpandableVLMTestArgs(NamedTuple):
    """The expanded kwargs which correspond to a single test case."""

    # One entry from VLMTestInfo.models plus its expanded parameters.
    model: str
    max_tokens: int
    num_logprobs: int
    dtype: str
    distributed_executor_backend: str | None
    # Sizes are used for everything except for custom input tests
    size_wrapper: ImageSizeWrapper | None = None
    # Video only
    num_video_frames: int | None = None
    needs_video_metadata: bool = False
    # Custom inputs only
    custom_test_opts: CustomTestOptions | None = None