Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,125 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest.mock import MagicMock
import numpy as np
import pytest
import torch
from transformers import PretrainedConfig
from tests.models.registry import HF_EXAMPLE_MODELS
class MockAudioFlamingo3Config(PretrainedConfig):
model_type = "audioflamingo3"
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.audio_config = PretrainedConfig()
self.text_config = PretrainedConfig()
class MockAudioFlamingo3Processor:
def __init__(self):
self.audio_token = "<sound>"
self.audio_token_id = 12345
self.feature_extractor = MockFeatureExtractor()
def __call__(self, text=None, audios=None, **kwargs):
return {"input_ids": [1, 2, 3], "input_features": [np.zeros((3000, 80))]}
class MockFeatureExtractor:
def __init__(self):
self.sampling_rate = 16000
self.chunk_length = 30
@pytest.fixture
def mock_ctx():
config = MockAudioFlamingo3Config()
ctx = MagicMock()
ctx.get_hf_config.return_value = config
ctx.get_hf_processor.return_value = MockAudioFlamingo3Processor()
ctx.model_config.hf_config = config
return ctx
@pytest.fixture(autouse=True)
def check_transformers_version():
# Check if the model is supported by the current transformers version
model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
model_info.check_transformers_version(on_fail="skip")
def test_audio_chunk_counting(mock_ctx):
from vllm.model_executor.models.audioflamingo3 import (
AudioFlamingo3DummyInputsBuilder,
AudioFlamingo3MultiModalProcessor,
AudioFlamingo3ProcessingInfo,
)
info = AudioFlamingo3ProcessingInfo(mock_ctx)
processor = AudioFlamingo3MultiModalProcessor(
info, AudioFlamingo3DummyInputsBuilder(info)
)
sr = 16000
audio_1 = np.zeros(30 * sr)
audio_2 = np.zeros(45 * sr)
mm_data = {"audio": [audio_1, audio_2]}
prompt = "<|user|>Listen.<|end|>"
from vllm.multimodal.processing import BaseMultiModalProcessor
def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
return {"input_ids": [1, 2, 3], "input_features": torch.randn(1, 80, 3000)}
with pytest.MonkeyPatch.context() as mp:
mp.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)
processed = processor._call_hf_processor(prompt, mm_data, {}, {})
chunk_counts = processed["chunk_counts"]
assert chunk_counts[0].item() == 1
assert chunk_counts[1].item() == 2
assert len(chunk_counts) == 2
def test_dummy_data_generation(mock_ctx):
from vllm.model_executor.models.audioflamingo3 import (
AudioFlamingo3DummyInputsBuilder,
AudioFlamingo3ProcessingInfo,
)
info = AudioFlamingo3ProcessingInfo(mock_ctx)
builder = AudioFlamingo3DummyInputsBuilder(info)
mm_counts = {"audio": 2}
dummy_data = builder.get_dummy_mm_data(100, mm_counts, None)
assert "audio" in dummy_data
assert len(dummy_data["audio"]) == 2
expected_len = 600 * 16000
assert len(dummy_data["audio"][0]) == expected_len
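
# Illustrative sketch (not part of vLLM's implementation): the chunk counts
# asserted in test_audio_chunk_counting are consistent with splitting each clip
# into windows of the feature extractor's chunk_length (30 s here), i.e. a
# ceiling division of the clip duration by the chunk length. The helper name
# below is hypothetical.
import math

def _expected_chunk_count(num_samples: int, sr: int = 16000, chunk_s: int = 30) -> int:
    return math.ceil(num_samples / (sr * chunk_s))

assert _expected_chunk_count(30 * 16000) == 1  # 30 s of audio -> 1 chunk
assert _expected_chunk_count(45 * 16000) == 2  # 45 s of audio -> 2 chunks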

View File

@@ -0,0 +1,418 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Set as AbstractSet
from functools import partial
import numpy as np
import pytest
from mistral_common.protocol.instruct.chunk import ImageChunk, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tokenizers.mistral import MistralTokenizer
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import (
_MULTIMODAL_EXAMPLE_MODELS,
_TRANSFORMERS_BACKEND_MODELS,
HF_EXAMPLE_MODELS,
)
def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
"""
Patch the multimodal data for GLM4.1V model.
"""
# Ensure video metadata is included
if "video" in mm_data:
# GLM4.1V doesn't support multiple videos
video = mm_data["video"]
num_frames = len(video)
mm_data["video"] = (
video,
{
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
},
)
return mm_data
def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
"""
Patch the multimodal data for Qwen3-VL model.
"""
def create_metadata(frames: np.ndarray):
num_frames = len(frames)
return {
"total_num_frames": num_frames,
"fps": 2.0,
"duration": num_frames / 2.0,
"video_backend": "opencv",
"frames_indices": list(range(num_frames)),
"do_sample_frames": True,
}
# Ensure video metadata is included
if "video" in mm_data:
video = mm_data["video"]
if isinstance(video, list):
# multiple videos
mm_data["video"] = [(vid, create_metadata(vid)) for vid in video]
else:
# single video
mm_data["video"] = (video, create_metadata(video))
return mm_data
# For some multimodal models, the tokenizer always adds bos_token
# at the beginning of the prompt by default, causing hf_processor to output
# incorrect token ids. So we need to use `add_special_tokens=False` here
# and leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES = {
"ovis": False,
"ovis2_5": False,
"paligemma": False,
"ultravox": False,
"whisper": False,
}
_IGNORE_MM_KEYS = {
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
"ultravox": {"audio_features"},
}
MM_DATA_PATCHES = {
# GLM4.1V and Qwen3-VL require video metadata to be included in the input
"glm4v": glm4_1v_patch_mm_data,
"glm4v_moe": glm4_1v_patch_mm_data,
"qwen3_vl": qwen3_vl_patch_mm_data,
"qwen3_vl_moe": qwen3_vl_patch_mm_data,
}
def _iter_model_ids_to_test(model_arch_list: AbstractSet[str]):
for model_arch in model_arch_list:
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
yield model_info.default
for extra_type, extra_model_id in model_info.extras.items():
if "fp" in extra_type:
continue # Redundant to test quantized models
yield extra_model_id
def _get_model_ids_to_test(model_arch_list: AbstractSet[str]):
return list(_iter_model_ids_to_test(model_arch_list))
def get_model_ids_to_test():
transformers_arch_ids = {
model_id
for info in _TRANSFORMERS_BACKEND_MODELS.values()
for model_id in (info.default, *info.extras.values())
}
vllm_only_archs = {
arch
for arch, info in _MULTIMODAL_EXAMPLE_MODELS.items()
if not any(
model_id in transformers_arch_ids
for model_id in (info.default, *info.extras.values())
)
}
return _get_model_ids_to_test(vllm_only_archs)
def get_text_token_prompts(
processor: BaseMultiModalProcessor,
mm_data: MultiModalDataDict,
):
dummy_inputs = processor.dummy_inputs
tokenizer: TokenizerLike = processor.info.get_tokenizer()
model_config = processor.info.ctx.model_config
model_type = model_config.hf_config.model_type
if model_type in MM_DATA_PATCHES:
mm_data = MM_DATA_PATCHES[model_type](mm_data)
parsed_data = processor.data_parser.parse_mm_data(mm_data)
mm_counts = {k: len(vs) for k, vs in parsed_data.items()}
text_prompt: str | None
token_prompt: list[int]
if isinstance(tokenizer, MistralTokenizer):
images = parsed_data.get("image", [])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
res = tokenizer.mistral.encode_chat_completion(request)
# Mistral does not support decode_tokens with skip_special_tokens=False
text_prompt = None
token_prompt = res.tokens
else:
inputs = dummy_inputs.get_dummy_processor_inputs(
model_config.max_model_len,
mm_counts,
)
assert isinstance(inputs.prompt, str)
text_prompt = inputs.prompt
token_prompt = tokenizer.encode(
text_prompt,
add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
)
return text_prompt, token_prompt
def _test_processing_correctness(
model_id_or_arch: str,
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
if model_id_or_arch in HF_EXAMPLE_MODELS.get_supported_archs():
# Use model architecture to get the default model id
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_id_or_arch)
model_id = model_info.default
else:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
model_id = model_id_or_arch
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_config = ModelConfig(
model_id,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
# Ensure that the cache can fit all of the data
mm_processor_cache_gb=2048,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = model_cls._processor_factory
ctx = InputProcessingContext(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
cache = MultiModalProcessorOnlyCache(model_config)
processing_info = factories.info(ctx)
supported_mm_limits = processing_info.get_supported_mm_limits()
# Keep integer limits for local data generation
limit_mm_per_prompt_ints = {
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
if modality == "video":
return VideoDummyOptions(count=count)
if modality == "image":
return ImageDummyOptions(count=count)
if modality == "audio":
return AudioDummyOptions(count=count)
return BaseDummyOptions(count=count)
# Assign normalized DummyOptions to the model config
model_config.get_multimodal_config().limit_per_prompt = {
modality: _to_dummy_options(modality, count)
for modality, count in limit_mm_per_prompt_ints.items()
}
baseline_processor = factories.build_processor(ctx, cache=None)
cached_processor = factories.build_processor(ctx, cache=cache)
rng = np.random.RandomState(0)
input_to_hit = {
"image": Image.new("RGB", size=(128, 128)),
"video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
"audio": (np.zeros((512,)), 16000),
}
input_factory = {
"image": partial(random_image, rng, min_wh=128, max_wh=256),
"video": partial(
random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
),
"audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
}
for batch_idx in range(num_batches):
mm_data = {
k: [
(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))
]
for k, limit in limit_mm_per_prompt_ints.items()
}
# Drop unnecessary keys and test single -> multi conversion
if rng.rand() < simplify_rate:
for k in list(mm_data.keys()):
if not mm_data[k]:
del mm_data[k]
elif len(mm_data[k]) == 1:
mm_data[k] = mm_data[k][0]
_test_processing_correctness_one(
model_config,
mm_data,
baseline_processor,
cached_processor,
batch_idx,
)
def _test_processing_correctness_one(
model_config: ModelConfig,
mm_data: MultiModalDataDict,
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
):
model_type = model_config.hf_config.model_type
text_prompt, token_prompt = get_text_token_prompts(baseline_processor, mm_data)
ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
baseline_tokenized_result = baseline_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_tokenized_result = cached_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
_assert_inputs_equal(
baseline_tokenized_result,
cached_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {token_prompt=}, {mm_data=})",
)
if text_prompt is not None:
baseline_text_result = baseline_processor.apply(
text_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_text_result = cached_processor.apply(
text_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
_assert_inputs_equal(
baseline_text_result,
cached_text_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, {mm_data=})",
)
_assert_inputs_equal(
baseline_text_result,
baseline_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)
_assert_inputs_equal(
cached_text_result,
cached_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
def test_processing_correctness(
model_id: str,
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
if model_id == "google/gemma-3n-E2B-it":
pytest.skip("Fix later")
if model_id == "OpenGVLab/InternVL2-2B":
pytest.skip("Fix later")
if model_id == "jinaai/jina-reranker-m0":
pytest.skip("Fix later")
_test_processing_correctness(
model_id,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
)
def _assert_inputs_equal(
a: MultiModalInputs,
b: MultiModalInputs,
*,
ignore_mm_keys: set[str] | None = None,
msg: str = "",
):
if ignore_mm_keys is None:
ignore_mm_keys = set()
a_rest = {k: v for k, v in a.items() if k != "mm_kwargs"}
b_rest = {k: v for k, v in b.items() if k != "mm_kwargs"}
assert a_rest == b_rest, msg
a_data = a["mm_kwargs"].get_data()
b_data = b["mm_kwargs"].get_data()
for key in ignore_mm_keys:
a_data.pop(key, None)
b_data.pop(key, None)
assert batched_tensors_equal(a_data, b_data), msg
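
# For local debugging it can be handy to invoke the correctness check directly
# for a single model, bypassing the pytest parametrization, e.g. (a minimal
# sketch; any id returned by get_model_ids_to_test() works here):
#
#   _test_processing_correctness(
#       "Qwen/Qwen2-VL-2B-Instruct",
#       hit_rate=0.5,
#       num_batches=4,
#       simplify_rate=1.0,
#   )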

View File

@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"])
def test_get_image_size_with_most_features(
image_assets: ImageTestAssets, model_id: str
):
ctx = build_model_context(
model_id,
mm_processor_kwargs={"do_pan_and_scan": True},
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs: dict[str, object] = {}
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
max_image_size = processor.info.get_image_size_with_most_features()
max_tokens = processor.info.get_num_image_tokens(
image_width=max_image_size.width,
image_height=max_image_size.height,
processor=hf_processor,
)
prompt = "<start_of_image>"
image_seq_length = hf_processor.image_seq_length
for asset in image_assets:
mm_data = {"image": [asset.pil_image]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
mm_kwargs_data = processed_inputs["mm_kwargs"].get_data()
num_patches_tensor = mm_kwargs_data["num_patches"]
tokens = int(num_patches_tensor.item()) * image_seq_length
assert tokens <= max_tokens
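
# Sketch of the bound being checked above (the numbers are hypothetical): with
# pan-and-scan enabled each image yields num_patches crops, every crop costs
# image_seq_length tokens, and the product must never exceed the token count
# reported for the most feature-rich image size.
def _gemma3_image_tokens(num_patches: int, image_seq_length: int) -> int:
    return num_patches * image_seq_length

# e.g. 4 crops at 256 tokens per crop -> 1024 tokens for that image
assert _gemma3_image_tokens(4, 256) == 1024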

View File

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import batched_tensors_equal
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
@pytest.mark.parametrize(
"num_frames, fps, expected_grid_t",
[
# pre-sampled fixed frames (unexpected behavior,
# but we still expect it to work without errors)
(32, 1, 16),
(32, 2, 16),
(128, 1, 64),
(128, 2, 64),
# post-sampled frames (expected behavior)
(-1, 1, 5),
(-1, 2, 10),
],
)
def test_processor_override(
model_id: str,
expected_toks_per_frame: int,
expected_grid_t: int,
fps: int,
num_frames: int,
):
"""Ensure GLM4vMultiModalProcessor can handle video frames properly."""
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
tokenizer = processor.info.get_tokenizer()
hf_processor_mm_kwargs = {"fps": fps}
# Build the image str / prompt based on the number of images we pass
video_assets = VideoAsset(name="baby_reading", num_frames=num_frames)
prompt = "<|begin_of_video|><|video|><|end_of_video|>"
video, metadata = video_assets.np_ndarrays, video_assets.metadata
metadata["fps"] = fps
mm_data = {"video": [(video, metadata)]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per video frame
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]
assert grid_t == expected_grid_t
assert video_tok_count == expected_toks_per_frame * grid_t
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
model_id: str,
fps: int,
):
"""
Ensure that the dynamic video loader (pre-sampled by the loader) and the normal
video loader (post-sampled by the processor) produce the same processing outputs.
"""
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {"fps": fps}
# Build the image str / prompt based on the number of images we pass
prompt = "<|begin_of_video|><|video|><|end_of_video|>"
video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
with open(video_path, "rb") as f:
video_bytes = f.read()
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
video_bytes, fps=fps
)
# pre-sampled loader shouldn't read all frames
assert len(dynamic_video) < len(static_video)
static_mm_data = {"video": [(static_video, static_metadata)]}
dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
assert batched_tensors_equal(
static_outputs["mm_kwargs"].get_data(),
dynamic_outputs["mm_kwargs"].get_data(),
)
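
# Illustrative sketch (the temporal patch size of 2 is an assumption here, not
# read from the GLM-4.1V config): the expected_grid_t values in
# test_processor_override are consistent with grid_t == sampled_frames // 2.
# Pre-sampled inputs keep their fixed frame count (32 -> 16, 128 -> 64), while
# num_frames=-1 lets the processor sample frames itself at the requested fps.
def _expected_grid_t(num_sampled_frames: int, temporal_patch_size: int = 2) -> int:
    return num_sampled_frames // temporal_patch_size

assert _expected_grid_t(32) == 16
assert _expected_grid_t(128) == 64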

View File

@@ -0,0 +1,177 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.h2ovl import (
calculate_h2ovl_targets,
get_h2ovl_target_ratios,
)
width, height = image.size
# Calculate the expected number of blocks
if num_imgs == 1 and config.use_msac:
# First pass
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num=1,
max_num=max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False, # Thumbnail is handled separately
)
# Second pass
blocks2, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num=3,
max_num=max_num,
prior_aspect_ratio=aspect_ratio,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if config.use_thumbnail:
blocks1 += 1 if blocks1 > 1 else 0
blocks2 += 1 if blocks2 > 1 else 0
# Total blocks is the sum of blocks from both passes minus the
# overlapping thumbnail
total_blocks = blocks1 + blocks2 - 1
return total_blocks
blocks, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
prompt = "<image>" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize(
"model_id",
[
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
],
)
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: ImageTestAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool | None,
kwargs_on_init: bool,
):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
_run_check(
processor,
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,
)
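
# Worked example of the MSAC accounting in _get_expected_num_patches (the block
# counts below are hypothetical, chosen only to illustrate the formula): each
# pass contributes its blocks plus one thumbnail when it produced more than one
# block, and the two thumbnails overlap, hence the trailing "- 1".
def _msac_total(blocks1: int, blocks2: int, use_thumbnail: bool = True) -> int:
    if use_thumbnail:
        blocks1 += 1 if blocks1 > 1 else 0
        blocks2 += 1 if blocks2 > 1 else 0
    return blocks1 + blocks2 - 1

# e.g. 6 blocks in the first pass and 4 in the second -> 7 + 5 - 1 = 11 patches
assert _msac_total(6, 4) == 11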

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Idefics3's multimodal preprocessing kwargs."""
import pytest
from transformers import Idefics3Config
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"size": {"longest_edge": 364}}, 169),
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Idefics3MultiModalProcessor handles num_crops properly."""
# The mm_processor_kwargs are either set on the model config at init time or
# passed at apply() time; in both cases the processor should expand them
# correctly when processing the inputs.
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
image_size = ctx.get_hf_config(Idefics3Config).vision_config.image_size
dummy_image_size = (image_size * 4, image_size * 4)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure the placeholder format is correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
assert img_tok_count == expected_toks_per_img * num_imgs
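
# Reading of the parametrization above (a sketch, not the Idefics3 processor
# code): each 364-pixel sub-image costs 169 tokens, so doubling longest_edge to
# 728 splits the image into a 2x2 grid of tiles plus the global image, i.e.
# 169 * (2**2 + 1) tokens per image; at 364 only the single global image
# remains.
def _idefics3_tokens(grid_side: int, toks_per_tile: int = 169) -> int:
    if grid_side == 1:
        return toks_per_tile
    return toks_per_tile * (grid_side**2 + 1)

assert _idefics3_tokens(1) == 169
assert _idefics3_tokens(2) == 169 * (2**2 + 1)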

View File

@@ -0,0 +1,131 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for InternVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.internvl import (
calculate_internvl_targets,
get_internvl_target_ratios,
)
width, height = image.size
blocks, _, _ = calculate_internvl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_internvl_target_ratios(
min_num,
max_num,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
prompt = "<image>" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"])
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: ImageTestAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool | None,
kwargs_on_init: bool,
):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
_run_check(
processor,
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,
)
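
# Simplified sketch of the dynamic-tiling idea behind the expected patch counts
# (this is not the exact calculate_internvl_targets / get_internvl_target_ratios
# logic): enumerate candidate (cols, rows) grids with min_num <= cols * rows
# <= max_num, pick the grid whose aspect ratio is closest to the image's, and
# append a thumbnail patch when more than one block is used.
def _approx_num_patches(
    width: int, height: int, min_num: int, max_num: int, use_thumbnail: bool = True
) -> int:
    grids = [
        (c, r)
        for c in range(1, max_num + 1)
        for r in range(1, max_num + 1)
        if min_num <= c * r <= max_num
    ]
    cols, rows = min(grids, key=lambda cr: abs(width / height - cr[0] / cr[1]))
    blocks = cols * rows
    if use_thumbnail and blocks > 1:
        blocks += 1
    return blocks

# A square image with max_num=1 stays a single patch (no thumbnail appended).
assert _approx_num_patches(448, 448, min_num=1, max_num=1) == 1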

View File

@@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Llama4's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
@pytest.mark.parametrize("num_imgs", [1, 5])
@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
@pytest.mark.parametrize("tokenized_prompt", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict,
num_imgs: int,
mm_processor_cache_gb: int,
tokenized_prompt: bool,
):
"""Ensure llama4 processor works properly."""
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt={"image": num_imgs},
mm_processor_cache_gb=mm_processor_cache_gb,
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
config = processor.info.get_hf_config()
tokenizer = processor.info.get_tokenizer()
hf_processor = processor.info.get_hf_processor()
vocab = tokenizer.get_vocab()
prompt = (
"<|begin_of_text|><|header_start|>user<|header_end|>"
+ "<|image|>" * num_imgs
+ "<|eot|><|header_start|>assistant<|header_end|>"
)
mm_data = {
"image": [
image_assets[(i % len(image_assets))].pil_image for i in range(num_imgs)
]
}
if tokenized_prompt:
prompt = tokenizer.encode(prompt)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
mm_data = processed_inputs["mm_kwargs"].get_data()
# placeholder replacements
prompt_token_ids = processed_inputs["prompt_token_ids"]
assert prompt_token_ids.count(config.boi_token_index) == num_imgs
assert prompt_token_ids.count(config.eoi_token_index) == num_imgs
assert prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs
aspect_ratios = mm_data["aspect_ratios"]
num_x_separators = num_y_separators = 0
for tiles_y, tiles_x in aspect_ratios:
if tiles_x * tiles_y > 1:
num_x_separators += (tiles_x - 1) * tiles_y
num_y_separators += tiles_y
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) == num_x_separators
assert (
prompt_token_ids.count(vocab[hf_processor.tile_global_token])
== num_y_separators
)
# image token offsets
img_locs = processed_inputs["mm_placeholders"].get("image", [])
assert len(img_locs) == num_imgs
assert [img_loc.offset for img_loc in img_locs] == [
i for i, v in enumerate(prompt_token_ids) if v == config.boi_token_index
]
# patch sizes and masks
num_patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
assert (
prompt_token_ids.count(config.image_token_index)
== sum(mm_data["patches_per_image"]) * num_patches_per_chunk
)
assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])

View File

@@ -0,0 +1,194 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from functools import partial
import pytest
from PIL import Image
from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from ...utils import build_model_context
def _validate_image_max_tokens_one(
processor: BaseMultiModalProcessor,
max_tokens: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
except Exception as exc:
failed_size_excs.append((image_size, exc))
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
info = processor.info
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()
# The aspect ratio of the grid layout is between 1 and 2
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(32, 4096), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
failed_size_excs = list[tuple[ImageSize, Exception]]()
validate_one = partial(
_validate_image_max_tokens_one,
processor,
info.get_max_image_tokens(), # type: ignore
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
prompt = "<image>" * num_imgs
image = Image.new("RGB", size=image_size)
mm_data = {"image": [image] * num_imgs}
try:
# The processor will throw an error if there is a mismatch
# in the prompt replacements
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs
first_placeholder = image_placeholders[0]
# NOTE: There is a BOS token
assert first_placeholder.offset == 1
assert (
first_placeholder.length
== (len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
processor,
*,
num_imgs: int,
image_sizes: list[ImageSize],
) -> None:
"""
Ensure LlavaNextMultiModalProcessor
handles prompt replacement properly for input images.
"""
failed_size_excs = list[tuple[ImageSize, Exception]]()
validate_one = partial(
_validate_image_prompt_replacements_one,
processor,
num_imgs,
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()
# The aspect ratio of the grid layout is between 1 and 2
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(64, 1024), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)

View File

@@ -0,0 +1,192 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from functools import partial
import pytest
from PIL import Image
from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from ...utils import build_model_context
def _validate_image_max_tokens_one(
processor: BaseMultiModalProcessor,
max_tokens: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
except Exception as exc:
failed_size_excs.append((image_size, exc))
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
info = processor.info
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()
# The aspect ratio of the grid layout is between 1 and 6
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(32, 4096), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
failed_size_excs = list[tuple[ImageSize, Exception]]()
validate_one = partial(
_validate_image_max_tokens_one,
processor,
info.get_max_image_tokens(), # type: ignore
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
prompt = "<image>" * num_imgs
image = Image.new("RGB", size=image_size)
mm_data = {"image": [image] * num_imgs}
try:
# The processor will throw an error if there is a mismatch
# in the prompt replacements
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs
first_placeholder = image_placeholders[0]
assert first_placeholder.offset == 0
assert (
first_placeholder.length
== len(processed_inputs["prompt_token_ids"]) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
processor,
*,
num_imgs: int,
image_sizes: list[ImageSize],
) -> None:
"""
Ensure LlavaOnevisionMultiModalProcessor
handles prompt replacement properly for input images.
"""
failed_size_excs = list[tuple[ImageSize, Exception]]()
validate_one = partial(
_validate_image_prompt_replacements_one,
processor,
num_imgs,
failed_size_excs,
)
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()
# The aspect ratio of the grid layout is between 1 and 6
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(64, 1024), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)

View File

@@ -0,0 +1,105 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from PIL import Image
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
num_imgs: int,
):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
prompt = "<image>" * num_imgs
image = Image.new("RGB", size=(364, 364))
mm_data = {"image": [image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs
def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor,
num_imgs: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
) -> None:
prompt = "<image>" * num_imgs
image = Image.new("RGB", size=image_size)
mm_data = {"image": [image] * num_imgs}
try:
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs
except Exception as exc:
failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
processor,
*,
num_imgs: int,
image_sizes: list[ImageSize],
) -> None:
failed_size_excs = list[tuple[ImageSize, Exception]]()
for size in image_sizes:
_validate_image_prompt_replacements_one(
processor, num_imgs, failed_size_excs, size
)
if failed_size_excs:
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)

View File

@@ -0,0 +1,72 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for mllama's multimodal preprocessing and profiling."""
import pytest
from torch import prod
from transformers import Llama4Config
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.profiling import MultiModalProfiler
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-Guard-4-12B"])
@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
def test_profiling(model_id: str, max_model_len: int):
model_config_kwargs = {
"max_model_len": max_model_len,
}
mm_counts = {"image": 1}
ctx = build_model_context(
model_id,
model_config_kwargs=model_config_kwargs,
limit_mm_per_prompt=mm_counts,
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
profiler = MultiModalProfiler(processor)
decoder_dummy_data = profiler.get_decoder_dummy_data(
max_model_len,
mm_counts=mm_counts,
)
dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
max_model_len,
mm_counts=mm_counts,
)
hf_config = ctx.get_hf_config(Llama4Config)
mm_data = processor.apply(
prompt=dummy_mm_data.prompt,
mm_data=dummy_mm_data.mm_data,
hf_processor_mm_kwargs=dict(),
)["mm_kwargs"].get_data()
image_size = hf_config.vision_config.image_size
patch_size = hf_config.vision_config.patch_size
downsample_ratio = int(
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
)
tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio
chunks_per_image = prod(mm_data["patches_per_image"])
total_num_patches = chunks_per_image * tokens_per_patch
num_tiles = (
mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
) # x-y separator tokens
total_tokens = (
total_num_patches.item() + num_tiles.item() + 3
) # image start, image, image end
profiled_tokens = profiler.get_mm_max_tokens(
max_model_len,
mm_counts=mm_counts,
)
assert total_num_patches == profiled_tokens["image"]
assert total_tokens == sum(
placeholder.length
for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
)
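
# Worked example of the per-chunk token arithmetic above with hypothetical
# vision-config values (not read from the Llama-Guard-4 checkpoint): an
# image_size of 336, patch_size of 14 and pixel_shuffle_ratio of 0.5 give
# (336 // 14) ** 2 == 576 raw patches, a downsample_ratio of
# round(1 / 0.5 ** 2) == 4, and hence 576 // 4 == 144 tokens per chunk.
_image_size, _patch_size, _pixel_shuffle_ratio = 336, 14, 0.5
_downsample_ratio = int(round(1.0 / (_pixel_shuffle_ratio**2)))
assert ((_image_size // _patch_size) ** 2) // _downsample_ratio == 144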

View File

@@ -0,0 +1,133 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.nemotron_vl import (
calculate_nemotron_vl_targets,
get_nemotron_vl_target_ratios,
)
width, height = image.size
blocks, _, _ = calculate_nemotron_vl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_nemotron_vl_target_ratios(
min_num,
max_num,
),
image_size=config.force_image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
image_processor = processor.info.get_image_processor()
config.use_thumbnail = image_processor.use_thumbnail
prompt = "<image>" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images
)
print(total_expected_num_patches)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<image>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: ImageTestAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool | None,
kwargs_on_init: bool,
):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
_run_check(
processor,
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,
)

View File

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi3v's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"num_crops": 4}, 757),
({"num_crops": 16}, 1921),
# the default num_crops of phi-3.5-vision is 4
({}, 757),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Phi3VMultiModalProcessor handles num_crops properly."""
# Avoid initializing CUDA early
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
assert img_tok_count == expected_toks_per_img * num_imgs

View File

@@ -0,0 +1,60 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi4mm's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"dynamic_hd": 4}, 1329),
({"dynamic_hd": 16}, 4433),
# the default dynamic_hd of phi-4-multimodal is 36
({}, 9585),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Phi4MMMultiModalProcessor handles dynamic_hd properly."""
# Avoid initializing CUDA early
from vllm.model_executor.models.phi4mm import _IMAGE_PLACEHOLDER_TOKEN_ID
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
image_size = ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
dummy_image_size = (image_size * 7, image_size * 7)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(
_IMAGE_PLACEHOLDER_TOKEN_ID
)
assert img_tok_count == expected_toks_per_img * num_imgs

View File

@@ -0,0 +1,90 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
[
({}, 1426, (5704, 1176)),
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
expected_pixels_shape: tuple[int, int],
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
tokenizer = processor.info.get_tokenizer()
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders for each min/max pixels setting
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape
assert img_tok_count == expected_toks_per_img * num_imgs
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
assert pixel_shape[1] == expected_pixels_shape[1]
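# Every real asset should need fewer image tokens than the maximum derived
# from get_image_size_with_most_features().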
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
@pytest.mark.parametrize("max_pixels", [1280 * 28 * 28, 1283 * 28 * 28])
def test_get_image_size_with_most_features(
image_assets: ImageTestAssets,
model_id: str,
max_pixels: int,
):
ctx = build_model_context(
model_id,
mm_processor_kwargs={"max_pixels": max_pixels},
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs: dict[str, object] = {}
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
merge_size = processor.info.get_hf_config().vision_config.spatial_merge_size
max_image_size = processor.info.get_image_size_with_most_features()
max_tokens = processor.info.get_num_image_tokens(
image_width=max_image_size.width,
image_height=max_image_size.height,
image_processor=hf_processor.image_processor,
)
prompt = "<|vision_start|><|image_pad|><|vision_end|>"
for asset in image_assets:
mm_data = {"image": [asset.pil_image]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
grid_thw = processed_inputs["mm_kwargs"].get_data()["image_grid_thw"].tolist()
t, h, w = grid_thw[0]
tokens = (t * h * w) // (merge_size**2)
assert tokens < max_tokens

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for smolvlm's multimodal preprocessing kwargs."""
import pytest
from transformers import SmolVLMConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"max_image_size": {"longest_edge": 384}}, 1377),
({"max_image_size": {"longest_edge": 768}}, 405),
],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Idefics3MultiModalProcessor handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
image_size = ctx.get_hf_config(SmolVLMConfig).vision_config.image_size
dummy_image_size = (image_size * 4, image_size * 4)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders for each max_image_size setting
image_token_id = ctx.get_hf_config().image_token_id
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
assert img_tok_count == expected_toks_per_img * num_imgs

View File

@@ -0,0 +1,258 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from collections.abc import Iterable
from contextlib import contextmanager
from functools import partial
from typing import Any, TypeAlias
import numpy as np
import pytest
import torch
import torch.nn as nn
from PIL import Image
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.distributed import (
cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.models.interfaces import (
SupportsMultiModal,
supports_multimodal,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.platforms import current_platform
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype
from ....utils import create_new_process_for_each_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import dummy_hf_overrides
from .test_common import get_model_ids_to_test, get_text_token_prompts
ImageInput = list[Image.Image]
VideoInput: TypeAlias = (
list[Image.Image] | list[np.ndarray] | list[tuple[np.ndarray, dict[str, Any]]]
)
AudioInput = list[tuple[np.ndarray, int]]
MM_OPTIONS_OVERRIDES = {
# Qwen3-VL's default profiling video size (64x64) can become too small to
# process after the size factors are applied, so we override it here for testing.
"qwen3_vl": dict(
video=VideoDummyOptions(num_frames=128, width=256, height=256),
),
"qwen3_vl_moe": dict(
video=VideoDummyOptions(num_frames=128, width=256, height=256),
),
}
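# Shrink a single multimodal item (image, video as PIL frames or ndarray,
# or 1-D audio array) by the given size factor.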
def _resize_data(
_data: Image.Image | np.ndarray, size_factor: float
) -> Image.Image | np.ndarray:
assert size_factor <= 1, "Size factor must not exceed 1"
# Image input
if isinstance(_data, Image.Image):
W, H = _data.width, _data.height
W, H = map(lambda x: int(x * size_factor), (W, H))
return _data.resize((W, H))
# Video input with PIL Images
elif is_list_of(_data, Image.Image):
W, H = next(iter(_data)).width, next(iter(_data)).height
T = len(_data)
T, W, H = map(lambda x: max(int(x * size_factor), 1), (T, W, H))
return [d.resize((W, H)) for d in _data[:T]]
# Video input with numpy arrays
elif isinstance(_data, np.ndarray) and _data.ndim >= 4:
T, H, W, C = _data.shape[-4:]
T, H, W = map(lambda x: max(int(x * size_factor), 1), (T, H, W))
return _data[..., :T, :H, :W, :C]
# Audio input
elif isinstance(_data, np.ndarray) and _data.ndim == 1:
return _data[: int(len(_data) * size_factor)]
raise AssertionError("This line should be unreachable.")
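# Apply a per-item size factor to each entry of a list of multimodal data;
# tuple entries (e.g. audio paired with a sampling rate) are unpacked first.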
def resize_mm_data(
data: ImageInput | VideoInput | AudioInput, size_factors: tuple[float, ...]
) -> ImageInput | VideoInput | AudioInput:
size_factors = size_factors[: len(data)]
if is_list_of(data, (Image.Image, np.ndarray, list)):
return [_resize_data(d, s) for d, s in zip(data, size_factors)]
elif is_list_of(data, tuple):
return [_resize_data(d, s) for (d, _), s in zip(data, size_factors)]
raise ValueError("Unsupported multimodal data type.")
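# Generate dummy inputs for every supported modality, shrink them by the given
# size factors, run the processor, and group the resulting mm_kwargs by
# modality for batched feeding into the model.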
def create_batched_mm_kwargs(
model_cls: type[SupportsMultiModal],
model_config: ModelConfig,
processor: BaseMultiModalProcessor,
size_factors: tuple[float, ...] = (1.0, 0.5, 0.25),
) -> Iterable[tuple[str, int, BatchedTensorInputs]]:
model_type = model_config.hf_config.model_type
processing_info = processor.info
dummy_inputs = processor.dummy_inputs
supported_mm_limits = processing_info.get_supported_mm_limits()
mm_counts = {
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
processor_inputs = dummy_inputs.get_dummy_processor_inputs(
seq_len=model_config.max_model_len,
mm_counts=mm_counts,
mm_options=MM_OPTIONS_OVERRIDES.get(model_type),
)
mm_data = processor_inputs.mm_data
resized_mm_data = {
modality: resize_mm_data(data, size_factors)
for modality, data in mm_data.items()
}
# video metadata will be added back to the resized video data here.
text_prompt, token_prompt = get_text_token_prompts(processor, resized_mm_data)
mm_kwargs = processor.apply(
prompt=token_prompt if text_prompt is None else text_prompt,
mm_data=resized_mm_data,
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
tokenization_kwargs=processor_inputs.tokenization_kwargs,
)["mm_kwargs"].require_data()
return group_mm_kwargs_by_modality(
[item for modality in supported_mm_limits for item in mm_kwargs[modality]]
)
# TODO(Isotr0py): Don't initialize model during test
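# Spin up a single-rank distributed environment, build the model on the
# current platform's device without loading checkpoint weights, and tear
# everything down on exit.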
@contextmanager
def initialize_dummy_model(
model_cls: type[nn.Module],
model_config: ModelConfig,
):
temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)
initialize_model_parallel(tensor_model_parallel_size=1)
current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config)
with set_current_vllm_config(vllm_config=vllm_config):
with set_default_torch_dtype(model_config.dtype):
torch.set_default_device(current_platform.device_type)
model = model_cls(vllm_config=vllm_config)
torch.set_default_device(current_device)
yield model
del model
cleanup_dist_env_and_memory()
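# For each registered multimodal model: instantiate it with dummy config
# overrides, generate processor outputs at several input sizes, and feed them
# through every input-parsing method to validate its tensor schema.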
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
def test_model_tensor_schema(model_id: str):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_arch = next(
arch for arch, info in HF_EXAMPLE_MODELS.hf_models.items() if info == model_info
)
hf_overrides_fn = partial(
dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides,
)
# ROCm: Detect if model uses AWQ quantization and set appropriate dtype
if "awq" in model_id.lower() and current_platform.is_rocm():
dtype = "float16"
else:
dtype = model_info.dtype
model_config = ModelConfig(
model_id,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=hf_overrides_fn,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
assert supports_multimodal(model_cls)
factories = model_cls._processor_factory
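# Heuristically collect methods whose return annotation mentions "Input";
# these are treated as the model's multimodal input-parsing methods.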
inputs_parse_methods = []
for attr_name in dir(model_cls):
attr = getattr(model_cls, attr_name)
if hasattr(attr, "__annotations__"):
return_type = attr.__annotations__.get("return", None)
if return_type is not None and "Input" in str(return_type):
inputs_parse_methods.append(attr_name)
if not inputs_parse_methods:
pytest.skip(f"{model_arch} does not support tensor schema validation.")
ctx = InputProcessingContext(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
processing_info = factories.info(ctx)
supported_mm_limits = processing_info.get_supported_mm_limits()
limit_mm_per_prompt = {
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
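# Wrap each per-modality limit in the matching dummy-options class so it can
# be assigned to the multimodal config's limit_per_prompt.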
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
if modality == "video":
return VideoDummyOptions(count=count)
if modality == "image":
return ImageDummyOptions(count=count)
if modality == "audio":
return AudioDummyOptions(count=count)
return BaseDummyOptions(count=count)
model_config.get_multimodal_config().limit_per_prompt = {
modality: _to_dummy_options(modality, count)
for modality, count in limit_mm_per_prompt.items()
}
processor = factories.build_processor(ctx, cache=None)
with initialize_dummy_model(model_cls, model_config) as model:
for modality, _, mm_kwargs in create_batched_mm_kwargs(
model_cls, model_config, processor
):
for method_name in inputs_parse_methods:
print(
f"Testing `{method_name}` with modality={modality} "
f"and mm_kwargs{list(mm_kwargs.keys())}"
)
getattr(model, method_name)(modality=modality, **mm_kwargs)

View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
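# With the Transformers backend (model_impl="transformers"), a string prompt
# and its pre-tokenized form should produce identical prompt_token_ids.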
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_multimodal_processor(model_id):
model_config = ModelConfig(
model=model_id,
model_impl="transformers",
)
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
image_pil = ImageAsset("cherry_blossom").pil_image
mm_data = {"image": image_pil}
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
str_processed_inputs = mm_processor.apply(
prompt=str_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
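# Token-ID form of the same prompt; 151646 stands in for the <image> placeholder.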
ids_prompt = [
151644,
872,
220,
151646,
198,
3838,
374,
279,
2213,
315,
419,
2168,
30,
151645,
151644,
77091,
198,
]
ids_processed_inputs = mm_processor.apply(
prompt=ids_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
assert (
str_processed_inputs["prompt_token_ids"]
== ids_processed_inputs["prompt_token_ids"]
)