add qwen3
This commit is contained in:
0
vllm-v0.6.2/tests/multimodal/__init__.py
Normal file
0
vllm-v0.6.2/tests/multimodal/__init__.py
Normal file
95
vllm-v0.6.2/tests/multimodal/test_inputs.py
Normal file
95
vllm-v0.6.2/tests/multimodal/test_inputs.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
|
||||
|
||||
|
||||
def assert_nested_tensors_equal(expected: NestedTensors,
                                actual: NestedTensors):
    """Recursively assert that two NestedTensors values are equal.

    A NestedTensors value is either a torch.Tensor or an (arbitrarily
    nested) sequence of them; the comparison mirrors that structure.
    """
    assert type(expected) == type(actual)  # noqa: E721
    if isinstance(expected, torch.Tensor):
        assert torch.equal(expected, actual)
    else:
        # zip() silently truncates to the shorter sequence, which would
        # hide missing or extra items; check the lengths explicitly first.
        assert len(expected) == len(actual)
        for expected_item, actual_item in zip(expected, actual):
            assert_nested_tensors_equal(expected_item, actual_item)
def assert_multimodal_inputs_equal(expected: MultiModalKwargs,
                                   actual: MultiModalKwargs):
    """Assert that two multimodal kwargs mappings hold identical contents."""
    expected_keys = set(expected.keys())
    actual_keys = set(actual.keys())
    # The key sets must match exactly before comparing per-key values.
    assert expected_keys == actual_keys
    for key in expected_keys:
        assert_nested_tensors_equal(expected[key], actual[key])
def test_multimodal_input_batch_single_tensor():
    """Batching a single item should add a leading batch dimension."""
    tensor = torch.rand([1, 2])
    batched = MultiModalKwargs.batch([{"image": tensor}])
    assert_multimodal_inputs_equal(batched, {"image": tensor.unsqueeze(0)})
def test_multimodal_input_batch_multiple_tensors():
    """Same-shape items should be stacked into one batched tensor."""
    tensors = [torch.rand([1, 1, 2]) for _ in range(3)]
    batched = MultiModalKwargs.batch([{"image": t} for t in tensors])
    assert_multimodal_inputs_equal(batched, {"image": torch.stack(tensors)})
def test_multimodal_input_batch_multiple_heterogeneous_tensors():
    """Mismatched shapes cannot be stacked; batching yields a list."""
    tensors = [torch.rand([1, dim, 2]) for dim in (2, 3, 4)]
    batched = MultiModalKwargs.batch([{"image": t} for t in tensors])
    assert_multimodal_inputs_equal(batched, {"image": tensors})
def test_multimodal_input_batch_nested_tensors():
    """Single-element lists of same-shape tensors batch into one tensor."""
    tensors = [torch.rand([2, 3]) for _ in range(3)]
    batched = MultiModalKwargs.batch([{"image": [t]} for t in tensors])
    # Each one-element list becomes a [1, 2, 3] tensor; these then stack.
    expected = torch.stack([t.unsqueeze(0) for t in tensors])
    assert_multimodal_inputs_equal(batched, {"image": expected})
def test_multimodal_input_batch_heterogeneous_lists():
    """Lists of different lengths batch into a list, not a tensor."""
    first = torch.rand([1, 2, 3])
    second = torch.rand([1, 2, 3])
    third = torch.rand([1, 2, 3])
    batched = MultiModalKwargs.batch([{
        "image": [first, second]
    }, {
        "image": [third]
    }])
    # The two-element list stacks; the one-element list gains a batch dim.
    assert_multimodal_inputs_equal(
        batched,
        {"image": [torch.stack([first, second]), third.unsqueeze(0)]})
def test_multimodal_input_batch_multiple_batchable_lists():
    """Equal-length lists of same-shape tensors batch into one tensor."""
    a, b, c, d = (torch.rand([1, 2, 3]) for _ in range(4))
    batched = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}])
    expected = torch.stack([torch.stack([a, b]), torch.stack([c, d])])
    assert_multimodal_inputs_equal(batched, {"image": expected})
def test_multimodal_input_batch_mixed_stacking_depths():
    """Heterogeneous shapes force list results at the appropriate depth."""
    small = torch.rand([1, 2, 3])
    medium = torch.rand([1, 3, 3])
    large = torch.rand([1, 4, 3])

    # The pair can't be stacked (shape mismatch), so it stays a raw list;
    # the singleton still gains its batch dimension.
    batched = MultiModalKwargs.batch([{
        "image": [small, medium]
    }, {
        "image": [large]
    }])
    assert_multimodal_inputs_equal(
        batched, {"image": [[small, medium], large.unsqueeze(0)]})

    # Same expectation with the singleton first.
    batched = MultiModalKwargs.batch([{
        "image": [small]
    }, {
        "image": [medium, large]
    }])
    assert_multimodal_inputs_equal(
        batched, {"image": [small.unsqueeze(0), [medium, large]]})
162
vllm-v0.6.2/tests/multimodal/test_mapper.py
Normal file
162
vllm-v0.6.2/tests/multimodal/test_mapper.py
Normal file
@@ -0,0 +1,162 @@
|
||||
from contextlib import nullcontext
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from transformers import CLIPImageProcessor, LlavaNextImageProcessor
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.multimodal import MultiModalRegistry
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
|
||||
|
||||
@pytest.fixture
def mm_registry():
    """Provide a fresh MultiModalRegistry instance for each test."""
    return MultiModalRegistry()
@pytest.mark.parametrize("dtype", ["half", "float"])
@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
    """Check vLLM's default image mapper against HF's CLIPImageProcessor.

    For each test image (rescaled by ``size_factor``), the tensors produced
    by vLLM's input mapper must match HF preprocessing key-for-key.
    """
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"

    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, CLIPImageProcessor)

    model_config = ModelConfig(
        model=MODEL_NAME,
        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
        limit_mm_per_prompt={"image": 1},
    )

    mm_registry.init_mm_limits_per_prompt(model_config)

    for asset in image_assets:
        image = rescale_image_size(asset.pil_image, size_factor)

        # Reference output straight from the HF processor.
        hf_result = hf_processor.preprocess(
            image,
            return_tensors="pt",
        )
        # Output from vLLM's registry-dispatched mapper for the same image.
        vllm_result = mm_registry.map_input(
            model_config,
            {"image": image},
        )

        # Same output keys, and numerically matching tensors per key.
        assert hf_result.keys() == vllm_result.keys()
        for key, hf_tensor in hf_result.items():
            hf_arr: np.ndarray = hf_tensor.numpy()
            vllm_arr: np.ndarray = vllm_result[key].numpy()

            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
@pytest.mark.skip("Not support llava-v1.6-vicuna-7b-hf model yet.")
@pytest.mark.parametrize("dtype", ["half", "float"])
@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
def test_llava_next_image_processor(image_assets, mm_registry, dtype,
                                    size_factor):
    """Check vLLM's image mapper against HF's LlavaNextImageProcessor.

    Currently skipped (model not supported); same comparison scheme as
    ``test_clip_image_processor``.
    """
    MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"

    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, LlavaNextImageProcessor)

    model_config = ModelConfig(
        model=MODEL_NAME,
        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
        limit_mm_per_prompt={"image": 1},
    )

    mm_registry.init_mm_limits_per_prompt(model_config)

    for asset in image_assets:
        image = rescale_image_size(asset.pil_image, size_factor)

        # Reference output straight from the HF processor.
        hf_result = hf_processor.preprocess(
            image,
            return_tensors="pt",
        )
        # Output from vLLM's registry-dispatched mapper for the same image.
        vllm_result = mm_registry.map_input(
            model_config,
            {"image": image},
        )

        # Same output keys, and numerically matching tensors per key.
        assert hf_result.keys() == vllm_result.keys()
        for key, hf_tensor in hf_result.items():
            hf_arr: np.ndarray = hf_tensor.numpy()
            vllm_arr: np.ndarray = vllm_result[key].numpy()

            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
@pytest.mark.parametrize(
    ("num_images", "limit", "is_valid"),
    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
     (2, 1, False), (2, 2, True)],
)
def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
    """Mapping must raise ValueError iff num_images exceeds the limit."""
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"

    model_config = ModelConfig(
        model=MODEL_NAME,
        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="half",
        revision=None,
        limit_mm_per_prompt={"image": limit},
    )

    mm_registry.init_mm_limits_per_prompt(model_config)

    image = image_assets[0].pil_image
    # Build the mm input in the three shapes callers can pass:
    # absent, single item, or a list of items.
    if num_images == 0:
        mm_inputs = {}
    elif num_images == 1:
        mm_inputs = {"image": image}
    else:
        mm_inputs = {"image": [image] * num_images}

    # nullcontext when valid; otherwise the limit violation must raise.
    with nullcontext() if is_valid else pytest.raises(ValueError):
        mm_registry.map_input(model_config, mm_inputs)
# NOTE: We don't test zero images since the HF processor doesn't support it
@pytest.mark.parametrize("num_images", [1, 2])
def test_image_mapper_multi(image_assets, mm_registry, num_images):
    """The mapper should produce one pixel_values entry per input image."""
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"

    model_config = ModelConfig(
        model=MODEL_NAME,
        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="half",
        revision=None,
        # Allow exactly as many images as we are about to pass.
        limit_mm_per_prompt={"image": num_images},
    )

    mm_registry.init_mm_limits_per_prompt(model_config)

    image = image_assets[0].pil_image
    mm_inputs = {"image": [image] * num_images}

    mapped_inputs = mm_registry.map_input(model_config, mm_inputs)
    assert len(mapped_inputs["pixel_values"]) == num_images
383
vllm-v0.6.2/tests/multimodal/test_processor_kwargs.py
Normal file
383
vllm-v0.6.2/tests/multimodal/test_processor_kwargs.py
Normal file
@@ -0,0 +1,383 @@
|
||||
from array import array
|
||||
from typing import Callable, Dict, Mapping, Optional
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext,
|
||||
InputRegistry, ProcessorInputs, token_inputs)
|
||||
from vllm.multimodal import MultiModalRegistry
|
||||
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
|
||||
|
||||
from ..models.utils import build_model_context
|
||||
|
||||
# Used for fast tests where the model doesn't matter
|
||||
DUMMY_MODEL_ID = "facebook/opt-125m"
|
||||
# Used for tests that need a multimodal model
|
||||
MULTIMODAL_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
|
||||
|
||||
# For mm_processor_kwargs - we test overrides by defining mocks for each place
|
||||
# it is used, and ensuring that we can pass processor kwargs an override value
|
||||
# to receive the intended result for things like sequence length etc.
|
||||
DEFAULT_NUM_CROPS = 4
|
||||
NUM_CROPS_OVERRIDE = 16
|
||||
|
||||
|
||||
# Mocks for all of the places that we use the mm_processor_kwargs
|
||||
# to override values in different callables
|
||||
@pytest.fixture
def use_processor_mock():
    """Patches the internal model input processor with an override callable."""

    # The mock echoes its (possibly overridden) num_crops back through
    # mm_processor_kwargs so tests can observe which value was applied.
    def custom_processor(ctx: InputContext,
                         inputs: DecoderOnlyInputs,
                         *,
                         num_crops=DEFAULT_NUM_CROPS):
        # For testing purposes, we don't worry about the prompt
        return token_inputs(prompt_token_ids=[],
                            mm_processor_kwargs={"num_crops": num_crops})

    with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor",
               return_value=custom_processor):
        yield
||||
@pytest.fixture
def use_dummy_data_mock():
    """Patches the internal model input processor with an override callable."""

    # The mock encodes num_crops as the dummy sequence length, so tests can
    # assert on len(seq_data) to see which kwargs value took effect.
    def custom_dummy_data_factory(self,
                                  ctx: InputContext,
                                  seq_len: int,
                                  mm_counts: Mapping[str, int],
                                  *,
                                  num_crops=DEFAULT_NUM_CROPS):
        seq_data = SequenceData(
            array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * num_crops))
        return DummyData(seq_data, None)

    with patch(
            "vllm.inputs.registry.InputRegistry._default_dummy_data_factory",
            custom_dummy_data_factory):
        yield
||||
# Lazy import to avoid CUDA reinitialization error
def mm_model_cls():
    """Return the Phi3V model class, imported lazily at call time."""
    from vllm.model_executor.models.phi3v import Phi3VForCausalLM

    return Phi3VForCausalLM
||||
|
||||
# lambda whose signature matches max token calcs extra & mapper + extra kwargs
# get_num_crops: stand-in for a max-token calculator; echoes num_crops back.
get_num_crops = lambda ctx, *, num_crops=DEFAULT_NUM_CROPS: num_crops
# custom_mapper: stand-in input mapper; encodes num_crops in the second
# dimension of pixel_values (num_crops + 1) so tests can assert on the shape.
custom_mapper = lambda ctx, data, *, num_crops=DEFAULT_NUM_CROPS: {
    "pixel_values": torch.zeros(size=(1, num_crops + 1, 3, 336, 336))
}
||||
### Tests for default processor logic & mm_processor_kwargs wrapping
def test_default_processor_is_a_noop():
    """Ensure that by default, there is no processor override."""
    dummy_registry = InputRegistry()
    ctx = build_model_context(DUMMY_MODEL_ID)
    processor = dummy_registry.create_input_processor(ctx.model_config)
    proc_inputs = token_inputs(prompt_token_ids=[], prompt="")
    proc_outputs = processor(inputs=proc_inputs)
    # Identity check (not equality): the default processor must pass the
    # inputs through unchanged.
    assert proc_inputs is proc_outputs
||||
def _get_num_crops_info(init_num_crops: int, inference_num_crops: int):
    """Get the init / inference kwargs and expected num_crops for this test.

    Returns a (init_kwargs, inference_kwargs, expected_seq_count) triple;
    a kwargs dict is None whenever the corresponding override is None.
    """
    # Build kwarg dicts only for overrides that were actually provided.
    init_kwargs = (None if init_num_crops is None else {
        "num_crops": init_num_crops
    })
    inference_kwargs = (None if inference_num_crops is None else {
        "num_crops": inference_num_crops
    })

    # Precedence: inference-time override wins, then init-time, then default.
    if inference_num_crops is not None:
        expected_seq_count = inference_num_crops
    elif init_num_crops is not None:
        expected_seq_count = init_num_crops
    else:
        expected_seq_count = DEFAULT_NUM_CROPS

    return init_kwargs, inference_kwargs, expected_seq_count
||||
def _get_processed_num_crops(
    processor: Callable[[ProcessorInputs], ProcessorInputs],
    inference_kwargs: Optional[Dict[str, int]],
) -> int:
    """Run the processor on empty token inputs and extract its num_crops.

    Relies on the mocked processor echoing num_crops back through
    mm_processor_kwargs.
    """
    processed_inputs = processor(
        token_inputs(prompt_token_ids=[],
                     prompt="",
                     mm_processor_kwargs=inference_kwargs))

    assert "type" in processed_inputs
    assert processed_inputs["type"] == "token"
    assert "mm_processor_kwargs" in processed_inputs
    return processed_inputs["mm_processor_kwargs"]["num_crops"]
||||
@pytest.mark.parametrize("init_num_crops,inference_num_crops", [
    (None, None),
    (NUM_CROPS_OVERRIDE, None),
    (DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE),
])
def test_input_processor_kwargs(use_processor_mock, init_num_crops,
                                inference_num_crops):
    """Ensure input processors can use processor kwargs."""
    dummy_registry = InputRegistry()

    init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info(
        init_num_crops, inference_num_crops)

    # Init-time kwargs are baked into the model config; inference-time
    # kwargs are passed per call and take precedence.
    ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs)
    processor = dummy_registry.create_input_processor(ctx.model_config)
    num_crops_val = _get_processed_num_crops(processor, inference_kwargs)

    assert num_crops_val == expected_seq_count
||||
@pytest.mark.parametrize(
    "mm_processor_kwargs",
    [
        # Not part of the signature
        {
            "does_not_exist": 100
        },
        # Part of the signature, not keyword only
        {
            "ctx": "something bad"
        }
    ])
def test_processor_with_sad_kwarg_overrides(use_processor_mock,
                                            mm_processor_kwargs):
    """Ensure that input processors filter out invalid mm_processor_kwargs"""
    dummy_registry = InputRegistry()
    # Should filter out the init time kwargs
    ctx = build_model_context(DUMMY_MODEL_ID,
                              mm_processor_kwargs=mm_processor_kwargs)

    processor = dummy_registry.create_input_processor(ctx.model_config)
    # Should filter out the inference time kwargs
    num_crops_val = _get_processed_num_crops(processor, mm_processor_kwargs)
    # With every override rejected, the mock's default must win.
    assert num_crops_val == DEFAULT_NUM_CROPS
||||
### Test overrides for the dummy data
@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE])
def test_dummy_data_kwarg_overrides(use_dummy_data_mock, num_crops):
    """Ensure dummy data factories can use processor kwargs."""
    mm_processor_kwargs = None if num_crops is None else {
        "num_crops": num_crops
    }
    expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops
    dummy_registry = InputRegistry()
    ctx = build_model_context(DUMMY_MODEL_ID,
                              mm_processor_kwargs=mm_processor_kwargs)
    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)

    # NOTE: seq_len is thrown away here since this will leverage the
    # default dummy data factory that we have patched in, whose seq
    # len is solely dependent on the value of the mm_processor_kwargs.
    dummy_data = dummy_registry.dummy_data_for_profiling(
        ctx.model_config, seq_len=-1, mm_registry=mm_registry)
    assert len(dummy_data.seq_data.prompt_token_ids) == expected_seq_count
||||
@pytest.mark.parametrize(
    "mm_processor_kwargs",
    [
        # Not part of the signature
        {
            "does_not_exist": 100
        },
        # Part of the signature, not keyword only
        {
            "ctx": "something bad"
        }
    ])
def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock,
                                             mm_processor_kwargs):
    """Ensure the dummy data factory filters out invalid mm_processor_kwargs"""
    dummy_registry = InputRegistry()
    ctx = build_model_context(DUMMY_MODEL_ID,
                              mm_processor_kwargs=mm_processor_kwargs)
    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)

    # NOTE: seq_len is thrown away here since this will leverage the
    # default dummy data factory that we have patched in, whose seq
    # len is solely dependent on the value of the mm_processor_kwargs.
    dummy_data = dummy_registry.dummy_data_for_profiling(
        ctx.model_config, seq_len=-1, mm_registry=mm_registry)
    # Invalid kwargs must be dropped, so the default value takes effect.
    assert len(dummy_data.seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS
||||
### Test overrides for the max token count per multimodal instance
@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE])
def test_max_tokens_kwarg_overrides(num_crops):
    """Ensure max token calcs can use processor kwargs."""
    mm_processor_kwargs = None if num_crops is None else {
        "num_crops": num_crops
    }
    expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops

    ctx = build_model_context(MULTIMODAL_MODEL_ID,
                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs=mm_processor_kwargs,
                              limit_mm_per_prompt={"image": 1})

    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
    # Patch the image registry for phi3v with our lambda that is compatible
    # with overrides, then ensure that calling the method correctly echos
    # our num_crops value back from the mm_processor_kwargs.
    with patch.object(
            mm_registry._get_plugin("image"),
            "_max_mm_tokens",
            {mm_model_cls(): get_num_crops},
    ):
        max_multimodal_tokens = mm_registry.get_max_multimodal_tokens(
            ctx.model_config)

    assert expected_seq_count == max_multimodal_tokens
|
||||
@pytest.mark.parametrize(
    "mm_processor_kwargs",
    [
        # Not part of the signature
        {
            "does_not_exist": 100
        },
        # Part of the signature, not keyword only
        {
            "ctx": "something bad"
        }
    ])
def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):
    """Ensure that max token calcs filters out invalid mm_processor_kwargs"""
    ctx = build_model_context(MULTIMODAL_MODEL_ID,
                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs=mm_processor_kwargs,
                              limit_mm_per_prompt={"image": 1})

    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)

    # Similar before, but since these kwargs get filtered,
    # we always get our default value back.
    with patch.object(
            mm_registry._get_plugin("image"),
            "_max_mm_tokens",
            {mm_model_cls(): get_num_crops},
    ):
        max_multimodal_tokens = mm_registry.get_max_multimodal_tokens(
            ctx.model_config)

    assert max_multimodal_tokens == DEFAULT_NUM_CROPS
||||
### Test overrides for the mapper
@pytest.mark.parametrize("num_crops", [DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE])
def test_default_mapper_with_processor_kwargs(image_assets, num_crops):
    """Ensure that the mapper processor kwargs can fall back to HF models."""
    # NOTE - we don't validate bad inputs for the default mapper, because it's
    # through the automodel interface in transformers, so we can't easily
    # inspect what kwargs are or are not allowed.
    ctx = build_model_context(MULTIMODAL_MODEL_ID,
                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs={"num_crops": num_crops},
                              limit_mm_per_prompt={"image": 1})

    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)

    image = image_assets[0].pil_image
    mm_inputs = {"image": image}

    mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs)
    # Phi3v pixel vals should have shape: [batch, num_crops+1, 3, 336, 336]
    assert mapped_inputs["pixel_values"].shape[1] == num_crops + 1
||||
@pytest.mark.parametrize("init_num_crops,inference_num_crops", [
    (None, None),
    (NUM_CROPS_OVERRIDE, None),
    (DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE),
])
def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops,
                                       inference_num_crops):
    """Ensure custom mappers can use processor kwargs."""
    init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info(
        init_num_crops, inference_num_crops)

    ctx = build_model_context(MULTIMODAL_MODEL_ID,
                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs=init_kwargs,
                              limit_mm_per_prompt={"image": 1})

    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
    image = image_assets[0].pil_image
    mm_inputs = {"image": image}

    # Patch the image registry for phi3v with our lambda that is compatible
    # with overrides, then ensure that calling the method correctly echos
    # our num_crops value back from the mm_processor_kwargs.
    mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
        mm_model_cls())
    mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs,
                                          inference_kwargs)

    # custom_mapper encodes num_crops as (num_crops + 1) in dim 1.
    assert mapped_inputs["pixel_values"].shape[1] == expected_seq_count + 1
||||
@pytest.mark.parametrize(
    "mm_processor_kwargs",
    [
        # Not part of the signature
        {
            "does_not_exist": 100
        },
        # Part of the signature, not keyword only
        {
            "ctx": "something bad"
        }
    ])
def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
                                                mm_processor_kwargs):
    """Ensure that custom mappers filters out invalid mm_processor_kwargs"""
    # Should filter out the init time kwargs
    ctx = build_model_context(MULTIMODAL_MODEL_ID,
                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs=mm_processor_kwargs,
                              limit_mm_per_prompt={"image": 1})

    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
    image = image_assets[0].pil_image
    mm_inputs = {"image": image}

    # Patch the image registry for phi3v with our lambda that is compatible
    # with overrides, then ensure that calling the method correctly echos
    # our num_crops value back from the mm_processor_kwargs.
    mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
        mm_model_cls())
    # Should filter out the inference time kwargs
    mapped_inputs = mm_registry.map_input(
        ctx.model_config, mm_inputs, mm_processor_kwargs=mm_processor_kwargs)

    # All overrides rejected, so the default num_crops shapes the output.
    assert mapped_inputs["pixel_values"].shape[1] == DEFAULT_NUM_CROPS + 1
183
vllm-v0.6.2/tests/multimodal/test_utils.py
Normal file
183
vllm-v0.6.2/tests/multimodal/test_utils.py
Normal file
@@ -0,0 +1,183 @@
|
||||
import base64
|
||||
import mimetypes
|
||||
import os
|
||||
from tempfile import NamedTemporaryFile, TemporaryDirectory
|
||||
from typing import Dict, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from PIL import Image, ImageChops
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
from vllm.multimodal.utils import (async_fetch_image, fetch_image,
|
||||
repeat_and_pad_placeholder_tokens)
|
||||
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
|
||||
@pytest.fixture(scope="module")
def url_images() -> Dict[str, Image.Image]:
    """Fetch every test image once per module, keyed by its URL."""
    return {image_url: fetch_image(image_url) for image_url in TEST_IMAGE_URLS}
||||
def get_supported_suffixes() -> Tuple[str, ...]:
    """Return every image file suffix the fetch utilities should accept."""
    # We should at least test the file types mentioned in GPT-4 with Vision
    openai_suffixes = ('.png', '.jpeg', '.jpg', '.webp', '.gif')

    # Additional file types that are supported by us
    extra_suffixes = ('.bmp', '.tiff')

    return openai_suffixes + extra_suffixes
||||
def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """True iff the two images have identical pixel data.

    *b* is converted to *a*'s mode first so e.g. RGB/RGBA copies of the
    same picture still compare equal.
    """
    b_normalized = b.convert(a.mode)
    return (np.asarray(a) == np.asarray(b_normalized)).all()
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_fetch_image_http(image_url: str):
    """Sync and async HTTP fetch must return the same image."""
    image_sync = fetch_image(image_url)
    image_async = await async_fetch_image(image_url)
    assert _image_equals(image_sync, image_async)
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
                                  image_url: str, suffix: str):
    """Round-trip each test image through a base64 data URL per suffix."""
    url_image = url_images[image_url]

    # Resolve the MIME type for the suffix via PIL first, then the stdlib;
    # skip the case if neither knows it.
    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip('No MIME type')

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            # RGBA sources can't be written as JPEG; that's expected.
            if e.args[0] == 'cannot write mode RGBA as JPEG':
                pytest.skip('Conversion not supported')

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image_sync = fetch_image(data_url)
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image_sync)
        else:
            pass  # Lossy format; only check that image can be opened

        # Async decoding of the same data URL must agree with the sync path.
        data_image_async = await async_fetch_image(data_url)
        assert _image_equals(data_image_sync, data_image_async)
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_fetch_image_local_files(image_url: str):
    """file:// fetching works inside allowed_local_media_path and is
    rejected (ValueError) for paths escaping it or with no allowlist."""
    with TemporaryDirectory() as temp_dir:
        origin_image = fetch_image(image_url)
        origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)),
                          quality=100,
                          icc_profile=origin_image.info.get('icc_profile'))

        image_async = await async_fetch_image(
            f"file://{temp_dir}/{os.path.basename(image_url)}",
            allowed_local_media_path=temp_dir)

        image_sync = fetch_image(
            f"file://{temp_dir}/{os.path.basename(image_url)}",
            allowed_local_media_path=temp_dir)
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()

        # Path traversal out of the allowed directory must be rejected.
        with pytest.raises(ValueError):
            await async_fetch_image(
                f"file://{temp_dir}/../{os.path.basename(image_url)}",
                allowed_local_media_path=temp_dir)
        # Local files are also rejected when no allowlist is configured.
        with pytest.raises(ValueError):
            await async_fetch_image(
                f"file://{temp_dir}/../{os.path.basename(image_url)}")

        with pytest.raises(ValueError):
            fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}",
                        allowed_local_media_path=temp_dir)
        with pytest.raises(ValueError):
            fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}")
|
||||
@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_repeat_and_pad_placeholder_tokens(model):
    """Placeholder tokens are repeated per-occurrence and their ranges
    reported; extra repeat counts beyond the placeholders are ignored."""
    config = AutoConfig.from_pretrained(model)
    image_token_id = config.image_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)

    # (prompt, repeat_count, expected_prompt, expected_token_ids,
    #  expected_ranges) — repeat_count may be a scalar or a per-placeholder
    # list; ranges are offsets/lengths into the new token sequence.
    test_cases = [
        (
            "<image>",
            2,
            "<image><image>",
            [32000, 32000],
            [{ "offset": 0, "length": 2 }],
        ),
        (
            "<image><image>",
            2,
            "<image><image><image>",
            [32000, 32000, 32000],
            [{ "offset": 0, "length": 2 }]),
        (
            "<image><image>",
            [3, 2],
            "<image><image><image><image><image>",
            [32000, 32000, 32000, 32000, 32000],
            [{ "offset": 0, "length": 3 }, { "offset": 3, "length": 2 }],
        ),
        (
            "Image:<image>Image:<image>!",
            [3, 2],
            "Image:<image><image><image>Image:<image><image>!",
            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
            [{ "offset": 2, "length": 3 }, { "offset": 7, "length": 2 }],
        ),
        (
            "<image>",
            [3, 2],
            "<image><image><image>",
            [32000, 32000, 32000],
            [{ "offset": 0, "length": 3 }],
        ),
    ]  # yapf: disable

    for (
            prompt,
            repeat_count,
            expected_prompt,
            expected_token_ids,
            expected_ranges,
    ) in test_cases:
        new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
            tokenizer=tokenizer,
            prompt=prompt,
            prompt_token_ids=tokenizer.encode(prompt,
                                              add_special_tokens=False),
            placeholder_token_id=image_token_id,
            repeat_count=repeat_count,
        )
        assert new_prompt == expected_prompt
        assert new_token_ids == expected_token_ids
        assert ranges == expected_ranges
Reference in New Issue
Block a user