Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/v1/engine/test_process_multi_modal_uuids.py
+++ b/tests/v1/engine/test_process_multi_modal_uuids.py
@@ -0,0 +1,202 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.sampling_params import SamplingParams
+from vllm.v1.engine import input_processor as input_processor_mod
+from vllm.v1.engine.input_processor import InputProcessor
+
+cherry_pil_image = ImageAsset("cherry_blossom").pil_image
+stop_pil_image = ImageAsset("stop_sign").pil_image
+baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays
+
+
+def _mock_input_processor(
+    monkeypatch, *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True
+) -> InputProcessor:
+    """
+    Create a Processor instance with minimal configuration suitable for unit
+    tests without accessing external resources.
+    """
+    monkeypatch.setattr(
+        ModelConfig, "try_get_generation_config", lambda self: {}, raising=True
+    )
+    monkeypatch.setattr(
+        ModelConfig, "__post_init__", lambda self, *args: None, raising=True
+    )
+    monkeypatch.setattr(
+        ModelConfig,
+        "verify_with_parallel_config",
+        lambda self, parallel_config: None,
+        raising=True,
+    )
+    monkeypatch.setattr(
+        input_processor_mod,
+        "processor_cache_from_config",
+        lambda vllm_config, mm_registry: None,
+        raising=True,
+    )
+
+    monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
+
+    model_config = ModelConfig(
+        skip_tokenizer_init=True,
+        max_model_len=128,
+        mm_processor_cache_gb=mm_cache_gb,
+        generation_config="vllm",
+        tokenizer="dummy",
+    )
+
+    # Minimal multimodal_config to satisfy references in
+    # Processor.process_inputs.
+    class _MockMMConfig:
+        def __init__(self, gb: float):
+            self.mm_processor_cache_gb = gb
+
+    model_config.multimodal_config = _MockMMConfig(mm_cache_gb)  # type: ignore[attr-defined]
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
+        device_config=DeviceConfig(device="cpu"),
+    )
+
+    return InputProcessor(vllm_config, tokenizer=None)
+
+
+def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
+    input_processor = _mock_input_processor(monkeypatch)
+
+    prompt = {
+        "prompt": "USER: <image>\nDescribe\nASSISTANT:",
+        "multi_modal_data": {"image": [cherry_pil_image, stop_pil_image]},
+        # Mismatch: 2 items but only 1 uuid provided
+        "multi_modal_uuids": {"image": ["hash_cherry"]},
+    }
+
+    with pytest.raises(ValueError, match="must have same length as data"):
+        input_processor.process_inputs(
+            request_id="req-1",
+            prompt=prompt,  # type: ignore[arg-type]
+            params=SamplingParams(),
+        )
+
+
+def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
+    input_processor = _mock_input_processor(monkeypatch)
+
+    prompt = {
+        "prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
+        # Two modalities provided in data
+        "multi_modal_data": {
+            "image": [cherry_pil_image],
+            "video": [baby_reading_np_ndarrays],
+        },
+        # Only image uuids provided; video missing should raise
+        "multi_modal_uuids": {"image": ["hash_cherry"]},
+    }
+
+    with pytest.raises(ValueError, match="must be provided if multi_modal_data"):
+        input_processor.process_inputs(
+            request_id="req-2",
+            prompt=prompt,  # type: ignore[arg-type]
+            params=SamplingParams(),
+        )
+
+
+@pytest.mark.parametrize(
+    "mm_cache_gb, enable_prefix_caching",
+    [
+        (4.0, True),  # default behavior
+        (4.0, False),  # prefix caching disabled
+        (0.0, True),  # processor cache disabled
+    ],
+)
+def test_multi_modal_uuids_accepts_none_and_passes_through(
+    monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
+):
+    input_processor = _mock_input_processor(
+        monkeypatch,
+        mm_cache_gb=mm_cache_gb,
+        enable_prefix_caching=enable_prefix_caching,
+    )
+
+    # Capture the overrides passed to InputPreprocessor.preprocess
+    captured: dict[str, object] = {}
+
+    def fake_preprocess(
+        prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
+    ):
+        captured["mm_uuids"] = mm_uuids
+        # Minimal processed inputs for decoder-only flow
+        return {"type": "token", "prompt_token_ids": [1]}
+
+    # Monkeypatch only the bound preprocess method on this instance
+    monkeypatch.setattr(
+        input_processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
+    )
+
+    # Use a consistent two-image scenario across all configurations
+    mm_uuids = {"image": [None, "hash_stop"], "video": None}
+    prompt = {
+        "prompt": "USER: <image><image>\nTwo images\nASSISTANT:",
+        "multi_modal_data": {
+            "image": [cherry_pil_image, stop_pil_image],
+            "video": baby_reading_np_ndarrays,
+        },
+        "multi_modal_uuids": mm_uuids,
+    }
+
+    input_processor.process_inputs(
+        request_id="req-3",
+        prompt=prompt,  # type: ignore[arg-type]
+        params=SamplingParams(),
+    )
+
+    assert captured["mm_uuids"] == mm_uuids
+
+
+def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
+    # When both processor cache is 0 and prefix caching disabled, the
+    # processor builds overrides from request id instead of using user UUIDs.
+    input_processor = _mock_input_processor(
+        monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
+    )
+
+    captured: dict[str, object] = {}
+
+    def fake_preprocess(
+        prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
+    ):
+        captured["mm_uuids"] = mm_uuids
+        return {"type": "token", "prompt_token_ids": [1]}
+
+    monkeypatch.setattr(
+        input_processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
+    )
+
+    request_id = "req-42"
+    mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
+    prompt = {
+        "prompt": "USER: <image><image><video>\nDescribe\nASSISTANT:",
+        "multi_modal_data": {
+            "image": [cherry_pil_image, stop_pil_image],
+            "video": baby_reading_np_ndarrays,
+        },
+        "multi_modal_uuids": mm_uuids,
+    }
+
+    input_processor.process_inputs(
+        request_id=request_id,
+        prompt=prompt,  # type: ignore[arg-type]
+        params=SamplingParams(),
+    )
+
+    # Expect request-id-based overrides are passed through
+    assert captured["mm_uuids"] == {
+        "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
+        "video": [f"{request_id}-video-0"],
+    }