Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/models/multimodal/pooling/test_clip.py
+++ b/tests/models/multimodal/pooling/test_clip.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import CLIPModel
+
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ...utils import check_embeddings_close
+
+HF_TEXT_PROMPTS = [
+    "a photo of a stop sign",
+    "a photo of a cherry blossom",
+]
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "",
+        "cherry_blossom": "",
+    }
+)
+
+MODELS = ["openai/clip-vit-base-patch32"]
+
+
+def _run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    input_texts: list[str],
+    input_images: PromptImageInput,
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    with vllm_runner(
+        model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
+    ) as vllm_model:
+        vllm_outputs = vllm_model.embed(input_texts, images=input_images)
+
+    with hf_runner(model, dtype=dtype, auto_cls=CLIPModel) as hf_model:
+        all_inputs = hf_model.get_inputs(input_texts, images=input_images)
+
+        all_outputs = []
+        for inputs in all_inputs:
+            inputs = hf_model.wrap_device(inputs)
+
+            if "pixel_values" in inputs:
+                pooled_output = hf_model.model.get_image_features(
+                    pixel_values=inputs.pixel_values,
+                ).squeeze(0)
+            else:
+                pooled_output = hf_model.model.get_text_features(
+                    input_ids=inputs.input_ids,
+                    attention_mask=inputs.attention_mask,
+                ).squeeze(0)
+
+            all_outputs.append(pooled_output.tolist())
+
+        hf_outputs = all_outputs
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_models_text(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,  # type: ignore
+        model,
+        dtype=dtype,
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_models_image(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    input_texts_images = [
+        (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+    ]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,
+        model,
+        dtype=dtype,
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_models_text_image_no_crash(
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    texts = [HF_TEXT_PROMPTS[0]]
+    images = [image_assets[0].pil_image]
+
+    with vllm_runner(
+        model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
+    ) as vllm_model:
+        with pytest.raises(ValueError, match="not both"):
+            vllm_model.embed(texts, images=images)
+
+        # Should still be able to run subsequent requests
+        vllm_model.embed(texts)
+        vllm_model.embed([""], images=images)