add qwen3

Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions


@@ -0,0 +1,187 @@
"""Tests for Idefics3's multimodal preprocessing kwargs."""
from typing import Optional
import pytest
import torch
import transformers
from transformers import AutoImageProcessor, AutoTokenizer
from vllm.inputs import InputContext, token_inputs
from vllm.multimodal import MultiModalRegistry
from .....conftest import _ImageAssets
from ....utils import build_model_context
models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
# Wrap lazy imports to avoid initializing CUDA during test collection
@pytest.fixture()
def input_processor_for_idefics3():
from vllm.model_executor.models.idefics3 import (
input_processor_for_idefics3)
return input_processor_for_idefics3
@pytest.fixture()
def dummy_data_for_idefics3():
from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3
return dummy_data_for_idefics3
@pytest.fixture()
def get_max_idefics3_image_tokens():
from vllm.model_executor.models.idefics3 import (
get_max_idefics3_image_tokens)
return get_max_idefics3_image_tokens
@pytest.mark.skipif(transformers.__version__ < "4.46.0",
reason="Model introduced in HF >= 4.46.0")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336])
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
longest_edge: Optional[int]):
"""Ensure that the [default] input mapper handles size properly."""
mm_processor_kwargs = {
"size": {
"longest_edge": longest_edge
}
} if longest_edge is not None else {}
ctx = build_model_context(
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs,
)
hf_processor = AutoImageProcessor.from_pretrained(model,
trust_remote_code=True,
**mm_processor_kwargs)
mm_registry = MultiModalRegistry()
mm_registry.init_mm_limits_per_prompt(ctx.model_config)
image = image_assets[0].pil_image
hf_result = hf_processor.preprocess(
image,
return_tensors="pt",
)
vllm_result = mm_registry.map_input(
ctx.model_config,
{"image": image},
)
assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
@pytest.mark.skipif(transformers.__version__ < "4.46.0",
reason="Model introduced in HF >= 4.46.0")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("longest_edge, expected_max_tokens", [
(None, 2873),
(168, 169),
(336, 169),
(400, 338),
(672, 338),
])
def test_max_tokens_override(get_max_idefics3_image_tokens, model: str,
longest_edge: Optional[int],
expected_max_tokens: int):
"""Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
size = {"longest_edge": longest_edge} if longest_edge is not None else None
ctx = build_model_context(
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
mm_processor_kwargs=None,
)
actual_max_tokens = get_max_idefics3_image_tokens(
ctx=InputContext(ctx.model_config),
size=size,
)
assert expected_max_tokens == actual_max_tokens
@pytest.mark.skipif(transformers.__version__ < "4.46.0",
reason="Model introduced in HF >= 4.46.0")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [
(168, 169, 1),
(168, 169, 2),
(400, 338, 1),
(400, 338, 2),
])
def test_dummy_data_override(dummy_data_for_idefics3, model: str,
longest_edge: int, toks_per_img: int,
num_imgs: int):
"""Ensure dummy_data_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the dummy data func.
size = {"longest_edge": longest_edge} if longest_edge is not None else None
ctx = build_model_context(
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
mm_processor_kwargs=None,
)
dummy_data = dummy_data_for_idefics3(
ctx=ctx,
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
mm_counts={"image": num_imgs},
size=size)
sequence_data = dummy_data.seq_data
# Ensure we have the right number of placeholders per size
image_token_id = ctx.get_hf_config().image_token_id
img_tok_count = sequence_data.get_token_ids().count(image_token_id)
assert img_tok_count == toks_per_img * num_imgs
@pytest.mark.skipif(transformers.__version__ < "4.46.0",
reason="Model introduced in HF >= 4.46.0")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [
(336, 169 * (1**2 + 1), 1),
(336, 169 * (1**2 + 1), 2),
(400, 169 * (2**2 + 1), 1),
(400, 169 * (2**2 + 1), 2),
])
def test_input_processor_override(input_processor_for_idefics3,
image_assets: _ImageAssets, model: str,
longest_edge: int,
expected_toks_per_img: int, num_imgs: int):
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
size = {"longest_edge": longest_edge} if longest_edge is not None else None
ctx = build_model_context(
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
mm_processor_kwargs=None,
)
# Build the image str / prompt based on the number of images we pass
tokenizer = AutoTokenizer.from_pretrained(model)
placeholders = "<image>" if num_imgs == 1 else "\n".join(
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
prompt=prompt,
multi_modal_data={"image": images})
processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size)
# Ensure we have the right number of placeholders for the given size
image_token_id = ctx.get_hf_config().image_token_id
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
assert img_tok_count == expected_toks_per_img * num_imgs
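# Illustrative aside, not part of the committed test: the parametrized counts
# above follow toks_per_img = 169 * (n**2 + 1), a hedged reading in which each
# crop of an n x n grid contributes 169 visual tokens and one global image is
# appended on top. The grid size n is assumed to follow from the configured
# longest_edge; the test itself only asserts the resulting totals.
assert 169 * (1**2 + 1) == 338 and 169 * (2**2 + 1) == 845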


@@ -0,0 +1,70 @@
import pytest
from vllm.inputs import InputContext
from ....utils import build_model_context
@pytest.fixture()
def get_max_llava_next_image_tokens():
from vllm.model_executor.models.llava_next import (
get_max_llava_next_image_tokens)
return get_max_llava_next_image_tokens
@pytest.fixture()
def dummy_data_for_llava_next():
from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
return dummy_data_for_llava_next
@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
([[336, 336]], 1176),
([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
])
def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
get_max_llava_next_image_tokens):
ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
# Update the config image_grid_pinpoints
# and calculate the resulting max tokens
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
actual_max_tokens = get_max_llava_next_image_tokens(
InputContext(ctx.model_config))
assert expected_max_tokens == actual_max_tokens
@pytest.mark.parametrize(
"gridpoints,expected_size",
[
# One point; it has to be the largest
([[336, 336]], (336, 336)),
# Default for most llava next models; the 2x2 tile is the largest
([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
(672, 672)),
# If two rectangular gridpoints are the same, the more vertical
# one has the higher feature count due to newline features
([[336, 672], [672, 336]], (672, 336))
])
def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
gridpoints, expected_size):
ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
# Update the config image_grid_pinpoints
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
seq_len = 5000 # bigger than the max feature size for any image
dummy_data = dummy_data_for_llava_next(
ctx,
seq_len=seq_len,
mm_counts={"image": 1},
)
seq_data = dummy_data.seq_data
mm_data = dummy_data.multi_modal_data
# The dummy data dims should match the gridpoint with the biggest feat size
assert mm_data["image"].height == expected_size[0]
assert mm_data["image"].width == expected_size[1]
assert len(seq_data.get_token_ids()) >= seq_len


@@ -0,0 +1,182 @@
"""Tests for phi3v's multimodal preprocessing kwargs."""
from typing import Optional
import pytest
import torch
from transformers import AutoImageProcessor, AutoTokenizer
from vllm.inputs import InputContext, token_inputs
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
from vllm.multimodal import MultiModalRegistry
from .....conftest import _ImageAssets
from ....utils import build_model_context
models = ["microsoft/Phi-3.5-vision-instruct"]
# Wrap lazy imports to avoid initializing CUDA during test collection
@pytest.fixture()
def input_processor_for_phi3v():
from vllm.model_executor.models.phi3v import input_processor_for_phi3v
return input_processor_for_phi3v
@pytest.fixture()
def dummy_data_for_phi3v():
from vllm.model_executor.models.phi3v import dummy_data_for_phi3v
return dummy_data_for_phi3v
@pytest.fixture()
def get_max_phi3v_image_tokens():
from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens
return get_max_phi3v_image_tokens
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops", [4, 16, None])
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
num_crops: Optional[int]):
"""Ensure that the [default] input mapper handles num_crops properly."""
# We pass the processor kwargs here since for this model, we fall back to
# the default mapper; this will fall back to the HF mapper and forward
# mm_processor_kwargs to it.
mm_processor_kwargs = {
"num_crops": num_crops
} if num_crops is not None else {}
ctx = build_model_context(
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs,
)
hf_processor = AutoImageProcessor.from_pretrained(model,
trust_remote_code=True,
**mm_processor_kwargs)
mm_registry = MultiModalRegistry()
mm_registry.init_mm_limits_per_prompt(ctx.model_config)
image = image_assets[0].pil_image
hf_result = hf_processor.preprocess(
image,
return_tensors="pt",
)
vllm_result = mm_registry.map_input(
ctx.model_config,
{"image": image},
)
assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"])
assert torch.all(
hf_result["num_img_tokens"] == vllm_result["num_img_tokens"])
# For pixel values, the second axis should be the num_crops + 1
# for the rescaled original image. The default value in VLLM falls
# back to the HF config, which is why we compare to the processor num_crops
assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,expected_max_tokens", [
(4, 781),
(16, 2653),
])
def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
num_crops: int, expected_max_tokens: int):
"""Ensure get_max_phi3v_image_tokens handles num_crops properly."""
# NOTE: mm_processor_kwargs on the context in this test is unused, since
# this is testing the mapper directly. In practice, the processor kwargs
# are wrapped in a closure when calling the max tokens func. We explicitly
# do NOT use the mm_processor_kwargs in the model context here to ensure
# that the max image tokens implementation is referencing a mix of the
# kwargs to the function and the original mm_processor_kwargs in case
# values are somehow updated and end up in a bad state.
ctx = build_model_context(
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
mm_processor_kwargs=None,
)
actual_max_tokens = get_max_phi3v_image_tokens(
InputContext(ctx.model_config),
num_crops=num_crops,
)
assert expected_max_tokens == actual_max_tokens
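# Hedged sketch, not a vLLM API: the NOTE above says the processor kwargs are
# normally wrapped in a closure before the max-tokens function is called.
# Passing num_crops directly in this test mimics roughly the binding below,
# shown with a hypothetical stand-in function.
def _toy_max_image_tokens(ctx, *, num_crops=4):
    # Toy stand-in only: more crops simply means more tokens here.
    return 100 * num_crops

def _bind_kwargs(fn, **mm_processor_kwargs):
    # Closure-style binding analogous to what the NOTE describes.
    return lambda ctx: fn(ctx, **mm_processor_kwargs)

assert _bind_kwargs(_toy_max_image_tokens, num_crops=16)(None) == 1600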
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
(4, 781, 1),
(4, 781, 2),
(16, 2653, 1),
(16, 2653, 2),
])
def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
toks_per_img: int, num_imgs: int):
"""Ensure dummy_data_for_phi3v handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the dummy data func.
ctx = build_model_context(
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
mm_processor_kwargs=None,
)
dummy_data = dummy_data_for_phi3v(
ctx=ctx,
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
mm_counts={"image": num_imgs},
num_crops=num_crops,
)
sequence_data = dummy_data.seq_data
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
assert img_tok_count == toks_per_img * num_imgs
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
(4, 757, 1),
(4, 757, 2),
(16, 1921, 1),
(16, 1921, 2),
])
def test_input_processor_override(input_processor_for_phi3v,
image_assets: _ImageAssets, model: str,
num_crops: int, expected_toks_per_img: int,
num_imgs: int):
"""Ensure input_processor_for_phi3v handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx = build_model_context(
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model)
# Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
images = [image_assets[0].pil_image] * num_imgs
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
prompt=prompt,
multi_modal_data={"image": images})
processed_inputs = input_processor_for_phi3v(ctx,
inputs,
num_crops=num_crops)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
assert img_tok_count == expected_toks_per_img * num_imgs


@@ -0,0 +1,144 @@
"""Tests for Qwen's multimodal preprocessing kwargs."""
from typing import Dict, List, Union
import pytest
import torch
from PIL.Image import Image
from vllm.inputs import InputContext, token_inputs
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from .....conftest import IMAGE_ASSETS
from ....utils import build_model_context
### Multimodal preprocessing tests
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
# These values are specific to Qwen-VL/Chat; we can get these from the model
# config also, but they are hardcoded here to keep the parameterize/fixtures
# easy to read.
IMG_START_ID = 151857
IMG_END_ID = 151858
IMG_PAD_ID = 151859
TOKS_PER_IMG = 256
VIS_ENC_DIM = 4096
IMG_SIZE = 448
@pytest.fixture()
def input_mapper_for_qwen():
# Lazy import to avoid initializing CUDA during test collection
from vllm.model_executor.models.qwen import input_mapper_for_qwen
return input_mapper_for_qwen
@pytest.fixture()
def input_processor_for_qwen():
# Lazy import to avoid initializing CUDA during test collection
from vllm.model_executor.models.qwen import input_processor_for_qwen
return input_processor_for_qwen
@pytest.fixture()
def qwen_vl_context() -> InputContext:
"""Get an InputContext for Qwen-VL."""
return build_model_context(model_name="Qwen/Qwen-VL",
trust_remote_code=True)
# Happy path tests for single/multi-image scenarios for the multimodal
# input processor and mapper, respectively
@pytest.mark.parametrize("num_images", [1, 2])
def test_input_processor_valid_mm_data(input_processor_for_qwen,
qwen_vl_context: InputContext,
num_images: int):
"""Happy cases for image inputs to Qwen's multimodal input processor."""
prompt = "".join(
[f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
inputs = token_inputs(
prompt=prompt,
# When processing multimodal data for a multimodal model, the qwen
# input processor will overwrite the provided prompt_token_ids with
# the image prompts
prompt_token_ids=[],
multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
)
proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
assert isinstance(proc_inputs, dict)
# Each image should have one start / stop and a fixed context of 256
proc_tokens = proc_inputs["prompt_token_ids"]
assert proc_tokens.count(IMG_START_ID) == num_images
assert proc_tokens.count(IMG_END_ID) == num_images
assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
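# Worked example of the counts checked above, using the constants defined at
# the top of this file: each image contributes one IMG_START_ID, TOKS_PER_IMG
# IMG_PAD_ID tokens, and one IMG_END_ID, so two images add 2 * 258 = 516
# image-related tokens to the processed prompt.
assert 1 + TOKS_PER_IMG + 1 == 258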
@pytest.mark.parametrize(
"img_data,expected_shape",
[
# single / multi-image
(SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
(2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
# single / multi-image embeddings
(torch.rand(
(TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
(torch.rand(
(1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
(torch.rand(
(2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
])
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
qwen_vl_context: InputContext,
img_data: Union[torch.Tensor, List[Image],
Image],
expected_shape: List[int]):
"""Happy cases for image inputs to Qwen's multimodal input mapper."""
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
# Ensure that we get the appropriately shaped pixel_values
# for images and image embeddings, respectively.
assert isinstance(mapped_img_data, MultiModalKwargs)
assert "pixel_values" in mapped_img_data
assert mapped_img_data["pixel_values"].shape == expected_shape
# Sad path tests for the multimodal input processor and mapper, respectively
@pytest.mark.parametrize("mm_data", [
{
"image": torch.rand(5)
},
{
"image": torch.rand((5, 5, 5, 5, 5))
},
])
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
qwen_vl_context: InputContext,
mm_data: Dict[str, torch.Tensor]):
"""Test sad cases validated in Qwen's multimodal input processor."""
tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
trust_remote_code=True)
prompt = "Picture 1: <img></img>\n"
prompt_token_ids = tokenizer.encode(prompt)
inputs = token_inputs(prompt=prompt,
prompt_token_ids=prompt_token_ids,
multi_modal_data=mm_data)
# Should fail since we have too many or too few dimensions for embeddings
with pytest.raises(ValueError):
input_processor_for_qwen(qwen_vl_context, inputs)
@pytest.mark.parametrize(
"img_data",
[
# Wrong context length
torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
# Wrong visual encoder output size
torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
])
def test_input_mapper_invalid_mm_data(
input_mapper_for_qwen,
qwen_vl_context: InputContext,
img_data: Union[torch.Tensor, List[Image], Image],
):
"""Sad cases validated in Qwen VL's multimodal input mapper."""
with pytest.raises(ValueError):
input_mapper_for_qwen(qwen_vl_context, img_data)


@@ -0,0 +1,167 @@
from typing import Any, Dict, Tuple
import pytest
import torch
from PIL.Image import Image
from transformers import AutoTokenizer
from vllm.inputs import InputContext, token_inputs
from vllm.multimodal import MultiModalRegistry
from .....conftest import _ImageAssets
from ....utils import build_model_context
MODEL = "Qwen/Qwen2-VL-2B-Instruct"
MIN_PIXELS = "min_pixels"
MAX_PIXELS = "max_pixels"
# Fixtures lazy import to avoid initializing CUDA during test collection
# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
# input mappers.
@pytest.fixture()
def image_input_mapper_for_qwen2_vl():
from vllm.model_executor.models.qwen2_vl import (
image_input_mapper_for_qwen2_vl)
return image_input_mapper_for_qwen2_vl
@pytest.fixture()
def input_processor_for_qwen2_vl():
from vllm.model_executor.models.qwen2_vl import (
input_processor_for_qwen2_vl)
return input_processor_for_qwen2_vl
@pytest.fixture()
def qwen2_vl_context() -> InputContext:
return build_model_context(model_name=MODEL)
@pytest.fixture()
def get_max_qwen2_vl_image_tokens():
from vllm.model_executor.models.qwen2_vl import (
get_max_qwen2_vl_image_tokens)
return get_max_qwen2_vl_image_tokens
@pytest.fixture()
def dummy_data_for_qwen2_vl():
from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl
return dummy_data_for_qwen2_vl
@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
({}, 1225),
({
MIN_PIXELS: 64**2,
MAX_PIXELS: 512**2
}, 324),
])
def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens,
qwen2_vl_context: InputContext,
mm_processor_kwargs: Dict[str, Any],
expected_max_tokens: int):
"""Ensure that the max token calc handles min/max pixels properly."""
actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context,
**mm_processor_kwargs)
assert actual_max_tokens == expected_max_tokens
@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [
[{}, 1225, (980, 980)],
[{
MIN_PIXELS: 64**2,
MAX_PIXELS: 512**2
}, 324, (504, 504)],
])
def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,
qwen2_vl_context: InputContext,
mm_processor_kwargs: Dict[str, Any],
token_count: int, img_size: Tuple[int, int]):
"""Ensure that the dummy data handles min/max pixels properly."""
seq_len = 3000
hf_config = qwen2_vl_context.get_hf_config()
image_token_id = hf_config.image_token_id
# NOTE: video value is required, but isn't actually used
# when making the dummy data except for error handling currently
dummy_data = dummy_data_for_qwen2_vl(
ctx=qwen2_vl_context,
seq_len=seq_len,
mm_counts={
"image": 1,
"video": 0
},
**mm_processor_kwargs,
)
seq_data = dummy_data.seq_data
mm_data = dummy_data.multi_modal_data
# Ensure we have the right number of placeholders for min/max pixel values
assert seq_data.get_token_ids().count(image_token_id) == token_count
# Ensure the images were resized correctly
image = mm_data["image"]
assert isinstance(image, Image)
assert image.size == img_size
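# Worked check of the parametrized numbers above (a hedged derivation; the
# committed test only asserts the final counts): assuming the default 14-px
# patches with 2x2 spatial merging, Qwen2-VL emits one image placeholder per
# 28x28-pixel region, so a 980x980 dummy image yields (980 // 28) ** 2 == 1225
# placeholders and a 504x504 image yields (504 // 28) ** 2 == 324.
assert (980 // 28)**2 == 1225
assert (504 // 28)**2 == 324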
@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [
({}, 1426),
({
MIN_PIXELS: 64**2,
MAX_PIXELS: 512**2
}, 330),
])
def test_input_processor(input_processor_for_qwen2_vl,
qwen2_vl_context: InputContext,
image_assets: _ImageAssets, num_placeholders: int,
mm_processor_kwargs: Dict[str, Any]):
"""Ensure that the image processor handles min/max pixels properly."""
tokenizer = AutoTokenizer.from_pretrained(MODEL)
prompt = "<|vision_start|><|image_pad|><|vision_end|>"
image = image_assets[0].pil_image
hf_config = qwen2_vl_context.get_hf_config()
image_token_id = hf_config.image_token_id
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
prompt=prompt,
multi_modal_data={"image": [image]})
processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs,
**mm_processor_kwargs)
assert processed_inputs["prompt_token_ids"].count(
image_token_id) == num_placeholders
assert len(processed_inputs["multi_modal_data"]["image"]) == 1
@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [
({}, [5704, 1176]),
({
MIN_PIXELS: 64**2,
MAX_PIXELS: 512**2
}, [1320, 1176]),
])
def test_image_mapper_override(qwen2_vl_context: InputContext,
image_assets: _ImageAssets,
mm_processor_kwargs: Dict[str, Any],
pixels_shape: Tuple[int, int]):
"""Ensure that the image mapper handles min/max pixels properly."""
mm_registry = MultiModalRegistry()
mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config)
image = image_assets[0].pil_image
mapped_output = mm_registry.map_input(
qwen2_vl_context.model_config,
{"image": image},
mm_processor_kwargs=mm_processor_kwargs,
)
# Dimension 0 of pixel values should match the product of image_grid_thw
actual_pixels_shape = mapped_output["pixel_values"].shape
assert list(actual_pixels_shape) == pixels_shape
assert actual_pixels_shape[0] == torch.prod(
mapped_output["image_grid_thw"])


@@ -0,0 +1,120 @@
from typing import List, Optional, Type
import pytest
import torch
from vllm.multimodal.utils import rescale_image_size
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
"cherry_blossom":
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
})
def run_awq_test(
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
source_model: str,
quant_model: str,
*,
size_factors: List[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(source_model,
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
source_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs_per_image
]
with vllm_runner(quant_model,
quantization="awq",
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
quant_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs_per_image
]
for source_outputs, quant_outputs in zip(source_outputs_per_image,
quant_outputs_per_image):
# TODO: Check whether using original CLIPVisionModel can improve
# consistency against HF
check_logprobs_close(
outputs_0_lst=source_outputs,
outputs_1_lst=quant_outputs,
name_0="source",
name_1="awq",
)
@pytest.mark.quant_model
@pytest.mark.parametrize(
("source_model", "quant_model"),
[("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
size_factors, dtype, max_tokens, num_logprobs) -> None:
run_awq_test(
vllm_runner,
image_assets,
source_model,
quant_model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)


@@ -0,0 +1,129 @@
from typing import Optional, Tuple
import pytest
import torch
from PIL.Image import Image
from transformers import AutoConfig
# Import the functions to test
from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
image_to_pixel_values_wrapper)
from vllm.multimodal.utils import rescale_image_size
models = [
"h2oai/h2ovl-mississippi-800m", # Replace with your actual model names
"h2oai/h2ovl-mississippi-2b",
]
def run_preprocessing_test(
image: Image,
config,
max_dynamic_patch: Optional[int] = None,
) -> Tuple[torch.Tensor, int]:
"""Test the image preprocessing and calculate expected blocks."""
if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch
width, height = image.size
use_MSAC = config.use_msac
# Create the mapper function with the provided configuration
mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC)
pixel_values = mapper(image)
# Calculate the expected number of blocks
if use_MSAC:
# First pass
blocks1, _, _, aspect_ratio = calculate_num_blocks(
width,
height,
config.min_dynamic_patch,
max_dynamic_patch,
config.vision_config.image_size,
use_thumbnail=False, # Thumbnail is handled separately
prior_aspect_ratio=None,
)
# Second pass
blocks2, _, _, _ = calculate_num_blocks(
width,
height,
config.min_dynamic_patch,
max_dynamic_patch,
config.vision_config.image_size,
use_thumbnail=False,
prior_aspect_ratio=aspect_ratio,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if config.use_thumbnail:
blocks1 += 1 if blocks1 > 1 else 0
blocks2 += 1 if blocks2 > 1 else 0
# Total blocks is the sum of blocks from both passes minus overlapping
total_blocks = blocks1 + blocks2 - 1
expected_blocks = total_blocks
else:
blocks, _, _, _ = calculate_num_blocks(
width,
height,
config.min_dynamic_patch,
max_dynamic_patch,
config.vision_config.image_size,
use_thumbnail=False,
prior_aspect_ratio=None,
)
expected_blocks = blocks
if config.use_thumbnail and expected_blocks > 1:
expected_blocks += 1
return pixel_values, expected_blocks
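# Hedged worked example of the MSAC accounting above, with hypothetical pass
# counts rather than values from a real image: if the first pass produced 7
# blocks and the second pass 3, one block overlaps between the passes, so the
# helper would expect 7 + 3 - 1 == 9 blocks in total.
assert 7 + 3 - 1 == 9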
@pytest.mark.parametrize("model_name", models)
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8])
def test_image_preprocessing(image_assets, model_name, size_factors,
max_dynamic_patch):
"""Test image preprocessing pipeline with different configurations."""
# Load the configuration from the model
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
for asset in image_assets:
image = asset.pil_image
for factor in size_factors:
scaled_image = rescale_image_size(image, factor)
# Test preprocessing and get expected number of blocks
pixel_values, expected_blocks = run_preprocessing_test(
scaled_image, config, max_dynamic_patch)
# Verify output shapes and properties
actual_blocks = pixel_values.shape[0]
assert actual_blocks == expected_blocks, (
f"Expected {expected_blocks} blocks, got {actual_blocks}")
# Check image dimensions
expected_size = (
3, # Number of channels (C, H, W)
config.vision_config.image_size,
config.vision_config.image_size,
)
for img in pixel_values:
assert img.shape == expected_size, (
f"Expected image size {expected_size}, got {img.shape}")


@@ -0,0 +1,77 @@
from typing import Optional
import pytest
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
from ....conftest import _ImageAssets
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
def run_intern_vit_test(
image_assets: _ImageAssets,
model_id: str,
*,
dtype: str,
distributed_executor_backend: Optional[str] = None,
):
model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
img_processor = CLIPImageProcessor.from_pretrained(model)
images = [asset.pil_image for asset in image_assets]
pixel_values = [
img_processor(images, return_tensors='pt').pixel_values.to(dtype)
for images in images
]
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
if not getattr(config, "norm_type", None):
config.norm_type = "rms_norm"
hf_model = AutoModel.from_pretrained(model,
torch_dtype=dtype,
trust_remote_code=True).to("cuda")
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).last_hidden_state
for pixel_value in pixel_values
]
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.models.intern_vit import InternVisionModel
vllm_model = InternVisionModel(config)
vllm_model.load_weights(hf_model.state_dict().items())
del hf_model
cleanup_dist_env_and_memory()
vllm_model = vllm_model.to("cuda", dtype)
vllm_outputs_per_image = [
vllm_model(pixel_values=pixel_value.to("cuda"))
for pixel_value in pixel_values
]
del vllm_model
cleanup_dist_env_and_memory()
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image,
hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
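# Minimal sketch of the comparison above: cosine similarity is computed per
# hidden vector (dim=-1) and then averaged over all positions, so identical
# feature maps score exactly 1.0 and near-identical ones stay above 0.99.
_x = torch.randn(2, 4, 8)
assert torch.allclose(nn.CosineSimilarity(dim=-1)(_x, _x).mean(),
                      torch.tensor(1.0))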
@pytest.mark.parametrize("model_id", [
"OpenGVLab/InternViT-300M-448px",
"OpenGVLab/InternViT-6B-448px-V1-5",
])
@pytest.mark.parametrize("dtype", [torch.half])
@torch.inference_mode()
def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
run_intern_vit_test(
image_assets,
model_id,
dtype=dtype,
)


@@ -0,0 +1,657 @@
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
import os
from pathlib import PosixPath
from typing import Type
import pytest
import transformers
from transformers import AutoModelForVision2Seq
from vllm.platforms import current_platform
from vllm.utils import cuda_device_count_stateless, identity
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
_VideoAssets)
from ....utils import fork_new_process_for_each_test, large_gpu_mark
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
VLMTestInfo, VLMTestType)
# This hack is needed for phi3v & paligemma models
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
# yapf: disable
COMMON_BROADCAST_SETTINGS = {
"test_type": VLMTestType.IMAGE,
"dtype": "half",
"max_tokens": 5,
"tensor_parallel_size": 2,
"model_kwargs": {"device_map": "auto"},
"image_size_factors": [(.25, 0.5, 1.0)],
"distributed_executor_backend": (
"ray",
"mp",
)
}
### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example....
#
# To run all test types for a specific key:
# use the k flag to substring match with a leading square bracket; if the
# model arch happens to be a substring of another one, you can add a
# trailing hyphen. E.g.,
# - pytest $TEST_FILE -k "[llava-"
# prevents matching on "[llava_next-" & will match just the enabled cases
# for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
# use the k flag to substring match the model name, e.g.,
# - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
# prevents matching on OpenGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
# ex 1:
# pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
# will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
# match both wrappers for single image tests, since it also matches
# test_single_image_heavy (which forks if we have a distributed backend)
# ex 2:
# pytest $TEST_FILE -k "[llava- or [intern_vl-"
# will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.
VLM_TEST_SETTINGS = {
#### Core tests to always run in the CI
"llava": VLMTestInfo(
models=["llava-hf/llava-1.5-7b-hf"],
test_type=(
VLMTestType.EMBEDDING,
VLMTestType.IMAGE,
VLMTestType.CUSTOM_INPUTS
),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
),
limit_mm_per_prompt={"image": 4},
)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"paligemma": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt = lambda idx: "",
# Paligemma uses its own sample prompts because the default one fails
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}),
auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
else ("half", "float")),
marks=[pytest.mark.core_model],
),
"qwen2_vl": VLMTestInfo(
models=["Qwen/Qwen2-VL-2B-Instruct"],
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO
),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
#### Extended model tests
"blip2": VLMTestInfo(
models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "",
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
),
"chameleon": VLMTestInfo(
models=["facebook/chameleon-7b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),
# For chameleon, we only compare the sequences
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
comparator=check_outputs_equal,
max_tokens=8,
dtype="bfloat16",
marks=[
pytest.mark.skipif(
transformers.__version__ < "4.46.2",
reason="Model broken in HF, see huggingface/transformers#34379"
),
]
),
"fuyu": VLMTestInfo(
models=["adept/fuyu-8b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
img_idx_to_prompt=lambda idx: "",
max_model_len=2048,
max_num_seqs=2,
use_tokenizer_eos=True,
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
"glm4": VLMTestInfo(
models=["THUDM/glm-4v-9b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt=lambda idx: "",
max_model_len=2048,
max_num_seqs=2,
dtype="bfloat16",
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
patch_hf_runner=model_utils.glm_patch_hf_runner,
marks=[large_gpu_mark(min_gb=48)],
),
"h2ovl": VLMTestInfo(
models = [
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nWhat is the season?",
}),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=8192,
dtype="bfloat16",
use_tokenizer_eos=True,
patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
),
"idefics3": VLMTestInfo(
models=["HuggingFaceM4/Idefics3-8B-Llama3"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
marks=[
pytest.mark.skipif(
transformers.__version__ < "4.46.0",
reason="Model introduced in HF >= 4.46.0"
),
large_gpu_mark(min_gb=48),
],
),
"intern_vl": VLMTestInfo(
models=[
"OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL2-2B",
"OpenGVLab/Mono-InternVL-2B",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nWhat is the season?",
}),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=4096,
# NOTE: Mono-InternVL-2B doesn't work with fp16;
# it results in NaN values during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
dtype="bfloat16",
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
),
"llava_next": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
),
limit_mm_per_prompt={"image": 4},
)],
# Llava-next tests fixed sizes & the default size factors
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
),
"llava_one_vision": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
num_video_frames=16,
max_model_len=16384,
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values_videos"
),
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
# Llava-one-vision tests fixed sizes & the default size factors
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
),
limit_mm_per_prompt={"video": 4},
runner_mm_key="videos",
)],
),
"llava_next_video": VLMTestInfo(
models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
test_type=VLMTestType.VIDEO,
prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
num_video_frames=16,
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
marks=[
pytest.mark.skipif(
transformers.__version__ < "4.46.2",
reason="Model broken with changes in transformers 4.46"
)
],
),
"minicpmv": VLMTestInfo(
models=["openbmb/MiniCPM-Llama3-V-2_5"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
postprocess_inputs=model_utils.wrap_inputs_post_processor,
hf_output_post_proc=model_utils.minicmpv_trunc_hf_output,
),
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
# https://github.com/huggingface/transformers/issues/34307
# "phi3v": VLMTestInfo(
# models=["microsoft/Phi-3.5-vision-instruct"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
# img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
# max_model_len=4096,
# max_num_seqs=2,
# task="generate",
# # use eager mode for hf runner since phi3v didn't work with flash_attn
# model_kwargs={"_attn_implementation": "eager"},
# use_tokenizer_eos=True,
# vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
# num_logprobs=10,
# ),
"pixtral_hf": VLMTestInfo(
models=["nm-testing/pixtral-12b-FP8-dynamic"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<s>[INST]{img_prompt}[/INST]",
img_idx_to_prompt=lambda idx: "[IMG]",
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
marks=[large_gpu_mark(min_gb=48)],
),
"qwen": VLMTestInfo(
models=["Qwen/Qwen-VL"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=identity,
img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
max_model_len=1024,
max_num_seqs=2,
vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
),
### Tensor parallel / multi-gpu broadcast tests
"broadcast-chameleon": VLMTestInfo(
models=["facebook/chameleon-7b"],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
comparator=check_outputs_equal,
marks=[
pytest.mark.distributed_2_gpus,
pytest.mark.skipif(
cuda_device_count_stateless() < 2,
reason="Need at least 2 GPUs to run the test.",
),
pytest.mark.skipif(
transformers.__version__ < "4.46.2",
reason="Model broken in HF, see huggingface/transformers#34379"
)
],
**COMMON_BROADCAST_SETTINGS # type: ignore
),
"broadcast-llava": VLMTestInfo(
models=["llava-hf/llava-1.5-7b-hf"],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=[
pytest.mark.distributed_2_gpus,
pytest.mark.skipif(
cuda_device_count_stateless() < 2,
reason="Need at least 2 GPUs to run the test.",
)
],
**COMMON_BROADCAST_SETTINGS # type: ignore
),
"broadcast-llava_next": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=[
pytest.mark.distributed_2_gpus,
pytest.mark.skipif(
cuda_device_count_stateless() < 2,
reason="Need at least 2 GPUs to run the test.",
)
],
**COMMON_BROADCAST_SETTINGS # type: ignore
),
### Custom input edge-cases for specific models
"intern_vl-diff-patches": VLMTestInfo(
models=["OpenGVLab/InternVL2-2B"],
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
custom_test_opts=[
CustomTestOptions(
inputs=inp,
limit_mm_per_prompt={"image": 2},
) for inp in custom_inputs.different_patch_input_cases_internvl()
],
),
"llava_one_vision-multiple-images": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=16384,
max_num_seqs=2,
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
),
limit_mm_per_prompt={"image": 4},
)],
),
}
# yapf: enable
### Test wrappers
# Wrappers around the core test running func for:
# - single image
# - multi-image
# - image embeddings
# - video
# - custom inputs
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=False,
))
def test_single_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_single_image_test(
tmp_path=tmp_path,
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
image_assets=image_assets,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=False,
))
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_multi_image_test(
tmp_path=tmp_path,
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
image_assets=image_assets,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=False,
))
def test_image_embedding_models(model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_embedding_test(
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
image_assets=image_assets,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=False,
))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
video_assets: _VideoAssets):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_video_test(
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
video_assets=video_assets,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=False,
))
def test_custom_inputs_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_custom_inputs_test(
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
)
#### Heavy variants of the tests above, each run in a newly forked process
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_single_image_test(
tmp_path=tmp_path,
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
image_assets=image_assets,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_multi_image_test(
tmp_path=tmp_path,
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
image_assets=image_assets,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test
def test_image_embedding_models_heavy(model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_embedding_test(
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
image_assets=image_assets,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
video_assets: _VideoAssets):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_video_test(
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
video_assets=video_assets,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test
def test_custom_inputs_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_custom_inputs_test(
model_test_info=model_test_info,
test_case=test_case,
hf_runner=hf_runner,
vllm_runner=vllm_runner,
)


@@ -0,0 +1,234 @@
import os
import re
from typing import List, Optional, Tuple, Type
import pytest
from transformers import AutoTokenizer
from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
models = ["microsoft/Phi-3.5-vision-instruct"]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]],
model: str):
"""Sanitize vllm output to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
assert output_str_without_image[0] == " "
output_str_without_image = output_str_without_image[1:]
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
tokenizer = AutoTokenizer.from_pretrained(model)
hf_output_ids = tokenizer.encode(output_str_without_image)
assert hf_output_ids[0] == 1
hf_output_ids = hf_output_ids[1:]
return hf_output_ids, hf_output_str, out_logprobs
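# Hedged illustration of the sanitization above, using a made-up string rather
# than a real model response: vLLM may echo the image placeholder and a
# leading space, both of which are stripped before re-tokenizing for the HF
# comparison.
_example = "<|image_1|> The sign says STOP."
assert re.sub(r"(<\|image_\d+\|>)+", "", _example) == " The sign says STOP."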
target_dtype = "half"
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
inputs: List[Tuple[List[str], PromptImageInput]],
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
mm_limit: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# HACK - this is an attempted workaround for the following bug
# https://github.com/huggingface/transformers/issues/34307
from transformers import AutoImageProcessor # noqa: F401
from transformers import AutoProcessor # noqa: F401
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
task="generate",
max_model_len=4096,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs
]
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs = {"_attn_implementation": "eager"}
with hf_runner(model, dtype=dtype,
model_kwargs=hf_model_kwargs) as hf_model:
eos_token_id = hf_model.processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
eos_token_id=eos_token_id)
for prompts, images in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(vllm_output, model)
for vllm_output in vllm_outputs
],
name_0="hf",
name_1="vllm",
)
# Since we use _attn_implementation="eager" for hf_runner, the numerical
# differences are more significant; the basic `logprobs=5` fails to pass.
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
run_test(
hf_runner,
vllm_runner,
inputs_per_image,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
dtype) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_regression_7840 = [
([prompt], [image]) for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
# Regression test for #7840.
run_test(
hf_runner,
vllm_runner,
inputs_regression_7840,
model,
dtype=dtype,
max_tokens=128,
num_logprobs=10,
mm_limit=1,
tensor_parallel_size=1,
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_tokens: int,
num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors])
]
run_test(
hf_runner,
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=2,
tensor_parallel_size=1,
)

View File

@@ -0,0 +1,193 @@
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py`.
"""
import json
import uuid
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import pytest
from mistral_common.protocol.instruct.messages import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
from vllm.multimodal import MultiModalDataBuiltins
from vllm.sequence import Logprob, SampleLogprobs
from ....utils import VLLM_PATH, large_gpu_test
from ...utils import check_logprobs_close
if TYPE_CHECKING:
from _typeshed import StrPath
MODELS = ["mistralai/Pixtral-12B-2409"]
IMG_URLS = [
"https://picsum.photos/id/237/400/300",
"https://picsum.photos/id/231/200/300",
"https://picsum.photos/id/27/500/500",
"https://picsum.photos/id/17/150/600",
]
PROMPT = "Describe each image in one short sentence."
def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
return [{
"role":
"user",
"content": [{
"type": "text",
"text": PROMPT,
}] + [{
"type": "image_url",
"image_url": {
"url": url
}
} for url in urls],
}]
def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
msg = _create_msg_format(urls)
tokenizer = MistralTokenizer.from_model("pixtral")
request = ChatCompletionRequest(messages=msg) # type: ignore[type-var]
tokenized = tokenizer.encode_chat_completion(request)
engine_inputs = TokensPrompt(prompt_token_ids=tokenized.tokens)
images = []
for chunk in request.messages[0].content:
if isinstance(chunk, ImageURLChunk):
images.append(image_from_chunk(chunk))
mm_data = MultiModalDataBuiltins(image=images)
engine_inputs["multi_modal_data"] = mm_data
return engine_inputs
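# Note: the TokensPrompt built above already carries the tokenized chat request,
# and the decoded images are attached under "multi_modal_data", so the engine
# consumes these inputs directly without applying its own chat template.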
MSGS = [
_create_msg_format(IMG_URLS[:1]),
_create_msg_format(IMG_URLS[:2]),
_create_msg_format(IMG_URLS),
]
ENGINE_INPUTS = [
_create_engine_inputs(IMG_URLS[:1]),
_create_engine_inputs(IMG_URLS[:2]),
_create_engine_inputs(IMG_URLS),
]
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4)
MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
# For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs,
filename: "StrPath",
) -> None:
json_data = [(tokens, text,
[{k: asdict(v)
for k, v in token_logprobs.items()}
for token_logprobs in (logprobs or [])])
for tokens, text, logprobs in outputs]
with open(filename, "w") as f:
json.dump(json_data, f)
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f:
json_data = json.load(f)
return [(tokens, text,
[{int(k): Logprob(**v)
for k, v in token_logprobs.items()}
for token_logprobs in logprobs])
for tokens, text, logprobs in json_data]
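# Note: JSON object keys are always strings, so the token ids are cast back to
# int and each per-token dict is rebuilt into Logprob instances when reloading
# the golden fixture.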
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
vllm_runner,
max_model_len: int,
model: str,
dtype: str,
) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
enable_chunked_prefill=False,
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = []
for msg in MSGS:
output = vllm_model.model.chat(msg,
sampling_params=SAMPLING_PARAMS)
outputs.extend(output)
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
args = EngineArgs(
model=model,
tokenizer_mode="mistral",
enable_chunked_prefill=False,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
dtype=dtype,
)
engine = LLMEngine.from_engine_args(args)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
outputs = []
count = 0
while True:
out = engine.step()
count += 1
for request_output in out:
if request_output.finished:
outputs.append(request_output)
if count == 2:
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
SAMPLING_PARAMS)
if not engine.has_unfinished_requests():
break
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")

View File

@@ -0,0 +1,428 @@
from typing import Any, List, Optional, Tuple, Type, TypedDict, Union
import numpy.typing as npt
import pytest
import torch
from PIL import Image
from vllm.entrypoints.llm import LLM
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
sample_frames_from_video)
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
PromptVideoInput, VllmRunner)
from ...utils import check_logprobs_close
models = ["Qwen/Qwen2-VL-2B-Instruct"]
target_dtype = "half"
IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
def qwen2_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the biggest text's content in this image?",
),
"cherry_blossom":
qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the season shown in this image? ",
"Reply with a short sentence (no more than 20 words)",
),
})
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"sample_demo_1":
qwen2_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
})
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
IMAGE_PLACEHOLDER,
"Describe these two images separately. ",
"For each image, reply with a short sentence ",
"(no more than 10 words).",
)
class Qwen2VLPromptImageEmbeddingInput(TypedDict):
image_embeds: torch.Tensor
image_grid_thw: torch.Tensor
class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
video_embeds: torch.Tensor
video_grid_thw: torch.Tensor
def batch_make_image_embeddings(
image_batches: List[Union[Image.Image, List[Image.Image]]], processor,
llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]:
"""batched image embeddings for Qwen2-VL
This will infer all images' embeddings in a single batch,
and split the result according to input batches.
image_batches:
- Single-image batches: `List[Image.Image]`
- Multiple-image batches: `List[List[Image.Image]]`
returns: `List[Qwen2VLPromptImageEmbeddingInput]`
"""
image_batches_: List[Any] = image_batches[:]
# convert single-image batches to multiple-image batches
for idx in range(len(image_batches_)):
if not isinstance(image_batches_[idx], list):
image_batches_[idx] = [image_batches_[idx]]
assert isinstance(image_batches_[idx], list)
# append all images into a list (as a batch)
images: List[Image.Image] = []
for image_batch in image_batches_:
images += image_batch
# image to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor \
.preprocess(images=images, return_tensors="pt") \
.data
pixel_values = preprocess_result["pixel_values"]
image_grid_thw = preprocess_result["image_grid_thw"]
# pixel values to embeddings & grid_thws
with torch.no_grad():
visual = llm.llm_engine.model_executor.driver_worker. \
model_runner.model.visual
pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype)
image_grid_thw_on_device = image_grid_thw.to(visual.device,
dtype=torch.int64)
image_embeds = visual(pixel_values_on_device,
grid_thw=image_grid_thw_on_device)
# split into original batches
result: List[Qwen2VLPromptImageEmbeddingInput] = []
image_counter = 0
embed_counter = 0
for image_batch in image_batches_:
cur_batch_image_count = len(image_batch)
merge_size = image_processor.merge_size
cur_batch_embed_len = sum([
grid_thw.prod() // merge_size // merge_size
for grid_thw in image_grid_thw[image_counter:image_counter +
cur_batch_image_count]
])
result.append({
"image_embeds":
image_embeds[embed_counter:embed_counter + cur_batch_embed_len],
"image_grid_thw":
image_grid_thw[image_counter:image_counter +
cur_batch_image_count],
})
embed_counter += cur_batch_embed_len
image_counter += cur_batch_image_count
# ensure we don't lose any images or embeddings
assert embed_counter == image_embeds.size(0)
assert image_counter == image_grid_thw.size(0)
assert len(image_batches) == len(result)
return result
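# Sketch of the splitting logic (shapes are illustrative): for input batches
# [img_a] and [img_b, img_c], the visual encoder runs once over all three
# images, and the flat embedding tensor is then sliced so that result[0] holds
# img_a's feature tokens and result[1] holds those of img_b + img_c, each with
# the matching image_grid_thw rows attached.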
def batch_make_video_embeddings(
video_batches: PromptVideoInput, processor,
llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]:
"""batched video embeddings for Qwen2-VL
An NDArray represents all frames of a single video.
This will infer all videos' embeddings in a single batch,
and split the result according to input batches.
video_batches:
- Single-video batches: `List[NDArray]`
- Multiple-video batches: `List[List[NDArray]]`
"""
video_batches_: List[Any] = video_batches[:]
for idx in range(len(video_batches_)):
if not isinstance(video_batches_[idx], list):
single_video_batch: List[npt.NDArray] = [video_batches_[idx]]
video_batches_[idx] = single_video_batch
assert isinstance(video_batches_[idx], list)
# append all videos into a list (as a batch)
videos: List[npt.NDArray] = []
for video_batch in video_batches_:
videos += video_batch
# video to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor \
.preprocess(images=None, videos=videos, return_tensors="pt") \
.data
pixel_values = preprocess_result["pixel_values_videos"]
video_grid_thw = preprocess_result["video_grid_thw"]
# pixel values to embeddings & grid_thws
with torch.no_grad():
visual = llm.llm_engine.model_executor.driver_worker.\
model_runner.model.visual
pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype)
video_grid_thw_on_device = video_grid_thw.to(visual.device,
dtype=torch.int64)
video_embeds = visual(pixel_values_on_device,
grid_thw=video_grid_thw_on_device)
# split into original batches
result: List[Qwen2VLPromptVideoEmbeddingInput] = []
video_counter = 0
embed_counter = 0
for video_batch in video_batches_:
cur_batch_video_count = len(video_batch)
merge_size = image_processor.merge_size
cur_batch_embed_len = sum([
grid_thw.prod() // merge_size // merge_size
for grid_thw in video_grid_thw[video_counter:video_counter +
cur_batch_video_count]
])
result.append({
"video_embeds":
video_embeds[embed_counter:embed_counter + cur_batch_embed_len],
"video_grid_thw":
video_grid_thw[video_counter:video_counter +
cur_batch_video_count],
})
embed_counter += cur_batch_embed_len
video_counter += cur_batch_video_count
# ensure we don't lose any videos or embeddings
assert embed_counter == video_embeds.size(0)
assert video_counter == video_grid_thw.size(0)
assert len(video_batches) == len(result)
return result
def run_test(
vllm_runner: Type[VllmRunner],
inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]],
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
mm_limit: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between
original image/video input and image/video embeddings input.
"""
from transformers import AutoProcessor # noqa: F401
processor = AutoProcessor.from_pretrained(model)
# NOTE:
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
task="generate",
max_model_len=4000,
max_num_seqs=3,
dtype=dtype,
limit_mm_per_prompt={
"image": mm_limit,
"video": mm_limit
},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend
) as vllm_model:
outputs_per_case_for_original_input = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images or None,
videos=videos or None)
for prompts, images, videos in inputs
]
outputs_per_case_for_embeddings_input = [
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=batch_make_image_embeddings(
images, processor, vllm_model.model) if images else None,
videos=batch_make_video_embeddings(
videos, processor, vllm_model.model) if videos else None)
for prompts, images, videos in inputs
]
for outputs_for_original_input, \
outputs_for_embeddings_input \
in zip(outputs_per_case_for_original_input,
outputs_per_case_for_embeddings_input):
check_logprobs_close(
outputs_0_lst=outputs_for_original_input,
outputs_1_lst=outputs_for_embeddings_input,
name_0="original_input",
name_1="embeddings_input",
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[0.5],
# Single-scale, batched
[0.5, 0.5],
# Multi-scale
[0.25, 0.5, 0.5],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
size_factors, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: List[Tuple[
List[str], PromptImageInput, PromptVideoInput]] = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
[],
) for image, prompt in zip(images, IMAGE_PROMPTS)]
run_test(
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
[],
# Single-scale
[0.5],
# Single-scale, batched
[0.5, 0.5],
# Multi-scale
[0.25, 0.5, 0.5],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
model, size_factors,
dtype: str, max_tokens: int,
num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: List[Tuple[List[str], PromptImageInput,
PromptVideoInput]] = [(
[MULTIIMAGE_PROMPT for _ in size_factors],
[[
rescale_image_size(image, factor)
for image in images
] for factor in size_factors],
[],
)]
run_test(
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=2,
tensor_parallel_size=1,
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[0.5],
# Single-scale, batched
[0.5, 0.5],
# Multi-scale
[0.25, 0.25, 0.5],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
size_factors, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
num_frames = 4
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
inputs_per_case: List[Tuple[
List[str], PromptImageInput, PromptVideoInput]] = [(
[prompt for _ in size_factors],
[],
[rescale_video_size(video, factor) for factor in size_factors],
) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)]
run_test(
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)

View File

@@ -0,0 +1,235 @@
"""Helpers for building inputs that can be leveraged for different test types.
"""
from pathlib import PosixPath
from typing import Callable, Iterable, List, Optional, Tuple, Union
import torch
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
resize_video, sample_frames_from_video)
from .....conftest import _ImageAssets, _VideoAssets
from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
ImageSizeWrapper, SizeType, VLMTestInfo)
def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
str],
test_placeholder: str) -> str:
"""Given a prompt, replaces each test placeholder with the
model-specific tag.
"""
prompt_segments = prompt.split(test_placeholder)
img_prompt = prompt_segments[0]
for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1):
img_prompt += img_idx_to_prompt(placeholder_idx)
img_prompt += next_seg
return img_prompt
def get_model_prompts(base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str]) -> List[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
Example for phi3v, given the base_prompt: "<image>What is the season?"
1. Replace img placeholder(s)
-> "<|image_1|>\nWhat is the season?"
2. Apply prompt formatter:
-> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
"""
assert isinstance(base_prompts, (list, tuple))
model_prompts = []
for base_prompt in base_prompts:
# Replace the multimodal placeholders in the base prompt with
# the correct ones for the model that we are testing
if img_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
img_idx_to_prompt,
TEST_IMG_PLACEHOLDER)
if video_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
video_idx_to_prompt,
TEST_VIDEO_PLACEHOLDER)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
model_prompt = prompt_formatter(base_prompt)
model_prompts.append(model_prompt)
return model_prompts
def build_single_image_inputs_from_test_info(
test_info: VLMTestInfo,
image_assets: _ImageAssets,
size_wrapper: ImageSizeWrapper,
tmp_path: Optional[PosixPath] = None):
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build single image inputs")
model_prompts = get_model_prompts(test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.prompt_formatter)
# For models that require a local image path / URL encoded in the prompt;
# export the assets to tmp_path and encode their paths for this test. This
# should be avoided where possible (currently needed for Qwen-VL).
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
raise ValueError("Prompt path encoder requires setting local path")
model_prompts = [
test_info.prompt_path_encoder(tmp_path, prompt, [asset])
for prompt, asset in zip(model_prompts, image_assets)
]
images = [asset.pil_image for asset in image_assets]
assert len(images) == len(model_prompts)
return build_single_image_inputs(images, model_prompts, size_wrapper)
def build_single_image_inputs(images, model_prompts,
size_wrapper: ImageSizeWrapper):
# For every image / prompt pair, get a pair containing two lists of
# length size_factors, where the first contains duplicates of the model
# prompt [str], and the second contains copies of the image after being
# scaled by one of the size factors.
#
# NOTE: rescaling preserves the image aspect ratio.
return [(
[prompt for _ in size_wrapper.data],
[
apply_image_size_scaling(image, size, size_wrapper.type)
for size in size_wrapper.data
],
) for image, prompt in zip(images, model_prompts)]
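# Illustrative output (assuming size factors (0.25, 0.5, 1.0)): each returned
# pair holds three copies of the prompt and three rescaled copies of its image,
# so a single parametrized case exercises several resolutions of the same asset.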
def build_multi_image_inputs_from_test_info(
test_info: VLMTestInfo,
image_assets: _ImageAssets,
size_wrapper: ImageSizeWrapper,
tmp_path: Optional[PosixPath] = None):
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build multi image inputs")
model_prompts = get_model_prompts([test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.prompt_formatter)
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
raise ValueError("Prompt path encoder requires setting local path")
model_prompts = [
test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
for model_prompt in model_prompts
]
images = [asset.pil_image for asset in image_assets]
# Currently, we only have one multi-image list & one multi-image prompt
return build_multi_image_inputs(
image_lists=[images],
model_prompts=model_prompts,
size_wrapper=size_wrapper,
)
def build_multi_image_inputs(image_lists, model_prompts,
size_wrapper: ImageSizeWrapper):
return [(
[prompt for _ in size_wrapper.data],
[[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
] for size in size_wrapper.data],
) for images, prompt in zip(image_lists, model_prompts)]
def build_embedding_inputs_from_test_info(
test_info: VLMTestInfo,
image_assets: _ImageAssets,
size_wrapper: ImageSizeWrapper,
):
# These conditions will always be true if invoked through filtering,
# but we still check them in case this is ever called directly
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
all(factor == 1.0 for factor in size_wrapper.data):
raise ValueError("Embedding tests require constant (1.0) size factors")
if test_info.convert_assets_to_embeddings is None:
raise ValueError("No conversion func for getting embeddings found")
model_prompts = get_model_prompts(
SINGLE_IMAGE_BASE_PROMPTS,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.prompt_formatter,
)
images = [asset.pil_image for asset in image_assets]
embeds = test_info.convert_assets_to_embeddings(image_assets)
assert len(images) == len(model_prompts)
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
size_wrapper)
return inputs, vllm_embeddings
def build_video_inputs_from_test_info(
test_info: VLMTestInfo,
video_assets: _VideoAssets,
size_wrapper: ImageSizeWrapper,
num_frames: int,
):
if test_info.prompt_formatter is None:
raise ValueError("Prompt formatter must be set to build video inputs")
model_prompts = get_model_prompts(
[VIDEO_BASE_PROMPT],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.prompt_formatter,
)
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
else rescale_video_size)
return [(
[prompt for _ in size_wrapper.data],
[video_scaler(video, size) for size in size_wrapper.data],
) for video, prompt in zip(sampled_vids, model_prompts)]
def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]],
size_type: SizeType):
"""Applies a size scaler to one image; this can be a an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
# are considering size factors at constant scale, i.e., we just clone
# the tensor
if isinstance(image, torch.Tensor):
assert size_type == SizeType.SIZE_FACTOR and size == 1
return image
if size_type == SizeType.SIZE_FACTOR:
# We have a list of image size factors
return rescale_image_size(image, size)
elif size_type == SizeType.FIXED_SIZE:
# We have a list of fixed sizes
return image.resize(size)
raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")

View File

@@ -0,0 +1,157 @@
"""Utils for determining which subset of model tests belong to a specific
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import itertools
from collections import OrderedDict
from typing import Dict, Iterable, Tuple
import pytest
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
test_type: VLMTestType,
fork_per_test: bool) -> Dict[str, VLMTestInfo]:
"""Given the dict of potential test settings to run, return a subdict
of tests that have the current test type enabled with the matching value for
fork_per_test.
"""
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
return test_info.test_type == test_type or (
isinstance(test_info.test_type, Iterable)
and test_type in test_info.test_type)
matching_tests = {}
for test_name, test_info in test_settings.items():
# Check if the test has the right type & keep it if it does
if matches_test_type(test_info, test_type):
# Embedding tests need to have a conversion func in their test info
if matches_test_type(test_info, VLMTestType.EMBEDDING):
assert test_info.convert_assets_to_embeddings is not None
# Custom test inputs need to explicitly define the mm limit/inputs
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
assert (test_info.custom_test_opts is not None
and isinstance(test_info.custom_test_opts, Iterable))
# For all types besides custom inputs, we need a prompt formatter
else:
assert test_info.prompt_formatter is not None
# Everything looks okay; keep it if it has the correct proc handling
if (test_info.distributed_executor_backend
is not None) == fork_per_test:
matching_tests[test_name] = test_info
return matching_tests
def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
test_type: VLMTestType,
fork_new_process_for_each_test: bool):
"""Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like
size factors etc, while still running in isolated test cases.
"""
matching_tests = get_filtered_test_settings(
test_settings, test_type, fork_new_process_for_each_test)
# Ensure that something is wrapped as an iterable if it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides on a
# per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs = OrderedDict([
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
("distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend)),
])
# num_frames is video only
if test_type == VLMTestType.VIDEO:
iter_kwargs["num_video_frames"] = ensure_wrapped(
test_info.num_video_frames)
# No sizes passed for custom inputs, since inputs are directly provided
if test_type != VLMTestType.CUSTOM_INPUTS:
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None:
raise ValueError(
f"Sizes must be set for test type {test_type}")
iter_kwargs["size_wrapper"] = wrapped_sizes
# Otherwise expand the custom test options instead
else:
if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
# yapf: disable
# Wrap all model cases in a pytest parameter & pass marks through
return [
pytest.param(
model_type,
ExpandableVLMTestArgs(
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
),
marks=test_info.marks if test_info.marks is not None else []
) for case in list(itertools.product(*iter_kwargs.values()))
]
# yapf: enable
# Get a list per model type, where each entry contains a tuple of all of
# that model type's cases, then flatten them into the top level so that
# we can consume them in one mark.parametrize call.
cases_by_model_type = [
get_model_type_cases(model_type, test_info)
for model_type, test_info in matching_tests.items()
]
return list(itertools.chain(*cases_by_model_type))
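# Illustrative expansion (values are hypothetical): a VLMTestInfo with a single
# model, max_tokens=(32, 128), two size wrappers, and one value for every other
# field expands to 2 * 2 = 4 pytest.param cases, which is what the equivalent
# nested mark.parametrize decorators would generate.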
def get_wrapped_test_sizes(
test_info: VLMTestInfo,
test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]:
"""Given a test info which may have size factors or fixed sizes, wrap them
and combine them into an iterable, each element of which will be used in
parameter expansion.
Args:
test_info: Test configuration to be expanded.
test_type: The type of test being filtered for.
"""
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
if test_type == VLMTestType.EMBEDDING:
return tuple([
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
])
# Custom inputs have preprocessed inputs
elif test_type == VLMTestType.CUSTOM_INPUTS:
return tuple()
size_factors = test_info.image_size_factors \
if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes \
if test_info.image_sizes else []
wrapped_factors = [
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in size_factors
]
wrapped_sizes = [
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
for size in fixed_sizes
]
return tuple(wrapped_factors + wrapped_sizes)
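# Example (values are illustrative): image_size_factors=((0.25, 0.5),) plus
# image_sizes=(((336, 336),),) yield one SIZE_FACTOR wrapper and one FIXED_SIZE
# wrapper, each of which becomes its own expanded test case upstream.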

View File

@@ -0,0 +1,141 @@
"""Core test implementation to be shared across modalities."""
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
import torch
from PIL.Image import Image
from transformers import AutoTokenizer, BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from .....conftest import HfRunner, VllmRunner
from .types import RunnerOutput
def run_test(
*,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]],
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
enforce_eager: bool,
max_model_len: int,
max_num_seqs: int,
hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
auto_cls: Type[_BaseAutoModelClass],
use_tokenizer_eos: bool,
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
comparator: Callable[..., None],
get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]],
limit_mm_per_prompt: Dict[str, int],
model_kwargs: Optional[Dict[str, Any]],
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
task: str = "auto",
runner_mm_key: str = "images",
distributed_executor_backend: Optional[str] = None,
tensor_parallel_size: int = 1,
vllm_embeddings: Optional[torch.Tensor] = None,
):
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
vllm_outputs_per_mm = []
hf_outputs_per_mm = []
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
vllm_kwargs = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
with vllm_runner(model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
task=task) as vllm_model:
for prompts, media in vllm_inputs:
vllm_kwargs[runner_mm_key] = media
vllm_output = vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs)
vllm_outputs_per_mm.append(vllm_output)
hf_model = hf_runner(model,
dtype=dtype,
auto_cls=auto_cls,
postprocess_inputs=postprocess_inputs,
model_kwargs=model_kwargs)
# Some models need to patch things like the model processor, e.g., internvl
if patch_hf_runner is not None:
hf_model = patch_hf_runner(hf_model)
# Some models need to explicitly pass the eos_token_id off the tokenizer or
# processor for a good comparison; currently assume processor/tokenizer
# agree on the EOS, and pull it off the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
with hf_model, torch.no_grad():
for prompts, media in inputs:
hf_kwargs[runner_mm_key] = media
hf_output = hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
tokenizer=tokenizer,
**hf_kwargs)
hf_outputs_per_mm.append(hf_output)
# Apply output processing / sanitation to the vLLM and HF runner results
hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
model,
first_runner_outputs=hf_outputs_per_mm,
second_runner_outputs=vllm_outputs_per_mm,
first_runner_processor=hf_output_post_proc,
second_runner_processor=vllm_output_post_proc,
)
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
vllm_outputs_per_mm):
# This is usually check_logprobs_close, but it's passed through to
# allow things like check_outputs_equal where needed
comparator(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
def process_runner_outputs(
model,
first_runner_outputs,
second_runner_outputs,
first_runner_processor=None,
second_runner_processor=None,
):
"""Applies the runner processor(s) to the runner outputs, if any."""
if first_runner_processor is not None:
first_runner_outputs = process_outputs(first_runner_processor, model,
first_runner_outputs)
if second_runner_processor is not None:
second_runner_outputs = process_outputs(second_runner_processor, model,
second_runner_outputs)
return first_runner_outputs, second_runner_outputs
def process_outputs(output_processor, model, outputs_per_image):
"""Applies a model specific post-processor function to a runner's output"""
return [[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image]

View File

@@ -0,0 +1,102 @@
"""Custom input builders for edge-cases in different models."""
from typing import Callable
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
resize_video, sample_frames_from_video)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs
from .types import ImageSizeWrapper, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
stop_sign = IMAGE_ASSETS[0].pil_image
cherry_blossom = IMAGE_ASSETS[1].pil_image
# Apply the selected formatter to the base prompts
img_prompts = [
"<image><image>\nDescribe 2 images.",
"<image><image>\nDescribe 2 images.",
"<image><image><image><image>\nDescribe 4 images.",
"<image>\nWhat is the season?",
]
formatted_prompts = [formatter(prompt) for prompt in img_prompts]
return [(
formatted_prompts,
[
[stop_sign, cherry_blossom],
# Images with different sizes and aspect-ratios
[
rescale_image_size(stop_sign, 0.1),
stop_sign,
],
[
stop_sign,
rescale_image_size(stop_sign, 0.25),
cherry_blossom.resize((183, 488)),
cherry_blossom.resize((488, 183))
],
cherry_blossom,
])]
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
num_frames: int = 16):
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
# Apply the selected formatter to the base prompts
video_prompts = [
"<video><video>\nDescribe 2 videos.",
"<video><video>\nDescribe 2 videos.",
"<video><video><video><video>\nDescribe 4 videos.",
"<video>\nWhy is this video funny?",
]
formatted_prompts = [formatter(prompt) for prompt in video_prompts]
return [(
formatted_prompts,
[
[video, video],
# Videos with different sizes and aspect-ratios
[
rescale_video_size(video, 0.1),
video,
],
[
video,
rescale_video_size(video, 0.25),
resize_video(video, (183, 488)),
resize_video(video, (488, 183))
],
video,
])]
def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
single_img_prompts = [
"<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?",
]
multi_img_prompts = [
"Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n", # noqa: E501
]
formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
return [
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
]

View File

@@ -0,0 +1,409 @@
"""Common utility functions relating to different models that are useful
for manipulating the input / output of HF & vLLM test runners, which are
typically specific to a small subset of models.
"""
import re
import types
from pathlib import PosixPath
from typing import Callable, List, Optional, Tuple, Union
import torch
from PIL.Image import Image
from transformers import AutoConfig, AutoTokenizer, BatchEncoding
from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import patch_padding_side
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from .....conftest import HfRunner, ImageAsset, _ImageAssets
from .types import RunnerOutput
####### vLLM output processors functions
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
hf_output_str = output_str + "\n"
tokenizer = AutoTokenizer.from_pretrained(model)
hf_output_ids = tokenizer.encode(hf_output_str)
assert hf_output_ids[0] == tokenizer.bos_token_id
hf_output_ids = hf_output_ids[1:]
return hf_output_ids, hf_output_str, out_logprobs
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
hf_output_str = output_str.lstrip() + "|ENDOFTEXT|"
return output_ids, hf_output_str, out_logprobs
def qwen_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
hf_output_str = output_str + "<|endoftext|>"
return output_ids, hf_output_str, out_logprobs
def qwen2_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
hf_output_str = output_str + "<|im_end|>"
return output_ids, hf_output_str, out_logprobs
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.image_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def llava_video_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.video_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
mm_token_id: int) -> RunnerOutput:
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
]
assert output_str[0] == " "
hf_output_str = output_str[1:]
if hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
return hf_output_ids, hf_output_str, out_logprobs
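# Sketch (token ids are made up): an output id sequence such as
# [32000, 32000, 32000, 450, 4259] collapses to [32000, 450, 4259], since runs
# of the multimodal placeholder id are reduced to a single occurrence before
# comparing against the HF output, and the EOS text is re-appended if the last
# generated token was EOS.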
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
output_ids, output_str, out_logprobs = vllm_output
config = AutoConfig.from_pretrained(model)
video_token_id = config.video_token_index
tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
]
hf_output_str = output_str
if hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
return hf_output_ids, hf_output_str, out_logprobs
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
assert output_str_without_image[0] == " "
output_str_without_image = output_str_without_image[1:]
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
tokenizer = AutoTokenizer.from_pretrained(model)
hf_output_ids = tokenizer.encode(output_str_without_image)
assert hf_output_ids[0] == 1
hf_output_ids = hf_output_ids[1:]
return hf_output_ids, hf_output_str, out_logprobs
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index
tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]
hf_output_str = output_str
if hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
return hf_output_ids, hf_output_str, out_logprobs
####### Post-processors for HF outputs
def minicmpv_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<|eot_id|>"):
output_str = output_str.split("<|eot_id|>")[0]
return output_ids, output_str, out_logprobs
####### Functions for converting image assets to embeddings
def get_llava_embeddings(image_assets: _ImageAssets):
return [asset.image_embeds for asset in image_assets]
####### postprocessors to run on HF BatchEncoding
def get_key_type_post_processor(
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
"""Gets a handle to a post processor which converts a given key into a
target data type."""
def process(hf_inputs: BatchEncoding, dtype: str):
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
return hf_inputs
return process
def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
return {"model_inputs": hf_inputs}
####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
_ImageAssets]) -> str:
"""Given a temporary dir path, export one or more image assets into the
tempdir & replace its contents with the local path to the string so that
the HF version of Qwen-VL can resolve the path and load the image in its
forward() call.
Args:
tmp_path: Tempdir for test under consideration.
prompt: Prompt with image placeholders.
assets: List of image assets whose len equals the num placeholders.
"""
# Ensure that the number of placeholders matches the number of assets;
# If this is not true, the test is probably written incorrectly.
assert prompt.count("<img></img>") == len(assets)
# Replace the placeholders with local paths to the exported assets
for asset in assets:
image_tmp_path = tmp_path / f"{asset.name}.jpg"
asset.pil_image.save(image_tmp_path)
prompt = prompt.replace(
"<img></img>",
f"<img>{image_tmp_path}</img>",
1,
)
return prompt
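# Example (paths are hypothetical): "<img></img>What is this?" with one exported
# asset becomes "<img>/tmp/pytest-xyz/stop_sign.jpg</img>What is this?", letting
# the HF Qwen-VL model open the file from disk inside its forward() call.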
####### Model-specific HuggingFace runner patchers
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for GLM4."""
hf_processor = hf_model.processor
patch_padding_side(hf_processor)
def processor(*args, text="", images=None, **kwargs):
if images is None:
return hf_processor(*args, **kwargs)
return hf_processor.apply_chat_template(
[{
"role": "user",
"image": images,
"content": text
}],
add_generation_prompt=True,
tokenize=True,
return_dict=True,
**kwargs,
)
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.transformer.output_layer
return hf_model
def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for H2OVL."""
class H2OVLProcessor:
"""A simple processor for H2OVL models."""
def __init__(self, hf_runner: HfRunner):
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.dtype = hf_runner.model.dtype
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, List[Image]],
**kwargs):
# yapf: disable
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
# yapf: enable
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values(image,
self.image_size,
self.min_num,
self.max_num,
self.use_thumbnail,
use_MSAC=self.config.use_msac).to(
self.dtype) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = H2OVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
return hf_model
def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for InternVL."""
class InternVLProcessor:
"""A simple processor for InternVL2 which misses a processor."""
def __init__(self, hf_runner: HfRunner):
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.dtype = hf_runner.model.dtype
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, List[Image]],
**kwargs):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values(image, self.image_size, self.min_num,
self.max_num,
self.use_thumbnail).to(self.dtype)
for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = InternVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
return hf_model
def _internvl_generate(
self,
pixel_values: torch.FloatTensor,
input_ids: torch.FloatTensor,
attention_mask: Optional[torch.LongTensor] = None,
**generate_kwargs,
) -> torch.LongTensor:
"""Generate method for InternVL2 model without fixed use_cache."""
assert self.img_context_token_id is not None
vit_embeds = self.extract_feature(pixel_values)
input_embeds = self.language_model.get_input_embeddings()(input_ids)
B, N, C = input_embeds.shape
input_embeds = input_embeds.reshape(B * N, C)
input_ids = input_ids.reshape(B * N)
selected = (input_ids == self.img_context_token_id)
assert selected.sum() != 0
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
input_embeds = input_embeds.reshape(B, N, C)
forward_kwargs = dict(
inputs_embeds=input_embeds,
attention_mask=attention_mask,
)
if getattr(self, "use_visual_token_mask", False):
visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
forward_kwargs["visual_token_mask"] = visual_token_mask
outputs = self.language_model.generate(
**forward_kwargs,
**generate_kwargs,
)
return outputs

View File

@@ -0,0 +1,139 @@
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from pathlib import PosixPath
from typing import Type
from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets
from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets):
assert test_case.size_wrapper is not None
inputs = builders.build_single_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key="images",
**model_test_info.get_non_parametrized_runner_kwargs())
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets):
assert test_case.size_wrapper is not None
inputs = builders.build_multi_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": len(image_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key="images",
**model_test_info.get_non_parametrized_runner_kwargs())
def run_embedding_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets):
assert test_case.size_wrapper is not None
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
vllm_embeddings=vllm_embeddings,
distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key="images",
**model_test_info.get_non_parametrized_runner_kwargs())


def run_video_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
video_assets: _VideoAssets,
):
assert test_case.size_wrapper is not None
assert test_case.num_video_frames is not None
inputs = builders.build_video_inputs_from_test_info(
model_test_info, video_assets, test_case.size_wrapper,
test_case.num_video_frames)
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"video": len(video_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key="videos",
**model_test_info.get_non_parametrized_runner_kwargs())


def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner]):
# Custom test cases can provide inputs directly, but they need to explicitly
# provide a CustomTestOptions, which wraps the inputs and the
# limit_mm_per_prompt (an illustrative CustomTestOptions sketch follows this
# function).
assert test_case.custom_test_opts is not None
inputs = test_case.custom_test_opts.inputs
limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
runner_mm_key = test_case.custom_test_opts.runner_mm_key
# Inputs, limit_mm_per_prompt, and runner_mm_key should all be set
assert inputs is not None
assert limit_mm_per_prompt is not None
assert runner_mm_key is not None
core.run_test(
hf_runner=hf_runner,
vllm_runner=vllm_runner,
inputs=inputs,
model=test_case.model,
dtype=test_case.dtype,
max_tokens=test_case.max_tokens,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt=limit_mm_per_prompt,
distributed_executor_backend=test_case.distributed_executor_backend,
runner_mm_key=runner_mm_key,
**model_test_info.get_non_parametrized_runner_kwargs())
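

# Illustrative sketch (not part of the original file): a minimal
# CustomTestOptions that a test author might attach to a VLMTestInfo so that
# run_custom_inputs_test can consume it. The prompt text and the two-image
# grouping are made up for illustration; only the field layout mirrors the
# CustomTestOptions type.
def _example_custom_test_opts(images):
    """Illustrative only: wrap one two-image prompt as CustomTestOptions."""
    from .types import CustomTestOptions

    return CustomTestOptions(
        # One batch entry: a single prompt paired with a list of two images.
        inputs=[(["Describe the two images in detail."], [images])],
        limit_mm_per_prompt={"image": 2},
        runner_mm_key="images",
    )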

View File

@@ -0,0 +1,186 @@
"""Types for writing multimodal model tests."""
from enum import Enum
from pathlib import PosixPath
from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
Tuple, Type, Union)
import torch
from PIL.Image import Image
from pytest import MarkDecorator
from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.sequence import SampleLogprobs
from vllm.utils import identity
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
from ....utils import check_logprobs_close

# meta image tag; will be replaced by the appropriate tag for the model
TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
# yapf: disable
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
})
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
RunnerOutput = Tuple[List[int], str, Optional[SampleLogprobs]]
# yapf: enable
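

# Illustrative sketch (not part of the original file): roughly how the meta
# placeholders above get swapped for a model's real image tag. The actual
# substitution lives in the builders module (not shown here); this helper only
# illustrates the img_idx_to_prompt contract.
def _example_replace_img_placeholders(prompt, img_idx_to_prompt):
    idx = 0
    while TEST_IMG_PLACEHOLDER in prompt:
        # Each occurrence becomes the model-specific tag, e.g. "<image>\n".
        prompt = prompt.replace(TEST_IMG_PLACEHOLDER, img_idx_to_prompt(idx), 1)
        idx += 1
    return prompt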


class VLMTestType(Enum):
IMAGE = 1
MULTI_IMAGE = 2
EMBEDDING = 3
VIDEO = 4
CUSTOM_INPUTS = 5


class SizeType(Enum):
SIZE_FACTOR = 1
FIXED_SIZE = 2


class CustomTestOptions(NamedTuple):
inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]]
limit_mm_per_prompt: Dict[str, int]
# Name of the kwarg used to pass multimodal data to the vLLM/HF runner
# instances.
runner_mm_key: str = "images"


class ImageSizeWrapper(NamedTuple):
type: SizeType
# A size factor is a wrapper of 0+ floats,
# while a fixed size contains an iterable of integer pairs
data: Union[Iterable[float], Iterable[Tuple[int, int]]]
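

# Illustrative sketch (not part of the original file): the two ways an
# ImageSizeWrapper is typically populated when test cases are expanded.
_EXAMPLE_SIZE_FACTOR_WRAPPER = ImageSizeWrapper(type=SizeType.SIZE_FACTOR,
                                                data=(0.25, 0.5, 1.0))
_EXAMPLE_FIXED_SIZE_WRAPPER = ImageSizeWrapper(type=SizeType.FIXED_SIZE,
                                               data=[(336, 336), (672, 672)])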


class VLMTestInfo(NamedTuple):
"""Holds the configuration for 1+ tests for one model architecture."""
models: Union[List[str]]
test_type: Union[VLMTestType, Iterable[VLMTestType]]
# Should be None only if this is a CUSTOM_INPUTS test
prompt_formatter: Optional[Callable[[str], str]] = None
img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
# Most models work on the single / multi-image prompts above, but in some
# cases the log prob check fails, e.g., for paligemma. We allow passing
# an override for the single image prompts / multi-image prompt for this
# reason.
single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT
# Function for converting ImageAssets to image embeddings; we need to
# define this explicitly for embedding tests.
convert_assets_to_embeddings: Optional[Callable[[_ImageAssets],
torch.Tensor]] = None
# Exposed options for the vLLM runner; we change these in several tests,
# but the defaults are derived from VllmRunner & the engine defaults.
# These settings are chosen to avoid OOMs when running in the CI.
enforce_eager: bool = True
max_model_len: int = 1024
max_num_seqs: int = 256
task: str = "auto"
tensor_parallel_size: int = 1
# Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None
# Exposed options for HF runner
model_kwargs: Optional[Dict[str, Any]] = None
# Indicates we should explicitly pass the EOS token from the tokenizer
use_tokenizer_eos: bool = False
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM
# Callable to pass to the HF runner to run on inputs; for now, we also pass
# the data type to input post processing, because almost all of the uses of
# postprocess_inputs are to fix the data types of BatchEncoding values.
postprocess_inputs: Callable[[BatchEncoding, str],
BatchEncoding] = identity
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
# Post processors that, if defined, will run on the outputs of the
# vLLM and HF runners, respectively (useful for sanitization, etc.).
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
# Consumes the output of the callables above and checks if they're equal
comparator: Callable[..., None] = check_logprobs_close
# Default expandable params per test; these defaults can be overridden in
# instances of this object; the complete set of test cases for the model
# is all combinations of .models + all fields below
max_tokens: Union[int, Tuple[int]] = 128
num_logprobs: Union[int, Tuple[int]] = 5
dtype: Union[str, Iterable[str]] = "half"
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
# Only expanded in video tests
num_video_frames: Union[int, Tuple[int]] = 16
# Fixed image sizes / image size factors; most tests use image_size_factors.
# The values provided for these two fields will be stacked and expanded
# such that each model will consider each image size factor / image size
# once per test (much like concatenating them and wrapping them in one
# parametrize call).
image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
image_sizes: Optional[Iterable[Iterable[Tuple[int, int]]]] = None
# Hack for updating a prompt to take into account a local path; currently
# only used for Qwen-VL, which requires encoding the image path / URL into
# the prompt for the HF runner
prompt_path_encoder: Optional[
Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]],
str]] = None # noqa: E501
# Allows configuring a test to run with custom inputs
custom_test_opts: Optional[List[CustomTestOptions]] = None
marks: Optional[List[MarkDecorator]] = None
def get_non_parametrized_runner_kwargs(self):
"""Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized
test cases.
"""
return {
"enforce_eager": self.enforce_eager,
"max_model_len": self.max_model_len,
"max_num_seqs": self.max_num_seqs,
"task": self.task,
"tensor_parallel_size": self.tensor_parallel_size,
"hf_output_post_proc": self.hf_output_post_proc,
"vllm_output_post_proc": self.vllm_output_post_proc,
"auto_cls": self.auto_cls,
"use_tokenizer_eos": self.use_tokenizer_eos,
"postprocess_inputs": self.postprocess_inputs,
"comparator": self.comparator,
"get_stop_token_ids": self.get_stop_token_ids,
"model_kwargs": self.model_kwargs,
"patch_hf_runner": self.patch_hf_runner,
}
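

# Illustrative sketch (not part of the original file): a minimal VLMTestInfo a
# model author might register for a plain single-image test. The model id and
# the chat template in prompt_formatter are placeholders, not a real entry
# from the test settings.
_EXAMPLE_VLM_TEST_INFO = VLMTestInfo(
    models=["my-org/my-vlm-7b"],
    test_type=VLMTestType.IMAGE,
    # Wrap each image prompt in the (hypothetical) chat format of the model.
    prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
    max_model_len=4096,
    dtype="half",
)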


class ExpandableVLMTestArgs(NamedTuple):
"""The expanded kwargs which correspond to a single test case."""
model: str
max_tokens: int
num_logprobs: int
dtype: str
distributed_executor_backend: Optional[str]
# Sizes are used for everything except for custom input tests
size_wrapper: Optional[ImageSizeWrapper] = None
# Video only
num_video_frames: Optional[int] = None
# Custom inputs only
custom_test_opts: Optional[CustomTestOptions] = None