Sync from v0.13
This commit is contained in:
0
tests/models/multimodal/__init__.py
Normal file
0
tests/models/multimodal/__init__.py
Normal file
0
tests/models/multimodal/generation/__init__.py
Normal file
0
tests/models/multimodal/generation/__init__.py
Normal file
35
tests/models/multimodal/generation/conftest.py
Normal file
35
tests/models/multimodal/generation/conftest.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM tests."""
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def pytest_configure(config):
    """Force the math SDP backend on ROCm for the tests in this directory.

    On non-ROCm platforms the hook returns immediately. On ROCm it turns
    off PyTorch's flash and memory-efficient scaled-dot-product attention
    backends and turns the math backend on, to avoid HF Transformers
    accuracy issues. The override is skipped when any command-line
    argument mentions one of the exempt test files.

    Args:
        config: The pytest config object; only ``config.args`` is read.
    """
    if not current_platform.is_rocm():
        return

    # Skip disabling SDP for Granite Speech tests on ROCm
    exempt_files = ["test_granite_speech.py"]
    for cli_arg in config.args:
        arg_text = str(cli_arg)
        for file_name in exempt_files:
            if file_name in arg_text:
                return

    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
    sdp_backends = torch.backends.cuda
    sdp_backends.enable_flash_sdp(False)
    sdp_backends.enable_mem_efficient_sdp(False)
    sdp_backends.enable_math_sdp(True)

    message = (
        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
        "to avoid HuggingFace Transformers accuracy issues"
    )
    warnings.warn(message, UserWarning, stacklevel=1)
|
||||
142
tests/models/multimodal/generation/test_audioflamingo3.py
Normal file
142
tests/models/multimodal/generation/test_audioflamingo3.py
Normal file
@@ -0,0 +1,142 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2025 The vLLM team.
|
||||
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
|
||||
# reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.registry import HF_EXAMPLE_MODELS
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
MODEL_NAME = "nvidia/audio-flamingo-3-hf"
|
||||
|
||||
|
||||
def get_fixture_path(filename):
    """Return the path of an Audio Flamingo 3 fixture file.

    The path is built relative to this module's directory and is not
    normalised, so it still contains the ``../..`` components.

    Args:
        filename: Name of the file inside the fixture directory.

    Returns:
        The joined path as a string.
    """
    module_dir = os.path.dirname(__file__)
    fixture_dir = "../../fixtures/audioflamingo3"
    return os.path.join(module_dir, fixture_dir, filename)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def llm():
    """Module-scoped vLLM engine for the Audio Flamingo 3 checkpoint.

    The dependent tests are skipped when the registry reports the
    installed transformers version as unsupported, or when constructing
    the engine raises any exception.
    """
    # Check if the model is supported by the current transformers version
    HF_EXAMPLE_MODELS.get_hf_info(
        "AudioFlamingo3ForConditionalGeneration"
    ).check_transformers_version(on_fail="skip")

    try:
        engine = LLM(
            model=MODEL_NAME,
            trust_remote_code=True,
            dtype="bfloat16",
            enforce_eager=True,
            limit_mm_per_prompt={"audio": 1},
        )
    except Exception as e:
        # Best effort: a model that cannot be loaded skips the module
        # instead of failing it.
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
    else:
        return engine
|
||||
|
||||
|
||||
def test_single_generation(llm):
    """Check a single audio transcription against the stored fixture.

    The test is skipped when the expected-results fixture is missing.
    The generated text must be non-empty and must either contain, or be
    contained in, the first expected transcription.

    Args:
        llm: The module-scoped engine fixture; only ``llm.chat`` is used.
    """
    fixture_path = get_fixture_path("expected_results_single.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(fixture_path, encoding="utf-8") as f:
        expected = json.load(f)

    audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio_url", "audio_url": {"url": audio_url}},
                {"type": "text", "text": "Transcribe the input speech."},
            ],
        }
    ]

    # temperature=0.0 requests greedy decoding.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)

    outputs = llm.chat(
        messages=messages,
        sampling_params=sampling_params,
    )
    generated_text = outputs[0].outputs[0].text.strip()

    expected_text = expected["transcriptions"][0]

    # The empty string is a substring of every string, so without these
    # guards an empty generation (or an empty fixture entry) would satisfy
    # the containment check below and the test would pass vacuously.
    assert expected_text, "Expected transcription in fixture is empty"
    assert generated_text, "Model generated an empty transcription"

    assert expected_text in generated_text or generated_text in expected_text
|
||||
|
||||
|
||||
def test_batched_generation(llm):
    """Check two batched audio conversations against the stored fixture.

    The test is skipped when the expected-results fixture is missing.
    Each generated answer must be non-empty and must either contain, or
    be contained in, the fixture entry selected by the item's
    ``expected_idx``.

    Args:
        llm: The module-scoped engine fixture; only ``llm.chat`` is used.
    """
    fixture_path = get_fixture_path("expected_results_batched.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(fixture_path, encoding="utf-8") as f:
        expected = json.load(f)

    items = [
        {
            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
            "question": "What is surprising about the relationship "
            "between the barking and the music?",
            "expected_idx": 0,
        },
        {
            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
            "question": (
                "Why is the philosopher's name mentioned in the lyrics? "
                "(A) To express a sense of nostalgia "
                "(B) To indicate that language cannot express clearly, "
                "satirizing the inversion of black and white in the world "
                "(C) To add depth and complexity to the lyrics "
                "(D) To showcase the wisdom and influence of the philosopher"
            ),
            "expected_idx": 1,
        },
    ]

    # One single-turn conversation per item: the audio clip, then the question.
    conversations = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
                    {"type": "text", "text": item["question"]},
                ],
            }
        ]
        for item in items
    ]

    # temperature=0.0 requests greedy decoding.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)

    outputs = llm.chat(
        messages=conversations,
        sampling_params=sampling_params,
    )

    # zip() below would silently drop unmatched entries, so check the count.
    assert len(outputs) == len(items), (
        f"Expected {len(items)} outputs, got {len(outputs)}"
    )

    for item, output in zip(items, outputs):
        generated_text = output.outputs[0].text.strip()
        # Use the index declared on the item rather than the loop position.
        expected_text = expected["transcriptions"][item["expected_idx"]]

        # The empty string is a substring of every string, so without these
        # guards an empty generation (or an empty fixture entry) would pass
        # the containment check vacuously.
        assert expected_text, "Expected transcription in fixture is empty"
        assert generated_text, "Model generated an empty answer"

        assert expected_text in generated_text or generated_text in expected_text
|
||||
1263
tests/models/multimodal/generation/test_common.py
Normal file
1263
tests/models/multimodal/generation/test_common.py
Normal file
File diff suppressed because it is too large
Load Diff
160
tests/models/multimodal/generation/test_granite_speech.py
Normal file
160
tests/models/multimodal/generation/test_granite_speech.py
Normal file
@@ -0,0 +1,160 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSpeechSeq2Seq
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|><|audio|>can you transcribe the speech into a written format?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501
|
||||
|
||||
|
||||
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, SampleLogprobs | None],
) -> tuple[list[int], str, SampleLogprobs | None]:
    """Make a vLLM output tuple comparable with the HF runner's output.

    The ``<|end_of_text|>`` marker is appended to the decoded string; the
    token ids and logprobs are passed through unchanged.

    Args:
        vllm_output: ``(token_ids, text, logprobs)`` as produced by vLLM.

    Returns:
        The same triple with the marker appended to the text.
    """
    token_ids, text, logprobs = vllm_output
    return token_ids, text + "<|end_of_text|>", logprobs
|
||||
|
||||
|
||||
MODEL_NAME = "ibm-granite/granite-speech-3.3-2b"
|
||||
# Audio lora co-exists directly in the model directory, but
|
||||
# currently still needs to be passed directly to vLLM.
|
||||
audio_lora_path = MODEL_NAME
|
||||
models = [MODEL_NAME]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
    """Set ``VLLM_ATTENTION_BACKEND`` to ``TRITON_ATTN`` on ROCm.

    Applied automatically to every test in this module; it does nothing
    on other platforms. The variable is set through ``monkeypatch``.
    """
    if not current_platform.is_rocm():
        return
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
|
||||
|
||||
|
||||
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptAudioInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Compare greedy logprobs from vLLM and HuggingFace for audio prompts.

    Every ``(prompts, audios)`` case in ``inputs`` is run through the vLLM
    runner (with the audio LoRA request attached) and then through the HF
    runner. The vLLM outputs are converted with ``vllm_to_hf_output`` and
    each pair is checked with ``check_logprobs_close``.

    Args:
        hf_runner: Context-manager factory for the HuggingFace runner.
        vllm_runner: Context-manager factory for the vLLM runner.
        inputs: Cases to run, each a ``(prompts, audios)`` pair.
        model: Model name passed to both runners.
        max_model_len: Forwarded to the vLLM runner.
        dtype: Forwarded to both runners.
        max_tokens: Maximum number of tokens generated per prompt.
        num_logprobs: Number of logprobs requested per generated token.
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    vllm_outputs_per_case = []
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=max_model_len,
        max_num_seqs=1,
        dtype=dtype,
        limit_mm_per_prompt={"audio": 1},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=64,
        enforce_eager=True,
    ) as vllm_model:
        lora_request = LoRARequest("audio", 1, audio_lora_path)
        for prompts, audios in inputs:
            case_outputs = vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
                lora_request=lora_request,
            )
            vllm_outputs_per_case.append(case_outputs)

    hf_outputs_per_case = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        for prompts, audios in inputs:
            case_outputs = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                # The HF runner receives the case's audios wrapped in a list.
                audios=[audios],
                eos_token_id=eos_token_id,
            )
            hf_outputs_per_case.append(case_outputs)

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        converted = [vllm_to_hf_output(output) for output in vllm_outputs]
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=converted,
            name_0="hf",
            name_1="vllm",
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
)
@pytest.mark.parametrize(
    "max_model_len", [512] if current_platform.is_rocm() else [2048]
)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    hf_runner,
    vllm_runner,
    model: str,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Run the HF-vs-vLLM comparison for Granite Speech on one audio clip.

    The test is skipped when the registry reports the model as unavailable
    online or the installed transformers version as unsupported. The first
    audio asset is paired with ``HF_AUDIO_PROMPT`` and handed to
    ``run_test`` with a tensor-parallel size of 1.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    audio, sr = audio_assets[0].audio_and_sample_rate
    # This model expects 16k sample rate, which our test audio
    # already is; if this changes, it may break this test,
    # so we check it directly
    assert sr == 16000

    cases = [
        ([HF_AUDIO_PROMPT], [audio]),
    ]
    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
|
||||
81
tests/models/multimodal/generation/test_interleaved.py
Normal file
81
tests/models/multimodal/generation/test_interleaved.py
Normal file
@@ -0,0 +1,81 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
|
||||
models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
|
||||
|
||||
|
||||
def base_prompt(modalities_str: str) -> str:
    """Build the chat prompt with the given modality placeholders.

    Args:
        modalities_str: Placeholder tokens placed right after ``user ``.

    Returns:
        The full prompt, ending with the assistant header and a newline.
    """
    user_turn = (
        f"<|im_start|>user {modalities_str}\n"
        "Describe what you see from these items.<|im_end|>"
    )
    assistant_header = "<|im_start|>assistant\n"
    return user_turn + assistant_header
|
||||
|
||||
|
||||
INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
|
||||
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
    """Check that the order of modality placeholders changes the output.

    The same two images and one video are sent with an interleaved and a
    non-interleaved prompt. The text after the ``assistant\\n`` marker must
    start at the same offset in both results, and the two generated
    strings must differ.
    """
    image_names = ("cherry_blossom", "stop_sign")
    images = [
        convert_image_mode(ImageAsset(name).pil_image, "RGB") for name in image_names
    ]
    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays

    # Both cases share the media; only the placeholder order differs.
    inputs = [
        ([INTERLEAVED_PROMPT], [images], [video]),
        ([NONINTERLEAVED_PROMPT], [images], [video]),
    ]

    vllm_outputs_per_case = []
    with vllm_runner(
        model,
        runner="generate",
        dtype=dtype,
        limit_mm_per_prompt={"image": 2},
        max_model_len=32768,
        max_num_seqs=2,
        tensor_parallel_size=1,
        enforce_eager=True,
    ) as vllm_model:
        for case_prompts, case_images, case_videos in inputs:
            vllm_outputs_per_case.append(
                vllm_model.generate_greedy(
                    case_prompts, max_tokens, images=case_images, videos=case_videos
                )
            )

    # Second element of the first output of each case is the full string.
    all_results = [case_outputs[0][1] for case_outputs in vllm_outputs_per_case]

    marker = "assistant\n"
    prompt_lengths = []
    generated_strs = []
    for total_str in all_results:
        # Offset of the first character after the assistant marker.
        prompt_len = total_str.find(marker) + len(marker)
        prompt_lengths.append(prompt_len)
        generated_strs.append(total_str[prompt_len:])

    interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
    interleaved_output_str, noninterleaved_output_str = generated_strs

    # The two prompts are identical except for the order of modality tokens.
    assert interleaved_prompt_len == noninterleaved_prompt_len

    # The two generated strings should be different because of the
    # interleaved modality tokens.
    assert interleaved_output_str != noninterleaved_output_str
|
||||
86
tests/models/multimodal/generation/test_keye.py
Normal file
86
tests/models/multimodal/generation/test_keye.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
from PIL.Image import Image
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
|
||||
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
|
||||
|
||||
QUESTION = "What is the content of each image?"
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
    """Bundle of engine arguments, prompt and images for one request.

    The first three fields are required; the remaining three default to
    ``None``.
    """

    # Required fields.
    engine_args: EngineArgs
    prompt: str
    image_data: list[Image]

    # Optional fields.
    stop_token_ids: list[int] | None = None
    chat_template: str | None = None
    sampling_params: SamplingParams | None = None
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("question", [QUESTION])
def test_keye_vl(
    image_assets,
    question: str,
):
    """Smoke-test multi-image generation with the Keye-VL checkpoint.

    All image assets are sent in one user turn together with the question.
    The prompt is rendered with the model's own chat template, and every
    generated text must be longer than 10 characters.
    """
    images = [asset.pil_image for asset in image_assets]

    # The chat template receives the images as base64 data URLs.
    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
    ]

    engine_args = EngineArgs(
        model=MODEL_NAME,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    content = [{"type": "image", "image": url} for url in image_urls]
    content.append({"type": "text", "text": question})
    messages = [
        {
            "role": "user",
            "content": content,
        },
    ]

    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Flatten the dataclass to keyword arguments and add the seed.
    llm_kwargs = {**asdict(engine_args), "seed": 42}
    llm = LLM(**llm_kwargs)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=None
    )

    request = {
        "prompt": prompt,
        "multi_modal_data": {"image": images},
    }
    outputs = llm.generate(request, sampling_params=sampling_params)

    separator = "-" * 50
    print(separator)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        assert len(generated_text) > 10, (
            f"Generated text is too short: {generated_text}"
        )
        print(separator)
|
||||
723
tests/models/multimodal/generation/test_maverick.py
Normal file
723
tests/models/multimodal/generation/test_maverick.py
Normal file
@@ -0,0 +1,723 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Create a reduced-layer version of the Maverick model for testing purposes.
|
||||
|
||||
This script creates a new model with fewer layers by:
|
||||
1. Loading the original Maverick model configuration
|
||||
2. Creating a reduced configuration
|
||||
3. Generating compatible safetensors files with appropriate weights
|
||||
4. Creating the necessary index files for vLLM compatibility
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
# Sample prompts for testing
|
||||
PROMPTS: list[str] = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
|
||||
def run_maverick_serving(model: str):
    """Generate from a reduced Maverick checkpoint and print the results.

    The engine is built with tensor parallel size 8, expert parallelism
    and an fp8 KV cache, then run over ``PROMPTS``. Any exception is
    printed and re-raised.

    Args:
        model: Model name or path handed to ``LLM``.
    """
    divider = "-" * 60
    try:
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

        engine_kwargs = dict(
            model=model,
            max_model_len=2048,
            enforce_eager=True,
            tensor_parallel_size=8,
            enable_expert_parallel=True,
            trust_remote_code=True,
            gpu_memory_utilization=0.4,
            kv_cache_dtype="fp8",
        )
        llm = LLM(**engine_kwargs)

        outputs = llm.generate(PROMPTS, sampling_params)

        # Print the outputs
        print("\nGenerated Outputs:\n" + divider)
        for result in outputs:
            print(f"Prompt: {result.prompt!r}")
            print(f"Output: {result.outputs[0].text!r}")
            print(divider)

    except Exception as e:
        print(f"Error initializing or running model: {e}")
        raise
|
||||
|
||||
|
||||
def get_rope_layers_config(model_path: str) -> list[int]:
    """Read ``text_config.no_rope_layers`` from a checkpoint's config.json.

    Args:
        model_path: Path to the local directory containing the reduced
            Maverick model checkpoint.

    Returns:
        The ``no_rope_layers`` value stored in the config, unchanged.
        Per the original documentation, each entry is 0 or 1: 1 means the
        layer uses RoPE (and local attention), 0 means it does not.

    Raises:
        KeyError: If ``text_config`` or ``no_rope_layers`` is missing.
    """
    with open(Path(model_path) / "config.json") as config_file:
        model_config = json.load(config_file)

    no_rope_layers = model_config["text_config"]["no_rope_layers"]
    print(f"Found no_rope_layers: {no_rope_layers}")
    return no_rope_layers
|
||||
|
||||
|
||||
def create_reduced_maverick_model(
    original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    output_dir: str = "/tmp/reduced_maverick",
    text_layers: int = 4,
    num_experts: int = 4,
    vision_layers: int = 2,
    force_recreate: bool = False,
) -> str:
    """Build a reduced-layer Maverick checkpoint on disk.

    If ``output_dir`` already exists it is reused as-is, unless
    ``force_recreate`` is set, in which case it is deleted first. On any
    failure during creation the output directory is removed and the
    exception is re-raised.

    Args:
        original_model_name: Name of the original Maverick model
        output_dir: Directory to save the reduced model
        text_layers: Number of text transformer layers
        num_experts: Number of experts per layer
        vision_layers: Number of vision transformer layers
        force_recreate: Whether to recreate if output_dir already exists

    Returns:
        Path to the reduced model directory, as a string.
    """
    print(
        f"Creating reduced Maverick model with {text_layers} text layers and "
        f"{vision_layers} vision layers..."
    )

    # Create output directory
    output_path = Path(output_dir)
    if output_path.exists():
        if not force_recreate:
            print(
                f"Output directory {output_dir} already exists. "
                "Use --force-recreate to overwrite."
            )
            return str(output_path)
        shutil.rmtree(output_path)

    output_path.mkdir(parents=True, exist_ok=True)

    try:
        print("Loading original model configuration...")
        original_config = AutoConfig.from_pretrained(
            original_model_name, trust_remote_code=True
        )

        print("Creating reduced configuration...")
        reduced_config = create_reduced_config(
            original_config, text_layers, num_experts, vision_layers
        )

        config_path = output_path / "config.json"
        with open(config_path, "w") as config_file:
            json.dump(reduced_config, config_file, indent=2)
        print(f"Saved reduced config to {config_path}")

        print("Copying tokenizer files...")
        copy_tokenizer_files(original_model_name, output_path)

        print("Creating reduced safetensors files...")
        create_reduced_safetensors(original_config, reduced_config, output_path)

        print("Creating preprocessor config...")
        create_preprocessor_config(original_config, output_path)

        # The generation config is optional: failing to copy it is
        # reported but does not abort the build.
        try:
            GenerationConfig.from_pretrained(original_model_name).save_pretrained(
                output_path
            )
            print("Copied generation config")
        except Exception as e:
            print(f"Could not copy generation config: {e}")

        print(f"Successfully created reduced Maverick model at {output_path}")
        return str(output_path)

    except Exception as e:
        print(f"Error creating reduced model: {e}")
        # Clean up on failure
        if output_path.exists():
            shutil.rmtree(output_path)
        raise
|
||||
|
||||
|
||||
def create_reduced_config(
    original_config: Any, text_layers: int, num_experts: int, vision_layers: int
) -> dict[str, Any]:
    """Derive a reduced configuration dictionary from the original config.

    When a ``text_config`` section is present, its layer count, layer
    types and expert count are reduced, and ``hidden_size`` and
    ``head_dim`` are divided by 4. When a ``vision_config`` section is
    present, its layer count is reduced. ``_name_or_path`` is always
    rewritten to mark the result as reduced.

    Args:
        original_config: Object exposing ``to_dict()``.
        text_layers: Number of text layers to keep.
        num_experts: Number of local experts to keep.
        vision_layers: Number of vision layers to keep.

    Returns:
        The modified dictionary obtained from ``original_config.to_dict()``.
    """
    # Convert config to dictionary
    config_dict = original_config.to_dict()

    # Reduce text layers
    if "text_config" in config_dict:
        text_cfg = config_dict["text_config"]

        previous_layers = text_cfg["num_hidden_layers"]
        text_cfg["num_hidden_layers"] = text_layers
        text_cfg["layer_types"] = text_cfg["layer_types"][:text_layers]
        print(f"Reduced text layers from {previous_layers} to {text_layers}")

        previous_experts = text_cfg["num_local_experts"]
        text_cfg["num_local_experts"] = num_experts
        print(f"Reduced num experts from {previous_experts} to {num_experts}")

        hidden_dim_divisor = 4

        previous_hidden = text_cfg["hidden_size"]
        text_cfg["hidden_size"] = previous_hidden // hidden_dim_divisor
        print(
            f"Reduced hidden size from {previous_hidden} to {text_cfg['hidden_size']}"
        )

        previous_head_dim = text_cfg["head_dim"]
        text_cfg["head_dim"] = previous_head_dim // hidden_dim_divisor
        print(f"Reduced head dim from {previous_head_dim} to {text_cfg['head_dim']}")

    # Reduce vision layers
    if "vision_config" in config_dict:
        vision_cfg = config_dict["vision_config"]
        previous_vision_layers = vision_cfg["num_hidden_layers"]
        vision_cfg["num_hidden_layers"] = vision_layers
        print(f"Reduced vision layers from {previous_vision_layers} to {vision_layers}")

    # Update model name to indicate it's a reduced version
    config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"

    return config_dict
|
||||
|
||||
|
||||
def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
    """Save the original model's tokenizer into ``output_path``.

    This is best effort: any exception is reported as a printed warning
    and is not propagated.

    Args:
        original_model_name: Model name passed to ``AutoTokenizer``.
        output_path: Directory the tokenizer files are written to.
    """
    try:
        AutoTokenizer.from_pretrained(
            original_model_name, trust_remote_code=True
        ).save_pretrained(output_path)
    except Exception as e:
        print(f"Warning: Could not copy tokenizer files: {e}")
    else:
        print("Tokenizer files copied successfully")
|
||||
|
||||
|
||||
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
    """Save the original model's processor files into ``output_path``.

    The processor is loaded from ``original_config._name_or_path``, falling
    back to the full Maverick FP8 model name when that value is falsy.

    Args:
        original_config: Config object exposing ``_name_or_path``.
        output_path: Directory the processor files are written to.

    Raises:
        Exception: Any error raised while loading or saving is printed
            and then re-raised.
    """
    # Try to load the original preprocessor config
    try:
        source_name = (
            original_config._name_or_path
            or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
        )
        processor = AutoProcessor.from_pretrained(source_name, trust_remote_code=True)
        processor.save_pretrained(output_path)
        print("Copied original preprocessor config")
    except Exception as e:
        print(f"Could not copy original preprocessor config: {e}")
        raise
|
||||
|
||||
|
||||
def create_reduced_safetensors(
    original_config: Any, reduced_config: dict[str, Any], output_path: Path
) -> None:
    """Generate synthetic weights for the reduced model and save them.

    Text, vision and shared weights are built in that order, merged into
    one dictionary and passed to ``save_weights_to_safetensors``.

    Args:
        original_config: Not used by this function.
        reduced_config: Reduced config containing ``text_config`` and
            ``vision_config`` sections.
        output_path: Directory the safetensors files are written to.
    """
    print("Generating synthetic weights for reduced model...")

    text_config = reduced_config["text_config"]
    vision_config = reduced_config["vision_config"]

    # Each step is a progress message plus a deferred weight builder; the
    # builders run in order and later results override earlier keys.
    steps = (
        (
            "Creating text model weights...",
            lambda: create_text_model_weights(text_config),
        ),
        (
            "Creating vision model weights...",
            lambda: create_vision_model_weights(vision_config),
        ),
        (
            "Creating shared model weights...",
            lambda: create_shared_weights(text_config, vision_config),
        ),
    )

    weights = {}
    for banner, build in steps:
        print(banner)
        weights.update(build())

    print("Saving weights to safetensors files...")
    save_weights_to_safetensors(weights, output_path)
|
||||
|
||||
|
||||
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the text model with MoE structure.

    All projection weights use the ``torch.nn.Linear`` layout
    ``(out_features, in_features)``. In particular ``k_proj`` and ``v_proj``
    share the shape ``(num_key_value_heads * head_dim, hidden_size)``.

    Args:
        text_config: Reduced text-model config. Required keys:
            ``vocab_size``, ``hidden_size``, ``intermediate_size``,
            ``intermediate_size_mlp``, ``num_hidden_layers``,
            ``num_attention_heads`` and ``num_local_experts``. Optional keys:
            ``num_key_value_heads`` (defaults to ``num_attention_heads``) and
            ``interleave_moe_layer_step`` (defaults to 1).

    Returns:
        Mapping from checkpoint parameter name to a newly created tensor.

    Raises:
        KeyError: If a required key is missing from ``text_config``.
        AssertionError: If ``num_local_experts`` is absent.
    """
    weights: dict[str, torch.Tensor] = {}

    vocab_size = text_config["vocab_size"]
    hidden_size = text_config["hidden_size"]
    intermediate_size = text_config["intermediate_size"]
    intermediate_size_mlp = text_config["intermediate_size_mlp"]
    num_layers = text_config["num_hidden_layers"]
    num_attention_heads = text_config["num_attention_heads"]
    num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)

    # MoE specific parameters
    num_experts = text_config.get("num_local_experts")
    assert num_experts is not None, "num_local_experts must be specified for MoE"

    head_dim = hidden_size // num_attention_heads
    q_size = num_attention_heads * head_dim
    kv_size = num_key_value_heads * head_dim

    # Feed-forward pattern is driven by interleave_moe_layer_step.
    # For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
    # 0,2,4,... are dense. The value does not change per layer, so read it once.
    interleave_step = text_config.get("interleave_moe_layer_step", 1)

    # Embedding layers
    weights["language_model.model.embed_tokens.weight"] = torch.randn(
        vocab_size, hidden_size, dtype=torch.float16
    )

    # Transformer layers
    for layer_idx in range(num_layers):
        layer_prefix = f"language_model.model.layers.{layer_idx}"
        print(f"Creating weights for layer {layer_prefix}...")

        # Self-attention weights (separate q, k, v projections), each stored
        # as (out_features, in_features).
        attn_prefix = f"{layer_prefix}.self_attn"
        weights[f"{attn_prefix}.q_proj.weight"] = torch.randn(
            q_size, hidden_size, dtype=torch.bfloat16
        )
        weights[f"{attn_prefix}.k_proj.weight"] = torch.randn(
            kv_size, hidden_size, dtype=torch.bfloat16
        )
        weights[f"{attn_prefix}.v_proj.weight"] = torch.randn(
            kv_size, hidden_size, dtype=torch.bfloat16
        )
        weights[f"{attn_prefix}.o_proj.weight"] = torch.randn(
            hidden_size, q_size, dtype=torch.bfloat16
        )
        print("Self-attention weights created.")

        ff_prefix = f"{layer_prefix}.feed_forward"
        is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0

        if is_moe_layer:
            # MoE layer structure
            # 1. Router weights
            weights[f"{ff_prefix}.router.weight"] = torch.randn(
                num_experts, hidden_size, dtype=torch.float16
            )

            # 2. Individual expert weights (not fused)
            for expert_idx in range(num_experts):
                expert_prefix = f"{ff_prefix}.experts.{expert_idx}"

                weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
                    intermediate_size, hidden_size, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
                    intermediate_size, hidden_size, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
                    hidden_size, intermediate_size, dtype=torch.bfloat16
                )

                # Expert weight scales (FP8 quantization): one scale per
                # output row of the matching projection.
                weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
                    intermediate_size, 1, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
                    intermediate_size, 1, dtype=torch.bfloat16
                )
                weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
                    hidden_size, 1, dtype=torch.bfloat16
                )

            # 3. Shared expert weights
            shared_expert_prefix = f"{ff_prefix}.shared_expert"
            weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
                intermediate_size, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
                intermediate_size, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
                hidden_size, intermediate_size, dtype=torch.bfloat16
            )
            print(f"MoE feed-forward weights created for layer {layer_idx}.")
        else:
            # Dense layer structure
            weights[f"{ff_prefix}.gate_proj.weight"] = torch.randn(
                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{ff_prefix}.up_proj.weight"] = torch.randn(
                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
            )
            weights[f"{ff_prefix}.down_proj.weight"] = torch.randn(
                hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
            )
            print(f"Dense feed-forward weights created for layer {layer_idx}.")

        # Layer norms
        weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
            hidden_size, dtype=torch.bfloat16
        )
        weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
            hidden_size, dtype=torch.bfloat16
        )
        print("Layer norms created.")

    # Final layer norm and output projection
    weights["language_model.model.norm.weight"] = torch.ones(
        hidden_size, dtype=torch.bfloat16
    )
    weights["language_model.lm_head.weight"] = torch.randn(
        vocab_size, hidden_size, dtype=torch.bfloat16
    )

    return weights
|
||||
|
||||
|
||||
def create_vision_model_weights(
    vision_config: dict[str, Any],
) -> dict[str, torch.Tensor]:
    """Build random bfloat16 weights for every vision transformer layer.

    Reads ``hidden_size``, ``intermediate_size`` and ``num_hidden_layers``
    from *vision_config*. Each layer gets four attention projections and two
    MLP linears (random weight, zero bias) followed by two layer norms
    (unit weight, zero bias).
    """
    dim = vision_config["hidden_size"]
    mlp_dim = vision_config["intermediate_size"]
    depth = vision_config["num_hidden_layers"]
    bf16 = torch.bfloat16

    # (sub-module name, out_features, in_features), in checkpoint key order.
    linear_specs = (
        ("self_attn.q_proj", dim, dim),
        ("self_attn.k_proj", dim, dim),
        ("self_attn.v_proj", dim, dim),
        ("self_attn.o_proj", dim, dim),
        ("mlp.fc1", mlp_dim, dim),
        ("mlp.fc2", dim, mlp_dim),
    )
    norm_names = ("input_layernorm", "post_attention_layernorm")

    tensors: dict[str, torch.Tensor] = {}
    for idx in range(depth):
        base = f"vision_model.model.layers.{idx}"

        for name, out_features, in_features in linear_specs:
            tensors[f"{base}.{name}.weight"] = torch.randn(
                out_features, in_features, dtype=bf16
            )
            tensors[f"{base}.{name}.bias"] = torch.zeros(out_features, dtype=bf16)

        for name in norm_names:
            tensors[f"{base}.{name}.weight"] = torch.ones(dim, dtype=bf16)
            tensors[f"{base}.{name}.bias"] = torch.zeros(dim, dtype=bf16)

    return tensors
|
||||
|
||||
|
||||
def create_shared_weights(
    text_config: dict[str, Any], vision_config: dict[str, Any]
) -> dict[str, torch.Tensor]:
    """Create weights for shared components (vision-language connector).

    The single projector weight has shape
    ``(text_config["hidden_size"], vision_config["projector_input_dim"])``.
    """
    out_dim = text_config["hidden_size"]
    in_dim = vision_config["projector_input_dim"]

    # Vision-language connector (projects vision features to text space)
    return {
        "multi_modal_projector.linear_1.weight": torch.randn(
            out_dim, in_dim, dtype=torch.bfloat16
        ),
    }
|
||||
|
||||
|
||||
def save_weights_to_safetensors(
    weights: dict[str, torch.Tensor], output_path: Path
) -> None:
    """Write *weights* as one or more safetensors files plus a JSON index.

    Tensors are packed greedily, in dict order, into shards of at most 5GB.
    A single shard is saved as ``model.safetensors``; several shards use the
    ``model-XXXXX-of-YYYYY.safetensors`` naming. The index file
    ``model.safetensors.index.json`` is always written.
    """
    shard_limit = 5 * 1024 * 1024 * 1024  # 5GB per shard

    def nbytes(t: torch.Tensor) -> int:
        return t.numel() * t.element_size()

    # Greedy packing: start a new shard when the next tensor would overflow
    # a non-empty current shard.
    shards: list[dict[str, torch.Tensor]] = []
    pending: dict[str, torch.Tensor] = {}
    pending_bytes = 0
    for key, value in weights.items():
        size = nbytes(value)
        if pending and pending_bytes + size > shard_limit:
            shards.append(pending)
            pending, pending_bytes = {}, 0
        pending[key] = value
        pending_bytes += size
    if pending:
        shards.append(pending)

    # Save shards and record which file holds each parameter.
    weight_map: dict[str, str] = {}
    shard_count = len(shards)
    if shard_count == 1:
        filename = "model.safetensors"
        save_file(shards[0], output_path / filename)
        weight_map = dict.fromkeys(shards[0], filename)
        print(f"Saved weights to single file: {filename}")
    else:
        for number, shard in enumerate(shards, start=1):
            filename = f"model-{number:05d}-of-{shard_count:05d}.safetensors"
            save_file(shard, output_path / filename)
            weight_map.update(dict.fromkeys(shard, filename))
            print(f"Saved shard {number}/{shard_count}: {filename}")

    # Create index file
    total_size = sum(nbytes(t) for t in weights.values())
    index_data = {
        "metadata": {"total_size": total_size},
        "weight_map": weight_map,
    }

    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as f:
        json.dump(index_data, f, indent=2)

    print(f"Created index file: {index_path}")
    print(f"Total model size: {total_size / (1024**3):.2f} GB")
|
||||
|
||||
|
||||
def check_attention_spec_interleaved_rope(
    llm: LLM,
    num_attention_layers: int,
    num_ranks: int,
    rope_layers: list[int],
):
    """Assert that each rank reports the expected KV-cache spec per layer.

    For every rank, the number of KV-cache specs must equal
    ``num_attention_layers``. Layer ``i`` must use ``FullAttentionSpec`` when
    ``rope_layers[i] == 0`` and ``ChunkedLocalAttentionSpec`` otherwise.
    """
    assert isinstance(llm.llm_engine.model_executor, Executor)
    specs_by_rank = llm.llm_engine.model_executor.get_kv_cache_specs()

    for rank in range(num_ranks):
        rank_specs = specs_by_rank[rank]
        assert len(rank_specs.keys()) == num_attention_layers

        for layer_idx in range(num_attention_layers):
            expected_spec = (
                FullAttentionSpec
                if rope_layers[layer_idx] == 0
                else ChunkedLocalAttentionSpec
            )
            layer_key = f"language_model.model.layers.{layer_idx}.self_attn.attn"
            assert isinstance(
                rank_specs[layer_key],
                expected_spec,
            )
|
||||
|
||||
|
||||
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
    """Generate from ``PROMPTS`` with *llm* and print each prompt/output pair.

    When *should_profile* is true, generation is bracketed by
    ``llm.start_profile()`` / ``llm.stop_profile()``.
    """
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)

    if should_profile:
        llm.start_profile()
    results = llm.generate(PROMPTS, params)
    if should_profile:
        llm.stop_profile()

    print("Test generation successful!")
    separator = "-" * 40
    for result in results:
        print(f"Prompt: {result.prompt}")
        print(f"Output: {result.outputs[0].text}")
        print(separator)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "original_model_name,text_layers,num_experts,vision_layers,",
    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
    monkeypatch,
    original_model_name: str,
    text_layers: int,
    num_experts: int,
    vision_layers: int,
    enforce_eager: bool,
    tp: int,
    ep: bool,
    output_dir: str = "/tmp/reduced_maverick",
    force_recreate: bool = True,
    profile: bool = False,
) -> None:
    """Build a reduced Maverick checkpoint, load it in vLLM and generate.

    Steps: create the reduced model on disk, read its rope-layer layout,
    start an ``LLM`` with the requested tensor/expert parallelism, verify the
    per-layer KV-cache specs, then run a short generation (optionally
    profiled).
    """
    # Disable multiprocessing allows us to access model executor from LLM engine
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    reduced_model_path = create_reduced_maverick_model(
        original_model_name=original_model_name,
        output_dir=output_dir,
        text_layers=text_layers,
        num_experts=num_experts,
        vision_layers=vision_layers,
        force_recreate=force_recreate,
    )
    print(f"\nReduced model created successfully at: {reduced_model_path}")

    rope_layers = get_rope_layers_config(reduced_model_path)

    engine = LLM(
        model=reduced_model_path,
        trust_remote_code=True,
        max_model_len=512,  # Small context for testing
        gpu_memory_utilization=0.3,  # Conservative memory usage
        enforce_eager=enforce_eager,
        tensor_parallel_size=tp,
        enable_expert_parallel=ep,
    )

    # One rank per tensor-parallel worker; one attention layer per text layer.
    check_attention_spec_interleaved_rope(engine, text_layers, tp, rope_layers)

    print(f"\nTesting reduced model at {reduced_model_path}...")
    run_reduced_model(llm=engine, should_profile=profile)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point to create and test the reduced model.

    Flags:
        --output-dir: Directory the reduced model is written to.
        --text-layers / --num-experts / --vision-layers: Size of the
            reduced model.
        --force-recreate: Recreate the output directory if it exists.
        --test: Build the reduced model and run it through vLLM.
        --profile: Profile generation (only used together with --test).
        --test-original: Run the original model through vLLM.
        --original-model: Name of the model the reduction is based on.
    """

    import argparse

    parser = argparse.ArgumentParser(
        description="Create a reduced-layer Maverick model"
    )
    parser.add_argument(
        "--output-dir",
        default="/tmp/reduced_maverick",
        help="Output directory for the reduced model",
    )
    parser.add_argument(
        "--text-layers",
        type=int,
        default=4,
        help="Number of text transformer layers",
    )
    parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
    parser.add_argument(
        "--vision-layers",
        type=int,
        default=2,
        help="Number of vision transformer layers",
    )
    parser.add_argument(
        "--force-recreate",
        action="store_true",
        help="Force recreation if output directory exists",
    )
    parser.add_argument(
        "--test", action="store_true", help="Test the created model with vLLM"
    )
    parser.add_argument(
        "--profile", action="store_true", help="Profile the created model with vLLM"
    )
    parser.add_argument(
        "--test-original",
        action="store_true",
        help="Test the original model with vLLM",
    )
    parser.add_argument(
        "--original-model",
        default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        help="Original model name to base the reduction on",
    )

    args = parser.parse_args()

    if args.test:
        # test_dummy_maverick requires pytest's `monkeypatch` fixture, which
        # does not exist outside a pytest run. Provide a standalone
        # MonkeyPatch so the call does not fail with a missing-argument
        # TypeError; the context manager undoes the env change on exit.
        with pytest.MonkeyPatch.context() as monkeypatch:
            test_dummy_maverick(
                monkeypatch=monkeypatch,
                original_model_name=args.original_model,
                output_dir=args.output_dir,
                text_layers=args.text_layers,
                num_experts=args.num_experts,
                vision_layers=args.vision_layers,
                force_recreate=args.force_recreate,
                tp=2,
                ep=True,
                enforce_eager=True,
                profile=args.profile,
            )

    if args.test_original:
        run_maverick_serving(args.original_model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use SystemExit rather than the `exit` helper injected by the `site`
    # module, which is meant for interactive sessions and may be absent.
    raise SystemExit(main())
|
||||
180
tests/models/multimodal/generation/test_multimodal_gguf.py
Normal file
180
tests/models/multimodal/generation/test_multimodal_gguf.py
Normal file
@@ -0,0 +1,180 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
from pytest import MarkDecorator
|
||||
from transformers import AutoModelForImageTextToText
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
class GGUFMMTestConfig(NamedTuple):
    """Description of one multimodal GGUF test case."""

    # Hugging Face model id used for the tokenizer and the HF baseline.
    original_model: str
    # Hub repository holding the GGUF files.
    gguf_repo: str
    # File name of the language-model GGUF inside `gguf_repo`.
    gguf_backbone: str
    # File name of the multimodal projector GGUF inside `gguf_repo`.
    gguf_mmproj: str
    prompt: list[str]
    image_names: list[str]  # Store names, load PIL images at runtime
    max_model_len: int = 4096
    # NOTE(review): the two defaults below are mutable objects shared by every
    # instance that relies on them; do not mutate them in place.
    marks: list[MarkDecorator] = []
    mm_processor_kwargs: dict[str, Any] = {}

    @property
    def gguf_model(self):
        """Download both GGUF files and return the local backbone path.

        The projector file is fetched first and its path is discarded; only
        the backbone path is returned.
        """
        repo = self.gguf_repo
        hf_hub_download(repo, filename=self.gguf_mmproj)
        backbone_path = hf_hub_download(repo, filename=self.gguf_backbone)
        return backbone_path
|
||||
|
||||
|
||||
# Common prompts aligned with test_common.py "gemma3" entry format.
# One prompt per image asset, keyed by asset name; each wraps the question
# in Gemma 3 chat-turn markers with a single <start_of_image> placeholder.
_GEMMA3_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": (
            "<bos><start_of_turn>user\n"
            "<start_of_image>What's the content in the center of the image?"
            "<end_of_turn>\n<start_of_turn>model\n"
        ),
        "cherry_blossom": (
            "<bos><start_of_turn>user\n"
            "<start_of_image>What is the season?"
            "<end_of_turn>\n<start_of_turn>model\n"
        ),
    }
)

# Image asset names - load at runtime to avoid pickle issues with subprocess
_GEMMA3_IMAGE_NAMES = ["stop_sign", "cherry_blossom"]

# Regular multimodal (no pan-and-scan) - uses QAT Q4_0 GGUF
GEMMA3_CONFIG = GGUFMMTestConfig(
    original_model="google/gemma-3-4b-it",
    gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf",
    gguf_backbone="gemma-3-4b-it-q4_0.gguf",
    gguf_mmproj="mmproj-model-f16-4B.gguf",
    prompt=_GEMMA3_PROMPTS,
    image_names=_GEMMA3_IMAGE_NAMES,
    max_model_len=4096,
    marks=[pytest.mark.core_model],
    mm_processor_kwargs={},
)

# Pan-and-scan multimodal - uses unquantized BF16 GGUF
GEMMA3_CONFIG_PAN_AND_SCAN = GGUFMMTestConfig(
    original_model="google/gemma-3-4b-it",
    gguf_repo="unsloth/gemma-3-4b-it-GGUF",
    gguf_backbone="gemma-3-4b-it-BF16.gguf",
    gguf_mmproj="mmproj-BF16.gguf",
    prompt=_GEMMA3_PROMPTS,
    image_names=_GEMMA3_IMAGE_NAMES,
    max_model_len=4096,
    marks=[pytest.mark.core_model],
    mm_processor_kwargs={"do_pan_and_scan": True},
)

# Every config listed here becomes one parametrized test case.
MODELS_TO_TEST = [GEMMA3_CONFIG, GEMMA3_CONFIG_PAN_AND_SCAN]
|
||||
|
||||
|
||||
def run_multimodal_gguf_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    """Compare greedy logprobs of a GGUF model in vLLM against the HF model.

    Each image in ``model.image_names`` is paired with its prompt and rescaled
    by three size factors; the resulting batches are run through vLLM (GGUF
    weights) and then through the HF runner, and the outputs are checked with
    ``check_logprobs_close``.
    """
    # Load images at runtime (inside subprocess) to avoid pickle issues
    pil_images = [ImageAsset(name).pil_image for name in model.image_names]
    scales = [0.25, 0.5, 1.0]

    # One (prompts, images) batch per asset: same prompt, one image per scale.
    batches = []
    for pil_image, prompt in zip(pil_images, model.prompt):
        batch_prompts = [prompt] * len(scales)
        batch_images = [rescale_image_size(pil_image, scale) for scale in scales]
        batches.append((batch_prompts, batch_images))

    # NOTE: Run vLLM first to avoid CUDA init issues with multiprocessing fork.
    # Run GGUF model via vLLM.
    gguf_results = []
    with (
        set_default_torch_num_threads(1),
        vllm_runner(
            model_name=model.gguf_model,
            enforce_eager=True,
            tokenizer_name=model.original_model,
            dtype=dtype,
            max_model_len=model.max_model_len,
            mm_processor_kwargs=model.mm_processor_kwargs,
        ) as gguf_model,
    ):
        for batch_prompts, batch_images in batches:
            gguf_results.append(
                gguf_model.generate_greedy_logprobs(
                    batch_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=batch_images,
                )
            )

    # Then run HfRunner for HuggingFace baseline comparison.
    hf_results = []
    with hf_runner(
        model.original_model,
        dtype=dtype,
        auto_cls=AutoModelForImageTextToText,
    ) as hf_model:
        for batch_prompts, batch_images in batches:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    batch_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=batch_images,
                )
            )

    for hf_case, gguf_case in zip(hf_results, gguf_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=gguf_case,
            name_0="hf",
            name_1="gguf",
        )
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not is_quant_method_supported("gguf"),
    reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
    "model",
    [pytest.param(cfg, marks=cfg.marks) for cfg in MODELS_TO_TEST],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
def test_gemma3_mm_gguf(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Run the GGUF-vs-HF logprob comparison for each Gemma 3 config."""
    run_multimodal_gguf_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        model=model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
    )
|
||||
317
tests/models/multimodal/generation/test_phi4mm.py
Normal file
317
tests/models/multimodal/generation/test_phi4mm.py
Normal file
@@ -0,0 +1,317 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from collections.abc import Sequence
|
||||
|
||||
import librosa
|
||||
import pytest
|
||||
import regex as re
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import convert_image_mode, rescale_image_size
|
||||
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# Single-image prompts in the Phi-4 chat format, keyed by image asset name.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
        "cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
    }
)
# Prompt referencing two images at once, used by the multi-image test.
HF_MULTIIMAGE_IMAGE_PROMPT = (
    "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
)

# NOTE: this download runs at import time, i.e. during test collection.
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
# Example audio question inside the downloaded snapshot.
speech_question = os.path.join(
    model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
|
||||
|
||||
|
||||
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
):
    """Convert a vLLM output tuple into the form produced by the HF runner.

    Image placeholder tokens and the leading space are removed from the text.
    The token ids are re-encoded from the cleaned text with the first id
    (asserted to be 1) removed, and ``<|end|><|endoftext|>`` is appended to
    the returned string. Logprobs are passed through unchanged.
    """
    _, raw_text, out_logprobs = vllm_output

    cleaned = re.sub(r"(<\|image_\d+\|>)+", "", raw_text)
    assert cleaned[0] == " "
    cleaned = cleaned[1:]

    tokenizer = AutoTokenizer.from_pretrained(model)
    token_ids = tokenizer.encode(cleaned)
    assert token_ids[0] == 1

    return token_ids[1:], cleaned + "<|end|><|endoftext|>", out_logprobs
|
||||
|
||||
|
||||
target_dtype = "half"
|
||||
|
||||
|
||||
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.

    Each element of ``inputs`` is a ``(prompts, images, audios)`` case that is
    passed to both runners. ``mm_limit`` is the per-prompt image limit given
    to vLLM.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=max_model_len,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=320,
        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
        enforce_eager=True,
    ) as vllm_model:
        # Every vLLM request in this test goes through the vision LoRA adapter.
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                lora_request=lora_request,
            )
            for prompts, images, audios in inputs
        ]

    # This error occurs inside `get_peft_model`
    # FIXME: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/75
    # The skip below is unconditional, so the HF comparison that follows is
    # currently never reached; only the vLLM run above is exercised.
    pytest.skip("HF impl is not compatible with current transformers")

    hf_model_kwargs = {"_attn_implementation": "sdpa"}
    with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
        hf_processor = hf_model.processor
        eos_token_id = hf_processor.tokenizer.eos_token_id

        # Adapter around the HF processor: accepts `audio`/`sampling_rate`
        # keywords and forwards them as a single `audios=[(audio, rate)]` list.
        def patch_hf_processor(
            *args, text="", images=None, audio=None, sampling_rate=None, **kwargs
        ):
            audios = None
            if audio is not None and sampling_rate is not None:
                audios = [(audio, sampling_rate)]
            return hf_processor(
                *args, text=text, images=images, audios=audios, **kwargs
            )

        hf_model.processor = patch_hf_processor

        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                eos_token_id=eos_token_id,
                num_logits_to_keep=0,
            )
            for prompts, images, audios in inputs
        ]

    # Compare the two runners case by case.
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Single-image cases: one prompt per asset, one image per size factor."""
    pil_images = [asset.pil_image for asset in image_assets]

    # One (prompts, images, audios) case per asset; no audio in this test.
    cases = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        case_prompts = [prompt] * len(size_factors)
        case_images = [rescale_image_size(pil_image, scale) for scale in size_factors]
        cases.append((case_prompts, case_images, None))

    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        # [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Multi-image case: every prompt receives all assets at one size factor."""
    pil_images = [asset.pil_image for asset in image_assets]

    # A single case: one multi-image prompt per size factor, each paired with
    # the full list of assets rescaled by that factor. No audio.
    case_prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    case_images = []
    for scale in size_factors:
        case_images.append(
            [rescale_image_size(pil_image, scale) for pil_image in pil_images]
        )
    cases = [(case_prompts, case_images, None)]

    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(
    hf_runner,
    vllm_runner,
    model,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Combined image + audio case using the bundled example speech question."""
    # use the example speech question so that the model outputs are reasonable
    speech = librosa.load(speech_question, sr=None)
    blossom = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")

    prompt = "<|user|><|image_1|><|audio_1|><|end|><|assistant|>"
    cases = [([prompt], [blossom], [speech])]

    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
|
||||
211
tests/models/multimodal/generation/test_pixtral.py
Normal file
211
tests/models/multimodal/generation/test_pixtral.py
Normal file
@@ -0,0 +1,211 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
from dataclasses import asdict
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import pytest
|
||||
from mistral_common.multimodal import download_image
|
||||
from mistral_common.protocol.instruct.chunk import ImageURLChunk
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import SamplingParams, TextPrompt, TokensPrompt
|
||||
from vllm.logprobs import Logprob, SampleLogprobs
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import VLLM_PATH, large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import StrPath
|
||||
|
||||
# Hugging Face model ids exercised by `test_chat`.
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]

# Image file names; `test_chat` turns each into a URL through
# `local_asset_server.url_for`. The trailing comment on each entry records the
# remote URL the file presumably mirrors (verify against the dataset).
IMG_URLS = [
    "237-400x300.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
    "231-200x300.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
    "27-500x500.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
    "17-150x600.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
]
# Text part of every chat message built by the `_create_msg_format*` helpers.
PROMPT = "Describe each image in one short sentence."
|
||||
|
||||
|
||||
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    """Build a one-message chat: the text PROMPT followed by one
    ``image_url`` chunk per entry of ``urls``.
    """
    chunks: list[dict[str, Any]] = [{"type": "text", "text": PROMPT}]
    for link in urls:
        chunks.append({"type": "image_url", "image_url": {"url": link}})

    user_message = {"role": "user", "content": chunks}
    return [user_message]
|
||||
|
||||
|
||||
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    """Build a one-message chat for the HF processor: the text PROMPT
    followed by one ``image`` chunk per URL, each holding the result of
    ``download_image(url)``.
    """
    # Note: the text chunk stores the prompt under the "content" key here.
    chunks: list[dict[str, Any]] = [{"type": "text", "content": PROMPT}]
    for link in urls:
        chunks.append({"type": "image", "image": download_image(link)})

    user_message = {"role": "user", "content": chunks}
    return [user_message]
|
||||
|
||||
|
||||
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    """Tokenize the chat for ``urls`` with the Mistral "pixtral" tokenizer and
    return a `TokensPrompt` carrying the token ids plus the images taken from
    the request's `ImageURLChunk`s.
    """
    chat_request = ChatCompletionRequest(  # type: ignore[type-var]
        messages=_create_msg_format(urls)
    )

    mistral_tokenizer = MistralTokenizer.from_model("pixtral")
    encoded = mistral_tokenizer.encode_chat_completion(chat_request)

    # Only image chunks of the first (and only) message contribute images.
    decoded_images = [
        image_from_chunk(part)
        for part in chat_request.messages[0].content
        if isinstance(part, ImageURLChunk)
    ]

    return TokensPrompt(
        prompt_token_ids=encoded.tokens,
        multi_modal_data=MultiModalDataBuiltins(image=decoded_images),
    )
|
||||
|
||||
|
||||
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    """Render the chat for ``urls`` with the HF pixtral processor's chat
    template and return a `TextPrompt` with the prompt and its images.
    """
    messages = _create_msg_format_hf(urls)

    hf_processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
    rendered_prompt = hf_processor.apply_chat_template(messages)

    # Collect the images stored in the "image" chunks of the single message.
    downloaded_images = [
        part["image"] for part in messages[0]["content"] if part["type"] == "image"
    ]

    return TextPrompt(
        prompt=rendered_prompt,
        multi_modal_data=MultiModalDataBuiltins(image=downloaded_images),
    )
|
||||
|
||||
|
||||
# Sampling settings used by `test_chat`: temperature 0, at most 512 new
# tokens, and 5 logprobs requested per generated token.
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
# Matches the number of entries in IMG_URLS.
LIMIT_MM_PER_PROMPT = dict(image=4)

# Context lengths `test_chat` is parametrized over.
MAX_MODEL_LEN = [8192, 65536]

# Directory holding the golden outputs; fail at import time if it is missing.
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()

# Golden chat outputs (tokens, text, logprobs) per model id.
FIXTURE_LOGPROBS_CHAT = {
    PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
    MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}

# One (token ids, generated text, optional per-token logprobs) tuple per output.
OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]]
|
||||
|
||||
|
||||
# For the test author to store golden output in JSON
|
||||
def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
) -> None:
    """Write ``outputs`` to ``filename`` as JSON.

    Each `Logprob` is converted to a plain dict with `dataclasses.asdict`;
    a missing (``None``) logprobs entry is stored as an empty list.
    """
    serializable = []
    for token_ids, generated_text, sample_logprobs in outputs:
        per_position = [
            {token_id: asdict(entry) for token_id, entry in position.items()}
            for position in (sample_logprobs or [])
        ]
        serializable.append((token_ids, generated_text, per_position))

    with open(filename, "w") as fp:
        json.dump(serializable, fp)
|
||||
|
||||
|
||||
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    """Read outputs stored as JSON in ``filename``.

    JSON object keys are strings, so each token-id key is converted back to
    ``int`` and each value is rebuilt as a `Logprob`.
    """
    with open(filename, "rb") as fp:
        raw_entries = json.load(fp)

    restored = []
    for token_ids, generated_text, stored_logprobs in raw_entries:
        per_position = [
            {int(token_id): Logprob(**fields) for token_id, fields in position.items()}
            for position in stored_logprobs
        ]
        restored.append((token_ids, generated_text, per_position))
    return restored
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
    vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
) -> None:
    """Chat with 1, 2 and all test images and compare the resulting logprobs
    with the golden fixture stored for ``model``.
    """
    is_large_context_24b = model == MISTRAL_SMALL_3_1_ID and max_model_len == 65536
    if is_large_context_24b and current_platform.is_rocm():
        pytest.skip(
            "OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
        )

    expected_logprobs = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])

    with vllm_runner(
        model,
        dtype=dtype,
        tokenizer_mode="mistral",
        load_format="mistral",
        config_format="mistral",
        max_model_len=max_model_len,
        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        served_urls = [local_asset_server.url_for(name) for name in IMG_URLS]

        # Conversations with the first image, the first two, and all images.
        conversations = [
            _create_msg_format(served_urls[:1]),
            _create_msg_format(served_urls[:2]),
            _create_msg_format(served_urls),
        ]

        chat_outputs = []
        for conversation in conversations:
            chat_outputs.extend(
                vllm_model.llm.chat(conversation, sampling_params=SAMPLING_PARAMS)
            )

    raw_logprobs = vllm_runner._final_steps_generate_w_logprobs(chat_outputs)

    # Remove last `None` prompt_logprobs to compare with fixture
    actual_logprobs = []
    for entry in raw_logprobs:
        assert entry[-1] is None
        actual_logprobs.append(entry[:-1])

    check_logprobs_close(
        outputs_0_lst=expected_logprobs,
        outputs_1_lst=actual_logprobs,
        name_0="h100_ref",
        name_1="output",
    )
|
||||
148
tests/models/multimodal/generation/test_qwen2_5_vl.py
Normal file
148
tests/models/multimodal/generation/test_qwen2_5_vl.py
Normal file
@@ -0,0 +1,148 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
|
||||
from ....conftest import VIDEO_ASSETS
|
||||
|
||||
# Model ids and dtype the tests in this module are parametrized over.
models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
target_dtype = "bfloat16"

# Token sequence placed in the prompt where a video is attached.
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
|
||||
|
||||
def qwen2_5_vl_chat_template(*query):
    """Wrap the concatenated ``query`` pieces as the user turn of a chat
    prompt with a fixed system message and an open assistant turn.
    """
    user_turn = "".join(query)
    system_part = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user_part = f"<|im_start|>user\n{user_turn}<|im_end|>"
    return system_part + user_part + "<|im_start|>assistant\n"
|
||||
|
||||
|
||||
# Prompt for each video asset, keyed by asset name. The tests index the
# result positionally (`VIDEO_PROMPTS[0]`).
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
    {
        "baby_reading": qwen2_5_vl_chat_template(
            VIDEO_PLACEHOLDER,
            "Describe this video with a short sentence ",
            "(no more than 20 words)",
        ),
    }
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_functionality(
    vllm_runner,
    video_assets,
    model,
    video_pruning_rate: float,
    num_frames: int,
    dtype: str,
    max_tokens: int,
    use_bytecode_hook: bool,
    monkeypatch,
) -> None:
    """Smoke-test EVS (Efficient Video Sampling) with a single video.

    Generates greedily for one prompt/video pair under each pruning rate and
    checks only that a non-empty string response comes back.
    """
    # Set the environment variable for this test
    hook_flag = "1" if use_bytecode_hook else "0"
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", hook_flag)

    # Sample frames from every video asset; only the first one is used below.
    frame_samples = []
    for asset in video_assets:
        frame_samples.append(sample_frames_from_video(asset.np_ndarrays, num_frames))

    single_prompt = [VIDEO_PROMPTS[0]]
    single_video = [frame_samples[0]]

    # Initialize model with EVS configuration
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        dtype=dtype,
        limit_mm_per_prompt={"video": 1},
        video_pruning_rate=video_pruning_rate,
    ) as vllm_model:
        # Generate output - this should not crash
        results = vllm_model.generate_greedy(
            single_prompt, max_tokens, videos=single_video
        )

        # Exactly one response is expected for the one prompt.
        assert len(results) == 1
        token_ids, text = results[0]

        # The response must contain tokens and text ...
        assert len(token_ids) > 0
        assert len(text) > 0

        # ... and the text must be a string.
        assert isinstance(text, str)
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_batched_videos(
    vllm_runner,
    video_assets,
    model,
    video_pruning_rate: float,
    num_frames: int,
    dtype: str,
    max_tokens: int,
    use_bytecode_hook: bool,
    monkeypatch,
) -> None:
    """Smoke-test EVS with a batch of two prompt/video pairs.

    Checks that:
    1. batched video inputs are accepted with EVS configured,
    2. both pruning rates work with more than one video,
    3. every request in the batch yields a non-empty string response.
    """
    # Set the environment variable for this test
    hook_flag = "1" if use_bytecode_hook else "0"
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", hook_flag)

    # Sample frames from every video asset; only the first one is used below.
    frame_samples = []
    for asset in video_assets:
        frame_samples.append(sample_frames_from_video(asset.np_ndarrays, num_frames))

    # Two identical requests: the same prompt and the same video twice.
    batch_size = 2
    batch_prompts = [VIDEO_PROMPTS[0]] * batch_size
    batch_videos = [frame_samples[0]] * batch_size

    # Initialize model with EVS configuration
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"video": 2},
        tensor_parallel_size=1,
        video_pruning_rate=video_pruning_rate,
    ) as vllm_model:
        # Generate output - this should not crash
        results = vllm_model.generate_greedy(
            batch_prompts, max_tokens, videos=batch_videos
        )

        # One response per request.
        assert len(results) == 2

        for token_ids, text in results:
            # Each response must contain tokens and text ...
            assert len(token_ids) > 0
            assert len(text) > 0

            # ... and the text must be a string.
            assert isinstance(text, str)
|
||||
473
tests/models/multimodal/generation/test_qwen2_vl.py
Normal file
473
tests/models/multimodal/generation/test_qwen2_vl.py
Normal file
@@ -0,0 +1,473 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any, TypedDict
|
||||
|
||||
import numpy.typing as npt
|
||||
import pytest
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
|
||||
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
VIDEO_ASSETS,
|
||||
PromptImageInput,
|
||||
PromptVideoInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """Allow insecure serialization for every test in this module.

    `LLM.apply_model` requires pickling a function.
    """
    env_name, env_value = "VLLM_ALLOW_INSECURE_SERIALIZATION", "1"
    monkeypatch.setenv(env_name, env_value)
|
||||
|
||||
|
||||
# Model ids and dtype the tests in this module are parametrized over.
models = ["Qwen/Qwen2-VL-2B-Instruct"]
target_dtype = "half"

# Token sequences placed in the prompt where an image / a video is attached.
IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
# NOTE(review): presumably the hidden size of the model listed above; it is
# not referenced in the part of this module visible here — confirm usage.
MODEL_HIDDEN_SIZE = 1536
|
||||
|
||||
|
||||
def qwen2_vl_chat_template(*query):
    """Wrap the concatenated ``query`` pieces as the user turn of a chat
    prompt with a fixed system message and an open assistant turn.
    """
    user_turn = "".join(query)
    system_part = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user_part = f"<|im_start|>user\n{user_turn}<|im_end|>"
    return system_part + user_part + "<|im_start|>assistant\n"
|
||||
|
||||
|
||||
# Prompt for each image asset, keyed by asset name. The tests zip the result
# with the image assets, so the order is expected to line up with them.
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": qwen2_vl_chat_template(
            IMAGE_PLACEHOLDER,
            "What is the biggest text's content in this image?",
        ),
        "cherry_blossom": qwen2_vl_chat_template(
            IMAGE_PLACEHOLDER,
            "What is the season shown in this image? ",
            "Reply with a short sentence (no more than 20 words)",
        ),
    }
)

# Prompt for each video asset, keyed by asset name.
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
    {
        "baby_reading": qwen2_vl_chat_template(
            VIDEO_PLACEHOLDER,
            "Describe this video with a short sentence ",
            "(no more than 20 words)",
        ),
    }
)

# Single prompt with two image placeholders, used by the multi-image test.
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
    IMAGE_PLACEHOLDER,
    IMAGE_PLACEHOLDER,
    "Describe these two images separately. ",
    "For each image, reply with a short sentence ",
    "(no more than 10 words).",
)
|
||||
|
||||
|
||||
class Qwen2VLPromptImageEmbeddingInput(TypedDict):
    """Precomputed image embeddings for one prompt, as produced by
    `batch_make_image_embeddings`.
    """

    # Slice of the concatenated `model.visual` output belonging to this prompt.
    image_embeds: torch.Tensor
    # Matching rows of the image processor's "image_grid_thw" output
    # (presumably one (t, h, w) row per image — TODO confirm).
    image_grid_thw: torch.Tensor
|
||||
|
||||
|
||||
class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
    """Precomputed video embeddings for one prompt, as produced by
    `batch_make_video_embeddings`.
    """

    # Slice of the concatenated `model.visual` output belonging to this prompt.
    video_embeds: torch.Tensor
    # Matching rows of the image processor's "video_grid_thw" output
    # (presumably one (t, h, w) row per video — TODO confirm).
    video_grid_thw: torch.Tensor
|
||||
|
||||
|
||||
def batch_make_image_embeddings(
    image_batches: list[Image.Image | list[Image.Image]],
    processor,
    llm: VllmRunner,
) -> list[Qwen2VLPromptImageEmbeddingInput]:
    """Compute Qwen2-VL image embeddings for several batches in one pass.

    All images are preprocessed and embedded together, then the result is
    split back so that entry ``i`` of the return value corresponds to
    ``image_batches[i]``.

    image_batches:
      - Single-image batches: `list[Image.Image]`
      - Multiple-image batches: `list[list[Image.Image]]]`

    returns: `list[Qwen2VLPromptImageEmbeddingInput]`
    """
    # Treat a bare image as a one-image batch so every entry is a list.
    normalized_batches: list[Any] = [
        batch if isinstance(batch, list) else [batch] for batch in image_batches
    ]

    # Flatten every batch into one list so a single preprocess call covers it.
    all_images: list[Image.Image] = [
        img for batch in normalized_batches for img in batch
    ]

    # image to pixel values
    image_processor = processor.image_processor
    preprocessed = image_processor.preprocess(
        images=all_images, return_tensors="pt"
    ).data
    pixel_values = preprocessed["pixel_values"]
    image_grid_thw = preprocessed["image_grid_thw"]

    # pixel values to embeddings & grid_thws
    def _encode_images(model):
        vision_tower = model.visual
        with torch.no_grad():
            device_pixels = pixel_values.to(
                vision_tower.device, dtype=vision_tower.dtype
            )
            return vision_tower(device_pixels, grid_thw=image_grid_thw).cpu()

    image_embeds = torch.concat(llm.apply_model(_encode_images))

    # split into original batches
    merge_size = image_processor.merge_size
    result: list[Qwen2VLPromptImageEmbeddingInput] = []
    image_cursor = 0
    embed_cursor = 0
    for batch in normalized_batches:
        image_stop = image_cursor + len(batch)
        batch_grid_thw = image_grid_thw[image_cursor:image_stop]

        # Each image yields prod(t, h, w) / merge_size**2 embedding rows.
        batch_embed_len = sum(
            thw.prod(-1) // merge_size // merge_size for thw in batch_grid_thw
        )
        embed_stop = embed_cursor + batch_embed_len

        result.append(
            {
                "image_embeds": image_embeds[embed_cursor:embed_stop],
                "image_grid_thw": batch_grid_thw,
            }
        )

        embed_cursor = embed_stop
        image_cursor = image_stop

    # ensure we don't lose any images or embeddings
    assert embed_cursor == image_embeds.size(0)
    assert image_cursor == image_grid_thw.size(0)
    assert len(image_batches) == len(result)

    return result
|
||||
|
||||
|
||||
def batch_make_video_embeddings(
    video_batches: PromptVideoInput, processor, llm: VllmRunner
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
    """Compute Qwen2-VL video embeddings for several batches in one pass.

    A NDArray represents a single video's all frames.

    All videos are preprocessed and embedded together, then the result is
    split back so that entry ``i`` of the return value corresponds to
    ``video_batches[i]``.

    video_batches:
      - Single-video batches: `list[NDArray]`
      - Multiple-video batches: `list[list[NDArray]]`
    """
    # Treat a bare video as a one-video batch so every entry is a list.
    normalized_batches: list[Any] = [
        batch if isinstance(batch, list) else [batch] for batch in video_batches
    ]

    # Flatten every batch into one list so a single preprocess call covers it.
    all_videos: list[npt.NDArray] = [
        clip for batch in normalized_batches for clip in batch
    ]

    # video to pixel values
    image_processor = processor.image_processor
    preprocessed = image_processor.preprocess(
        images=None, videos=all_videos, return_tensors="pt"
    ).data
    pixel_values = preprocessed["pixel_values_videos"]
    video_grid_thw = preprocessed["video_grid_thw"]

    # pixel values to embeddings & grid_thws
    def _encode_videos(model):
        vision_tower = model.visual
        with torch.no_grad():
            device_pixels = pixel_values.to(
                vision_tower.device, dtype=vision_tower.dtype
            )
            return vision_tower(device_pixels, grid_thw=video_grid_thw).cpu()

    video_embeds = torch.concat(llm.apply_model(_encode_videos))

    # split into original batches
    merge_size = image_processor.merge_size
    result: list[Qwen2VLPromptVideoEmbeddingInput] = []
    video_cursor = 0
    embed_cursor = 0
    for batch in normalized_batches:
        video_stop = video_cursor + len(batch)
        batch_grid_thw = video_grid_thw[video_cursor:video_stop]

        # Each video yields prod(t, h, w) / merge_size**2 embedding rows.
        batch_embed_len = sum(
            thw.prod(-1) // merge_size // merge_size for thw in batch_grid_thw
        )
        embed_stop = embed_cursor + batch_embed_len

        result.append(
            {
                "video_embeds": video_embeds[embed_cursor:embed_stop],
                "video_grid_thw": batch_grid_thw,
            }
        )

        embed_cursor = embed_stop
        video_cursor = video_stop

    # ensure we don't lose any videos or embeddings
    assert embed_cursor == video_embeds.size(0)
    assert video_cursor == video_grid_thw.size(0)
    assert len(video_batches) == len(result)

    return result
|
||||
|
||||
|
||||
def run_embedding_input_test(
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Check that raw image/video input and precomputed image/video
    embeddings input give close greedy logprobs.

    Every case in ``inputs`` is generated twice with the same vLLM model:
    first with the raw media, then with embeddings built by
    `batch_make_image_embeddings` / `batch_make_video_embeddings`.
    """
    from transformers import AutoProcessor  # noqa: F401

    processor = AutoProcessor.from_pretrained(model)

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        max_num_seqs=3,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        default_torch_num_threads=1,
        enable_mm_embeds=True,
    ) as vllm_model:
        # First pass: all cases with the raw media (empty lists become None).
        raw_media_outputs = []
        for prompts, images, videos in inputs:
            raw_media_outputs.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images or None,
                    videos=videos or None,
                )
            )

        # Second pass: all cases again, with media replaced by embeddings.
        embedding_outputs = []
        for prompts, images, videos in inputs:
            image_embeddings = (
                batch_make_image_embeddings(images, processor, vllm_model)
                if images
                else None
            )
            video_embeddings = (
                batch_make_video_embeddings(videos, processor, vllm_model)
                if videos
                else None
            )
            embedding_outputs.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=image_embeddings,
                    videos=video_embeddings,
                )
            )

    for from_raw_media, from_embeddings in zip(raw_media_outputs, embedding_outputs):
        check_logprobs_close(
            outputs_0_lst=from_raw_media,
            outputs_1_lst=from_embeddings,
            name_0="original_input",
            name_1="embeddings_input",
        )
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [0.5],
        # Single-scale, batched
        [0.5, 0.5],
        # Multi-scale
        [0.25, 0.5, 0.5],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_image_embeddings_input(
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype,
    max_tokens,
    num_logprobs,
    monkeypatch,
) -> None:
    """Single-image prompts: one case per image asset, with the image
    rescaled by every size factor and no videos.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    cases: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = []
    for pil_image, prompt in zip(pil_images, IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        rescaled = [rescale_image_size(pil_image, scale) for scale in size_factors]
        cases.append((repeated_prompts, rescaled, []))

    run_embedding_input_test(
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        [],
        # Single-scale
        [0.5],
        # Single-scale, batched
        [0.5, 0.5],
        # Multi-scale
        [0.25, 0.5, 0.5],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_multiple_image_embeddings_input(
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Multi-image prompts: a single case in which every prompt carries all
    image assets, rescaled by one size factor each, and no videos.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    repeated_prompts = [MULTIIMAGE_PROMPT] * len(size_factors)
    rescaled_image_sets = []
    for scale in size_factors:
        rescaled_image_sets.append(
            [rescale_image_size(img, scale) for img in pil_images]
        )

    cases: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
        (repeated_prompts, rescaled_image_sets, [])
    ]

    run_embedding_input_test(
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [0.5],
        # Single-scale, batched
        [0.5, 0.5],
        # Multi-scale
        [0.25, 0.25, 0.5],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_video_embeddings_input(
    vllm_runner,
    video_assets,
    model,
    size_factors,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Video prompts: one case per video asset, with 4 sampled frames
    rescaled by every size factor and no images.
    """
    num_frames = 4
    frame_samples = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    cases: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = []
    for clip, prompt in zip(frame_samples, VIDEO_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        rescaled = [rescale_video_size(clip, scale) for scale in size_factors]
        cases.append((repeated_prompts, [], rescaled))

    run_embedding_input_test(
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
|
||||
185
tests/models/multimodal/generation/test_ultravox.py
Normal file
185
tests/models/multimodal/generation/test_ultravox.py
Normal file
@@ -0,0 +1,185 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from ....conftest import AUDIO_ASSETS, AudioTestAssets, VllmRunner
|
||||
from ....utils import RemoteOpenAIServer
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
|
||||
# Model id used by both the offline test and the online-serving fixture.
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

# Question for each audio asset, keyed by asset name.
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
    {
        "mary_had_lamb": "Transcribe this into English.",
        "winning_call": "What is happening in this audio clip?",
    }
)

# Question appended after the audio placeholders in the multi-audio test.
MULTI_AUDIO_PROMPT = "Describe each of the audios above."

# presumably (waveform, sample rate), matching `audio_and_sample_rate` —
# TODO confirm
AudioTuple = tuple[np.ndarray, int]

# Placeholder token marking an audio slot in the prompt text.
VLLM_PLACEHOLDER = "<|audio|>"
HF_PLACEHOLDER = "<|audio|>"

# Engine kwargs for the chunked-prefill variant of the tests.
CHUNKED_PREFILL_KWARGS = {
    "enable_chunked_prefill": True,
    "max_num_seqs": 2,
    # Use a very small limit to exercise chunked prefill.
    "max_num_batched_tokens": 16,
}
|
||||
|
||||
|
||||
def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
    """Turn keyword arguments into ``--flag`` style CLI arguments.

    Underscores in keys become dashes. A ``True`` value yields a bare flag,
    a ``False`` value yields nothing, and any other value yields
    ``--key=value``.
    """
    cli_args = []
    for name, setting in params_kwargs.items():
        flag = f"--{name.replace('_', '-')}"
        if not isinstance(setting, bool):
            cli_args.append(f"{flag}={setting}")
        elif setting:
            cli_args.append(flag)
    return cli_args
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        pytest.param({}, marks=pytest.mark.cpu_model),
        pytest.param(CHUNKED_PREFILL_KWARGS),
    ]
)
def server(request, audio_assets: AudioTestAssets):
    """Yield a `RemoteOpenAIServer` for MODEL_NAME, once with default
    settings and once with the chunked-prefill settings.
    """
    # Allow as many audios per prompt as there are audio assets.
    audio_limit = json.dumps({"audio": len(audio_assets)})

    cli_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "4096",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        audio_limit,
        "--trust-remote-code",
    ]
    cli_args = cli_args + params_kwargs_to_cli_args(request.param)

    server_env = {"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
    with RemoteOpenAIServer(MODEL_NAME, cli_args, env_dict=server_env) as running:
        yield running
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the `server` fixture."""
    async with server.get_async_client() as openai_client:
        yield openai_client
|
||||
|
||||
|
||||
def _get_prompt(audio_count, question, placeholder):
    """Render a single-turn chat prompt (as text, with the generation prompt
    appended) whose user message is ``audio_count`` placeholder lines
    followed by ``question``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # One "<placeholder>\n" line per audio, then the question itself.
    placeholder_lines = f"{placeholder}\n" * audio_count
    user_message = {"role": "user", "content": f"{placeholder_lines}{question}"}

    return tokenizer.apply_chat_template(
        [user_message],
        tokenize=False,
        add_generation_prompt=True,
    )
|
||||
|
||||
|
||||
def run_multi_audio_test(
    vllm_runner: type[VllmRunner],
    prompts_and_audios: list[tuple[str, list[AudioTuple]]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    **kwargs,
):
    """Generate with vLLM for prompts that each carry several audios and
    check that every prompt produced some tokens.

    The test is skipped when the model's registry entry reports it as
    unavailable online or incompatible with the installed transformers.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    prompts = [prompt for prompt, _ in prompts_and_audios]
    audio_lists = [audios for _, audios in prompts_and_audios]

    # The per-prompt audio limit must cover the largest audio list.
    largest_audio_count = max(len(audios) for audios in audio_lists)

    with vllm_runner(
        model,
        dtype=dtype,
        enforce_eager=True,
        limit_mm_per_prompt={"audio": largest_audio_count},
        **kwargs,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            prompts,
            max_tokens,
            num_logprobs=num_logprobs,
            audios=audio_lists,
        )

    # The HuggingFace model doesn't support multiple audios yet, so
    # just assert that some tokens were generated.
    assert all(tokens for tokens, *_ in vllm_outputs)
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
    "vllm_kwargs",
    [
        pytest.param({}, marks=pytest.mark.cpu_model),
        pytest.param(CHUNKED_PREFILL_KWARGS),
    ],
)
def test_models_with_multiple_audios(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    vllm_kwargs: dict,
) -> None:
    """Generate from one prompt that references every audio asset at once.

    Runs once with default engine kwargs and once with
    ``CHUNKED_PREFILL_KWARGS``.
    """
    prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
    audios = [asset.audio_and_sample_rate for asset in audio_assets]

    run_multi_audio_test(
        vllm_runner,
        [(prompt, audios)],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **vllm_kwargs,
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Send all audio assets by URL in one chat request to the served model.

    The request is capped at 10 new tokens, so exactly one choice that
    stopped because of the length limit is expected.
    """
    audio_parts = [
        {"type": "audio_url", "audio_url": {"url": audio.url}}
        for audio in audio_assets
    ]
    question = f"What's happening in these {len(audio_assets)} audio clips?"
    user_message = {
        "role": "user",
        "content": [*audio_parts, {"type": "text", "text": question}],
    }

    response = await client.chat.completions.create(
        model=MODEL_NAME, messages=[user_message], max_tokens=10
    )

    assert len(response.choices) == 1
    only_choice = response.choices[0]
    assert only_choice.finish_reason == "length"
|
||||
@@ -0,0 +1,435 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Consolidated test for ViT attention backend functionality across multiple models.
|
||||
|
||||
This test validates that each multimodal model can successfully generate outputs
|
||||
using different ViT attention backends. Tests are parametrized by model and backend.
|
||||
"""
|
||||
|
||||
from dataclasses import asdict
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import create_new_process_for_each_test
|
||||
from ...utils import dummy_hf_overrides
|
||||
|
||||
# Dots.OCR prompt from official repository
|
||||
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
|
||||
# ruff: noqa: E501
|
||||
DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
|
||||
|
||||
1. Bbox format: [x1, y1, x2, y2]
|
||||
|
||||
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
|
||||
|
||||
3. Text Extraction & Formatting Rules:
|
||||
- Picture: For the 'Picture' category, the text field should be omitted.
|
||||
- Formula: Format its text as LaTeX.
|
||||
- Table: Format its text as HTML.
|
||||
- All Others (Text, Title, etc.): Format their text as Markdown.
|
||||
|
||||
4. Constraints:
|
||||
- The output text must be the original text from the image, with no translation.
|
||||
- All layout elements must be sorted according to human reading order.
|
||||
|
||||
5. Final Output: The entire output must be a single JSON object.
|
||||
"""
|
||||
|
||||
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
|
||||
|
||||
# Model configurations
|
||||
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
|
||||
"dots_ocr": {
|
||||
"model_name": "rednote-hilab/dots.ocr",
|
||||
"interface": "llm_chat",
|
||||
"max_model_len": 32768,
|
||||
"max_num_seqs": 1,
|
||||
"limit_mm_per_prompt": {"image": 1},
|
||||
"sampling_params": {
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 16384,
|
||||
"top_p": 0.9,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"use_specific_image": "stop_sign",
|
||||
"prompt_builder": "build_dots_ocr_prompt",
|
||||
"output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
|
||||
},
|
||||
"ernie45_vl": {
|
||||
"model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 16384,
|
||||
"max_num_seqs": 2,
|
||||
"sampling_params": {
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 256,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"glm4_1v": {
|
||||
"model_name": "zai-org/GLM-4.1V-9B-Thinking",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 32768,
|
||||
"max_num_seqs": 2,
|
||||
"sampling_params": {
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 256,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"keye_vl": {
|
||||
"model_name": "Kwai-Keye/Keye-VL-8B-Preview",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 8192,
|
||||
"max_num_seqs": 5,
|
||||
"sampling_params": {
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 256,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"supported_backends": {
|
||||
AttentionBackendEnum.FLASH_ATTN,
|
||||
AttentionBackendEnum.ROCM_AITER_FA,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"ovis2_5": {
|
||||
"model_name": "AIDC-AI/Ovis2.5-2B",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 8192,
|
||||
"max_num_seqs": 2,
|
||||
"sampling_params": {
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 256,
|
||||
"stop_token_ids": None,
|
||||
},
|
||||
"prompt_builder": "build_ovis_prompt",
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"qwen2_5_vl": {
|
||||
"model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
"interface": "vllm_runner",
|
||||
"media_type": "video",
|
||||
"max_model_len": 4000,
|
||||
"max_num_seqs": 1,
|
||||
"limit_mm_per_prompt": {"video": 1},
|
||||
"sampling_params": {
|
||||
"max_tokens": 128,
|
||||
},
|
||||
"runner_kwargs": {
|
||||
"runner": "generate",
|
||||
"dtype": "bfloat16",
|
||||
},
|
||||
"video_params": {
|
||||
"num_frames": 16,
|
||||
"pruning_rates": [0.0, 0.75],
|
||||
},
|
||||
},
|
||||
"qwen2_5_omni": {
|
||||
"model_name": "Qwen/Qwen2.5-Omni-3B",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 32768,
|
||||
"max_num_seqs": 2,
|
||||
"limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
|
||||
"sampling_params": {
|
||||
"temperature": 0.6,
|
||||
"top_p": 0.95,
|
||||
"top_k": 20,
|
||||
"max_tokens": 16384,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
"qwen3_omni": {
|
||||
"model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
|
||||
"interface": "llm_generate",
|
||||
"max_model_len": 32768,
|
||||
"max_num_seqs": 2,
|
||||
"limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
|
||||
"sampling_params": {
|
||||
"temperature": 0.6,
|
||||
"top_p": 0.95,
|
||||
"top_k": 20,
|
||||
"max_tokens": 16384,
|
||||
},
|
||||
"use_processor": True,
|
||||
"question": "What is the content of each image?",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# Prompt builder functions
|
||||
def build_dots_ocr_prompt(images, config):
    """Build the Dots.OCR chat messages for the first image in ``images``.

    The image is embedded as a base64 JPEG data URL, followed by a text part
    made of the Dots.OCR image tokens and ``DOTS_OCR_PROMPT``. ``config`` is
    not read here.
    """
    # The caller is expected to have filtered ``images`` down to the
    # stop_sign image already; only the first entry is used.
    encoded = encode_image_base64(images[0])
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
    }
    text_part = {
        "type": "text",
        "text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
    }

    return [{"role": "user", "content": [image_part, text_part]}]
|
||||
|
||||
|
||||
def build_processor_prompt(images, config):
    """Render a prompt through the model's ``AutoProcessor`` chat template.

    Each image becomes an ``image`` content part holding a base64 JPEG data
    URL; ``config["question"]`` is appended as the final text part. The
    processor is loaded from ``config["model_name"]`` with remote code
    trusted, and the template is rendered to text with the generation prompt
    appended.
    """
    hf_processor = AutoProcessor.from_pretrained(
        config["model_name"], trust_remote_code=True
    )

    content = []
    for img in images:
        data_url = f"data:image/jpeg;base64,{encode_image_base64(img)}"
        content.append({"type": "image", "image": data_url})
    content.append({"type": "text", "text": config["question"]})

    return hf_processor.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=False,
        add_generation_prompt=True,
    )
|
||||
|
||||
|
||||
def build_ovis_prompt(images, config):
    """Build the Ovis2.5 chat prompt with one numbered tag per image.

    Args:
        images: Images that accompany the prompt. Only their count is used,
            to emit the ``Image-1``, ``Image-2``, ... placeholder lines.
        config: Model configuration; ``config["question"]`` is placed after
            the image placeholders.

    Returns:
        The prompt string: one user turn followed by an opened assistant
        turn.
    """
    # The placeholders depend only on how many images there are, so the
    # images are enumerated directly instead of being base64-encoded first.
    placeholders = "\n".join(
        f"Image-{idx}: <image>\n" for idx, _ in enumerate(images, start=1)
    )

    return (
        f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
|
||||
|
||||
|
||||
def build_qwen2_5_video_prompt():
    """Return the fixed Qwen2.5-VL video prompt used by the video test.

    The prompt is a system turn, a user turn that starts with
    ``VIDEO_PLACEHOLDER`` and asks for a short description, and an opened
    assistant turn.
    """
    system_turn = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user_turn = (
        f"<|im_start|>user\n{VIDEO_PLACEHOLDER}"
        "Describe this video with a short sentence (no more than 20 words)"
        "<|im_end|>"
    )
    return f"{system_turn}{user_turn}<|im_start|>assistant\n"
|
||||
|
||||
|
||||
# Handler functions
|
||||
def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
    """Run an image test through ``LLM.generate()`` and validate the text.

    The prompt comes from ``build_processor_prompt`` when
    ``config["use_processor"]`` is truthy; otherwise from the module-level
    builder named by ``config["prompt_builder"]`` (``build_ovis_prompt`` when
    unset). The engine is created with dummy weights, the module's HF
    overrides and seed 42, using ``mm_encoder_attn_backend`` for the
    multimodal encoder.
    """
    images = [asset.pil_image for asset in image_assets]

    # Choose the prompt builder, then build the prompt.
    if config.get("use_processor"):
        build_prompt = build_processor_prompt
    else:
        build_prompt = globals()[config.get("prompt_builder", "build_ovis_prompt")]
    prompt = build_prompt(images, config)

    # Without an explicit limit, allow as many images as are supplied.
    mm_limits = config.get("limit_mm_per_prompt", {"image": len(images)})

    engine_args = EngineArgs(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=mm_limits,
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
    )
    llm = LLM(**{**asdict(engine_args), "seed": 42})

    sampling_params = SamplingParams(**config["sampling_params"])
    request = {"prompt": prompt, "multi_modal_data": {"image": images}}
    outputs = llm.generate(request, sampling_params=sampling_params)

    # Default validator: more than 10 characters of generated text.
    for output in outputs:
        text = output.outputs[0].text
        is_valid = config.get("output_validator", lambda x: len(x) > 10)
        assert is_valid(text), (
            f"Validation failed for {config['model_name']}: {text}"
        )
|
||||
|
||||
|
||||
def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
    """Run the Dots.OCR test through ``LLM.chat()`` and validate the text.

    Only the asset named ``stop_sign`` is used. The engine is created with
    dummy weights, the module's HF overrides and seed 42, using
    ``mm_encoder_attn_backend`` for the multimodal encoder.
    """
    # Keep just the stop_sign image; an IndexError here means it is missing.
    stop_sign_matches = [
        asset.pil_image for asset in image_assets if asset.name == "stop_sign"
    ]
    stop_sign_image = stop_sign_matches[0]

    messages = build_dots_ocr_prompt([stop_sign_image], config)

    engine_args = EngineArgs(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=config["limit_mm_per_prompt"],
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
    )
    llm = LLM(**{**asdict(engine_args), "seed": 42})

    sampling_params = SamplingParams(**config["sampling_params"])
    outputs = llm.chat(messages=messages, sampling_params=sampling_params)

    # Default validator: more than 10 characters of generated text.
    for output in outputs:
        text = output.outputs[0].text
        is_valid = config.get("output_validator", lambda x: len(x) > 10)
        assert is_valid(text), (
            f"Validation failed for {config['model_name']}: {text}"
        )
|
||||
|
||||
|
||||
def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
    """Run the video test once per configured pruning rate.

    Args:
        config: Model configuration. Reads ``video_params`` (``num_frames``,
            ``pruning_rates``), ``model_name``, ``max_model_len``,
            ``max_num_seqs``, ``limit_mm_per_prompt``, ``runner_kwargs`` and
            ``sampling_params["max_tokens"]``.
        mm_encoder_attn_backend: Attention backend passed to the runner for
            the multimodal encoder (``None`` leaves the choice to vLLM).
        video_assets: Video assets; frames are sampled from each, and the
            first sampled video is the one sent to the model.
        vllm_runner: Runner factory used as a context manager.

    Raises:
        AssertionError: If a run does not yield exactly one non-empty string
            output with at least one token id.
    """
    video_params = config["video_params"]
    num_frames = video_params["num_frames"]

    # Frame sampling and the prompt do not depend on the pruning rate, so
    # they are prepared once instead of once per loop iteration.
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
    prompts = [build_qwen2_5_video_prompt()]
    videos = [sampled_vids[0]]

    for pruning_rate in video_params["pruning_rates"]:
        with vllm_runner(
            config["model_name"],
            max_model_len=config["max_model_len"],
            max_num_seqs=config["max_num_seqs"],
            limit_mm_per_prompt=config["limit_mm_per_prompt"],
            tensor_parallel_size=1,
            video_pruning_rate=pruning_rate,
            mm_encoder_attn_backend=mm_encoder_attn_backend,
            hf_overrides=dummy_hf_overrides,
            load_format="dummy",
            **config["runner_kwargs"],
        ) as vllm_model:
            outputs = vllm_model.generate_greedy(
                prompts,
                config["sampling_params"]["max_tokens"],
                videos=videos,
            )

            # Validate output
            assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
            output_ids, output_text = outputs[0]
            assert len(output_ids) > 0, "Generated no output IDs"
            assert len(output_text) > 0, "Generated empty text"
            assert isinstance(output_text, str), (
                f"Output is not string: {type(output_text)}"
            )
|
||||
|
||||
|
||||
# Main test function
|
||||
@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
@pytest.mark.skip(reason="Broken test due to memory segmentation fault")
@create_new_process_for_each_test()
def test_vit_backend_functionality(
    model_key: str,
    mm_encoder_attn_backend: AttentionBackendEnum | None,
    image_assets,
    video_assets,
    vllm_runner,
    request,
):
    """Check that a multimodal model generates with a given ViT backend.

    Steps:
    1. Skip when the model lists ``supported_backends`` and the requested
       backend is not among them (``None`` is never filtered).
    2. Apply any marks listed under ``gpu_marks`` in the model config.
    3. Dispatch to the video handler for ``media_type == "video"``,
       otherwise to the handler matching ``config["interface"]``.
    4. The handler validates the generated output.
    """
    config = MODEL_CONFIGS[model_key]

    # Step 1: backend filtering.
    if mm_encoder_attn_backend is not None and "supported_backends" in config:
        if mm_encoder_attn_backend not in config["supported_backends"]:
            pytest.skip(
                f"{model_key} does not support {mm_encoder_attn_backend} backend now."
            )

    # Step 2: apply GPU marks dynamically.
    for mark in config.get("gpu_marks", ()):
        request.applymarker(mark)

    # Step 3: route to the appropriate handler; video takes precedence over
    # the declared interface.
    if config.get("media_type") == "video":
        run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
        return

    image_handlers = {
        "llm_chat": run_llm_chat_test,
        "llm_generate": run_llm_generate_test,
    }
    handler = image_handlers.get(config["interface"])
    if handler is None:
        raise ValueError(f"Unknown interface: {config['interface']}")
    handler(config, mm_encoder_attn_backend, image_assets)
|
||||
114
tests/models/multimodal/generation/test_voxtral.py
Normal file
114
tests/models/multimodal/generation/test_voxtral.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
|
||||
from mistral_common.protocol.instruct.messages import UserMessage
|
||||
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
|
||||
from ....conftest import AudioTestAssets
|
||||
from ....utils import RemoteOpenAIServer
|
||||
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
|
||||
|
||||
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode",
|
||||
"mistral",
|
||||
"--config_format",
|
||||
"mistral",
|
||||
"--load_format",
|
||||
"mistral",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture()
def server(request, audio_assets: AudioTestAssets):
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it.

    The server runs in eager mode with Mistral tokenizer/config/load formats
    and accepts as many audios per prompt as there are audio assets.
    """
    audio_limit = json.dumps({"audio": len(audio_assets)})
    cli_args = [
        "--enforce-eager",
        "--limit-mm-per-prompt",
        audio_limit,
        *MISTRAL_FORMAT_ARGS,
    ]
    server_env = {"VLLM_AUDIO_FETCH_TIMEOUT": "30"}

    with RemoteOpenAIServer(
        MODEL_NAME, cli_args, env_dict=server_env
    ) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture.

    The client is opened with ``server.get_async_client()`` and is closed
    again once the consuming test has finished.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
|
||||
|
||||
|
||||
def _get_prompt(audio_assets, question):
    """Build the Voxtral prompt from all audio assets plus a question.

    Each asset's local file is loaded and wrapped in an ``AudioChunk``; the
    chunks and a ``TextChunk`` with ``question`` form one user message, which
    is converted to OpenAI format and passed to the Mistral tokenizer's
    ``apply_chat_template``.
    """
    tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)

    audio_chunks = []
    for asset in audio_assets:
        audio = Audio.from_file(str(asset.get_local_path()), strict=False)
        audio_chunks.append(AudioChunk(input_audio=RawAudio.from_audio(audio)))

    user_message = UserMessage(content=[*audio_chunks, TextChunk(text=question)])
    return tokenizer.apply_chat_template(messages=[user_message.to_openai()])
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Generate from one prompt that references every audio asset at once.

    Uses the multi-audio runner imported from the Ultravox tests, with the
    Mistral tokenizer mode.
    """
    prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
    audios = [asset.audio_and_sample_rate for asset in audio_assets]

    run_multi_audio_test(
        vllm_runner,
        [(prompt, audios)],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tokenizer_mode="mistral",
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Send all audio assets inline in one chat request to the served model.

    Each asset is loaded from its local path, tagged as ``wav`` and converted
    to an OpenAI-format audio chunk. The request is capped at 10 new tokens,
    so exactly one choice that stopped because of the length limit is
    expected.
    """
    content = []
    for asset in audio_assets:
        audio = Audio.from_file(str(asset.get_local_path()), strict=False)
        audio.format = "wav"
        content.append(AudioChunk.from_audio(audio).to_openai())

    question = f"What's happening in these {len(audio_assets)} audio clips?"
    content.append({"type": "text", "text": question})

    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": content}],
        max_tokens=10,
    )

    assert len(response.choices) == 1
    only_choice = response.choices[0]
    assert only_choice.finish_reason == "length"
|
||||
178
tests/models/multimodal/generation/test_whisper.py
Normal file
178
tests/models/multimodal/generation/test_whisper.py
Normal file
@@ -0,0 +1,178 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
from typing import Any
|
||||
|
||||
import librosa
|
||||
import pytest
|
||||
from transformers import AutoModelForSpeechSeq2Seq
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import HfRunner, PromptAudioInput, VllmRunner
|
||||
from ....utils import create_new_process_for_each_test, multi_gpu_test
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
|
||||
HF_PROMPT = ""
|
||||
# Whisper expects 16kHz audio
|
||||
WHISPER_SAMPLE_RATE = 16000
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def use_spawn_for_whisper(monkeypatch):
    """Set ``VLLM_WORKER_MULTIPROC_METHOD`` to ``spawn`` for every test here.

    Whisper has issues with forked workers, so spawn is used instead.
    """
    env_name, env_value = "VLLM_WORKER_MULTIPROC_METHOD", "spawn"
    monkeypatch.setenv(env_name, env_value)
|
||||
|
||||
|
||||
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
    enforce_eager: bool = True,
) -> None:
    """Check that vLLM and HF produce close greedy logprobs for each case.

    Every entry of ``inputs`` is a ``(vllm_prompts, hf_prompts, audios)``
    triple. vLLM is run first over all cases, then the HF reference
    (loaded with ``AutoModelForSpeechSeq2Seq``), and the outputs are compared
    case by case with ``check_logprobs_close``.
    """
    vllm_outputs_per_case = []
    with vllm_runner(
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        limit_mm_per_prompt={"audio": 2},
        enforce_eager=enforce_eager,
        disable_custom_all_reduce=True,
    ) as vllm_model:
        for vllm_prompts, _, audios in inputs:
            case_outputs = vllm_model.generate_greedy_logprobs(
                vllm_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            vllm_outputs_per_case.append(case_outputs)

    hf_outputs_per_case = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
        for _, hf_prompts, audios in inputs:
            case_outputs = hf_model.generate_greedy_logprobs_limit(
                hf_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            hf_outputs_per_case.append(case_outputs)

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
|
||||
|
||||
|
||||
@pytest.fixture
def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
    """Build one test case per audio asset at Whisper's sample rate.

    Each case is ``(vLLM prompts, HF prompts, audio inputs)``, where the
    audio input is a single ``(waveform, WHISPER_SAMPLE_RATE)`` pair. Audio
    whose original rate differs is resampled with librosa.
    """
    assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]

    cases = []
    for asset in assets:
        waveform, native_sr = asset.audio_and_sample_rate
        needs_resample = native_sr != WHISPER_SAMPLE_RATE
        if needs_resample:
            # Whisper expects 16kHz audio.
            waveform = librosa.resample(
                waveform, orig_sr=native_sr, target_sr=WHISPER_SAMPLE_RATE
            )
        case = ([VLLM_PROMPT], [HF_PROMPT], [(waveform, WHISPER_SAMPLE_RATE)])
        cases.append(case)
    return cases
|
||||
|
||||
|
||||
def check_model_available(model: str) -> None:
    """Skip the current test if ``model`` cannot be used in this environment.

    Looks the model up in ``HF_EXAMPLE_MODELS`` and runs its online
    availability and transformers version checks, both configured to skip on
    failure.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    for run_check in (
        hf_info.check_available_online,
        hf_info.check_transformers_version,
    ):
        run_check(on_fail="skip")
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("enforce_eager", [True, False])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
    num_logprobs: int,
    input_audios,
    enforce_eager: bool,
) -> None:
    """Compare vLLM against HF for Whisper on a single device.

    Runs in both eager and non-eager mode; the non-eager variant is skipped
    on CPU.
    """
    check_model_available(model)

    non_eager_on_cpu = current_platform.is_cpu() and not enforce_eager
    if non_eager_on_cpu:
        pytest.skip("Skipping test for CPU with non-eager mode")

    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=200,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
        enforce_eager=enforce_eager,
    )
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models_distributed(
    hf_runner,
    vllm_runner,
    model: str,
    distributed_executor_backend: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    input_audios,
) -> None:
    """Compare vLLM against HF for Whisper with tensor parallel size 2.

    Parametrized over the ``ray`` and ``mp`` executor backends; runs in
    non-eager mode.
    """
    check_model_available(model)

    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=False,
    )
|
||||
347
tests/models/multimodal/generation/vlm_utils/builders.py
Normal file
347
tests/models/multimodal/generation/vlm_utils/builders.py
Normal file
@@ -0,0 +1,347 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Helpers for building inputs that can be leveraged for different test types."""
|
||||
|
||||
from collections.abc import Callable, Iterable
|
||||
from pathlib import PosixPath
|
||||
from typing import Any
|
||||
|
||||
import numpy.typing as npt
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.audio import AudioResampler
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
|
||||
from .types import (
|
||||
SINGLE_AUDIO_BASE_PROMPT,
|
||||
SINGLE_IMAGE_BASE_PROMPTS,
|
||||
TEST_AUDIO_PLACEHOLDER,
|
||||
TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER,
|
||||
VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper,
|
||||
PromptWithMultiModalInput,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
)
|
||||
|
||||
|
||||
def replace_test_placeholder(
    prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
) -> str:
    """Swap every ``test_placeholder`` in ``prompt`` for a model-specific tag.

    The n-th occurrence (counting from 1) is replaced by
    ``mm_idx_to_prompt(n)``. A prompt without the placeholder is returned
    unchanged.
    """
    leading, *following = prompt.split(test_placeholder)
    pieces = [leading]
    for mm_idx, segment in enumerate(following, start=1):
        pieces.append(mm_idx_to_prompt(mm_idx))
        pieces.append(segment)
    return "".join(pieces)
|
||||
|
||||
|
||||
def get_model_prompts(
    base_prompts: Iterable[str],
    img_idx_to_prompt: Callable[[int], str] | None,
    video_idx_to_prompt: Callable[[int], str] | None,
    audio_idx_to_prompt: Callable[[int], str] | None,
    prompt_formatter: Callable[[str], str],
) -> list[str]:
    r"""Turn model-agnostic base prompts into prompts for a specific model.

    For each base prompt, the image, video and audio test placeholders are
    replaced (in that order, each only when its converter is provided) and
    the result is passed through ``prompt_formatter``.

    Example for phi3v, given the base_prompt: "<image>What is the season?"
        1. Replace img placeholder(s)
          -> "<|image_1|>\nWhat is the season?"
        2. Apply prompt formatter:
          -> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
    """
    assert isinstance(base_prompts, (list, tuple))

    # Converter / placeholder pairs, in the order they are applied.
    replacements = (
        (img_idx_to_prompt, TEST_IMG_PLACEHOLDER),
        (video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER),
        (audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER),
    )

    model_prompts = []
    for base_prompt in base_prompts:
        tagged_prompt = base_prompt
        for idx_to_prompt, test_placeholder in replacements:
            if idx_to_prompt:
                tagged_prompt = replace_test_placeholder(
                    tagged_prompt, idx_to_prompt, test_placeholder
                )
        # Wrap the tagged prompt in the model's prompt format.
        model_prompts.append(prompt_formatter(tagged_prompt))
    return model_prompts
|
||||
|
||||
|
||||
def build_single_image_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
    tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
    """Build single-image inputs from a test configuration and image assets.

    The single-image prompts of ``test_info`` are converted to model prompts
    and paired one-to-one with the image assets.

    Raises:
        ValueError: If ``test_info`` has no prompt formatter, or it has a
            prompt path encoder but ``tmp_path`` is not given.
    """
    prompt_formatter = test_info.prompt_formatter
    if prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build single image inputs")

    model_prompts = get_model_prompts(
        test_info.single_image_prompts,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        prompt_formatter,
    )

    # For models that require a local path / URL encoded in the image; export
    # assets and encode into tmp_path for this test. This should be avoided
    # where possible (currently needed for Qwen-VL).
    path_encoder = test_info.prompt_path_encoder
    if path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        encoded_prompts = []
        for prompt, asset in zip(model_prompts, image_assets):
            encoded_prompts.append(path_encoder(tmp_path, prompt, [asset]))
        model_prompts = encoded_prompts

    images = [asset.pil_image for asset in image_assets]
    assert len(images) == len(model_prompts)
    return build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
|
||||
|
||||
def build_single_image_inputs(
    images, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
    """Pair each image with its prompt, expanded over every configured size.

    For each image / prompt pair the result holds one entry whose ``prompts``
    repeats the prompt once per size in ``size_wrapper.data`` and whose
    ``image_data`` holds the image after ``apply_image_size_scaling`` has been
    applied for each of those sizes.
    """
    size_kind = size_wrapper.type
    built_inputs = []
    for image, prompt in zip(images, model_prompts):
        repeated_prompts = [prompt for _ in size_wrapper.data]
        scaled_images = []
        for size in size_wrapper.data:
            scaled_images.append(apply_image_size_scaling(image, size, size_kind))
        built_inputs.append(
            PromptWithMultiModalInput(
                prompts=repeated_prompts,
                image_data=scaled_images,
            )
        )
    return built_inputs
|
||||
|
||||
|
||||
def build_multi_image_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
    tmp_path: PosixPath | None = None,
) -> list[PromptWithMultiModalInput]:
    """Build multi-image test inputs from a model's test configuration.

    A single prompt (``test_info.multi_image_prompt``) is combined with the
    full list of image assets.

    Raises:
        ValueError: If no prompt formatter is set, or if a prompt path
            encoder is configured but ``tmp_path`` is None.
    """
    formatter = test_info.prompt_formatter
    if formatter is None:
        raise ValueError("Prompt formatter must be set to build multi image inputs")

    model_prompts = get_model_prompts(
        [test_info.multi_image_prompt],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        formatter,
    )

    path_encoder = test_info.prompt_path_encoder
    if path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        # Every prompt is encoded against the complete set of assets
        encoded_prompts = []
        for model_prompt in model_prompts:
            encoded_prompts.append(path_encoder(tmp_path, model_prompt, image_assets))
        model_prompts = encoded_prompts

    pil_images = [asset.pil_image for asset in image_assets]

    # There is currently exactly one multi-image list and one prompt
    return build_multi_image_inputs(
        image_lists=[pil_images],
        model_prompts=model_prompts,
        size_wrapper=size_wrapper,
    )
|
||||
|
||||
|
||||
def build_multi_image_inputs(
    image_lists, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
    """Pair each image list with its prompt, expanded over every size.

    For each image-list / prompt pair, ``prompts`` repeats the prompt once per
    size in ``size_wrapper.data`` and ``image_data`` holds, per size, the list
    of all images scaled with that size.
    """
    size_kind = size_wrapper.type
    built_inputs = []
    for images, prompt in zip(image_lists, model_prompts):
        repeated_prompts = [prompt for _ in size_wrapper.data]
        images_per_size = []
        for size in size_wrapper.data:
            scaled_group = [
                apply_image_size_scaling(image, size, size_kind) for image in images
            ]
            images_per_size.append(scaled_group)
        built_inputs.append(
            PromptWithMultiModalInput(
                prompts=repeated_prompts,
                image_data=images_per_size,
            )
        )
    return built_inputs
|
||||
|
||||
|
||||
def build_embedding_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: ImageTestAssets,
    size_wrapper: ImageSizeWrapper,
):
    """Build image inputs plus the matching embedding inputs for vLLM.

    Returns:
        A pair ``(inputs, vllm_embeddings)``: both are built with
        ``build_single_image_inputs`` from the same prompts, the first from
        the PIL images and the second from the converted embeddings.

    Raises:
        ValueError: If the prompt formatter or the embedding conversion
            function is missing, or if the sizes are not constant 1.0
            size factors.
    """
    # Filtering normally guarantees these; re-check for direct callers.
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build image embedding inputs")

    has_unit_factors = size_wrapper.type == SizeType.SIZE_FACTOR and all(
        factor == 1.0 for factor in size_wrapper.data
    )
    if not has_unit_factors:
        raise ValueError("Embedding tests require constant (1.0) size factors")

    if test_info.convert_assets_to_embeddings is None:
        raise ValueError("No conversion func for getting embeddings found")

    model_prompts = get_model_prompts(
        SINGLE_IMAGE_BASE_PROMPTS,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )

    images = [asset.pil_image for asset in image_assets]
    embeds = test_info.convert_assets_to_embeddings(image_assets)
    if test_info.dtype != "auto":
        # Cast the embeddings to the dtype named in the test configuration
        target_dtype = getattr(torch, test_info.dtype)  # type: ignore
        embeds = [embed.to(dtype=target_dtype) for embed in embeds]
    assert len(images) == len(model_prompts)

    image_inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
    embedding_inputs = build_single_image_inputs(embeds, model_prompts, size_wrapper)
    return image_inputs, embedding_inputs
|
||||
|
||||
|
||||
def build_video_inputs_from_test_info(
    test_info: VLMTestInfo,
    video_assets: VideoTestAssets,
    size_wrapper: ImageSizeWrapper,
    num_frames: int,
    needs_video_metadata: bool,
) -> list[PromptWithMultiModalInput]:
    """Build video test inputs from a model's test configuration.

    Each asset is sampled with ``sample_frames_with_video_metadata`` and then
    scaled once per size in ``size_wrapper.data``. When
    ``needs_video_metadata`` is true every video entry is a
    ``(video, metadata)`` tuple, otherwise just the scaled video.

    Raises:
        ValueError: If no prompt formatter is set.
    """
    formatter = test_info.prompt_formatter
    if formatter is None:
        raise ValueError("Prompt formatter must be set to build video inputs")

    model_prompts = get_model_prompts(
        [VIDEO_BASE_PROMPT],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        formatter,
    )

    sampled_vids = []
    for asset in video_assets:
        video_with_meta = (asset.np_ndarrays, asset.metadata)
        sampled_vids.append(
            sample_frames_with_video_metadata(video_with_meta, num_frames)
        )

    # Fixed sizes resize outright; anything else is treated as a scale factor
    if size_wrapper.type == SizeType.FIXED_SIZE:
        video_scaler = resize_video
    else:
        video_scaler = rescale_video_size

    video_inputs = []
    for (video, meta), prompt in zip(sampled_vids, model_prompts):
        repeated_prompts = [prompt for _ in size_wrapper.data]
        video_data = []
        for size in size_wrapper.data:
            scaled = video_scaler(video, size)
            video_data.append((scaled, meta) if needs_video_metadata else scaled)
        video_inputs.append(
            PromptWithMultiModalInput(
                prompts=repeated_prompts,
                video_data=video_data,
            )
        )
    return video_inputs
|
||||
|
||||
|
||||
def sample_frames_with_video_metadata(
    video_with_meta: tuple[npt.NDArray, dict[str, Any]],
    num_frames: int,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """Sample frames from a video and return metadata describing the sample.

    Args:
        video_with_meta: ``(video, metadata)`` pair. The metadata must provide
            the ``"total_num_frames"`` and ``"duration"`` keys.
        num_frames: Frame count passed to ``sample_frames_from_video`` and
            recorded in the returned metadata.

    Returns:
        The sampled video together with an updated *copy* of the metadata.
        The caller's dict is left untouched so that metadata shared between
        calls (e.g. coming from a reused asset) is not corrupted: the
        ``"do_sample_frames"`` flag below depends on the original
        ``"total_num_frames"`` value, which this function overwrites.
    """
    video, original_meta = video_with_meta
    video = sample_frames_from_video(video, num_frames)

    # Shallow copy: only top-level keys are reassigned below.
    meta = dict(original_meta)
    meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
    meta["total_num_frames"] = num_frames
    # NOTE(review): this stores duration / num_frames under "fps"; if
    # "duration" is in seconds that is seconds-per-frame rather than
    # frames-per-second. Kept as-is to preserve behavior -- confirm intent.
    meta["fps"] = meta["duration"] / num_frames
    meta["frames_indices"] = list(range(num_frames))
    return video, meta
|
||||
|
||||
|
||||
def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
    """Apply one size setting to one image.

    ``size`` is either a size factor (handled by ``rescale_image_size``) or a
    fixed size (handled by ``image.resize``), selected by ``size_type``.
    Tensors (embeddings) are returned unchanged and are only accepted with a
    size factor equal to 1.

    Raises:
        ValueError: If ``size_type`` is neither SIZE_FACTOR nor FIXED_SIZE.
    """
    # Embeddings cannot be rescaled, so only the constant scale is valid and
    # the tensor itself is handed back.
    if isinstance(image, torch.Tensor):
        assert size_type == SizeType.SIZE_FACTOR and size == 1
        return image

    # `size` is an image size factor
    if size_type == SizeType.SIZE_FACTOR:
        return rescale_image_size(image, size)

    # `size` is a fixed target size
    if size_type == SizeType.FIXED_SIZE:
        return image.resize(size)

    raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
|
||||
|
||||
|
||||
def build_audio_inputs_from_test_info(
    test_info: VLMTestInfo,
    audio_assets: AudioTestAssets,
) -> list[PromptWithMultiModalInput]:
    """Build audio test inputs from a model's test configuration.

    Every asset's audio is resampled with an ``AudioResampler`` configured for
    a 16000 target sample rate, and all prompts / audios are returned in a
    single ``PromptWithMultiModalInput``.

    Raises:
        ValueError: If no prompt formatter is set.
    """
    formatter = test_info.prompt_formatter
    if formatter is None:
        raise ValueError("Prompt formatter must be set to build audio inputs")

    model_prompts = get_model_prompts(
        SINGLE_AUDIO_BASE_PROMPT,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.audio_idx_to_prompt,
        formatter,
    )

    resampler = AudioResampler(target_sr=16000, method="librosa")

    # Load every (audio, sample_rate) pair first, then resample them
    audios = [asset.audio_and_sample_rate for asset in audio_assets]
    resampled_audios = []
    for audio, sr in audios:
        resampled = resampler.resample(audio, orig_sr=sr)
        resampled_audios.append((resampled, int(resampler.target_sr)))

    audio_input = PromptWithMultiModalInput(
        prompts=model_prompts,
        audio_data=resampled_audios,
    )
    return [audio_input]
|
||||
183
tests/models/multimodal/generation/vlm_utils/case_filtering.py
Normal file
183
tests/models/multimodal/generation/vlm_utils/case_filtering.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Utils for determining which subset of model tests belong to a specific
|
||||
modality, getting all combinations (similar to pytest's parametrization),
|
||||
handling multimodal placeholder substitution, and so on.
|
||||
"""
|
||||
|
||||
import itertools
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Iterable
|
||||
|
||||
import pytest
|
||||
|
||||
from .types import (
|
||||
EMBEDDING_SIZE_FACTORS,
|
||||
ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
VLMTestType,
|
||||
)
|
||||
|
||||
|
||||
def get_filtered_test_settings(
    test_settings: dict[str, VLMTestInfo],
    test_type: VLMTestType,
    new_proc_per_test: bool,
) -> dict[str, VLMTestInfo]:
    """Select the test settings that apply to one test type.

    A test is kept when it has ``test_type`` enabled and when "a distributed
    executor backend is configured" equals ``new_proc_per_test``. Tests with
    the right type are sanity-checked (via asserts) before the process
    handling filter is applied.
    """

    def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
        # test_info.test_type may be a single type or an iterable of types
        enabled = test_info.test_type
        if enabled == test_type:
            return True
        return isinstance(enabled, Iterable) and test_type in enabled

    matching_tests = {}
    for test_name, test_info in test_settings.items():
        if not matches_test_type(test_info, test_type):
            continue

        # Embedding tests need to have a conversion func in their test info
        if matches_test_type(test_info, VLMTestType.EMBEDDING):
            assert test_info.convert_assets_to_embeddings is not None

        if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
            # Custom test inputs need to explicitly define the mm limit/inputs
            assert test_info.custom_test_opts is not None and isinstance(
                test_info.custom_test_opts, Iterable
            )
        else:
            # For all types besides custom inputs, we need a prompt formatter
            assert test_info.prompt_formatter is not None

        # Everything looks okay; keep if this is correct proc handling
        has_executor_backend = test_info.distributed_executor_backend is not None
        if has_executor_backend == new_proc_per_test:
            matching_tests[test_name] = test_info

    return matching_tests
|
||||
|
||||
|
||||
def get_model_type_cases(
    model_type: str,
    test_info: VLMTestInfo,
    test_type: VLMTestType,
):
    """Expand one model's test info into a list of ``pytest.param`` cases.

    This is essentially the same as nesting a bunch of mark.parametrize
    decorators, but done programmatically to allow overrides on a per-model
    basis, while still executing each combination as an individual pytest
    test case.

    Args:
        model_type: Key identifying the model; first value of every param.
        test_info: Test configuration to expand.
        test_type: The test type the cases are generated for.

    Returns:
        A list of ``pytest.param(model_type, ExpandableVLMTestArgs(...))``
        carrying ``test_info.marks``; one per combination of the options.

    Raises:
        ValueError: If sizes are required for ``test_type`` but none are
            configured, or if ``test_type`` is CUSTOM_INPUTS and no custom
            test options are given.
    """

    def ensure_wrapped(value):
        # Wrap a scalar option into a 1-tuple; lists/tuples pass through
        return value if isinstance(value, (list, tuple)) else (value,)

    iter_kwargs = OrderedDict(
        [
            ("model", ensure_wrapped(test_info.models)),
            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
            ("dtype", ensure_wrapped(test_info.dtype)),
            (
                "distributed_executor_backend",
                ensure_wrapped(test_info.distributed_executor_backend),
            ),
        ]
    )

    # num_frames is video only
    if test_type == VLMTestType.VIDEO:
        iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
        iter_kwargs["needs_video_metadata"] = ensure_wrapped(
            test_info.needs_video_metadata
        )

    # No sizes passed for custom inputs, since inputs are directly provided
    if test_type not in (
        VLMTestType.CUSTOM_INPUTS,
        VLMTestType.AUDIO,
    ):
        wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
        # get_wrapped_test_sizes returns a (possibly empty) tuple, never None.
        # An empty tuple would make the product below yield zero cases, i.e.
        # the test would silently vanish, so treat "empty" as "not set".
        if not wrapped_sizes:
            raise ValueError(f"Sizes must be set for test type {test_type}")
        iter_kwargs["size_wrapper"] = wrapped_sizes

    # Otherwise expand the custom test options instead
    elif test_type == VLMTestType.CUSTOM_INPUTS:
        if test_info.custom_test_opts is None:
            raise ValueError("Test has type CUSTOM_INPUTS, but none given")
        iter_kwargs["custom_test_opts"] = test_info.custom_test_opts

    marks = test_info.marks if test_info.marks is not None else []

    # Wrap all model cases in a pytest parameter & pass marks through
    return [
        pytest.param(
            model_type,
            ExpandableVLMTestArgs(**dict(zip(iter_kwargs, case))),
            marks=marks,
        )
        for case in itertools.product(*iter_kwargs.values())
    ]
|
||||
|
||||
|
||||
def get_parametrized_options(
    test_settings: dict[str, VLMTestInfo],
    test_type: VLMTestType,
    create_new_process_for_each_test: bool,
):
    """Flatten every matching VLMTestInfo into one list of test parameters.

    The settings are first filtered with ``get_filtered_test_settings``; each
    remaining model type is expanded with ``get_model_type_cases`` and the
    per-model case lists are concatenated, in order, so they can be consumed
    by a single mark.parametrize call.
    """
    matching_tests = get_filtered_test_settings(
        test_settings, test_type, create_new_process_for_each_test
    )

    all_cases = []
    for model_type, test_info in matching_tests.items():
        all_cases.extend(get_model_type_cases(model_type, test_info, test_type))
    return all_cases
|
||||
|
||||
|
||||
def get_wrapped_test_sizes(
    test_info: VLMTestInfo, test_type: VLMTestType
) -> tuple[ImageSizeWrapper, ...]:
    """Wrap a test info's size factors / fixed sizes for parameter expansion.

    Args:
        test_info: Test configuration to be expanded.
        test_type: The type of test being filtered for.

    Returns:
        A tuple of ``ImageSizeWrapper``: the EMBEDDING_SIZE_FACTORS for
        embedding tests, an empty tuple for audio / custom input tests, and
        otherwise the wrapped size factors followed by the wrapped fixed
        sizes of ``test_info``.
    """
    # Embedding tests always use the EMBEDDING_SIZE_FACTORS
    if test_type == VLMTestType.EMBEDDING:
        return tuple(
            ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
            for factor in EMBEDDING_SIZE_FACTORS
        )

    # Audio and Custom inputs have preprocessed inputs
    if test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
        return ()

    size_factors = test_info.image_size_factors or []
    fixed_sizes = test_info.image_sizes or []

    wrappers = []
    for factor in size_factors:
        wrappers.append(ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor))
    for size in fixed_sizes:
        wrappers.append(ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size))
    return tuple(wrappers)
|
||||
189
tests/models/multimodal/generation/vlm_utils/core.py
Normal file
189
tests/models/multimodal/generation/vlm_utils/core.py
Normal file
@@ -0,0 +1,189 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Core test implementation to be shared across modalities."""
|
||||
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
from .....conftest import HfRunner, VllmRunner
|
||||
from ....registry import HF_EXAMPLE_MODELS
|
||||
from .types import PromptWithMultiModalInput, RunnerOutput
|
||||
|
||||
|
||||
def run_test(
    *,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[PromptWithMultiModalInput],
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    enforce_eager: bool,
    max_model_len: int,
    max_num_seqs: int,
    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
    auto_cls: type[_BaseAutoModelClass],
    use_tokenizer_eos: bool,
    comparator: Callable[..., None],
    get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None,
    stop_str: list[str] | None,
    limit_mm_per_prompt: dict[str, int],
    vllm_runner_kwargs: dict[str, Any] | None,
    hf_model_kwargs: dict[str, Any] | None,
    patch_hf_runner: Callable[[HfRunner], HfRunner] | None,
    runner: RunnerOption = "auto",
    distributed_executor_backend: str | None = None,
    tensor_parallel_size: int = 1,
    vllm_embeddings: list[PromptWithMultiModalInput] | None = None,
):
    """Modality agnostic test executor for comparing HF/vLLM outputs.

    vLLM generates first, then the HF runner; both result sets are
    optionally post-processed and handed pairwise to ``comparator``.

    Args:
        inputs: Inputs given to the HF runner, and to vLLM unless
            ``vllm_embeddings`` is set.
        hf_output_post_proc: Optional post-processor for HF outputs.
        vllm_output_post_proc: Optional post-processor for vLLM outputs.
        use_tokenizer_eos: Pass the HF tokenizer's ``eos_token_id`` to HF
            generation.
        comparator: Called with ``outputs_0_lst`` (HF), ``outputs_1_lst``
            (vLLM), ``name_0`` and ``name_1`` for each input entry.
        get_stop_token_ids: Optional; derives vLLM ``stop_token_ids`` from
            the vLLM tokenizer.
        stop_str: Optional stop strings, passed to both runners.
        limit_mm_per_prompt: Per-modality limits; modalities not listed are
            set to 0.
        vllm_runner_kwargs: Extra vLLM runner kwargs; these override the
            values derived from the model registry info.
        patch_hf_runner: Optional hook to patch the HF runner before use.
        vllm_embeddings: Inputs given to vLLM instead of ``inputs`` (used by
            embedding tests). Iterated exactly like ``inputs``, i.e. it holds
            prompt / image / video / audio entries, not a bare tensor.

    The remaining arguments are forwarded to the vLLM and/or HF runner
    constructors and generation calls as-is.
    """
    # In the case of embeddings, vLLM takes separate input tensors
    vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    limit_mm_per_prompt = default_limits | limit_mm_per_prompt

    vllm_outputs_per_mm = []
    hf_outputs_per_mm = []

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # Registry-derived defaults; explicit vllm_runner_kwargs win (see update)
    vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0}
    if model_info.tokenizer:
        vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
    if model_info.tokenizer_mode:
        vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
    if model_info.hf_overrides:
        vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
    if model_info.require_embed_inputs:
        for k in ("skip_tokenizer_init", "enable_prompt_embeds", "enable_mm_embeds"):
            vllm_runner_kwargs_[k] = model_info.require_embed_inputs

    if vllm_runner_kwargs:
        vllm_runner_kwargs_.update(vllm_runner_kwargs)

    with vllm_runner(
        model,
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        dtype=dtype,
        limit_mm_per_prompt=limit_mm_per_prompt,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=enforce_eager,
        runner=runner,
        **vllm_runner_kwargs_,
    ) as vllm_model:
        tokenizer = vllm_model.llm.get_tokenizer()

        vllm_kwargs: dict[str, Any] = {}
        if get_stop_token_ids is not None:
            vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
        if stop_str:
            vllm_kwargs["stop"] = stop_str

        # One generation call per input entry
        for prompts, image_data, video_data, audio_data in vllm_inputs:
            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
            vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
            vllm_output = vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                **vllm_kwargs_with_mm_data,
            )
            vllm_outputs_per_mm.append(vllm_output)

    hf_model = hf_runner(
        model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
    )

    # Some models need to patch things like the model processor, e.g., internvl
    if patch_hf_runner is not None:
        hf_model = patch_hf_runner(hf_model)

    with hf_model, torch.no_grad():
        tokenizer = hf_model.tokenizer

        # Some models need to explicitly pass the eos_token_id off the tokenizer
        # or processor for a good comparison;
        # currently assume processor/tokenizer agree on the EOS, and pull it off
        # the tokenizer if requested.
        hf_kwargs = {}
        if use_tokenizer_eos:
            hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
        if stop_str:
            hf_kwargs["stop_strings"] = stop_str

        # The HF runner always consumes the original (non-embedding) inputs
        for prompts, image_data, video_data, audio_data in inputs:
            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
            hf_kwargs_with_mm_data = hf_kwargs | mm_data
            hf_output = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                tokenizer=tokenizer,
                **hf_kwargs_with_mm_data,
            )
            hf_outputs_per_mm.append(hf_output)

    # Apply output processing / sanitation to the vLLM and HF runner results
    hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
        model,
        first_runner_outputs=hf_outputs_per_mm,
        second_runner_outputs=vllm_outputs_per_mm,
        first_runner_processor=hf_output_post_proc,
        second_runner_processor=vllm_output_post_proc,
    )

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
        # This is usually check_logprobs_close, but it's passed through to
        # allow things like check_outputs_equal where needed
        comparator(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
|
||||
|
||||
|
||||
def process_runner_outputs(
    model,
    first_runner_outputs,
    second_runner_outputs,
    first_runner_processor=None,
    second_runner_processor=None,
):
    """Post-process the outputs of two runners, each with its own processor.

    A runner's outputs are returned untouched when its processor is None.

    Returns:
        The pair ``(first_runner_outputs, second_runner_outputs)`` after any
        processing.
    """
    processed_first = first_runner_outputs
    if first_runner_processor is not None:
        processed_first = process_outputs(
            first_runner_processor, model, first_runner_outputs
        )

    processed_second = second_runner_outputs
    if second_runner_processor is not None:
        processed_second = process_outputs(
            second_runner_processor, model, second_runner_outputs
        )

    return processed_first, processed_second
|
||||
|
||||
|
||||
def process_outputs(output_processor, model, outputs_per_image):
    """Run a model specific post-processor over a runner's nested outputs.

    ``output_processor(result, model)`` is called for every result of every
    inner output list; the nesting of ``outputs_per_image`` is preserved.
    """
    processed_per_image = []
    for outputs in outputs_per_image:
        processed = []
        for res in outputs:
            processed.append(output_processor(res, model))
        processed_per_image.append(processed)
    return processed_per_image
|
||||
156
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
Normal file
156
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
|
||||
from collections.abc import Callable
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
||||
from .builders import build_multi_image_inputs, build_single_image_inputs
|
||||
from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
|
||||
|
||||
|
||||
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
    """Builds inputs for multi-image (varied sizes/aspect ratio) testing.

    Four prompts are paired with, in order: two images, two images of
    different scale, four images of mixed scale / aspect ratio, and a single
    image.

    Args:
        formatter: model-specific prompt formatter.
    """
    stop_sign = IMAGE_ASSETS[0].pil_image
    cherry_blossom = IMAGE_ASSETS[1].pil_image

    # Images with different sizes and aspect-ratios
    two_scales = [rescale_image_size(stop_sign, 0.1), stop_sign]
    four_mixed = [
        stop_sign,
        rescale_image_size(stop_sign, 0.25),
        cherry_blossom.resize((183, 488)),
        cherry_blossom.resize((488, 183)),
    ]
    aspect_ratio_images = [
        [stop_sign, cherry_blossom],
        two_scales,
        four_mixed,
        cherry_blossom,
    ]

    # Apply the selected formatter to the base prompts
    base_prompts = (
        "<image><image>\nDescribe 2 images.",
        "<image><image>\nDescribe 2 images.",
        "<image><image><image><image>\nDescribe 4 images.",
        "<image>\nWhat is the season?",
    )
    formatted_prompts = []
    for base_prompt in base_prompts:
        formatted_prompts.append(formatter(base_prompt))

    multi_image_input = PromptWithMultiModalInput(
        prompts=formatted_prompts,
        image_data=aspect_ratio_images,
    )
    return [multi_image_input]
|
||||
|
||||
|
||||
def multi_video_multi_aspect_ratio_inputs(
    formatter: Callable[[str], str], num_frames: int = 16
):
    """Builds inputs for multi-video (varied sizes/aspect ratio) testing.

    Four prompts are paired with, in order: two videos, two videos of
    different scale, four videos of mixed scale / aspect ratio, and a single
    video. All videos derive from the first video asset.

    Args:
        formatter: model-specific prompt formatter.
        num_frames: frame count passed to ``sample_frames_from_video``.
    """
    video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)

    # Videos with different sizes and aspect-ratios
    two_scales = [rescale_video_size(video, 0.1), video]
    four_mixed = [
        video,
        rescale_video_size(video, 0.25),
        resize_video(video, (183, 488)),
        resize_video(video, (488, 183)),
    ]
    aspect_ratio_videos = [
        [video, video],
        two_scales,
        four_mixed,
        video,
    ]

    # Apply the selected formatter to the base prompts
    base_prompts = (
        "<video><video>\nDescribe 2 videos.",
        "<video><video>\nDescribe 2 videos.",
        "<video><video><video><video>\nDescribe 4 videos.",
        "<video>\nWhy is this video funny?",
    )
    formatted_prompts = []
    for base_prompt in base_prompts:
        formatted_prompts.append(formatter(base_prompt))

    multi_video_input = PromptWithMultiModalInput(
        prompts=formatted_prompts,
        video_data=aspect_ratio_videos,
    )
    return [multi_video_input]
|
||||
|
||||
|
||||
def different_patch_input_cases_internvl():
    """Build InternVL single- and multi-image inputs at two size factors.

    All image assets are resized to 896x896 and then expanded with size
    factors 0.5 and 1.0.

    Returns:
        A two-element list: the result of ``build_single_image_inputs``
        followed by the result of ``build_multi_image_inputs``.
    """

    def formatter(img_prompt):
        return f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501

    images = []
    for asset in IMAGE_ASSETS:
        images.append(asset.pil_image.resize((896, 896)))

    formatted_sprompts = [
        formatter("<image>\nWhat's the content in the center of the image?"),
        formatter("<image>\nWhat is the season?"),
    ]
    formatted_mprompts = [
        formatter(
            "Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n"  # noqa: E501
        ),
    ]

    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
    single_image_inputs = build_single_image_inputs(
        images, formatted_sprompts, wrapped_sf
    )
    multi_image_inputs = build_multi_image_inputs(
        [images], formatted_mprompts, wrapped_sf
    )
    return [single_image_inputs, multi_image_inputs]
|
||||
|
||||
|
||||
def windows_attention_image_qwen2_5_vl():
    """Build the single-image input for the Qwen2.5-VL regression case.

    Uses the "hato" image asset at size factor 0.5 with a fixed
    "Describe the image." prompt.
    """
    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
    hato_image = ImageAsset("hato").pil_image

    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
    question = "Describe the image."
    prompt = (
        f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
    )

    half_scale = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
    return build_single_image_inputs([hato_image], [prompt], half_scale)
|
||||
|
||||
|
||||
def video_with_metadata_glm4_1v():
    """Build GLM-4.1V video inputs that carry the video metadata.

    The first video asset is rescaled by 0.1, 0.2 and 0.25; each scaled video
    is wrapped, together with the asset's metadata, in a one-element list and
    paired with the same formatted prompt.
    """
    asset = VIDEO_ASSETS[0]
    video_array = asset.np_ndarrays
    metadata = asset.metadata

    video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
    question = "Describe the video."
    formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"

    video_input = []
    for scale in (0.1, 0.2, 0.25):
        scaled_video = rescale_video_size(video_array, scale)
        video_input.append([(scaled_video, metadata)])

    # One copy of the prompt per scaled video
    prompts = [formatted_prompt for _ in video_input]

    metadata_video_input = PromptWithMultiModalInput(
        prompts=prompts,
        video_data=video_input,
    )
    return [metadata_video_input]
|
||||
1008
tests/models/multimodal/generation/vlm_utils/model_utils.py
Normal file
1008
tests/models/multimodal/generation/vlm_utils/model_utils.py
Normal file
File diff suppressed because it is too large
Load Diff
190
tests/models/multimodal/generation/vlm_utils/runners.py
Normal file
190
tests/models/multimodal/generation/vlm_utils/runners.py
Normal file
@@ -0,0 +1,190 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Entrypoints for wrapping the core run_test implementation for specific test
|
||||
types / modalities.
|
||||
"""
|
||||
|
||||
from pathlib import PosixPath
|
||||
|
||||
from .....conftest import (
|
||||
AudioTestAssets,
|
||||
HfRunner,
|
||||
ImageTestAssets,
|
||||
VideoTestAssets,
|
||||
VllmRunner,
|
||||
)
|
||||
from . import builders, core
|
||||
from .types import ExpandableVLMTestArgs, VLMTestInfo
|
||||
|
||||
|
||||
####### Entrypoints for running different test types
|
||||
def run_single_image_test(
    *,
    tmp_path: PosixPath,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Build single-image inputs for a test case and run the core test.

    The image limit per prompt is fixed to 1; the remaining runner arguments
    come from the test case and from the model's non-parametrized kwargs.
    """
    size_wrapper = test_case.size_wrapper
    assert size_wrapper is not None

    single_image_inputs = builders.build_single_image_inputs_from_test_info(
        model_test_info, image_assets, size_wrapper, tmp_path
    )
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=single_image_inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs,
    )
|
||||
|
||||
|
||||
def run_multi_image_test(
    *,
    tmp_path: PosixPath,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Build multi-image inputs for a test case and run the core test.

    The image limit per prompt equals the number of image assets; the
    remaining runner arguments come from the test case and from the model's
    non-parametrized kwargs.
    """
    size_wrapper = test_case.size_wrapper
    assert size_wrapper is not None

    multi_image_inputs = builders.build_multi_image_inputs_from_test_info(
        model_test_info, image_assets, size_wrapper, tmp_path
    )
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=multi_image_inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": len(image_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs,
    )
|
||||
|
||||
|
||||
def run_embedding_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Build image + embedding inputs for a test case and run the core test.

    The image inputs are passed as ``inputs`` and the embedding inputs as
    ``vllm_embeddings``; the image limit per prompt is fixed to 1.
    """
    size_wrapper = test_case.size_wrapper
    assert size_wrapper is not None

    image_inputs, embedding_inputs = builders.build_embedding_inputs_from_test_info(
        model_test_info, image_assets, size_wrapper
    )
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=image_inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        vllm_embeddings=embedding_inputs,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs,
    )
|
||||
|
||||
|
||||
def run_video_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    video_assets: VideoTestAssets,
):
    """Build video inputs for one test case and run the comparison.

    Requires both ``test_case.size_wrapper`` and
    ``test_case.num_video_frames`` to be set.
    """
    size_wrapper = test_case.size_wrapper
    frame_count = test_case.num_video_frames
    assert size_wrapper is not None
    assert frame_count is not None

    video_inputs = builders.build_video_inputs_from_test_info(
        model_test_info,
        video_assets,
        size_wrapper,
        frame_count,
        test_case.needs_video_metadata,
    )
    # Allow as many videos per prompt as there are video assets.
    mm_limits = {"video": len(video_assets)}
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=video_inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt=mm_limits,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs,
    )
|
||||
|
||||
|
||||
def run_audio_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Build audio inputs for one test case and run the comparison.

    Unlike the image/video variants, no size wrapper is involved; the
    prompt limit is fixed at one audio item.
    """
    audio_inputs = builders.build_audio_inputs_from_test_info(
        model_test_info, audio_assets
    )
    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=audio_inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"audio": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs,
    )
|
||||
|
||||
|
||||
def run_custom_inputs_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
):
    """Run a test case whose inputs are supplied directly by the test.

    The case must carry ``custom_test_opts``, which wraps both the inputs
    and the ``limit_mm_per_prompt`` mapping; each of them must be set.
    """
    custom_opts = test_case.custom_test_opts
    assert custom_opts is not None

    custom_inputs = custom_opts.inputs
    mm_limits = custom_opts.limit_mm_per_prompt
    # Neither the inputs nor the multimodal limits may be left unset.
    assert custom_inputs is not None
    assert mm_limits is not None

    shared_kwargs = model_test_info.get_non_parametrized_runner_kwargs()

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=custom_inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt=mm_limits,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **shared_kwargs,
    )
|
||||
218
tests/models/multimodal/generation/vlm_utils/types.py
Normal file
218
tests/models/multimodal/generation/vlm_utils/types.py
Normal file
@@ -0,0 +1,218 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Types for writing multimodal model tests."""
|
||||
|
||||
from collections.abc import Callable, Iterable
|
||||
from enum import Enum
|
||||
from pathlib import PosixPath
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import torch
|
||||
from pytest import MarkDecorator
|
||||
from transformers import AutoModelForCausalLM
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
from .....conftest import (
|
||||
AUDIO_ASSETS,
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
ImageAsset,
|
||||
ImageTestAssets,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
PromptVideoInput,
|
||||
)
|
||||
from ....utils import check_logprobs_close
|
||||
|
||||
# Meta image tag; will be replaced by the appropriate tag for the model.
# NOTE(review): the video/audio placeholders presumably work the same way;
# confirm against the builders that consume them.
TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"

# Base prompts keyed by image asset name; each starts with the image
# placeholder.
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
        "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
    }
)
# Base prompts keyed by audio asset name; each starts with the audio
# placeholder.
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
    {
        "mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.",  # noqa: E501
        "winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?",  # noqa: E501
    }
)

# Single prompt that references two images via two image placeholders.
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n"  # noqa: E501
# Single prompt that references one video via the video placeholder.
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"


# Groups of image size factors; IMAGE_SIZE_FACTORS is the default for
# VLMTestInfo.image_size_factors.
IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
# NOTE(review): presumably the size-factor groups used for embedding tests;
# confirm against the test-case expansion code.
EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
# Type of one runner output, as consumed by the output post-processors.
# NOTE(review): looks like (token ids, text, logprobs) — confirm with runners.
RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
|
||||
|
||||
|
||||
class PromptWithMultiModalInput(NamedTuple):
    """Holds the multimodal input for a single test case."""

    # Text prompts for this case.
    prompts: list[str]
    # Optional per-modality data; each defaults to None when the modality
    # is not used.
    image_data: PromptImageInput | None = None
    video_data: PromptVideoInput | None = None
    audio_data: PromptAudioInput | None = None
|
||||
|
||||
|
||||
class VLMTestType(Enum):
    """Kinds of multimodal tests that a VLMTestInfo can request."""

    IMAGE = 1
    MULTI_IMAGE = 2
    EMBEDDING = 3
    VIDEO = 4
    AUDIO = 5
    CUSTOM_INPUTS = 6
|
||||
|
||||
|
||||
class SizeType(Enum):
    """Tags how the data of an ImageSizeWrapper should be interpreted."""

    SIZE_FACTOR = 1
    FIXED_SIZE = 2
|
||||
|
||||
|
||||
class CustomTestOptions(NamedTuple):
    """Directly provided inputs and multimodal limits for a custom test."""

    # Inputs that are used as-is instead of being built from assets.
    inputs: list[PromptWithMultiModalInput]
    # Mapping from modality name to the allowed item count per prompt.
    limit_mm_per_prompt: dict[str, int]
|
||||
|
||||
|
||||
class ImageSizeWrapper(NamedTuple):
    """Pairs size data with a tag saying whether it is factors or sizes."""

    type: SizeType
    # A size factor is a wrapper of 0+ floats,
    # while a fixed size contains an iterable of integer pairs
    data: Iterable[float] | Iterable[tuple[int, int]]
|
||||
|
||||
|
||||
class VLMTestInfo(NamedTuple):
    """Holds the configuration for 1+ tests for one model architecture."""

    models: list[str]
    test_type: VLMTestType | Iterable[VLMTestType]

    # Should be None only if this is a CUSTOM_INPUTS test
    prompt_formatter: Callable[[str], str] | None = None
    # Map an item index to the model-specific tag for that modality; the
    # defaults ignore the index and return a fixed tag.
    img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
    video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
    audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"

    # Most models work on the single / multi-image prompts above, but in some
    # cases the log prob check fails, e.g., for paligemma. We allow passing
    # an override for the single image prompts / multi-image prompt for this
    # reason.
    single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
    multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT

    # Function for converting ImageAssets to image embeddings;
    # We need to define this explicitly for embedding tests
    convert_assets_to_embeddings: (
        Callable[[ImageTestAssets], list[torch.Tensor]] | None
    ) = None

    # Exposed options for vLLM runner; we change these in several tests,
    # but the defaults are derived from VllmRunner & the engine defaults
    # These settings are chosen to avoid OOMs when running in the CI
    enforce_eager: bool = True
    max_model_len: int = 1024
    max_num_seqs: int = 256
    runner: RunnerOption = "auto"
    tensor_parallel_size: int = 1
    vllm_runner_kwargs: dict[str, Any] | None = None

    # Optional callable which gets a list of token IDs from the model tokenizer
    get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None = None
    # Optional list of strings to stop generation, useful when stop tokens are
    # not special tokens in the tokenizer
    stop_str: list[str] | None = None

    # Exposed options for HF runner
    hf_model_kwargs: dict[str, Any] | None = None
    # Indicates we should explicitly pass the EOS from the tokenizer
    use_tokenizer_eos: bool = False
    auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
    patch_hf_runner: Callable[[HfRunner], HfRunner] | None = None

    # Post processors that, if defined, will run on the outputs of the
    # vLLM and HF runner, respectively (useful for sanitization, etc).
    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None

    # Consumes the output of the callables above and checks if they're equal
    comparator: Callable[..., None] = check_logprobs_close

    # Default expandable params per test; these defaults can be overridden in
    # instances of this object; the complete set of test cases for the model
    # is all combinations of .models + all fields below
    max_tokens: int = 128
    num_logprobs: int = 5
    dtype: str = "auto"
    distributed_executor_backend: str | None = None
    # Only expanded in video tests
    num_video_frames: int | tuple[int] = 16
    needs_video_metadata: bool = False

    # Fixed image sizes / image size factors; most tests use image_size_factors
    # The values provided for these two fields will be stacked and expanded
    # such that each model will consider each image size factor / image size
    # once per test (much like concatenating and wrapping in one parametrize
    # call)
    image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
    image_sizes: Iterable[Iterable[tuple[int, int]]] | None = None

    # Hack for updating a prompt to take a local path into account; currently
    # only used for Qwen-VL, which requires encoding the image path / url into
    # the prompt for HF runner
    prompt_path_encoder: (
        Callable[[PosixPath, str, list[ImageAsset] | ImageTestAssets], str] | None
    ) = None  # noqa: E501

    # Allows configuring a test to run with custom inputs
    custom_test_opts: list[CustomTestOptions] | None = None

    # Optional pytest marks attached to this entry.
    # NOTE(review): how they are applied is decided outside this class.
    marks: list[MarkDecorator] | None = None

    def get_non_parametrized_runner_kwargs(self):
        """Return the runner kwargs that are shared by every test type.

        These are the fields that are NOT used when creating the
        parametrized test cases; the result is a plain dict keyed by the
        field name.
        """
        return {
            "enforce_eager": self.enforce_eager,
            "max_model_len": self.max_model_len,
            "max_num_seqs": self.max_num_seqs,
            "runner": self.runner,
            "tensor_parallel_size": self.tensor_parallel_size,
            "vllm_runner_kwargs": self.vllm_runner_kwargs,
            "hf_output_post_proc": self.hf_output_post_proc,
            "vllm_output_post_proc": self.vllm_output_post_proc,
            "auto_cls": self.auto_cls,
            "use_tokenizer_eos": self.use_tokenizer_eos,
            "comparator": self.comparator,
            "get_stop_token_ids": self.get_stop_token_ids,
            "hf_model_kwargs": self.hf_model_kwargs,
            "stop_str": self.stop_str,
            "patch_hf_runner": self.patch_hf_runner,
        }
|
||||
|
||||
|
||||
class ExpandableVLMTestArgs(NamedTuple):
    """The expanded kwargs which correspond to a single test case."""

    # Required for every test case.
    model: str
    max_tokens: int
    num_logprobs: int
    dtype: str
    distributed_executor_backend: str | None
    # Sizes are used for everything except for custom input tests
    size_wrapper: ImageSizeWrapper | None = None
    # Video only
    num_video_frames: int | None = None
    needs_video_metadata: bool = False
    # Custom inputs only
    custom_test_opts: CustomTestOptions | None = None
|
||||
0
tests/models/multimodal/pooling/__init__.py
Normal file
0
tests/models/multimodal/pooling/__init__.py
Normal file
24
tests/models/multimodal/pooling/conftest.py
Normal file
24
tests/models/multimodal/pooling/conftest.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM pooling tests."""
|
||||
|
||||
import os
|
||||
import warnings
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
    """Select the FLEX_ATTENTION backend when SigLIP tests run on ROCm.

    Does nothing on other platforms, or when no collected item has
    "test_siglip" in its node id. Otherwise sets the
    ``VLLM_ATTENTION_BACKEND`` environment variable and emits a warning.
    """
    if not current_platform.is_rocm():
        return

    has_siglip_test = any("test_siglip" in item.nodeid for item in items)
    if not has_siglip_test:
        return

    os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
    warnings.warn(
        "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
        UserWarning,
        stacklevel=1,
    )
|
||||
139
tests/models/multimodal/pooling/test_clip.py
Normal file
139
tests/models/multimodal/pooling/test_clip.py
Normal file
@@ -0,0 +1,139 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from transformers import CLIPModel
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# Text-only prompts embedded by the text test.
HF_TEXT_PROMPTS = [
    "a photo of a stop sign",
    "a photo of a cherry blossom",
]

# Image prompts keyed by asset name; the text part is intentionally empty.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": "",
        "cherry_blossom": "",
    }
)

# Model checkpoints every test in this module is parametrized over.
MODELS = ["openai/clip-vit-base-patch32"]
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Embed the inputs with vLLM and with HF CLIP and compare the results.

    For HF, inputs that carry ``pixel_values`` go through
    ``get_image_features``; all others go through ``get_text_features``.
    """
    # NOTE: the order matters: vLLM runs first, HF second.
    # vLLM needs a fresh process without CUDA initialization; running HF
    # first would initialize CUDA and hurt the multiprocessing backend
    # with the fork method (the default method).
    with vllm_runner(
        model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    hf_outputs = []
    with hf_runner(model, dtype=dtype, auto_cls=CLIPModel) as hf_model:
        clip = hf_model.model

        for raw_inputs in hf_model.get_inputs(input_texts, images=input_images):
            batch = hf_model.wrap_device(raw_inputs)

            if "pixel_values" not in batch:
                features = clip.get_text_features(
                    input_ids=batch.input_ids,
                    attention_mask=batch.attention_mask,
                )
            else:
                features = clip.get_image_features(
                    pixel_values=batch.pixel_values,
                )

            hf_outputs.append(features.squeeze(0).tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for text-only prompts."""
    input_texts = list(HF_TEXT_PROMPTS)
    # Text-only: every prompt is paired with "no image".
    input_images = [None for _ in input_texts]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for image prompts."""
    # Pair each image prompt with its asset (stops at the shorter sequence).
    prompt_asset_pairs = list(zip(HF_IMAGE_PROMPTS, image_assets))
    input_texts = [prompt for prompt, _ in prompt_asset_pairs]
    input_images = [asset.pil_image for _, asset in prompt_asset_pairs]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text_image_no_crash(
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Check that a text+image request is rejected without breaking the model.

    Passing both a non-empty text and an image must raise a ValueError
    matching "not both"; text-only and image-only requests issued afterwards
    must still go through.
    """
    first_text = HF_TEXT_PROMPTS[0]
    first_image = image_assets[0].pil_image
    texts = [first_text]
    images = [first_image]

    runner_ctx = vllm_runner(
        model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
    )
    with runner_ctx as vllm_model:
        with pytest.raises(ValueError, match="not both"):
            vllm_model.embed(texts, images=images)

        # Should still be able to run subsequent requests
        vllm_model.embed(texts)
        vllm_model.embed([""], images=images)
|
||||
215
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
Normal file
215
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
Normal file
@@ -0,0 +1,215 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from PIL import Image
|
||||
from transformers import Qwen2VLForConditionalGeneration
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# Text queries, each paired with a blank 56x56 RGB placeholder image.
HF_TEXT_PROMPTS = [
    # T -> X
    (
        "Query: Find me an everyday image that matches the given caption: The label of the object is stop sign",  # noqa: E501
        Image.new("RGB", (56, 56)),
    ),
    # T -> X
    (
        "Query: Retrieve an image of this caption: cherry blossom",
        Image.new("RGB", (56, 56)),
    ),
]

# Image prompts keyed by asset name; both assets share the same question.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": "What is shown in this image?",
        "cherry_blossom": "What is shown in this image?",
    }
)

# Model checkpoints every test in this module is parametrized over.
MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
|
||||
|
||||
|
||||
def get_messages(image: Image.Image, text: str, embed_text: bool):
    """Build a single-turn user message holding one image part and one text part.

    When ``embed_text`` is true, ``image`` is ignored and a blank 56x56
    dummy image with ``resized_height``/``resized_width`` of 1 takes its
    place; otherwise ``image`` itself is used.
    """
    if embed_text:
        # A dummy image is needed here for an easier process.
        image_part = {
            "type": "image",
            "image": Image.new("RGB", (56, 56)),
            "resized_height": 1,
            "resized_width": 1,
        }
    else:
        image_part = {"type": "image", "image": image}

    text_part = {"type": "text", "text": text}
    return [
        {
            "role": "user",
            "content": [image_part, text_part],
        }
    ]
|
||||
|
||||
|
||||
def apply_chat_template_and_add_eos(
    messages: list[dict],
    apply_chat_template_fn: Callable,
):
    """Render ``messages`` with the given chat template and append the EOS text.

    The template function is called with ``tokenize=False`` and
    ``add_generation_prompt=True``; "<|endoftext|>" is appended to its
    result.
    """
    eos_text = "<|endoftext|>"
    rendered = apply_chat_template_fn(
        messages, tokenize=False, add_generation_prompt=True
    )
    return rendered + eos_text
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    embed_texts: list[bool],
    model: str,
    *,
    dtype: str,
) -> None:
    """Embed the inputs with vLLM and with HF Qwen2-VL and compare the results.

    The HF embedding is the L2-normalized last hidden state taken at the
    final position of the first sequence.
    """
    # NOTE: the order matters: vLLM runs first, HF second.
    # vLLM needs a fresh process without CUDA initialization; running HF
    # first would initialize CUDA and hurt the multiprocessing backend
    # with the fork method (the default method).
    with vllm_runner(
        model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=8192
    ) as vllm_model:
        chat_template_fn = vllm_model.llm.get_tokenizer().apply_chat_template

        # vllm_model.embed applies no templating to the prompt, so the
        # prompt would lack an image_pad token unless the chat template
        # inserts one beforehand. vLLM later replaces the pad token with
        # the actual image, which may be a placeholder image.
        vllm_prompts = []
        for text, image in zip(input_texts, input_images):
            vllm_messages = get_messages(image, text, False)
            vllm_prompts.append(
                apply_chat_template_and_add_eos(
                    vllm_messages,
                    apply_chat_template_fn=chat_template_fn,
                )
            )
        vllm_outputs = vllm_model.embed(vllm_prompts, images=input_images)

    with hf_runner(
        model, dtype=dtype, auto_cls=Qwen2VLForConditionalGeneration
    ) as hf_model:
        # dse requires non-standard input processing
        # because it needs an image_pad token
        hf_prompts = [
            apply_chat_template_and_add_eos(
                get_messages(image, text, embed_text),
                hf_model.processor.apply_chat_template,
            )
            for text, image, embed_text in zip(input_texts, input_images, embed_texts)
        ]

        all_inputs = hf_model.get_inputs(
            prompts=hf_prompts,
            images=input_images,
        )

        hf_outputs = []
        with torch.no_grad():
            for raw_inputs in all_inputs:
                prepared = hf_model.model.prepare_inputs_for_generation(
                    **raw_inputs,
                    cache_position=torch.arange(1),  # 1 for batch size
                    use_cache=False,
                )
                model_out = hf_model.model(
                    **hf_model.wrap_device(prepared),
                    return_dict=True,
                    output_hidden_states=True,
                )
                last_token_state = model_out.hidden_states[-1][0, -1]
                embedding = F.normalize(last_token_state, p=2, dim=-1)

                hf_outputs.append(embedding.tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for text queries with placeholder images."""
    input_texts = [text for text, _ in HF_TEXT_PROMPTS]
    input_images = [placeholder for _, placeholder in HF_TEXT_PROMPTS]
    # Every input in this test is embedded as text.
    embed_texts = [True for _ in input_texts]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        embed_texts,
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for real image inputs."""
    # Pair each image prompt with its asset (stops at the shorter sequence).
    prompt_asset_pairs = list(zip(HF_IMAGE_PROMPTS, image_assets))
    input_texts = [prompt for prompt, _ in prompt_asset_pairs]
    input_images = [asset.pil_image for _, asset in prompt_asset_pairs]
    # None of the inputs in this test is embedded as text.
    embed_texts = [False for _ in input_texts]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        embed_texts,
        model,
        dtype=dtype,
    )
|
||||
81
tests/models/multimodal/pooling/test_intern_vit.py
Normal file
81
tests/models/multimodal/pooling/test_intern_vit.py
Normal file
@@ -0,0 +1,81 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
|
||||
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
|
||||
# We use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner.
# These glob patterns are passed as allow_patterns to that download.
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
|
||||
|
||||
|
||||
@torch.inference_mode()
def run_intern_vit_test(
    image_assets: ImageTestAssets,
    model_id: str,
    *,
    dtype: str,
):
    """Compare vLLM's InternVisionModel against the HF implementation.

    Each image asset is processed on its own. The HF model's
    ``last_hidden_state`` and the vLLM model's output must have a mean
    cosine similarity above 0.99 for every image.

    Args:
        image_assets: Assets whose ``pil_image`` is fed to both models.
        model_id: Hub id of the checkpoint to download.
        dtype: Key into ``STR_DTYPE_TO_TORCH_DTYPE``.
    """
    model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    img_processor = CLIPImageProcessor.from_pretrained(model)
    images = [asset.pil_image for asset in image_assets]
    # One tensor per image; the loop variable must not reuse the name of
    # the list being iterated.
    pixel_values = [
        img_processor(image, return_tensors="pt").pixel_values.to(torch_dtype)
        for image in images
    ]

    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    # Fall back to RMS norm when the config has no (truthy) norm_type.
    if not getattr(config, "norm_type", None):
        config.norm_type = "rms_norm"

    hf_model = AutoModel.from_pretrained(
        model, dtype=torch_dtype, trust_remote_code=True
    ).to("cuda")
    hf_outputs_per_image = [
        hf_model(pixel_value.to("cuda")).last_hidden_state
        for pixel_value in pixel_values
    ]

    from vllm.model_executor.models.intern_vit import InternVisionModel

    vllm_model = InternVisionModel(config)
    vllm_model.load_weights(hf_model.state_dict().items())

    # Release the HF model before the vLLM model is moved to the GPU.
    del hf_model
    cleanup_dist_env_and_memory()

    vllm_model = vllm_model.to("cuda", torch_dtype)
    vllm_outputs_per_image = [
        vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
    ]
    del vllm_model
    cleanup_dist_env_and_memory()

    cos_similar = nn.CosineSimilarity(dim=-1)
    for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
        assert cos_similar(vllm_output, hf_output).mean() > 0.99
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_id",
    [
        "OpenGVLab/InternViT-300M-448px",
        "OpenGVLab/InternViT-6B-448px-V1-5",
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
    """Run the InternViT HF-vs-vLLM comparison for each checkpoint."""
    run_intern_vit_test(image_assets, model_id, dtype=dtype)
|
||||
194
tests/models/multimodal/pooling/test_jinavl_reranker.py
Normal file
194
tests/models/multimodal/pooling/test_jinavl_reranker.py
Normal file
@@ -0,0 +1,194 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModel
|
||||
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionContentPartImageParam
|
||||
from vllm.entrypoints.score_utils import ScoreMultiModalParam
|
||||
|
||||
from ....conftest import HfRunner, VllmRunner
|
||||
|
||||
# Checkpoint every test in this module is parametrized over.
model_name = "jinaai/jina-reranker-m0"

# Passed to the vLLM runner as mm_processor_kwargs.
mm_processor_kwargs = {
    "min_pixels": 3136,
    "max_pixels": 602112,
}

# Passed to the vLLM runner as limit_mm_per_prompt.
limit_mm_per_prompt = {"image": 2}
|
||||
|
||||
|
||||
def vllm_reranker(
    vllm_runner: type[VllmRunner],
    model_name: str,
    dtype: str,
    query_strs: list[str],
    document_strs: list[str],
    query_type: str = "text",
    doc_type: str = "text",
):
    """Score queries against documents with vLLM and return the scores.

    Args:
        vllm_runner: Runner class used as a context manager.
        model_name: Model to load.
        dtype: Model dtype passed to the runner.
        query_strs: Query texts, or image URLs when ``query_type`` is
            "image".
        document_strs: Document texts, or image URLs when ``doc_type`` is
            "image".
        query_type: Either "text" or "image".
        doc_type: Either "text" or "image".

    Returns:
        One score per output of ``llm.score``.

    Raises:
        ValueError: If ``query_type`` or ``doc_type`` is neither "text" nor
            "image". This is checked before the model is loaded.
    """

    def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
        return {"type": "image_url", "image_url": {"url": f"{url}"}}

    def build_score_input(
        values: list[str], kind: str, arg_name: str
    ) -> list[str] | ScoreMultiModalParam:
        # Text is passed through; image URLs are wrapped as image parts.
        if kind == "text":
            return values
        if kind == "image":
            return ScoreMultiModalParam(
                content=[create_image_param(url) for url in values]
            )
        raise ValueError(
            f"Unsupported {arg_name} {kind!r}; expected 'text' or 'image'"
        )

    # Validate and build both inputs up front so that an unsupported type
    # fails fast instead of surfacing as an unbound local after model load.
    query = build_score_input(query_strs, query_type, "query_type")
    documents = build_score_input(document_strs, doc_type, "doc_type")

    with vllm_runner(
        model_name,
        runner="pooling",
        dtype=dtype,
        max_num_seqs=2,
        max_model_len=2048,
        mm_processor_kwargs=mm_processor_kwargs,
        limit_mm_per_prompt=limit_mm_per_prompt,
    ) as vllm_model:
        outputs = vllm_model.llm.score(query, documents)

    return [output.outputs.score for output in outputs]
|
||||
|
||||
|
||||
def hf_reranker(
    hf_runner: type[HfRunner],
    model_name: str,
    dtype: str,
    query_strs: list[str],
    document_strs: list[str],
    query_type: str = "text",
    doc_type: str = "text",
):
    """Score the first query against every document with the HF model.

    Only ``query_strs[0]`` is used; it is paired with each entry of
    ``document_strs`` and the pairs are passed to the model's
    ``compute_score``.
    """
    # Prefix remapping handed to the HF loader as ``key_mapping``.
    key_mapping = {
        "visual.": "model.visual.",
        "model.": "model.language_model.",
    }

    pairs = []
    for document in document_strs:
        pairs.append([query_strs[0], document])

    runner_ctx = hf_runner(
        model_name,
        dtype=dtype,
        trust_remote_code=True,
        auto_cls=AutoModel,
        model_kwargs={"key_mapping": key_mapping},
    )
    with runner_ctx as hf_model:
        scores = hf_model.model.compute_score(
            pairs, max_length=2048, query_type=query_type, doc_type=doc_type
        )
        return scores
|
||||
|
||||
|
||||
# Visual Documents Reranking
|
||||
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
    """Rerank two image documents for a text query; HF and vLLM must agree."""
    query = ["slm markdown"]
    documents = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
    ]

    hf_outputs = hf_reranker(
        hf_runner, model_name, dtype, query, documents, "text", "image"
    )
    vllm_outputs = vllm_reranker(
        vllm_runner, model_name, dtype, query, documents, "text", "image"
    )

    # Both scores must match within a 2% relative tolerance.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
|
||||
|
||||
|
||||
# Textual Documents Reranking
|
||||
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
    """Rerank two text documents for a text query; HF and vLLM must agree."""
    query = ["slm markdown"]
    documents = [
        """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""",  # noqa: E501
        "数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
    ]
    hf_outputs = hf_reranker(
        hf_runner, model_name, dtype, query, documents, "text", "text"
    )
    vllm_outputs = vllm_reranker(
        vllm_runner, model_name, dtype, query, documents, "text", "text"
    )

    # Both scores must match within a 2% relative tolerance.
    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
|
||||
|
||||
# Image Querying for Textual Documents
|
||||
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
    """Image query against text documents: HF and vLLM scores must agree."""
    query = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
    ]
    documents = [
        """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""",  # noqa: E501
        "数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
    ]

    # Both backends receive exactly the same trailing arguments.
    shared_args = (model_name, dtype, query, documents, "image", "text")
    hf_scores = hf_reranker(hf_runner, *shared_args)
    vllm_scores = vllm_reranker(vllm_runner, *shared_args)

    # Compare the score of each of the two documents within 2% relative error.
    for position in (0, 1):
        assert hf_scores[position] == pytest.approx(vllm_scores[position], rel=0.02)
|
||||
|
||||
|
||||
# Image Querying for Image Documents
|
||||
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
    """Image query against image documents: HF and vLLM scores must agree."""
    query = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
    ]
    documents = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
    ]

    # Both backends receive exactly the same trailing arguments.
    shared_args = (model_name, dtype, query, documents, "image", "image")
    hf_scores = hf_reranker(hf_runner, *shared_args)
    vllm_scores = vllm_reranker(vllm_runner, *shared_args)

    # Compare the score of each of the two documents within 2% relative error.
    for position in (0, 1):
        assert hf_scores[position] == pytest.approx(vllm_scores[position], rel=0.02)
|
||||
159
tests/models/multimodal/pooling/test_llava_next.py
Normal file
159
tests/models/multimodal/pooling/test_llava_next.py
Normal file
@@ -0,0 +1,159 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch.nn.functional as F
|
||||
from transformers import AutoModelForImageTextToText
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# Llava Next embedding implementation is only supported by CUDA.
|
||||
# If run on ROCm, hf_model.model.resize_token_embeddings will
|
||||
# cause the following error:
|
||||
# RuntimeError: Calling torch.linalg.cholesky on a CUDA tensor
|
||||
# requires compiling PyTorch with MAGMA. Please use PyTorch
|
||||
# built with MAGMA support.
|
||||
# If run on CPU, hf_model.model.resize_token_embeddings will
|
||||
# cause the following error:
|
||||
# RuntimeError: Calling torch.linalg.cholesky on a CPU tensor
|
||||
# requires compiling PyTorch with LAPACK. Please use PyTorch
|
||||
# built with LAPACK support.
|
||||
# Skip every test in this module unless the current platform is CUDA.
pytestmark = pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="Llava Next model uses op that is only supported in CUDA",
)

# Chat template with a single "{}" slot that receives the user message.
llama3_template = (
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n \n"
)

# T -> X: text-only prompts, each rendered through the chat template.
HF_TEXT_PROMPTS = [
    llama3_template.format(user_text)
    for user_text in (
        (
            "The label of the object is stop sign\n"
            "Summary above sentence in one word: "
        ),
        "cherry blossom\nSummary above sentence in one word: ",
    )
]

# I -> X: both image assets use the same rendered prompt.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    dict.fromkeys(
        ("stop_sign", "cherry_blossom"),
        llama3_template.format("<image>\nSummary above image in one word: "),
    )
)

MODELS = ["royokong/e5-v"]
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Embed the inputs with vLLM and with HF, then compare the embeddings.

    Args:
        hf_runner: Runner class used to load the HF reference model.
        vllm_runner: Runner class used to load the vLLM model.
        input_texts: Prompts to embed.
        input_images: Images passed alongside the prompts.
        model: Model identifier given to both runners.
        dtype: Dtype string given to both runners.
    """
    # NOTE: order matters -- vLLM runs first, HF second.
    # vLLM needs a fresh process without CUDA initialised; running HF first
    # would initialise CUDA and hurt the multiprocessing backend when it
    # uses the fork method (the default method).
    vllm_kwargs = dict(
        runner="pooling", dtype=dtype, max_model_len=4096, enforce_eager=True
    )
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    with hf_runner(
        model, dtype=dtype, auto_cls=AutoModelForImageTextToText
    ) as hf_model:
        # Patch the issue where generation_config.json is missing
        hf_model.processor.patch_size = hf_model.model.config.vision_config.patch_size

        # Patch the issue where image_token_id
        # exceeds the maximum allowed vocab size
        enlarged_vocab = hf_model.model.language_model.vocab_size + 1
        hf_model.model.resize_token_embeddings(enlarged_vocab)

        def embed_one(inputs):
            # Based on: https://huggingface.co/royokong/e5-v
            result = hf_model.model(
                **hf_model.wrap_device(inputs),
                return_dict=True,
                output_hidden_states=True,
            )
            # Index [0, -1, :] of the last hidden-states entry; presumably
            # the final token of the first sequence -- TODO confirm.
            pooled = F.normalize(result.hidden_states[-1][0, -1, :], dim=-1)
            return pooled.tolist()

        hf_outputs = [
            embed_one(inputs)
            for inputs in hf_model.get_inputs(input_texts, images=input_images)
        ]

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the text-only prompts."""
    input_texts = list(HF_TEXT_PROMPTS)
    # Text-only case: every prompt is paired with "no image".
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the image prompts."""
    input_texts = []
    input_images = []
    # Pair each image prompt with the PIL image of the matching asset.
    for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets):
        input_texts.append(prompt)
        input_images.append(asset.pil_image)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
|
||||
142
tests/models/multimodal/pooling/test_phi3v.py
Normal file
142
tests/models/multimodal/pooling/test_phi3v.py
Normal file
@@ -0,0 +1,142 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch.nn.functional as F
|
||||
from PIL import Image
|
||||
|
||||
from vllm.assets.base import get_vllm_public_assets
|
||||
from vllm.assets.image import VLM_IMAGES_DIR
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# T -> X: text-only prompts.
HF_TEXT_PROMPTS = [
    (
        "Find me an everyday image that matches the given caption: "
        "The label of the object is stop sign"
    ),
    "Retrieve an image of this caption: cherry blossom",
]

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        # T + I -> X
        "stop_sign": (
            "<|image_1|> Select the portion of the image that isolates the "
            "object of the given label: The label of the object is stop sign"
        ),
        # I -> X
        "cherry_blossom": (
            "<|image_1|> Represent the given image for classification"
        ),
    }
)

MODELS = ["TIGER-Lab/VLM2Vec-Full"]
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Embed the inputs with vLLM and with HF, then compare the embeddings.

    Args:
        hf_runner: Runner class used to load the HF reference model.
        vllm_runner: Runner class used to load the vLLM model.
        input_texts: Prompts to embed.
        input_images: Images passed alongside the prompts.
        model: Model identifier given to both runners.
        dtype: Dtype string given to both runners.
    """
    # NOTE: order matters -- vLLM runs first, HF second.
    # vLLM needs a fresh process without CUDA initialised; running HF first
    # would initialise CUDA and hurt the multiprocessing backend when it
    # uses the fork method (the default method).
    vllm_kwargs = dict(runner="pooling", dtype=dtype, enforce_eager=True)
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
    with hf_runner(
        model, dtype=dtype, model_kwargs={"_attn_implementation": "eager"}
    ) as hf_model:
        hf_outputs = []
        for inputs in hf_model.get_inputs(input_texts, images=input_images):
            # Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
            result = hf_model.model(
                **hf_model.wrap_device(inputs),
                return_dict=True,
                output_hidden_states=True,
            )
            # Pick the row at index (sum of attention_mask[0]) - 1 from the
            # first item of the last hidden-states entry.
            final_index = inputs.attention_mask[0].sum() - 1
            representation = result.hidden_states[-1][0][final_index]
            hf_outputs.append(F.normalize(representation, p=2, dim=-1).tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the text-only prompts."""
    input_texts = list(HF_TEXT_PROMPTS)
    # Text-only case: every prompt is paired with "no image".
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for image prompts plus a special-token case."""
    input_texts = []
    input_images = []
    # Pair each image prompt with the PIL image of the matching asset.
    for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets):
        input_texts.append(prompt)
        input_images.append(asset.pil_image)

    # add cases for special_tokens
    special_token_prompt = (
        "\n<s><|user|>\n <|image_1|>\n\t <s>"
        "Represent the given image for classification<|end|>"
        "\n<|assistant|>\n"
    )
    cherry_blossom_path = get_vllm_public_assets(
        filename="cherry_blossom.jpg", s3_prefix=VLM_IMAGES_DIR
    )
    input_texts.append(special_token_prompt)
    input_images.append(Image.open(cherry_blossom_path))

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
|
||||
60
tests/models/multimodal/pooling/test_prithvi_mae.py
Normal file
60
tests/models/multimodal/pooling/test_prithvi_mae.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
|
||||
|
||||
def generate_test_mm_data():
    """Build a dict of constant float16 tensors used as multimodal input.

    Returns:
        A dict with ``"pixel_values"`` of shape (6, 512, 512) and
        ``"location_coords"`` of shape (1, 2), both filled with 1.0.
    """
    shapes = {
        "pixel_values": (6, 512, 512),
        "location_coords": (1, 2),
    }
    return {
        key: torch.full(shape, 1.0, dtype=torch.float16)
        for key, shape in shapes.items()
    }
|
||||
|
||||
|
||||
def _run_test(
    vllm_runner: type[VllmRunner],
    model: str,
) -> None:
    """Run ``encode`` with the "plugin" pooling task on ten identical prompts.

    Args:
        vllm_runner: Runner class used to load the vLLM model.
        model: Model identifier given to the runner.
    """
    prompts = []
    for _ in range(10):
        prompts.append(
            {
                # This model deals with no text input
                "prompt_token_ids": [1],
                "multi_modal_data": generate_test_mm_data(),
            }
        )

    engine_kwargs = dict(
        runner="pooling",
        dtype="half",
        enforce_eager=True,
        skip_tokenizer_init=True,
        enable_mm_embeds=True,
        # Limit the maximum number of sequences to avoid the
        # test going OOM during the warmup run
        max_num_seqs=32,
        default_torch_num_threads=1,
    )
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        vllm_model.llm.encode(prompts, pooling_task="plugin")
|
||||
|
||||
|
||||
MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]


@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
) -> None:
    """Run the vLLM-only check for each model in ``MODELS``."""
    # Only the vLLM runner is used; there is no HF comparison here.
    _run_test(vllm_runner, model)
|
||||
98
tests/models/multimodal/pooling/test_radio.py
Normal file
98
tests/models/multimodal/pooling/test_radio.py
Normal file
@@ -0,0 +1,98 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
|
||||
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.model_executor.models.radio import RadioModel
|
||||
from vllm.transformers_utils.configs.radio import RadioConfig
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
|
||||
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]


@torch.inference_mode()
def run_radio_test(
    image_assets: ImageTestAssets,
    model_id: str,
    *,
    dtype: str,
):
    """Check that vLLM's ``RadioModel`` matches the HF model's features.

    Args:
        image_assets: Assets whose ``pil_image`` is fed to both models.
        model_id: Model identifier to download and load.
        dtype: Key into ``STR_DTYPE_TO_TORCH_DTYPE`` selecting the dtype.

    Raises:
        AssertionError: If the mean cosine similarity between the vLLM and
            HF outputs is not above 0.99 for some image.
    """
    snapshot_dir = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    img_processor = CLIPImageProcessor.from_pretrained(snapshot_dir)
    images = [asset.pil_image for asset in image_assets]
    # Input resolution must be a multiple of `self.min_resolution_step`.
    # Using `self.get_nearest_supported_resolution`, for assets 432x642 the
    # nearest supported resolution is 432x640.
    pixel_values = []
    for image in images:
        processed = img_processor(image, return_tensors="pt").pixel_values
        pixel_values.append(processed.to(torch_dtype)[:, :, :, :640])

    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

    # RADIO model on HF does not properly handle torch_dtype argument
    # And relies on args["dtype"] which we have to patch manually:
    config.args["dtype"] = torch_dtype

    hf_model = AutoModel.from_pretrained(
        model_id,
        config=config,
        dtype=torch_dtype,
        trust_remote_code=True,
    ).to("cuda")
    hf_model.eval()

    # A HF model has image normalization as a part of model's forward
    # However in vLLM we don't make normalization a part of the model
    # forward step since mean/std stored as model's parameters and
    # subject to precision loss (when using fp16/bf16) which negatively
    # affects evaluation benchmarks.
    hf_model.make_preprocessor_external()

    hf_outputs_per_image = []
    for pixel_value in pixel_values:
        hf_outputs_per_image.append(hf_model(pixel_value.to("cuda")).features)

    radio_config = RadioConfig(
        model_name=config.args["model"],
        reg_tokens=config.args["register_multiple"],
    )
    vllm_model = RadioModel(radio_config)
    vllm_model.load_weights(hf_model.state_dict())
    vllm_model = vllm_model.to("cuda", torch_dtype)

    vllm_outputs_per_image = []
    for pixel_value in pixel_values:
        vllm_outputs_per_image.append(
            vllm_model(pixel_values=pixel_value.to("cuda"))
        )

    # Drop both models before cleanup; the comparison needs only the outputs.
    del vllm_model, hf_model
    cleanup_dist_env_and_memory()

    cosine = nn.CosineSimilarity(dim=-1)
    for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
        similarity = cosine(vllm_output, hf_output).mean()
        assert similarity > 0.99
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["nvidia/C-RADIOv2-H"])
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
    """Run the RADIO HF-vs-vLLM comparison for each model and dtype."""
    run_radio_test(image_assets, model_id, dtype=dtype)
|
||||
162
tests/models/multimodal/pooling/test_siglip.py
Normal file
162
tests/models/multimodal/pooling/test_siglip.py
Normal file
@@ -0,0 +1,162 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from transformers import SiglipModel
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# Text-only prompts, one per image asset subject.
HF_TEXT_PROMPTS = [
    f"a photo of a {subject}" for subject in ("stop sign", "cherry blossom")
]

# Image prompts carry an empty text for both assets.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    dict.fromkeys(("stop_sign", "cherry_blossom"), "")
)

MODELS = [
    "google/siglip-base-patch16-224",
    "google/siglip2-base-patch16-224",
    # Different image embedding dim than text_config.hidden_size
    "google/siglip2-giant-opt-patch16-384",
]
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
    tokenization_kwargs: dict[str, Any] | None = None,
) -> None:
    """Embed the inputs with vLLM and with HF SigLIP, then compare them.

    Args:
        hf_runner: Runner class used to load the HF reference model.
        vllm_runner: Runner class used to load the vLLM model.
        input_texts: Prompts to embed.
        input_images: Images passed alongside the prompts.
        model: Model identifier given to both runners.
        dtype: Dtype string given to both runners.
        tokenization_kwargs: Extra tokenizer options given to both backends;
            ``None`` is treated as an empty dict.
    """
    tokenization_kwargs = {} if tokenization_kwargs is None else tokenization_kwargs

    vllm_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        enforce_eager=True,
        max_model_len=64,
        gpu_memory_utilization=0.7,
    )
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_outputs = vllm_model.embed(
            input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
        )

    with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model:
        hf_outputs = []
        for raw_inputs in hf_model.get_inputs(
            input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
        ):
            device_inputs = hf_model.wrap_device(raw_inputs)

            # Inputs without pixel values go through the text tower,
            # everything else through the image tower.
            if "pixel_values" not in device_inputs:
                features = hf_model.model.get_text_features(
                    input_ids=device_inputs.input_ids,
                )
            else:
                features = hf_model.model.get_image_features(
                    pixel_values=device_inputs.pixel_values,
                )

            hf_outputs.append(features.squeeze(0).tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the text-only prompts."""
    input_texts = list(HF_TEXT_PROMPTS)
    # Text-only case: every prompt is paired with "no image".
    input_images = [None] * len(input_texts)

    # siglip2 was trained with this padding setting.
    padding_kwargs = {
        "padding": "max_length",
        "max_length": 64,
    }

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
        tokenization_kwargs=padding_kwargs,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the image prompts."""
    input_texts = []
    input_images = []
    # Pair each image prompt with the PIL image of the matching asset.
    for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets):
        input_texts.append(prompt)
        input_images.append(asset.pil_image)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text_image_no_crash(
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Text plus image must raise; text-only and image-only must still work."""
    first_text = [HF_TEXT_PROMPTS[0]]
    first_image = [image_assets[0].pil_image]

    vllm_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        enforce_eager=True,
        max_model_len=64,
        gpu_memory_utilization=0.7,
    )
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        # A non-empty text together with an image is rejected.
        with pytest.raises(ValueError, match="not both"):
            vllm_model.embed(first_text, images=first_image)

        # The same runner is then reused for each modality on its own.
        vllm_model.embed(first_text)
        vllm_model.embed([""], images=first_image)
|
||||
0
tests/models/multimodal/processing/__init__.py
Normal file
0
tests/models/multimodal/processing/__init__.py
Normal file
125
tests/models/multimodal/processing/test_audioflamingo3.py
Normal file
125
tests/models/multimodal/processing/test_audioflamingo3.py
Normal file
@@ -0,0 +1,125 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2025 The vLLM team.
|
||||
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
|
||||
# reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from tests.models.registry import HF_EXAMPLE_MODELS
|
||||
|
||||
|
||||
class MockAudioFlamingo3Config(PretrainedConfig):
    """Minimal config stand-in with empty audio and text sub-configs."""

    model_type = "audioflamingo3"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Each sub-config gets its own default PretrainedConfig instance.
        for sub_config_name in ("audio_config", "text_config"):
            setattr(self, sub_config_name, PretrainedConfig())
|
||||
|
||||
|
||||
class MockAudioFlamingo3Processor:
    """Processor stand-in that returns a fixed output for any call."""

    def __init__(self):
        self.audio_token = "<sound>"
        self.audio_token_id = 12345
        self.feature_extractor = MockFeatureExtractor()

    def __call__(self, text=None, audios=None, **kwargs):
        # The arguments are ignored; the result is always the same shape:
        # three token ids and one all-zero (3000, 80) feature array.
        zero_features = np.zeros((3000, 80))
        return {
            "input_ids": [1, 2, 3],
            "input_features": [zero_features],
        }
|
||||
|
||||
|
||||
class MockFeatureExtractor:
    """Feature-extractor stand-in exposing two fixed attributes."""

    def __init__(self):
        self.sampling_rate, self.chunk_length = 16000, 30
|
||||
|
||||
|
||||
@pytest.fixture
def mock_ctx():
    """Return a ``MagicMock`` context wired to the mock config and processor."""
    hf_config = MockAudioFlamingo3Config()

    context = MagicMock()
    # The same config object is reachable through both access paths.
    context.model_config.hf_config = hf_config
    context.get_hf_config.return_value = hf_config
    context.get_hf_processor.return_value = MockAudioFlamingo3Processor()
    return context
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def check_transformers_version():
    """Skip when the installed transformers version fails the model's check."""
    # Check if the model is supported by the current transformers version
    architecture = "AudioFlamingo3ForConditionalGeneration"
    HF_EXAMPLE_MODELS.get_hf_info(architecture).check_transformers_version(
        on_fail="skip"
    )
|
||||
|
||||
|
||||
def test_audio_chunk_counting(mock_ctx):
    """A 30 s clip must count as one chunk and a 45 s clip as two."""
    from vllm.model_executor.models.audioflamingo3 import (
        AudioFlamingo3DummyInputsBuilder,
        AudioFlamingo3MultiModalProcessor,
        AudioFlamingo3ProcessingInfo,
    )
    from vllm.multimodal.processing import BaseMultiModalProcessor

    info = AudioFlamingo3ProcessingInfo(mock_ctx)
    processor = AudioFlamingo3MultiModalProcessor(
        info, AudioFlamingo3DummyInputsBuilder(info)
    )

    sample_rate = 16000
    clip_seconds = (30, 45)
    mm_data = {"audio": [np.zeros(seconds * sample_rate) for seconds in clip_seconds]}
    prompt = "<|user|>Listen.<|end|>"

    def fake_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
        # Replaces the base-class implementation with a fixed output.
        return {"input_ids": [1, 2, 3], "input_features": torch.randn(1, 80, 3000)}

    with pytest.MonkeyPatch.context() as patcher:
        patcher.setattr(BaseMultiModalProcessor, "_call_hf_processor", fake_base_call)
        processed = processor._call_hf_processor(prompt, mm_data, {}, {})

    chunk_counts = processed["chunk_counts"]

    for position, expected_chunks in enumerate((1, 2)):
        assert chunk_counts[position].item() == expected_chunks
    assert len(chunk_counts) == 2
|
||||
|
||||
|
||||
def test_dummy_data_generation(mock_ctx):
    """Dummy data must hold two audio clips, the first 600 * 16000 long."""
    from vllm.model_executor.models.audioflamingo3 import (
        AudioFlamingo3DummyInputsBuilder,
        AudioFlamingo3ProcessingInfo,
    )

    builder = AudioFlamingo3DummyInputsBuilder(AudioFlamingo3ProcessingInfo(mock_ctx))

    dummy_data = builder.get_dummy_mm_data(100, {"audio": 2}, None)

    assert "audio" in dummy_data
    dummy_audios = dummy_data["audio"]
    assert len(dummy_audios) == 2

    assert len(dummy_audios[0]) == 600 * 16000
|
||||
418
tests/models/multimodal/processing/test_common.py
Normal file
418
tests/models/multimodal/processing/test_common.py
Normal file
@@ -0,0 +1,418 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Set as AbstractSet
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from mistral_common.protocol.instruct.chunk import ImageChunk, TextChunk
|
||||
from mistral_common.protocol.instruct.messages import UserMessage
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.config.multimodal import (
|
||||
AudioDummyOptions,
|
||||
BaseDummyOptions,
|
||||
ImageDummyOptions,
|
||||
VideoDummyOptions,
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
|
||||
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
|
||||
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
|
||||
from ....multimodal.utils import random_audio, random_image, random_video
|
||||
from ...registry import (
|
||||
_MULTIMODAL_EXAMPLE_MODELS,
|
||||
_TRANSFORMERS_BACKEND_MODELS,
|
||||
HF_EXAMPLE_MODELS,
|
||||
)
|
||||
|
||||
|
||||
def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
    """
    Patch the multimodal data for GLM4.1V model.

    If ``mm_data`` has a ``"video"`` entry, it is replaced in place by a
    ``(video, metadata)`` tuple. Data without a ``"video"`` key is left
    unchanged.

    Args:
        mm_data: Multimodal data dict; mutated in place.

    Returns:
        The same ``mm_data`` object that was passed in.
    """
    if "video" not in mm_data:
        return mm_data

    # Ensure video metadata is included
    # GLM4.1V doesn't support multiple videos
    video = mm_data["video"]
    num_frames = len(video)
    mm_data["video"] = (
        video,
        {
            "total_num_frames": num_frames,
            # fps is set to the frame count and duration is fixed at 1.
            "fps": num_frames,
            "duration": 1,
            "frames_indices": list(range(num_frames)),
            "video_backend": "opencv",
            "do_sample_frames": True,
        },
    )
    return mm_data
|
||||
|
||||
|
||||
def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
    """
    Patch the multimodal data for Qwen3-VL model.

    Every video found under the ``"video"`` key is paired with a metadata
    dict, either as a list of ``(video, metadata)`` tuples (when the entry is
    a list) or as a single tuple. ``mm_data`` is modified in place and
    returned.
    """

    def _with_metadata(frames: np.ndarray):
        # Metadata is derived from the frame count at a fixed 2.0 fps.
        frame_count = len(frames)
        sample_fps = 2.0
        metadata = {
            "total_num_frames": frame_count,
            "fps": sample_fps,
            "duration": frame_count / sample_fps,
            "video_backend": "opencv",
            "frames_indices": list(range(frame_count)),
            "do_sample_frames": True,
        }
        return frames, metadata

    # Nothing to patch when no video was supplied
    if "video" not in mm_data:
        return mm_data

    entry = mm_data["video"]
    if isinstance(entry, list):
        # multiple videos
        patched = [_with_metadata(clip) for clip in entry]
    else:
        # single video
        patched = _with_metadata(entry)

    mm_data["video"] = patched
    return mm_data
|
||||
|
||||
|
||||
# For some multimodal models, tokenizer will always add bos_token
|
||||
# at the beginning of prompt by default, causing hf_processor outputs
|
||||
# incorrect token ids. So we need use `add_special_tokens=False` here
|
||||
# to leave bos_token to be added by the processor.
|
||||
_ADD_SPECIAL_TOKENS_OVERRIDES = {
|
||||
"ovis": False,
|
||||
"ovis2_5": False,
|
||||
"paligemma": False,
|
||||
"ultravox": False,
|
||||
"whisper": False,
|
||||
}
|
||||
|
||||
_IGNORE_MM_KEYS = {
|
||||
# In Ultravox, the audio_features can be different depending on padding
|
||||
# The slight difference should not be a problem though, since
|
||||
# attention_mask lets us ignore the difference.
|
||||
"ultravox": {"audio_features"},
|
||||
}
|
||||
|
||||
# GLM4.1V and Qwen3-VL require video metadata to be included in the input.
# Maps a model type to the function that patches its multimodal data.
MM_DATA_PATCHES = {
    **dict.fromkeys(("glm4v", "glm4v_moe"), glm4_1v_patch_mm_data),
    **dict.fromkeys(("qwen3_vl", "qwen3_vl_moe"), qwen3_vl_patch_mm_data),
}
|
||||
|
||||
|
||||
def _iter_model_ids_to_test(model_arch_list: AbstractSet[str]):
    """
    Yield the model ids to test for each architecture in `model_arch_list`.

    For every architecture the default model id comes first, followed by its
    extra model ids, except those whose extra type contains ``"fp"``.
    """
    for arch in model_arch_list:
        info = HF_EXAMPLE_MODELS.get_hf_info(arch)
        yield info.default

        # Redundant to test quantized models
        yield from (
            extra_id
            for extra_kind, extra_id in info.extras.items()
            if "fp" not in extra_kind
        )
|
||||
|
||||
|
||||
def _get_model_ids_to_test(model_arch_list: AbstractSet[str]):
    """Collect the ids from `_iter_model_ids_to_test` into a list."""
    return [*_iter_model_ids_to_test(model_arch_list)]
|
||||
|
||||
|
||||
def get_model_ids_to_test():
    """
    Return the model ids used to parametrize `test_processing_correctness`.

    An architecture from `_MULTIMODAL_EXAMPLE_MODELS` is kept only when none
    of its model ids (default or extras) also appears among the model ids of
    `_TRANSFORMERS_BACKEND_MODELS`. The ids of the kept architectures are
    then expanded by `_get_model_ids_to_test`.

    The architectures are visited in the iteration order of
    `_MULTIMODAL_EXAMPLE_MODELS`, so the returned list does not depend on
    string hashing and stays stable across interpreter processes.
    """
    transformers_model_ids = {
        model_id
        for info in _TRANSFORMERS_BACKEND_MODELS.values()
        for model_id in (info.default, *info.extras.values())
    }

    # A plain set of strings iterates in a hash-dependent order that may
    # change from one process to the next. The keys view of a dict is still
    # set-like but keeps insertion order, which makes the parametrization
    # reproducible.
    vllm_only_archs = {
        arch: None
        for arch, info in _MULTIMODAL_EXAMPLE_MODELS.items()
        if transformers_model_ids.isdisjoint((info.default, *info.extras.values()))
    }

    return _get_model_ids_to_test(vllm_only_archs.keys())
|
||||
|
||||
|
||||
def get_text_token_prompts(
    processor: BaseMultiModalProcessor,
    mm_data: MultiModalDataDict,
):
    """
    Build a ``(text_prompt, token_prompt)`` pair matching `mm_data`.

    `mm_data` is first passed through the model-specific patch from
    `MM_DATA_PATCHES`, if there is one. For a `MistralTokenizer` the tokens
    come from a chat completion request holding an empty text chunk plus one
    image chunk per parsed image, and the text prompt is ``None``. Otherwise
    the dummy prompt of the processor is used and tokenized, honouring
    `_ADD_SPECIAL_TOKENS_OVERRIDES`.
    """
    dummy_inputs = processor.dummy_inputs
    info = processor.info
    tokenizer: TokenizerLike = info.get_tokenizer()
    model_config = info.ctx.model_config
    model_type = model_config.hf_config.model_type

    patch_mm_data = MM_DATA_PATCHES.get(model_type)
    if patch_mm_data is not None:
        mm_data = patch_mm_data(mm_data)

    parsed_data = processor.data_parser.parse_mm_data(mm_data)
    mm_counts = {modality: len(items) for modality, items in parsed_data.items()}

    if isinstance(tokenizer, MistralTokenizer):
        image_chunks = [
            ImageChunk(image=image) for image in parsed_data.get("image", [])
        ]
        user_message = UserMessage(content=[TextChunk(text=""), *image_chunks])
        request = ChatCompletionRequest(messages=[user_message])
        encoded = tokenizer.mistral.encode_chat_completion(request)

        # Mistral does not support decode_tokens with skip_special_tokens=False
        return None, encoded.tokens

    dummy = dummy_inputs.get_dummy_processor_inputs(
        model_config.max_model_len,
        mm_counts,
    )
    assert isinstance(dummy.prompt, str)

    text_prompt = dummy.prompt
    add_special_tokens = _ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True)
    token_prompt = tokenizer.encode(
        text_prompt,
        add_special_tokens=add_special_tokens,
    )
    return text_prompt, token_prompt
|
||||
|
||||
|
||||
def _test_processing_correctness(
    model_id_or_arch: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """
    Compare a cache-less processor with a cached one on random inputs.

    `model_id_or_arch` is either an architecture supported by
    `HF_EXAMPLE_MODELS` (its default model id is then used) or a model id.
    For each of the `num_batches` batches, every supported modality receives
    a random number of items, up to its limit. Each item is a fixed input
    with probability `hit_rate` (presumably so that it hits the processor
    cache) and a newly generated random input otherwise. With probability
    `simplify_rate`, empty modalities are dropped and one-item lists are
    unwrapped before the batch is checked.
    """
    if model_id_or_arch in HF_EXAMPLE_MODELS.get_supported_archs():
        # Use model architecture to get the default model id
        model_info = HF_EXAMPLE_MODELS.get_hf_info(model_id_or_arch)
        model_id = model_info.default
    else:
        model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
        model_id = model_id_or_arch
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    needs_embeds = model_info.require_embed_inputs
    model_config = ModelConfig(
        model_id,
        tokenizer=model_info.tokenizer or model_id,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        # Ensure that the cache can fit all of the data
        mm_processor_cache_gb=2048,
        skip_tokenizer_init=needs_embeds,
        enable_prompt_embeds=needs_embeds,
        enable_mm_embeds=needs_embeds,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    factories = model_cls._processor_factory
    ctx = InputProcessingContext(
        model_config,
        tokenizer=cached_tokenizer_from_config(model_config),
    )
    cache = MultiModalProcessorOnlyCache(model_config)

    processing_info = factories.info(ctx)
    supported_mm_limits = processing_info.get_supported_mm_limits()

    # Keep integer limits for local data generation; modalities without an
    # explicit limit are capped at 3 items.
    limit_mm_per_prompt_ints = {}
    for modality, limit in supported_mm_limits.items():
        limit_mm_per_prompt_ints[modality] = limit if limit is not None else 3

    dummy_option_types = {
        "video": VideoDummyOptions,
        "image": ImageDummyOptions,
        "audio": AudioDummyOptions,
    }

    def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
        # Unknown modalities fall back to the base options class.
        options_cls = dummy_option_types.get(modality, BaseDummyOptions)
        return options_cls(count=count)

    # Assign normalized DummyOptions to the model config
    model_config.get_multimodal_config().limit_per_prompt = {
        modality: _to_dummy_options(modality, count)
        for modality, count in limit_mm_per_prompt_ints.items()
    }

    baseline_processor = factories.build_processor(ctx, cache=None)
    cached_processor = factories.build_processor(ctx, cache=cache)

    rng = np.random.RandomState(0)

    # Fixed inputs that are reused across batches
    input_to_hit = {
        "image": Image.new("RGB", size=(128, 128)),
        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
        "audio": (np.zeros((512,)), 16000),
    }
    # Generators of fresh random inputs
    input_factory = {
        "image": partial(random_image, rng, min_wh=128, max_wh=256),
        "video": partial(
            random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
        ),
        "audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
    }

    for batch_idx in range(num_batches):
        mm_data = {}
        for modality, limit in limit_mm_per_prompt_ints.items():
            # The item count is drawn first, then one draw per item decides
            # between the fixed input and a random one.
            num_items = rng.randint(limit + 1)
            items = []
            for _ in range(num_items):
                if rng.rand() < hit_rate:
                    items.append(input_to_hit[modality])
                else:
                    items.append(input_factory[modality]())
            mm_data[modality] = items

        # Drop unnecessary keys and test single -> multi conversion
        if rng.rand() < simplify_rate:
            simplified = {}
            for modality, items in mm_data.items():
                if not items:
                    continue
                simplified[modality] = items[0] if len(items) == 1 else items
            mm_data = simplified

        _test_processing_correctness_one(
            model_config,
            mm_data,
            baseline_processor,
            cached_processor,
            batch_idx,
        )
|
||||
|
||||
|
||||
def _test_processing_correctness_one(
    model_config: ModelConfig,
    mm_data: MultiModalDataDict,
    baseline_processor: BaseMultiModalProcessor,
    cached_processor: BaseMultiModalProcessor,
    batch_idx: int,
):
    """
    Check one batch: cached and baseline processors must agree.

    The token prompt is run through both processors and the outputs are
    compared. When a text prompt is available as well, it is also run through
    both processors, and the text results are compared with each other and
    with the corresponding tokenized results.
    """
    model_type = model_config.hf_config.model_type

    text_prompt, token_prompt = get_text_token_prompts(baseline_processor, mm_data)
    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())

    def _apply(processor: BaseMultiModalProcessor, prompt):
        return processor.apply(
            prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

    def _check(left, right, msg: str):
        _assert_inputs_equal(left, right, ignore_mm_keys=ignore_mm_keys, msg=msg)

    baseline_tokenized_result = _apply(baseline_processor, token_prompt)
    cached_tokenized_result = _apply(cached_processor, token_prompt)
    _check(
        baseline_tokenized_result,
        cached_tokenized_result,
        f"Failed ({batch_idx=}, {token_prompt=}, {mm_data=})",
    )

    # No text prompt available: only the tokenized prompt can be checked.
    if text_prompt is None:
        return

    baseline_text_result = _apply(baseline_processor, text_prompt)
    cached_text_result = _apply(cached_processor, text_prompt)
    _check(
        baseline_text_result,
        cached_text_result,
        f"Failed ({batch_idx=}, {text_prompt=}, {mm_data=})",
    )

    # Text and token prompts must lead to the same result for each processor.
    cross_msg = f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})"
    _check(baseline_text_result, baseline_tokenized_result, cross_msg)
    _check(cached_text_result, cached_tokenized_result, cross_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
def test_processing_correctness(
    model_id: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Run the processing correctness check for every selected model id."""
    # Model ids that are skipped for now
    skipped_model_ids = {
        "google/gemma-3n-E2B-it",
        "OpenGVLab/InternVL2-2B",
        "jinaai/jina-reranker-m0",
    }
    if model_id in skipped_model_ids:
        pytest.skip("Fix later")

    _test_processing_correctness(
        model_id,
        hit_rate=hit_rate,
        num_batches=num_batches,
        simplify_rate=simplify_rate,
    )
|
||||
|
||||
|
||||
def _assert_inputs_equal(
    a: MultiModalInputs,
    b: MultiModalInputs,
    *,
    ignore_mm_keys: set[str] | None = None,
    msg: str = "",
):
    """
    Assert that two processor outputs are equal.

    Everything except ``"mm_kwargs"`` is compared with ``==``. The data of
    ``"mm_kwargs"`` is compared with `batched_tensors_equal` after removing
    the keys listed in `ignore_mm_keys`. `msg` is attached to both
    assertions.
    """
    ignored_keys = set() if ignore_mm_keys is None else ignore_mm_keys

    def _without_mm_kwargs(inputs: MultiModalInputs):
        return {key: value for key, value in inputs.items() if key != "mm_kwargs"}

    assert _without_mm_kwargs(a) == _without_mm_kwargs(b), msg

    a_data = a["mm_kwargs"].get_data()
    b_data = b["mm_kwargs"].get_data()

    for key in ignored_keys:
        for data in (a_data, b_data):
            data.pop(key, None)

    assert batched_tensors_equal(a_data, b_data), msg
|
||||
42
tests/models/multimodal/processing/test_gemma3.py
Normal file
42
tests/models/multimodal/processing/test_gemma3.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"])
def test_get_image_size_with_most_features(
    image_assets: ImageTestAssets, model_id: str
):
    """
    Check that no image asset needs more tokens than the reported maximum.

    The maximum is the token count of the size returned by
    `get_image_size_with_most_features`; the per-asset count is
    ``num_patches * image_seq_length``.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    info = processor.info

    hf_processor_mm_kwargs: dict[str, object] = {}
    hf_processor = info.get_hf_processor(**hf_processor_mm_kwargs)

    largest_size = info.get_image_size_with_most_features()
    max_tokens = info.get_num_image_tokens(
        image_width=largest_size.width,
        image_height=largest_size.height,
        processor=hf_processor,
    )

    prompt = "<start_of_image>"
    image_seq_length = hf_processor.image_seq_length

    for asset in image_assets:
        processed_inputs = processor.apply(
            prompt,
            {"image": [asset.pil_image]},
            hf_processor_mm_kwargs,
        )
        num_patches = processed_inputs["mm_kwargs"].get_data()["num_patches"]
        tokens = int(num_patches.item()) * image_seq_length
        assert tokens <= max_tokens
|
||||
110
tests/models/multimodal/processing/test_glm4_1v.py
Normal file
110
tests/models/multimodal/processing/test_glm4_1v.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import batched_tensors_equal
|
||||
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
@pytest.mark.parametrize(
    "num_frames, fps, expected_grid_t",
    [
        # pre-sampled fixed frames (unexpected behavior,
        # but we still expect it to work without errors)
        (32, 1, 16),
        (32, 2, 16),
        (128, 1, 64),
        (128, 2, 64),
        # post-sampled frames (expected behavior)
        (-1, 1, 5),
        (-1, 2, 10),
    ],
)
def test_processor_override(
    model_id: str,
    expected_toks_per_frame: int,
    expected_grid_t: int,
    fps: int,
    num_frames: int,
):
    """Ensure GLM4vMultiModalProcessor can handle video frames properly."""
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    tokenizer = processor.info.get_tokenizer()
    hf_processor_mm_kwargs = {"fps": fps}

    # Load the video asset and build a prompt with one video placeholder
    asset = VideoAsset(name="baby_reading", num_frames=num_frames)
    prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    frames = asset.np_ndarrays
    metadata = asset.metadata
    # Override the fps in the metadata with the parametrized value
    metadata["fps"] = fps
    mm_data = {"video": [(frames, metadata)]}

    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)

    # Count the video placeholder tokens and read the temporal grid size
    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
    video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
    video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
    video_grid_thw = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"]
    grid_t, _grid_h, _grid_w = video_grid_thw[0]

    assert grid_t == expected_grid_t
    assert video_tok_count == expected_toks_per_frame * grid_t
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
    model_id: str,
    fps: int,
):
    """
    Ensure dynamic video loader (pre-sampled by loader) and normal video
    loader (post-sampled by processor) produce same video processing outputs.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    hf_processor_mm_kwargs = {"fps": fps}

    # Prompt with a single video placeholder
    prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
    with open(video_path, "rb") as video_file:
        video_bytes = video_file.read()

    static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
        video_bytes, fps=fps
    )

    # pre-sampled loader shouldn't read all frames
    assert len(dynamic_video) < len(static_video)

    # Process the static video first, then the dynamic one
    static_outputs = processor.apply(
        prompt,
        {"video": [(static_video, static_metadata)]},
        hf_processor_mm_kwargs,
    )
    dynamic_outputs = processor.apply(
        prompt,
        {"video": [(dynamic_video, dynamic_metadata)]},
        hf_processor_mm_kwargs,
    )

    assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
    assert batched_tensors_equal(
        static_outputs["mm_kwargs"].get_data(),
        dynamic_outputs["mm_kwargs"].get_data(),
    )
|
||||
177
tests/models/multimodal/processing/test_h2ovl.py
Normal file
177
tests/models/multimodal/processing/test_h2ovl.py
Normal file
@@ -0,0 +1,177 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for H2OVL's multimodal preprocessing kwargs."""
|
||||
|
||||
from collections.abc import Mapping
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _get_expected_num_patches(
    config: PretrainedConfig,
    image: Image.Image,
    num_imgs: int,
    min_num: int,
    max_num: int,
):
    """
    Compute the number of patches expected for `image`.

    With a single image and `config.use_msac` set, two passes are combined
    (the second one seeded with the aspect ratio of the first) and one
    overlapping block is subtracted. Otherwise a single pass over the
    `min_num`..`max_num` target ratios is used. In both cases a thumbnail is
    counted when `config.use_thumbnail` is set and there is more than one
    block.
    """
    from vllm.model_executor.models.h2ovl import (
        calculate_h2ovl_targets,
        get_h2ovl_target_ratios,
    )

    width, height = image.size
    image_size = config.vision_config.image_size
    use_thumbnail = config.use_thumbnail

    def _targets(low: int, high: int, prior_aspect_ratio):
        # Thumbnail is handled separately by the callers of this helper.
        return calculate_h2ovl_targets(
            orig_width=width,
            orig_height=height,
            target_ratios=get_h2ovl_target_ratios(
                min_num=low,
                max_num=high,
                prior_aspect_ratio=prior_aspect_ratio,
            ),
            image_size=image_size,
            use_thumbnail=False,
        )

    if num_imgs != 1 or not config.use_msac:
        # Plain single-pass computation
        expected_num_patches, _, _, _ = _targets(min_num, max_num, None)
        if use_thumbnail and expected_num_patches > 1:
            expected_num_patches += 1
        return expected_num_patches

    # First pass
    blocks1, _, _, aspect_ratio = _targets(1, max_num, None)
    # Second pass
    blocks2, _, _, _ = _targets(3, max_num, aspect_ratio)

    # Add thumbnail if use_thumbnail is True and total_blocks > 1
    if use_thumbnail:
        if blocks1 > 1:
            blocks1 += 1
        if blocks2 > 1:
            blocks2 += 1

    # Total blocks is the sum of blocks from both passes minus overlapping
    return blocks1 + blocks2 - 1
|
||||
|
||||
|
||||
def _run_check(
    processor: BaseMultiModalProcessor,
    images: list[Image.Image],
    min_num: int,
    max_num: int,
    mm_processor_kwargs: Mapping[str, object],
):
    """
    Process `images` and check the placeholder and patch counts.

    The prompt holds one ``<image>`` per image. The number of
    ``<IMG_CONTEXT>`` tokens must be 256 times the expected total number of
    patches, and the first dimension of ``pixel_values_flat`` must equal that
    total.
    """
    tokenizer = processor.info.get_tokenizer()
    config = processor.info.get_hf_config()

    num_imgs = len(images)
    prompt = "<image>" * num_imgs
    mm_data = {"image": images}

    total_expected_num_patches = 0
    for image in images:
        total_expected_num_patches += _get_expected_num_patches(
            config, image, num_imgs, min_num, max_num
        )

    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

    # Ensure we have the right number of image placeholders and patches
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    prompt_token_ids = processed_inputs["prompt_token_ids"]
    img_tok_count = prompt_token_ids.count(image_token_id)
    pixel_values_flat = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"]
    pixel_shape = pixel_values_flat.shape

    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_id",
    [
        "h2oai/h2ovl-mississippi-800m",
        "h2oai/h2ovl-mississippi-2b",
    ],
)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
        [4.0, 2.0, 1.0],
    ],
)
@pytest.mark.parametrize(
    ("min_dynamic_patch", "max_dynamic_patch"),
    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    model_id: str,
    image_assets: ImageTestAssets,
    size_factors: list[float],
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool | None,
    kwargs_on_init: bool,
):
    """
    Check that H2OVL's processor honours the dynamic patch kwargs.

    The kwargs are supplied either when the model context is built
    (`kwargs_on_init=True`) or at `apply` time. The first image asset is
    rescaled by every factor in `size_factors` to build the image batch.
    """
    mm_processor_kwargs = {
        "min_dynamic_patch": min_dynamic_patch,
        "max_dynamic_patch": max_dynamic_patch,
        "dynamic_image_size": dynamic_image_size,
    }

    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": len(size_factors)},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    # Kwargs given at init must not be passed again at apply time.
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    # Without dynamic image size, exactly one patch is expected per image.
    min_num = min_dynamic_patch if dynamic_image_size else 1
    max_num = max_dynamic_patch if dynamic_image_size else 1

    _run_check(
        processor,
        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
        min_num,
        max_num,
        hf_processor_mm_kwargs,
    )
|
||||
68
tests/models/multimodal/processing/test_idefics3.py
Normal file
68
tests/models/multimodal/processing/test_idefics3.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for Idefics3's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
from transformers import Idefics3Config
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
@pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img"),
    [
        ({"size": {"longest_edge": 364}}, 169),
        ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
    ],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, object],
    expected_toks_per_img: int,
    num_imgs: int,
    kwargs_on_init: bool,
):
    """
    Ensure Idefics3MultiModalProcessor honours the ``size`` override.

    The processed token ids must match those of the HF processor, and the
    number of image tokens must be `expected_toks_per_img` for each image.
    """
    # The kwargs are supplied either when building the model context or
    # later, at apply time, depending on `kwargs_on_init`.
    init_kwargs = mm_processor_kwargs if kwargs_on_init else None
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=init_kwargs,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    # Build the image str / prompt based on the number of images we pass
    if num_imgs == 1:
        placeholders = "<image>"
    else:
        placeholders = "\n".join(
            f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1)
        )
    prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501

    # Build mm_data: the first asset resized to a square of 4x the vision
    # config's image size, repeated once per image
    image_size = ctx.get_hf_config(Idefics3Config).vision_config.image_size
    edge = image_size * 4
    dummy_image = image_assets[0].pil_image.resize((edge, edge))
    mm_data = {"image": [dummy_image] * num_imgs}

    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
    prompt_token_ids = processed_inputs["prompt_token_ids"]

    # Ensure the placeholders format are correct
    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
    assert prompt_token_ids == hf_processed_inputs["input_ids"][0]

    # Ensure we have the right number of image tokens for the given size
    image_token_id = ctx.get_hf_config().image_token_id
    img_tok_count = prompt_token_ids.count(image_token_id)
    assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
131
tests/models/multimodal/processing/test_internvl.py
Normal file
131
tests/models/multimodal/processing/test_internvl.py
Normal file
@@ -0,0 +1,131 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for InternVL's multimodal preprocessing kwargs."""
|
||||
|
||||
from collections.abc import Mapping
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _get_expected_num_patches(
    config: PretrainedConfig,
    image: Image.Image,
    num_imgs: int,
    min_num: int,
    max_num: int,
):
    """
    Compute the number of patches expected for `image`.

    The block count comes from `calculate_internvl_targets` over the
    `min_num`..`max_num` target ratios; a thumbnail is counted on top when
    `config.use_thumbnail` is set and there is more than one block.
    `num_imgs` is accepted but not used here.
    """
    from vllm.model_executor.models.internvl import (
        calculate_internvl_targets,
        get_internvl_target_ratios,
    )

    width, height = image.size
    target_ratios = get_internvl_target_ratios(min_num, max_num)

    # The thumbnail is accounted for below rather than by the helper.
    expected_num_patches, _, _ = calculate_internvl_targets(
        orig_width=width,
        orig_height=height,
        target_ratios=target_ratios,
        image_size=config.vision_config.image_size,
        use_thumbnail=False,
    )

    needs_thumbnail = config.use_thumbnail and expected_num_patches > 1
    if needs_thumbnail:
        expected_num_patches += 1

    return expected_num_patches
|
||||
|
||||
|
||||
def _run_check(
    processor: BaseMultiModalProcessor,
    images: list[Image.Image],
    min_num: int,
    max_num: int,
    mm_processor_kwargs: Mapping[str, object],
):
    """
    Process `images` and check the placeholder and patch counts.

    The prompt holds one ``<image>`` per image. The number of
    ``<IMG_CONTEXT>`` tokens must be 256 times the expected total number of
    patches, and the first dimension of ``pixel_values_flat`` must equal that
    total.
    """
    tokenizer = processor.info.get_tokenizer()
    config = processor.info.get_hf_config()

    num_imgs = len(images)
    prompt = "<image>" * num_imgs
    mm_data = {"image": images}

    total_expected_num_patches = 0
    for image in images:
        total_expected_num_patches += _get_expected_num_patches(
            config, image, num_imgs, min_num, max_num
        )

    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

    # Ensure we have the right number of image placeholders and patches
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    prompt_token_ids = processed_inputs["prompt_token_ids"]
    img_tok_count = prompt_token_ids.count(image_token_id)
    pixel_values_flat = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"]
    pixel_shape = pixel_values_flat.shape

    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"])
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
        [4.0, 2.0, 1.0],
    ],
)
@pytest.mark.parametrize(
    ("min_dynamic_patch", "max_dynamic_patch"),
    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    model_id: str,
    image_assets: ImageTestAssets,
    size_factors: list[float],
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool | None,
    kwargs_on_init: bool,
):
    """
    Check that InternVL's processor honours the dynamic patch kwargs.

    The kwargs are supplied either when the model context is built
    (`kwargs_on_init=True`) or at `apply` time. The first image asset is
    rescaled by every factor in `size_factors` to build the image batch.
    """
    mm_processor_kwargs = {
        "min_dynamic_patch": min_dynamic_patch,
        "max_dynamic_patch": max_dynamic_patch,
        "dynamic_image_size": dynamic_image_size,
    }

    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": len(size_factors)},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    # Kwargs given at init must not be passed again at apply time.
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    # Without dynamic image size, exactly one patch is expected per image.
    min_num = min_dynamic_patch if dynamic_image_size else 1
    max_num = max_dynamic_patch if dynamic_image_size else 1

    _run_check(
        processor,
        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
        min_num,
        max_num,
        hf_processor_mm_kwargs,
    )
|
||||
85
tests/models/multimodal/processing/test_llama4.py
Normal file
85
tests/models/multimodal/processing/test_llama4.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for Llama4's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
@pytest.mark.parametrize("num_imgs", [1, 5])
@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
@pytest.mark.parametrize("tokenized_prompt", [True, False])
def test_processor_override(
    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict,
    num_imgs: int,
    mm_processor_cache_gb: int,
    tokenized_prompt: bool,
):
    """Check the Llama4 processor's placeholder tokens, offsets and patches."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=mm_processor_kwargs,
        limit_mm_per_prompt={"image": num_imgs},
        mm_processor_cache_gb=mm_processor_cache_gb,
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)
    config = processor.info.get_hf_config()
    tokenizer = processor.info.get_tokenizer()
    hf_processor = processor.info.get_hf_processor()
    vocab = tokenizer.get_vocab()

    user_turn = "<|begin_of_text|><|header_start|>user<|header_end|>"
    assistant_turn = "<|eot|><|header_start|>assistant<|header_end|>"
    prompt = user_turn + "<|image|>" * num_imgs + assistant_turn

    # Cycle through the available assets when more images are requested.
    asset_count = len(image_assets)
    images = [image_assets[idx % asset_count].pil_image for idx in range(num_imgs)]
    if tokenized_prompt:
        prompt = tokenizer.encode(prompt)

    outputs = processor.apply(prompt, {"image": images}, mm_processor_kwargs)
    mm_data = outputs["mm_kwargs"].get_data()

    # Placeholder replacements: one begin/end/image marker per input image.
    token_ids = outputs["prompt_token_ids"]
    assert token_ids.count(config.boi_token_index) == num_imgs
    assert token_ids.count(config.eoi_token_index) == num_imgs
    assert token_ids.count(vocab[hf_processor.image_token]) == num_imgs

    # Separator tokens are only counted for images split into several tiles.
    x_separators = y_separators = 0
    for rows, cols in mm_data["aspect_ratios"]:
        if cols * rows > 1:
            x_separators += (cols - 1) * rows
            y_separators += rows
    assert token_ids.count(vocab[hf_processor.tile_token]) == x_separators
    assert token_ids.count(vocab[hf_processor.tile_global_token]) == y_separators

    # Image token offsets: every placeholder starts at a begin-of-image token.
    image_locs = outputs["mm_placeholders"].get("image", [])
    assert len(image_locs) == num_imgs
    assert [loc.offset for loc in image_locs] == [
        pos for pos, tok in enumerate(token_ids) if tok == config.boi_token_index
    ]

    # Patch sizes and masks.
    patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
    image_token_total = token_ids.count(config.image_token_index)
    assert image_token_total == sum(mm_data["patches_per_image"]) * patches_per_chunk
    assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])
|
||||
194
tests/models/multimodal/processing/test_llava_next.py
Normal file
194
tests/models/multimodal/processing/test_llava_next.py
Normal file
@@ -0,0 +1,194 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from pqdm.threads import pqdm
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.parse import ImageSize
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Check that one image size stays within the image-token budget.

    Args:
        processor: Processor whose ``info.get_num_image_tokens`` is queried.
        max_tokens: Upper bound that the computed feature size may not exceed.
        failed_size_excs: Shared list; a ``(image_size, exception)`` pair is
            appended for every size that fails.
        image_size: The image size to validate.
    """
    try:
        # The feature size is computed inside the ``try`` so that an error
        # raised by the calculation itself is recorded as a failure too.
        # Otherwise it would escape this worker, whose pqdm-based caller
        # discards worker results, and the test could pass silently.
        feature_size = processor.info.get_num_image_tokens(
            image_width=image_size.width, image_height=image_size.height
        )
        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
@pytest.mark.skip(
    "This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported max image tokens."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    ratios_seen: set[float] = set()
    candidate_sizes: list[ImageSize] = []

    # The aspect ratio of the grid layout is between 1 and 2, so only one
    # size per distinct ratio in that interval is kept.
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    side_lengths = range(32, 4096)
    for width, height in itertools.product(side_lengths, side_lengths):
        ratio = width / height
        if not 1 <= ratio <= 2 or ratio in ratios_seen:
            continue
        candidate_sizes.append(ImageSize(width, height))
        ratios_seen.add(ratio)

    failures: list[tuple[ImageSize, Exception]] = []
    max_image_tokens = processor.info.get_max_image_tokens()  # type: ignore
    check_size = partial(
        _validate_image_max_tokens_one,
        processor,
        max_image_tokens,
        failures,
    )
    pqdm(candidate_sizes, check_size, n_jobs=8, desc="Validating image sizes")

    if not failures:
        return

    report = "Found failing image sizes:" + "\n========\n".join(
        f"[{size}]\n{exc}" for size, exc in failures
    )
    raise AssertionError(report)
|
||||
|
||||
|
||||
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Run the processor on blank images of one size and record any failure.

    Any exception, including a failed assertion, is appended to
    ``failed_size_excs`` together with ``image_size`` instead of being raised.
    """
    blank_image = Image.new("RGB", size=image_size)
    text_prompt = "<image>" * num_imgs
    inputs = {"image": [blank_image] * num_imgs}

    try:
        # ``apply`` itself raises when the prompt replacements do not match.
        outputs = processor.apply(text_prompt, inputs, {})

        placeholders = outputs["mm_placeholders"]["image"]
        assert len(placeholders) == num_imgs

        leading = placeholders[0]

        # NOTE: There is a BOS token, hence the offset of 1 and the "- 1".
        assert leading.offset == 1
        expected_length = (len(outputs["prompt_token_ids"]) - 1) // num_imgs
        assert leading.length == expected_length

    except Exception as exc:
        failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Check that LlavaNextMultiModalProcessor performs prompt replacement
    correctly for every size in ``image_sizes``; raise ``AssertionError``
    listing all sizes that failed.
    """
    failures: list[tuple[ImageSize, Exception]] = []

    check_size = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failures,
    )
    pqdm(image_sizes, check_size, n_jobs=8, desc="Validating image sizes")

    if not failures:
        return

    report = "Found failing image sizes:" + "\n========\n".join(
        f"[{size}]\n{exc}" for size, exc in failures
    )
    raise AssertionError(report)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """Check prompt replacement on a fixed list of sizes, in both orientations."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    base_dims = [
        (171, 152),
        (184, 161),
        (198, 176),
        (333, 296),
        (369, 328),
        (488, 183),
        (2560, 1669),
    ]

    # Each pair is tested as (w, h) and then swapped as (h, w).
    sizes: list[ImageSize] = []
    for w, h in base_dims:
        sizes.append(ImageSize(w, h))
        sizes.append(ImageSize(h, w))

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=sizes,
    )
|
||||
|
||||
|
||||
@pytest.mark.skip(
    "This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
    """Check prompt replacement for one size per aspect ratio in [1, 2]."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    ratios_seen: set[float] = set()
    candidate_sizes: list[ImageSize] = []

    # The aspect ratio of the grid layout is between 1 and 2
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    side_lengths = range(64, 1024)
    for width, height in itertools.product(side_lengths, side_lengths):
        ratio = width / height
        if not 1 <= ratio <= 2 or ratio in ratios_seen:
            continue
        candidate_sizes.append(ImageSize(width, height))
        ratios_seen.add(ratio)

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=candidate_sizes,
    )
|
||||
192
tests/models/multimodal/processing/test_llava_onevision.py
Normal file
192
tests/models/multimodal/processing/test_llava_onevision.py
Normal file
@@ -0,0 +1,192 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from pqdm.threads import pqdm
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.parse import ImageSize
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Check that one image size stays within the image-token budget.

    Args:
        processor: Processor whose ``info.get_num_image_tokens`` is queried.
        max_tokens: Upper bound that the computed feature size may not exceed.
        failed_size_excs: Shared list; a ``(image_size, exception)`` pair is
            appended for every size that fails.
        image_size: The image size to validate.
    """
    try:
        # The feature size is computed inside the ``try`` so that an error
        # raised by the calculation itself is recorded as a failure too.
        # Otherwise it would escape this worker, whose pqdm-based caller
        # discards worker results, and the test could pass silently.
        feature_size = processor.info.get_num_image_tokens(
            image_width=image_size.width, image_height=image_size.height
        )
        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
@pytest.mark.skip(
    "This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported max image tokens."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    ratios_seen: set[float] = set()
    candidate_sizes: list[ImageSize] = []

    # The aspect ratio of the grid layout is between 1 and 6, so only one
    # size per distinct ratio in that interval is kept.
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    side_lengths = range(32, 4096)
    for width, height in itertools.product(side_lengths, side_lengths):
        ratio = width / height
        if not 1 <= ratio <= 6 or ratio in ratios_seen:
            continue
        candidate_sizes.append(ImageSize(width, height))
        ratios_seen.add(ratio)

    failures: list[tuple[ImageSize, Exception]] = []
    max_image_tokens = processor.info.get_max_image_tokens()  # type: ignore
    check_size = partial(
        _validate_image_max_tokens_one,
        processor,
        max_image_tokens,
        failures,
    )
    pqdm(candidate_sizes, check_size, n_jobs=8, desc="Validating image sizes")

    if not failures:
        return

    report = "Found failing image sizes:" + "\n========\n".join(
        f"[{size}]\n{exc}" for size, exc in failures
    )
    raise AssertionError(report)
|
||||
|
||||
|
||||
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Run the processor on blank images of one size and record any failure.

    Any exception, including a failed assertion, is appended to
    ``failed_size_excs`` together with ``image_size`` instead of being raised.
    """
    blank_image = Image.new("RGB", size=image_size)
    text_prompt = "<image>" * num_imgs
    inputs = {"image": [blank_image] * num_imgs}

    try:
        # ``apply`` itself raises when the prompt replacements do not match.
        outputs = processor.apply(text_prompt, inputs, {})

        placeholders = outputs["mm_placeholders"]["image"]
        assert len(placeholders) == num_imgs

        leading = placeholders[0]

        # The first placeholder starts at token 0 and the placeholders are
        # expected to split the whole prompt evenly between the images.
        assert leading.offset == 0
        expected_length = len(outputs["prompt_token_ids"]) // num_imgs
        assert leading.length == expected_length
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Check that LlavaOnevisionMultiModalProcessor performs prompt replacement
    correctly for every size in ``image_sizes``; raise ``AssertionError``
    listing all sizes that failed.
    """
    failures: list[tuple[ImageSize, Exception]] = []

    check_size = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failures,
    )
    pqdm(image_sizes, check_size, n_jobs=8, desc="Validating image sizes")

    if not failures:
        return

    report = "Found failing image sizes:" + "\n========\n".join(
        f"[{size}]\n{exc}" for size, exc in failures
    )
    raise AssertionError(report)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """Check prompt replacement on a fixed list of sizes, in both orientations."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    base_dims = [
        (171, 152),
        (184, 161),
        (198, 176),
        (333, 296),
        (369, 328),
        (488, 183),
        (2560, 1669),
    ]

    # Each pair is tested as (w, h) and then swapped as (h, w).
    sizes: list[ImageSize] = []
    for w, h in base_dims:
        sizes.append(ImageSize(w, h))
        sizes.append(ImageSize(h, w))

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=sizes,
    )
|
||||
|
||||
|
||||
@pytest.mark.skip(
    "This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
    """Check prompt replacement for one size per aspect ratio in [1, 6]."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    ratios_seen: set[float] = set()
    candidate_sizes: list[ImageSize] = []

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    side_lengths = range(64, 1024)
    for width, height in itertools.product(side_lengths, side_lengths):
        ratio = width / height
        if not 1 <= ratio <= 6 or ratio in ratios_seen:
            continue
        candidate_sizes.append(ImageSize(width, height))
        ratios_seen.add(ratio)

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=candidate_sizes,
    )
|
||||
105
tests/models/multimodal/processing/test_minimax_vl_01.py
Normal file
105
tests/models/multimodal/processing/test_minimax_vl_01.py
Normal file
@@ -0,0 +1,105 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.parse import ImageSize
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
    image_assets: ImageTestAssets,
    model_id: str,
    num_imgs: int,
):
    """Check that the processor yields one image placeholder per input image."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    # The same blank 364x364 image is reused for every slot in the prompt.
    text_prompt = "<image>" * num_imgs
    blank_image = Image.new("RGB", size=(364, 364))
    inputs = {"image": [blank_image] * num_imgs}

    outputs = processor.apply(text_prompt, inputs, {})
    placeholders = outputs["mm_placeholders"]["image"]

    assert len(placeholders) == num_imgs
|
||||
|
||||
|
||||
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Run the processor on blank images of one size and record any failure.

    Any exception, including a failed assertion, is appended to
    ``failed_size_excs`` together with ``image_size`` instead of being raised.
    """
    blank_image = Image.new("RGB", size=image_size)
    text_prompt = "<image>" * num_imgs
    inputs = {"image": [blank_image] * num_imgs}

    try:
        outputs = processor.apply(text_prompt, inputs, {})

        # One placeholder is expected per input image.
        placeholders = outputs["mm_placeholders"]["image"]
        assert len(placeholders) == num_imgs

    except Exception as exc:
        failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """Validate every size in turn and raise one error listing all failures."""
    failures: list[tuple[ImageSize, Exception]] = []

    for candidate in image_sizes:
        _validate_image_prompt_replacements_one(
            processor, num_imgs, failures, candidate
        )

    if not failures:
        return

    report = "Found failing image sizes:" + "\n========\n".join(
        f"[{size}]\n{exc}" for size, exc in failures
    )
    raise AssertionError(report)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """Check prompt replacement on a fixed list of sizes, in both orientations."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    base_dims = [
        (171, 152),
        (184, 161),
        (198, 176),
        (333, 296),
        (369, 328),
        (488, 183),
        (2560, 1669),
    ]

    # Each pair is tested as (w, h) and then swapped as (h, w).
    sizes: list[ImageSize] = []
    for w, h in base_dims:
        sizes.append(ImageSize(w, h))
        sizes.append(ImageSize(h, w))

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=sizes,
    )
|
||||
72
tests/models/multimodal/processing/test_mllama4.py
Normal file
72
tests/models/multimodal/processing/test_mllama4.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for mllama's multimodal preprocessing and profiling."""
|
||||
|
||||
import pytest
|
||||
from torch import prod
|
||||
from transformers import Llama4Config
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.profiling import MultiModalProfiler
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-Guard-4-12B"])
@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
def test_profiling(model_id: str, max_model_len: int):
    """Compare profiled image-token counts with counts derived from the config."""
    image_limit = {"image": 1}
    model_ctx = build_model_context(
        model_id,
        model_config_kwargs={"max_model_len": max_model_len},
        limit_mm_per_prompt=image_limit,
    )

    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)
    profiler = MultiModalProfiler(processor)

    decoder_dummy_data = profiler.get_decoder_dummy_data(
        max_model_len,
        mm_counts=image_limit,
    )
    dummy_inputs = processor.dummy_inputs.get_dummy_processor_inputs(
        max_model_len,
        mm_counts=image_limit,
    )

    hf_config = model_ctx.get_hf_config(Llama4Config)

    processed = processor.apply(
        prompt=dummy_inputs.prompt,
        mm_data=dummy_inputs.mm_data,
        hf_processor_mm_kwargs={},
    )
    mm_data = processed["mm_kwargs"].get_data()

    vision_config = hf_config.vision_config
    image_size = vision_config.image_size
    patch_size = vision_config.patch_size
    downsample_ratio = int(round(1.0 / (vision_config.pixel_shuffle_ratio**2)))
    tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio

    chunks_per_image = prod(mm_data["patches_per_image"])
    total_num_patches = chunks_per_image * tokens_per_patch

    # x-y separator tokens
    first_ratio = mm_data["aspect_ratios"][0]
    num_tiles = first_ratio[0] * first_ratio[1]

    # The extra 3 covers: image start, image, image end
    total_tokens = total_num_patches.item() + num_tiles.item() + 3

    profiled_tokens = profiler.get_mm_max_tokens(
        max_model_len,
        mm_counts=image_limit,
    )

    assert total_num_patches == profiled_tokens["image"]

    image_placeholders = decoder_dummy_data.multi_modal_placeholders["image"]
    assert total_tokens == sum(p.length for p in image_placeholders)
|
||||
133
tests/models/multimodal/processing/test_nemotron_vl.py
Normal file
133
tests/models/multimodal/processing/test_nemotron_vl.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
|
||||
|
||||
from collections.abc import Mapping
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _get_expected_num_patches(
    config: PretrainedConfig,
    image: Image.Image,
    num_imgs: int,
    min_num: int,
    max_num: int,
):
    """Return the number of patches expected for ``image``.

    The block count comes from ``calculate_nemotron_vl_targets`` (called
    without a thumbnail); one extra patch is added when
    ``config.use_thumbnail`` is set and the image has more than one block.
    ``num_imgs`` is accepted but not used here.
    """
    from vllm.model_executor.models.nemotron_vl import (
        calculate_nemotron_vl_targets,
        get_nemotron_vl_target_ratios,
    )

    img_width, img_height = image.size
    candidate_ratios = get_nemotron_vl_target_ratios(min_num, max_num)

    num_blocks, _, _ = calculate_nemotron_vl_targets(
        orig_width=img_width,
        orig_height=img_height,
        target_ratios=candidate_ratios,
        image_size=config.force_image_size,
        use_thumbnail=False,
    )

    needs_thumbnail = config.use_thumbnail and num_blocks > 1
    return num_blocks + 1 if needs_thumbnail else num_blocks
|
||||
|
||||
|
||||
def _run_check(
    processor: BaseMultiModalProcessor,
    images: list[Image.Image],
    min_num: int,
    max_num: int,
    mm_processor_kwargs: Mapping[str, object],
):
    """Apply the processor and compare token and patch counts to expectations.

    The expected patch total is summed over ``images``; the processed prompt
    must contain 256 ``<image>`` tokens per expected patch, and the first
    dimension of ``pixel_values_flat`` must equal the patch total.
    """
    info = processor.info
    tokenizer = info.get_tokenizer()
    config = info.get_hf_config()
    image_processor = info.get_image_processor()

    # Mirror the image processor's thumbnail setting onto the config, which
    # is what the expected-patch helper reads.
    config.use_thumbnail = image_processor.use_thumbnail
    image_count = len(images)
    prompt = "<image>" * image_count

    total_expected_num_patches = sum(
        _get_expected_num_patches(config, img, image_count, min_num, max_num)
        for img in images
    )
    print(total_expected_num_patches)
    outputs = processor.apply(prompt, {"image": images}, mm_processor_kwargs)

    # Ensure we have the right number of placeholders per num_crops size
    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
    img_tok_count = outputs["prompt_token_ids"].count(image_token_id)
    pixel_shape = outputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
    print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)

    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
        [4.0, 2.0, 1.0],
    ],
)
@pytest.mark.parametrize(
    ("min_dynamic_patch", "max_dynamic_patch"),
    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    model_id: str,
    image_assets: ImageTestAssets,
    size_factors: list[float],
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool | None,
    kwargs_on_init: bool,
):
    """Verify the dynamic-patch overrides, given at init time or per call.

    The override kwargs are handed to ``build_model_context`` when
    ``kwargs_on_init`` is true, and forwarded to ``_run_check`` as per-call
    kwargs otherwise. When ``dynamic_image_size`` is false the expected
    patch range collapses to ``(1, 1)``.
    """
    mm_processor_kwargs = {
        "min_dynamic_patch": min_dynamic_patch,
        "max_dynamic_patch": max_dynamic_patch,
        "dynamic_image_size": dynamic_image_size,
    }

    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": len(size_factors)},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    # Exactly one of the two routes (init-time vs. per-call) carries the kwargs.
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    min_num = min_dynamic_patch if dynamic_image_size else 1
    max_num = max_dynamic_patch if dynamic_image_size else 1

    _run_check(
        processor,
        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
        min_num,
        max_num,
        hf_processor_mm_kwargs,
    )
|
||||
54
tests/models/multimodal/processing/test_phi3v.py
Normal file
54
tests/models/multimodal/processing/test_phi3v.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for phi3v's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
@pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img"),
    [
        ({"num_crops": 4}, 757),
        ({"num_crops": 16}, 1921),
        # the default num_crops of phi-3.5-vision is 4
        ({}, 757),
    ],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, int],
    expected_toks_per_img: int,
    num_imgs: int,
    kwargs_on_init: bool,
):
    """Check that Phi3VMultiModalProcessor respects the num_crops override."""
    # Imported here rather than at module level to avoid initializing CUDA early
    from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID

    if kwargs_on_init:
        init_kwargs, call_kwargs = mm_processor_kwargs, {}
    else:
        init_kwargs, call_kwargs = None, mm_processor_kwargs

    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=init_kwargs,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    # One numbered image tag per image, starting at 1.
    image_tags = "".join(f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1))
    prompt = f"<|user|>\n{image_tags}<|end|>\n<|assistant|>\n"
    inputs = {"image": [image_assets[0].pil_image] * num_imgs}

    outputs = processor.apply(prompt, inputs, call_kwargs)

    # The image-token count must scale with the number of images.
    img_tok_count = outputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
    assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
60
tests/models/multimodal/processing/test_phi4mm.py
Normal file
60
tests/models/multimodal/processing/test_phi4mm.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for phi4mm's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
@pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img"),
    [
        ({"dynamic_hd": 4}, 1329),
        ({"dynamic_hd": 16}, 4433),
        # the default num_crops of phi-4-multimodal is 36
        ({}, 9585),
    ],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, int],
    expected_toks_per_img: int,
    num_imgs: int,
    kwargs_on_init: bool,
):
    """Check that Phi4MMMultiModalProcessor respects the dynamic_hd override."""
    # Imported here rather than at module level to avoid initializing CUDA early
    from vllm.model_executor.models.phi4mm import _IMAGE_PLACEHOLDER_TOKEN_ID

    if kwargs_on_init:
        init_kwargs, call_kwargs = mm_processor_kwargs, {}
    else:
        init_kwargs, call_kwargs = None, mm_processor_kwargs

    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=init_kwargs,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    # One numbered image tag per image, starting at 1.
    image_tags = "".join(f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1))
    prompt = f"<|user|>\n{image_tags}<|end|>\n<|assistant|>\n"

    # Resize the asset to a square seven crop-sizes wide and high.
    crop_size = model_ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
    side = crop_size * 7
    dummy_image = image_assets[0].pil_image.resize((side, side))
    inputs = {"image": [dummy_image] * num_imgs}

    outputs = processor.apply(prompt, inputs, call_kwargs)

    # The placeholder-token count must scale with the number of images.
    img_tok_count = outputs["prompt_token_ids"].count(_IMAGE_PLACEHOLDER_TOKEN_ID)
    assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
90
tests/models/multimodal/processing/test_qwen2_vl.py
Normal file
90
tests/models/multimodal/processing/test_qwen2_vl.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
@pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
    [
        ({}, 1426, (5704, 1176)),
        ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
    ],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, object],
    expected_toks_per_img: int,
    expected_pixels_shape: tuple[int, int],
    num_imgs: int,
    kwargs_on_init: bool,
):
    """Check that the Qwen2-VL processor honors `min_pixels`/`max_pixels`.

    The override is supplied either when the model context is built
    (`kwargs_on_init=True`) or at `processor.apply` time
    (`kwargs_on_init=False`). Both the image-token count and the shape of
    `pixel_values` are compared against the expected per-image values.
    """
    init_kwargs = mm_processor_kwargs if kwargs_on_init else None
    call_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=init_kwargs,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    tokenizer = processor.info.get_tokenizer()

    # One vision placeholder group per image, all using the same test asset
    prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
    mm_data = {"image": [image_assets[0].pil_image] * num_imgs}

    processed_inputs = processor.apply(prompt, mm_data, call_kwargs)

    # Resolve the image token id through the HF processor and the tokenizer
    hf_processor = processor.info.get_hf_processor(**call_kwargs)
    image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)

    token_ids = processed_inputs["prompt_token_ids"]
    img_tok_count = token_ids.count(image_token_id)
    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape

    # Token count and the first pixel dimension scale with the image count;
    # the second pixel dimension is expected to stay fixed.
    assert img_tok_count == expected_toks_per_img * num_imgs
    assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
    assert pixel_shape[1] == expected_pixels_shape[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
@pytest.mark.parametrize("max_pixels", [1280 * 28 * 28, 1283 * 28 * 28])
def test_get_image_size_with_most_features(
    image_assets: ImageTestAssets,
    model_id: str,
    max_pixels: int,
):
    """Compare real image token counts against the reported maximum.

    The token count for the size returned by
    `get_image_size_with_most_features` is used as the bound. Every test
    asset is processed, and its token count (derived from `image_grid_thw`
    and the spatial merge size) must stay strictly below that bound.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs={"max_pixels": max_pixels},
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    info = processor.info

    hf_processor_mm_kwargs: dict[str, object] = {}
    hf_processor = info.get_hf_processor(**hf_processor_mm_kwargs)
    merge_size = info.get_hf_config().vision_config.spatial_merge_size

    # Bound: token count for the size the processor reports as the largest
    largest = info.get_image_size_with_most_features()
    max_tokens = info.get_num_image_tokens(
        image_width=largest.width,
        image_height=largest.height,
        image_processor=hf_processor.image_processor,
    )

    prompt = "<|vision_start|><|image_pad|><|vision_end|>"
    for asset in image_assets:
        processed_inputs = processor.apply(
            prompt,
            {"image": [asset.pil_image]},
            hf_processor_mm_kwargs,
        )
        mm_kwargs_data = processed_inputs["mm_kwargs"].get_data()
        t, h, w = mm_kwargs_data["image_grid_thw"].tolist()[0]

        # Each merge_size x merge_size group of grid cells yields one token
        tokens = (t * h * w) // (merge_size**2)
        assert tokens < max_tokens
|
||||
68
tests/models/multimodal/processing/test_smolvlm.py
Normal file
68
tests/models/multimodal/processing/test_smolvlm.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for smolvlm's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
from transformers import SmolVLMConfig
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
@pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img"),
    [
        ({"max_image_size": {"longest_edge": 384}}, 1377),
        ({"max_image_size": {"longest_edge": 768}}, 405),
    ],
)
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, object],
    expected_toks_per_img: int,
    num_imgs: int,
    kwargs_on_init: bool,
):
    """Check that the SmolVLM processor honors `max_image_size` overrides.

    The override is supplied either when the model context is built
    (`kwargs_on_init=True`) or at `processor.apply` time
    (`kwargs_on_init=False`). The processed token ids must match the ones
    produced by calling the HF processor directly, and the number of image
    tokens must equal `expected_toks_per_img * num_imgs`.
    """
    init_kwargs = mm_processor_kwargs if kwargs_on_init else None
    call_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=init_kwargs,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    # A single image uses a bare placeholder; several images are labelled
    if num_imgs == 1:
        placeholders = "<image>"
    else:
        placeholders = "\n".join(
            f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1)
        )
    prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501

    # Square dummy image whose side is 4x the vision tower's image size
    base_size = ctx.get_hf_config(SmolVLMConfig).vision_config.image_size
    side = base_size * 4
    dummy_image = image_assets[0].pil_image.resize((side, side))
    mm_data = {"image": [dummy_image] * num_imgs}

    processed_inputs = processor.apply(prompt, mm_data, call_kwargs)
    token_ids = processed_inputs["prompt_token_ids"]

    # The expanded prompt must match what the HF processor itself produces
    hf_processor = processor.info.get_hf_processor(**call_kwargs)
    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
    assert token_ids == hf_processed_inputs["input_ids"][0]

    # The image-token count must scale linearly with the number of images
    image_token_id = ctx.get_hf_config().image_token_id
    img_tok_count = token_ids.count(image_token_id)
    assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
258
tests/models/multimodal/processing/test_tensor_schema.py
Normal file
258
tests/models/multimodal/processing/test_tensor_schema.py
Normal file
@@ -0,0 +1,258 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import tempfile
|
||||
from collections.abc import Iterable
|
||||
from contextlib import contextmanager
|
||||
from functools import partial
|
||||
from typing import Any, TypeAlias
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.config.multimodal import (
|
||||
AudioDummyOptions,
|
||||
BaseDummyOptions,
|
||||
ImageDummyOptions,
|
||||
VideoDummyOptions,
|
||||
)
|
||||
from vllm.distributed import (
|
||||
cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.model_executor.models.interfaces import (
|
||||
SupportsMultiModal,
|
||||
supports_multimodal,
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.tokenizers import cached_tokenizer_from_config
|
||||
from vllm.utils.collection_utils import is_list_of
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
|
||||
from ....utils import create_new_process_for_each_test
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import dummy_hf_overrides
|
||||
from .test_common import get_model_ids_to_test, get_text_token_prompts
|
||||
|
||||
# Shapes of the per-modality dummy inputs used in this module.
ImageInput = list[Image.Image]
VideoInput: TypeAlias = (
    list[Image.Image] | list[np.ndarray] | list[tuple[np.ndarray, dict[str, Any]]]
)
AudioInput = list[tuple[np.ndarray, int]]


# Per-model-type dummy-input options, keyed by the HF config `model_type`.
MM_OPTIONS_OVERRIDES = {
    # Qwen3-VL's default profiling video size (64x64) can cause trouble
    # after resizing, so we override it here for testing.
    "qwen3_vl": {
        "video": VideoDummyOptions(num_frames=128, width=256, height=256),
    },
    "qwen3_vl_moe": {
        "video": VideoDummyOptions(num_frames=128, width=256, height=256),
    },
}
|
||||
|
||||
|
||||
def _resize_data(
    _data: Image.Image | list[Image.Image] | np.ndarray, size_factor: float
) -> Image.Image | list[Image.Image] | np.ndarray:
    """Shrink a single multimodal item by `size_factor`.

    Supported inputs:
      * a PIL image: width and height are scaled;
      * a list of PIL images (video frames): frame count, width and height
        are scaled;
      * a numpy array with at least 4 dimensions (video): the last four axes
        are read as (T, H, W, C) and T/H/W are cropped, C is kept;
      * a 1-D numpy array (audio): truncated to the scaled length.

    Scaled image/video dimensions are clamped to at least 1 so that a small
    factor never produces a zero-sized image or frame.

    Args:
        _data: The item to shrink.
        size_factor: Scale factor; must not exceed 1.

    Returns:
        The shrunk item, of the same kind as the input.

    Raises:
        AssertionError: If `size_factor` is greater than 1 or the input kind
            is not one of the supported ones.
    """
    assert size_factor <= 1, "Size factor must be less than or equal to 1"

    def _scale(value: int) -> int:
        # Clamp to 1 so no dimension collapses to zero.
        return max(int(value * size_factor), 1)

    # Image input
    if isinstance(_data, Image.Image):
        return _data.resize((_scale(_data.width), _scale(_data.height)))

    # Video input with PIL Images
    if is_list_of(_data, Image.Image):
        if not _data:
            # Nothing to resize; avoids reading a first frame that isn't there.
            return []
        first_frame = _data[0]
        num_frames = _scale(len(_data))
        frame_size = (_scale(first_frame.width), _scale(first_frame.height))
        return [frame.resize(frame_size) for frame in _data[:num_frames]]

    if isinstance(_data, np.ndarray):
        # Video input with numpy arrays
        if _data.ndim >= 4:
            T, H, W, C = _data.shape[-4:]
            return _data[..., : _scale(T), : _scale(H), : _scale(W), :C]
        # Audio input
        if _data.ndim == 1:
            return _data[: int(len(_data) * size_factor)]

    raise AssertionError("This line should be unreachable.")
|
||||
|
||||
|
||||
def resize_mm_data(
    data: ImageInput | VideoInput | AudioInput, size_factors: tuple[float, ...]
) -> ImageInput | VideoInput | AudioInput:
    """Resize each item of `data` by the size factor at the same position.

    `size_factors` is truncated to the length of `data` and the two are
    paired positionally. When the items are tuples, only the first element
    of each tuple is resized and returned; the second element is dropped.

    Raises:
        ValueError: If `data` is not a list of a supported item type.
    """
    factors = size_factors[: len(data)]

    # Items that are the payload itself
    if is_list_of(data, (Image.Image, np.ndarray, list)):
        return [_resize_data(item, factor) for item, factor in zip(data, factors)]

    # Items that are (payload, extra) pairs
    if is_list_of(data, tuple):
        return [
            _resize_data(payload, factor)
            for (payload, _), factor in zip(data, factors)
        ]

    raise ValueError("Unsupported multimodal data type.")
|
||||
|
||||
|
||||
def create_batched_mm_kwargs(
    model_cls: type[SupportsMultiModal],
    model_config: ModelConfig,
    processor: BaseMultiModalProcessor,
    size_factors: tuple[float, ...] = (1.0, 0.5, 0.25),
) -> Iterable[tuple[str, int, BatchedTensorInputs]]:
    """Build multimodal kwargs from resized dummy inputs, grouped by modality.

    Dummy processor inputs are generated for every supported modality (using
    a count of 3 where the processor reports no limit), resized with
    `size_factors`, run through `processor.apply`, and the resulting items
    are passed to `group_mm_kwargs_by_modality`.

    Args:
        model_cls: Model class; not used by this helper.
        model_config: Supplies the HF model type and the maximum model length.
        processor: Multimodal processor under test.
        size_factors: Per-item scale factors applied to the dummy data.

    Returns:
        Whatever `group_mm_kwargs_by_modality` yields for the collected items.
    """
    model_type = model_config.hf_config.model_type
    info = processor.info
    dummy_builder = processor.dummy_inputs

    supported_mm_limits = info.get_supported_mm_limits()
    mm_counts = {}
    for modality, limit in supported_mm_limits.items():
        # Fall back to 3 items when the modality has no limit
        mm_counts[modality] = limit if limit is not None else 3

    processor_inputs = dummy_builder.get_dummy_processor_inputs(
        seq_len=model_config.max_model_len,
        mm_counts=mm_counts,
        mm_options=MM_OPTIONS_OVERRIDES.get(model_type),
    )

    resized_mm_data = {}
    for modality, data in processor_inputs.mm_data.items():
        resized_mm_data[modality] = resize_mm_data(data, size_factors)

    # video metadata will be added back to the resized video data here.
    text_prompt, token_prompt = get_text_token_prompts(processor, resized_mm_data)
    prompt = text_prompt if text_prompt is not None else token_prompt

    processed = processor.apply(
        prompt=prompt,
        mm_data=resized_mm_data,
        hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
        tokenization_kwargs=processor_inputs.tokenization_kwargs,
    )
    mm_kwargs = processed["mm_kwargs"].require_data()

    # Flatten the per-modality items, in supported-modality order
    flat_items = []
    for modality in supported_mm_limits:
        flat_items.extend(mm_kwargs[modality])
    return group_mm_kwargs_by_modality(flat_items)
|
||||
|
||||
|
||||
# TODO(Isotr0py): Don't initialize model during test
@contextmanager
def initialize_dummy_model(
    model_cls: type[nn.Module],
    model_config: ModelConfig,
):
    """Yield an instance of `model_cls` inside a single-rank distributed env.

    A one-process distributed environment is initialized with a temporary
    file as the rendezvous point. The model is then constructed with the
    platform device as default device and `model_config.dtype` as default
    dtype, and yielded while the vLLM config is still active.

    On exit, whether or not the caller's body raised, the model reference
    is dropped, the distributed environment is cleaned up and the temporary
    file is removed.

    Args:
        model_cls: Model class, called as `model_cls(vllm_config=...)`.
        model_config: Model configuration wrapped into a `VllmConfig`.

    Yields:
        The constructed model.
    """
    import os  # local import: only needed for temp-file housekeeping

    # mkstemp also returns an open descriptor; only the path is needed, so
    # close it right away instead of leaking it.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)

    model = None
    try:
        init_distributed_environment(
            world_size=1,
            rank=0,
            distributed_init_method=f"file://{temp_file}",
            local_rank=0,
            backend="nccl",
        )
        try:
            initialize_model_parallel(tensor_model_parallel_size=1)

            current_device = torch.get_default_device()
            vllm_config = VllmConfig(model_config=model_config)
            with set_current_vllm_config(vllm_config=vllm_config):
                with set_default_torch_dtype(model_config.dtype):
                    torch.set_default_device(current_platform.device_type)
                    try:
                        model = model_cls(vllm_config=vllm_config)
                    finally:
                        # Restore the default device even if construction fails
                        torch.set_default_device(current_device)
                yield model
        finally:
            # Drop our reference before cleanup runs
            model = None
            cleanup_dist_env_and_memory()
    finally:
        try:
            os.remove(temp_file)
        except FileNotFoundError:
            # Already removed elsewhere; nothing left to clean up.
            pass
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
def test_model_tensor_schema(model_id: str):
    """Feed dummy multimodal kwargs to a model's input-parsing methods.

    Every attribute of the model class whose return annotation mentions
    "Input" is treated as an input-parsing method. A dummy model is built
    and each such method is called with the batched kwargs of every
    modality. The test is skipped when the class has no such method.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    # Look the architecture name up from the registry entry
    model_arch = next(
        arch for arch, info in HF_EXAMPLE_MODELS.hf_models.items() if info == model_info
    )

    hf_overrides_fn = partial(
        dummy_hf_overrides,
        model_arch=model_arch,
        exist_overrides=model_info.hf_overrides,
    )

    # ROCm: Detect if model uses AWQ quantization and set appropriate dtype
    is_rocm_awq = "awq" in model_id.lower() and current_platform.is_rocm()
    dtype = "float16" if is_rocm_awq else model_info.dtype

    embeds_only = model_info.require_embed_inputs
    model_config = ModelConfig(
        model_id,
        tokenizer=model_info.tokenizer or model_id,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=hf_overrides_fn,
        skip_tokenizer_init=embeds_only,
        enable_prompt_embeds=embeds_only,
        enable_mm_embeds=embeds_only,
        enforce_eager=model_info.enforce_eager,
        dtype=dtype,
    )

    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    assert supports_multimodal(model_cls)

    factories = model_cls._processor_factory

    def _returns_input(attr_name: str) -> bool:
        # True when the attribute's return annotation mentions "Input"
        attr = getattr(model_cls, attr_name)
        if not hasattr(attr, "__annotations__"):
            return False
        return_type = attr.__annotations__.get("return", None)
        return return_type is not None and "Input" in str(return_type)

    inputs_parse_methods = [name for name in dir(model_cls) if _returns_input(name)]
    if not inputs_parse_methods:
        pytest.skip(f"{model_arch} does not support tensor schema validation.")

    ctx = InputProcessingContext(
        model_config,
        tokenizer=cached_tokenizer_from_config(model_config),
    )
    supported_mm_limits = factories.info(ctx).get_supported_mm_limits()

    # Fall back to 3 items for modalities that report no limit
    limit_mm_per_prompt = {
        modality: (limit if limit is not None else 3)
        for modality, limit in supported_mm_limits.items()
    }

    options_by_modality = {
        "video": VideoDummyOptions,
        "image": ImageDummyOptions,
        "audio": AudioDummyOptions,
    }

    def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
        # Unknown modalities fall back to the base options class
        options_cls = options_by_modality.get(modality, BaseDummyOptions)
        return options_cls(count=count)

    model_config.get_multimodal_config().limit_per_prompt = {
        modality: _to_dummy_options(modality, count)
        for modality, count in limit_mm_per_prompt.items()
    }
    processor = factories.build_processor(ctx, cache=None)

    with initialize_dummy_model(model_cls, model_config) as model:
        batches = create_batched_mm_kwargs(model_cls, model_config, processor)
        for modality, _, mm_kwargs in batches:
            for method_name in inputs_parse_methods:
                print(
                    f"Testing `{method_name}` with modality={modality} "
                    f"and mm_kwargs{list(mm_kwargs.keys())}"
                )
                parse_method = getattr(model, method_name)
                parse_method(modality=modality, **mm_kwargs)
|
||||
56
tests/models/multimodal/processing/test_transformers.py
Normal file
56
tests/models/multimodal/processing/test_transformers.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_multimodal_processor(model_id):
    """Check that string and token-id prompts are processed identically.

    The same image is processed once with a text prompt and once with a
    list of token ids, using a model configured with
    `model_impl="transformers"`. Both runs must produce the same
    `prompt_token_ids`.
    """
    model_config = ModelConfig(model=model_id, model_impl="transformers")
    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)

    mm_data = {"image": ImageAsset("cherry_blossom").pil_image}

    str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n"  # noqa: E501
    str_processed_inputs = mm_processor.apply(
        prompt=str_prompt,
        mm_data=mm_data,
        hf_processor_mm_kwargs={},
    )

    # NOTE(review): presumably the tokenization of `str_prompt` - confirm
    # against the model's tokenizer.
    ids_prompt = [
        151644, 872, 220, 151646, 198, 3838,
        374, 279, 2213, 315, 419, 2168,
        30, 151645, 151644, 77091, 198,
    ]  # fmt: skip
    ids_processed_inputs = mm_processor.apply(
        prompt=ids_prompt,
        mm_data=mm_data,
        hf_processor_mm_kwargs={},
    )

    from_text = str_processed_inputs["prompt_token_ids"]
    from_ids = ids_processed_inputs["prompt_token_ids"]
    assert from_text == from_ids
|
||||
104
tests/models/multimodal/test_mapping.py
Normal file
104
tests/models/multimodal/test_mapping.py
Normal file
@@ -0,0 +1,104 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Iterable
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import transformers
|
||||
from transformers import AutoConfig, PreTrainedModel
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.model_executor.models.utils import WeightsMapper
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.transformers_utils.config import try_get_safetensors_metadata
|
||||
|
||||
from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
|
||||
|
||||
|
||||
def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
    """Create empty meta-device weights from safetensors checkpoint metadata.

    Weight names are read from the `weight_map` of the repository's
    safetensors metadata, and each name is paired with an empty tensor on
    the "meta" device.

    Args:
        repo: Repository identifier passed to `try_get_safetensors_metadata`.

    Returns:
        A list of `(weight_name, tensor)` pairs, one per weight-map entry.
    """
    # NOTE(review): the helper's name suggests the lookup may fail and return
    # None - confirm whether that needs handling here.
    metadata = try_get_safetensors_metadata(repo)
    weight_names = list(metadata.weight_map.keys())
    # Built eagerly with an explicit device: a generator returned from inside
    # a `with torch.device("meta")` block would only run after the block had
    # exited, so the device context would not apply to the tensors.
    return [(name, torch.empty(0, device="meta")) for name in weight_names]
|
||||
|
||||
|
||||
def create_dummy_model(repo: str, model_arch: str) -> PreTrainedModel:
    """Build an HF model of class `model_arch` from `repo`'s config.

    The class is looked up by name on the `transformers` module and
    instantiated from the repository's config inside a
    `torch.device("meta")` context.
    """
    hf_model_cls: type[PreTrainedModel] = getattr(transformers, model_arch)
    hf_config = AutoConfig.from_pretrained(repo)
    with torch.device("meta"):
        dummy_model = hf_model_cls._from_config(hf_config)
    return dummy_model
|
||||
|
||||
|
||||
def model_architectures_for_test() -> list[str]:
    """List the multimodal architectures covered by the mapper test.

    An architecture is selected when its example entry does not require
    `trust_remote_code`, `transformers` exposes a class of that name, and
    that class has a non-empty `_checkpoint_conversion_mapping`.
    """

    def _has_conversion_mapping(arch: str) -> bool:
        hf_model_cls = getattr(transformers, arch)
        return bool(getattr(hf_model_cls, "_checkpoint_conversion_mapping", None))

    return [
        arch
        for arch, info in _MULTIMODAL_EXAMPLE_MODELS.items()
        if not info.trust_remote_code
        and hasattr(transformers, arch)
        and _has_conversion_mapping(arch)
    ]
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model_arch", model_architectures_for_test())
def test_hf_model_weights_mapper(model_arch: str):
    """Check that `hf_to_vllm_mapper` maps both checkpoint layouts alike.

    Weight names from the repository's safetensors metadata and parameter
    names from a dummy HF model are both passed through the vLLM model
    class's `hf_to_vllm_mapper`. After discarding names that correspond to
    HF buffers, the two mapped name sets must be identical.
    """
    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    mistral_archs = [
        "Mistral3ForConditionalGeneration",
        "PixtralForConditionalGeneration",
        "VoxtralForConditionalGeneration",
    ]
    is_mistral_model = model_arch in mistral_archs

    # These architectures fall back to the "hf" tokenizer mode unless the
    # example entry explicitly asks for "mistral".
    if is_mistral_model and model_info.tokenizer_mode != "mistral":
        tokenizer_mode = "hf"
    else:
        tokenizer_mode = model_info.tokenizer_mode

    model_id = model_info.default
    embeds_only = model_info.require_embed_inputs

    model_config = ModelConfig(
        model_id,
        tokenizer=model_info.tokenizer or model_id,
        tokenizer_mode=tokenizer_mode,
        config_format="hf",
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=embeds_only,
        enable_prompt_embeds=embeds_only,
        enable_mm_embeds=embeds_only,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )
    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)

    original_weights = create_repo_dummy_weights(model_id)
    hf_dummy_model = create_dummy_model(model_id, model_arch)
    hf_converted_weights = hf_dummy_model.named_parameters()
    hf_converted_buffers = hf_dummy_model.named_buffers()
    mapper: WeightsMapper = model_cls.hf_to_vllm_mapper

    # Map all three sources before collecting names, then keep names only
    mapped_original_weights = mapper.apply(original_weights)
    mapped_hf_converted_weights = mapper.apply(hf_converted_weights)
    mapped_hf_converted_buffers = mapper.apply(hf_converted_buffers)

    ref_weight_names = {entry[0] for entry in mapped_original_weights}
    weight_names = {entry[0] for entry in mapped_hf_converted_weights}
    buffer_names = {entry[0] for entry in mapped_hf_converted_buffers}

    # Some checkpoints may have buffers, we ignore them for this test
    ref_weight_names = ref_weight_names - buffer_names

    weights_missing = ref_weight_names - weight_names
    weights_unmapped = weight_names - ref_weight_names
    assert not weights_missing and not weights_unmapped, (
        f"Following weights are not mapped correctly: {weights_unmapped}, "
        f"Missing expected weights: {weights_missing}."
    )
|
||||
Reference in New Issue
Block a user