add qwen3
0    vllm-v0.6.2/tests/models/__init__.py    Normal file
0    vllm-v0.6.2/tests/models/decoder_only/__init__.py    Normal file
@@ -0,0 +1,268 @@
from typing import List, Optional, Tuple, Type

import numpy as np
import pytest
import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer
from ...utils import check_logprobs_close

MODEL_NAME = "fixie-ai/ultravox-v0_3"

AudioTuple = Tuple[np.ndarray, int]

VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
HF_PLACEHOLDER = "<|audio|>"

CHUNKED_PREFILL_KWARGS = {
    "enable_chunked_prefill": True,
    "max_num_seqs": 2,
    # Use a very small limit to exercise chunked prefill.
    "max_num_batched_tokens": 16
}


@pytest.fixture(scope="session")
def audio_assets():
    from vllm.assets.audio import AudioAsset
    return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]


@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
def audio(request):
    from vllm.assets.audio import AudioAsset
    return AudioAsset(request.param)


@pytest.fixture(params=[
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def server(request, audio_assets):
    args = [
        "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
        f"--limit-mm-per-prompt=audio={len(audio_assets)}"
    ] + [
        f"--{key.replace('_','-')}={value}"
        for key, value in request.param.items()
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


def _get_prompt(audio_count, question, placeholder):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    placeholder = f"{placeholder}\n" * audio_count

    return tokenizer.apply_chat_template([{
        'role': 'user',
        'content': f"{placeholder}{question}"
    }],
                                         tokenize=False,
                                         add_generation_prompt=True)


def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = output_ids[:]
    hf_output_str = output_str
    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs


def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    prompts_and_audios: List[Tuple[str, str, AudioTuple]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    **kwargs,
):
    """Inference result should be the same between hf and vllm."""
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    with vllm_runner(model, dtype=dtype, enforce_eager=True,
                     **kwargs) as vllm_model:
        vllm_outputs_per_audio = [
            vllm_model.generate_greedy_logprobs([vllm_prompt],
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                audios=[audio])
            for vllm_prompt, _, audio in prompts_and_audios
        ]

    def process(hf_inputs: BatchEncoding, **kwargs):
        hf_inputs["audio_values"] = hf_inputs["audio_values"] \
            .to(torch_dtype)  # type: ignore
        return hf_inputs

    with hf_runner(model,
                   dtype=dtype,
                   postprocess_inputs=process,
                   auto_cls=AutoModel) as hf_model:
        import librosa

        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],
                max_tokens,
                num_logprobs=num_logprobs,
                audios=[(librosa.resample(audio[0],
                                          orig_sr=audio[1],
                                          target_sr=16000), 16000)])
            for _, hf_prompt, audio in prompts_and_audios
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio,
                                        vllm_outputs_per_audio):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                vllm_to_hf_output(vllm_output, model)
                for vllm_output in vllm_outputs
            ],
            name_0="hf",
            name_1="vllm",
        )


def run_multi_audio_test(
    vllm_runner: Type[VllmRunner],
    prompts_and_audios: List[Tuple[str, List[AudioTuple]]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    **kwargs,
):
    with vllm_runner(model,
                     dtype=dtype,
                     enforce_eager=True,
                     limit_mm_per_prompt={
                         "audio":
                         max((len(audio) for _, audio in prompts_and_audios))
                     },
                     **kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            [prompt for prompt, _ in prompts_and_audios],
            max_tokens,
            num_logprobs=num_logprobs,
            audios=[audios for _, audios in prompts_and_audios])

    # The HuggingFace model doesn't support multiple audios yet, so
    # just assert that some tokens were generated.
    assert all(tokens for tokens, *_ in vllm_outputs)


@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
                num_logprobs: int, vllm_kwargs: dict) -> None:

    vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
    hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
    run_test(
        hf_runner,
        vllm_runner,
        [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **vllm_kwargs,
    )


@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
                                     max_tokens: int, num_logprobs: int,
                                     vllm_kwargs: dict) -> None:

    vllm_prompt = _get_prompt(len(audio_assets),
                              "Describe each of the audios above.",
                              VLLM_PLACEHOLDER)
    run_multi_audio_test(
        vllm_runner,
        [(vllm_prompt, [audio.audio_and_sample_rate
                        for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **vllm_kwargs,
    )


@pytest.mark.asyncio
async def test_online_inference(client, audio_assets):
    """Exercises online inference with/without chunked prefill enabled."""

    messages = [{
        "role":
        "user",
        "content": [
            *[{
                "type": "audio_url",
                "audio_url": {
                    "url": audio.url
                }
            } for audio in audio_assets],
            {
                "type":
                "text",
                "text":
                f"What's happening in these {len(audio_assets)} audio clips?"
            },
        ],
    }]

    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
                                                           messages=messages,
                                                           max_tokens=10)

    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
69    vllm-v0.6.2/tests/models/decoder_only/language/test_aqlm.py    Normal file
@@ -0,0 +1,69 @@
"""Compare the outputs of an AQLM model between vLLM and HF Transformers

Run `pytest tests/models/test_aqlm.py`.
"""

import pytest

from tests.quantization.utils import is_quant_method_supported

# These ground truth generations were generated using `transformers==4.38.1
# aqlm==1.1.0 torch==2.2.0`
# and the below code:
# ```python
# from transformers import AutoTokenizer, AutoModelForCausalLM
# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
# quantized_model = AutoModelForCausalLM.from_pretrained(model_id,
#     torch_dtype="auto", device_map="cuda").cuda()
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# outputs = []
# for prompt in example_prompts:
#     input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
#     hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32)
#     outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:]))
# print(outputs)
# ```
ground_truth_generations = [
    '\n### Features\n\n- **High-throughput**: v',
    'The major milestones in the development of artificial intelligence from '
    '195',
    'Compare and contrast artificial intelligence with human intelligence in '
    'terms of processing information. The',
    'Explain the difference between supervised and unsupervised learning.'
    '\nExplain',
    'Write a short story about a robot that dreams for the first time. The',
    'Analyze the impact of the COVID-19 pandemic on global economic',
    'The Mona Lisa is a painting by Leonardo da Vinci, and it',
    'The early bird catches the worm.\nThe early bird catches the'
]


@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
                    reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("num_logprobs", [1])
def test_models(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    # loop through the prompts to compare against the ground truth generations
    for prompt_idx in range(len(example_prompts)):
        vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[
            prompt_idx]

        print("Prompt:          ", repr(example_prompts[prompt_idx]))
        print("Reference output:", repr(ground_truth_generations[prompt_idx]))
        print("vLLM output:     ", repr(vllm_output_str))
        assert vllm_output_str == ground_truth_generations[prompt_idx]
@@ -0,0 +1,70 @@
"""Compare the outputs of HF and vLLM when using greedy sampling.

This tests bigger models and uses half precision.

Run `pytest tests/models/test_big_models.py`.
"""
import pytest
import torch

from ...utils import check_outputs_equal

'''
=============================
Modify by vllm_mlu
=============================
@brief(MODELS): Only test Llama-2-7b-hf, disable gpt-j-6b.
'''
MODELS = [
    "meta-llama/Llama-2-7b-hf",
    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
    # "Deci/DeciLM-7b",  # Broken
    # "tiiuae/falcon-7b",  # Broken
    # "EleutherAI/gpt-j-6b",
    # "mosaicml/mpt-7b",  # Broken
    # "Qwen/Qwen1.5-0.5B"  # Broken,
]

# TODO: remove this after CPU float16 support is ready
target_dtype = "float"
if torch.cuda.is_available():
    target_dtype = "half"


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    with vllm_runner(model, dtype=dtype) as vllm_model:
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
        print(vllm_model.model.llm_engine.model_executor.driver_worker.
              model_runner.model)
100    vllm-v0.6.2/tests/models/decoder_only/language/test_fp8.py    Normal file
@@ -0,0 +1,100 @@
# flake8: noqa
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on an L4 GPU.
"""
import os
from typing import Optional

import pytest

from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported

from ...utils import check_logprobs_close

os.environ["TOKENIZERS_PARALLELISM"] = "true"


@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize(
    "kv_cache_dtype,base_model,test_model,scale_path",
    [
        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
         "meta-llama/Llama-3.2-1B-Instruct", None),
        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
        ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
         "meta-llama/Llama-2-7b-chat-hf",
         "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
    ])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
# Due to low-precision numerical divergence, this test is too sensitive for
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models(
    vllm_runner,
    example_prompts,
    kv_cache_dtype: str,
    base_model: str,
    test_model: str,
    scale_path: Optional[str],
    max_tokens: int,
    enforce_eager: bool,
    backend: str,
    tensor_parallel_size: int,
    disable_async_output_proc: bool,
    monkeypatch,
) -> None:
    """
    Only checks log probs match to cover the discrepancy in
    numerically sensitive kernels.
    """
    override_backend_env_variable(monkeypatch, backend)

    MAX_MODEL_LEN = 1024
    NUM_LOG_PROBS = 8

    with vllm_runner(
            base_model,
            max_model_len=MAX_MODEL_LEN,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            kv_cache_dtype="auto",
            disable_async_output_proc=disable_async_output_proc,
    ) as vllm_model:
        baseline_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    extra_kwargs = {}
    if scale_path is not None:
        extra_kwargs["quantization_param_path"] = scale_path

    with vllm_runner(
            test_model,
            max_model_len=MAX_MODEL_LEN,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            kv_cache_dtype=kv_cache_dtype,
            disable_async_output_proc=disable_async_output_proc,
            **extra_kwargs,
    ) as vllm_model:
        test_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    check_logprobs_close(
        outputs_0_lst=baseline_outputs,
        outputs_1_lst=test_outputs,
        name_0="fp16_kv_cache",
        name_1="fp8_kv_cache",
    )
87    vllm-v0.6.2/tests/models/decoder_only/language/test_gguf.py    Normal file
@@ -0,0 +1,87 @@
"""
Tests gguf models against unquantized model generations
Note: To pass the test, quantization higher than Q4 should be used
"""

import os

import pytest
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

from tests.quantization.utils import is_quant_method_supported

from ...utils import check_logprobs_close

os.environ["TOKENIZERS_PARALLELISM"] = "true"

MAX_MODEL_LEN = 1024


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
    ("meta-llama/Llama-3.2-1B-Instruct",
     "bartowski/Llama-3.2-1B-Instruct-GGUF",
     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
    ("meta-llama/Llama-3.2-1B-Instruct",
     "bartowski/Llama-3.2-1B-Instruct-GGUF",
     "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
    ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
     "qwen2-1_5b-instruct-q4_k_m.gguf"),
    ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
     "Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_models(
    num_gpus_available,
    vllm_runner,
    example_prompts,
    original_model,
    gguf_id,
    gguf_path,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    if num_gpus_available < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    gguf_model = hf_hub_download(gguf_id, filename=gguf_path)

    tokenizer = AutoTokenizer.from_pretrained(original_model)
    messages = [[{
        'role': 'user',
        'content': prompt
    }] for prompt in example_prompts]
    example_prompts = tokenizer.apply_chat_template(messages,
                                                    tokenize=False,
                                                    add_generation_prompt=True)

    # Run unquantized model.
    with vllm_runner(model_name=original_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as original_model:

        original_outputs = original_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)

    # Run gguf model.
    with vllm_runner(model_name=gguf_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as gguf_model:
        gguf_outputs = gguf_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,
        name_0="original",
        name_1="gguf",
    )
@@ -0,0 +1,84 @@
"""Compares the outputs of gptq vs gptq_marlin
Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 5 selections of each other.
Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.

Run `pytest tests/models/test_gptq_marlin.py`.
"""
import os

import pytest

from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT

from ...utils import check_logprobs_close

os.environ["TOKENIZERS_PARALLELISM"] = "true"

MAX_MODEL_LEN = 1024

MODELS = [
    # act_order==True, group_size=128
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),

    # 8-bit, act_order==True, group_size=channelwise
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),

    # 4-bit, act_order==True, group_size=128
    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
]


@pytest.mark.quant_model
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    model_name, revision = model

    # Run marlin.
    with vllm_runner(model_name=model_name,
                     revision=revision,
                     dtype=dtype,
                     quantization="marlin",
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=1) as gptq_marlin_model:

        gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
    _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error

    # Run gptq.
    # The naive gptq kernel doesn't support bf16 yet.
    # Here we always compare the fp16/bf16 gptq marlin kernel
    # to the fp16 gptq kernel.
    with vllm_runner(model_name=model_name,
                     revision=revision,
                     dtype="half",
                     quantization="gptq",
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=1) as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=gptq_marlin_outputs,
        name_0="gptq",
        name_1="gptq_marlin",
    )
@@ -0,0 +1,73 @@
"""Compare the outputs of a GPTQ model to a Marlin_24 model.

Note: GPTQ and Marlin_24 do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.

Run `pytest tests/models/test_marlin_24.py`.
"""
from dataclasses import dataclass

import pytest

from tests.quantization.utils import is_quant_method_supported

from ...utils import check_logprobs_close


@dataclass
class ModelPair:
    model_marlin: str
    model_gptq: str


model_pairs = [
    # 4-bit, group_size == 128
    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
    # # 4-bit, group_size == channelwise
    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),

    # 8-bit, group_size == 128
    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
    # # 8-bit, group_size == channelwise
    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
]


@pytest.mark.quant_model
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
                    reason="Marlin24 is not supported on this GPU type.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model_pair: ModelPair,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    with vllm_runner(model_pair.model_marlin,
                     dtype=dtype,
                     quantization="gptq_marlin_24") as marlin_24_model:
        marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model_pair.model_gptq, dtype=dtype,
                     quantization="gptq") as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=marlin_24_outputs,
        name_0="gptq",
        name_1="marlin_24",
    )
@@ -0,0 +1,41 @@
"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.

Run `pytest tests/models/test_granite.py`.
"""
import pytest

from ...utils import check_logprobs_close

MODELS = [
    # TODO(sang): Sliding window should be tested separately.
    "ibm/PowerLM-3b",
    "ibm/PowerMoE-3b",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
299    vllm-v0.6.2/tests/models/decoder_only/language/test_jamba.py    Normal file
@@ -0,0 +1,299 @@
import pytest

from tests.utils import multi_gpu_test
from vllm.sampling_params import SamplingParams
from vllm.worker.model_runner import _get_graph_batch_size

from ...utils import check_outputs_equal

MODELS = ["ai21labs/Jamba-tiny-dev"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:

    with hf_runner(
            model,
            dtype=dtype,
            model_kwargs={
                "use_mamba_kernels":
                False,  # mamba kernels are not installed, so HF
                # doesn't use them
            }) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
        print(vllm_model.model.llm_engine.model_executor.driver_worker.
              model_runner.model)

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_batching(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    # To pass the small model tests, we need full precision.
    for_loop_outputs = []
    with vllm_runner(model, dtype=dtype) as vllm_model:
        for prompt in example_prompts:
            for_loop_outputs.append(
                vllm_model.generate_greedy([prompt], max_tokens)[0])

        batched_outputs = vllm_model.generate_greedy(example_prompts,
                                                     max_tokens)

    check_outputs_equal(
        outputs_0_lst=for_loop_outputs,
        outputs_1_lst=batched_outputs,
        name_0="for_loop_vllm",
        name_1="batched_vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_mamba_prefill_chunking_with_parallel_sampling(
        hf_runner, vllm_runner, example_prompts, model: str, dtype: str,
        max_tokens: int) -> None:
    # Tests prefill chunking in conjunction with n>1. In this case,
    # prefill is populated with decoding tokens and we test that it
    # doesn't fail. This test might fail if cache is not allocated
    # correctly for n > 1 decoding steps inside a
    # chunked prefill forward pass (where we have both prefills
    # and decoding together)
    sampling_params = SamplingParams(n=3,
                                     temperature=1,
                                     seed=0,
                                     max_tokens=max_tokens)
    with vllm_runner(
            model,
            dtype=dtype,
            enable_chunked_prefill=True,
            max_num_batched_tokens=30,
            max_num_seqs=10  # forces prefill chunks with decoding
    ) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
                                model: str, dtype: str,
                                max_tokens: int) -> None:
    # numeric error during prefill chunking produces different generation
    # compared to w/o prefill chunking for those examples, removed them for now
    example_prompts.pop(7)
    example_prompts.pop(2)
    example_prompts.pop(1)

    with hf_runner(
            model,
            dtype=dtype,
            model_kwargs={
                "use_mamba_kernels":
                False,  # mamba kernels are not installed, so HF
                # doesn't use them
            }) as hf_model:
        non_chunked = hf_model.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(model,
                     dtype=dtype,
                     enable_chunked_prefill=True,
                     max_num_batched_tokens=5,
                     max_num_seqs=2) as vllm_model:
        chunked = vllm_model.generate_greedy(example_prompts,
                                             max_tokens=max_tokens)

    check_outputs_equal(
        outputs_0_lst=chunked,
        outputs_1_lst=non_chunked,
        name_0="chunked",
        name_1="non_chunked",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [15])
def test_parallel_sampling(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:

    with vllm_runner(model, dtype=dtype) as vllm_model:
        for_loop_outputs = []
        for _ in range(10):
            for_loop_outputs.append(
                # using example_prompts index 1 instead of 0 since with 0 the
                # logprobs get really close and the test doesn't pass
                vllm_model.generate_greedy([example_prompts[1]], max_tokens)
                [0])
        sampling_params = SamplingParams(n=10,
                                         temperature=0.001,
                                         seed=0,
                                         max_tokens=max_tokens)
        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
                                             sampling_params)
    token_ids, texts = n_lt_1_outputs[0]
    n_lt_1_outputs = [(token_id, text)
                      for token_id, text in zip(token_ids, texts)]

    check_outputs_equal(
        outputs_0_lst=n_lt_1_outputs,
        outputs_1_lst=for_loop_outputs,
        name_0="vllm_n_lt_1_outputs",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20])
def test_mamba_cache_cg_padding(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    # This test is for verifying that mamba cache is padded to CG captured
    # batch size. If it's not, a torch RuntimeError will be raised because
    # tensor dimensions aren't compatible
    while len(example_prompts) == _get_graph_batch_size(len(example_prompts)):
        example_prompts.append(example_prompts[0])

    try:
        with vllm_runner(model, dtype=dtype) as vllm_model:
            vllm_model.generate_greedy(example_prompts, max_tokens)
    except RuntimeError:
        pytest.fail(
            "Couldn't run batch size which is not equal to a Cuda Graph "
            "captured batch size. "
            "Could be related to mamba cache not padded correctly")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    # Tests that outputs are identical with and w/o preemptions (recompute)
    assert dtype == "float"

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_model.model.llm_engine.scheduler[
            0].ENABLE_ARTIFICIAL_PREEMPT = True
        preempt_vllm_outputs = vllm_model.generate_greedy(
            example_prompts, max_tokens)

        vllm_model.model.llm_engine.scheduler[
            0].ENABLE_ARTIFICIAL_PREEMPT = False
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=preempt_vllm_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="vllm_preemptions",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    # This test is for verifying that the Jamba inner state management doesn't
    # collapse in case where the number of incoming requests and
    # finished_requests_ids is larger than the maximum mamba block capacity.
    # This could generally happen because Jamba supports a stateless
    # mechanism where it can clean up new incoming requests in
    # a single step.
    try:
        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        pytest.fail("Jamba inner state wasn't cleaned up properly between "
                    "steps, finished requests registered unnecessarily")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_state_cleanup(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    # This test is for verifying that the Jamba state is cleaned up between
    # steps. If it's not cleaned, an error would be expected.
    try:
        with vllm_runner(model, dtype=dtype) as vllm_model:
            for _ in range(10):
                vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
    except ValueError:
        pytest.fail("Jamba inner state wasn't cleaned up between steps, "
                    "could be related to finished_requests_ids")


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
def test_jamba_distributed_produces_identical_generation(
        vllm_runner, model: str, dtype: str, max_tokens: int,
        example_prompts) -> None:

    with vllm_runner(model, dtype=dtype, tensor_parallel_size=2) as vllm_model:
        vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts,
                                                       max_tokens)

    with vllm_runner(model, dtype=dtype, tensor_parallel_size=1) as vllm_model:
        vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts,
                                                       max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_outputs_tp_1,
        outputs_1_lst=vllm_outputs_tp_2,
        name_0="vllm_tp_1",
        name_1="vllm_tp_2",
    )
285    vllm-v0.6.2/tests/models/decoder_only/language/test_mamba.py    Normal file
@@ -0,0 +1,285 @@
"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba.

Run `pytest tests/models/test_mamba.py`.
"""
import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer

from vllm.sampling_params import SamplingParams
from vllm.worker.model_runner import _get_graph_batch_size

from ...utils import check_outputs_equal

MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"]


# Use lower-level interfaces to create this greedy generator, as mamba will
# choke on the model_kwarg 'attention_mask' if hf_model.generate_greedy is used.
def generate_greedy(model_name, example_prompts, max_tokens):
    # Create a text generation pipeline
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Generate texts from the prompts
    outputs = []
    for prompt in example_prompts:
        # Tokenize the input prompt with truncation
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
        input_ids = inputs["input_ids"].to(model.device)

        # Generate text using the model's generate method directly
        generated_ids = model.generate(input_ids, max_new_tokens=max_tokens)
        generated_text = tokenizer.decode(generated_ids[0],
                                          skip_special_tokens=True)

        outputs.append((generated_ids[0].tolist(), generated_text))

    return outputs


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_models(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    hf_outputs = generate_greedy(model, example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
        print(vllm_model.model.llm_engine.model_executor.driver_worker.
              model_runner.model)

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_batching(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    # To pass the small model tests, we need full precision.
    for_loop_outputs = []
    with vllm_runner(model, dtype=dtype) as vllm_model:
        for prompt in example_prompts:
            for_loop_outputs.append(
                vllm_model.generate_greedy([prompt], max_tokens)[0])

        batched_outputs = vllm_model.generate_greedy(example_prompts,
                                                     max_tokens)

    check_outputs_equal(
        outputs_0_lst=for_loop_outputs,
        outputs_1_lst=batched_outputs,
        name_0="for_loop_vllm",
        name_1="batched_vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [10])
def test_chunked_prefill_with_parallel_sampling(vllm_runner, example_prompts,
                                                model: str, dtype: str,
                                                max_tokens: int) -> None:
    # Tests chunked prefill in conjunction with n>1. In this case, prefill is
    # populated with decoding tokens and we test that it doesn't fail.
    # This test might fail if cache is not allocated correctly for n > 1
    # decoding steps inside a chunked prefill forward pass (where we have both
    # prefill and decode together)
    sampling_params = SamplingParams(n=3,
                                     temperature=1,
                                     seed=0,
                                     max_tokens=max_tokens)
    with vllm_runner(
            model,
            dtype=dtype,
            enable_chunked_prefill=True,
            max_num_batched_tokens=30,
            max_num_seqs=10  # forces prefill chunks with decoding
    ) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
def test_chunked_prefill(vllm_runner, example_prompts, model: str, dtype: str,
                         max_tokens: int,
                         chunked_prefill_token_size: int) -> None:
    """
    Checks exact match decode between huggingface model and vllm runner with
    chunked prefill.
    """
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size

    non_chunked = generate_greedy(model, example_prompts, max_tokens)

    with vllm_runner(model,
                     dtype=dtype,
                     enable_chunked_prefill=True,
                     max_num_batched_tokens=max_num_batched_tokens,
                     max_num_seqs=max_num_seqs) as vllm_model:
        chunked = vllm_model.generate_greedy(example_prompts,
                                             max_tokens=max_tokens)

    check_outputs_equal(
        outputs_0_lst=chunked,
        outputs_1_lst=non_chunked,
        name_0="chunked",
        name_1="non_chunked",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [15])
def test_parallel_sampling(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:

    with vllm_runner(model, dtype=dtype) as vllm_model:
        for_loop_outputs = []
        for _ in range(10):
            for_loop_outputs.append(
                # using example_prompts index 1 instead of 0 since with 0 the
                # logprobs get really close and the test doesn't pass
                vllm_model.generate_greedy([example_prompts[1]], max_tokens)
                [0])
        sampling_params = SamplingParams(n=10,
                                         temperature=0.001,
                                         seed=0,
                                         max_tokens=max_tokens)
        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
                                             sampling_params)
    token_ids, texts = n_lt_1_outputs[0]
    n_lt_1_outputs = [(token_id, text)
                      for token_id, text in zip(token_ids, texts)]

    check_outputs_equal(
        outputs_0_lst=n_lt_1_outputs,
        outputs_1_lst=for_loop_outputs,
        name_0="vllm_n_lt_1_outputs",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20])
def test_mamba_cache_cg_padding(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    # This test is for verifying that mamba cache is padded to CG captured
    # batch size. If it's not, a torch RuntimeError will be raised because
    # tensor dimensions aren't compatible
    while len(example_prompts) == _get_graph_batch_size(len(example_prompts)):
        example_prompts.append(example_prompts[0])

    try:
        with vllm_runner(model, dtype=dtype) as vllm_model:
            vllm_model.generate_greedy(example_prompts, max_tokens)
    except RuntimeError:
        pytest.fail(
            "Couldn't run batch size which is not equal to a Cuda Graph "
            "captured batch size. "
            "Could be related to mamba cache not padded correctly")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    # Tests that outputs are identical with and w/o preemptions (recompute)
    assert dtype == "float"

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_model.model.llm_engine.scheduler[
            0].ENABLE_ARTIFICIAL_PREEMPT = True
        preempt_vllm_outputs = vllm_model.generate_greedy(
            example_prompts, max_tokens)

        vllm_model.model.llm_engine.scheduler[
            0].ENABLE_ARTIFICIAL_PREEMPT = False
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=preempt_vllm_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="vllm_preemptions",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    # This test is for verifying that the Mamba inner state management doesn't
    # collapse in case where the number of incoming requests and
    # finished_requests_ids is larger than the maximum Mamba block capacity.
    # This could generally happen because Mamba supports a stateless
    # mechanism where it can clean up new incoming requests in
    # a single step.
    try:
        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        pytest.fail("Mamba inner state wasn't cleaned up properly between "
                    "steps, finished requests registered unnecessarily")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_state_cleanup(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    # This test is for verifying that the Mamba state is cleaned up between
    # steps. If it's not cleaned, an error would be expected.
    try:
        with vllm_runner(model, dtype=dtype) as vllm_model:
            for _ in range(10):
                vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
    except ValueError:
        pytest.fail("Mamba inner state wasn't cleaned up between steps, "
                    "could be related to finished_requests_ids")
253    vllm-v0.6.2/tests/models/decoder_only/language/test_mistral.py    Normal file
@@ -0,0 +1,253 @@
|
||||
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import copy
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa
|
||||
MistralToolParser)
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.1",
|
||||
]
|
||||
|
||||
MISTRAL_FORMAT_MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||
# uses the v3-Tekken tokenizer
|
||||
"mistralai/Ministral-8B-Instruct-2410",
|
||||
# Mistral-Nemo is to big for CI, but passes locally
|
||||
# "mistralai/Mistral-Nemo-Instruct-2407"
|
||||
]
|
||||
|
||||
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||
SYMBOLIC_LANG_PROMPTS = [
|
||||
"勇敢な船乗りについての詩を書く", # japanese
|
||||
"寫一首關於勇敢的水手的詩", # chinese
|
||||
"ပုံပြင်လေးပြောပြပါ်:\n", # burmese
|
||||
"Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625
|
||||
]
|
||||
|
||||
# for function calling
|
||||
TOOLS = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The city to find the weather for, e.g. 'San Francisco'"
|
||||
},
|
||||
"state": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"the two-letter abbreviation for the state that the city is"
|
||||
" in, e.g. 'CA' which would mean 'California'"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"]
|
||||
}
|
||||
},
|
||||
"required": ["city", "state", "unit"]
|
||||
}
|
||||
},
|
||||
}, {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rewrite",
|
||||
"description": "Rewrites text",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"required": [],
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The input text to rewrite."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}]
|
||||
MSGS = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are an assistant."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"assistant",
|
||||
"content":
|
||||
"",
|
||||
"tool_calls": [{
|
||||
"id": "bbc5b7ede",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name":
|
||||
"rewrite",
|
||||
"arguments":
|
||||
'{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa
|
||||
}
|
||||
}]
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"content":
|
||||
"{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa
|
||||
"tool_call_id": "bbc5b7ede",
|
||||
"name": "rewrite"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "---\n\nMy English needs improving, maybe I make errors"
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": ("Can you tell me what the temperate"
|
||||
" will be in Dallas, in fahrenheit?")
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# TODO(sang): Sliding window should be tested separately.
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model, dtype=dtype,
|
||||
tokenizer_mode="mistral") as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_mistral_format(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="auto",
|
||||
load_format="safetensors",
|
||||
config_format="hf",
|
||||
) as hf_format_model:
|
||||
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
) as mistral_format_model:
|
||||
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_format_outputs,
|
||||
outputs_1_lst=mistral_format_outputs,
|
||||
name_0="hf",
|
||||
name_1="mistral",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_mistral_symbolic_languages(
|
||||
vllm_runner,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
max_model_len=8192,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral") as vllm_model:
|
||||
for prompt in SYMBOLIC_LANG_PROMPTS:
|
||||
msg = {"role": "user", "content": prompt}
|
||||
outputs = vllm_model.model.chat([msg],
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
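# "\ufffd" is the Unicode replacement character; seeing it would mean the tokenizer corrupted multi-byte (symbolic-language) text.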
assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("model",
|
||||
MISTRAL_FORMAT_MODELS) # v1 can't do func calling
|
||||
def test_mistral_function_calling(
|
||||
vllm_runner,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral") as vllm_model:
|
||||
|
||||
msgs = copy.deepcopy(MSGS)
|
||||
outputs = vllm_model.model.chat(msgs,
|
||||
tools=TOOLS,
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
tool_parser = MistralToolParser(tokenizer)
|
||||
|
||||
model_output = outputs[0].outputs[0].text.strip()
|
||||
assert model_output.startswith(tool_parser.bot_token), model_output
|
||||
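# Parse the raw tool-call text emitted by the model back into structured tool calls.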
parsed_message = tool_parser.extract_tool_calls(model_output, None)
|
||||
|
||||
assert parsed_message.tools_called
|
||||
assert parsed_message.tool_calls[0].id == "0UAqFzWsD"
|
||||
assert parsed_message.tool_calls[
|
||||
0].function.name == "get_current_weather"
|
||||
assert parsed_message.tool_calls[
|
||||
0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa
|
||||
assert parsed_message.content is None
|
||||
@@ -0,0 +1,80 @@
|
||||
# flake8: noqa
|
||||
"""Tests Model Optimizer fp8 models against ground truth generation
|
||||
Note: these tests will only pass on H100
|
||||
"""
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
|
||||
MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
|
||||
|
||||
EXPECTED_STRS_MAP = {
|
||||
"nvidia/Llama-3.1-8B-Instruct-FP8": [
|
||||
"You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
|
||||
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
|
||||
'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
|
||||
'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
|
||||
'**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
|
||||
'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
|
||||
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
|
||||
'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
# This test compares against golden strings for exact match since
|
||||
# there is no baseline implementation to compare against
|
||||
# and is unstable w.r.t specifics of the fp8 implementation or
|
||||
# the hardware being run on.
|
||||
# Disabled to prevent it from breaking the build
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Prevent unstable test based on golden strings from breaking the build.")
|
||||
@pytest.mark.quant_model
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model_name", MODELS)
|
||||
def test_models(example_prompts, model_name) -> None:
|
||||
model = LLM(
|
||||
model=model_name,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
quantization="modelopt",
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
formatted_prompts = [
|
||||
tokenizer.apply_chat_template([{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
for prompt in example_prompts
|
||||
]
|
||||
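# Greedy decoding (temperature=0) keeps the generations deterministic so they can be compared to the golden strings.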
params = SamplingParams(max_tokens=20, temperature=0)
|
||||
generations: List[str] = []
|
||||
# Note: these need to be run 1 at a time due to numerical precision,
|
||||
# since the expected strs were generated this way.
|
||||
for prompt in formatted_prompts:
|
||||
outputs = model.generate(prompt, params)
|
||||
generations.append(outputs[0].outputs[0].text)
|
||||
del model
|
||||
|
||||
print(model_name, generations)
|
||||
expected_strs = EXPECTED_STRS_MAP[model_name]
|
||||
for i in range(len(example_prompts)):
|
||||
generated_str = generations[i]
|
||||
expected_str = expected_strs[i]
|
||||
assert expected_str == generated_str, (
|
||||
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
|
||||
@@ -0,0 +1,88 @@
|
||||
"""Compare the outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_models.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modified by vllm_mlu
|
||||
=============================
|
||||
@brief(MODELS): Only test gpt2, Llama-3.2-1B-Instruct, and opt-125m.
|
||||
'''
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
# pytest.param(
|
||||
# "bigscience/bloom-560m", # bloom - testing alibi slopes
|
||||
# marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
# ),
|
||||
pytest.param(
|
||||
"openai-community/gpt2", # gpt2
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
# pytest.param("Milos/slovak-gpt-j-405M"), # gptj
|
||||
# pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode
|
||||
# pytest.param("EleutherAI/pythia-70m"), # gpt_neox
|
||||
# pytest.param(
|
||||
# "google/gemma-1.1-2b-it", # gemma
|
||||
# marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
# ),
|
||||
pytest.param(
|
||||
"meta-llama/Llama-3.2-1B-Instruct", # llama
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
# pytest.param(
|
||||
# "openbmb/MiniCPM3-4B",
|
||||
# # fused_moe not supported on CPU
|
||||
# marks=[pytest.mark.core_model],
|
||||
# ),
|
||||
pytest.param(
|
||||
"facebook/opt-125m", # opt
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
# pytest.param(
|
||||
# "microsoft/phi-2", # phi
|
||||
# marks=[pytest.mark.core_model],
|
||||
# ),
|
||||
# pytest.param(
|
||||
# "Qwen/Qwen2.5-0.5B-Instruct", # qwen2
|
||||
# marks=[pytest.mark.core_model],
|
||||
# ),
|
||||
# pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm
|
||||
# pytest.param("bigcode/starcoder2-3b"), # starcoder2
|
||||
])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
# This test is for verifying whether the model's extra_repr
|
||||
# can be printed correctly.
|
||||
print(vllm_model.model.llm_engine.model_executor.driver_worker.
|
||||
model_runner.model)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
102
vllm-v0.6.2/tests/models/decoder_only/language/test_phimoe.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Compare the outputs of HF and vLLM for moe models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_phimoe.py`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"microsoft/Phi-3.5-MoE-instruct",
|
||||
]
|
||||
|
||||
|
||||
def test_phimoe_routing_function():
|
||||
from vllm.model_executor.models.phimoe import phimoe_routing_function
|
||||
test_case = {
|
||||
0: {
|
||||
"hidden_states":
|
||||
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
|
||||
dtype=torch.float32,
|
||||
requires_grad=False).view(4, 2),
|
||||
"gating_output":
|
||||
torch.tensor([0.1, 0.2, 0.3, 0.4],
|
||||
dtype=torch.float32,
|
||||
requires_grad=False),
|
||||
"topk":
|
||||
2,
|
||||
"renormalize":
|
||||
False,
|
||||
},
|
||||
1: {
|
||||
"hidden_states":
|
||||
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
|
||||
dtype=torch.float32,
|
||||
requires_grad=False).view(4, 2),
|
||||
"gating_output":
|
||||
torch.tensor([0.4, 0.2, 0.3, 0.4],
|
||||
dtype=torch.float32,
|
||||
requires_grad=False),
|
||||
"topk":
|
||||
2,
|
||||
"renormalize":
|
||||
False,
|
||||
}
|
||||
}
|
||||
|
||||
ground_truth = {
|
||||
0: {
|
||||
"topk_weights":
|
||||
torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False),
|
||||
"topk_ids":
|
||||
torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
|
||||
},
|
||||
1: {
|
||||
"topk_weights":
|
||||
torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False),
|
||||
"topk_ids":
|
||||
torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
|
||||
}
|
||||
}
|
||||
|
||||
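# The routing function should reproduce the precomputed top-k weights and expert ids for both cases.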
for test_id in test_case:
|
||||
topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
|
||||
assert torch.allclose(topk_weights,
|
||||
ground_truth[test_id]["topk_weights"])
|
||||
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
|
||||
reason="This test takes a lot time to run on CPU, "
|
||||
"and vllm CI's disk space is not enough for this model.")
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@@ -0,0 +1,187 @@
|
||||
"""Tests for Idefics3's multimodal preprocessing kwargs."""
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import transformers
|
||||
from transformers import AutoImageProcessor, AutoTokenizer
|
||||
|
||||
from vllm.inputs import InputContext, token_inputs
|
||||
from vllm.multimodal import MultiModalRegistry
|
||||
|
||||
from .....conftest import _ImageAssets
|
||||
from ....utils import build_model_context
|
||||
|
||||
models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
|
||||
|
||||
|
||||
# Wrap lazy imports to avoid initializing CUDA during test collection
|
||||
@pytest.fixture()
|
||||
def input_processor_for_idefics3():
|
||||
from vllm.model_executor.models.idefics3 import (
|
||||
input_processor_for_idefics3)
|
||||
return input_processor_for_idefics3
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def dummy_data_for_idefics3():
|
||||
from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3
|
||||
return dummy_data_for_idefics3
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def get_max_idefics3_image_tokens():
|
||||
from vllm.model_executor.models.idefics3 import (
|
||||
get_max_idefics3_image_tokens)
|
||||
return get_max_idefics3_image_tokens
|
||||
|
||||
|
||||
@pytest.mark.skipif(transformers.__version__ < "4.46.0",
|
||||
reason="Model introduced in HF >= 4.46.0")
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336])
|
||||
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
|
||||
longest_edge: Optional[int]):
|
||||
"""Ensure that the [default] input mapper handles size properly."""
|
||||
|
||||
mm_processor_kwargs = {
|
||||
"size": {
|
||||
"longest_edge": longest_edge
|
||||
}
|
||||
} if longest_edge is not None else {}
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
hf_processor = AutoImageProcessor.from_pretrained(model,
|
||||
trust_remote_code=True,
|
||||
**mm_processor_kwargs)
|
||||
|
||||
mm_registry = MultiModalRegistry()
|
||||
mm_registry.init_mm_limits_per_prompt(ctx.model_config)
|
||||
|
||||
image = image_assets[0].pil_image
|
||||
hf_result = hf_processor.preprocess(
|
||||
image,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
vllm_result = mm_registry.map_input(
|
||||
ctx.model_config,
|
||||
{"image": image},
|
||||
)
|
||||
|
||||
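# The default vLLM input mapper should produce exactly the same pixel values as the HF image processor.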
assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(transformers.__version__ < "4.46.0",
|
||||
reason="Model introduced in HF >= 4.46.0")
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("longest_edge, expected_max_tokens", [
|
||||
(None, 2873),
|
||||
(168, 169),
|
||||
(336, 169),
|
||||
(400, 338),
|
||||
(672, 338),
|
||||
])
|
||||
def test_max_tokens_override(get_max_idefics3_image_tokens, model: str,
|
||||
longest_edge: Optional[int],
|
||||
expected_max_tokens: int):
|
||||
"""Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
|
||||
size = {"longest_edge": longest_edge} if longest_edge is not None else None
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
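# The size override is passed directly to the helper here; in normal use it would come from mm_processor_kwargs.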
actual_max_tokens = get_max_idefics3_image_tokens(
|
||||
ctx=InputContext(ctx.model_config),
|
||||
size=size,
|
||||
)
|
||||
|
||||
assert expected_max_tokens == actual_max_tokens
|
||||
|
||||
|
||||
@pytest.mark.skipif(transformers.__version__ < "4.46.0",
|
||||
reason="Model introduced in HF >= 4.46.0")
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [
|
||||
(168, 169, 1),
|
||||
(168, 169, 2),
|
||||
(400, 338, 1),
|
||||
(400, 338, 2),
|
||||
])
|
||||
def test_dummy_data_override(dummy_data_for_idefics3, model: str,
|
||||
longest_edge: int, toks_per_img: int,
|
||||
num_imgs: int):
|
||||
"""Ensure dummy_data_for_idefics3 handles num_crops properly."""
|
||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
# the partial when calling the dummy data func.
|
||||
size = {"longest_edge": longest_edge} if longest_edge is not None else None
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
dummy_data = dummy_data_for_idefics3(
|
||||
ctx=ctx,
|
||||
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
|
||||
mm_counts={"image": num_imgs},
|
||||
size=size)
|
||||
sequence_data = dummy_data.seq_data
|
||||
# Ensure we have the right number of placeholders per size
|
||||
image_token_id = ctx.get_hf_config().image_token_id
|
||||
img_tok_count = sequence_data.get_token_ids().count(image_token_id)
|
||||
assert img_tok_count == toks_per_img * num_imgs
|
||||
|
||||
|
||||
@pytest.mark.skipif(transformers.__version__ < "4.46.0",
|
||||
reason="Model introduced in HF >= 4.46.0")
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [
|
||||
(336, 169 * (1**2 + 1), 1),
|
||||
(336, 169 * (1**2 + 1), 2),
|
||||
(400, 169 * (2**2 + 1), 1),
|
||||
(400, 169 * (2**2 + 1), 2),
|
||||
])
|
||||
def test_input_processor_override(input_processor_for_idefics3,
|
||||
image_assets: _ImageAssets, model: str,
|
||||
longest_edge: int,
|
||||
expected_toks_per_img: int, num_imgs: int):
|
||||
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
|
||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
# the partial when calling the custom input processor.
|
||||
size = {"longest_edge": longest_edge} if longest_edge is not None else None
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
placeholders = "<image>" if num_imgs == 1 else "\n".join(
|
||||
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
|
||||
prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
|
||||
images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs
|
||||
|
||||
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
|
||||
prompt=prompt,
|
||||
multi_modal_data={"image": images})
|
||||
|
||||
processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = ctx.get_hf_config().image_token_id
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
@@ -0,0 +1,70 @@
|
||||
import pytest
|
||||
|
||||
from vllm.inputs import InputContext
|
||||
|
||||
from ....utils import build_model_context
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def get_max_llava_next_image_tokens():
|
||||
from vllm.model_executor.models.llava_next import (
|
||||
get_max_llava_next_image_tokens)
|
||||
return get_max_llava_next_image_tokens
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def dummy_data_for_llava_next():
|
||||
from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
|
||||
return dummy_data_for_llava_next
|
||||
|
||||
|
||||
@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
|
||||
([[336, 336]], 1176),
|
||||
([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
|
||||
])
|
||||
def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
|
||||
get_max_llava_next_image_tokens):
|
||||
ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
|
||||
|
||||
# Update the config image_grid_pinpoints
|
||||
# and calculate the resulting max tokens
|
||||
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
|
||||
|
||||
actual_max_tokens = get_max_llava_next_image_tokens(
|
||||
InputContext(ctx.model_config))
|
||||
|
||||
assert expected_max_tokens == actual_max_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"gridpoints,expected_size",
|
||||
[
|
||||
# One point; it has to be the largest
|
||||
([[336, 336]], (336, 336)),
|
||||
# Default for most llava next models; the 2x2 tile is the largest
|
||||
([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
|
||||
(672, 672)),
|
||||
# If two rectangular gridpoints are the same, the more vertical
|
||||
# one has the higher feature count due to newline features
|
||||
([[336, 672], [672, 336]], (672, 336))
|
||||
])
|
||||
def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
|
||||
gridpoints, expected_size):
|
||||
ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
|
||||
|
||||
# Update the config image_grid_pinpoints
|
||||
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
|
||||
seq_len = 5000 # bigger than the max feature size for any image
|
||||
|
||||
dummy_data = dummy_data_for_llava_next(
|
||||
ctx,
|
||||
seq_len=seq_len,
|
||||
mm_counts={"image": 1},
|
||||
)
|
||||
seq_data = dummy_data.seq_data
|
||||
mm_data = dummy_data.multi_modal_data
|
||||
|
||||
# The dummy data dims should match the gridpoint with the biggest feat size
|
||||
assert mm_data["image"].height == expected_size[0]
|
||||
assert mm_data["image"].width == expected_size[1]
|
||||
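# The dummy sequence should span at least seq_len tokens so profiling covers a full-length prompt.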
assert len(seq_data.get_token_ids()) >= seq_len
|
||||
@@ -0,0 +1,182 @@
|
||||
"""Tests for phi3v's multimodal preprocessing kwargs."""
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoImageProcessor, AutoTokenizer
|
||||
|
||||
from vllm.inputs import InputContext, token_inputs
|
||||
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
|
||||
from vllm.multimodal import MultiModalRegistry
|
||||
|
||||
from .....conftest import _ImageAssets
|
||||
from ....utils import build_model_context
|
||||
|
||||
models = ["microsoft/Phi-3.5-vision-instruct"]
|
||||
|
||||
|
||||
# Wrap lazy imports to avoid initializing CUDA during test collection
|
||||
@pytest.fixture()
|
||||
def input_processor_for_phi3v():
|
||||
from vllm.model_executor.models.phi3v import input_processor_for_phi3v
|
||||
return input_processor_for_phi3v
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def dummy_data_for_phi3v():
|
||||
from vllm.model_executor.models.phi3v import dummy_data_for_phi3v
|
||||
return dummy_data_for_phi3v
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def get_max_phi3v_image_tokens():
|
||||
from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens
|
||||
return get_max_phi3v_image_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("num_crops", [4, 16, None])
|
||||
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
|
||||
num_crops: Optional[int]):
|
||||
"""Ensure that the [default] input mapper handles num_crops properly."""
|
||||
# We pass the processor kwargs here since for this model, we fall back to
|
||||
# the default mapper; this will fall back to the HF mapper and forward
|
||||
# mm_processor_kwargs to it.
|
||||
mm_processor_kwargs = {
|
||||
"num_crops": num_crops
|
||||
} if num_crops is not None else {}
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
hf_processor = AutoImageProcessor.from_pretrained(model,
|
||||
trust_remote_code=True,
|
||||
**mm_processor_kwargs)
|
||||
|
||||
mm_registry = MultiModalRegistry()
|
||||
mm_registry.init_mm_limits_per_prompt(ctx.model_config)
|
||||
|
||||
image = image_assets[0].pil_image
|
||||
hf_result = hf_processor.preprocess(
|
||||
image,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
vllm_result = mm_registry.map_input(
|
||||
ctx.model_config,
|
||||
{"image": image},
|
||||
)
|
||||
|
||||
assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"])
|
||||
assert torch.all(
|
||||
hf_result["num_img_tokens"] == vllm_result["num_img_tokens"])
|
||||
|
||||
# For pixel values, the second axis should be the num_crops + 1
|
||||
# for the rescaled original image. The default value in VLLM falls
|
||||
# back to the HF config, which is why we compare to the processor num_crops
|
||||
assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
|
||||
assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("num_crops,expected_max_tokens", [
|
||||
(4, 781),
|
||||
(16, 2653),
|
||||
])
|
||||
def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
|
||||
num_crops: int, expected_max_tokens: int):
|
||||
"""Ensure get_max_phi3v_image_tokens handles num_crops properly."""
|
||||
# NOTE: mm_processor_kwargs on the context in this test is unused, since
|
||||
# this is testing the mapper directly. In practice, the processor kwargs
|
||||
# are wrapped in a closure when calling the max tokens func. We explicitly
|
||||
# do NOT use the mm_processor_kwargs in the model context here to ensure
|
||||
# that the max image tokens implementation is referencing a mix of the
|
||||
# kwargs to the function and the original mm_processor_kwargs in case
|
||||
# values are somehow updated and end up in a bad state.
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
actual_max_tokens = get_max_phi3v_image_tokens(
|
||||
InputContext(ctx.model_config),
|
||||
num_crops=num_crops,
|
||||
)
|
||||
|
||||
assert expected_max_tokens == actual_max_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
|
||||
(4, 781, 1),
|
||||
(4, 781, 2),
|
||||
(16, 2653, 1),
|
||||
(16, 2653, 2),
|
||||
])
|
||||
def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
|
||||
toks_per_img: int, num_imgs: int):
|
||||
"""Ensure dummy_data_for_phi3v handles num_crops properly."""
|
||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
# the partial when calling the dummy data func.
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
dummy_data = dummy_data_for_phi3v(
|
||||
ctx=ctx,
|
||||
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
|
||||
mm_counts={"image": num_imgs},
|
||||
num_crops=num_crops,
|
||||
)
|
||||
sequence_data = dummy_data.seq_data
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
|
||||
assert img_tok_count == toks_per_img * num_imgs
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
|
||||
(4, 757, 1),
|
||||
(4, 757, 2),
|
||||
(16, 1921, 1),
|
||||
(16, 1921, 2),
|
||||
])
|
||||
def test_input_processor_override(input_processor_for_phi3v,
|
||||
image_assets: _ImageAssets, model: str,
|
||||
num_crops: int, expected_toks_per_img: int,
|
||||
num_imgs: int):
|
||||
"""Ensure input_processor_for_phi3v handles num_crops properly."""
|
||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
# the partial when calling the custom input processor.
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
|
||||
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
|
||||
images = [image_assets[0].pil_image] * num_imgs
|
||||
|
||||
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
|
||||
prompt=prompt,
|
||||
multi_modal_data={"image": images})
|
||||
|
||||
processed_inputs = input_processor_for_phi3v(ctx,
|
||||
inputs,
|
||||
num_crops=num_crops)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
|
||||
assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
@@ -0,0 +1,144 @@
|
||||
"""Tests for Qwen's multimodal preprocessing kwargs."""
|
||||
from typing import Dict, List, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm.inputs import InputContext, token_inputs
|
||||
from vllm.multimodal import MultiModalKwargs
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
|
||||
from .....conftest import IMAGE_ASSETS
|
||||
from ....utils import build_model_context
|
||||
|
||||
### Multimodal preprocessing tests
|
||||
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
|
||||
# These values are specific to Qwen-VL/Chat; we can get these from the model
|
||||
# config also, but they are hardcoded here to keep the parameterize/fixtures
|
||||
# easy to read.
|
||||
IMG_START_ID = 151857
|
||||
IMG_END_ID = 151858
|
||||
IMG_PAD_ID = 151859
|
||||
TOKS_PER_IMG = 256
|
||||
VIS_ENC_DIM = 4096
|
||||
IMG_SIZE = 448
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def input_mapper_for_qwen():
|
||||
# Lazy import to avoid initializing CUDA during test collection
|
||||
from vllm.model_executor.models.qwen import input_mapper_for_qwen
|
||||
return input_mapper_for_qwen
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def input_processor_for_qwen():
|
||||
# Lazy import to avoid initializing CUDA during test collection
|
||||
from vllm.model_executor.models.qwen import input_processor_for_qwen
|
||||
return input_processor_for_qwen
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def qwen_vl_context() -> InputContext:
|
||||
"""Get an InputContext for Qwen-VL."""
|
||||
return build_model_context(model_name="Qwen/Qwen-VL",
|
||||
trust_remote_code=True)
|
||||
|
||||
|
||||
# Happy path tests for single/multi-image scenarios for the multimodal
|
||||
# input processor and mapper, respectively
|
||||
@pytest.mark.parametrize("num_images", [1, 2])
|
||||
def test_input_processor_valid_mm_data(input_processor_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
num_images: int):
|
||||
"""Happy cases for image inputs to Qwen's multimodal input processor."""
|
||||
prompt = "".join(
|
||||
[f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
|
||||
inputs = token_inputs(
|
||||
prompt=prompt,
|
||||
# When processing multimodal data for a multimodal model, the qwen
|
||||
# input processor will overwrite the provided prompt_token_ids with
|
||||
# the image prompts
|
||||
prompt_token_ids=[],
|
||||
multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
|
||||
)
|
||||
proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
|
||||
assert isinstance(proc_inputs, dict)
|
||||
|
||||
# Each image should have one start / stop and a fixed context of 256
|
||||
proc_tokens = proc_inputs["prompt_token_ids"]
|
||||
assert proc_tokens.count(IMG_START_ID) == num_images
|
||||
assert proc_tokens.count(IMG_END_ID) == num_images
|
||||
assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"img_data,expected_shape",
|
||||
[
|
||||
# single / multi-image
|
||||
(SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
|
||||
(2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
|
||||
# single / multi-image embeddings
|
||||
(torch.rand(
|
||||
(TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
(torch.rand(
|
||||
(1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
(torch.rand(
|
||||
(2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
])
|
||||
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
img_data: Union[torch.Tensor, List[Image],
|
||||
Image],
|
||||
expected_shape: List[int]):
|
||||
"""Happy cases for image inputs to Qwen's multimodal input mapper."""
|
||||
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||
# Ensure that we get the appropriately shaped pixel_values
|
||||
# for images and image embeddings, respectively.
|
||||
assert isinstance(mapped_img_data, MultiModalKwargs)
|
||||
assert "pixel_values" in mapped_img_data
|
||||
assert mapped_img_data["pixel_values"].shape == expected_shape
|
||||
|
||||
|
||||
# Sad path tests for the multimodal input processor and mapper, respectively
|
||||
@pytest.mark.parametrize("mm_data", [
|
||||
{
|
||||
"image": torch.rand(5)
|
||||
},
|
||||
{
|
||||
"image": torch.rand((5, 5, 5, 5, 5))
|
||||
},
|
||||
])
|
||||
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
mm_data: Dict[str, torch.Tensor]):
|
||||
"""Test sad cases validated in Qwen's multimodal input processor."""
|
||||
tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
|
||||
trust_remote_code=True)
|
||||
prompt = "Picture 1: <img></img>\n"
|
||||
prompt_token_ids = tokenizer.encode(prompt)
|
||||
inputs = token_inputs(prompt=prompt,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
multi_modal_data=mm_data)
|
||||
# Should fail since we have too many or too few dimensions for embeddings
|
||||
with pytest.raises(ValueError):
|
||||
input_processor_for_qwen(qwen_vl_context, inputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"img_data",
|
||||
[
|
||||
# Wrong context length
|
||||
torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
|
||||
# Wrong visual encoder output size
|
||||
torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
|
||||
])
|
||||
def test_input_mapper_invalid_mm_data(
|
||||
input_mapper_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
img_data: Union[torch.Tensor, List[Image], Image],
|
||||
):
|
||||
"""Sad cases validated in Qwen VL's multimodal input mapper."""
|
||||
with pytest.raises(ValueError):
|
||||
input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||
@@ -0,0 +1,167 @@
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.inputs import InputContext, token_inputs
|
||||
from vllm.multimodal import MultiModalRegistry
|
||||
|
||||
from .....conftest import _ImageAssets
|
||||
from ....utils import build_model_context
|
||||
|
||||
MODEL = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
MIN_PIXELS = "min_pixels"
|
||||
MAX_PIXELS = "max_pixels"
|
||||
|
||||
|
||||
# Fixtures lazy import to avoid initializing CUDA during test collection
|
||||
# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
|
||||
# input mappers.
|
||||
@pytest.fixture()
|
||||
def image_input_mapper_for_qwen2_vl():
|
||||
from vllm.model_executor.models.qwen2_vl import (
|
||||
image_input_mapper_for_qwen2_vl)
|
||||
return image_input_mapper_for_qwen2_vl
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def input_processor_for_qwen2_vl():
|
||||
from vllm.model_executor.models.qwen2_vl import (
|
||||
input_processor_for_qwen2_vl)
|
||||
return input_processor_for_qwen2_vl
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def qwen2_vl_context() -> InputContext:
|
||||
return build_model_context(model_name=MODEL)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def get_max_qwen2_vl_image_tokens():
|
||||
from vllm.model_executor.models.qwen2_vl import (
|
||||
get_max_qwen2_vl_image_tokens)
|
||||
return get_max_qwen2_vl_image_tokens
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def dummy_data_for_qwen2_vl():
|
||||
from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl
|
||||
return dummy_data_for_qwen2_vl
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
|
||||
({}, 1225),
|
||||
({
|
||||
MIN_PIXELS: 64**2,
|
||||
MAX_PIXELS: 512**2
|
||||
}, 324),
|
||||
])
|
||||
def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens,
|
||||
qwen2_vl_context: InputContext,
|
||||
mm_processor_kwargs: Dict[str, Any],
|
||||
expected_max_tokens: int):
|
||||
"""Ensure that the max token calc handles min/max pixels properly."""
|
||||
actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context,
|
||||
**mm_processor_kwargs)
|
||||
assert actual_max_tokens == expected_max_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [
|
||||
[{}, 1225, (980, 980)],
|
||||
[{
|
||||
MIN_PIXELS: 64**2,
|
||||
MAX_PIXELS: 512**2
|
||||
}, 324, (504, 504)],
|
||||
])
|
||||
def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,
|
||||
qwen2_vl_context: InputContext,
|
||||
mm_processor_kwargs: Dict[str, Any],
|
||||
token_count: int, img_size: Tuple[int, int]):
|
||||
"""Ensure that the dummy data handles min/max pixels properly."""
|
||||
seq_len = 3000
|
||||
hf_config = qwen2_vl_context.get_hf_config()
|
||||
image_token_id = hf_config.image_token_id
|
||||
|
||||
# NOTE: video value is required, but isn't actually used
|
||||
# when making the dummy data except for error handling currently
|
||||
dummy_data = dummy_data_for_qwen2_vl(
|
||||
ctx=qwen2_vl_context,
|
||||
seq_len=seq_len,
|
||||
mm_counts={
|
||||
"image": 1,
|
||||
"video": 0
|
||||
},
|
||||
**mm_processor_kwargs,
|
||||
)
|
||||
seq_data = dummy_data.seq_data
|
||||
mm_data = dummy_data.multi_modal_data
|
||||
|
||||
# Ensure we have the right number of placeholders for min/max pixel values
|
||||
assert seq_data.get_token_ids().count(image_token_id) == token_count
|
||||
|
||||
# Ensure the images were resized correctly
|
||||
image = mm_data["image"]
|
||||
assert isinstance(image, Image)
|
||||
assert image.size == img_size
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [
|
||||
({}, 1426),
|
||||
({
|
||||
MIN_PIXELS: 64**2,
|
||||
MAX_PIXELS: 512**2
|
||||
}, 330),
|
||||
])
|
||||
def test_input_processor(input_processor_for_qwen2_vl,
|
||||
qwen2_vl_context: InputContext,
|
||||
image_assets: _ImageAssets, num_placeholders: int,
|
||||
mm_processor_kwargs: Dict[str, Any]):
|
||||
"""Ensure that the image processor handles min/max pixels properly."""
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
||||
prompt = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
|
||||
image = image_assets[0].pil_image
|
||||
hf_config = qwen2_vl_context.get_hf_config()
|
||||
image_token_id = hf_config.image_token_id
|
||||
|
||||
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
|
||||
prompt=prompt,
|
||||
multi_modal_data={"image": [image]})
|
||||
|
||||
processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs,
|
||||
**mm_processor_kwargs)
|
||||
assert processed_inputs["prompt_token_ids"].count(
|
||||
image_token_id) == num_placeholders
|
||||
assert len(processed_inputs["multi_modal_data"]["image"]) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [
|
||||
({}, [5704, 1176]),
|
||||
({
|
||||
MIN_PIXELS: 64**2,
|
||||
MAX_PIXELS: 512**2
|
||||
}, [1320, 1176]),
|
||||
])
|
||||
def test_image_mapper_override(qwen2_vl_context: InputContext,
|
||||
image_assets: _ImageAssets,
|
||||
mm_processor_kwargs: Dict[str, Any],
|
||||
pixels_shape: Tuple[int, int]):
|
||||
"""Ensure that the image mapper handles min/max pixels properly."""
|
||||
mm_registry = MultiModalRegistry()
|
||||
mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config)
|
||||
|
||||
image = image_assets[0].pil_image
|
||||
|
||||
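# Map the image through the registered input mapper, overriding min/max pixels for this call only.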
mapped_output = mm_registry.map_input(
|
||||
qwen2_vl_context.model_config,
|
||||
{"image": image},
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
# Dimension 0 of pixel values should match the product of image_grid_thw
|
||||
actual_pixels_shape = mapped_output["pixel_values"].shape
|
||||
assert list(actual_pixels_shape) == pixels_shape
|
||||
assert actual_pixels_shape[0] == torch.prod(
|
||||
mapped_output["image_grid_thw"])
|
||||
@@ -0,0 +1,120 @@
|
||||
from typing import List, Optional, Type
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
"cherry_blossom":
|
||||
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
})
|
||||
|
||||
|
||||
def run_awq_test(
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets,
|
||||
source_model: str,
|
||||
quant_model: str,
|
||||
*,
|
||||
size_factors: List[float],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
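# Pair each prompt with copies of its image rescaled by every size factor.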
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(source_model,
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
source_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
with vllm_runner(quant_model,
|
||||
quantization="awq",
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
quant_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
for source_outputs, quant_outputs in zip(source_outputs_per_image,
|
||||
quant_outputs_per_image):
|
||||
# TODO: Check whether using original CLIPVisionModel can improve
|
||||
# consistency against HF
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=source_outputs,
|
||||
outputs_1_lst=quant_outputs,
|
||||
name_0="source",
|
||||
name_1="awq",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.quant_model
|
||||
@pytest.mark.parametrize(
|
||||
("source_model", "quant_model"),
|
||||
[("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@torch.inference_mode()
|
||||
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
|
||||
size_factors, dtype, max_tokens, num_logprobs) -> None:
|
||||
run_awq_test(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
source_model,
|
||||
quant_model,
|
||||
size_factors=size_factors,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
@@ -0,0 +1,129 @@
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import AutoConfig
|
||||
|
||||
# Import the functions to test
|
||||
from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
|
||||
image_to_pixel_values_wrapper)
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
|
||||
models = [
|
||||
"h2oai/h2ovl-mississippi-800m", # Replace with your actual model names
|
||||
"h2oai/h2ovl-mississippi-2b",
|
||||
]
|
||||
|
||||
|
||||
def run_preprocessing_test(
|
||||
image: Image,
|
||||
config,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
) -> Tuple[torch.Tensor, int]:
|
||||
"""Test the image preprocessing and calculate expected blocks."""
|
||||
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = config.max_dynamic_patch
|
||||
|
||||
width, height = image.size
|
||||
use_MSAC = config.use_msac
|
||||
|
||||
# Create the mapper function with the provided configuration
|
||||
mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC)
|
||||
pixel_values = mapper(image)
|
||||
|
||||
# Calculate the expected number of blocks
|
||||
if use_MSAC:
|
||||
# First pass
|
||||
blocks1, _, _, aspect_ratio = calculate_num_blocks(
|
||||
width,
|
||||
height,
|
||||
config.min_dynamic_patch,
|
||||
max_dynamic_patch,
|
||||
config.vision_config.image_size,
|
||||
use_thumbnail=False, # Thumbnail is handled separately
|
||||
prior_aspect_ratio=None,
|
||||
)
|
||||
|
||||
# Second pass
|
||||
blocks2, _, _, _ = calculate_num_blocks(
|
||||
width,
|
||||
height,
|
||||
config.min_dynamic_patch,
|
||||
max_dynamic_patch,
|
||||
config.vision_config.image_size,
|
||||
use_thumbnail=False,
|
||||
prior_aspect_ratio=aspect_ratio,
|
||||
)
|
||||
|
||||
# Add thumbnail if use_thumbnail is True and total_blocks > 1
|
||||
if config.use_thumbnail:
|
||||
blocks1 += 1 if blocks1 > 1 else 0
|
||||
blocks2 += 1 if blocks2 > 1 else 0
|
||||
|
||||
# Total blocks is the sum of blocks from both passes minus overlapping
|
||||
total_blocks = blocks1 + blocks2 - 1
|
||||
|
||||
expected_blocks = total_blocks
|
||||
|
||||
else:
|
||||
blocks, _, _, _ = calculate_num_blocks(
|
||||
width,
|
||||
height,
|
||||
config.min_dynamic_patch,
|
||||
max_dynamic_patch,
|
||||
config.vision_config.image_size,
|
||||
use_thumbnail=False,
|
||||
prior_aspect_ratio=None,
|
||||
)
|
||||
expected_blocks = blocks
|
||||
|
||||
if config.use_thumbnail and expected_blocks > 1:
|
||||
expected_blocks += 1
|
||||
|
||||
return pixel_values, expected_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8])
|
||||
def test_image_preprocessing(image_assets, model_name, size_factors,
|
||||
max_dynamic_patch):
|
||||
"""Test image preprocessing pipeline with different configurations."""
|
||||
# Load the configuration from the model
|
||||
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
|
||||
|
||||
for asset in image_assets:
|
||||
image = asset.pil_image
|
||||
for factor in size_factors:
|
||||
scaled_image = rescale_image_size(image, factor)
|
||||
|
||||
# Test preprocessing and get expected number of blocks
|
||||
pixel_values, expected_blocks = run_preprocessing_test(
|
||||
scaled_image, config, max_dynamic_patch)
|
||||
|
||||
# Verify output shapes and properties
|
||||
actual_blocks = pixel_values.shape[0]
|
||||
assert actual_blocks == expected_blocks, (
|
||||
f"Expected {expected_blocks} blocks, got {actual_blocks}")
|
||||
|
||||
# Check image dimensions
|
||||
expected_size = (
|
||||
3, # Number of channels (C, H, W)
|
||||
config.vision_config.image_size,
|
||||
config.vision_config.image_size,
|
||||
)
|
||||
for img in pixel_values:
|
||||
assert img.shape == expected_size, (
|
||||
f"Expected image size {expected_size}, got {img.shape}")
|
||||
@@ -0,0 +1,77 @@
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
|
||||
# we use snapshot_download to prevent conflicts between
|
||||
# dynamic_module and trust_remote_code for hf_runner
|
||||
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
|
||||
|
||||
|
||||
def run_intern_vit_test(
|
||||
image_assets: _ImageAssets,
|
||||
model_id: str,
|
||||
*,
|
||||
dtype: str,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
|
||||
|
||||
img_processor = CLIPImageProcessor.from_pretrained(model)
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
pixel_values = [
|
||||
img_processor(images, return_tensors='pt').pixel_values.to(dtype)
|
||||
for images in images
|
||||
]
|
||||
|
||||
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
|
||||
if not getattr(config, "norm_type", None):
|
||||
config.norm_type = "rms_norm"
|
||||
|
||||
hf_model = AutoModel.from_pretrained(model,
|
||||
torch_dtype=dtype,
|
||||
trust_remote_code=True).to("cuda")
|
||||
hf_outputs_per_image = [
|
||||
hf_model(pixel_value.to("cuda")).last_hidden_state
|
||||
for pixel_value in pixel_values
|
||||
]
|
||||
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.model_executor.models.intern_vit import InternVisionModel
|
||||
vllm_model = InternVisionModel(config)
|
||||
vllm_model.load_weights(hf_model.state_dict().items())
|
||||
|
||||
del hf_model
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
vllm_model = vllm_model.to("cuda", dtype)
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model(pixel_values=pixel_value.to("cuda"))
|
||||
for pixel_value in pixel_values
|
||||
]
|
||||
del vllm_model
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
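# Compare hidden states by cosine similarity rather than exact equality to tolerate small numerical differences.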
cos_similar = nn.CosineSimilarity(dim=-1)
|
||||
for vllm_output, hf_output in zip(vllm_outputs_per_image,
|
||||
hf_outputs_per_image):
|
||||
assert cos_similar(vllm_output, hf_output).mean() > 0.99
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", [
|
||||
"OpenGVLab/InternViT-300M-448px",
|
||||
"OpenGVLab/InternViT-6B-448px-V1-5",
|
||||
])
|
||||
@pytest.mark.parametrize("dtype", [torch.half])
|
||||
@torch.inference_mode()
|
||||
def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
|
||||
run_intern_vit_test(
|
||||
image_assets,
|
||||
model_id,
|
||||
dtype=dtype,
|
||||
)
|
||||
@@ -0,0 +1,657 @@
|
||||
"""Common tests for testing .generate() functionality for single / multiple
|
||||
image, embedding, and video support for different VLMs in vLLM.
|
||||
"""
|
||||
import os
|
||||
from pathlib import PosixPath
|
||||
from typing import Type
|
||||
|
||||
import pytest
|
||||
import transformers
|
||||
from transformers import AutoModelForVision2Seq
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import cuda_device_count_stateless, identity
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
|
||||
_VideoAssets)
|
||||
from ....utils import fork_new_process_for_each_test, large_gpu_mark
|
||||
from ...utils import check_outputs_equal
|
||||
from .vlm_utils import custom_inputs, model_utils, runners
|
||||
from .vlm_utils.case_filtering import get_parametrized_options
|
||||
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
|
||||
VLMTestInfo, VLMTestType)
|
||||
|
||||
# This hack is needed for phi3v & paligemma models
|
||||
# ROCm Triton FA can run into shared memory issues with these models,
|
||||
# use other backends in the meantime
|
||||
# FIXME (mattwong, gshtrasb, hongxiayan)
|
||||
if current_platform.is_rocm():
|
||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
|
||||
# yapf: disable
|
||||
COMMON_BROADCAST_SETTINGS = {
|
||||
"test_type": VLMTestType.IMAGE,
|
||||
"dtype": "half",
|
||||
"max_tokens": 5,
|
||||
"tensor_parallel_size": 2,
|
||||
"model_kwargs": {"device_map": "auto"},
|
||||
"image_size_factors": [(.25, 0.5, 1.0)],
|
||||
"distributed_executor_backend": (
|
||||
"ray",
|
||||
"mp",
|
||||
)
|
||||
}
|
||||
|
||||
### Test configuration for specific models
|
||||
# NOTE: The convention of the test settings below is to lead each test key
|
||||
# with the name of the model arch used in the test, using underscores in place
|
||||
# of hyphens; this makes it more convenient to filter tests for a specific kind
|
||||
# of model. For example....
|
||||
#
|
||||
# To run all test types for a specific key:
|
||||
# use the k flag to substring match with a leading square bracket; if the
|
||||
# model arch happens to be a substring of another one, you can add a
|
||||
# trailing hyphen. E.g.,
|
||||
# - pytest $TEST_FILE -k "[llava-"
|
||||
# prevents matching on "[llava_next-" & will match just the enabled cases
|
||||
# for llava, i.e., single image, image embedding, and custom input tests.
|
||||
#
|
||||
# To run a test for a Test Info for just one of multiple models:
|
||||
# use the k flag to substring match the model name, e.g.,
|
||||
# - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
|
||||
# prevents matching on OpenGVLab/InternVL2-2B.
|
||||
#
|
||||
# You can also combine substrings to match more granularly.
|
||||
# ex 1:
|
||||
# pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
|
||||
# will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
|
||||
# match both wrappers for single image tests, since it also matches
|
||||
# test_single_image_heavy (which forks if we have a distributed backend)
|
||||
# ex 2:
|
||||
# pytest $TEST_FILE -k "[llava- or [intern_vl-"
|
||||
# will run all of the tests for only llava & internvl.
|
||||
#
|
||||
# NOTE you can add --collect-only to any of the above commands to see
|
||||
# which cases would be selected and deselected by pytest. In general,
|
||||
# this is a good idea for checking your command first, since tests are slow.
|
||||
|
||||
VLM_TEST_SETTINGS = {
|
||||
#### Core tests to always run in the CI
|
||||
"llava": VLMTestInfo(
|
||||
models=["llava-hf/llava-1.5-7b-hf"],
|
||||
test_type=(
|
||||
VLMTestType.EMBEDDING,
|
||||
VLMTestType.IMAGE,
|
||||
VLMTestType.CUSTOM_INPUTS
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
|
||||
),
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)],
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
"paligemma": VLMTestInfo(
|
||||
models=["google/paligemma-3b-mix-224"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt = lambda idx: "",
|
||||
# Paligemma uses its own sample prompts because the default one fails
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "caption es",
|
||||
"cherry_blossom": "What is in the picture?",
|
||||
}),
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||
"pixel_values"
|
||||
),
|
||||
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||
dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
|
||||
else ("half", "float")),
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
"qwen2_vl": VLMTestInfo(
|
||||
models=["Qwen/Qwen2-VL-2B-Instruct"],
|
||||
test_type=(
|
||||
VLMTestType.IMAGE,
|
||||
VLMTestType.MULTI_IMAGE,
|
||||
VLMTestType.VIDEO
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
#### Extended model tests
|
||||
"blip2": VLMTestInfo(
|
||||
models=["Salesforce/blip2-opt-2.7b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
|
||||
),
|
||||
"chameleon": VLMTestInfo(
|
||||
models=["facebook/chameleon-7b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||
"pixel_values"
|
||||
),
|
||||
# For chameleon, we only compare the sequences
|
||||
vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
|
||||
hf_output_post_proc=lambda hf_output, model: hf_output[:2],
|
||||
comparator=check_outputs_equal,
|
||||
max_tokens=8,
|
||||
dtype="bfloat16",
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
transformers.__version__ < "4.46.2",
|
||||
reason="Model broken in HF, see huggingface/transformers#34379"
|
||||
),
|
||||
]
|
||||
),
|
||||
"fuyu": VLMTestInfo(
|
||||
models=["adept/fuyu-8b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
use_tokenizer_eos=True,
|
||||
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
|
||||
num_logprobs=10,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
),
|
||||
"glm4": VLMTestInfo(
|
||||
models=["THUDM/glm-4v-9b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
dtype="bfloat16",
|
||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||
patch_hf_runner=model_utils.glm_patch_hf_runner,
|
||||
marks=[large_gpu_mark(min_gb=48)],
|
||||
),
|
||||
"h2ovl": VLMTestInfo(
|
||||
models=[
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"h2oai/h2ovl-mississippi-2b",
|
||||
],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}),
|
||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16",
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
|
||||
),
|
||||
"idefics3": VLMTestInfo(
|
||||
models=["HuggingFaceM4/Idefics3-8B-Llama3"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
transformers.__version__ < "4.46.0",
|
||||
reason="Model introduced in HF >= 4.46.0"
|
||||
),
|
||||
large_gpu_mark(min_gb=48),
|
||||
],
|
||||
),
|
||||
"intern_vl": VLMTestInfo(
|
||||
models=[
|
||||
"OpenGVLab/InternVL2-1B",
|
||||
"OpenGVLab/InternVL2-2B",
|
||||
"OpenGVLab/Mono-InternVL-2B",
|
||||
],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}),
|
||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
# NOTE: Mono-InternVL-2B doesn't work with fp16,
|
||||
# it will result in NaN during inference.
|
||||
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
|
||||
dtype="bfloat16",
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||
),
|
||||
"llava_next": VLMTestInfo(
|
||||
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
|
||||
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
||||
max_model_len=10240,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
|
||||
),
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)],
|
||||
# Llava-next tests fixed sizes & the default size factors
|
||||
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
|
||||
),
|
||||
"llava_one_vision": VLMTestInfo(
|
||||
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
num_video_frames=16,
|
||||
max_model_len=16384,
|
||||
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||
"pixel_values_videos"
|
||||
),
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
# Llava-one-vision tests fixed sizes & the default size factors
|
||||
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
|
||||
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
),
|
||||
limit_mm_per_prompt={"video": 4},
|
||||
runner_mm_key="videos",
|
||||
)],
|
||||
),
|
||||
"llava_next_video": VLMTestInfo(
|
||||
models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
|
||||
test_type=VLMTestType.VIDEO,
|
||||
prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
|
||||
num_video_frames=16,
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
|
||||
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
transformers.__version__ < "4.46.2",
|
||||
reason="Model broken with changes in transformers 4.46"
|
||||
)
|
||||
],
|
||||
),
|
||||
"minicpmv": VLMTestInfo(
|
||||
models=["openbmb/MiniCPM-Llama3-V-2_5"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
|
||||
postprocess_inputs=model_utils.wrap_inputs_post_processor,
|
||||
hf_output_post_proc=model_utils.minicmpv_trunc_hf_output,
|
||||
),
|
||||
# Tests for phi3v currently live in another file because of a bug in
|
||||
# transformers. Once this issue is fixed, we can enable them here instead.
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
# "phi3v": VLMTestInfo(
|
||||
# models=["microsoft/Phi-3.5-vision-instruct"],
|
||||
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
# prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
# img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
|
||||
# max_model_len=4096,
|
||||
# max_num_seqs=2,
|
||||
# task="generate",
|
||||
# # use eager mode for hf runner since phi3v didn't work with flash_attn
|
||||
# model_kwargs={"_attn_implementation": "eager"},
|
||||
# use_tokenizer_eos=True,
|
||||
# vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
|
||||
# num_logprobs=10,
|
||||
# ),
|
||||
"pixtral_hf": VLMTestInfo(
|
||||
models=["nm-testing/pixtral-12b-FP8-dynamic"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<s>[INST]{img_prompt}[/INST]",
|
||||
img_idx_to_prompt=lambda idx: "[IMG]",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
marks=[large_gpu_mark(min_gb=48)],
|
||||
),
|
||||
"qwen": VLMTestInfo(
|
||||
models=["Qwen/Qwen-VL"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
|
||||
max_model_len=1024,
|
||||
max_num_seqs=2,
|
||||
vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
|
||||
prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
|
||||
),
|
||||
### Tensor parallel / multi-gpu broadcast tests
|
||||
"broadcast-chameleon": VLMTestInfo(
|
||||
models=["facebook/chameleon-7b"],
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||
"pixel_values"
|
||||
),
|
||||
vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
|
||||
hf_output_post_proc=lambda hf_output, model: hf_output[:2],
|
||||
comparator=check_outputs_equal,
|
||||
marks=[
|
||||
pytest.mark.distributed_2_gpus,
|
||||
pytest.mark.skipif(
|
||||
cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.",
|
||||
),
|
||||
pytest.mark.skipif(
|
||||
transformers.__version__ < "4.46.2",
|
||||
reason="Model broken in HF, see huggingface/transformers#34379"
|
||||
)
|
||||
],
|
||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||
),
|
||||
"broadcast-llava": VLMTestInfo(
|
||||
models=["llava-hf/llava-1.5-7b-hf"],
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
marks=[
|
||||
pytest.mark.distributed_2_gpus,
|
||||
pytest.mark.skipif(
|
||||
cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.",
|
||||
)
|
||||
],
|
||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||
),
|
||||
"broadcast-llava_next": VLMTestInfo(
|
||||
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
||||
max_model_len=10240,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
marks=[
|
||||
pytest.mark.distributed_2_gpus,
|
||||
pytest.mark.skipif(
|
||||
cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.",
|
||||
)
|
||||
],
|
||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||
),
|
||||
### Custom input edge-cases for specific models
|
||||
"intern_vl-diff-patches": VLMTestInfo(
|
||||
models=["OpenGVLab/InternVL2-2B"],
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
max_model_len=4096,
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||
custom_test_opts=[
|
||||
CustomTestOptions(
|
||||
inputs=inp,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
) for inp in custom_inputs.different_patch_input_cases_internvl()
|
||||
],
|
||||
),
|
||||
"llava_one_vision-multiple-images": VLMTestInfo(
|
||||
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
max_model_len=16384,
|
||||
max_num_seqs=2,
|
||||
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||
"pixel_values"
|
||||
),
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
),
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)],
|
||||
),
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
|
||||
### Test wrappers
|
||||
# Wrappers around the core test running func for:
|
||||
# - single image
|
||||
# - multi-image
|
||||
# - image embeddings
|
||||
# - video
|
||||
# - custom inputs
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
fork_new_process_for_each_test=False,
|
||||
))
|
||||
def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_single_image_test(
|
||||
tmp_path=tmp_path,
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
image_assets=image_assets,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
fork_new_process_for_each_test=False,
|
||||
))
|
||||
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_multi_image_test(
|
||||
tmp_path=tmp_path,
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
image_assets=image_assets,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
fork_new_process_for_each_test=False,
|
||||
))
|
||||
def test_image_embedding_models(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_embedding_test(
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
image_assets=image_assets,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
fork_new_process_for_each_test=False,
|
||||
))
|
||||
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
|
||||
video_assets: _VideoAssets):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_video_test(
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
video_assets=video_assets,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
fork_new_process_for_each_test=False,
|
||||
))
|
||||
def test_custom_inputs_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_custom_inputs_test(
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
)
|
||||
|
||||
|
||||
#### Tests for settings that fork a new process for each test case
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
fork_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_single_image_test(
|
||||
tmp_path=tmp_path,
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
image_assets=image_assets,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
fork_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_multi_image_test(
|
||||
tmp_path=tmp_path,
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
image_assets=image_assets,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
fork_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
def test_image_embedding_models_heavy(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_embedding_test(
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
image_assets=image_assets,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
fork_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
video_assets: _VideoAssets):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_video_test(
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
video_assets=video_assets,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_type,test_case",
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
fork_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
def test_custom_inputs_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_custom_inputs_test(
|
||||
model_test_info=model_test_info,
|
||||
test_case=test_case,
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
)
|
||||
@@ -0,0 +1,234 @@
|
||||
import os
|
||||
import re
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom":
|
||||
"<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
|
||||
})
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
|
||||
models = ["microsoft/Phi-3.5-vision-instruct"]
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||
Optional[SampleLogprobs]],
|
||||
model: str):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
|
||||
assert output_str_without_image[0] == " "
|
||||
output_str_without_image = output_str_without_image[1:]
|
||||
|
||||
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
hf_output_ids = tokenizer.encode(output_str_without_image)
|
||||
assert hf_output_ids[0] == 1
|
||||
hf_output_ids = hf_output_ids[1:]
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
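# A rough sketch of what the sanitization above does (the strings are made-up
# examples, not captured model output): a vLLM completion such as
#   " The season is winter.<|image_1|>"
# comes back as
#   "The season is winter.<|end|><|endoftext|>"
# with the leading space and image tokens stripped, the Phi-3 end markers
# appended, and the ids re-encoded without the leading BOS token (id 1).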
|
||||
|
||||
|
||||
target_dtype = "half"
|
||||
|
||||
# ROCm Triton FA can run into shared memory issues with these models,
|
||||
# use other backends in the meantime
|
||||
# FIXME (mattwong, gshtrasb, hongxiayan)
|
||||
if current_platform.is_rocm():
|
||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
mm_limit: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
# HACK - this is an attempted workaround for the following bug
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
from transformers import AutoImageProcessor # noqa: F401
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(model,
|
||||
task="generate",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
|
||||
hf_model_kwargs = {"_attn_implementation": "eager"}
|
||||
with hf_runner(model, dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs) as hf_model:
|
||||
eos_token_id = hf_model.processor.tokenizer.eos_token_id
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
eos_token_id=eos_token_id)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output, model)
|
||||
for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
# Since we use _attn_implementation="eager" for hf_runner, the numerical
|
||||
# differences are larger; the basic `logprobs=5` fails to pass.
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_image,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
|
||||
dtype) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_regression_7840 = [
|
||||
([prompt], [image]) for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
# Regression test for #7840.
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_regression_7840,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=128,
|
||||
num_logprobs=10,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors])
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
@@ -0,0 +1,193 @@
|
||||
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import json
|
||||
import uuid
|
||||
from dataclasses import asdict
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
from mistral_common.protocol.instruct.messages import ImageURLChunk
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.sequence import Logprob, SampleLogprobs
|
||||
|
||||
from ....utils import VLLM_PATH, large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import StrPath
|
||||
|
||||
MODELS = ["mistralai/Pixtral-12B-2409"]
|
||||
IMG_URLS = [
|
||||
"https://picsum.photos/id/237/400/300",
|
||||
"https://picsum.photos/id/231/200/300",
|
||||
"https://picsum.photos/id/27/500/500",
|
||||
"https://picsum.photos/id/17/150/600",
|
||||
]
|
||||
PROMPT = "Describe each image in one short sentence."
|
||||
|
||||
|
||||
def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
|
||||
return [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": PROMPT,
|
||||
}] + [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": url
|
||||
}
|
||||
} for url in urls],
|
||||
}]
|
||||
|
||||
|
||||
def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
|
||||
msg = _create_msg_format(urls)
|
||||
|
||||
tokenizer = MistralTokenizer.from_model("pixtral")
|
||||
|
||||
request = ChatCompletionRequest(messages=msg) # type: ignore[type-var]
|
||||
tokenized = tokenizer.encode_chat_completion(request)
|
||||
|
||||
engine_inputs = TokensPrompt(prompt_token_ids=tokenized.tokens)
|
||||
|
||||
images = []
|
||||
for chunk in request.messages[0].content:
|
||||
if isinstance(chunk, ImageURLChunk):
|
||||
images.append(image_from_chunk(chunk))
|
||||
|
||||
mm_data = MultiModalDataBuiltins(image=images)
|
||||
engine_inputs["multi_modal_data"] = mm_data
|
||||
|
||||
return engine_inputs
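# The TokensPrompt built above carries both the pre-tokenized chat prompt and
# the images decoded from the URL chunks, so it can be handed directly to
# LLMEngine.add_request(...), as test_model_engine below does with
# ENGINE_INPUTS.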
|
||||
|
||||
|
||||
MSGS = [
|
||||
_create_msg_format(IMG_URLS[:1]),
|
||||
_create_msg_format(IMG_URLS[:2]),
|
||||
_create_msg_format(IMG_URLS),
|
||||
]
|
||||
ENGINE_INPUTS = [
|
||||
_create_engine_inputs(IMG_URLS[:1]),
|
||||
_create_engine_inputs(IMG_URLS[:2]),
|
||||
_create_engine_inputs(IMG_URLS),
|
||||
]
|
||||
|
||||
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||
LIMIT_MM_PER_PROMPT = dict(image=4)
|
||||
|
||||
MAX_MODEL_LEN = [8192, 65536]
|
||||
|
||||
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
|
||||
assert FIXTURES_PATH.exists()
|
||||
|
||||
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
|
||||
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
|
||||
|
||||
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
|
||||
|
||||
|
||||
# For the test author to store golden output in JSON
|
||||
def _dump_outputs_w_logprobs(
|
||||
outputs: OutputsLogprobs,
|
||||
filename: "StrPath",
|
||||
) -> None:
|
||||
json_data = [(tokens, text,
|
||||
[{k: asdict(v)
|
||||
for k, v in token_logprobs.items()}
|
||||
for token_logprobs in (logprobs or [])])
|
||||
for tokens, text, logprobs in outputs]
|
||||
|
||||
with open(filename, "w") as f:
|
||||
json.dump(json_data, f)
|
||||
|
||||
|
||||
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
with open(filename, "rb") as f:
|
||||
json_data = json.load(f)
|
||||
|
||||
return [(tokens, text,
|
||||
[{int(k): Logprob(**v)
|
||||
for k, v in token_logprobs.items()}
|
||||
for token_logprobs in logprobs])
|
||||
for tokens, text, logprobs in json_data]
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_chat(
|
||||
vllm_runner,
|
||||
max_model_len: int,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
enable_chunked_prefill=False,
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
outputs = []
|
||||
for msg in MSGS:
|
||||
output = vllm_model.model.chat(msg,
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
|
||||
outputs.extend(output)
|
||||
|
||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
|
||||
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
|
||||
args = EngineArgs(
|
||||
model=model,
|
||||
tokenizer_mode="mistral",
|
||||
enable_chunked_prefill=False,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
dtype=dtype,
|
||||
)
|
||||
engine = LLMEngine.from_engine_args(args)
|
||||
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
|
||||
|
||||
outputs = []
|
||||
count = 0
|
||||
while True:
|
||||
out = engine.step()
|
||||
count += 1
|
||||
for request_output in out:
|
||||
if request_output.finished:
|
||||
outputs.append(request_output)
|
||||
|
||||
if count == 2:
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
|
||||
SAMPLING_PARAMS)
|
||||
if not engine.has_unfinished_requests():
|
||||
break
|
||||
|
||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
@@ -0,0 +1,428 @@
|
||||
from typing import Any, List, Optional, Tuple, Type, TypedDict, Union
|
||||
|
||||
import numpy.typing as npt
|
||||
import pytest
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
|
||||
sample_frames_from_video)
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
|
||||
PromptVideoInput, VllmRunner)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
models = ["Qwen/Qwen2-VL-2B-Instruct"]
|
||||
target_dtype = "half"
|
||||
|
||||
IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
|
||||
|
||||
def qwen2_vl_chat_template(*query):
|
||||
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the biggest text's content in this image?",
|
||||
),
|
||||
"cherry_blossom":
|
||||
qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the season shown in this image? ",
|
||||
"Reply with a short sentence (no more than 20 words)",
|
||||
),
|
||||
})
|
||||
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||
"sample_demo_1":
|
||||
qwen2_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
})
|
||||
|
||||
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
IMAGE_PLACEHOLDER,
|
||||
"Describe these two images separately. ",
|
||||
"For each image, reply with a short sentence ",
|
||||
"(no more than 10 words).",
|
||||
)
|
||||
|
||||
|
||||
class Qwen2VLPromptImageEmbeddingInput(TypedDict):
|
||||
image_embeds: torch.Tensor
|
||||
image_grid_thw: torch.Tensor
|
||||
|
||||
|
||||
class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
|
||||
video_embeds: torch.Tensor
|
||||
video_grid_thw: torch.Tensor
|
||||
|
||||
|
||||
def batch_make_image_embeddings(
|
||||
image_batches: List[Union[Image.Image, List[Image.Image]]], processor,
|
||||
llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]:
|
||||
"""batched image embeddings for Qwen2-VL
|
||||
|
||||
This will infer all images' embeddings in a single batch,
|
||||
and split the result according to input batches.
|
||||
|
||||
image_batches:
|
||||
- Single-image batches: `List[Image.Image]`
|
||||
- Multiple-image batches: `List[List[Image.Image]]`
|
||||
|
||||
returns: `List[Qwen2VLPromptImageEmbeddingInput]`
|
||||
"""
|
||||
|
||||
image_batches_: List[Any] = image_batches[:]
|
||||
|
||||
# convert single-image batches to multiple-image batches
|
||||
for idx in range(len(image_batches_)):
|
||||
if not isinstance(image_batches_[idx], list):
|
||||
image_batches_[idx] = [image_batches_[idx]]
|
||||
|
||||
assert isinstance(image_batches_[idx], list)
|
||||
|
||||
# append all images into a list (as a batch)
|
||||
images: List[Image.Image] = []
|
||||
for image_batch in image_batches_:
|
||||
images += image_batch
|
||||
|
||||
# image to pixel values
|
||||
image_processor = processor.image_processor
|
||||
|
||||
preprocess_result = image_processor \
|
||||
.preprocess(images=images, return_tensors="pt") \
|
||||
.data
|
||||
pixel_values = preprocess_result["pixel_values"]
|
||||
image_grid_thw = preprocess_result["image_grid_thw"]
|
||||
|
||||
# pixel values to embeddings & grid_thws
|
||||
with torch.no_grad():
|
||||
visual = llm.llm_engine.model_executor.driver_worker. \
|
||||
model_runner.model.visual
|
||||
|
||||
pixel_values_on_device = pixel_values.to(visual.device,
|
||||
dtype=visual.dtype)
|
||||
image_grid_thw_on_device = image_grid_thw.to(visual.device,
|
||||
dtype=torch.int64)
|
||||
image_embeds = visual(pixel_values_on_device,
|
||||
grid_thw=image_grid_thw_on_device)
|
||||
|
||||
# split into original batches
|
||||
result: List[Qwen2VLPromptImageEmbeddingInput] = []
|
||||
image_counter = 0
|
||||
embed_counter = 0
|
||||
for image_batch in image_batches_:
|
||||
cur_batch_image_count = len(image_batch)
|
||||
merge_size = image_processor.merge_size
|
||||
cur_batch_embed_len = sum([
|
||||
grid_thw.prod() // merge_size // merge_size
|
||||
for grid_thw in image_grid_thw[image_counter:image_counter +
|
||||
cur_batch_image_count]
|
||||
])
|
||||
|
||||
result.append({
|
||||
"image_embeds":
|
||||
image_embeds[embed_counter:embed_counter + cur_batch_embed_len],
|
||||
"image_grid_thw":
|
||||
image_grid_thw[image_counter:image_counter +
|
||||
cur_batch_image_count],
|
||||
})
|
||||
|
||||
embed_counter += cur_batch_embed_len
|
||||
image_counter += cur_batch_image_count
|
||||
|
||||
# ensure we don't lose any images or embeddings
|
||||
assert embed_counter == image_embeds.size(0)
|
||||
assert image_counter == image_grid_thw.size(0)
|
||||
assert len(image_batches) == len(result)
|
||||
|
||||
return result
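# Worked example for the split above (hypothetical numbers): if the processor
# returns image_grid_thw == [[1, 28, 28]] for one image and merge_size == 2,
# that image owns 1 * 28 * 28 // 2 // 2 == 196 rows of image_embeds, so the
# loop advances embed_counter by 196 and image_counter by 1 for that batch.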
|
||||
|
||||
|
||||
def batch_make_video_embeddings(
|
||||
video_batches: PromptVideoInput, processor,
|
||||
llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]:
|
||||
"""batched video embeddings for Qwen2-VL
|
||||
|
||||
An NDArray represents all frames of a single video.
|
||||
|
||||
This will infer all videos' embeddings in a single batch,
|
||||
and split the result according to input batches.
|
||||
|
||||
video_batches:
|
||||
- Single-video batches: `List[NDArray]`
|
||||
- Multiple-video batches: `List[List[NDArray]]`
|
||||
"""
|
||||
|
||||
video_batches_: List[Any] = video_batches[:]
|
||||
|
||||
for idx in range(len(video_batches_)):
|
||||
if not isinstance(video_batches_[idx], list):
|
||||
single_video_batch: List[npt.NDArray] = [video_batches_[idx]]
|
||||
video_batches_[idx] = single_video_batch
|
||||
|
||||
assert isinstance(video_batches_[idx], list)
|
||||
|
||||
# append all videos into a list (as a batch)
|
||||
videos: List[npt.NDArray] = []
|
||||
for video_batch in video_batches_:
|
||||
videos += video_batch
|
||||
|
||||
# video to pixel values
|
||||
image_processor = processor.image_processor
|
||||
|
||||
preprocess_result = image_processor \
|
||||
.preprocess(images=None, videos=videos, return_tensors="pt") \
|
||||
.data
|
||||
pixel_values = preprocess_result["pixel_values_videos"]
|
||||
video_grid_thw = preprocess_result["video_grid_thw"]
|
||||
|
||||
# pixel values to embeddings & grid_thws
|
||||
with torch.no_grad():
|
||||
visual = llm.llm_engine.model_executor.driver_worker.\
|
||||
model_runner.model.visual
|
||||
|
||||
pixel_values_on_device = pixel_values.to(visual.device,
|
||||
dtype=visual.dtype)
|
||||
video_grid_thw_on_device = video_grid_thw.to(visual.device,
|
||||
dtype=torch.int64)
|
||||
video_embeds = visual(pixel_values_on_device,
|
||||
grid_thw=video_grid_thw_on_device)
|
||||
|
||||
# split into original batches
|
||||
result: List[Qwen2VLPromptVideoEmbeddingInput] = []
|
||||
video_counter = 0
|
||||
embed_counter = 0
|
||||
for video_batch in video_batches_:
|
||||
cur_batch_video_count = len(video_batch)
|
||||
merge_size = image_processor.merge_size
|
||||
cur_batch_embed_len = sum([
|
||||
grid_thw.prod() // merge_size // merge_size
|
||||
for grid_thw in video_grid_thw[video_counter:video_counter +
|
||||
cur_batch_video_count]
|
||||
])
|
||||
|
||||
result.append({
|
||||
"video_embeds":
|
||||
video_embeds[embed_counter:embed_counter + cur_batch_embed_len],
|
||||
"video_grid_thw":
|
||||
video_grid_thw[video_counter:video_counter +
|
||||
cur_batch_video_count],
|
||||
})
|
||||
|
||||
embed_counter += cur_batch_embed_len
|
||||
video_counter += cur_batch_video_count
|
||||
|
||||
# ensure we don't lose any videos or embeddings
|
||||
assert embed_counter == video_embeds.size(0)
|
||||
assert video_counter == video_grid_thw.size(0)
|
||||
assert len(video_batches) == len(result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def run_test(
|
||||
vllm_runner: Type[VllmRunner],
|
||||
inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
mm_limit: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between
|
||||
original image/video input and image/video embeddings input.
|
||||
"""
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model)
|
||||
|
||||
# NOTE:
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(model,
|
||||
task="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=3,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={
|
||||
"image": mm_limit,
|
||||
"video": mm_limit
|
||||
},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend
|
||||
) as vllm_model:
|
||||
|
||||
outputs_per_case_for_original_input = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images or None,
|
||||
videos=videos or None)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
outputs_per_case_for_embeddings_input = [
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=batch_make_image_embeddings(
|
||||
images, processor, vllm_model.model) if images else None,
|
||||
videos=batch_make_video_embeddings(
|
||||
videos, processor, vllm_model.model) if videos else None)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
for outputs_for_original_input, \
|
||||
outputs_for_embeddings_input \
|
||||
in zip(outputs_per_case_for_original_input,
|
||||
outputs_per_case_for_embeddings_input):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=outputs_for_original_input,
|
||||
outputs_1_lst=outputs_for_embeddings_input,
|
||||
name_0="original_input",
|
||||
name_1="embeddings_input",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# Single-scale
|
||||
[0.5],
|
||||
# Single-scale, batched
|
||||
[0.5, 0.5],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 0.5],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case: List[Tuple[
|
||||
List[str], PromptImageInput, PromptVideoInput]] = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
[],
|
||||
) for image, prompt in zip(images, IMAGE_PROMPTS)]
|
||||
|
||||
run_test(
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
[],
|
||||
# Single-scale
|
||||
[0.5],
|
||||
# Single-scale, batched
|
||||
[0.5, 0.5],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 0.5],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
|
||||
model, size_factors,
|
||||
dtype: str, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case: List[Tuple[List[str], PromptImageInput,
|
||||
PromptVideoInput]] = [(
|
||||
[MULTIIMAGE_PROMPT for _ in size_factors],
|
||||
[[
|
||||
rescale_image_size(image, factor)
|
||||
for image in images
|
||||
] for factor in size_factors],
|
||||
[],
|
||||
)]
|
||||
|
||||
run_test(
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# Single-scale
|
||||
[0.5],
|
||||
# Single-scale, batched
|
||||
[0.5, 0.5],
|
||||
# Multi-scale
|
||||
[0.25, 0.25, 0.5],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
|
||||
size_factors, dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
num_frames = 4
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
inputs_per_case: List[Tuple[
|
||||
List[str], PromptImageInput, PromptVideoInput]] = [(
|
||||
[prompt for _ in size_factors],
|
||||
[],
|
||||
[rescale_video_size(video, factor) for factor in size_factors],
|
||||
) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)]
|
||||
|
||||
run_test(
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
@@ -0,0 +1,235 @@
|
||||
"""Helpers for building inputs that can be leveraged for different test types.
|
||||
"""
|
||||
from pathlib import PosixPath
|
||||
from typing import Callable, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
|
||||
resize_video, sample_frames_from_video)
|
||||
|
||||
from .....conftest import _ImageAssets, _VideoAssets
|
||||
from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper, SizeType, VLMTestInfo)
|
||||
|
||||
|
||||
def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
|
||||
str],
|
||||
test_placeholder: str) -> str:
|
||||
"""Given a prompt, replaces each test placeholder with the
|
||||
model-specific tag.
|
||||
"""
|
||||
prompt_segments = prompt.split(test_placeholder)
|
||||
img_prompt = prompt_segments[0]
|
||||
for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1):
|
||||
img_prompt += img_idx_to_prompt(placeholder_idx)
|
||||
img_prompt += next_seg
|
||||
return img_prompt
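# Illustrative example (phi3v-style tag assumed; the strings are not taken
# from the real test assets):
#   replace_test_placeholder("<image>What is the season?",
#                            lambda idx: f"<|image_{idx}|>\n",
#                            "<image>")
# returns "<|image_1|>\nWhat is the season?"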
|
||||
|
||||
|
||||
def get_model_prompts(base_prompts: Iterable[str],
|
||||
img_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
video_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
prompt_formatter: Callable[[str], str]) -> List[str]:
|
||||
"""Given a model-agnostic base prompt and test configuration for a model(s)
|
||||
to be tested, update the media placeholders and apply the prompt formatting
|
||||
to get the test prompt string for this model.
|
||||
|
||||
Example for phi3v, given the base_prompt: "<image>What is the season?"
|
||||
1. Replace img placeholder(s)
|
||||
-> "<|image_1|>\nWhat is the season?"
|
||||
2. Apply prompt formatter:
|
||||
-> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
|
||||
"""
|
||||
assert isinstance(base_prompts, (list, tuple))
|
||||
model_prompts = []
|
||||
for base_prompt in base_prompts:
|
||||
# Replace the multimodal placeholders in the base prompt with
|
||||
# the correct ones for the model that we are testing
|
||||
if img_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
img_idx_to_prompt,
|
||||
TEST_IMG_PLACEHOLDER)
|
||||
|
||||
if video_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
video_idx_to_prompt,
|
||||
TEST_VIDEO_PLACEHOLDER)
|
||||
|
||||
# Apply the prompt formatter to wrap the base prompt with
|
||||
# the correct media placeholders to get the model test prompt
|
||||
model_prompt = prompt_formatter(base_prompt)
|
||||
model_prompts.append(model_prompt)
|
||||
return model_prompts
|
||||
|
||||
|
||||
def build_single_image_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
image_assets: _ImageAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
tmp_path: Optional[PosixPath] = None):
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build single image inputs")
|
||||
|
||||
model_prompts = get_model_prompts(test_info.single_image_prompts,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
|
||||
# For models that require a local path / URL encoded in the image; export
|
||||
# assets and encode into tmp_path for this test. This should be avoided
|
||||
# where possible (currently needed for Qwen-VL).
|
||||
if test_info.prompt_path_encoder is not None:
|
||||
if tmp_path is None:
|
||||
raise ValueError("Prompt path encoder requires setting local path")
|
||||
model_prompts = [
|
||||
test_info.prompt_path_encoder(tmp_path, prompt, [asset])
|
||||
for prompt, asset in zip(model_prompts, image_assets)
|
||||
]
|
||||
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
assert len(images) == len(model_prompts)
|
||||
return build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
|
||||
|
||||
def build_single_image_inputs(images, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper):
|
||||
# For every image / prompt pair, get a pair containing two lists of
|
||||
# length size_factors, where the first contains duplicates of the model
|
||||
# prompt [str], and the second contains copies of the image after being
|
||||
# scaled by one of the size factors.
|
||||
#
|
||||
# NOTE: rescaling preserves the image aspect ratio.
|
||||
return [(
|
||||
[prompt for _ in size_wrapper.data],
|
||||
[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
) for image, prompt in zip(images, model_prompts)]
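# Shape sketch (assumed size factors): with size_wrapper.data == (0.25, 0.5,
# 1.0), each (image, prompt) pair expands to
#   ([prompt, prompt, prompt],
#    [image @ 0.25, image @ 0.5, image @ 1.0])
# i.e. one tuple of parallel prompt/image lists per input image.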
|
||||
|
||||
|
||||
def build_multi_image_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
image_assets: _ImageAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
tmp_path: Optional[PosixPath] = None):
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build multi image inputs")
|
||||
|
||||
model_prompts = get_model_prompts([test_info.multi_image_prompt],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
|
||||
if test_info.prompt_path_encoder is not None:
|
||||
if tmp_path is None:
|
||||
raise ValueError("Prompt path encoder requires setting local path")
|
||||
model_prompts = [
|
||||
test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
|
||||
for model_prompt in model_prompts
|
||||
]
|
||||
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
# Currently, we only have one multi-image list & one multi-image prompt
|
||||
return build_multi_image_inputs(
|
||||
image_lists=[images],
|
||||
model_prompts=model_prompts,
|
||||
size_wrapper=size_wrapper,
|
||||
)
|
||||
|
||||
|
||||
def build_multi_image_inputs(image_lists, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper):
|
||||
return [(
|
||||
[prompt for _ in size_wrapper.data],
|
||||
[[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for image in images
|
||||
] for size in size_wrapper.data],
|
||||
) for images, prompt in zip(image_lists, model_prompts)]
|
||||
|
||||
|
||||
def build_embedding_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
image_assets: _ImageAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
):
|
||||
# These conditions will always be true if invoked through filtering,
|
||||
# but we still check them in case this is ever called directly
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build image embedding inputs")
|
||||
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
|
||||
all(factor == 1.0 for factor in size_wrapper.data):
|
||||
raise ValueError("Embedding tests require constant (1.0) size factors")
|
||||
if test_info.convert_assets_to_embeddings is None:
|
||||
raise ValueError("No conversion func for getting embeddings found")
|
||||
|
||||
model_prompts = get_model_prompts(
|
||||
SINGLE_IMAGE_BASE_PROMPTS,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
embeds = test_info.convert_assets_to_embeddings(image_assets)
|
||||
assert len(images) == len(model_prompts)
|
||||
|
||||
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
|
||||
size_wrapper)
|
||||
return inputs, vllm_embeddings
|
||||
|
||||
|
||||
def build_video_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
video_assets: _VideoAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
num_frames: int,
|
||||
):
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError("Prompt formatter must be set to build video inputs")
|
||||
model_prompts = get_model_prompts(
|
||||
[VIDEO_BASE_PROMPT],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
|
||||
else rescale_video_size)
|
||||
|
||||
return [(
|
||||
[prompt for _ in size_wrapper.data],
|
||||
[video_scaler(video, size) for size in size_wrapper.data],
|
||||
) for video, prompt in zip(sampled_vids, model_prompts)]
|
||||
|
||||
|
||||
def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]],
|
||||
size_type: SizeType):
|
||||
"""Applies a size scaler to one image; this can be a an image size factor,
|
||||
which scales the image while preserving the aspect ratio, or a fixed (width, height) that the image is resized to."""
|
||||
# Special case for embeddings; if it's a tensor, it's only valid if we
|
||||
# are considering size factors at constant scale, i.e., we just clone
|
||||
# the tensor
|
||||
if isinstance(image, torch.Tensor):
|
||||
assert size_type == SizeType.SIZE_FACTOR and size == 1
|
||||
return image
|
||||
if size_type == SizeType.SIZE_FACTOR:
|
||||
# We have a list of image size factors
|
||||
return rescale_image_size(image, size)
|
||||
elif size_type == SizeType.FIXED_SIZE:
|
||||
# We have a list of fixed sizes
|
||||
return image.resize(size)
|
||||
raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
|
||||
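# Illustrative sketch (assumes `img` is a PIL image): a size factor rescales
# while keeping the aspect ratio, whereas a fixed size resizes exactly.
#
#   half_scale = apply_image_size_scaling(img, 0.5, SizeType.SIZE_FACTOR)
#   fixed_size = apply_image_size_scaling(img, (336, 336), SizeType.FIXED_SIZE)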
@@ -0,0 +1,157 @@
|
||||
"""Utils for determining which subset of model tests belong to a specific
|
||||
modality, getting all combinations (similar to pytest's parametrization),
|
||||
handling multimodal placeholder substitution, and so on.
|
||||
"""
|
||||
import itertools
|
||||
from collections import OrderedDict
|
||||
from typing import Dict, Iterable, Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
||||
|
||||
|
||||
def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
fork_per_test: bool) -> Dict[str, VLMTestInfo]:
|
||||
"""Given the dict of potential test settings to run, return a subdict
|
||||
of tests that have the current test type enabled and the matching value for
|
||||
fork_per_test.
|
||||
"""
|
||||
|
||||
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
|
||||
return test_info.test_type == test_type or (
|
||||
isinstance(test_info.test_type, Iterable)
|
||||
and test_type in test_info.test_type)
|
||||
|
||||
matching_tests = {}
|
||||
for test_name, test_info in test_settings.items():
|
||||
# Check whether the test has the right type & keep it if it does
|
||||
if matches_test_type(test_info, test_type):
|
||||
# Embedding tests need to have a conversion func in their test info
|
||||
if matches_test_type(test_info, VLMTestType.EMBEDDING):
|
||||
assert test_info.convert_assets_to_embeddings is not None
|
||||
# Custom test inputs need to explicitly define the mm limit/inputs
|
||||
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
|
||||
assert (test_info.custom_test_opts is not None
|
||||
and isinstance(test_info.custom_test_opts, Iterable))
|
||||
# For all types besides custom inputs, we need a prompt formatter
|
||||
else:
|
||||
assert test_info.prompt_formatter is not None
|
||||
|
||||
# Everything looks okay; keep it if it has the correct proc handling
|
||||
if (test_info.distributed_executor_backend
|
||||
is not None) == fork_per_test:
|
||||
matching_tests[test_name] = test_info
|
||||
|
||||
return matching_tests
|
||||
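# Illustrative usage sketch (VLM_TEST_SETTINGS is a hypothetical dict of
# VLMTestInfo entries): keep only image tests that do not fork per test.
#
#   image_tests = get_filtered_test_settings(
#       VLM_TEST_SETTINGS, VLMTestType.IMAGE, fork_per_test=False)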
|
||||
|
||||
def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
fork_new_process_for_each_test: bool):
|
||||
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
||||
This is similar to nesting pytest parametrize calls, but done directly
|
||||
through an itertools product so that each test can set things like
|
||||
size factors etc, while still running in isolated test cases.
|
||||
"""
|
||||
matching_tests = get_filtered_test_settings(
|
||||
test_settings, test_type, fork_new_process_for_each_test)
|
||||
|
||||
# Ensure that something is wrapped as an iterable if it's not already
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
|
||||
|
||||
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
|
||||
# This is essentially the same as nesting a bunch of mark.parametrize
|
||||
# decorators, but we do it programmatically to allow overrides on
|
||||
# a per-model basis, while still being able to execute each of these
|
||||
# as individual test cases in pytest.
|
||||
iter_kwargs = OrderedDict([
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
("distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend)),
|
||||
])
|
||||
|
||||
# num_frames is video only
|
||||
if test_type == VLMTestType.VIDEO:
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(
|
||||
test_info.num_video_frames)
|
||||
|
||||
# No sizes passed for custom inputs, since inputs are directly provided
|
||||
if test_type != VLMTestType.CUSTOM_INPUTS:
|
||||
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
|
||||
if wrapped_sizes is None:
|
||||
raise ValueError(
|
||||
f"Sizes must be set for test type {test_type}")
|
||||
iter_kwargs["size_wrapper"] = wrapped_sizes
|
||||
|
||||
# Otherwise expand the custom test options instead
|
||||
else:
|
||||
if test_info.custom_test_opts is None:
|
||||
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
||||
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
|
||||
|
||||
# yapf: disable
|
||||
# Wrap all model cases in a pytest parameter & pass marks through
|
||||
return [
|
||||
pytest.param(
|
||||
model_type,
|
||||
ExpandableVLMTestArgs(
|
||||
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
|
||||
),
|
||||
marks=test_info.marks if test_info.marks is not None else []
|
||||
) for case in list(itertools.product(*iter_kwargs.values()))
|
||||
]
|
||||
# yapf: enable
|
||||
|
||||
# Get a list per model type, where each entry contains a tuple of all of
|
||||
# that model type's cases, then flatten them into the top level so that
|
||||
# we can consume them in one mark.parametrize call.
|
||||
cases_by_model_type = [
|
||||
get_model_type_cases(model_type, test_info)
|
||||
for model_type, test_info in matching_tests.items()
|
||||
]
|
||||
return list(itertools.chain(*cases_by_model_type))
|
||||
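# Illustrative usage sketch (VLM_TEST_SETTINGS is a hypothetical settings
# dict): the expanded cases are typically consumed by one mark.parametrize.
#
#   @pytest.mark.parametrize(
#       "model_type,test_case",
#       get_parametrized_options(VLM_TEST_SETTINGS, VLMTestType.IMAGE,
#                                fork_new_process_for_each_test=False))
#   def test_single_image_models(model_type, test_case, hf_runner, vllm_runner,
#                                image_assets, tmp_path):
#       ...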
|
||||
|
||||
def get_wrapped_test_sizes(
|
||||
test_info: VLMTestInfo,
|
||||
test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]:
|
||||
"""Given a test info which may have size factors or fixed sizes, wrap them
|
||||
and combine them into an iterable, each of which will be used in parameter
|
||||
expansion.
|
||||
|
||||
Args:
|
||||
test_info: Test configuration to be expanded.
|
||||
test_type: The type of test being filtered for.
|
||||
"""
|
||||
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
|
||||
if test_type == VLMTestType.EMBEDDING:
|
||||
return tuple([
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in EMBEDDING_SIZE_FACTORS
|
||||
])
|
||||
# Custom inputs have preprocessed inputs
|
||||
elif test_type == VLMTestType.CUSTOM_INPUTS:
|
||||
return tuple()
|
||||
|
||||
size_factors = test_info.image_size_factors \
|
||||
if test_info.image_size_factors else []
|
||||
fixed_sizes = test_info.image_sizes \
|
||||
if test_info.image_sizes else []
|
||||
|
||||
wrapped_factors = [
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in size_factors
|
||||
]
|
||||
|
||||
wrapped_sizes = [
|
||||
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
|
||||
for size in fixed_sizes
|
||||
]
|
||||
|
||||
return tuple(wrapped_factors + wrapped_sizes)
|
||||
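# Illustrative sketch (hypothetical values): size factors and fixed sizes are
# each wrapped and then concatenated into one tuple of parameter cases.
#
#   info = VLMTestInfo(models=["some-model"], test_type=VLMTestType.IMAGE,
#                      prompt_formatter=lambda p: p,
#                      image_size_factors=[(0.25, 0.5)],
#                      image_sizes=[[(336, 336)]])
#   get_wrapped_test_sizes(info, VLMTestType.IMAGE)
#   # -> (ImageSizeWrapper(SIZE_FACTOR, (0.25, 0.5)),
#   #     ImageSizeWrapper(FIXED_SIZE, [(336, 336)]))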
@@ -0,0 +1,141 @@
|
||||
"""Core test implementation to be shared across modalities."""
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
||||
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import AutoTokenizer, BatchEncoding
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from .....conftest import HfRunner, VllmRunner
|
||||
from .types import RunnerOutput
|
||||
|
||||
|
||||
def run_test(
|
||||
*,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]],
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
enforce_eager: bool,
|
||||
max_model_len: int,
|
||||
max_num_seqs: int,
|
||||
hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
|
||||
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
|
||||
auto_cls: Type[_BaseAutoModelClass],
|
||||
use_tokenizer_eos: bool,
|
||||
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
|
||||
comparator: Callable[..., None],
|
||||
get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]],
|
||||
limit_mm_per_prompt: Dict[str, int],
|
||||
model_kwargs: Optional[Dict[str, Any]],
|
||||
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
|
||||
task: str = "auto",
|
||||
runner_mm_key: str = "images",
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
tensor_parallel_size: int = 1,
|
||||
vllm_embeddings: Optional[torch.Tensor] = None,
|
||||
):
|
||||
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
|
||||
# In the case of embeddings, vLLM takes separate input tensors
|
||||
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
|
||||
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
|
||||
|
||||
vllm_outputs_per_mm = []
|
||||
hf_outputs_per_mm = []
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
vllm_kwargs = {}
|
||||
if get_stop_token_ids is not None:
|
||||
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
task=task) as vllm_model:
|
||||
for prompts, media in vllm_inputs:
|
||||
vllm_kwargs[runner_mm_key] = media
|
||||
vllm_output = vllm_model.generate_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs)
|
||||
vllm_outputs_per_mm.append(vllm_output)
|
||||
|
||||
hf_model = hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=auto_cls,
|
||||
postprocess_inputs=postprocess_inputs,
|
||||
model_kwargs=model_kwargs)
|
||||
|
||||
# Some models need to patch things like the model processor, e.g., internvl
|
||||
if patch_hf_runner is not None:
|
||||
hf_model = patch_hf_runner(hf_model)
|
||||
|
||||
# Some models need to explicitly pass the eos_token_id off the tokenizer or
|
||||
# processor for a good comparison; currently assume processor/tokenizer
|
||||
# agree on the EOS, and pull it off the tokenizer if requested.
|
||||
hf_kwargs = {}
|
||||
if use_tokenizer_eos:
|
||||
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
|
||||
|
||||
with hf_model, torch.no_grad():
|
||||
for prompts, media in inputs:
|
||||
hf_kwargs[runner_mm_key] = media
|
||||
hf_output = hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tokenizer=tokenizer,
|
||||
**hf_kwargs)
|
||||
hf_outputs_per_mm.append(hf_output)
|
||||
|
||||
# Apply output processing / sanitation to the vLLM and HF runner results
|
||||
hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
|
||||
model,
|
||||
first_runner_outputs=hf_outputs_per_mm,
|
||||
second_runner_outputs=vllm_outputs_per_mm,
|
||||
first_runner_processor=hf_output_post_proc,
|
||||
second_runner_processor=vllm_output_post_proc,
|
||||
)
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
|
||||
vllm_outputs_per_mm):
|
||||
# This is usually check_logprobs_close, but it's passed through to
|
||||
# allow things like check_outputs_equal where needed
|
||||
comparator(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
def process_runner_outputs(
|
||||
model,
|
||||
first_runner_outputs,
|
||||
second_runner_outputs,
|
||||
first_runner_processor=None,
|
||||
second_runner_processor=None,
|
||||
):
|
||||
"""Applies the runner processor(s) to the runner outputs, if any."""
|
||||
if first_runner_processor is not None:
|
||||
first_runner_outputs = process_outputs(first_runner_processor, model,
|
||||
first_runner_outputs)
|
||||
if second_runner_processor is not None:
|
||||
second_runner_outputs = process_outputs(second_runner_processor, model,
|
||||
second_runner_outputs)
|
||||
return first_runner_outputs, second_runner_outputs
|
||||
|
||||
|
||||
def process_outputs(output_processor, model, outputs_per_image):
|
||||
"""Applies a model specific post-processor function to a runner's output"""
|
||||
return [[output_processor(res, model) for res in outputs]
|
||||
for outputs in outputs_per_image]
|
||||
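# Illustrative sketch: apply a model-specific sanitizer (e.g. one of the
# vllm_to_hf_output helpers defined in this suite's model utils) to every
# generated output of every prompt group.
#
#   sanitized = process_outputs(qwen2_vllm_to_hf_output, model,
#                               vllm_outputs_per_mm)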
@@ -0,0 +1,102 @@
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
from typing import Callable
|
||||
|
||||
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
|
||||
resize_video, sample_frames_from_video)
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
||||
from .builders import build_multi_image_inputs, build_single_image_inputs
|
||||
from .types import ImageSizeWrapper, SizeType
|
||||
|
||||
|
||||
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
stop_sign = IMAGE_ASSETS[0].pil_image
|
||||
cherry_blossom = IMAGE_ASSETS[1].pil_image
|
||||
|
||||
# Apply the selected formatter to the base prompts
|
||||
img_prompts = [
|
||||
"<image><image>\nDescribe 2 images.",
|
||||
"<image><image>\nDescribe 2 images.",
|
||||
"<image><image><image><image>\nDescribe 4 images.",
|
||||
"<image>\nWhat is the season?",
|
||||
]
|
||||
formatted_prompts = [formatter(prompt) for prompt in img_prompts]
|
||||
|
||||
return [(
|
||||
formatted_prompts,
|
||||
[
|
||||
[stop_sign, cherry_blossom],
|
||||
# Images with different sizes and aspect-ratios
|
||||
[
|
||||
rescale_image_size(stop_sign, 0.1),
|
||||
stop_sign,
|
||||
],
|
||||
[
|
||||
stop_sign,
|
||||
rescale_image_size(stop_sign, 0.25),
|
||||
cherry_blossom.resize((183, 488)),
|
||||
cherry_blossom.resize((488, 183))
|
||||
],
|
||||
cherry_blossom,
|
||||
])]
|
||||
|
||||
|
||||
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
num_frames: int = 16):
|
||||
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
|
||||
# Apply the selected formatter to the base prompts
|
||||
video_prompts = [
|
||||
"<video><video>\nDescribe 2 videos.",
|
||||
"<video><video>\nDescribe 2 videos.",
|
||||
"<video><video><video><video>\nDescribe 4 videos.",
|
||||
"<video>\nWhy is this video funny?",
|
||||
]
|
||||
formatted_prompts = [formatter(prompt) for prompt in video_prompts]
|
||||
|
||||
return [(
|
||||
formatted_prompts,
|
||||
[
|
||||
[video, video],
|
||||
# Videos with different sizes and aspect-ratios
|
||||
[
|
||||
rescale_video_size(video, 0.1),
|
||||
video,
|
||||
],
|
||||
[
|
||||
video,
|
||||
rescale_video_size(video, 0.25),
|
||||
resize_video(video, (183, 488)),
|
||||
resize_video(video, (488, 183))
|
||||
],
|
||||
video,
|
||||
])]
|
||||
|
||||
|
||||
def different_patch_input_cases_internvl():
|
||||
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
|
||||
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||
single_img_prompts = [
|
||||
"<image>\nWhat's the content in the center of the image?",
|
||||
"<image>\nWhat is the season?",
|
||||
]
|
||||
multi_img_prompts = [
|
||||
"Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n", # noqa: E501
|
||||
]
|
||||
formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
|
||||
formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
|
||||
return [
|
||||
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
|
||||
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
|
||||
]
|
||||
@@ -0,0 +1,409 @@
|
||||
"""Common utility functions relating to different models that are useful
|
||||
for manipulating the input / output of HF & vLLM test runners, which are
|
||||
typically specific to a small subset of models.
|
||||
"""
|
||||
import re
|
||||
import types
|
||||
from pathlib import PosixPath
|
||||
from typing import Callable, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import AutoConfig, AutoTokenizer, BatchEncoding
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import patch_padding_side
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from .....conftest import HfRunner, ImageAsset, _ImageAssets
|
||||
from .types import RunnerOutput
|
||||
|
||||
|
||||
####### vLLM output processors functions
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "\n"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
hf_output_ids = tokenizer.encode(hf_output_str)
|
||||
assert hf_output_ids[0] == tokenizer.bos_token_id
|
||||
hf_output_ids = hf_output_ids[1:]
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str.lstrip() + "|ENDOFTEXT|"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def qwen_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "<|endoftext|>"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def qwen2_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "<|im_end|>"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.image_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def llava_video_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.video_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
mm_token_id: int) -> RunnerOutput:
|
||||
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
|
||||
]
|
||||
|
||||
assert output_str[0] == " "
|
||||
hf_output_str = output_str[1:]
|
||||
if hf_output_ids[-1] == eos_token_id:
|
||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
video_token_id = config.video_token_index
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
|
||||
]
|
||||
|
||||
hf_output_str = output_str
|
||||
if hf_output_ids[-1] == eos_token_id:
|
||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
|
||||
assert output_str_without_image[0] == " "
|
||||
output_str_without_image = output_str_without_image[1:]
|
||||
|
||||
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
hf_output_ids = tokenizer.encode(output_str_without_image)
|
||||
assert hf_output_ids[0] == 1
|
||||
hf_output_ids = hf_output_ids[1:]
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
image_token_id = config.image_token_index
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
||||
]
|
||||
|
||||
hf_output_str = output_str
|
||||
|
||||
if hf_output_ids[-1] == eos_token_id:
|
||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
####### Post-processors for HF outputs
|
||||
def minicmpv_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|eot_id|>"):
|
||||
output_str = output_str.split("<|eot_id|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
####### Functions for converting image assets to embeddings
|
||||
def get_llava_embeddings(image_assets: _ImageAssets):
|
||||
return [asset.image_embeds for asset in image_assets]
|
||||
|
||||
|
||||
####### postprocessors to run on HF BatchEncoding
|
||||
def get_key_type_post_processor(
|
||||
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
|
||||
"""Gets a handle to a post processor which converts a given key into a
|
||||
target data type."""
|
||||
|
||||
def process(hf_inputs: BatchEncoding, dtype: str):
|
||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
||||
hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
|
||||
return hf_inputs
|
||||
|
||||
return process
|
||||
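# Illustrative usage sketch (the key name is only an example): cast the pixel
# values of an HF BatchEncoding to the dtype used by the test.
#
#   cast_pixel_values = get_key_type_post_processor("pixel_values")
#   hf_inputs = cast_pixel_values(hf_inputs, "half")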
|
||||
|
||||
def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
|
||||
return {"model_inputs": hf_inputs}
|
||||
|
||||
|
||||
####### Prompt path encoders for models that need models on disk
|
||||
def qwen_prompt_path_encoder(
|
||||
tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
|
||||
_ImageAssets]) -> str:
|
||||
"""Given a temporary dir path, export one or more image assets into the
|
||||
tempdir & replace the prompt's image placeholders with the local paths so that
|
||||
the HF version of Qwen-VL can resolve the path and load the image in its
|
||||
forward() call.
|
||||
|
||||
Args:
|
||||
tmp_path: Tempdir for test under consideration.
|
||||
prompt: Prompt with image placeholders.
|
||||
assets: List of image assets whose len equals the num placeholders.
|
||||
"""
|
||||
# Ensure that the number of placeholders matches the number of assets;
|
||||
# If this is not true, the test is probably written incorrectly.
|
||||
assert prompt.count("<img></img>") == len(assets)
|
||||
|
||||
# Replace the placeholders with local paths to the exported assets
|
||||
for asset in assets:
|
||||
image_tmp_path = tmp_path / f"{asset.name}.jpg"
|
||||
asset.pil_image.save(image_tmp_path)
|
||||
prompt = prompt.replace(
|
||||
"<img></img>",
|
||||
f"<img>{image_tmp_path}</img>",
|
||||
1,
|
||||
)
|
||||
return prompt
|
||||
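# Illustrative sketch (hypothetical prompt; tmp_path comes from pytest):
#
#   prompt = "Picture 1: <img></img>\nWhat's the content of the image?"
#   encoded = qwen_prompt_path_encoder(tmp_path, prompt, [image_assets[0]])
#   # -> "Picture 1: <img>/tmp/.../<asset name>.jpg</img>\nWhat's the ..."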
|
||||
|
||||
####### Model-specific HuggingFace runner patchers
|
||||
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4."""
|
||||
hf_processor = hf_model.processor
|
||||
patch_padding_side(hf_processor)
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
if images is None:
|
||||
return hf_processor(*args, **kwargs)
|
||||
|
||||
return hf_processor.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"image": images,
|
||||
"content": text
|
||||
}],
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.transformer.output_layer
|
||||
return hf_model
|
||||
|
||||
|
||||
def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for H2OVL."""
|
||||
|
||||
class H2OVLProcessor:
|
||||
"""A simple processor for H2OVL models."""
|
||||
|
||||
def __init__(self, hf_runner: HfRunner):
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
self.dtype = hf_runner.model.dtype
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, List[Image]],
|
||||
**kwargs):
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
|
||||
|
||||
# yapf: enable
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values(image,
|
||||
self.image_size,
|
||||
self.min_num,
|
||||
self.max_num,
|
||||
self.use_thumbnail,
|
||||
use_MSAC=self.config.use_msac).to(
|
||||
self.dtype) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = H2OVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for InternVL."""
|
||||
|
||||
class InternVLProcessor:
|
||||
"""A simple processor for InternVL2 which misses a processor."""
|
||||
|
||||
def __init__(self, hf_runner: HfRunner):
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
self.dtype = hf_runner.model.dtype
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, List[Image]],
|
||||
**kwargs):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values(image, self.image_size, self.min_num,
|
||||
self.max_num,
|
||||
self.use_thumbnail).to(self.dtype)
|
||||
for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = InternVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
def _internvl_generate(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
input_ids: torch.FloatTensor,
|
||||
attention_mask: Optional[torch.LongTensor] = None,
|
||||
**generate_kwargs,
|
||||
) -> torch.LongTensor:
|
||||
"""Generate method for InternVL2 model without fixed use_cache."""
|
||||
assert self.img_context_token_id is not None
|
||||
vit_embeds = self.extract_feature(pixel_values)
|
||||
input_embeds = self.language_model.get_input_embeddings()(input_ids)
|
||||
B, N, C = input_embeds.shape
|
||||
input_embeds = input_embeds.reshape(B * N, C)
|
||||
|
||||
input_ids = input_ids.reshape(B * N)
|
||||
selected = (input_ids == self.img_context_token_id)
|
||||
assert selected.sum() != 0
|
||||
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
|
||||
|
||||
input_embeds = input_embeds.reshape(B, N, C)
|
||||
|
||||
forward_kwargs = dict(
|
||||
inputs_embeds=input_embeds,
|
||||
attention_mask=attention_mask,
|
||||
)
|
||||
if getattr(self, "use_visual_token_mask", False):
|
||||
visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
|
||||
forward_kwargs["visual_token_mask"] = visual_token_mask
|
||||
outputs = self.language_model.generate(
|
||||
**forward_kwargs,
|
||||
**generate_kwargs,
|
||||
)
|
||||
|
||||
return outputs
|
||||
@@ -0,0 +1,139 @@
|
||||
"""Entrypoints for wrapping the core run_test implementation for specific test
|
||||
types / modalities.
|
||||
"""
|
||||
from pathlib import PosixPath
|
||||
from typing import Type
|
||||
|
||||
from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets
|
||||
from . import builders, core
|
||||
from .types import ExpandableVLMTestArgs, VLMTestInfo
|
||||
|
||||
|
||||
####### Entrypoints for running different test types
|
||||
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_single_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key="images",
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
|
||||
|
||||
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_multi_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": len(image_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key="images",
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
|
||||
|
||||
def run_embedding_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
vllm_embeddings=vllm_embeddings,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key="images",
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
|
||||
|
||||
def run_video_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
video_assets: _VideoAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
assert test_case.num_video_frames is not None
|
||||
inputs = builders.build_video_inputs_from_test_info(
|
||||
model_test_info, video_assets, test_case.size_wrapper,
|
||||
test_case.num_video_frames)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"video": len(video_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key="videos",
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
|
||||
|
||||
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner]):
|
||||
# Custom test cases can provide inputs directly, but they need to
|
||||
# explicitly provide a CustomTestOptions, which wraps the inputs and
|
||||
# the limit_mm_per_prompt
|
||||
assert test_case.custom_test_opts is not None
|
||||
|
||||
inputs = test_case.custom_test_opts.inputs
|
||||
limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
|
||||
runner_mm_key = test_case.custom_test_opts.runner_mm_key
|
||||
# Inputs, limit_mm_per_prompt, and runner_mm_key should all be set
|
||||
assert inputs is not None
|
||||
assert limit_mm_per_prompt is not None
|
||||
assert runner_mm_key is not None
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key=runner_mm_key,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
@@ -0,0 +1,186 @@
|
||||
"""Types for writing multimodal model tests."""
|
||||
from enum import Enum
|
||||
from pathlib import PosixPath
|
||||
from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
|
||||
Tuple, Type, Union)
|
||||
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from pytest import MarkDecorator
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import identity
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
|
||||
from ....utils import check_logprobs_close
|
||||
|
||||
# meta image tag; will be replaced by the appropriate tag for the model
|
||||
TEST_IMG_PLACEHOLDER = "<vlm_image>"
|
||||
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
|
||||
|
||||
# yapf: disable
|
||||
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
|
||||
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
|
||||
})
|
||||
|
||||
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
|
||||
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
|
||||
|
||||
|
||||
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
||||
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
|
||||
RunnerOutput = Tuple[List[int], str, Optional[SampleLogprobs]]
|
||||
# yapf: enable
|
||||
|
||||
|
||||
class VLMTestType(Enum):
|
||||
IMAGE = 1
|
||||
MULTI_IMAGE = 2
|
||||
EMBEDDING = 3
|
||||
VIDEO = 4
|
||||
CUSTOM_INPUTS = 5
|
||||
|
||||
|
||||
class SizeType(Enum):
|
||||
SIZE_FACTOR = 1
|
||||
FIXED_SIZE = 2
|
||||
|
||||
|
||||
class CustomTestOptions(NamedTuple):
|
||||
inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]]
|
||||
limit_mm_per_prompt: Dict[str, int]
|
||||
# Kwarg under which multimodal data is passed to vllm/hf runner instances.
|
||||
runner_mm_key: str = "images"
|
||||
|
||||
|
||||
class ImageSizeWrapper(NamedTuple):
|
||||
type: SizeType
|
||||
# A size factor is a wrapper of 0+ floats,
|
||||
# while a fixed size contains an iterable of integer pairs
|
||||
data: Union[Iterable[float], Iterable[Tuple[int, int]]]
|
||||
|
||||
|
||||
class VLMTestInfo(NamedTuple):
|
||||
"""Holds the configuration for 1+ tests for one model architecture."""
|
||||
|
||||
models: Union[List[str]]
|
||||
test_type: Union[VLMTestType, Iterable[VLMTestType]]
|
||||
|
||||
# Should be None only if this is a CUSTOM_INPUTS test
|
||||
prompt_formatter: Optional[Callable[[str], str]] = None
|
||||
img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
|
||||
video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
|
||||
|
||||
# Most models work on the single / multi-image prompts above, but in some
|
||||
# cases the log prob check fails, e.g., for paligemma. We allow passing
|
||||
# an override for the single image prompts / multi-image prompt for this
|
||||
# reason.
|
||||
single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
|
||||
multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT
|
||||
|
||||
# Function for converting ImageAssets to image embeddings;
|
||||
# We need to define this explicitly for embedding tests
|
||||
convert_assets_to_embeddings: Optional[Callable[[_ImageAssets],
|
||||
torch.Tensor]] = None
|
||||
|
||||
# Exposed options for vLLM runner; we change these in several tests,
|
||||
# but the defaults are derived from VllmRunner & the engine defaults
|
||||
# These settings are chosen to avoid OOMs when running in the CI
|
||||
enforce_eager: bool = True
|
||||
max_model_len: int = 1024
|
||||
max_num_seqs: int = 256
|
||||
task: str = "auto"
|
||||
tensor_parallel_size: int = 1
|
||||
|
||||
# Optional callable which gets a list of token IDs from the model tokenizer
|
||||
get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None
|
||||
|
||||
# Exposed options for HF runner
|
||||
model_kwargs: Optional[Dict[str, Any]] = None
|
||||
# Indicates we should explicitly pass the EOS from the tokenizer
|
||||
use_tokenizer_eos: bool = False
|
||||
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM
|
||||
# Callable to pass to the HF runner to run on inputs; for now, we also pass
|
||||
# the data type to input post processing, because almost all of the uses of
|
||||
# postprocess_inputs are to fix the data types of BatchEncoding values.
|
||||
postprocess_inputs: Callable[[BatchEncoding, str],
|
||||
BatchEncoding] = identity
|
||||
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
|
||||
|
||||
# Post processors that, if defined, will run on the outputs of the
|
||||
# vLLM and HF runner, respectively (useful for sanitization, etc).
|
||||
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
|
||||
hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
|
||||
|
||||
# Consumes the output of the callables above and checks if they're equal
|
||||
comparator: Callable[..., None] = check_logprobs_close
|
||||
|
||||
# Default expandable params per test; these defaults can be overridden in
|
||||
# instances of this object; the complete set of test cases for the model
|
||||
# is all combinations of .models + all fields below
|
||||
max_tokens: Union[int, Tuple[int]] = 128
|
||||
num_logprobs: Union[int, Tuple[int]] = 5
|
||||
dtype: Union[str, Iterable[str]] = "half"
|
||||
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
|
||||
# Only expanded in video tests
|
||||
num_video_frames: Union[int, Tuple[int]] = 16
|
||||
|
||||
# Fixed image sizes / image size factors; most tests use image_size_factors
|
||||
# The values provided for these two fields will be stacked and expanded
|
||||
# such that each model will consider each image size factor / image size
|
||||
# once per test (much like concatenating and wrapping in one parametrize
|
||||
# call)
|
||||
image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
|
||||
image_sizes: Optional[Iterable[Iterable[Tuple[int, int]]]] = None
|
||||
|
||||
# Hack for updating a prompt to incorporate a local path; currently only used
|
||||
# for Qwen-VL, which requires encoding the image path / url into the prompt
|
||||
# for HF runner
|
||||
prompt_path_encoder: Optional[
|
||||
Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]],
|
||||
str]] = None # noqa: E501
|
||||
|
||||
# Allows configuring a test to run with custom inputs
|
||||
custom_test_opts: Optional[List[CustomTestOptions]] = None
|
||||
|
||||
marks: Optional[List[MarkDecorator]] = None
|
||||
|
||||
def get_non_parametrized_runner_kwargs(self):
|
||||
"""Returns a dictionary of expandable kwargs for items that are used
|
||||
in all test types, which are NOT used when creating the parametrized
|
||||
test cases.
|
||||
"""
|
||||
return {
|
||||
"enforce_eager": self.enforce_eager,
|
||||
"max_model_len": self.max_model_len,
|
||||
"max_num_seqs": self.max_num_seqs,
|
||||
"task": self.task,
|
||||
"tensor_parallel_size": self.tensor_parallel_size,
|
||||
"hf_output_post_proc": self.hf_output_post_proc,
|
||||
"vllm_output_post_proc": self.vllm_output_post_proc,
|
||||
"auto_cls": self.auto_cls,
|
||||
"use_tokenizer_eos": self.use_tokenizer_eos,
|
||||
"postprocess_inputs": self.postprocess_inputs,
|
||||
"comparator": self.comparator,
|
||||
"get_stop_token_ids": self.get_stop_token_ids,
|
||||
"model_kwargs": self.model_kwargs,
|
||||
"patch_hf_runner": self.patch_hf_runner,
|
||||
}
|
||||
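# Illustrative sketch of a hypothetical entry in a per-model settings dict;
# the model name, prompt format, and embedding converter are examples only.
#
#   "llava": VLMTestInfo(
#       models=["llava-hf/llava-1.5-7b-hf"],
#       test_type=(VLMTestType.IMAGE, VLMTestType.EMBEDDING),
#       prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
#       convert_assets_to_embeddings=model_utils.get_llava_embeddings,
#   ),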
|
||||
|
||||
class ExpandableVLMTestArgs(NamedTuple):
|
||||
"""The expanded kwargs which correspond to a single test case."""
|
||||
model: str
|
||||
max_tokens: int
|
||||
num_logprobs: int
|
||||
dtype: str
|
||||
distributed_executor_backend: Optional[str]
|
||||
# Sizes are used for everything except for custom input tests
|
||||
size_wrapper: Optional[ImageSizeWrapper] = None
|
||||
# Video only
|
||||
num_video_frames: Optional[int] = None
|
||||
# Custom inputs only
|
||||
custom_test_opts: Optional[CustomTestOptions] = None
|
||||
0
vllm-v0.6.2/tests/models/embedding/__init__.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Compare the outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
This test only tests small models. Big models such as 7B should be tested from
|
||||
test_big_models.py, which can use a larger instance to run the tests.
|
||||
|
||||
Run `pytest tests/models/test_cls_models.py`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
def test_classification_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
# This test is for verifying whether the model's extra_repr
|
||||
# can be printed correctly.
|
||||
print(vllm_model.model.llm_engine.model_executor.driver_worker.
|
||||
model_runner.model)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForSequenceClassification) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
# check logits difference
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output, 1e-3)
|
||||
@@ -0,0 +1,60 @@
|
||||
"""Compare the embedding outputs of HF and vLLM models.
|
||||
|
||||
Run `pytest tests/models/embedding/language/test_embedding.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from ..utils import check_embeddings_close
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
# [Encoder-only]
|
||||
pytest.param("BAAI/bge-base-en-v1.5",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
pytest.param("intfloat/multilingual-e5-large"),
|
||||
# [Decoder-only]
|
||||
pytest.param("intfloat/e5-mistral-7b-instruct",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
pytest.param("BAAI/bge-multilingual-gemma2",
|
||||
marks=[pytest.mark.core_model]),
|
||||
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
|
||||
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
# The example_prompts has ending "\n", for example:
|
||||
# "Write a short story about a robot that dreams for the first time.\n"
|
||||
# sentence_transformers will strip the input texts, see:
|
||||
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
|
||||
# This makes the input_ids different between hf_model and vllm_model.
|
||||
# So we need to strip the input texts to avoid test failing.
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
is_sentence_transformer=True) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(model, task="embedding", dtype=dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
# This test is for verifying whether the model's extra_repr
|
||||
# can be printed correctly.
|
||||
print(vllm_model.model.llm_engine.model_executor.driver_worker.
|
||||
model_runner.model)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
30
vllm-v0.6.2/tests/models/embedding/utils.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from typing import List, Sequence
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
def check_embeddings_close(
|
||||
*,
|
||||
embeddings_0_lst: Sequence[List[float]],
|
||||
embeddings_1_lst: Sequence[List[float]],
|
||||
name_0: str,
|
||||
name_1: str,
|
||||
tol: float = 1e-3,
|
||||
) -> None:
|
||||
assert len(embeddings_0_lst) == len(embeddings_1_lst)
|
||||
|
||||
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
|
||||
zip(embeddings_0_lst, embeddings_1_lst)):
|
||||
assert len(embeddings_0) == len(embeddings_1), (
|
||||
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
|
||||
|
||||
sim = F.cosine_similarity(torch.tensor(embeddings_0),
|
||||
torch.tensor(embeddings_1),
|
||||
dim=0)
|
||||
|
||||
fail_msg = (f"Test{prompt_idx}:"
|
||||
f"\n{name_0}:\t{embeddings_0!r}"
|
||||
f"\n{name_1}:\t{embeddings_1!r}")
|
||||
|
||||
assert sim >= 1 - tol, fail_msg
|
||||
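# Illustrative sketch (made-up vectors): nearly identical embeddings pass the
# cosine-similarity check at the given tolerance.
#
#   check_embeddings_close(embeddings_0_lst=[[1.0, 0.0]],
#                          embeddings_1_lst=[[0.999, 0.001]],
#                          name_0="a", name_1="b", tol=1e-2)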
@@ -0,0 +1,209 @@
|
||||
from functools import partial
|
||||
from typing import Callable, Dict, List, Type
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import BatchEncoding, Qwen2VLForConditionalGeneration
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
from ..utils import check_embeddings_close
|
||||
|
||||
HF_TEXT_PROMPTS = [
|
||||
# T -> X
|
||||
(
|
||||
"Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501,
|
||||
Image.new("RGB", (56, 56))),
|
||||
# T -> X
|
||||
("Query: Retrieve an image of this caption: cherry blossom",
|
||||
Image.new("RGB", (56, 56))),
|
||||
]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"What is shown in this image?",
|
||||
"cherry_blossom":
|
||||
"What is shown in this image?"
|
||||
})
|
||||
|
||||
MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
|
||||
|
||||
|
||||
def get_messages(image: Image.Image, text: str, embed_text: bool):
|
||||
# assert False, 'remember to use outer [] as required'
|
||||
if embed_text:
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"image": Image.new("RGB", (56, 56)),
|
||||
"resized_height": 1,
|
||||
"resized_width": 1
|
||||
}, # need a dummy image here for an easier process.
|
||||
{
|
||||
"type": "text",
|
||||
"text": text
|
||||
},
|
||||
]
|
||||
}]
|
||||
else:
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "image",
|
||||
"image": image
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": text
|
||||
}]
|
||||
}]
|
||||
return messages
|
||||
|
||||
|
||||
def apply_chat_template_and_add_eos(
|
||||
messages: List[Dict],
|
||||
apply_chat_template_fn: Callable,
|
||||
):
|
||||
prompt = apply_chat_template_fn(
|
||||
messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>"
|
||||
return prompt
|
||||
|
||||
|
||||
def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs):
|
||||
return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs)
|
||||
|
||||
|
||||
def _run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
input_texts: List[str],
|
||||
input_images: PromptImageInput,
|
||||
embed_texts: List[bool],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
'''SET PYTHONPATH'''
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
task="embedding",
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
max_model_len=8192) as vllm_model:
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
texts = [
|
||||
# this is necessary because vllm_model.encode will not apply any
|
||||
# templating to the prompt, and therefore lacks an image_pad
|
||||
# token unless one is inserted beforehand (the (56, 56) image
|
||||
# above is converted to an image pad token by the chat template).
|
||||
apply_chat_template_and_add_eos(
|
||||
get_messages(image, text, False),
|
||||
apply_chat_template_fn=tokenizer.apply_chat_template,
|
||||
) for text, image in zip(input_texts, input_images)
|
||||
# vllm will replace the pad token with the actual image,
|
||||
# which may be a placeholder image, later.
|
||||
]
|
||||
vllm_outputs = vllm_model.encode(texts, images=input_images)
|
||||
|
||||
hf_outputs = []
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
|
||||
hf_model.postprocess_inputs = partial(
|
||||
postprocess_inputs,
|
||||
hf_model,
|
||||
cache_position=torch.arange(
|
||||
0,
|
||||
1, # 1 for batch size
|
||||
requires_grad=False),
|
||||
use_cache=False)
|
||||
for text, image, embed_text in zip(input_texts, input_images,
|
||||
embed_texts):
|
||||
# dse requires non-standard input processing
|
||||
# because it needs an image_pad token
|
||||
messages = get_messages(image, text, embed_text)
|
||||
prompt = apply_chat_template_and_add_eos(
|
||||
messages, hf_model.processor.apply_chat_template)
|
||||
inputs = hf_model.get_inputs(
|
||||
prompts=[[prompt]],
|
||||
images=[[image]],
|
||||
)
|
||||
with torch.no_grad():
|
||||
outputs = hf_model.model(
|
||||
**hf_model.wrap_device(inputs[0],
|
||||
device=hf_model.model.device.type),
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)
|
||||
pooled_output = torch.nn.functional.normalize(
|
||||
outputs.hidden_states[-1][0, -1], p=2, dim=-1)
|
||||
hf_outputs.append(pooled_output.tolist())
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_models_text(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [(text, image_placeholder)
|
||||
for text, image_placeholder in HF_TEXT_PROMPTS]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
embed_texts = [True] * len(input_texts)
|
||||
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
input_texts,
|
||||
input_images, # type: ignore
|
||||
embed_texts,
|
||||
model,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_models_image(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [
|
||||
(text, asset.pil_image)
|
||||
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
embed_texts = [False] * len(input_texts)
|
||||
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
input_texts,
|
||||
input_images,
|
||||
embed_texts,
|
||||
model,
|
||||
dtype=dtype,
|
||||
)
|
||||
@@ -0,0 +1,140 @@
|
||||
from typing import List, Type
|
||||
|
||||
import pytest
|
||||
import torch.nn.functional as F
|
||||
import transformers
|
||||
from transformers import AutoModelForVision2Seq
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
from ..utils import check_embeddings_close
|
||||
|
||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
|
||||
|
||||
HF_TEXT_PROMPTS = [
|
||||
# T -> X
|
||||
llama3_template.format(
|
||||
"The label of the object is stop sign\nSummary above sentence in one word: " # noqa: E501
|
||||
),
|
||||
# T -> X
|
||||
llama3_template.format(
|
||||
"cherry blossom\nSummary above sentence in one word: "),
|
||||
]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
# I -> X
|
||||
"stop_sign":
|
||||
llama3_template.format("<image>\nSummary above image in one word: "),
|
||||
# I -> X
|
||||
"cherry_blossom":
|
||||
llama3_template.format("<image>\nSummary above image in one word: "),
|
||||
})
|
||||
|
||||
MODELS = ["royokong/e5-v"]
|
||||
|
||||
|
||||
def _run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
input_texts: List[str],
|
||||
input_images: PromptImageInput,
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
task="embedding",
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
||||
# Patch the issue where image_token_id
|
||||
# exceeds the maximum allowed vocab size
|
||||
hf_model.model.resize_token_embeddings(
|
||||
hf_model.model.language_model.vocab_size + 1)
|
||||
|
||||
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
|
||||
|
||||
all_outputs = []
|
||||
for inputs in all_inputs:
|
||||
# Based on: https://huggingface.co/royokong/e5-v
|
||||
outputs = hf_model.model(
|
||||
**hf_model.wrap_device(inputs,
|
||||
device=hf_model.model.device.type),
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)
|
||||
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :],
|
||||
dim=-1)
|
||||
|
||||
all_outputs.append(pooled_output.tolist())
|
||||
|
||||
hf_outputs = all_outputs
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(transformers.__version__.startswith("4.46"),
|
||||
reason="Model broken with changes in transformers 4.46")
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_models_text(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
input_texts,
|
||||
input_images, # type: ignore
|
||||
model,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_models_image(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [
|
||||
(text, asset.pil_image)
|
||||
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
input_texts,
|
||||
input_images,
|
||||
model,
|
||||
dtype=dtype,
|
||||
)
|
||||
126
vllm-v0.6.2/tests/models/embedding/vision_language/test_phi3v.py
Normal file
@@ -0,0 +1,126 @@
|
||||
from typing import List, Type
|
||||
|
||||
import pytest
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
from ..utils import check_embeddings_close
|
||||
|
||||
HF_TEXT_PROMPTS = [
|
||||
# T -> X
|
||||
"Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501
|
||||
# T -> X
|
||||
"Retrieve an image of this caption: cherry blossom",
|
||||
]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
# T + I -> X
|
||||
"stop_sign":
|
||||
"<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
|
||||
# I -> X
|
||||
"cherry_blossom":
|
||||
"<|image_1|> Represent the given image for classification", # noqa: E501
|
||||
})
|
||||
|
||||
MODELS = ["TIGER-Lab/VLM2Vec-Full"]
|
||||
|
||||
|
||||
def _run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
input_texts: List[str],
|
||||
input_images: PromptImageInput,
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model, task="embedding", dtype=dtype,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
|
||||
|
||||
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
|
||||
hf_model_kwargs = {"_attn_implementation": "eager"}
|
||||
with hf_runner(model, dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs) as hf_model:
|
||||
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
|
||||
|
||||
all_outputs = []
|
||||
for inputs in all_inputs:
|
||||
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
|
||||
outputs = hf_model.model(
|
||||
**hf_model.wrap_device(inputs,
|
||||
device=hf_model.model.device.type),
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)
|
||||
last_hidden_state = outputs.hidden_states[-1][0]
|
||||
reps = last_hidden_state[inputs.attention_mask[0].sum() - 1]
|
||||
pooled_output = F.normalize(reps, p=2, dim=-1)
|
||||
|
||||
all_outputs.append(pooled_output.tolist())
|
||||
|
||||
hf_outputs = all_outputs
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_models_text(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
input_texts,
|
||||
input_images, # type: ignore
|
||||
model,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_models_image(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [
|
||||
(text, asset.pil_image)
|
||||
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
input_texts,
|
||||
input_images,
|
||||
model,
|
||||
dtype=dtype,
|
||||
)
|
||||
222
vllm-v0.6.2/tests/models/encoder_decoder/language/test_bart.py
Normal file
@@ -0,0 +1,222 @@
|
||||
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
|
||||
"""
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
|
||||
HfRunner, VllmRunner)
|
||||
from ....utils import multi_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "</s>"
|
||||
if decoder_prompt_type == DecoderPromptType.NONE:
|
||||
hf_output_str = "<s>" + hf_output_str
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
) -> None:
|
||||
'''
|
||||
Test the vLLM BART model for a variety of encoder/decoder input prompts,
|
||||
by validating it against HuggingFace (HF) BART.
|
||||
|
||||
Arguments:
|
||||
|
||||
* hf_runner: HuggingFace (HF) test model runner
|
||||
* vllm_runner: vLLM test model runner
|
||||
* example_encoder_decoder_prompts: test fixture which provides a
|
||||
dictionary of dummy prompts
|
||||
* model: the HF ID of the specific BART variant under test
|
||||
* dtype: the tensor datatype to employ
|
||||
* max_tokens
|
||||
* num_logprobs
|
||||
* decoder_prompt_type: key into the example_encoder_decoder_prompts
|
||||
dictionary; selects specific encoder/decoder
|
||||
prompt scenarios to test
|
||||
|
||||
A note on using HF BART as a baseline for validating vLLM BART,
|
||||
specifically when the decoder prompt is None.
|
||||
|
||||
The HF GenerationMixin's default behavior is to force the first
|
||||
decoded token to be <BOS> if the prompt does not already contain
|
||||
<BOS> (this is accomplished using a logit
|
||||
processor setting.)
|
||||
|
||||
So when we use HF BART as our baseline for comparison, note that
|
||||
when the user provides a request with a None decoder prompt
|
||||
(i.e. a singleton encoder prompt, or else an explicit encoder/
|
||||
decoder prompt with the decoder sub-prompt set to None), HF and
|
||||
vLLM handle this in different ways:
|
||||
|
||||
* HF will (1) tokenize the None prompt as an empty token-list,
|
||||
(2) append <decoder-start-token> to the beginning, yielding
|
||||
[<decoder-start-token>], (3) pass this token list to the model, and
|
||||
then (4) after computing logits during prefill, override the model
|
||||
logits & force <BOS> to be the first generated token.
|
||||
|
||||
* vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
|
||||
start-token to the beginning, yielding [<decoder-start-token><BOS>],
|
||||
(3) pass these tokens to the model & proceed with generation.
|
||||
|
||||
The net effect is that compared to vLLM, the list of HF *decoded* tokens
|
||||
will contain one more initial <BOS> than the vLLM generated tokens,
|
||||
because vLLM's <BOS> token is injected into the prompt rather than into
|
||||
the generated output. This is in spite of the fact that overall, the
|
||||
complete sequences (prompt + decoded tokens) produced by vLLM will match
|
||||
HF.
|
||||
|
||||
So when we use HF decoded token output to validate vLLM's decoded token
|
||||
output, the testing process must account for the difference in decoded
|
||||
token sequences between vLLM and HF specifically in the
|
||||
decoder-prompt-is-None case.
|
||||
|
||||
One option is to disable the logit processor feature that forces the
|
||||
<BOS> token to be decoded (forced_bos_token_id = None), eliminating
|
||||
the problem entirely. However this is not "normal" BART usage.
|
||||
|
||||
The other option is - only in the decoder-prompt-is-None case - to
|
||||
discard the first decoded token from the HF output before comparing it
|
||||
to vLLM.
|
||||
|
||||
To that end, when testing the scenario where the decoder prompt is None
|
||||
(and only in that one scenario), this test skips the first HF decoded
|
||||
token during the process of validating the vLLM decoded output.
|
||||
'''
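# A schematic example of the mismatch described above (the token ids are
# hypothetical, not taken from the real BART vocabulary):
#   HF decoded ids:   [<BOS>, 8, 15, 3]   # <BOS> is force-decoded first
#   vLLM decoded ids: [8, 15, 3]          # <BOS> was already part of the prompt
#   hf_decoded[1:] == vllm_decoded        # hence hf_skip_tokens = 1 below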
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default).
|
||||
|
||||
# Note: currently encoder/decoder models are only compatible with
|
||||
# enforce_eager=True. Normally this is not a problem because
|
||||
# for encoder/decoder models vLLM will
|
||||
# default to enforce_eager=True if enforce_eager
|
||||
# is left unspecified. However, the
|
||||
# VllmRunner test fixture (which wraps around the LLM class) defaults to
|
||||
# enforce_eager=False (a behavior which a number of already-existing
|
||||
# decoder-only unit tests expect), so when testing an encoder/decoder
|
||||
# model we must explicitly specify enforce_eager=True in the VllmRunner
|
||||
# constructor.
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs)
|
||||
|
||||
# Configuration settings for HF baseline
|
||||
hf_kwargs = {
|
||||
"top_k": None,
|
||||
"num_beams": 1,
|
||||
"repetition_penalty": 1.0,
|
||||
"top_p": 1.0,
|
||||
"length_penalty": 1.0,
|
||||
"early_stopping": False,
|
||||
"no_repeat_ngram_size": None,
|
||||
"min_length": 0
|
||||
}
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
||||
hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
**hf_kwargs,
|
||||
))
|
||||
|
||||
hf_skip_tokens = (1
|
||||
if decoder_prompt_type == DecoderPromptType.NONE else 0)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output, decoder_prompt_type)
|
||||
for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
num_outputs_0_skip_tokens=hf_skip_tokens,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param("facebook/bart-base",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
pytest.param("facebook/bart-large-cnn"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
|
||||
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
|
||||
dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts[decoder_prompt_type],
|
||||
decoder_prompt_type,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
|
||||
def test_models_distributed(hf_runner, vllm_runner,
|
||||
example_encoder_decoder_prompts,
|
||||
distributed_executor_backend, model, dtype,
|
||||
max_tokens, num_logprobs,
|
||||
decoder_prompt_type) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts[decoder_prompt_type],
|
||||
decoder_prompt_type,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@@ -0,0 +1,35 @@
|
||||
import pytest
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", [
|
||||
"meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
])
|
||||
def test_models(hf_runner, vllm_runner, image_assets,
|
||||
distributed_executor_backend, model) -> None:
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
num_logprobs = 5
|
||||
tensor_parallel_size = 2
|
||||
|
||||
if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
|
||||
from .test_mllama import models, run_test
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@@ -0,0 +1,102 @@
|
||||
from functools import partial
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.inputs.data import ExplicitEncoderDecoderPrompt
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import HfRunner, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
Florence2Prompt = partial(ExplicitEncoderDecoderPrompt,
|
||||
decoder_prompt=None,
|
||||
mm_processor_kwargs=None)
|
||||
|
||||
MODELS = ["microsoft/Florence-2-base"]
|
||||
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
|
||||
# Therefore, we borrow the BartTokenizer from the original Bart model
|
||||
TOKENIZER = "facebook/bart-base"
|
||||
PROMPTS = [
|
||||
Florence2Prompt(encoder_prompt="<CAPTION>"),
|
||||
Florence2Prompt(encoder_prompt="<DETAILED_CAPTION>"),
|
||||
Florence2Prompt(encoder_prompt="<MORE_DETAILED_CAPTION>"),
|
||||
Florence2Prompt(encoder_prompt="<CAPTION_TO_PHRASE_GROUNDING>"),
|
||||
Florence2Prompt(encoder_prompt="<DENSE_REGION_CAPTION>"),
|
||||
Florence2Prompt(encoder_prompt="<REGION_PROPOSAL>"),
|
||||
Florence2Prompt(encoder_prompt="<OCR_WITH_REGION>"),
|
||||
Florence2Prompt(encoder_prompt="<OCR>"),
|
||||
Florence2Prompt(encoder_prompt="<OD>"),
|
||||
]
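# Each entry above is shorthand for an explicit encoder/decoder prompt; a
# minimal sketch of what the partial expands to (same semantics, spelled out):
#
#     PROMPTS[0] == ExplicitEncoderDecoderPrompt(encoder_prompt="<CAPTION>",
#                                                decoder_prompt=None,
#                                                mm_processor_kwargs=None)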
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||
Optional[SampleLogprobs]], ):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = "</s><s>" + output_str + "</s>"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
prompts: List[ExplicitEncoderDecoderPrompt],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
) -> None:
|
||||
with vllm_runner(model,
|
||||
tokenizer_name=TOKENIZER,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs)
|
||||
|
||||
# Florence-2 processors require image inputs
|
||||
dummy_image = Image.new(mode="RGB", size=(2, 2))
|
||||
with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.lm_head
|
||||
hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
images=[dummy_image] * len(prompts),
|
||||
))
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(hf_runner, vllm_runner, model, dtype, max_tokens,
|
||||
num_logprobs) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
PROMPTS,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
@@ -0,0 +1,367 @@
|
||||
from typing import List, Optional, Tuple, Type, overload
|
||||
|
||||
import pytest
|
||||
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
||||
BatchEncoding)
|
||||
|
||||
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
|
||||
global_force_attn_backend_context_manager)
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_LIMIT_IMAGE_PER_PROMPT = 3
|
||||
|
||||
LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|image|><|begin_of_text|>The meaning of the image is",
|
||||
"cherry_blossom":
|
||||
"<|image|><|begin_of_text|>The city is",
|
||||
})
|
||||
|
||||
text_only_prompts = [
|
||||
"The color of the sky is blue but sometimes it can also be",
|
||||
]
|
||||
|
||||
models = [
|
||||
"meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
]
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||
Optional[SampleLogprobs]],
|
||||
model: str):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
image_token_id = config.image_token_index
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
||||
]
|
||||
|
||||
hf_output_str = output_str
|
||||
if hf_output_ids[-1] == eos_token_id:
|
||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def _get_inputs(
|
||||
image_assets: _ImageAssets,
|
||||
*,
|
||||
size_factors: Optional[List[float]] = None,
|
||||
sizes: Optional[List[Tuple[int, int]]] = None,
|
||||
) -> List[Tuple[List[str], PromptImageInput]]:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
if size_factors is not None:
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
elif sizes is not None:
|
||||
inputs_per_image = [(
|
||||
[
|
||||
prompt if size is not None else text_only_prompts[0]
|
||||
for size in sizes
|
||||
],
|
||||
[
|
||||
image.resize(size) if size is not None else None
|
||||
for size in sizes
|
||||
],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
if len(sizes) == 0:
|
||||
inputs_per_image.append(
|
||||
(text_only_prompts, [None] * len(text_only_prompts)))
|
||||
else:
|
||||
raise ValueError("You must provide either `size_factors` or `sizes`")
|
||||
|
||||
return inputs_per_image
|
||||
|
||||
|
||||
@overload
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
*,
|
||||
size_factors: List[float],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
*,
|
||||
sizes: List[Tuple[int, int]],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
...
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
*,
|
||||
size_factors: Optional[List[float]] = None,
|
||||
sizes: Optional[List[Tuple[int, int]]] = None,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
_get_inputs(image_assets, size_factors=size_factors, sizes=sizes),
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
|
||||
|
||||
def _run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
|
||||
}) as vllm_model:
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
def process(hf_inputs: BatchEncoding, **kwargs):
|
||||
return hf_inputs
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
model_kwargs={"device_map": "auto"},
|
||||
postprocess_inputs=process,
|
||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
||||
hf_outputs_per_image = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||
vllm_outputs_per_image):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output, model)
|
||||
for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def clear_cache():
|
||||
"""Fixture to clear backend cache before each test."""
|
||||
_cached_get_attn_backend.cache_clear() # Clear the cache
|
||||
yield # This allows the test to run
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"sizes",
|
||||
[
|
||||
# Text only
|
||||
[],
|
||||
# Single-size
|
||||
[(512, 512)],
|
||||
# Single-size, batched
|
||||
[(512, 512), (512, 512), (512, 512)],
|
||||
# Multi-size, batched
|
||||
[(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024),
|
||||
(1024, 1024), (512, 1536), (512, 2028)],
|
||||
# Multi-size, batched, including text only
|
||||
[(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024),
|
||||
(1024, 1024), (512, 1536), (512, 2028), None],
|
||||
# mllama has 8 possible aspect ratios; the sizes are carefully chosen
|
||||
# to cover all of them
|
||||
])
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
|
||||
def test_models_single_leading_image(hf_runner, vllm_runner, image_assets,
|
||||
model, sizes, dtype, max_tokens,
|
||||
num_logprobs,
|
||||
attn_backend: _Backend) -> None:
|
||||
with global_force_attn_backend_context_manager(attn_backend):
|
||||
if attn_backend == _Backend.FLASH_ATTN:
|
||||
# Flash Attention works only with bfloat16 data-type
|
||||
dtype = 'bfloat16'
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
sizes=sizes,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
|
||||
def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets,
|
||||
model, dtype, max_tokens, num_logprobs,
|
||||
attn_backend: _Backend) -> None:
|
||||
|
||||
stop_sign = image_assets[0].pil_image
|
||||
cherry_blossom = image_assets[1].pil_image
|
||||
|
||||
inputs = [(
|
||||
[
|
||||
"<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501
|
||||
"<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501
|
||||
"<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.", # noqa: E501
|
||||
],
|
||||
[
|
||||
[stop_sign, cherry_blossom],
|
||||
# Images with different sizes.
|
||||
[
|
||||
stop_sign.resize((512, 512)),
|
||||
stop_sign,
|
||||
],
|
||||
[
|
||||
stop_sign,
|
||||
stop_sign.resize((512, 1536)),
|
||||
cherry_blossom.resize((512, 1024)),
|
||||
],
|
||||
])]
|
||||
with global_force_attn_backend_context_manager(attn_backend):
|
||||
if attn_backend == _Backend.FLASH_ATTN:
|
||||
# Flash Attention works only with bfloat16 data-type
|
||||
dtype = 'bfloat16'
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
|
||||
def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
|
||||
dtype, max_tokens, num_logprobs,
|
||||
attn_backend: _Backend) -> None:
|
||||
|
||||
stop_sign = image_assets[0].pil_image
|
||||
cherry_blossom = image_assets[1].pil_image
|
||||
|
||||
inputs = [(
|
||||
[
|
||||
"<|begin_of_text|>The content of the image <|image|> is", # noqa: E501
|
||||
"<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, " # noqa: E501
|
||||
"which is a stop sign and which is a cherry blossom?", # noqa: E501
|
||||
],
|
||||
[
|
||||
[stop_sign],
|
||||
[stop_sign, cherry_blossom],
|
||||
])]
|
||||
with global_force_attn_backend_context_manager(attn_backend):
|
||||
if attn_backend == _Backend.FLASH_ATTN:
|
||||
# Flash Attention works only with bfloat16 data-type
|
||||
dtype = 'bfloat16'
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
1
vllm-v0.6.2/tests/models/fixtures/pixtral_chat.json
Normal file
File diff suppressed because one or more lines are too long
216
vllm-v0.6.2/tests/models/registry.py
Normal file
@@ -0,0 +1,216 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import AbstractSet, Mapping, Optional
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _HfExamplesInfo:
|
||||
default: str
|
||||
"""The default model to use for testing this architecture."""
|
||||
|
||||
extras: Mapping[str, str] = field(default_factory=dict)
|
||||
"""Extra models to use for testing this architecture."""
|
||||
|
||||
tokenizer: Optional[str] = None
|
||||
"""Set the tokenizer to load for this architecture."""
|
||||
|
||||
tokenizer_mode: str = "auto"
|
||||
"""Set the tokenizer type for this architecture."""
|
||||
|
||||
speculative_model: Optional[str] = None
|
||||
"""
|
||||
The default model to use for testing this architecture, which is only used
|
||||
for speculative decoding.
|
||||
"""
|
||||
|
||||
is_available_online: bool = True
|
||||
"""
|
||||
Set this to ``False`` if the name of this architecture no longer exists on
|
||||
the HF repo. To maintain backwards compatibility, we have not removed them
|
||||
from the main model registry, so without this flag the registry tests will
|
||||
fail.
|
||||
"""
|
||||
|
||||
trust_remote_code: bool = False
|
||||
"""The ``trust_remote_code`` level required to load the model."""
|
||||
|
||||
|
||||
# yapf: disable
|
||||
_TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
# [Decoder-only]
|
||||
"AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B",
|
||||
trust_remote_code=True),
|
||||
"AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B",
|
||||
trust_remote_code=True),
|
||||
"ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct",
|
||||
trust_remote_code=True),
|
||||
"BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B",
|
||||
trust_remote_code=True),
|
||||
"BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat",
|
||||
trust_remote_code=True),
|
||||
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"),
|
||||
# ChatGLMModel supports multimodal
|
||||
"CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01",
|
||||
trust_remote_code=True),
|
||||
"DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"),
|
||||
"DeciLMForCausalLM": _HfExamplesInfo("Deci/DeciLM-7B-instruct",
|
||||
trust_remote_code=True),
|
||||
"DeepseekForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-llm-7b-chat"),
|
||||
"DeepseekV2ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V2-Lite-Chat", # noqa: E501
|
||||
trust_remote_code=True),
|
||||
"ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501
|
||||
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
|
||||
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"),
|
||||
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
|
||||
"GPT2LMHeadModel": _HfExamplesInfo("gpt2"),
|
||||
"GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"),
|
||||
"GPTJForCausalLM": _HfExamplesInfo("EleutherAI/gpt-j-6b"),
|
||||
"GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-160m"),
|
||||
"GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
|
||||
"GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
|
||||
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
|
||||
trust_remote_code=True),
|
||||
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
|
||||
trust_remote_code=True),
|
||||
"InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B",
|
||||
trust_remote_code=True),
|
||||
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
|
||||
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"),
|
||||
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
|
||||
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
|
||||
is_available_online=False),
|
||||
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
|
||||
"FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501
|
||||
"MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
|
||||
trust_remote_code=True),
|
||||
"MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B",
|
||||
trust_remote_code=True),
|
||||
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
|
||||
"MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1"), # noqa: E501
|
||||
"QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501
|
||||
"MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
|
||||
"MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
|
||||
"NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
|
||||
"OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"),
|
||||
"OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
|
||||
"OPTForCausalLM": _HfExamplesInfo("facebook/opt-iml-max-1.3b"),
|
||||
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
|
||||
trust_remote_code=True),
|
||||
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
|
||||
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
|
||||
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
|
||||
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
|
||||
trust_remote_code=True),
|
||||
"PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
|
||||
trust_remote_code=True),
|
||||
# QWenLMHeadModel supports multimodal
|
||||
"Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"),
|
||||
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
|
||||
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
|
||||
is_available_online=False),
|
||||
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
|
||||
is_available_online=False),
|
||||
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
|
||||
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
|
||||
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
|
||||
"XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
|
||||
is_available_online=False,
|
||||
trust_remote_code=True),
|
||||
# [Encoder-decoder]
|
||||
"BartModel": _HfExamplesInfo("facebook/bart-base"),
|
||||
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
|
||||
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
|
||||
# Therefore, we borrow the BartTokenizer from the original Bart model
|
||||
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
|
||||
tokenizer="facebook/bart-base",
|
||||
trust_remote_code=True), # noqa: E501
|
||||
}
|
||||
|
||||
_EMBEDDING_EXAMPLE_MODELS = {
|
||||
# [Text-only]
|
||||
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
|
||||
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),
|
||||
"LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
|
||||
"MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
|
||||
"Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
|
||||
"Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
|
||||
"Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501
|
||||
"RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501
|
||||
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-large"),
|
||||
# [Multimodal]
|
||||
"LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
|
||||
"Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
|
||||
trust_remote_code=True),
|
||||
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
|
||||
}
|
||||
|
||||
_MULTIMODAL_EXAMPLE_MODELS = {
|
||||
# [Decoder-only]
|
||||
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501
|
||||
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
|
||||
"ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b",
|
||||
extras={"text_only": "THUDM/chatglm3-6b"},
|
||||
trust_remote_code=True),
|
||||
"ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
|
||||
is_available_online=False),
|
||||
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
|
||||
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
|
||||
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
|
||||
trust_remote_code=True),
|
||||
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501
|
||||
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
|
||||
extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501
|
||||
"LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501
|
||||
"LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501
|
||||
"LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
|
||||
"MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
|
||||
trust_remote_code=True),
|
||||
"MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
|
||||
trust_remote_code=True),
|
||||
"NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B",
|
||||
trust_remote_code=True),
|
||||
"PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-pt-224"), # noqa: E501
|
||||
"Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
|
||||
trust_remote_code=True),
|
||||
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
|
||||
tokenizer_mode="mistral"),
|
||||
"QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat",
|
||||
extras={"text_only": "Qwen/Qwen-7B-Chat"}, # noqa: E501
|
||||
trust_remote_code=True),
|
||||
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501
|
||||
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
|
||||
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"),
|
||||
# [Encoder-decoder]
|
||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||
}
|
||||
|
||||
_SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
"EAGLEModel": _HfExamplesInfo("JackFram/llama-68m",
|
||||
speculative_model="abhigoyal/vllm-eagle-llama-68m-random"), # noqa: E501
|
||||
"MedusaModel": _HfExamplesInfo("JackFram/llama-68m",
|
||||
speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501
|
||||
"MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
|
||||
speculative_model="ibm-fms/llama-160m-accelerator"), # noqa: E501
|
||||
}
|
||||
|
||||
_EXAMPLE_MODELS = {
|
||||
**_TEXT_GENERATION_EXAMPLE_MODELS,
|
||||
**_EMBEDDING_EXAMPLE_MODELS,
|
||||
**_MULTIMODAL_EXAMPLE_MODELS,
|
||||
**_SPECULATIVE_DECODING_EXAMPLE_MODELS,
|
||||
}
|
||||
|
||||
|
||||
class HfExampleModels:
|
||||
def __init__(self, hf_models: Mapping[str, _HfExamplesInfo]) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.hf_models = hf_models
|
||||
|
||||
def get_supported_archs(self) -> AbstractSet[str]:
|
||||
return self.hf_models.keys()
|
||||
|
||||
def get_hf_info(self, model_arch: str) -> _HfExamplesInfo:
|
||||
return self.hf_models[model_arch]
|
||||
|
||||
|
||||
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
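# Hedged usage sketch of the registry defined above (it is consumed by
# test_initialization.py and test_registry.py below):
#
#     info = HF_EXAMPLE_MODELS.get_hf_info("BartModel")
#     info.default                                                   # -> "facebook/bart-base"
#     "Qwen2ForCausalLM" in HF_EXAMPLE_MODELS.get_supported_archs()  # -> True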
|
||||
55
vllm-v0.6.2/tests/models/test_initialization.py
Normal file
@@ -0,0 +1,55 @@
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import transformers
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
|
||||
def test_can_initialize(model_arch):
|
||||
if (model_arch == "Idefics3ForConditionalGeneration"
|
||||
and transformers.__version__ < "4.46.0"):
|
||||
pytest.skip(reason="Model introduced in HF >= 4.46.0")
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
|
||||
if not model_info.is_available_online:
|
||||
pytest.skip("Model is not available online")
|
||||
|
||||
# Avoid OOM
|
||||
def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
|
||||
if hasattr(hf_config, "text_config"):
|
||||
text_config: PretrainedConfig = hf_config.text_config
|
||||
else:
|
||||
text_config = hf_config
|
||||
|
||||
text_config.update({
|
||||
"num_layers": 1,
|
||||
"num_hidden_layers": 1,
|
||||
"num_experts": 2,
|
||||
"num_experts_per_tok": 2,
|
||||
"num_local_experts": 2,
|
||||
})
|
||||
|
||||
return hf_config
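# Illustrative effect of the override above (the numbers are hypothetical):
#   before: num_hidden_layers=32, num_local_experts=8
#   after:  num_hidden_layers=1,  num_local_experts=2
# The dummy-weight load below then builds a single tiny layer instead of OOMing.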
|
||||
|
||||
# Avoid calling model.forward()
|
||||
def _initialize_kv_caches(self) -> None:
|
||||
self.cache_config.num_gpu_blocks = 0
|
||||
self.cache_config.num_cpu_blocks = 0
|
||||
|
||||
with patch.object(LLM.get_engine_class(), "_initialize_kv_caches",
|
||||
_initialize_kv_caches):
|
||||
LLM(
|
||||
model_info.default,
|
||||
tokenizer=model_info.tokenizer,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
speculative_model=model_info.speculative_model,
|
||||
num_speculative_tokens=1 if model_info.speculative_model else None,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
load_format="dummy",
|
||||
hf_overrides=hf_overrides,
|
||||
)
|
||||
81
vllm-v0.6.2/tests/models/test_oot_registration.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, PoolingParams, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
def test_plugin(dummy_opt_path):
|
||||
os.environ["VLLM_PLUGINS"] = ""
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
LLM(model=dummy_opt_path, load_format="dummy")
|
||||
assert "are not supported for now" in str(excinfo.value)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
def test_oot_registration_text_generation(dummy_opt_path):
|
||||
os.environ["VLLM_PLUGINS"] = "register_dummy_model"
|
||||
prompts = ["Hello, my name is", "The text does not matter"]
|
||||
sampling_params = SamplingParams(temperature=0)
|
||||
llm = LLM(model=dummy_opt_path, load_format="dummy")
|
||||
first_token = llm.get_tokenizer().decode(0)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
# make sure only the first token is generated
|
||||
rest = generated_text.replace(first_token, "")
|
||||
assert rest == ""
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
def test_oot_registration_embedding(dummy_gemma2_embedding_path):
|
||||
os.environ["VLLM_PLUGINS"] = "register_dummy_model"
|
||||
prompts = ["Hello, my name is", "The text does not matter"]
|
||||
sampling_params = PoolingParams()
|
||||
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
|
||||
outputs = llm.encode(prompts, sampling_params)
|
||||
|
||||
for output in outputs:
|
||||
assert all(v == 0 for v in output.outputs.embedding)
|
||||
|
||||
|
||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
def test_oot_registration_multimodal(dummy_llava_path):
|
||||
os.environ["VLLM_PLUGINS"] = "register_dummy_model"
|
||||
prompts = [{
|
||||
"prompt": "What's in the image?<image>",
|
||||
"multi_modal_data": {
|
||||
"image": image
|
||||
},
|
||||
}, {
|
||||
"prompt": "Describe the image<image>",
|
||||
"multi_modal_data": {
|
||||
"image": image
|
||||
},
|
||||
}]
|
||||
|
||||
sampling_params = SamplingParams(temperature=0)
|
||||
llm = LLM(model=dummy_llava_path,
|
||||
load_format="dummy",
|
||||
max_num_seqs=1,
|
||||
trust_remote_code=True,
|
||||
gpu_memory_utilization=0.98,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"image": 1})
|
||||
first_token = llm.get_tokenizer().decode(0)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
# make sure only the first token is generated
|
||||
rest = generated_text.replace(first_token, "")
|
||||
assert rest == ""
|
||||
97
vllm-v0.6.2/tests/models/test_registry.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import warnings
|
||||
|
||||
import pytest
|
||||
import torch.cuda
|
||||
|
||||
from vllm.model_executor.models import (is_embedding_model,
|
||||
is_text_generation_model,
|
||||
supports_multimodal)
|
||||
from vllm.model_executor.models.registry import (_EMBEDDING_MODELS,
|
||||
_MULTIMODAL_MODELS,
|
||||
_SPECULATIVE_DECODING_MODELS,
|
||||
_TEXT_GENERATION_MODELS,
|
||||
ModelRegistry)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
|
||||
def test_registry_imports(model_arch):
|
||||
|
||||
# MLU does not support mllama yet.
|
||||
if model_arch in ('MllamaForConditionalGeneration', 'CustomForCausalLM'):
|
||||
return
|
||||
|
||||
# Ensure all model classes can be imported successfully
|
||||
model_cls, _ = ModelRegistry.resolve_model_cls(model_arch)
|
||||
|
||||
if model_arch in _SPECULATIVE_DECODING_MODELS:
|
||||
pass # Ignore these models which do not have a unified format
|
||||
else:
|
||||
assert is_text_generation_model(model_cls) is (
|
||||
model_arch in _TEXT_GENERATION_MODELS
|
||||
or model_arch in _MULTIMODAL_MODELS)
|
||||
|
||||
assert is_embedding_model(model_cls) is (model_arch
|
||||
in _EMBEDDING_MODELS)
|
||||
|
||||
assert supports_multimodal(model_cls) is (model_arch
|
||||
in _MULTIMODAL_MODELS)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@pytest.mark.parametrize("model_arch,is_mm,init_cuda", [
|
||||
("LlamaForCausalLM", False, False),
|
||||
# ("MllamaForConditionalGeneration", True, False),
|
||||
("LlavaForConditionalGeneration", True, True),
|
||||
])
|
||||
def test_registry_is_multimodal(model_arch, is_mm, init_cuda):
|
||||
assert ModelRegistry.is_multimodal_model(model_arch) is is_mm
|
||||
|
||||
if init_cuda and current_platform.is_cuda_alike():
|
||||
assert not torch.cuda.is_initialized()
|
||||
|
||||
ModelRegistry.resolve_model_cls(model_arch)
|
||||
if not torch.cuda.is_initialized():
|
||||
warnings.warn(
|
||||
"This model no longer initializes CUDA on import. "
|
||||
"Please test using a different one.",
|
||||
stacklevel=2)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
|
||||
("MLPSpeculatorPreTrainedModel", False, False),
|
||||
("DeepseekV2ForCausalLM", True, False),
|
||||
("Qwen2VLForConditionalGeneration", True, True),
|
||||
])
|
||||
def test_registry_is_pp(model_arch, is_pp, init_cuda):
|
||||
assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp
|
||||
|
||||
if init_cuda and current_platform.is_cuda_alike():
|
||||
assert not torch.cuda.is_initialized()
|
||||
|
||||
ModelRegistry.resolve_model_cls(model_arch)
|
||||
if not torch.cuda.is_initialized():
|
||||
warnings.warn(
|
||||
"This model no longer initializes CUDA on import. "
|
||||
"Please test using a different one.",
|
||||
stacklevel=2)
|
||||
|
||||
|
||||
def test_hf_registry_coverage():
|
||||
untested_archs = (ModelRegistry.get_supported_archs() -
|
||||
HF_EXAMPLE_MODELS.get_supported_archs())
|
||||
|
||||
# Skip custom registry check
|
||||
if "CustomForCausalLM" in untested_archs:
|
||||
untested_archs -= set(["CustomForCausalLM"])
|
||||
|
||||
if "HunYuanForCausalLM" in untested_archs:
|
||||
untested_archs -= set(["HunYuanForCausalLM"])
|
||||
|
||||
assert not untested_archs, (
|
||||
"Please add the following architectures to "
|
||||
f"`tests/models/registry.py`: {untested_archs}")
|
||||
285
vllm-v0.6.2/tests/models/utils.py
Normal file
@@ -0,0 +1,285 @@
import warnings
from typing import Dict, List, Optional, Sequence, Tuple, Union

import torch

from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs

TokensText = Tuple[List[int], str]


def check_outputs_equal(
    *,
    outputs_0_lst: Sequence[TokensText],
    outputs_1_lst: Sequence[TokensText],
    name_0: str,
    name_1: str,
):
    """
    Compare the two sequences generated by different models,
    which should be equal.
    """
    assert len(outputs_0_lst) == len(outputs_1_lst)

    for prompt_idx, (outputs_0,
                     outputs_1) in enumerate(zip(outputs_0_lst,
                                                 outputs_1_lst)):
        output_ids_0, output_str_0 = outputs_0
        output_ids_1, output_str_1 = outputs_1

        # The text and token outputs should exactly match
        fail_msg = (f"Test{prompt_idx}:"
                    f"\n{name_0}:\t{output_str_0!r}"
                    f"\n{name_1}:\t{output_str_1!r}")

        assert output_str_0 == output_str_1, fail_msg
        assert output_ids_0 == output_ids_1, fail_msg
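

# Illustrative usage sketch (added for clarity, not in the original file):
# both sides carry identical made-up token IDs and text, so the assertions
# in check_outputs_equal pass.
def _example_check_outputs_equal():
    outputs = [([1, 2, 3], "a b c"), ([4, 5], "d e")]
    check_outputs_equal(outputs_0_lst=outputs,
                        outputs_1_lst=list(outputs),
                        name_0="hf",
                        name_1="vllm")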


# Representation of generated sequence as a tuple of
# * Token ID list
# * String
# * List of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
                                                                    float]],
                                                          SampleLogprobs]]]

# Allow for tokens to be represented as str's rather than IDs;
# tuple of
# * Token string representations list
# * String
# * Optional list of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]],
                                                        List[Dict[str,
                                                                  Logprob]]]]]

# Representation of generated sequence as a tuple of
# * Token ID list
# * String
# * Optional list of top sample logprobs for each sampled token
# * Optional list of top prompt logprobs for each prompt token
#
# Allows prompt logprobs to be requested.
TokensTextLogprobsPromptLogprobs = Tuple[
    List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]],
    Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]]
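
# Illustrative example (added for clarity, not in the original file): a
# TokensTextLogprobs value for a made-up two-token completion, where each
# sampled token carries a {token_id: logprob} dict of its top candidates.
_EXAMPLE_TOKENS_TEXT_LOGPROBS: TokensTextLogprobs = (
    [101, 202],
    "hello world",
    [{101: -0.1, 7: -2.3}, {202: -0.5, 9: -1.7}],
)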


def check_logprobs_close(
    *,
    outputs_0_lst: Sequence[Union[TokensTextLogprobs,
                                  TokensTextLogprobsPromptLogprobs,
                                  TextTextLogprobs]],
    outputs_1_lst: Sequence[Union[TokensTextLogprobs,
                                  TokensTextLogprobsPromptLogprobs,
                                  TextTextLogprobs]],
    name_0: str,
    name_1: str,
    num_outputs_0_skip_tokens: int = 0,
    warn_on_mismatch: bool = True,
    always_check_logprobs: bool = False,
) -> None:
    """Compare the logprobs of two sequences generated by different models,
    which should be similar but not necessarily equal.

    How sample logprobs are compared:
    * `always_check_logprobs == True`: set of highest-logprob token ids
      must match between seq0 and seq1 at all sampled token offsets
    * `always_check_logprobs == False`: highest-logprob token ids are
      only compared at sampled token offsets for which generated token
      ids don't match

    Prompt logprobs must be provided either for both input sequences, or
    for neither. If prompt logprobs are provided, then highest-logprob
    prompt token ids must match between seq0 and seq1 at all prompt token
    offsets.

    Args:
      outputs_0_lst: First sequence to compare
      outputs_1_lst: Second sequence to compare
      name_0: sequence #0 name
      name_1: sequence #1 name
      num_outputs_0_skip_tokens: If > 0, specifies the number of initial
                                 sequence #0 tokens & logprobs to discard
                                 before comparison, i.e. all
                                 of sequence #1 will be compared to
                                 sequence #0 beginning at index
                                 num_outputs_0_skip_tokens
      warn_on_mismatch: Issue a warning if there is token-wise or text-wise
                        mismatch between the two sequences
      always_check_logprobs: If true, check logprobs even when tokens match
    """
    assert len(outputs_0_lst) == len(outputs_1_lst)

    # Loop through responses to each prompt.
    for prompt_idx, (outputs_0,
                     outputs_1) in enumerate(zip(outputs_0_lst,
                                                 outputs_1_lst)):
        assert len(outputs_0) == len(outputs_1)
        if len(outputs_0) == 3:
            assert len(outputs_1) == 3
            # Break out tokens, text & sample logprobs
            # (prompt logprobs were not provided)
            output_ids_0, output_str_0, logprobs_0 = outputs_0
            output_ids_1, output_str_1, logprobs_1 = outputs_1
        elif len(outputs_0) == 4:
            assert len(outputs_1) == 4
            # Break out tokens, text, sample logprobs & prompt logprobs
            (
                output_ids_0,
                output_str_0,
                logprobs_0,
                prompt_logprobs_0,
            ) = outputs_0
            (
                output_ids_1,
                output_str_1,
                logprobs_1,
                prompt_logprobs_1,
            ) = outputs_1

            # Test prompt logprobs closeness
            if (prompt_logprobs_0 is not None
                    and prompt_logprobs_1 is not None):
                # Both sequences' prompt logprobs lists are not `None`
                # (although individual list elements may be `None`);
                # for each token's logprobs:
                for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(
                        zip(prompt_logprobs_0, prompt_logprobs_1)):
                    fail_msg = (
                        f"Prompt logprobs test:"
                        f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}"
                        f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}")

                    if logprobs_elem_0 is None:
                        # If the seq 0 token's logprobs are `None`,
                        # the seq 1 token's logprobs must be `None`
                        assert logprobs_elem_1 is None, fail_msg
                    else:
                        # If the seq 0 token's logprobs are not `None`,
                        # the seq 1 token's logprobs must not be `None`
                        assert logprobs_elem_1 is not None, fail_msg
                        # Logprobs check: top-k token choices must be the same
                        assert (set(logprobs_elem_0.keys()) == set(
                            logprobs_elem_1.keys())), fail_msg
            else:
                # Both sequence logprobs lists must be `None`
                fail_msg = (f"Prompt logprobs test:"
                            f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}"
                            f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}")

                assert (prompt_logprobs_0 is None
                        and prompt_logprobs_1 is None), fail_msg
        else:
            raise ValueError(f"Outputs tuple must have 3 or 4 elements but "
                             f"{len(outputs_0)} elements were provided: "
                             f"{outputs_0}")

        if logprobs_0 is None:
            logprobs_0 = [None] * len(output_ids_0)
        if logprobs_1 is None:
            logprobs_1 = [None] * len(output_ids_1)

        # Skip specified number of initial sequence #0 tokens
        # & logprobs, leaving output text as-is for simplicity
        # (text mismatches may generate warnings but do not
        # cause the test to fail.)
        if num_outputs_0_skip_tokens < 0:
            raise ValueError("num_outputs_0_skip_tokens must be non-negative")
        output_ids_0 = output_ids_0[num_outputs_0_skip_tokens:]
        logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:]

        # Loop through generated tokens.
        for idx, (output_id_0,
                  output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):

            is_tok_mismatch = output_id_0 != output_id_1

            # If the generated tokens don't match, or logprobs should always
            # be checked, compare the logprobs at this offset.
            if is_tok_mismatch or always_check_logprobs:
                logprobs_elem_0 = logprobs_0[idx]
                logprobs_elem_1 = logprobs_1[idx]

                # Each predicted token must be in top N logprobs of the other
                fail_msg = (
                    f"Test{prompt_idx}:"
                    f"\nMatched tokens:\t{output_ids_0[:idx]}"
                    f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}"
                    f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}")

                assert logprobs_elem_0 is not None, fail_msg
                assert logprobs_elem_1 is not None, fail_msg
                assert output_id_0 in logprobs_elem_1, fail_msg
                assert output_id_1 in logprobs_elem_0, fail_msg

                if warn_on_mismatch and is_tok_mismatch:
                    with warnings.catch_warnings():
                        # This ensures that repeated warnings are shown
                        # in the output, not just the first occurrence
                        warnings.simplefilter("always")

                        warnings.warn(fail_msg, stacklevel=2)

                    # Break out since sequences will now diverge.
                    break
        else:
            if output_str_0 != output_str_1 and warn_on_mismatch:
                # The token outputs exactly match,
                # so the text outputs should exactly match as well
                fail_msg = (f"Test{prompt_idx}:"
                            f"\n{name_0}:\t{output_str_0!r}"
                            f"\n{name_1}:\t{output_str_1!r}")

                with warnings.catch_warnings():
                    # This ensures that repeated warnings are shown
                    # in the output, not just the first occurrence
                    warnings.simplefilter("always")

                    warnings.warn(fail_msg, stacklevel=2)
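

# Illustrative usage sketch (added for clarity, not in the original file):
# two hypothetical models agree on the first token and diverge on the second,
# but each sampled token still appears in the other model's top logprobs, so
# check_logprobs_close passes (emitting a mismatch warning). All IDs and
# logprob values are made up.
def _example_check_logprobs_close():
    outputs_hf = [([5, 7], "foo bar",
                   [{5: -0.2, 6: -1.5}, {7: -0.4, 8: -0.9}])]
    outputs_vllm = [([5, 8], "foo baz",
                     [{5: -0.3, 6: -1.4}, {8: -0.5, 7: -1.0}])]
    check_logprobs_close(outputs_0_lst=outputs_hf,
                         outputs_1_lst=outputs_vllm,
                         name_0="hf",
                         name_1="vllm")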


def build_model_context(model_name: str,
                        task: TaskOption = "auto",
                        tokenizer_name: Optional[str] = None,
                        trust_remote_code: bool = False,
                        dtype: Optional[Union[str, torch.dtype]] = None,
                        mm_processor_kwargs: Optional[Dict] = None,
                        limit_mm_per_prompt: Optional[Dict] = None):
    """Creates an InputContext for a given model.

    Args:
        model_name: Name of the model being considered.
        tokenizer_name: Name of the tokenizer being considered.
        trust_remote_code: Whether or not to allow loading remote code.
        mm_processor_kwargs: Optional processor kwargs to be leveraged
            in the input processor, mapper, dummy data creation, etc.
        limit_mm_per_prompt: Multimodal limits.

    Returns:
        InputContext for the model being considered.
    """
    if tokenizer_name is None:
        tokenizer_name = model_name
    if dtype is None:
        dtype = "half"

    model_config = ModelConfig(
        model_name,
        task=task,
        tokenizer=tokenizer_name,
        tokenizer_mode="auto",
        trust_remote_code=trust_remote_code,
        dtype=dtype,
        seed=0,
        mm_processor_kwargs=mm_processor_kwargs,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )
    return InputContext(model_config)
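

# Illustrative usage sketch (added for clarity, not in the original file):
# builds an InputContext for a hypothetical model; "facebook/opt-125m" is just
# a placeholder HF model ID, and constructing the ModelConfig requires its
# config to be resolvable (e.g. from the HF cache).
def _example_build_model_context():
    ctx = build_model_context("facebook/opt-125m", dtype="half")
    return ctx.model_config.model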