Sync from v0.13
This commit is contained in:
0
tests/models/language/__init__.py
Normal file
0
tests/models/language/__init__.py
Normal file
0
tests/models/language/generation/__init__.py
Normal file
0
tests/models/language/generation/__init__.py
Normal file
185
tests/models/language/generation/test_common.py
Normal file
185
tests/models/language/generation/test_common.py
Normal file
@@ -0,0 +1,185 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import large_gpu_mark
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# This list contains the model that are using AITER kernel.
|
||||
# Skip model that are not using AITER tests.
|
||||
# When more AITER kernels are added, this list will not be
|
||||
# needed as all the models will be calling AITER kernels
|
||||
# in parts of the operators
|
||||
AITER_MODEL_LIST = [
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"openbmb/MiniCPM3-4B",
|
||||
"Qwen/Qwen-7B-Chat",
|
||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||
"TitanML/tiny-mixtral",
|
||||
"Qwen/Qwen3-8B",
|
||||
]
|
||||
|
||||
|
||||
# @maybe_test_rocm_aiter
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param(
|
||||
"bigscience/bloom-560m", # bloom - testing alibi slopes
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.slow_test,
|
||||
pytest.mark.cpu_model,
|
||||
],
|
||||
),
|
||||
pytest.param(
|
||||
"openai-community/gpt2", # gpt2
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param("Milos/slovak-gpt-j-405M"), # gptj
|
||||
pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode
|
||||
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
|
||||
pytest.param(
|
||||
"google/gemma-1.1-2b-it", # gemma
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
pytest.param(
|
||||
"google/gemma-2-2b-it", # test hybrid attention
|
||||
marks=[pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param(
|
||||
"zai-org/chatglm3-6b", # chatglm (text-only)
|
||||
),
|
||||
pytest.param(
|
||||
"meta-llama/Llama-3.2-1B-Instruct", # llama
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param(
|
||||
"openbmb/MiniCPM3-4B",
|
||||
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
pytest.param(
|
||||
"facebook/opt-125m", # opt
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param(
|
||||
"microsoft/phi-2", # phi
|
||||
marks=[pytest.mark.core_model, pytest.mark.slow_test],
|
||||
),
|
||||
pytest.param(
|
||||
"Qwen/Qwen-7B-Chat", # qwen (text-only)
|
||||
),
|
||||
pytest.param(
|
||||
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
pytest.param(
|
||||
"Qwen/Qwen3-8B", # qwen (text-only)
|
||||
),
|
||||
pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm
|
||||
pytest.param("bigcode/starcoder2-3b"), # starcoder2
|
||||
pytest.param(
|
||||
"TitanML/tiny-mixtral", # mixtral
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param("swiss-ai/Apertus-8B-Instruct-2509"), # apertus
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize(
|
||||
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
|
||||
)
|
||||
@pytest.mark.parametrize("use_prompt_embeds", [True, False])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
use_rocm_aiter: bool,
|
||||
use_prompt_embeds: bool,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
if use_rocm_aiter and (model in AITER_MODEL_LIST):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
|
||||
# Skip model that are not using AITER tests.
|
||||
# When more AITER kernels are added, this list will not be
|
||||
# needed as all the models will be calling AITER kernels
|
||||
# in parts of the operators
|
||||
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None
|
||||
|
||||
prompt_token_ids = []
|
||||
for prompt in example_prompts:
|
||||
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
|
||||
hf_model.model.device
|
||||
)
|
||||
prompt_token_ids.append(token_ids)
|
||||
if prompt_embeds is not None:
|
||||
prompt_embeds.append(
|
||||
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
tokenizer_name=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
max_num_seqs=2,
|
||||
enable_prompt_embeds=use_prompt_embeds,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
if prompt_embeds is not None:
|
||||
vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
|
||||
prompt_embeds, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
if prompt_embeds is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=vllm_outputs,
|
||||
outputs_1_lst=vllm_outputs_from_embeds,
|
||||
name_0="vllm",
|
||||
name_1="vllm_from_embeds",
|
||||
)
|
||||
|
||||
if use_rocm_aiter:
|
||||
# this is to ensure that vllm engine
|
||||
# has deallocated the memory before running the next
|
||||
# unit tests. On ROCm, when using AITER
|
||||
# the memory might not be deallocated completely
|
||||
# before running the next test case
|
||||
torch.cuda.synchronize()
|
||||
27
tests/models/language/generation/test_gemma.py
Normal file
27
tests/models/language/generation/test_gemma.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
MODELS = ["google/gemma-2b", "google/gemma-2-2b", "google/gemma-3-4b-it"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
with vllm_runner(
|
||||
model,
|
||||
load_format="dummy",
|
||||
) as llm:
|
||||
if model == "google/gemma-3-4b-it":
|
||||
normalizers = llm.llm.collective_rpc(
|
||||
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() # noqa: E501
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config.text_config
|
||||
else:
|
||||
normalizers = llm.llm.collective_rpc(
|
||||
lambda self: self.model_runner.model.model.normalizer.cpu().item()
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config
|
||||
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
|
||||
41
tests/models/language/generation/test_granite.py
Normal file
41
tests/models/language/generation/test_granite.py
Normal file
@@ -0,0 +1,41 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
# TODO(sang): Sliding window should be tested separately.
|
||||
"ibm/PowerLM-3b",
|
||||
"ibm/PowerMoE-3b",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
758
tests/models/language/generation/test_hybrid.py
Normal file
758
tests/models/language/generation/test_hybrid.py
Normal file
@@ -0,0 +1,758 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.registry import HF_EXAMPLE_MODELS
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ...utils import check_logprobs_close, check_outputs_equal
|
||||
|
||||
# Mark all tests as hybrid
|
||||
pytestmark = pytest.mark.hybrid_model
|
||||
|
||||
# NOTE: The first model in each list is taken as the primary model,
|
||||
# meaning that it will be used in all tests in this file
|
||||
# The rest of the models will only be tested by test_models
|
||||
|
||||
APC_MULTIPLY_BY = 300
|
||||
|
||||
SSM_MODELS = [
|
||||
"state-spaces/mamba-130m-hf",
|
||||
"tiiuae/falcon-mamba-tiny-dev",
|
||||
# mamba2-codestral in transformers is broken pending:
|
||||
# https://github.com/huggingface/transformers/pull/40861
|
||||
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
|
||||
]
|
||||
|
||||
HYBRID_MODELS = [
|
||||
"ai21labs/Jamba-tiny-dev",
|
||||
"pfnet/plamo-2-1b",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
"hmellor/tiny-random-BambaForCausalLM",
|
||||
"ibm-granite/granite-4.0-tiny-preview",
|
||||
"tiiuae/Falcon-H1-0.5B-Base",
|
||||
"LiquidAI/LFM2-1.2B",
|
||||
"tiny-random/qwen3-next-moe",
|
||||
]
|
||||
|
||||
FULL_CUDA_GRAPH_MODELS = [
|
||||
"ai21labs/Jamba-tiny-dev",
|
||||
"pfnet/plamo-2-1b",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
]
|
||||
|
||||
FP32_STATE_MODELS = [
|
||||
"state-spaces/mamba-130m-hf",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
]
|
||||
|
||||
# Avoid OOM
|
||||
MAX_NUM_SEQS = 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_batching(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
for_loop_outputs = []
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
for prompt in example_prompts:
|
||||
(single_output,) = vllm_model.generate_greedy_logprobs(
|
||||
[prompt], max_tokens, num_logprobs
|
||||
)
|
||||
for_loop_outputs.append(single_output)
|
||||
|
||||
batched_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=for_loop_outputs,
|
||||
outputs_1_lst=batched_outputs,
|
||||
name_0="for_loop_vllm",
|
||||
name_1="batched_vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [10])
|
||||
def test_chunked_prefill_with_parallel_sampling(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""
|
||||
Tests chunked prefill in conjunction with n > 1.
|
||||
|
||||
In this case, prefill is populated with decoding tokens and
|
||||
we test that it doesn't fail.
|
||||
|
||||
This test might fail if cache is not allocated correctly for n > 1
|
||||
decoding steps inside a chunked prefill forward pass
|
||||
(where we have both prefill and decode together)
|
||||
"""
|
||||
sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
|
||||
with vllm_runner(
|
||||
model,
|
||||
enable_chunked_prefill=True,
|
||||
# forces prefill chunks with decoding
|
||||
max_num_batched_tokens=MAX_NUM_SEQS * 3,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [20])
|
||||
def test_mamba_cache_cg_padding(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""
|
||||
This test is for verifying that mamba cache is padded to CG captured
|
||||
batch size. If it's not, a torch RuntimeError will be raised because
|
||||
tensor dimensions aren't compatible.
|
||||
"""
|
||||
vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
|
||||
while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
|
||||
example_prompts.append(example_prompts[0])
|
||||
|
||||
try:
|
||||
with vllm_runner(model) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
except RuntimeError:
|
||||
pytest.fail(
|
||||
"Couldn't run batch size which is not equal to a Cuda Graph "
|
||||
"captured batch size. "
|
||||
"Could be related to mamba cache not padded correctly"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
) -> None:
|
||||
"""
|
||||
This test is for verifying that the hybrid inner state management doesn't
|
||||
collapse in case where the number of incoming requests and
|
||||
finished_requests_ids is larger than the maximum mamba block capacity.
|
||||
|
||||
This could generally happen due to the fact that hybrid does support
|
||||
statelessness mechanism where it can clean up new incoming requests in
|
||||
a single step.
|
||||
"""
|
||||
try:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
|
||||
except ValueError:
|
||||
pytest.fail(
|
||||
"Hybrid inner state wasn't cleaned up properly between"
|
||||
"steps finished requests registered unnecessarily "
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
def test_state_cleanup(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
) -> None:
|
||||
"""
|
||||
This test is for verifying that the Hybrid state is cleaned up between
|
||||
steps.
|
||||
|
||||
If it's not cleaned, an error would be expected.
|
||||
"""
|
||||
try:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
for _ in range(10):
|
||||
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
|
||||
except ValueError:
|
||||
pytest.fail(
|
||||
"Hybrid inner state wasn't cleaned up between states, "
|
||||
"could be related to finished_requests_ids"
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_distributed_correctness(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
|
||||
) as vllm_model:
|
||||
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
|
||||
) as vllm_model:
|
||||
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=vllm_outputs_tp_1,
|
||||
outputs_1_lst=vllm_outputs_tp_2,
|
||||
name_0="vllm_tp_1",
|
||||
name_1="vllm_tp_2",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_full_cuda_graph(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize(
|
||||
"cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
|
||||
)
|
||||
def test_fp32_cache_state(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
cache_dtype_param: str,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
# Helper functions for the APC tests
|
||||
def _get_vllm_runner_params(
|
||||
model: str,
|
||||
max_model_len: int,
|
||||
tensor_parallel_size: int = 1,
|
||||
):
|
||||
return {
|
||||
"model_name": model,
|
||||
"enable_chunked_prefill": True,
|
||||
"enable_prefix_caching": False,
|
||||
"max_model_len": max_model_len,
|
||||
"tensor_parallel_size": tensor_parallel_size,
|
||||
"gpu_memory_utilization": 0.4,
|
||||
}
|
||||
|
||||
|
||||
def _get_vLLM_output(
|
||||
vllm_runner,
|
||||
kwargs,
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
num_repetitions=1,
|
||||
vllm_model=None,
|
||||
):
|
||||
outs = []
|
||||
if vllm_model is None:
|
||||
vllm_model = vllm_runner(**kwargs)
|
||||
for _ in range(num_repetitions):
|
||||
if num_logprobs < 0:
|
||||
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
|
||||
else:
|
||||
vllm_output = vllm_model.generate_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs
|
||||
)
|
||||
outs.append(vllm_output)
|
||||
|
||||
return outs, vllm_model
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_single_prompt(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts.
|
||||
generated_prompts = [APC_MULTIPLY_BY * example_prompts[0]]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_single_prompt_block_align_alignment(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts. This custom prompt is used, as it causes the most issues
|
||||
generated_prompts = ["The president of the United States is " * APC_MULTIPLY_BY]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
|
||||
# Retrieve the default mamba state block size
|
||||
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
|
||||
|
||||
# In case the hybrid model does not have the
|
||||
# "mamba_block_size" assume a fixed constant
|
||||
if mamba_block_size is None:
|
||||
mamba_block_size = 512
|
||||
|
||||
mamba_block_size_multiplier = 10
|
||||
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
|
||||
vllm_runner_kwargs["max_num_batched_tokens"] = (
|
||||
mamba_block_size_multiplier * mamba_block_size - offsets
|
||||
)
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
# Check alignment of the output logits when using APC
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_multiple_prompts_all_cached_outputs(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts.
|
||||
generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_multiple_prompts_block_align_alignment(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts. This custom prompt is used, as it causes the most issues
|
||||
prompt_text = "The president of the United States is "
|
||||
prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
|
||||
generated_prompts = [
|
||||
prompt_text[offset:] * APC_MULTIPLY_BY for offset in prompt_offsets
|
||||
]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
|
||||
# Retrieve the default mamba state block size
|
||||
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
|
||||
|
||||
# In case the hybrid model does not have the
|
||||
# "mamba_block_size" assume a fixed constant
|
||||
if mamba_block_size is None:
|
||||
mamba_block_size = 512
|
||||
|
||||
mamba_block_size_multiplier = 10
|
||||
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
|
||||
vllm_runner_kwargs["max_num_batched_tokens"] = (
|
||||
mamba_block_size_multiplier * mamba_block_size - offsets
|
||||
)
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
# Check alignment of the output logits when using APC
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_multiple_prompts_partial_cached_outputs(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts.
|
||||
generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
# Cache only part of all the prompts
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0][:3],
|
||||
outputs_1_lst=vllm_outputs_partial_cache[0],
|
||||
name_0="vllm_no_cache",
|
||||
name_1="vllm_partial_cache",
|
||||
)
|
||||
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
vllm_model=vllm_model,
|
||||
)
|
||||
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
352
tests/models/language/generation/test_mistral.py
Normal file
352
tests/models/language/generation/test_mistral.py
Normal file
@@ -0,0 +1,352 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import copy
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.tool_parsers.mistral_tool_parser import (
|
||||
MistralToolCall,
|
||||
MistralToolParser,
|
||||
)
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||
]
|
||||
|
||||
MISTRAL_FORMAT_MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||
# uses the v3-Tekken tokenizer
|
||||
"mistralai/Ministral-8B-Instruct-2410",
|
||||
# Mistral-Nemo is too big for CI, but passes locally
|
||||
# "mistralai/Mistral-Nemo-Instruct-2407"
|
||||
]
|
||||
|
||||
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||
SYMBOLIC_LANG_PROMPTS = [
|
||||
"勇敢な船乗りについての詩を書く", # japanese
|
||||
"寫一首關於勇敢的水手的詩", # chinese
|
||||
"ပုံပြင်လေးပြောပြပါ်:\n", # burmese
|
||||
"Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625
|
||||
]
|
||||
|
||||
# for function calling
|
||||
TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to find the weather for, e.g. "
|
||||
"'San Francisco'",
|
||||
},
|
||||
"state": {
|
||||
"type": "string",
|
||||
"description": "the two-letter abbreviation for the state that "
|
||||
"the city is in, e.g. 'CA' which would mean 'California'",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rewrite",
|
||||
"description": "Rewrites text",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"required": [],
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The input text to rewrite.",
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
MSGS = [
|
||||
{"role": "system", "content": "You are an assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Could you please rewrite the below article? \n\n My English needs "
|
||||
"improvving, maybe I make errors.",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "",
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": "bbc5b7ede",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rewrite",
|
||||
"arguments": '{"text":"My English needs improvving, maybe '
|
||||
'I make errors."}',
|
||||
},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
|
||||
'I make errors."}',
|
||||
"tool_call_id": "bbc5b7ede",
|
||||
"name": "rewrite",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "---\n\nMy English needs improving, maybe I make errors",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
|
||||
),
|
||||
},
|
||||
]
|
||||
|
||||
SAMPLE_JSON_SCHEMA = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"age": {"type": "integer"},
|
||||
"skills": {
|
||||
"type": "array",
|
||||
"items": {"type": "string", "maxLength": 10},
|
||||
"minItems": 3,
|
||||
},
|
||||
"work_history": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"company": {"type": "string"},
|
||||
"duration": {"type": "number"},
|
||||
"position": {"type": "string"},
|
||||
},
|
||||
"required": ["company", "position"],
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["name", "age", "skills", "work_history"],
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# TODO(sang): Sliding window should be tested separately.
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_mistral_format(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
) as mistral_format_model:
|
||||
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="hf",
|
||||
load_format="safetensors",
|
||||
config_format="hf",
|
||||
) as hf_format_model:
|
||||
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_format_outputs,
|
||||
outputs_1_lst=mistral_format_outputs,
|
||||
name_0="hf",
|
||||
name_1="mistral",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=8192,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
) as vllm_model:
|
||||
for prompt in SYMBOLIC_LANG_PROMPTS:
|
||||
msg = {"role": "user", "content": prompt}
|
||||
outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
|
||||
assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
) as vllm_model:
|
||||
msgs = copy.deepcopy(MSGS)
|
||||
outputs = vllm_model.llm.chat(
|
||||
msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
|
||||
)
|
||||
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
tool_parser = MistralToolParser(tokenizer)
|
||||
|
||||
model_output = outputs[0].outputs[0].text.strip()
|
||||
assert model_output.startswith(tool_parser.bot_token), model_output
|
||||
parsed_message = tool_parser.extract_tool_calls(model_output, None)
|
||||
|
||||
assert parsed_message.tools_called
|
||||
|
||||
assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
|
||||
assert parsed_message.tool_calls[0].function.name == "get_current_weather"
|
||||
assert (
|
||||
parsed_message.tool_calls[0].function.arguments
|
||||
== '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
|
||||
) # noqa
|
||||
assert parsed_message.content is None
|
||||
|
||||
|
||||
def test_mistral_function_call_nested_json():
|
||||
"""Ensure that the function-name regex captures the entire outermost
|
||||
JSON block, including nested braces."""
|
||||
|
||||
# Create a minimal stub tokenizer that provides the few attributes the
|
||||
# parser accesses (`version` and `get_vocab`).
|
||||
class _StubMistralTokenizer(MistralTokenizer):
|
||||
version = 11 # Satisfy the version check
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def get_vocab():
|
||||
# Provide the special TOOL_CALLS token expected by the parser.
|
||||
return {"[TOOL_CALLS]": 0}
|
||||
|
||||
tokenizer = _StubMistralTokenizer()
|
||||
parser = MistralToolParser(tokenizer)
|
||||
|
||||
# Craft a model output featuring nested JSON inside the arguments.
|
||||
args_dict = {
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
|
||||
}
|
||||
|
||||
model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
|
||||
|
||||
parsed = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
# Assertions: the tool call is detected and the full nested JSON is parsed
|
||||
# without truncation.
|
||||
assert parsed.tools_called
|
||||
|
||||
assert MistralToolCall.is_valid_id(parsed.tool_calls[0].id)
|
||||
assert parsed.tool_calls[0].function.name == "get_current_weather"
|
||||
assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
|
||||
# No additional content outside the tool call should be returned.
|
||||
assert parsed.content is None
|
||||
|
||||
# multiple calls
|
||||
multiple_args_dict = [
|
||||
{
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
|
||||
},
|
||||
{},
|
||||
{"a": 0},
|
||||
{"a": 1, "b": "c"},
|
||||
]
|
||||
names = ["get_current_weather", "get_current_weather_2", "random", "random_2"]
|
||||
|
||||
model_output = "".join(
|
||||
[
|
||||
f"{parser.bot_token}{name}{json.dumps(args)}"
|
||||
for name, args in zip(names, multiple_args_dict)
|
||||
]
|
||||
)
|
||||
|
||||
parsed = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
# Assertions: the tool call is detected and the full nested JSON is parsed
|
||||
# without truncation.
|
||||
assert parsed.tools_called
|
||||
assert len(parsed.tool_calls) == len(multiple_args_dict)
|
||||
|
||||
for i, tool_call in enumerate(parsed.tool_calls):
|
||||
assert MistralToolCall.is_valid_id(tool_call.id)
|
||||
assert tool_call.function.name == names[i]
|
||||
assert json.loads(tool_call.function.arguments) == multiple_args_dict[i]
|
||||
# No additional content outside the tool call should be returned.
|
||||
assert parsed.content is None
|
||||
96
tests/models/language/generation/test_phimoe.py
Normal file
96
tests/models/language/generation/test_phimoe.py
Normal file
@@ -0,0 +1,96 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"microsoft/Phi-3.5-MoE-instruct",
|
||||
]
|
||||
|
||||
|
||||
def test_phimoe_routing_function():
|
||||
from vllm.model_executor.models.phimoe import phimoe_routing_function
|
||||
|
||||
test_case = {
|
||||
0: {
|
||||
"hidden_states": torch.tensor(
|
||||
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
|
||||
).view(4, 2),
|
||||
"gating_output": torch.tensor(
|
||||
[0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk": 2,
|
||||
"renormalize": False,
|
||||
},
|
||||
1: {
|
||||
"hidden_states": torch.tensor(
|
||||
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
|
||||
).view(4, 2),
|
||||
"gating_output": torch.tensor(
|
||||
[0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk": 2,
|
||||
"renormalize": False,
|
||||
},
|
||||
}
|
||||
|
||||
ground_truth = {
|
||||
0: {
|
||||
"topk_weights": torch.tensor(
|
||||
[1.0, 1.0], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
|
||||
},
|
||||
1: {
|
||||
"topk_weights": torch.tensor(
|
||||
[0.5, 1.0], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
|
||||
},
|
||||
}
|
||||
|
||||
for test_id in test_case:
|
||||
topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
|
||||
assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
|
||||
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
condition=current_platform.is_cpu(),
|
||||
reason="This test takes a lot time to run on CPU, "
|
||||
"and vllm CI's disk space is not enough for this model.",
|
||||
)
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
128
tests/models/language/generation_ppl_test/ppl_utils.py
Normal file
128
tests/models/language/generation_ppl_test/ppl_utils.py
Normal file
@@ -0,0 +1,128 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/docs/transformers/perplexity
|
||||
from typing import cast
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
|
||||
import tests.ci_envs as ci_envs
|
||||
from tests.models.utils import (
|
||||
GenerateModelInfo,
|
||||
TokensTextLogprobsPromptLogprobs,
|
||||
get_vllm_extra_kwargs,
|
||||
)
|
||||
from vllm.logprobs import Logprob
|
||||
|
||||
# See #24485
|
||||
PPL_TOL = 0.01
|
||||
MAX_LENGTH = 1024
|
||||
|
||||
|
||||
@torch.inference_mode
|
||||
def wikitext_ppl_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: GenerateModelInfo,
|
||||
max_length=MAX_LENGTH,
|
||||
vllm_extra_kwargs=None,
|
||||
atol=PPL_TOL,
|
||||
):
|
||||
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
||||
|
||||
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_model_len=max_length,
|
||||
max_num_seqs=1,
|
||||
**vllm_extra_kwargs,
|
||||
) as vllm_model:
|
||||
# Use max_num_seqs=1 to avoid OOM,
|
||||
# and avoid batch different requests together.
|
||||
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
# Confirm whether vllm is using the correct architecture
|
||||
if model_info.architecture:
|
||||
assert model_info.architecture in model_config.architectures
|
||||
|
||||
max_length = min(model_config.max_model_len - 1, max_length)
|
||||
stride = max_length
|
||||
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
tokens = tokenizer.encode("\n\n".join(dataset["text"]))
|
||||
n_tokens = len(tokens)
|
||||
|
||||
chunks = []
|
||||
for begin_loc in range(0, n_tokens, stride):
|
||||
end_loc = min(begin_loc + max_length, n_tokens)
|
||||
chunks.append(tokens[begin_loc:end_loc])
|
||||
|
||||
outputs = vllm_model.generate_greedy_logprobs(
|
||||
prompts=chunks,
|
||||
max_tokens=1,
|
||||
num_logprobs=None,
|
||||
num_prompt_logprobs=0,
|
||||
use_tqdm=False,
|
||||
)
|
||||
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
n_tokens = 0
|
||||
for output in outputs:
|
||||
output = cast(TokensTextLogprobsPromptLogprobs, output)
|
||||
token_datas = cast(list[dict[int, Logprob] | None], output[3])
|
||||
|
||||
assert token_datas[0] is None
|
||||
token_log_probs = []
|
||||
for token_data in token_datas[1:]:
|
||||
assert token_data is not None
|
||||
assert len(token_data) == 1
|
||||
token_log_prob = list(token_data.values())[0].logprob
|
||||
token_log_probs.append(token_log_prob)
|
||||
|
||||
neg_log_likelihood = -torch.tensor(
|
||||
token_log_probs, dtype=torch.float32, device="cpu"
|
||||
).sum()
|
||||
nll_sum += neg_log_likelihood
|
||||
n_tokens += len(token_log_probs)
|
||||
vllm_ppl = float(torch.exp(nll_sum / n_tokens))
|
||||
vllm_dtype = model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
# Accelerate ppl test by setting Transformers ppl score to a constant
|
||||
if model_info.hf_ppl is None:
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
|
||||
) as hf_model:
|
||||
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
n_tokens = 0
|
||||
for chunk in chunks:
|
||||
inputs = hf_model.wrap_device({"input_ids": torch.tensor([chunk])})
|
||||
input_ids = inputs["input_ids"]
|
||||
outputs = hf_model.model(input_ids, labels=input_ids)
|
||||
neg_log_likelihood = outputs.loss
|
||||
|
||||
neg_log_likelihood = neg_log_likelihood.to(torch.float32).cpu()
|
||||
|
||||
num_loss_tokens = len(chunk) - 1
|
||||
nll_sum += neg_log_likelihood * num_loss_tokens
|
||||
n_tokens += num_loss_tokens
|
||||
|
||||
hf_ppl = float(torch.exp(nll_sum / n_tokens))
|
||||
hf_dtype = next(hf_model.model.parameters()).dtype
|
||||
else:
|
||||
hf_ppl = model_info.hf_ppl
|
||||
hf_dtype = "Constant"
|
||||
|
||||
differ = (vllm_ppl - hf_ppl) / hf_ppl
|
||||
print("Model:", model_info.name)
|
||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_ppl)
|
||||
print("Transformers:", hf_dtype, hf_ppl)
|
||||
print("Difference (%):", differ * 100)
|
||||
|
||||
# PPL the smaller, the better
|
||||
# We are not concerned that the vllm PPL is less than Transformers,
|
||||
# so we only perform one-sided testing.
|
||||
assert differ < atol
|
||||
18
tests/models/language/generation_ppl_test/test_gemma.py
Normal file
18
tests/models/language/generation_ppl_test/test_gemma.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import GenerateModelInfo
|
||||
|
||||
from .ppl_utils import wikitext_ppl_test
|
||||
|
||||
MODELS = [
|
||||
GenerateModelInfo("google/gemma-2b"),
|
||||
GenerateModelInfo("google/gemma-2-2b"),
|
||||
GenerateModelInfo("google/gemma-3-4b-it"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
|
||||
wikitext_ppl_test(hf_runner, vllm_runner, model_info)
|
||||
14
tests/models/language/generation_ppl_test/test_gpt.py
Normal file
14
tests/models/language/generation_ppl_test/test_gpt.py
Normal file
@@ -0,0 +1,14 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import GenerateModelInfo
|
||||
|
||||
from .ppl_utils import wikitext_ppl_test
|
||||
|
||||
MODELS = [GenerateModelInfo("openai-community/gpt2-large")]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
|
||||
wikitext_ppl_test(hf_runner, vllm_runner, model_info)
|
||||
21
tests/models/language/generation_ppl_test/test_qwen.py
Normal file
21
tests/models/language/generation_ppl_test/test_qwen.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import GenerateModelInfo
|
||||
|
||||
from .ppl_utils import wikitext_ppl_test
|
||||
|
||||
MODELS = [
|
||||
GenerateModelInfo("Qwen/Qwen3-0.6B"),
|
||||
GenerateModelInfo("Qwen/Qwen3-0.6B-FP8"),
|
||||
# transformers:
|
||||
# Loading a GPTQ quantized model requires optimum, gptqmodel
|
||||
# GenerateModelInfo("Qwen/Qwen3-0.6B-GPTQ-Int8"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
|
||||
wikitext_ppl_test(hf_runner, vllm_runner, model_info)
|
||||
0
tests/models/language/pooling/__init__.py
Normal file
0
tests/models/language/pooling/__init__.py
Normal file
67
tests/models/language/pooling/embed_utils.py
Normal file
67
tests/models/language/pooling/embed_utils.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Sequence
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.utils import EmbedModelInfo, check_embeddings_close, matryoshka_fy
|
||||
|
||||
|
||||
def run_embedding_correctness_test(
|
||||
hf_model: "HfRunner",
|
||||
inputs: list[str],
|
||||
vllm_outputs: Sequence[list[float]],
|
||||
dimensions: int | None = None,
|
||||
):
|
||||
hf_outputs = hf_model.encode(inputs)
|
||||
if dimensions:
|
||||
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
|
||||
def correctness_test_embed_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
):
|
||||
pytest.skip("Debug only, ci prefers to use mteb test.")
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
# "Write a short story about a robot that dreams for the first time.\n"
|
||||
# sentence_transformers will strip the input texts, see:
|
||||
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
|
||||
# This makes the input_ids different between hf_model and vllm_model.
|
||||
# So we need to strip the input texts to avoid test failing.
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=None, **vllm_extra_kwargs
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=model_info.hf_dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
|
||||
@@ -0,0 +1,53 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModel
|
||||
|
||||
from tests.models.utils import check_embeddings_close
|
||||
from vllm import TokensPrompt
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["Qwen/Qwen3-Embedding-0.6B"],
|
||||
)
|
||||
@torch.inference_mode
|
||||
def test_embed_models(hf_runner, vllm_runner, model: str):
|
||||
chunk_size = 10
|
||||
n_prompt_tokens = [55, 56, 57]
|
||||
token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
max_model_len=128,
|
||||
max_num_batched_tokens=chunk_size,
|
||||
enforce_eager=True,
|
||||
# `enable_chunked_prefill`: Set to `False` instead of `None` in VllmRunner
|
||||
enable_chunked_prefill=True,
|
||||
enable_prefix_caching=True,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.token_embed(
|
||||
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
|
||||
)
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
auto_cls=AutoModel,
|
||||
) as hf_model:
|
||||
hf_outputs = []
|
||||
for token_prompt in token_prompts:
|
||||
inputs = hf_model.wrap_device({"input_ids": torch.tensor([token_prompt])})
|
||||
input_ids = inputs["input_ids"]
|
||||
output = hf_model.model(input_ids)
|
||||
hf_outputs.append(output.last_hidden_state.cpu().float()[0])
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_output,
|
||||
embeddings_1_lst=vllm_output,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
110
tests/models/language/pooling/test_auto_prefix_cache_support.py
Normal file
110
tests/models/language/pooling/test_auto_prefix_cache_support.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["jason9693/Qwen2.5-1.5B-apeach"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_classify_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
# example_prompts is too short for testing prefix_caching
|
||||
example_prompts = [s * 10 for s in example_prompts]
|
||||
|
||||
with vllm_runner(
|
||||
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
|
||||
) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert cache_config.enable_prefix_caching
|
||||
|
||||
# First Run
|
||||
vllm_model.classify(example_prompts)
|
||||
|
||||
# assert prefix_caching works
|
||||
pooling_outputs = vllm_model.llm.encode(
|
||||
example_prompts, pooling_task="classify"
|
||||
)
|
||||
for output in pooling_outputs:
|
||||
assert output.num_cached_tokens > 0
|
||||
vllm_outputs = [req_output.outputs.data for req_output in pooling_outputs]
|
||||
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
assert torch.allclose(
|
||||
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["Qwen/Qwen3-Embedding-0.6B"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_embed_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
):
|
||||
# example_prompts is too short for testing prefix_caching
|
||||
example_prompts = [str(s).strip() * 10 for s in example_prompts]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
enable_prefix_caching=True,
|
||||
) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert cache_config.enable_prefix_caching
|
||||
|
||||
# First Run
|
||||
vllm_model.embed(example_prompts)
|
||||
|
||||
# assert prefix_caching works
|
||||
pooling_outputs = vllm_model.llm.encode(example_prompts, pooling_task="embed")
|
||||
for output in pooling_outputs:
|
||||
assert output.num_cached_tokens > 0
|
||||
vllm_outputs = [req_output.outputs.data for req_output in pooling_outputs]
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"intfloat/e5-small",
|
||||
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False
|
||||
"papluca/xlm-roberta-base-language-detection",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_non_causal_models(
|
||||
hf_runner, vllm_runner, example_prompts, model: str, dtype: str
|
||||
) -> None:
|
||||
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert not cache_config.enable_prefix_caching
|
||||
49
tests/models/language/pooling/test_classification.py
Normal file
49
tests/models/language/pooling/test_classification.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param(
|
||||
"jason9693/Qwen2.5-1.5B-apeach",
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
# check logits difference
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
# the tolerance value of 1e-2 is selected based on the
|
||||
# half datatype tests in
|
||||
# tests/models/language/pooling/test_embedding.py
|
||||
assert torch.allclose(
|
||||
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
89
tests/models/language/pooling/test_embedding.py
Normal file
89
tests/models/language/pooling/test_embedding.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import PoolerConfig
|
||||
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
# Be careful of the order of models, decoder-only models should be
|
||||
# placed before encoder-only models, otherwise `Qwen2.5-0.5B-Instruct`
|
||||
# case won't pass because gte-Qwen2-1.5B-instruct will cache custom
|
||||
# model code with bidirectional attention.
|
||||
# [Decoder-only]
|
||||
pytest.param(
|
||||
"BAAI/bge-multilingual-gemma2",
|
||||
marks=[pytest.mark.core_model, pytest.mark.slow_test],
|
||||
),
|
||||
pytest.param(
|
||||
"intfloat/e5-mistral-7b-instruct",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param(
|
||||
"ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.cpu_model]
|
||||
),
|
||||
# [Encoder-only]
|
||||
pytest.param(
|
||||
"BAAI/bge-base-en-v1.5",
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
||||
pytest.param("intfloat/multilingual-e5-small"),
|
||||
# [Cross-Encoder]
|
||||
pytest.param(
|
||||
"sentence-transformers/stsb-roberta-base-v2",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model,
|
||||
) -> None:
|
||||
vllm_extra_kwargs = {}
|
||||
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
|
||||
vllm_extra_kwargs["pooler_config"] = PoolerConfig(
|
||||
pooling_type="MEAN", normalize=False
|
||||
)
|
||||
|
||||
max_model_len: int | None = 512
|
||||
if model in [
|
||||
"sentence-transformers/all-MiniLM-L12-v2",
|
||||
"sentence-transformers/stsb-roberta-base-v2",
|
||||
]:
|
||||
max_model_len = None
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
# "Write a short story about a robot that dreams for the first time.\n"
|
||||
# sentence_transformers will strip the input texts, see:
|
||||
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
|
||||
# This makes the input_ids different between hf_model and vllm_model.
|
||||
# So we need to strip the input texts to avoid test failing.
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with hf_runner(model, is_sentence_transformer=True) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model, runner="pooling", max_model_len=max_model_len, **vllm_extra_kwargs
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
57
tests/models/language/pooling/test_extract_hidden_states.py
Normal file
57
tests/models/language/pooling/test_extract_hidden_states.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import TokensPrompt
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["Qwen/Qwen3-0.6B"],
|
||||
)
|
||||
@torch.inference_mode
|
||||
def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
|
||||
n_prompt_tokens = [55, 56, 57]
|
||||
token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=128,
|
||||
enforce_eager=True,
|
||||
runner="pooling",
|
||||
enable_prefix_caching=True,
|
||||
) as vllm_model:
|
||||
pooling_outputs = vllm_model.llm.encode(
|
||||
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
|
||||
pooling_task="token_embed",
|
||||
)
|
||||
|
||||
for n, output in zip(n_prompt_tokens, pooling_outputs):
|
||||
assert len(output.prompt_token_ids) == n
|
||||
assert len(output.outputs.data) == n
|
||||
assert output.num_cached_tokens == 0
|
||||
|
||||
# test enable_prefix_caching plus all pooling
|
||||
# we need to skip reading cache at this request by
|
||||
# request.skip_reading_prefix_cache
|
||||
pooling_outputs = vllm_model.llm.encode(
|
||||
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
|
||||
pooling_task="token_embed",
|
||||
)
|
||||
|
||||
for n, output in zip(n_prompt_tokens, pooling_outputs):
|
||||
assert len(output.prompt_token_ids) == n
|
||||
assert len(output.outputs.data) == n
|
||||
assert output.num_cached_tokens == 0
|
||||
|
||||
# skip_reading_prefix_cache can still write to cache
|
||||
# to accelerate following requests
|
||||
pooling_outputs = vllm_model.llm.encode(
|
||||
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
|
||||
pooling_task="embed",
|
||||
)
|
||||
|
||||
for n, output in zip(n_prompt_tokens, pooling_outputs):
|
||||
assert len(output.prompt_token_ids) == n
|
||||
assert output.num_cached_tokens > 0
|
||||
192
tests/models/language/pooling/test_gritlm.py
Normal file
192
tests/models/language/pooling/test_gritlm.py
Normal file
@@ -0,0 +1,192 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import numpy as np
|
||||
import openai
|
||||
import pytest
|
||||
from scipy.spatial.distance import cosine
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
from ....utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "parasail-ai/GritLM-7B-vllm"
|
||||
MAX_MODEL_LEN = 4000
|
||||
ATOL = 0.002
|
||||
|
||||
|
||||
def _arr(arr):
|
||||
"""
|
||||
Convert a list of integers to an array of integers.
|
||||
"""
|
||||
return np.array(arr)
|
||||
|
||||
|
||||
def test_find_array():
|
||||
from vllm.model_executor.models.gritlm import GritLMMeanPool
|
||||
|
||||
model_config = ModelConfig(
|
||||
MODEL_NAME,
|
||||
runner="pooling",
|
||||
dtype="bfloat16",
|
||||
seed=0,
|
||||
)
|
||||
pooling = GritLMMeanPool(model_config=model_config)
|
||||
|
||||
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||
|
||||
assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
|
||||
assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
|
||||
assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
|
||||
assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=3) == -1
|
||||
assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=4) == 3
|
||||
assert pooling._find_array(arr, _arr([3, 5]), start_idx=0) == -1
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
pooling._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
|
||||
|
||||
|
||||
def run_llm_encode(
|
||||
llm: LLM,
|
||||
queries: list[str],
|
||||
instruction: str,
|
||||
) -> list[list[float]]:
|
||||
outputs = llm.embed([instruction + q for q in queries])
|
||||
return [output.outputs.embedding for output in outputs]
|
||||
|
||||
|
||||
async def run_client_embeddings(
|
||||
client: openai.AsyncOpenAI,
|
||||
queries: list[str],
|
||||
instruction: str,
|
||||
) -> list[list[float]]:
|
||||
outputs = await client.embeddings.create(
|
||||
model=MODEL_NAME,
|
||||
input=[instruction + q for q in queries],
|
||||
)
|
||||
return [data.embedding for data in outputs.data]
|
||||
|
||||
|
||||
def gritlm_instruction(instruction):
|
||||
return (
|
||||
"<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n"
|
||||
)
|
||||
|
||||
|
||||
def get_test_data():
|
||||
"""
|
||||
Grabbed this test data and the expected values from
|
||||
README.md in https://github.com/ContextualAI/gritlm
|
||||
"""
|
||||
q_instruction = gritlm_instruction(
|
||||
"Given a scientific paper title, retrieve the paper's abstract",
|
||||
)
|
||||
queries = [
|
||||
"Bitcoin: A Peer-to-Peer Electronic Cash System",
|
||||
"Generative Representational Instruction Tuning",
|
||||
]
|
||||
|
||||
d_instruction = gritlm_instruction("")
|
||||
documents = [
|
||||
# ruff: noqa: E501
|
||||
"A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
|
||||
"All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
|
||||
]
|
||||
|
||||
return queries, q_instruction, documents, d_instruction
|
||||
|
||||
|
||||
def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
|
||||
cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
|
||||
assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=ATOL)
|
||||
|
||||
cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1])
|
||||
assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=ATOL)
|
||||
|
||||
cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0])
|
||||
assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=ATOL)
|
||||
|
||||
cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
|
||||
assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=ATOL)
|
||||
|
||||
|
||||
def test_gritlm_offline_embedding(vllm_runner):
|
||||
queries, q_instruction, documents, d_instruction = get_test_data()
|
||||
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
runner="pooling",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
|
||||
d_rep = run_llm_encode(
|
||||
llm,
|
||||
documents,
|
||||
d_instruction,
|
||||
)
|
||||
q_rep = run_llm_encode(
|
||||
llm,
|
||||
queries,
|
||||
q_instruction,
|
||||
)
|
||||
|
||||
validate_embed_output(q_rep, d_rep)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gritlm_api_server_embedding():
|
||||
queries, q_instruction, documents, d_instruction = get_test_data()
|
||||
|
||||
args = ["--runner", "pooling", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as server:
|
||||
client_embedding = server.get_async_client()
|
||||
|
||||
d_rep = await run_client_embeddings(
|
||||
client_embedding,
|
||||
documents,
|
||||
d_instruction,
|
||||
)
|
||||
q_rep = await run_client_embeddings(
|
||||
client_embedding,
|
||||
queries,
|
||||
q_instruction,
|
||||
)
|
||||
|
||||
validate_embed_output(q_rep, d_rep)
|
||||
|
||||
|
||||
def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
|
||||
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
|
||||
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
runner="generate",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
|
||||
outputs = llm.generate(input, sampling_params=sampling_params)
|
||||
|
||||
assert outputs[0].outputs[0].text == "The capital of France is Paris."
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gritlm_api_server_generate():
|
||||
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
|
||||
|
||||
args = ["--runner", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as server:
|
||||
client_generate = server.get_async_client()
|
||||
|
||||
outputs = await client_generate.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=input,
|
||||
max_tokens=256,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
assert outputs.choices[0].text == "The capital of France is Paris."
|
||||
47
tests/models/language/pooling/test_head_dtype.py
Normal file
47
tests/models/language/pooling/test_head_dtype.py
Normal file
@@ -0,0 +1,47 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["nie3e/sentiment-polish-gpt2-small"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_classify_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for head_dtype_str in ["float32", "model"]:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
hf_overrides={"head_dtype": head_dtype_str},
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
model_dtype = model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
if head_dtype_str == "float32":
|
||||
assert head_dtype == torch.float32
|
||||
elif head_dtype_str == "model":
|
||||
assert head_dtype == model_dtype
|
||||
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output).float()
|
||||
vllm_output = torch.tensor(vllm_output).float()
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output, atol=1e-2)
|
||||
103
tests/models/language/pooling/test_mm_classifier_conversion.py
Normal file
103
tests/models/language/pooling/test_mm_classifier_conversion.py
Normal file
@@ -0,0 +1,103 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.config.pooler import PoolerConfig
|
||||
|
||||
|
||||
def test_idefics_multimodal(
|
||||
vllm_runner,
|
||||
) -> None:
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model_name="HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
runner="pooling",
|
||||
convert="classify",
|
||||
load_format="dummy",
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16",
|
||||
) as vllm_model:
|
||||
llm = vllm_model.get_llm()
|
||||
outputs = llm.classify(prompts)
|
||||
for output in outputs:
|
||||
assert len(output.outputs.probs) == 2
|
||||
|
||||
|
||||
def update_config(config):
|
||||
config.text_config.update(
|
||||
{
|
||||
"architectures": ["Gemma3ForSequenceClassification"],
|
||||
"classifier_from_token": ["A", "B", "C", "D", "E"],
|
||||
"method": "no_post_processing",
|
||||
"id2label": {
|
||||
"A": "Chair",
|
||||
"B": "Couch",
|
||||
"C": "Table",
|
||||
"D": "Bed",
|
||||
"E": "Cupboard",
|
||||
},
|
||||
}
|
||||
)
|
||||
return config
|
||||
|
||||
|
||||
def test_gemma_multimodal(
|
||||
vllm_runner,
|
||||
) -> None:
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": """
|
||||
You are a helpful assistant. You will be given a product description
|
||||
which may also include an image. Classify the following product into
|
||||
one of the categories:
|
||||
|
||||
A = chair
|
||||
B = couch
|
||||
C = table
|
||||
D = bed
|
||||
E = cupboard
|
||||
|
||||
You'll answer with exactly one letter (A, B, C, D, or E).""",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/red_chair.jpg"
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "A fine 19th century piece of furniture."},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model_name="google/gemma-3-4b-it",
|
||||
runner="pooling",
|
||||
convert="classify",
|
||||
load_format="auto",
|
||||
hf_overrides=update_config,
|
||||
pooler_config=PoolerConfig(pooling_type="LAST"),
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16",
|
||||
) as vllm_model:
|
||||
llm = vllm_model.get_llm()
|
||||
prompts = llm.preprocess_chat(messages)
|
||||
|
||||
result = llm.classify(prompts)
|
||||
assert result[0].outputs.probs[0] > 0.95
|
||||
assert all(c < 0.05 for c in result[0].outputs.probs[1:])
|
||||
45
tests/models/language/pooling/test_multi_vector_retrieval.py
Normal file
45
tests/models/language/pooling/test_multi_vector_retrieval.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModel
|
||||
|
||||
from tests.models.utils import check_embeddings_close
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["BAAI/bge-m3"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@torch.inference_mode
|
||||
def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str):
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.token_embed(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
auto_cls=AutoModel,
|
||||
) as hf_model:
|
||||
tokenizer = hf_model.tokenizer
|
||||
hf_outputs = []
|
||||
for prompt in example_prompts:
|
||||
inputs = tokenizer([prompt], return_tensors="pt")
|
||||
inputs = hf_model.wrap_device(inputs)
|
||||
output = hf_model.model(**inputs)
|
||||
embedding = output.last_hidden_state[0].float()
|
||||
# normal
|
||||
hf_outputs.append(embedding.cpu())
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_output,
|
||||
embeddings_1_lst=vllm_output,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
@@ -0,0 +1,34 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["Rami/multi-label-class-classification-on-github-issues"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_classify_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
assert torch.allclose(
|
||||
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
136
tests/models/language/pooling/test_nomic_max_model_len.py
Normal file
136
tests/models/language/pooling/test_nomic_max_model_len.py
Normal file
@@ -0,0 +1,136 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: SIM117
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from ...utils import EmbedModelInfo
|
||||
|
||||
MODELS = [
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
|
||||
# EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
|
||||
# EmbedModelInfo("nomic-ai/CodeRankEmbed"),
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
|
||||
# EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
|
||||
]
|
||||
|
||||
rope_theta = 1000
|
||||
factor = 4.0
|
||||
original_max_position_embeddings = 2048
|
||||
max_model_len = int(original_max_position_embeddings * factor)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_default(model_info, vllm_runner):
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=None
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
assert model_config.max_model_len == 512
|
||||
else:
|
||||
assert model_config.max_model_len == original_max_position_embeddings
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
# set max_model_len <= 512
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=256
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 256
|
||||
|
||||
# set 512 < max_model_len <= 2048
|
||||
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name, runner="pooling", max_model_len=1024):
|
||||
pass
|
||||
else:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=1024
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 1024
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_set_max_model_len_illegal(model_info, vllm_runner):
|
||||
# set max_model_len > 2048
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name, runner="pooling", max_model_len=4096):
|
||||
pass
|
||||
|
||||
# set max_model_len > 2048 by hf_overrides
|
||||
hf_overrides = {"max_model_len": 4096}
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides,
|
||||
):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_use_rope_scaling_legal(model_info, vllm_runner):
|
||||
hf_overrides = {
|
||||
"rope_parameters": {
|
||||
"rope_theta": rope_theta,
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=None, hf_overrides=hf_overrides
|
||||
):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_use_rope_scaling_illegal(model_info, vllm_runner):
|
||||
hf_overrides: dict[str, Any] = {
|
||||
"rope_parameters": {
|
||||
"rope_theta": rope_theta,
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
}
|
||||
# illegal max_model_len
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=max_model_len + 1,
|
||||
hf_overrides=hf_overrides,
|
||||
):
|
||||
pass
|
||||
|
||||
hf_overrides = {
|
||||
"rope_parameters": {
|
||||
"rope_theta": rope_theta,
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
"max_model_len": max_model_len + 1,
|
||||
}
|
||||
# illegal max_model_len by hf_overrides
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides,
|
||||
):
|
||||
pass
|
||||
@@ -0,0 +1,167 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from tests.models.utils import softmax
|
||||
from vllm.config import PoolerConfig
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["jason9693/Qwen2.5-1.5B-apeach", "papluca/xlm-roberta-base-language-detection"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_classify_models_using_activation(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(use_activation=False),
|
||||
) as vllm_model:
|
||||
wo_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(use_activation=True),
|
||||
) as vllm_model:
|
||||
w_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
for wo_activation, w_activation in zip(wo_activation_out, w_activation_out):
|
||||
wo_activation = torch.tensor(wo_activation)
|
||||
w_activation = torch.tensor(w_activation)
|
||||
|
||||
assert not torch.allclose(wo_activation, w_activation, atol=1e-2), (
|
||||
"pooler_config is not working"
|
||||
)
|
||||
assert torch.allclose(
|
||||
softmax(wo_activation), w_activation, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"intfloat/multilingual-e5-small",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_embed_models_using_normalize(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False),
|
||||
) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True),
|
||||
) as vllm_model:
|
||||
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
assert not torch.allclose(wo_normalize, w_normalize, atol=1e-2), (
|
||||
"pooler_config normalize is not working"
|
||||
)
|
||||
assert torch.allclose(
|
||||
F.normalize(wo_normalize, p=2, dim=-1), w_normalize, atol=1e-2
|
||||
), "w_normal should be close to normal(wo_normal)."
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"internlm/internlm2-1_8b-reward",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_reward_models_using_activation(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(use_activation=False),
|
||||
) as vllm_model:
|
||||
wo_activation = vllm_model.reward(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(use_activation=True),
|
||||
) as vllm_model:
|
||||
w_activation = vllm_model.reward(example_prompts)
|
||||
|
||||
for wo, w in zip(wo_activation, w_activation):
|
||||
wo = torch.tensor(wo)
|
||||
w = torch.tensor(w)
|
||||
|
||||
assert not torch.allclose(wo, w, atol=1e-2), (
|
||||
"pooler_config activation is not working"
|
||||
)
|
||||
assert torch.allclose(softmax(wo), w, atol=1e-2), (
|
||||
"w_activation should be close to activation(wo_activation)."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"intfloat/multilingual-e5-small",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_multi_vector_retrieval_models_using_normalize(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False),
|
||||
) as vllm_model:
|
||||
wo_normalize = vllm_model.token_embed(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True),
|
||||
) as vllm_model:
|
||||
w_normalize = vllm_model.token_embed(example_prompts)
|
||||
|
||||
for wo, w in zip(wo_normalize, w_normalize):
|
||||
assert not torch.allclose(wo, w, atol=1e-2), (
|
||||
"pooler_config normalize is not working"
|
||||
)
|
||||
assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), (
|
||||
"w_normal should be close to normal(wo_normal)."
|
||||
)
|
||||
99
tests/models/language/pooling/test_reward.py
Normal file
99
tests/models/language/pooling/test_reward.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from transformers import AutoModel
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import HfRunner
|
||||
from ...utils import check_transformers_version
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def math_step_prompts():
|
||||
# ruff: noqa: E501
|
||||
data = {
|
||||
"system": "Please reason step by step, and put your final answer within \\boxed{}. ",
|
||||
"query": "Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
|
||||
"response": [
|
||||
"To find out how many more pink plastic flamingos were out than white plastic flamingos at noon on Sunday, we can break down the problem into steps. First, on Friday, the neighbors start with 18 pink plastic flamingos.",
|
||||
"On Saturday, they take back one third of the flamingos. Since there were 18 flamingos, (1/3 \\times 18 = 6) flamingos are taken back. So, they have (18 - 6 = 12) flamingos left in their possession. Then, they paint these 6 flamingos white and put them back out on Sue's front yard. Now, Sue has the original 12 pink flamingos plus the 6 new white ones. Thus, by the end of Saturday, Sue has (12 + 6 = 18) pink flamingos and 6 white flamingos.",
|
||||
"On Sunday, the neighbors add another 18 pink plastic flamingos to Sue's front yard. By the end of Sunday morning, Sue has (18 + 18 = 36) pink flamingos and still 6 white flamingos.",
|
||||
"To find the difference, subtract the number of white flamingos from the number of pink flamingos: (36 - 6 = 30). Therefore, at noon on Sunday, there were 30 more pink plastic flamingos out than white plastic flamingos. The answer is (\\boxed{30}).",
|
||||
],
|
||||
}
|
||||
answer = "<extra_0>".join(data["response"]) + "<extra_0>"
|
||||
prompt = f"<im_start>system\n{data['system']}<im_end>\n<im_start>user\n{data['query']}<im_end>\n<im_start>assistant\n{answer}<im_end><|endoftext|>"
|
||||
return [prompt]
|
||||
|
||||
|
||||
def step_reward_patch_hf_model(hf_model: HfRunner):
|
||||
# Patch the hf_runner to use the step reward function
|
||||
def make_step_rewards(
|
||||
logits: torch.Tensor, token_masks: torch.Tensor
|
||||
) -> list[list[float]]:
|
||||
probabilities = F.softmax(logits, dim=-1)
|
||||
probabilities = probabilities * token_masks.unsqueeze(-1)
|
||||
|
||||
all_scores_res: list[list[float]] = []
|
||||
for i in range(probabilities.size(0)):
|
||||
sample = probabilities[i] # seq_len, num_labels
|
||||
positive_probs = sample[sample != 0].view(-1, 2)
|
||||
non_zero_elements_list = positive_probs.cpu().tolist()
|
||||
all_scores_res.append(non_zero_elements_list)
|
||||
return all_scores_res
|
||||
|
||||
def reward(prompts: list[str]) -> list[list[float]]:
|
||||
input_ids = hf_model.tokenizer(prompts, return_tensors="pt").input_ids
|
||||
input_ids = hf_model.wrap_device(input_ids)
|
||||
outputs = hf_model.model(input_ids=input_ids)
|
||||
|
||||
step_sep_id = hf_model.tokenizer.encode("<extra_0>")[0]
|
||||
token_masks = input_ids == step_sep_id
|
||||
return make_step_rewards(outputs[0], token_masks)
|
||||
|
||||
hf_model.reward = reward # type: ignore[attr-defined]
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param(
|
||||
"Qwen/Qwen2.5-Math-PRM-7B",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_prm_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
math_step_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
check_transformers_version(
|
||||
"Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
|
||||
)
|
||||
|
||||
if current_platform.is_cpu():
|
||||
pytest.skip("CPU only supports V1")
|
||||
|
||||
with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.reward(math_step_prompts)
|
||||
|
||||
with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
|
||||
hf_model = step_reward_patch_hf_model(hf_model)
|
||||
hf_outputs = hf_model.reward(math_step_prompts)
|
||||
|
||||
# check logits difference
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output).float()
|
||||
vllm_output = torch.tensor(vllm_output).float()
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output, 1.5e-2)
|
||||
169
tests/models/language/pooling/test_scoring.py
Normal file
169
tests/models/language/pooling/test_scoring.py
Normal file
@@ -0,0 +1,169 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
CROSS_ENCODER_MODELS = [
|
||||
"cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert
|
||||
"BAAI/bge-reranker-v2-m3", # Roberta
|
||||
]
|
||||
|
||||
EMBEDDING_MODELS = [
|
||||
"sentence-transformers/all-MiniLM-L12-v2",
|
||||
]
|
||||
|
||||
TEXTS_1 = [
|
||||
"What is the capital of France?",
|
||||
"What is the capital of Germany?",
|
||||
]
|
||||
|
||||
TEXTS_2 = [
|
||||
"The capital of France is Paris.",
|
||||
"The capital of Germany is Berlin.",
|
||||
]
|
||||
|
||||
DTYPE = "half"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=CROSS_ENCODER_MODELS)
|
||||
def model_name(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
|
||||
text_pair = [TEXTS_1[0], TEXTS_2[0]]
|
||||
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict([text_pair]).tolist()
|
||||
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
|
||||
|
||||
assert len(vllm_outputs) == 1
|
||||
assert len(hf_outputs) == 1
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
|
||||
|
||||
|
||||
def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
|
||||
text_pairs = [
|
||||
[TEXTS_1[0], TEXTS_2[0]],
|
||||
[TEXTS_1[0], TEXTS_2[1]],
|
||||
]
|
||||
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict(text_pairs).tolist()
|
||||
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
assert len(hf_outputs) == 2
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
|
||||
|
||||
|
||||
def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
|
||||
text_pairs = [
|
||||
[TEXTS_1[0], TEXTS_2[0]],
|
||||
[TEXTS_1[1], TEXTS_2[1]],
|
||||
]
|
||||
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict(text_pairs).tolist()
|
||||
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
assert len(hf_outputs) == 2
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
|
||||
def emb_model_name(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
|
||||
text_pair = [TEXTS_1[0], TEXTS_2[0]]
|
||||
|
||||
with hf_runner(
|
||||
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
|
||||
) as hf_model:
|
||||
hf_embeddings = hf_model.encode(text_pair)
|
||||
hf_outputs = [F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)]
|
||||
|
||||
with vllm_runner(
|
||||
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
|
||||
|
||||
assert len(vllm_outputs) == 1
|
||||
assert len(hf_outputs) == 1
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
|
||||
|
||||
|
||||
def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
|
||||
text_pairs = [
|
||||
[TEXTS_1[0], TEXTS_2[0]],
|
||||
[TEXTS_1[0], TEXTS_2[1]],
|
||||
]
|
||||
|
||||
with hf_runner(
|
||||
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
|
||||
) as hf_model:
|
||||
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
|
||||
hf_outputs = [
|
||||
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
|
||||
for pair in hf_embeddings
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
assert len(hf_outputs) == 2
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
|
||||
|
||||
|
||||
def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
|
||||
text_pairs = [
|
||||
[TEXTS_1[0], TEXTS_2[0]],
|
||||
[TEXTS_1[1], TEXTS_2[1]],
|
||||
]
|
||||
|
||||
with hf_runner(
|
||||
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
|
||||
) as hf_model:
|
||||
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
|
||||
hf_outputs = [
|
||||
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
|
||||
for pair in hf_embeddings
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
assert len(hf_outputs) == 2
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
|
||||
89
tests/models/language/pooling/test_splade_sparse_pooler.py
Normal file
89
tests/models/language/pooling/test_splade_sparse_pooler.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import types
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.model_executor.models.bert import (
|
||||
BertMLMHead,
|
||||
SPLADESparsePooler,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Functional test: SPLADE formula correctness (no HF download needed)
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("B,T,H,V", [(2, 3, 5, 7)])
|
||||
@torch.inference_mode
|
||||
def test_splade_pooler_matches_reference_formula(B, T, H, V):
|
||||
"""Ensure SPLADESparsePooler forward() matches the mathematical formula:
|
||||
log1p(relu(logits)) -> max over sequence length (after masking)."""
|
||||
torch.manual_seed(0)
|
||||
|
||||
# Prepare [B] sequences of shape [T, H]
|
||||
hs_list = [torch.randn(T, H) for _ in range(B)]
|
||||
hs_tenser = torch.cat(hs_list)
|
||||
|
||||
# Simulate PoolingMetadata (only required fields)
|
||||
prompt_lens = [T, T - 1]
|
||||
prompt_lens_tenser = torch.tensor(prompt_lens, dtype=torch.int32)
|
||||
token_ids = torch.tensor(
|
||||
[
|
||||
[101, 5, 102], # Batch 0: [CLS], token, [SEP]
|
||||
[101, 6, 6], # Batch 1: [CLS], token, token (last token ignored)
|
||||
],
|
||||
dtype=torch.long,
|
||||
)
|
||||
meta = types.SimpleNamespace(
|
||||
prompt_lens=prompt_lens_tenser, prompt_token_ids=token_ids
|
||||
)
|
||||
|
||||
# MLM head (prefer BertMLMHead, fallback to Linear if unavailable)
|
||||
try:
|
||||
mlm_head = BertMLMHead(hidden_size=H, vocab_size=V, layer_norm_eps=1e-12)
|
||||
except Exception:
|
||||
mlm_head = nn.Linear(H, V, bias=True)
|
||||
|
||||
# Forward pass through SPLADE pooler
|
||||
pooler = SPLADESparsePooler(mlm_head=mlm_head, pooling="max", remove_cls_sep=True)
|
||||
pooled = pooler(hidden_states=hs_tenser, pooling_metadata=meta) # list of [V]
|
||||
|
||||
# Basic output checks
|
||||
assert isinstance(pooled, torch.Tensor) and len(pooled) == B
|
||||
for vec in pooled:
|
||||
assert vec.shape == (V,)
|
||||
assert torch.isfinite(vec).all()
|
||||
assert (vec >= 0).all(), "SPLADE outputs must be non-negative."
|
||||
|
||||
# Reference implementation for comparison
|
||||
def ref_one(hs: torch.Tensor, L: int, tid_row: torch.Tensor) -> torch.Tensor:
|
||||
keep = torch.ones(L, dtype=torch.bool)
|
||||
if L > 0 and tid_row[0].item() == 101: # remove CLS
|
||||
keep[0] = False
|
||||
if L > 0 and tid_row[L - 1].item() == 102: # remove SEP
|
||||
keep[L - 1] = False
|
||||
|
||||
valid = hs[:L][keep[:L]]
|
||||
if valid.numel() == 0:
|
||||
return torch.zeros(V, dtype=torch.float32)
|
||||
|
||||
logits = mlm_head(valid) # [L', V]
|
||||
scores = torch.log1p(torch.relu(logits)) # [L', V]
|
||||
return scores.max(dim=0).values.to(torch.float32)
|
||||
|
||||
torch.testing.assert_close(
|
||||
pooled[0],
|
||||
ref_one(hs_list[0], prompt_lens[0], token_ids[0]),
|
||||
rtol=1e-4,
|
||||
atol=1e-4,
|
||||
)
|
||||
torch.testing.assert_close(
|
||||
pooled[1],
|
||||
ref_one(hs_list[1], prompt_lens[1], token_ids[1]),
|
||||
rtol=1e-4,
|
||||
atol=1e-4,
|
||||
)
|
||||
101
tests/models/language/pooling/test_token_classification.py
Normal file
101
tests/models/language/pooling/test_token_classification.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForTokenClassification
|
||||
|
||||
from tests.models.utils import softmax
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
|
||||
# The float32 is required for this tiny model to pass the test.
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@torch.inference_mode
|
||||
def test_bert_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.token_classify(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
|
||||
) as hf_model:
|
||||
tokenizer = hf_model.tokenizer
|
||||
hf_outputs = []
|
||||
for prompt in example_prompts:
|
||||
inputs = tokenizer([prompt], return_tensors="pt")
|
||||
inputs = hf_model.wrap_device(inputs)
|
||||
output = hf_model.model(**inputs)
|
||||
hf_outputs.append(softmax(output.logits[0]))
|
||||
|
||||
# check logits difference
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output).cpu().float()
|
||||
vllm_output = torch.tensor(vllm_output).cpu().float()
|
||||
assert torch.allclose(hf_output, vllm_output, 1e-2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@torch.inference_mode
|
||||
def test_modernbert_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.token_classify(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
|
||||
) as hf_model:
|
||||
tokenizer = hf_model.tokenizer
|
||||
hf_outputs = []
|
||||
for prompt in example_prompts:
|
||||
inputs = tokenizer([prompt], return_tensors="pt")
|
||||
inputs = hf_model.wrap_device(inputs)
|
||||
output = hf_model.model(**inputs)
|
||||
hf_outputs.append(softmax(output.logits[0]))
|
||||
|
||||
# check logits difference
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output).cpu().float()
|
||||
vllm_output = torch.tensor(vllm_output).cpu().float()
|
||||
assert torch.allclose(hf_output, vllm_output, atol=1e-2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@torch.inference_mode
|
||||
def test_auto_conversion(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.token_classify(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
|
||||
) as hf_model:
|
||||
tokenizer = hf_model.tokenizer
|
||||
hf_outputs = []
|
||||
for prompt in example_prompts:
|
||||
inputs = tokenizer([prompt], return_tensors="pt")
|
||||
inputs = hf_model.wrap_device(inputs)
|
||||
output = hf_model.model(**inputs)
|
||||
hf_outputs.append(softmax(output.logits[0]))
|
||||
|
||||
# check logits difference
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output).cpu().float()
|
||||
vllm_output = torch.tensor(vllm_output).cpu().float()
|
||||
assert torch.allclose(hf_output, vllm_output, atol=1e-2)
|
||||
76
tests/models/language/pooling/test_truncation_control.py
Normal file
76
tests/models/language/pooling/test_truncation_control.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
max_model_len = 128
|
||||
|
||||
input_str = """Immerse yourself in the enchanting chronicle of calculus, a
|
||||
mathematical domain that has radically transformed our comprehension of
|
||||
change and motion. Despite its roots in ancient civilizations, the
|
||||
formal birth of calculus predominantly occurred in the 17th century,
|
||||
primarily under the influential guidance of Sir Isaac Newton and Gottfried
|
||||
Wilhelm Leibniz. The earliest traces of calculus concepts are found in
|
||||
ancient Greek mathematics,most notably in the works of Eudoxus and
|
||||
Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
|
||||
technique for computing areas and volumes through the use of finite sums.
|
||||
This methodology laid crucial foundational work for integral calculus.
|
||||
In the 17th century, both Newton and Leibniz independently pioneered
|
||||
calculus, each contributing unique perspectives that would shape this new
|
||||
field."""
|
||||
|
||||
|
||||
def test_smaller_truncation_size(
|
||||
vllm_runner, model_name=MODEL_NAME, input_str=input_str
|
||||
):
|
||||
truncate_prompt_tokens = 10
|
||||
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", max_model_len=max_model_len
|
||||
) as vllm_model:
|
||||
vllm_output = vllm_model.llm.embed(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens
|
||||
)
|
||||
|
||||
prompt_tokens = vllm_output[0].prompt_token_ids
|
||||
|
||||
assert len(prompt_tokens) == truncate_prompt_tokens
|
||||
|
||||
|
||||
def test_max_truncation_size(vllm_runner, model_name=MODEL_NAME, input_str=input_str):
|
||||
truncate_prompt_tokens = -1
|
||||
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", max_model_len=max_model_len
|
||||
) as vllm_model:
|
||||
vllm_output = vllm_model.llm.embed(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens
|
||||
)
|
||||
|
||||
prompt_tokens = vllm_output[0].prompt_token_ids
|
||||
|
||||
assert len(prompt_tokens) == max_model_len
|
||||
|
||||
|
||||
def test_bigger_truncation_size(
|
||||
vllm_runner, model_name=MODEL_NAME, input_str=input_str
|
||||
):
|
||||
truncate_prompt_tokens = max_model_len + 1
|
||||
|
||||
with (
|
||||
pytest.raises(ValueError),
|
||||
vllm_runner(
|
||||
model_name, runner="pooling", max_model_len=max_model_len
|
||||
) as vllm_model,
|
||||
):
|
||||
llm_output = vllm_model.llm.embed(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens
|
||||
)
|
||||
|
||||
assert (
|
||||
llm_output
|
||||
== f"""truncate_prompt_tokens value
|
||||
({truncate_prompt_tokens}) is greater than
|
||||
max_model_len ({max_model_len}). Please, select
|
||||
a smaller truncation size."""
|
||||
)
|
||||
0
tests/models/language/pooling_mteb_test/__init__.py
Normal file
0
tests/models/language/pooling_mteb_test/__init__.py
Normal file
415
tests/models/language/pooling_mteb_test/mteb_utils.py
Normal file
415
tests/models/language/pooling_mteb_test/mteb_utils.py
Normal file
@@ -0,0 +1,415 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import tempfile
|
||||
|
||||
import mteb
|
||||
import numpy as np
|
||||
import requests
|
||||
import torch
|
||||
from mteb.models import ModelMeta
|
||||
from mteb.types import Array
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import tests.ci_envs as ci_envs
|
||||
from tests.models.utils import (
|
||||
EmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
check_embeddings_close,
|
||||
get_vllm_extra_kwargs,
|
||||
)
|
||||
|
||||
# Most embedding models on the STS12 task (See #17175):
|
||||
# - Model implementation and minor changes in tensor dtype
|
||||
# results in differences less than 1e-4
|
||||
# - Different model results in differences more than 1e-3
|
||||
# 1e-4 is a good tolerance threshold
|
||||
MTEB_EMBED_TASKS = ["STS12"]
|
||||
MTEB_EMBED_TOL = 1e-4
|
||||
|
||||
# See #19344
|
||||
MTEB_RERANK_TASKS = ["NFCorpus"]
|
||||
MTEB_RERANK_LANGS = ["eng"]
|
||||
MTEB_RERANK_TOL = 2e-3
|
||||
|
||||
_empty_model_meta = ModelMeta(
|
||||
loader=None,
|
||||
name="vllm/model",
|
||||
revision="1",
|
||||
release_date=None,
|
||||
languages=None,
|
||||
framework=[],
|
||||
similarity_fn_name=None,
|
||||
n_parameters=None,
|
||||
memory_usage_mb=None,
|
||||
max_tokens=None,
|
||||
embed_dim=None,
|
||||
license=None,
|
||||
open_weights=None,
|
||||
public_training_code=None,
|
||||
public_training_data=None,
|
||||
use_instructions=None,
|
||||
training_datasets=None,
|
||||
modalities=["text"], # 'image' can be added to evaluate multimodal models
|
||||
)
|
||||
|
||||
|
||||
class VllmMtebEncoder(mteb.EncoderProtocol):
|
||||
mteb_model_meta = _empty_model_meta
|
||||
|
||||
def __init__(self, vllm_model):
|
||||
self.llm = vllm_model
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
|
||||
def encode(
|
||||
self,
|
||||
inputs: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
# Hoping to discover potential scheduling
|
||||
# issues by randomizing the order.
|
||||
sentences = [text for batch in inputs for text in batch["text"]]
|
||||
r = self.rng.permutation(len(sentences))
|
||||
sentences = [sentences[i] for i in r]
|
||||
outputs = self.llm.embed(sentences, use_tqdm=False)
|
||||
embeds = np.array(outputs)
|
||||
embeds = embeds[np.argsort(r)]
|
||||
return embeds
|
||||
|
||||
def similarity(
|
||||
self,
|
||||
embeddings1: np.ndarray,
|
||||
embeddings2: np.ndarray,
|
||||
) -> np.ndarray:
|
||||
# Cosine similarity
|
||||
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
|
||||
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
|
||||
sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
|
||||
return sim
|
||||
|
||||
def similarity_pairwise(
|
||||
self,
|
||||
embeddings1: Array,
|
||||
embeddings2: Array,
|
||||
) -> Array:
|
||||
# Cosine similarity
|
||||
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
|
||||
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
|
||||
sim = np.sum(embeddings1 * embeddings2, axis=1) / (
|
||||
norm1.flatten() * norm2.flatten()
|
||||
)
|
||||
return sim
|
||||
|
||||
|
||||
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
|
||||
mteb_model_meta = _empty_model_meta
|
||||
|
||||
def __init__(self, vllm_model):
|
||||
self.llm = vllm_model
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
|
||||
def predict(
|
||||
self,
|
||||
inputs1: DataLoader[mteb.types.BatchedInput],
|
||||
inputs2: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
queries = [text for batch in inputs1 for text in batch["text"]]
|
||||
corpus = [text for batch in inputs2 for text in batch["text"]]
|
||||
|
||||
outputs = self.llm.score(
|
||||
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
|
||||
)
|
||||
scores = np.array(outputs)
|
||||
return scores
|
||||
|
||||
|
||||
class OpenAIClientMtebEncoder(VllmMtebEncoder):
|
||||
def __init__(self, model_name: str, client):
|
||||
self.model_name = model_name
|
||||
self.client = client
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
|
||||
def encode(
|
||||
self,
|
||||
inputs: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
# Hoping to discover potential scheduling
|
||||
# issues by randomizing the order.
|
||||
sentences = [text for batch in inputs for text in batch["text"]]
|
||||
r = self.rng.permutation(len(sentences))
|
||||
sentences = [sentences[i] for i in r]
|
||||
|
||||
embeddings = self.client.embeddings.create(
|
||||
model=self.model_name, input=sentences
|
||||
)
|
||||
outputs = [d.embedding for d in embeddings.data]
|
||||
embeds = np.array(outputs)
|
||||
embeds = embeds[np.argsort(r)]
|
||||
return embeds
|
||||
|
||||
|
||||
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
|
||||
mteb_model_meta = _empty_model_meta
|
||||
|
||||
def __init__(self, model_name: str, url):
|
||||
self.model_name = model_name
|
||||
self.url = url
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
|
||||
def predict(
|
||||
self,
|
||||
inputs1: DataLoader[mteb.types.BatchedInput],
|
||||
inputs2: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
queries = [text for batch in inputs1 for text in batch["text"]]
|
||||
full_corpus = [text for batch in inputs2 for text in batch["text"]]
|
||||
|
||||
outputs = []
|
||||
for query, corpus in zip(queries, full_corpus):
|
||||
outputs.append(self.get_score(query, corpus))
|
||||
|
||||
scores = np.array(outputs)
|
||||
return scores
|
||||
|
||||
def get_score(self, query, corpus):
|
||||
response = requests.post(
|
||||
self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"text_1": query,
|
||||
"text_2": corpus,
|
||||
"truncate_prompt_tokens": -1,
|
||||
},
|
||||
).json()
|
||||
return response["data"][0]["score"]
|
||||
|
||||
|
||||
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
|
||||
def get_score(self, query, corpus):
|
||||
response = requests.post(
|
||||
self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"query": query,
|
||||
"documents": [corpus],
|
||||
"truncate_prompt_tokens": -1,
|
||||
},
|
||||
).json()
|
||||
return response["results"][0]["relevance_score"]
|
||||
|
||||
|
||||
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
|
||||
tasks = mteb.get_tasks(tasks=tasks)
|
||||
results = mteb.evaluate(
|
||||
encoder,
|
||||
tasks,
|
||||
cache=None,
|
||||
show_progress_bar=False,
|
||||
)
|
||||
|
||||
main_score = results[0].scores["test"][0]["main_score"]
|
||||
return main_score
|
||||
|
||||
|
||||
def mteb_test_embed_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
atol=MTEB_EMBED_TOL,
|
||||
):
|
||||
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
||||
|
||||
# Test embed_dims, isnan and whether to use normalize
|
||||
example_prompts = ["The chef prepared a delicious meal." * 1000]
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=model_info.max_model_len,
|
||||
**vllm_extra_kwargs,
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
# Confirm whether vllm is using the correct architecture
|
||||
if model_info.architecture:
|
||||
assert model_info.architecture in model_config.architectures
|
||||
|
||||
# Confirm whether vllm uses the correct default_pooling_type, which
|
||||
# relates to whether chunked prefill and prefix caching are enabled
|
||||
assert (
|
||||
model_config._model_info.default_pooling_type
|
||||
== model_info.default_pooling_type
|
||||
)
|
||||
|
||||
vllm_main_score = run_mteb_embed_task(
|
||||
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
|
||||
)
|
||||
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
# Test embedding_size, isnan and whether to use normalize
|
||||
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
|
||||
outputs_tensor = torch.tensor(vllm_outputs)
|
||||
assert not torch.any(torch.isnan(outputs_tensor))
|
||||
embedding_size = model_config.embedding_size
|
||||
assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
|
||||
|
||||
# Accelerate mteb test by setting
|
||||
# SentenceTransformers mteb score to a constant
|
||||
if model_info.mteb_score is None:
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
is_sentence_transformer=True,
|
||||
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
|
||||
) as hf_model:
|
||||
# e.g. setting default parameters for the encode method of hf_runner
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
|
||||
st_dtype = next(hf_model.model.parameters()).dtype
|
||||
|
||||
# Check embeddings close to hf outputs
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
else:
|
||||
st_main_score = model_info.mteb_score
|
||||
st_dtype = "Constant"
|
||||
|
||||
print("Model:", model_info.name)
|
||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
|
||||
print("SentenceTransformers:", st_dtype, st_main_score)
|
||||
print("Difference:", st_main_score - vllm_main_score)
|
||||
|
||||
# We are not concerned that the vllm mteb results are better
|
||||
# than SentenceTransformers, so we only perform one-sided testing.
|
||||
assert st_main_score - vllm_main_score < atol
|
||||
|
||||
|
||||
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
|
||||
with tempfile.TemporaryDirectory() as prediction_folder:
|
||||
bm25s = mteb.get_model("bm25s")
|
||||
eval_splits = ["test"]
|
||||
|
||||
mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
|
||||
tasks=tasks, languages=languages, eval_splits=eval_splits
|
||||
)
|
||||
|
||||
mteb.evaluate(
|
||||
bm25s,
|
||||
mteb_tasks,
|
||||
prediction_folder=prediction_folder,
|
||||
show_progress_bar=False,
|
||||
# don't save results for test runs
|
||||
cache=None,
|
||||
overwrite_strategy="always",
|
||||
)
|
||||
|
||||
second_stage_tasks = []
|
||||
for task in mteb_tasks:
|
||||
second_stage_tasks.append(
|
||||
task.convert_to_reranking(
|
||||
prediction_folder,
|
||||
top_k=10,
|
||||
)
|
||||
)
|
||||
|
||||
results = mteb.evaluate(
|
||||
cross_encoder,
|
||||
second_stage_tasks,
|
||||
show_progress_bar=False,
|
||||
cache=None,
|
||||
)
|
||||
main_score = results[0].scores["test"][0]["main_score"]
|
||||
return main_score
|
||||
|
||||
|
||||
def mteb_test_rerank_models_hf(
|
||||
hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
|
||||
):
|
||||
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
st_main_score = run_mteb_rerank(
|
||||
hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
|
||||
)
|
||||
st_dtype = next(hf_model.model.model.parameters()).dtype
|
||||
return st_main_score, st_dtype
|
||||
|
||||
|
||||
def mteb_test_rerank_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: RerankModelInfo,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
vllm_mteb_encoder=VllmMtebCrossEncoder,
|
||||
atol=MTEB_RERANK_TOL,
|
||||
):
|
||||
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
max_num_seqs=8,
|
||||
**vllm_extra_kwargs,
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
# Confirm whether vllm is using the correct architecture
|
||||
if model_info.architecture:
|
||||
assert model_info.architecture in model_config.architectures
|
||||
|
||||
# Score API is only enabled for num_labels == 1
|
||||
assert model_config.hf_config.num_labels == 1
|
||||
|
||||
# Confirm whether vllm uses the correct default_pooling_type, which
|
||||
# relates to whether chunked prefill and prefix caching are enabled
|
||||
assert (
|
||||
model_config._model_info.default_pooling_type
|
||||
== model_info.default_pooling_type
|
||||
)
|
||||
|
||||
vllm_main_score = run_mteb_rerank(
|
||||
vllm_mteb_encoder(vllm_model),
|
||||
tasks=MTEB_RERANK_TASKS,
|
||||
languages=MTEB_RERANK_LANGS,
|
||||
)
|
||||
vllm_dtype = model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
# Accelerate mteb test by setting
|
||||
# SentenceTransformers mteb score to a constant
|
||||
if model_info.mteb_score is None:
|
||||
st_main_score, st_dtype = mteb_test_rerank_models_hf(
|
||||
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
|
||||
)
|
||||
else:
|
||||
st_main_score = model_info.mteb_score
|
||||
st_dtype = "Constant"
|
||||
|
||||
print("Model:", model_info.name)
|
||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
|
||||
print("SentenceTransformers:", st_dtype, st_main_score)
|
||||
print("Difference:", st_main_score - vllm_main_score)
|
||||
|
||||
# We are not concerned that the vllm mteb results are better
|
||||
# than SentenceTransformers, so we only perform one-sided testing.
|
||||
assert st_main_score - vllm_main_score < atol
|
||||
114
tests/models/language/pooling_mteb_test/test_baai.py
Normal file
114
tests/models/language/pooling_mteb_test/test_baai.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo,
|
||||
EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-base-en",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.779336792,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
########## XLMRobertaModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-m3",
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.787343078,
|
||||
enable_test=True,
|
||||
),
|
||||
########## Qwen2Model
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"BAAI/bge-code-v1",
|
||||
architecture="Qwen2Model",
|
||||
mteb_score=0.75724465,
|
||||
dtype="float32",
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
########## XLMRobertaForSequenceClassification
|
||||
CLSPoolingRerankModelInfo(
|
||||
"BAAI/bge-reranker-base",
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
mteb_score=0.32398,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingRerankModelInfo(
|
||||
"BAAI/bge-reranker-large",
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingRerankModelInfo(
|
||||
"BAAI/bge-reranker-v2-m3",
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(
|
||||
hf_runner, vllm_runner, model_info: RerankModelInfo
|
||||
) -> None:
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
@@ -0,0 +1,145 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
import mteb
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
VllmMtebCrossEncoder,
|
||||
mteb_test_rerank_models,
|
||||
)
|
||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo(
|
||||
"BAAI/bge-reranker-v2-gemma",
|
||||
architecture="GemmaForSequenceClassification",
|
||||
mteb_score=0.33757,
|
||||
hf_overrides={
|
||||
"architectures": ["GemmaForSequenceClassification"],
|
||||
"classifier_from_token": ["Yes"],
|
||||
"method": "no_post_processing",
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
|
||||
|
||||
|
||||
class GemmaRerankerHfRunner(HfRunner):
|
||||
def __init__(
|
||||
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
|
||||
) -> None:
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
|
||||
self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
|
||||
|
||||
@torch.no_grad()
|
||||
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
|
||||
def get_inputs(pairs, tokenizer, prompt=None):
|
||||
if prompt is None:
|
||||
prompt = PROMPT
|
||||
|
||||
sep = "\n"
|
||||
prompt_inputs = tokenizer(
|
||||
prompt, return_tensors=None, add_special_tokens=False
|
||||
)["input_ids"]
|
||||
sep_inputs = tokenizer(sep, return_tensors=None, add_special_tokens=False)[
|
||||
"input_ids"
|
||||
]
|
||||
inputs = []
|
||||
for query, passage in pairs:
|
||||
query_inputs = tokenizer(
|
||||
f"A: {query}",
|
||||
return_tensors=None,
|
||||
add_special_tokens=False,
|
||||
truncation=True,
|
||||
)
|
||||
passage_inputs = tokenizer(
|
||||
f"B: {passage}",
|
||||
return_tensors=None,
|
||||
add_special_tokens=False,
|
||||
truncation=True,
|
||||
)
|
||||
item = tokenizer.prepare_for_model(
|
||||
[tokenizer.bos_token_id] + query_inputs["input_ids"],
|
||||
sep_inputs + passage_inputs["input_ids"],
|
||||
truncation="only_second",
|
||||
padding=False,
|
||||
return_attention_mask=False,
|
||||
return_token_type_ids=False,
|
||||
add_special_tokens=False,
|
||||
)
|
||||
item["input_ids"] = item["input_ids"] + sep_inputs + prompt_inputs
|
||||
item["attention_mask"] = [1] * len(item["input_ids"])
|
||||
inputs.append(item)
|
||||
return tokenizer.pad(
|
||||
inputs,
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
scores = []
|
||||
for query, doc, *_ in prompts:
|
||||
pairs = [(query, doc)]
|
||||
inputs = get_inputs(pairs, self.tokenizer)
|
||||
inputs = inputs.to(self.model.device)
|
||||
_n_tokens = inputs["input_ids"].shape[1]
|
||||
logits = self.model(**inputs, return_dict=True).logits
|
||||
_scores = (
|
||||
logits[:, -1, self.yes_loc]
|
||||
.view(
|
||||
-1,
|
||||
)
|
||||
.float()
|
||||
.sigmoid()
|
||||
)
|
||||
scores.append(_scores[0].item())
|
||||
return torch.Tensor(scores)
|
||||
|
||||
|
||||
class GemmaMtebEncoder(VllmMtebCrossEncoder):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.query_template = "A: {query}\n"
|
||||
self.document_template = "B: {doc}\n{prompt}"
|
||||
|
||||
def predict(
|
||||
self,
|
||||
inputs1: DataLoader[mteb.types.BatchedInput],
|
||||
inputs2: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
queries = [
|
||||
self.query_template.format(query=text)
|
||||
for batch in inputs1
|
||||
for text in batch["text"]
|
||||
]
|
||||
corpus = [
|
||||
self.document_template.format(doc=text, prompt=PROMPT)
|
||||
for batch in inputs2
|
||||
for text in batch["text"]
|
||||
]
|
||||
outputs = self.llm.score(
|
||||
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
|
||||
)
|
||||
scores = np.array(outputs)
|
||||
return scores
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
mteb_test_rerank_models(
|
||||
GemmaRerankerHfRunner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
vllm_mteb_encoder=GemmaMtebEncoder,
|
||||
)
|
||||
@@ -0,0 +1,31 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import (
|
||||
CLSPoolingRerankModelInfo,
|
||||
LASTPoolingRerankModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_rerank_models
|
||||
|
||||
RERANK_MODELS = [
|
||||
CLSPoolingRerankModelInfo(
|
||||
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
|
||||
mteb_score=0.32898,
|
||||
architecture="BertForSequenceClassification",
|
||||
),
|
||||
LASTPoolingRerankModelInfo(
|
||||
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
|
||||
mteb_score=0.25736,
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(
|
||||
hf_runner, vllm_runner, model_info: RerankModelInfo
|
||||
) -> None:
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
129
tests/models/language/pooling_mteb_test/test_gte.py
Normal file
129
tests/models/language/pooling_mteb_test/test_gte.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo,
|
||||
EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-large",
|
||||
mteb_score=0.76807651,
|
||||
architecture="BertModel",
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-base", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-small", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
########### NewModel
|
||||
# These three architectures are almost the same, but not exactly the same.
|
||||
# For example,
|
||||
# - whether to use token_type_embeddings
|
||||
# - whether to use context expansion
|
||||
# So only test one (the most widely used) model
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-multilingual-base",
|
||||
architecture="GteNewModel",
|
||||
mteb_score=0.775074696,
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-base-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-large-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=False,
|
||||
),
|
||||
########### Qwen2ForCausalLM
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||
mteb_score=0.758473459018872,
|
||||
architecture="Qwen2ForCausalLM",
|
||||
enable_test=True,
|
||||
),
|
||||
########## ModernBertModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-modernbert-base",
|
||||
mteb_score=0.748193353,
|
||||
architecture="ModernBertModel",
|
||||
enable_test=True,
|
||||
),
|
||||
########## Qwen3ForCausalLM
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"Qwen/Qwen3-Embedding-0.6B",
|
||||
mteb_score=0.771163695,
|
||||
architecture="Qwen3ForCausalLM",
|
||||
dtype="float32",
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"Qwen/Qwen3-Embedding-4B",
|
||||
architecture="Qwen3ForCausalLM",
|
||||
dtype="float32",
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
CLSPoolingRerankModelInfo(
|
||||
# classifier_pooling: mean
|
||||
"Alibaba-NLP/gte-reranker-modernbert-base",
|
||||
mteb_score=0.33386,
|
||||
architecture="ModernBertForSequenceClassification",
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingRerankModelInfo(
|
||||
"Alibaba-NLP/gte-multilingual-reranker-base",
|
||||
mteb_score=0.33062,
|
||||
architecture="GteNewForSequenceClassification",
|
||||
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(
|
||||
hf_runner, vllm_runner, model_info: RerankModelInfo
|
||||
) -> None:
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
56
tests/models/language/pooling_mteb_test/test_intfloat.py
Normal file
56
tests/models/language/pooling_mteb_test/test_intfloat.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/e5-small",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.742285423,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/e5-base", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/e5-large", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
|
||||
),
|
||||
########## XLMRobertaModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/multilingual-e5-base",
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.779325955,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/multilingual-e5-large",
|
||||
architecture="XLMRobertaModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/multilingual-e5-large-instruct",
|
||||
architecture="XLMRobertaModel",
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
126
tests/models/language/pooling_mteb_test/test_jina.py
Normal file
126
tests/models/language/pooling_mteb_test/test_jina.py
Normal file
@@ -0,0 +1,126 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
check_embeddings_close,
|
||||
correctness_test_embed_models,
|
||||
matryoshka_fy,
|
||||
)
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo,
|
||||
EmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
from vllm import PoolingParams
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
||||
|
||||
EMBEDDING_MODELS = [
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"jinaai/jina-embeddings-v3",
|
||||
mteb_score=0.824413164,
|
||||
architecture="XLMRobertaModel",
|
||||
is_matryoshka=True,
|
||||
dtype="float32",
|
||||
)
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
CLSPoolingRerankModelInfo(
|
||||
"jinaai/jina-reranker-v2-base-multilingual",
|
||||
mteb_score=0.33643,
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
def hf_model_callback(model):
|
||||
model.encode = partial(model.encode, task="text-matching")
|
||||
|
||||
mteb_test_embed_models(
|
||||
hf_runner, vllm_runner, model_info, hf_model_callback=hf_model_callback
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
def hf_model_callback(model):
|
||||
model.encode = partial(model.encode, task="text-matching")
|
||||
|
||||
correctness_test_embed_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
example_prompts,
|
||||
hf_model_callback=hf_model_callback,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(
|
||||
hf_runner, vllm_runner, model_info: RerankModelInfo
|
||||
) -> None:
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("dimensions", [16, 32])
|
||||
def test_matryoshka(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
dtype: str,
|
||||
dimensions: int,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
if not model_info.is_matryoshka:
|
||||
pytest.skip("Model is not matryoshka")
|
||||
|
||||
# ST will strip the input texts, see test_embedding.py
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts, task="text-matching")
|
||||
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", dtype=dtype, max_model_len=None
|
||||
) as vllm_model:
|
||||
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
|
||||
|
||||
matryoshka_dimensions = (
|
||||
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions
|
||||
)
|
||||
assert matryoshka_dimensions is not None
|
||||
|
||||
if dimensions not in matryoshka_dimensions:
|
||||
with pytest.raises(ValueError):
|
||||
vllm_model.embed(
|
||||
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
|
||||
)
|
||||
else:
|
||||
vllm_outputs = vllm_model.embed(
|
||||
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
|
||||
)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
83
tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
Normal file
83
tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
Normal file
@@ -0,0 +1,83 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_rerank_models
|
||||
|
||||
mxbai_rerank_hf_overrides = {
|
||||
"architectures": ["Qwen2ForSequenceClassification"],
|
||||
"classifier_from_token": ["0", "1"],
|
||||
"method": "from_2_way_softmax",
|
||||
}
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo(
|
||||
"mixedbread-ai/mxbai-rerank-base-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
mteb_score=0.273,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingRerankModelInfo(
|
||||
"mixedbread-ai/mxbai-rerank-large-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class MxbaiRerankerHfRunner(HfRunner):
|
||||
def __init__(
|
||||
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
|
||||
) -> None:
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
|
||||
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
|
||||
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
|
||||
|
||||
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
|
||||
def process_inputs(pairs):
|
||||
inputs = self.tokenizer(
|
||||
pairs,
|
||||
padding=False,
|
||||
truncation="longest_first",
|
||||
return_attention_mask=False,
|
||||
)
|
||||
for i, ele in enumerate(inputs["input_ids"]):
|
||||
inputs["input_ids"][i] = ele
|
||||
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
|
||||
for key in inputs:
|
||||
inputs[key] = inputs[key].to(self.model.device)
|
||||
return inputs
|
||||
|
||||
@torch.no_grad()
|
||||
def compute_logits(inputs):
|
||||
logits = self.model(**inputs).logits[:, -1, :]
|
||||
yes_logits = logits[:, self.yes_loc]
|
||||
no_logits = logits[:, self.no_loc]
|
||||
logits = yes_logits - no_logits
|
||||
scores = logits.float().sigmoid()
|
||||
return scores
|
||||
|
||||
scores = []
|
||||
for query, doc, *_ in prompts:
|
||||
pairs = [(query, doc)]
|
||||
inputs = process_inputs(pairs)
|
||||
score = compute_logits(inputs)
|
||||
scores.append(score[0].item())
|
||||
return torch.Tensor(scores)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info)
|
||||
44
tests/models/language/pooling_mteb_test/test_nomic.py
Normal file
44
tests/models/language/pooling_mteb_test/test_nomic.py
Normal file
@@ -0,0 +1,44 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"nomic-ai/nomic-embed-text-v1",
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.737568559,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"nomic-ai/nomic-embed-text-v1.5",
|
||||
architecture="NomicBertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"nomic-ai/nomic-embed-text-v2-moe",
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.715488912,
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
@@ -0,0 +1,99 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
||||
from tests.utils import multi_gpu_test
|
||||
|
||||
from .mteb_utils import mteb_test_rerank_models
|
||||
|
||||
qwen3_reranker_hf_overrides = {
|
||||
"architectures": ["Qwen3ForSequenceClassification"],
|
||||
"classifier_from_token": ["no", "yes"],
|
||||
"is_original_qwen3_reranker": True,
|
||||
}
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo(
|
||||
"Qwen/Qwen3-Reranker-0.6B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
mteb_score=0.25736,
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingRerankModelInfo(
|
||||
"Qwen/Qwen3-Reranker-4B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class Qwen3RerankerHfRunner(HfRunner):
|
||||
def __init__(
|
||||
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
|
||||
) -> None:
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
|
||||
self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
|
||||
self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
|
||||
|
||||
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
|
||||
def process_inputs(pairs):
|
||||
inputs = self.tokenizer(
|
||||
pairs,
|
||||
padding=False,
|
||||
truncation="longest_first",
|
||||
return_attention_mask=False,
|
||||
)
|
||||
for i, ele in enumerate(inputs["input_ids"]):
|
||||
inputs["input_ids"][i] = ele
|
||||
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
|
||||
for key in inputs:
|
||||
inputs[key] = inputs[key].to(self.model.device)
|
||||
return inputs
|
||||
|
||||
@torch.no_grad()
|
||||
def compute_logits(inputs):
|
||||
batch_scores = self.model(**inputs).logits[:, -1, :]
|
||||
true_vector = batch_scores[:, self.token_true_id]
|
||||
false_vector = batch_scores[:, self.token_false_id]
|
||||
batch_scores = torch.stack([false_vector, true_vector], dim=1)
|
||||
batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
|
||||
scores = batch_scores[:, 1].exp()
|
||||
return scores
|
||||
|
||||
scores = []
|
||||
for query, doc, *_ in prompts:
|
||||
pairs = [(query, doc)]
|
||||
inputs = process_inputs(pairs)
|
||||
score = compute_logits(inputs)
|
||||
scores.append(score[0].item())
|
||||
return torch.Tensor(scores)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
assert model_info.architecture == "Qwen3ForSequenceClassification"
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {
|
||||
"tensor_parallel_size": 2,
|
||||
}
|
||||
|
||||
mteb_test_rerank_models(
|
||||
Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs
|
||||
)
|
||||
@@ -0,0 +1,77 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-xs",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
mteb_score=0.714927797,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-s",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m-long",
|
||||
is_matryoshka=False,
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.681146831,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-l",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m-v1.5",
|
||||
is_matryoshka=True,
|
||||
architecture="BertModel",
|
||||
mteb_score=0.649088363,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-l-v2.0",
|
||||
is_matryoshka=True,
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.712258299,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m-v2.0",
|
||||
is_matryoshka=True,
|
||||
architecture="GteModel",
|
||||
mteb_score=0.706622444,
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
33
tests/models/language/pooling_mteb_test/test_st_projector.py
Normal file
33
tests/models/language/pooling_mteb_test/test_st_projector.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
# ST models with projector (Dense) layers
|
||||
ST_PROJECTOR_MODELS = [
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"TencentBAC/Conan-embedding-v1",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.688611955,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"google/embeddinggemma-300m",
|
||||
architecture="Gemma3TextModel",
|
||||
mteb_score=0.7473819294684156,
|
||||
enable_test=True,
|
||||
dtype="float32",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
Reference in New Issue
Block a user