Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,90 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
TOKEN_IDS = [
# Using ID={0, 1, 2, 3} results in NaN values,
# so we add this offset of 1000
[1000],
[1000, 1001],
[1000, 1002, 1001],
[1000, 1003, 1001, 1002],
]
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
pooling_params = [
PoolingParams(),
PoolingParams(),
PoolingParams(),
PoolingParams(),
]
# Multiple PoolingParams should be matched with each prompt
outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed")
assert len(PROMPTS) == len(outputs)
# Exception raised, if the size of params does not match the size of prompts
with pytest.raises(ValueError):
outputs = llm.encode(
PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed"
)
# Single PoolingParams should be applied to every prompt
single_pooling_params = PoolingParams()
outputs = llm.encode(
PROMPTS, pooling_params=single_pooling_params, pooling_task="embed"
)
assert len(PROMPTS) == len(outputs)
# pooling_params is None, default params should be applied
outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
assert len(PROMPTS) == len(outputs)
def test_right_side_truncation(llm: LLM):
# Embeddings models should truncate the end of the prompt
tokenizer = llm.get_tokenizer()
assert tokenizer.truncation_side == "right"

View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
max_model_len = 128
input = """Immerse yourself in the enchanting chronicle of calculus, a
mathematical domain that has radically transformed our comprehension of
change and motion. Despite its roots in ancient civilizations, the
formal birth of calculus predominantly occurred in the 17th century,
primarily under the influential guidance of Sir Isaac Newton and Gottfried
Wilhelm Leibniz. The earliest traces of calculus concepts are found in
ancient Greek mathematics,most notably in the works of Eudoxus and
Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
technique for computing areas and volumes through the use of finite sums.
This methodology laid crucial foundational work for integral calculus.
In the 17th century, both Newton and Leibniz independently pioneered
calculus, each contributing unique perspectives that would shape this new
field."""
@pytest.fixture(scope="module")
def server():
args = [
"--runner",
"pooling",
"--dtype",
"bfloat16",
"--enforce-eager",
"--max-model-len",
str(max_model_len),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
truncation_size = 10
kwargs: dict[str, Any] = {
"model": MODEL_NAME,
"input": input,
"truncate_prompt_tokens": truncation_size,
}
response = await client.post(path="embeddings", cast_to=object, body={**kwargs})
assert response["usage"]["prompt_tokens"] == truncation_size
@pytest.mark.asyncio
async def test_zero_truncation_size(client: openai.AsyncOpenAI):
truncation_size = 0
kwargs: dict[str, Any] = {
"model": MODEL_NAME,
"input": input,
"truncate_prompt_tokens": truncation_size,
}
response = await client.post(path="embeddings", cast_to=object, body={**kwargs})
assert response["usage"]["prompt_tokens"] == truncation_size
@pytest.mark.asyncio
async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
truncation_size = max_model_len + 1
kwargs: dict[str, Any] = {
"model": MODEL_NAME,
"input": input,
"truncate_prompt_tokens": truncation_size,
}
with pytest.raises(openai.BadRequestError) as err:
await client.post(path="embeddings", cast_to=object, body={**kwargs})
assert err.value.status_code == 400
error_details = err.value.response.json()["error"]
assert error_details["type"] == "BadRequestError"
expected_message = (
"truncate_prompt_tokens value is "
"greater than max_model_len."
" Please, select a smaller truncation size."
)
assert error_details["message"] == expected_message
@pytest.mark.asyncio
async def test_max_truncation_size(client: openai.AsyncOpenAI):
truncation_size = -1
kwargs: dict[str, Any] = {
"model": MODEL_NAME,
"input": input,
"truncate_prompt_tokens": truncation_size,
}
response = await client.post(path="embeddings", cast_to=object, body={**kwargs})
assert response["usage"]["prompt_tokens"] == max_model_len

View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
outputs = llm.classify(
prompts,
pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False,
)
return torch.tensor([x.outputs.probs for x in outputs])
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
@pytest.mark.skip_global_cleanup
def test_token_classify(llm: LLM):
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
def test_score_api(llm: LLM):
err_msg = "Score API is only enabled for num_labels == 1."
with pytest.raises(ValueError, match=err_msg):
llm.score("ping", "pong", use_tqdm=False)

View File

@@ -0,0 +1,293 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
import torch
import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
DTYPE = "float32" # Use float32 to avoid NaN issue
@pytest.fixture(scope="module")
def server():
args = [
"--enforce-eager",
"--max-model-len",
"512",
"--dtype",
DTYPE,
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_single_input_classification(server: RemoteOpenAIServer, model_name: str):
input_text = "This product was excellent and exceeded my expectations"
classification_response = requests.post(
server.url_for("classify"),
json={"model": model_name, "input": input_text},
)
classification_response.raise_for_status()
output = ClassificationResponse.model_validate(classification_response.json())
assert output.object == "list"
assert output.model == MODEL_NAME
assert len(output.data) == 1
assert hasattr(output.data[0], "label")
assert hasattr(output.data[0], "probs")
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str):
response = requests.post(
server.url_for("classify"),
json={"model": model_name, "input": "hello", "add_special_tokens": False},
)
response.raise_for_status()
ClassificationResponse.model_validate(response.json())
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str):
input_texts = [
"The product arrived on time and works perfectly",
"I'm very satisfied with my purchase, would buy again",
"The customer service was helpful and resolved my issue quickly",
"This product broke after one week, terrible quality",
"I'm very disappointed with this purchase, complete waste of money",
"The customer service was rude and unhelpful",
]
classification_response = requests.post(
server.url_for("classify"),
json={"model": model_name, "input": input_texts},
)
output = ClassificationResponse.model_validate(classification_response.json())
assert len(output.data) == len(input_texts)
for i, item in enumerate(output.data):
assert item.index == i
assert hasattr(item, "label")
assert hasattr(item, "probs")
assert len(item.probs) == item.num_classes
assert item.label in ["Default", "Spoiled"]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):
long_text = "hello " * 600
classification_response = requests.post(
server.url_for("classify"),
json={"model": model_name, "input": long_text, "truncate_prompt_tokens": 5},
)
classification_response.raise_for_status()
output = ClassificationResponse.model_validate(classification_response.json())
assert len(output.data) == 1
assert output.data[0].index == 0
assert hasattr(output.data[0], "probs")
assert output.usage.prompt_tokens == 5
assert output.usage.total_tokens == 5
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_invalid_truncate_prompt_tokens_error(
server: RemoteOpenAIServer, model_name: str
):
classification_response = requests.post(
server.url_for("classify"),
json={"model": model_name, "input": "test", "truncate_prompt_tokens": 513},
)
error = classification_response.json()
assert classification_response.status_code == 400
assert "truncate_prompt_tokens" in error["error"]["message"]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_empty_input_error(server: RemoteOpenAIServer, model_name: str):
classification_response = requests.post(
server.url_for("classify"),
json={"model": model_name, "input": ""},
)
error = classification_response.json()
assert classification_response.status_code == 400
assert "error" in error
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_batch_classification_empty_list(server: RemoteOpenAIServer, model_name: str):
classification_response = requests.post(
server.url_for("classify"),
json={"model": model_name, "input": []},
)
classification_response.raise_for_status()
output = ClassificationResponse.model_validate(classification_response.json())
assert output.object == "list"
assert isinstance(output.data, list)
assert len(output.data) == 0
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
request_args = {
"model": MODEL_NAME,
"input": "This product was excellent and exceeded my expectations",
}
classification_response = requests.post(
server.url_for("classify"), json=request_args
)
classification_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
classification_output = classification_response.json()
invocation_output = invocation_response.json()
assert classification_output.keys() == invocation_output.keys()
for classification_data, invocation_data in zip(
classification_output["data"], invocation_output["data"]
):
assert classification_data.keys() == invocation_data.keys()
assert classification_data["probs"] == pytest.approx(
invocation_data["probs"], rel=0.01
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
input_text = ["This product was excellent and exceeded my expectations"]
async def get_outputs(use_activation):
response = requests.post(
server.url_for("classify"),
json={
"model": model_name,
"input": input_text,
"use_activation": use_activation,
},
)
outputs = response.json()
return torch.tensor([x["probs"] for x in outputs["data"]])
default = await get_outputs(use_activation=None)
w_activation = await get_outputs(use_activation=True)
wo_activation = await get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_score(server: RemoteOpenAIServer, model_name: str):
# score api is only enabled for num_labels == 1.
response = requests.post(
server.url_for("score"),
json={
"model": model_name,
"text_1": "ping",
"text_2": "pong",
},
)
assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_rerank(server: RemoteOpenAIServer, model_name: str):
# rerank api is only enabled for num_labels == 1.
response = requests.post(
server.url_for("rerank"),
json={
"model": model_name,
"query": "ping",
"documents": ["pong"],
},
)
assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
input_text = "This product was excellent and exceeded my expectations"
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_text,
"encoding_format": "float",
"task": "classify",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 2
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
task = "token_classify"
input_text = ["This product was excellent and exceeded my expectations"]
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 8
assert len(poolings.data[0].data[0]) == 2
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(
server: RemoteOpenAIServer, model_name: str, task: str
):
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": "test",
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(
f"Task {task} is not supported"
)

View File

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import requests
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
MAXIMUM_VIDEOS = 1
TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
HF_OVERRIDES = {
"text_config": {
"architectures": ["Qwen2_5_VLForSequenceClassification"],
},
}
@pytest.fixture(scope="module")
def server_vlm_classify():
args = [
"--runner",
"pooling",
"--max-model-len",
"5000",
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"video": MAXIMUM_VIDEOS}),
]
with RemoteOpenAIServer(
VLM_MODEL_NAME, args, override_hf_configs=HF_OVERRIDES
) as remote_server:
yield remote_server
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_text_only(
server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this text request."},
],
}
]
response = requests.post(
server_vlm_classify.url_for("classify"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = ClassificationResponse.model_validate(response.json())
assert output.object == "list"
assert output.model == model_name
assert len(output.data) == 1
assert len(output.data[0].probs) == 2
assert output.usage.prompt_tokens == 22
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_video_url(
server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this video."},
{"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}},
],
}
]
response = requests.post(
server_vlm_classify.url_for("classify"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = ClassificationResponse.model_validate(response.json())
assert output.object == "list"
assert output.model == model_name
assert len(output.data) == 1
assert len(output.data[0].probs) == 2
assert output.usage.prompt_tokens == 4807

View File

@@ -0,0 +1,47 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import (
MTEB_EMBED_TASKS,
MTEB_EMBED_TOL,
OpenAIClientMtebEncoder,
run_mteb_embed_task,
)
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "intfloat/e5-small"
MAIN_SCORE = 0.7422994752439667
@pytest.fixture(scope="module")
def server():
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
def test_mteb_embed(server):
client = server.get_client()
encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
st_main_score = MAIN_SCORE
print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < MTEB_EMBED_TOL

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
import torch.nn.functional as F
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_token_embed(llm: LLM):
outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
multi_vector = outputs[0].outputs.data
assert multi_vector.shape == (11, 384)
def test_pooling_params(llm: LLM):
def get_outputs(normalize):
outputs = llm.embed(
prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False
)
return torch.tensor([x.outputs.embedding for x in outputs])
default = get_outputs(normalize=None)
w_normal = get_outputs(normalize=True)
wo_normal = get_outputs(normalize=False)
assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
"wo_normal should not use normal."
)
assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
"w_normal should be close to normal(wo_normal)."
)

View File

@@ -0,0 +1,680 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json
import numpy as np
import openai
import pytest
import pytest_asyncio
import requests
import torch
import torch.nn.functional as F
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.platforms import current_platform
from vllm.tokenizers import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
MetadataItem,
binary2tensor,
build_metadata_items,
decode_pooling_output,
)
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
args = [
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
DTYPE,
"--enforce-eager",
"--max-model-len",
"512",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.fixture(scope="module")
def hf_model(hf_runner):
with hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True) as hf_model:
yield hf_model
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
input_texts = [
"The chef prepared a delicious meal.",
]
# test single embedding
embedding_response = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 11
assert embeddings.usage.total_tokens == 11
vllm_outputs = [d.embedding for d in embeddings.data]
run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
embedding_response = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 5
assert embeddings.usage.total_tokens == 5
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
# test list[str]
input_texts = [
"The cat sat on the mat.",
"A feline was resting on a rug.",
"Stars twinkle brightly in the night sky.",
]
embedding_response = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 33
assert embeddings.usage.total_tokens == 33
vllm_outputs = [d.embedding for d in embeddings.data]
run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
# test list[list[int]]
input_tokens = [
[4, 5, 7, 9, 20],
[15, 29, 499],
[24, 24, 24, 24, 24],
[25, 32, 64, 77],
]
embedding_response = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 4
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 17
assert embeddings.usage.total_tokens == 17
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
messages = [
{
"role": "user",
"content": "The cat sat on the mat.",
},
{
"role": "assistant",
"content": "A feline was resting on a rug.",
},
{
"role": "user",
"content": "Stars twinkle brightly in the night sky.",
},
]
chat_response = requests.post(
server.url_for("v1/embeddings"),
json={
"model": model_name,
"messages": messages,
"encoding_format": "float",
},
)
chat_response.raise_for_status()
chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
tokenizer = get_tokenizer(tokenizer_name=model_name)
prompt = tokenizer.apply_chat_template(
messages,
chat_template=DUMMY_CHAT_TEMPLATE,
add_generation_prompt=True,
continue_final_message=False,
tokenize=False,
)
completion_response = await client.embeddings.create(
model=model_name,
input=prompt,
encoding_format="float",
# To be consistent with chat
extra_body={"add_special_tokens": False},
)
completion_embeddings = EmbeddingResponse.model_validate(
completion_response.model_dump(mode="json")
)
assert chat_embeddings.id is not None
assert completion_embeddings.id is not None
assert chat_embeddings.created <= completion_embeddings.created
assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
completion_embeddings.model_dump(exclude={"id", "created"})
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
hf_model, client: openai.AsyncOpenAI, model_name: str
):
input_texts = [
"Hello my name is",
"The best thing about vLLM is that it supports many different models",
]
responses_float = await client.embeddings.create(
input=input_texts, model=model_name, encoding_format="float"
)
float_data = [d.embedding for d in responses_float.data]
run_embedding_correctness_test(hf_model, input_texts, float_data)
responses_base64 = await client.embeddings.create(
input=input_texts, model=model_name, encoding_format="base64"
)
base64_data = []
for data in responses_base64.data:
base64_data.append(
np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()
)
run_embedding_correctness_test(hf_model, input_texts, base64_data)
# Default response is float32 decoded from base64 by OpenAI Client
responses_default = await client.embeddings.create(
input=input_texts, model=model_name
)
default_data = [d.embedding for d in responses_default.data]
run_embedding_correctness_test(hf_model, input_texts, default_data)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
]
responses_float = await client.embeddings.create(
input=input_texts, model=model_name, encoding_format="float"
)
float_data = [d.embedding for d in responses_float.data]
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
for endianness in ENDIANNESS:
responses_base64 = requests.post(
server.url_for("/v1/embeddings"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
base64_data = []
for data in responses_base64.json()["data"]:
binary = base64.b64decode(data["embedding"])
tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
base64_data.append(tensor.to(torch.float32).tolist())
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=base64_data,
name_0="float_data",
name_1="base64_data",
tol=1e-2,
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
]
responses_float = await client.embeddings.create(
input=input_texts, model=model_name, encoding_format="float"
)
float_data = [d.embedding for d in responses_float.data]
for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
for endianness in ENDIANNESS:
responses_bytes = requests.post(
server.url_for("/v1/embeddings"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "bytes",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
metadata = json.loads(responses_bytes.headers["metadata"])
body = responses_bytes.content
items = [MetadataItem(**x) for x in metadata["data"]]
bytes_data = decode_pooling_output(items=items, body=body)
bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=bytes_data,
name_0="float_data",
name_1="bytes_data",
tol=1e-2,
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
] * 2
responses_float = await client.embeddings.create(
input=input_texts, model=model_name, encoding_format="float"
)
float_data = [d.embedding for d in responses_float.data]
embedding_size = len(float_data[0])
for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
for endianness in ENDIANNESS:
responses_bytes = requests.post(
server.url_for("/v1/embeddings"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "bytes_only",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
assert "metadata" not in responses_bytes.headers
body = responses_bytes.content
items = build_metadata_items(
embed_dtype=embed_dtype,
endianness=endianness,
shape=(embedding_size,),
n_request=len(input_texts),
)
bytes_data = decode_pooling_output(items=items, body=body)
bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=bytes_data,
name_0="float_data",
name_1="bytes_data",
tol=1e-2,
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
server: RemoteOpenAIServer, model_name: str, param_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
]
responses_base64 = requests.post(
server.url_for("/v1/embeddings"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
param_name: f"bad_{param_name}",
},
)
assert responses_base64.status_code == 400
assert "literal_error" in responses_base64.json()["error"]["message"]
assert f"bad_{param_name}" in responses_base64.json()["error"]["message"]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
# test single embedding
embedding_response = await client.embeddings.create(
model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
input_tokens = [
1,
24428,
289,
18341,
26165,
285,
19323,
283,
289,
26789,
3871,
28728,
9901,
340,
2229,
385,
340,
315,
28741,
28804,
2,
]
embedding_response = await client.embeddings.create(
model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
client: openai.AsyncOpenAI, model_name: str
):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
with pytest.raises(openai.BadRequestError):
response = await client.embeddings.create(
model=model_name,
input=input_texts,
extra_body={"truncate_prompt_tokens": 8193},
)
assert "error" in response.object
assert (
"truncate_prompt_tokens value is greater than max_model_len. "
"Please, select a smaller truncation size." in response.message
)
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
input_texts = [
"The chef prepared a delicious meal.",
]
request_args = {
"model": MODEL_NAME,
"input": input_texts,
"encoding_format": "float",
}
completion_response = await client.embeddings.create(**request_args)
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
completion_output = completion_response.model_dump()
invocation_output = invocation_response.json()
assert completion_output.keys() == invocation_output.keys()
for completion_data, invocation_data in zip(
completion_output["data"], invocation_output["data"]
):
assert completion_data.keys() == invocation_data.keys()
check_embeddings_close(
embeddings_0_lst=[completion_data["embedding"]],
embeddings_1_lst=[invocation_data["embedding"]],
name_0="completion",
name_1="invocation",
)
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
messages = [
{
"role": "user",
"content": "The cat sat on the mat.",
},
{
"role": "assistant",
"content": "A feline was resting on a rug.",
},
{
"role": "user",
"content": "Stars twinkle brightly in the night sky.",
},
]
request_args = {
"model": MODEL_NAME,
"messages": messages,
"encoding_format": "float",
}
chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
chat_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
chat_output = chat_response.json()
invocation_output = invocation_response.json()
assert chat_output.keys() == invocation_output.keys()
for chat_data, invocation_data in zip(
chat_output["data"], invocation_output["data"]
):
assert chat_data.keys() == invocation_data.keys()
check_embeddings_close(
embeddings_0_lst=[chat_data["embedding"]],
embeddings_1_lst=[invocation_data["embedding"]],
name_0="chat",
name_1="invocation",
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
input_text = ["The chef prepared a delicious meal."]
async def get_outputs(normalize):
request_args = {
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"normalize": normalize,
}
response = requests.post(server.url_for("v1/embeddings"), json=request_args)
outputs = response.json()
return torch.tensor([x["embedding"] for x in outputs["data"]])
default = await get_outputs(normalize=None)
w_normal = await get_outputs(normalize=True)
wo_normal = await get_outputs(normalize=False)
assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
"wo_normal should not use normal."
)
assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
"w_normal should be close to normal(wo_normal)."
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
task = "embed"
input_text = ["The chef prepared a delicious meal."]
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 384
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
task = "token_embed"
input_text = ["The chef prepared a delicious meal."]
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 11
assert len(poolings.data[0].data[0]) == 384
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
server: RemoteOpenAIServer, model_name: str, task: str
):
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": "test",
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(
f"Task {task} is not supported"
)

View File

@@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
"""
import openai
import pytest
from tests.conftest import HfRunner
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
from tests.models.utils import EmbedModelInfo
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
matryoshka_dimensions=[256],
),
]
input_texts = [
"The chef prepared a delicious meal.",
]
@pytest.fixture(scope="module", params=MODELS)
def model_info(request):
return request.param
@pytest.fixture(scope="module", params=["bfloat16"])
def dtype(request):
return request.param
@pytest.fixture(scope="module")
def server(model_info, dtype: str):
args = [
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
dtype,
"--enforce-eager",
"--max-model-len",
"512",
]
if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5":
# Manually enable Matryoshka Embeddings
args.extend(
["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
)
with RemoteOpenAIServer(model_info.name, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def hf_model(hf_runner, model_info, dtype: str):
with hf_runner(
model_info.name, dtype=dtype, is_sentence_transformer=True
) as hf_model:
yield hf_model
@pytest.mark.asyncio
async def test_matryoshka(
model_info: EmbedModelInfo, server: RemoteOpenAIServer, hf_model: HfRunner
):
client = server.get_async_client()
async def make_request_and_correctness_test(dimensions):
prompts = input_texts * 3
embedding_response = await client.embeddings.create(
model=model_info.name,
input=prompts,
dimensions=dimensions,
encoding_format="float",
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) > 0
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens > 0
assert embeddings.usage.total_tokens > 0
if dimensions is not None:
assert len(embeddings.data[0].embedding) == dimensions
vllm_outputs = [d.embedding for d in embeddings.data]
run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)
if model_info.is_matryoshka:
valid_dimensions: list[int | None] = [None]
if model_info.matryoshka_dimensions is not None:
valid_dimensions += model_info.matryoshka_dimensions[:2]
for dimensions in valid_dimensions:
await make_request_and_correctness_test(dimensions)
invalid_dimensions: list[int | None] = [-1]
if model_info.matryoshka_dimensions is not None:
assert 5 not in model_info.matryoshka_dimensions
invalid_dimensions.append(5)
for dimensions in invalid_dimensions:
with pytest.raises(openai.BadRequestError):
await make_request_and_correctness_test(dimensions)
else:
for dimensions in [None]:
await make_request_and_correctness_test(dimensions)
for dimensions in [-1, 16]:
with pytest.raises(openai.BadRequestError):
await make_request_and_correctness_test(dimensions)

View File

@@ -0,0 +1,458 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test cases for long text embedding with automatic chunking mechanism.
This test suite validates vLLM's automatic chunking functionality for handling
text inputs that exceed the model's maximum token length, specifically targeting
the intfloat/multilingual-e5-small model (max token length: 512).
"""
import random
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
def _generate_random_text(word_count: int) -> str:
"""Generate random text with approximately the specified word count."""
# Common English words with focus on verbs and nouns for realistic text
common_words = [
# Essential articles and pronouns (minimal)
"the",
"and",
"you",
"they",
"this",
"that",
"these",
"those",
# Action verbs
"create",
"build",
"develop",
"design",
"implement",
"execute",
"analyze",
"process",
"generate",
"calculate",
"evaluate",
"optimize",
"transform",
"integrate",
"configure",
"deploy",
"monitor",
"manage",
"discover",
"explore",
"investigate",
"research",
"study",
"examine",
"improve",
"enhance",
"upgrade",
"modify",
"update",
"maintain",
"solve",
"resolve",
"handle",
"address",
"tackle",
"overcome",
"communicate",
"collaborate",
"coordinate",
"organize",
"plan",
"achieve",
"accomplish",
"complete",
"finish",
"deliver",
"provide",
# Technology and science nouns
"system",
"application",
"software",
"hardware",
"network",
"database",
"algorithm",
"model",
"framework",
"platform",
"interface",
"protocol",
"architecture",
"infrastructure",
"component",
"module",
"service",
"technology",
"innovation",
"solution",
"methodology",
"approach",
"artificial",
"intelligence",
"machine",
"learning",
"neural",
"network",
"computer",
"processor",
"memory",
"storage",
"computation",
"data",
"information",
"knowledge",
"insight",
"pattern",
"trend",
"analysis",
"research",
"development",
"engineering",
"science",
"mathematics",
"statistics",
"probability",
"optimization",
"performance",
"efficiency",
# General nouns
"project",
"team",
"organization",
"company",
"business",
"industry",
"market",
"customer",
"user",
"client",
"product",
"feature",
"function",
"requirement",
"specification",
"documentation",
"report",
"result",
"outcome",
"impact",
"benefit",
"advantage",
"challenge",
"problem",
"opportunity",
"strategy",
"goal",
"objective",
"target",
"milestone",
"process",
"procedure",
"workflow",
"pipeline",
"operation",
"task",
"activity",
"event",
"session",
"meeting",
"discussion",
"decision",
]
words = []
for _ in range(word_count):
words.append(random.choice(common_words))
# Add some punctuation for more realistic text
text = " ".join(words)
# Add periods every 10-20 words
words_list = text.split()
result = []
for i, word in enumerate(words_list):
result.append(word)
if (i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1:
result[-1] += "."
return " ".join(result)
MODEL_NAME = "intfloat/multilingual-e5-small"
DTYPE = "bfloat16"
# Test text: Generate text with approximately 1500 words to exceed 1024 tokens
LONG_TEXT_1500_WORDS = _generate_random_text(1500)
# Test text: Generate text with approximately 2500 words to exceed 2048 tokens
LONG_TEXT_2500_WORDS = _generate_random_text(2500)
@pytest.fixture(scope="module")
def server_with_chunked_processing():
"""Start server with automatic chunking processing enabled."""
args = [
"--runner",
"pooling",
"--dtype",
DTYPE,
"--enforce-eager",
"--max-model-len",
"512", # Set smaller max_model_len to trigger chunking mechanism
"--pooler-config",
(
'{"pooling_type": "MEAN", "normalize": true, '
'"enable_chunked_processing": true, "max_embed_len": 10000}'
),
"--gpu-memory-utilization",
"0.8",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client_with_chunked_processing(server_with_chunked_processing):
"""Create async client with chunking processing support."""
async with server_with_chunked_processing.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_long_text_embedding_1500_chars(
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
"""Test embedding processing for ~1500 character long text
(~1028 tokens, exceeding 512 token limit)."""
# Verify text length
# Verify text has sufficient word count (approximately 1500 words)
word_count = len(LONG_TEXT_1500_WORDS.split())
assert word_count >= 1400, f"Test text word count insufficient: {word_count} words"
# Send embedding request
embedding_response = await client_with_chunked_processing.embeddings.create(
model=model_name,
input=[LONG_TEXT_1500_WORDS],
encoding_format="float",
)
# Verify response structure
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert (
len(embeddings.data[0].embedding) == 384
) # multilingual-e5-small embedding dimension
assert embeddings.usage.completion_tokens == 0
# Due to chunked processing, token count should
# reflect actual processed tokens
# With ~1500 words, we expect roughly
# 1024+ tokens (exceeding 512 token limit)
# Should exceed single chunk limit of 512
assert embeddings.usage.prompt_tokens > 800
assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
# Verify embedding vector validity
embedding_vector = embeddings.data[0].embedding
assert all(isinstance(x, float) for x in embedding_vector), (
"Embedding vector should contain floats"
)
assert not all(x == 0 for x in embedding_vector), (
"Embedding vector should not be all zeros"
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_long_text_embedding_2500_chars(
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
"""Test embedding processing for ~2500 character long text
(~2048 tokens, requiring multiple chunks)."""
# Verify text length
# Verify text has sufficient word count (approximately 2500 words)
word_count = len(LONG_TEXT_2500_WORDS.split())
assert word_count >= 2300, f"Test text word count insufficient: {word_count} words"
# Send embedding request
embedding_response = await client_with_chunked_processing.embeddings.create(
model=model_name,
input=[LONG_TEXT_2500_WORDS],
encoding_format="float",
)
# Verify response structure
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert (
len(embeddings.data[0].embedding) == 384
) # multilingual-e5-small embedding dimension
assert embeddings.usage.completion_tokens == 0
# Due to chunked processing, token count should
# reflect actual processed tokens
# With ~2500 words, we expect
# roughly 2048+ tokens (requiring multiple chunks)
# Should require multiple chunks for processing
assert embeddings.usage.prompt_tokens > 1500
assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
# Verify embedding vector validity
embedding_vector = embeddings.data[0].embedding
assert all(isinstance(x, float) for x in embedding_vector), (
"Embedding vector should contain floats"
)
assert not all(x == 0 for x in embedding_vector), (
"Embedding vector should not be all zeros"
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_long_text_embedding(
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
"""Test batch long text embedding processing."""
input_texts = [
LONG_TEXT_1500_WORDS,
LONG_TEXT_2500_WORDS,
"This is a short text test.", # Short text for comparison
]
# Send batch embedding request
embedding_response = await client_with_chunked_processing.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
)
# Verify response structure
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 3 # Three input texts
# Verify each embedding dimension
for i, embedding_data in enumerate(embeddings.data):
assert len(embedding_data.embedding) == 384
assert embedding_data.index == i
# Verify embedding vector validity
embedding_vector = embedding_data.embedding
assert all(isinstance(x, float) for x in embedding_vector)
assert not all(x == 0 for x in embedding_vector)
# Verify token usage
assert embeddings.usage.completion_tokens == 0
# Total token count should be very substantial
assert embeddings.usage.prompt_tokens > 1000
assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chunked_vs_normal_consistency(
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
"""Test consistency between chunked and
normal processing (using short text)."""
# Use a short text within the 512 token limit
short_text = (
"Artificial intelligence technology is changing our world, "
"bringing unprecedented opportunities and challenges."
)
# Send embedding request
embedding_response = await client_with_chunked_processing.embeddings.create(
model=model_name,
input=[short_text],
encoding_format="float",
)
# Verify response structure
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
# Short text should not require chunked processing
assert embeddings.usage.prompt_tokens < 512
assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
# 验证embedding向量的有效性
embedding_vector = embeddings.data[0].embedding
assert all(isinstance(x, float) for x in embedding_vector)
assert not all(x == 0 for x in embedding_vector)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chunked_processing_response_format(
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
"""Test response format and structure during chunked processing."""
# Test with long text to trigger chunking
embedding_response = await client_with_chunked_processing.embeddings.create(
model=model_name,
input=[LONG_TEXT_1500_WORDS],
encoding_format="float",
)
# Verify response structure
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json")
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert embeddings.data[0].object == "embedding"
assert embeddings.data[0].index == 0
# Verify embedding vector properties
embedding_vector = embeddings.data[0].embedding
import math
vector_norm = math.sqrt(sum(x * x for x in embedding_vector))
# Check that the vector is normalized
# (default behavior for most embedding models)
assert 0.8 < vector_norm < 1.2, (
f"Vector norm should be reasonable, actual: {vector_norm}"
)

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import requests
from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
MAXIMUM_IMAGES = 2
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja"
assert vlm2vec_jinja_path.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_ASSETS = [
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
]
@pytest.fixture(scope="module")
def server():
args = [
"--runner",
"pooling",
"--max-model-len",
"2048",
"--max-num-seqs",
"5",
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
json.dumps({"image": MAXIMUM_IMAGES}),
"--chat-template",
str(vlm2vec_jinja_path),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS
}
def get_hf_prompt_tokens(model_name, content, image_url):
processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True, num_crops=4
)
placeholder = "<|image_1|> "
prompt = f"{placeholder}{content}"
image = fetch_image(image_url)
# Unwrap MediaWithBytes if present
if isinstance(image, MediaWithBytes):
image = image.media
images = [image]
inputs = processor(prompt, images, return_tensors="pt")
return inputs.input_ids.shape[1]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_image_embedding(
server: RemoteOpenAIServer, model_name: str, image_url: str
):
content_text = "Represent the given image."
messages = [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": content_text},
],
}
]
response = requests.post(
server.url_for("v1/embeddings"),
json={"model": model_name, "messages": messages, "encoding_format": "float"},
)
response.raise_for_status()
embeddings = EmbeddingResponse.model_validate(response.json())
hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 3072
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == hf_prompt_tokens
assert embeddings.usage.total_tokens == hf_prompt_tokens

View File

@@ -0,0 +1,516 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json
import numpy as np
import pytest
import requests
import torch
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.tokenizers import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
MetadataItem,
binary2tensor,
build_metadata_items,
decode_pooling_output,
)
MODEL_NAME = "internlm/internlm2-1_8b-reward"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module")
def server():
args = [
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--enforce-eager",
"--max-model-len",
"512",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
"--trust-remote-code",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
input_texts = [
"The chef prepared a delicious meal.",
]
# test single pooling
response = requests.post(
server.url_for("pooling"),
json={"model": model_name, "input": input_texts, "encoding_format": "float"},
)
response.raise_for_status()
poolings = PoolingResponse.model_validate(response.json())
assert poolings.id is not None
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 8
assert poolings.usage.completion_tokens == 0
assert poolings.usage.prompt_tokens == 8
assert poolings.usage.total_tokens == 8
# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
response = requests.post(
server.url_for("pooling"),
json={"model": model_name, "input": input_tokens, "encoding_format": "float"},
)
response.raise_for_status()
poolings = PoolingResponse.model_validate(response.json())
assert poolings.id is not None
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 5
assert poolings.usage.completion_tokens == 0
assert poolings.usage.prompt_tokens == 5
assert poolings.usage.total_tokens == 5
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
# test list[str]
input_texts = [
"The cat sat on the mat.",
"A feline was resting on a rug.",
"Stars twinkle brightly in the night sky.",
]
response = requests.post(
server.url_for("pooling"),
json={"model": model_name, "input": input_texts, "encoding_format": "float"},
)
response.raise_for_status()
poolings = PoolingResponse.model_validate(response.json())
assert poolings.id is not None
assert len(poolings.data) == 3
assert len(poolings.data[0].data) == 8
assert poolings.usage.completion_tokens == 0
assert poolings.usage.prompt_tokens == 29
assert poolings.usage.total_tokens == 29
# test list[list[int]]
input_tokens = [
[4, 5, 7, 9, 20],
[15, 29, 499],
[24, 24, 24, 24, 24],
[25, 32, 64, 77],
]
response = requests.post(
server.url_for("pooling"),
json={"model": model_name, "input": input_tokens, "encoding_format": "float"},
)
response.raise_for_status()
poolings = PoolingResponse.model_validate(response.json())
assert poolings.id is not None
assert len(poolings.data) == 4
assert len(poolings.data[0].data) == 5
assert poolings.usage.completion_tokens == 0
assert poolings.usage.prompt_tokens == 17
assert poolings.usage.total_tokens == 17
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str):
messages = [
{
"role": "user",
"content": "The cat sat on the mat.",
},
{
"role": "assistant",
"content": "A feline was resting on a rug.",
},
{
"role": "user",
"content": "Stars twinkle brightly in the night sky.",
},
]
chat_response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"messages": messages,
"encoding_format": "float",
},
)
chat_response.raise_for_status()
chat_poolings = PoolingResponse.model_validate(chat_response.json())
tokenizer = get_tokenizer(tokenizer_name=model_name, trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
messages,
chat_template=DUMMY_CHAT_TEMPLATE,
add_generation_prompt=True,
continue_final_message=False,
tokenize=False,
)
completions_response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": prompt,
"encoding_format": "float",
# To be consistent with chat
"add_special_tokens": False,
},
)
completions_response.raise_for_status()
completion_poolings = PoolingResponse.model_validate(completions_response.json())
assert chat_poolings.id is not None
assert completion_poolings.id is not None
assert chat_poolings.created <= completion_poolings.created
assert chat_poolings.model_dump(exclude={"id", "created"}) == (
completion_poolings.model_dump(exclude={"id", "created"})
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_pooling(server: RemoteOpenAIServer, model_name: str):
input_texts = [
"Hello my name is",
"The best thing about vLLM is that it supports many different models",
]
float_response = requests.post(
server.url_for("pooling"),
json={
"input": input_texts,
"model": model_name,
"encoding_format": "float",
},
)
float_response.raise_for_status()
responses_float = PoolingResponse.model_validate(float_response.json())
float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
base64_response = requests.post(
server.url_for("pooling"),
json={
"input": input_texts,
"model": model_name,
"encoding_format": "base64",
},
)
base64_response.raise_for_status()
responses_base64 = PoolingResponse.model_validate(base64_response.json())
decoded_responses_base64_data = []
for data in responses_base64.data:
decoded_responses_base64_data.append(
np.frombuffer(base64.b64decode(data.data), dtype="float32").tolist()
)
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=decoded_responses_base64_data,
name_0="float32",
name_1="base64",
)
# Default response is float32 decoded from base64 by OpenAI Client
default_response = requests.post(
server.url_for("pooling"),
json={
"input": input_texts,
"model": model_name,
},
)
default_response.raise_for_status()
responses_default = PoolingResponse.model_validate(default_response.json())
default_data = [
np.array(d.data).squeeze(-1).tolist() for d in responses_default.data
]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=default_data,
name_0="float32",
name_1="default",
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
server: RemoteOpenAIServer, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
]
url = server.url_for("pooling")
float_response = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "float",
},
)
responses_float = PoolingResponse.model_validate(float_response.json())
float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
for endianness in ENDIANNESS:
responses_base64 = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
base64_data = []
for data in responses_base64.json()["data"]:
binary = base64.b64decode(data["data"])
tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
base64_data.append(tensor.to(torch.float32).tolist())
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=base64_data,
name_0="float_data",
name_1="base64_data",
tol=1e-2,
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
server: RemoteOpenAIServer, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
]
url = server.url_for("pooling")
float_response = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "float",
},
)
responses_float = PoolingResponse.model_validate(float_response.json())
float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
for endianness in ENDIANNESS:
responses_bytes = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "bytes",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
metadata = json.loads(responses_bytes.headers["metadata"])
body = responses_bytes.content
items = [MetadataItem(**x) for x in metadata["data"]]
bytes_data = decode_pooling_output(items=items, body=body)
bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=bytes_data,
name_0="float_data",
name_1="bytes_data",
tol=1e-2,
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
server: RemoteOpenAIServer, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
] * 2
url = server.url_for("pooling")
float_response = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "float",
},
)
responses_float = PoolingResponse.model_validate(float_response.json())
float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
n_tokens = responses_float.usage.prompt_tokens // len(input_texts)
for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
for endianness in ENDIANNESS:
responses_bytes = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "bytes_only",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
assert "metadata" not in responses_bytes.headers
body = responses_bytes.content
items = build_metadata_items(
embed_dtype=embed_dtype,
endianness=endianness,
shape=(n_tokens, 1),
n_request=len(input_texts),
)
bytes_data = decode_pooling_output(items=items, body=body)
bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=bytes_data,
name_0="float_data",
name_1="bytes_data",
tol=1e-2,
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
server: RemoteOpenAIServer, model_name: str, param_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
]
responses_base64 = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
param_name: f"bad_{param_name}",
},
)
assert responses_base64.status_code == 400
assert "literal_error" in responses_base64.json()["error"]["message"]
assert f"bad_{param_name}" in responses_base64.json()["error"]["message"]
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
input_texts = [
"The chef prepared a delicious meal.",
]
request_args = {
"model": MODEL_NAME,
"input": input_texts,
"encoding_format": "float",
}
completion_response = requests.post(server.url_for("pooling"), json=request_args)
completion_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
completion_output = completion_response.json()
invocation_output = invocation_response.json()
assert completion_output.keys() == invocation_output.keys()
for completion_data, invocation_data in zip(
completion_output["data"], invocation_output["data"]
):
assert completion_data.keys() == invocation_data.keys()
check_embeddings_close(
embeddings_0_lst=completion_data["data"],
embeddings_1_lst=invocation_data["data"],
name_0="completion",
name_1="invocation",
)
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
messages = [
{
"role": "user",
"content": "The cat sat on the mat.",
},
{
"role": "assistant",
"content": "A feline was resting on a rug.",
},
{
"role": "user",
"content": "Stars twinkle brightly in the night sky.",
},
]
request_args = {
"model": MODEL_NAME,
"messages": messages,
"encoding_format": "float",
}
chat_response = requests.post(server.url_for("pooling"), json=request_args)
chat_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
chat_output = chat_response.json()
invocation_output = invocation_response.json()
assert chat_output.keys() == invocation_output.keys()
for chat_data, invocation_data in zip(
chat_output["data"], invocation_output["data"]
):
assert chat_data.keys() == invocation_data.keys()
check_embeddings_close(
embeddings_0_lst=chat_data["data"],
embeddings_1_lst=invocation_data["data"],
name_0="chat",
name_1="invocation",
)

View File

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
MODEL_NAME = "internlm/internlm2-1_8b-reward"
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
trust_remote_code=True,
seed=0,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_config(llm: LLM):
vllm_config = llm.llm_engine.vllm_config
assert vllm_config.cache_config.enable_prefix_caching
assert vllm_config.scheduler_config.enable_chunked_prefill
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
outputs = llm.reward(
prompts,
pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False,
)
return torch.cat([x.outputs.data for x in outputs])
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)

View File

@@ -0,0 +1,62 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import (
MTEB_RERANK_LANGS,
MTEB_RERANK_TASKS,
MTEB_RERANK_TOL,
RerankClientMtebEncoder,
ScoreClientMtebEncoder,
run_mteb_rerank,
)
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
st_main_score = 0.33457
@pytest.fixture(scope="module")
def server():
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
def test_mteb_score(server):
url = server.url_for("score")
encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS)
print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
def test_mteb_rerank(server):
url = server.url_for("rerank")
encoder = RerankClientMtebEncoder(MODEL_NAME, url)
vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS)
print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < MTEB_RERANK_TOL

View File

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."
outputs = llm.score(
text_1,
text_2,
pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False,
)
return torch.tensor([x.outputs.score for x in outputs])
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)

View File

@@ -0,0 +1,224 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
import torch
import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": model_name,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_top_n(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Cross-encoder models are neat",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": model_name, "query": query, "documents": documents, "top_n": 2},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?" * 100
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": model_name, "query": query, "documents": documents},
)
assert rerank_response.status_code == 400
# Assert just a small fragments of the response
assert "Please reduce the length of the input." in rerank_response.text
def test_invocations(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
request_args = {
"model": MODEL_NAME,
"query": query,
"documents": documents,
}
rerank_response = requests.post(server.url_for("rerank"), json=request_args)
rerank_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
rerank_output = rerank_response.json()
invocation_output = invocation_response.json()
assert rerank_output.keys() == invocation_output.keys()
for rerank_result, invocations_result in zip(
rerank_output["results"], invocation_output["results"]
):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.05
)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
async def get_outputs(use_activation):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
response = requests.post(
server.url_for("rerank"),
json={
"model": model_name,
"query": query,
"documents": documents,
"use_activation": use_activation,
},
)
outputs = response.json()
return torch.tensor([x["relevance_score"] for x in outputs["results"]])
default = await get_outputs(use_activation=None)
w_activation = await get_outputs(use_activation=True)
wo_activation = await get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
input_text = "This product was excellent and exceeded my expectations"
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_text,
"encoding_format": "float",
"task": "classify",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
input_text = ["The chef prepared a delicious meal."]
response = requests.post(
server.url_for("pooling"),
json={"model": model_name, "input": input_text, "encoding_format": "float"},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 11
assert len(poolings.data[0].data[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(
server: RemoteOpenAIServer, model_name: str, task: str
):
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": "test",
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(
f"Task {task} is not supported"
)

View File

@@ -0,0 +1,265 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import requests
import torch
import torch.nn.functional as F
from torch import tensor
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
]
DTYPE = "half"
def run_transformers(hf_model, model, text_pairs):
if model["is_cross_encoder"]:
return hf_model.predict(text_pairs).tolist()
else:
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
return [
F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0)
for pair in hf_embeddings
]
@pytest.fixture(scope="class", params=MODELS)
def model(request):
yield request.param
@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
with RemoteOpenAIServer(model["name"], args) as remote_server:
yield remote_server
@pytest.fixture(scope="class")
def runner(model: dict[str, Any], hf_runner):
kwargs = {
"dtype": DTYPE,
"is_cross_encoder"
if model["is_cross_encoder"]
else "is_sentence_transformer": True,
}
with hf_runner(model["name"], **kwargs) as hf_model:
yield hf_model
class TestModel:
def test_text_1_str_text_2_list(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
text_1 = "What is the capital of France?"
text_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_list_text_2_list(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
text_1 = [
"What is the capital of the United States?",
"What is the capital of France?",
]
text_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_str_text_2_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1, text_2]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_score_max_model_len(
self, server: RemoteOpenAIServer, model: dict[str, Any]
):
text_1 = "What is the capital of France?" * 20
text_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
},
)
assert score_response.status_code == 400
# Assert just a small fragments of the response
assert "Please reduce the length of the input." in score_response.text
# Test truncation
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
"truncate_prompt_tokens": 101,
},
)
assert score_response.status_code == 400
assert "Please, select a smaller truncation size." in score_response.text
def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."
request_args = {
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
}
score_response = requests.post(server.url_for("score"), json=request_args)
score_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
score_output = score_response.json()
invocation_output = invocation_response.json()
assert score_output.keys() == invocation_output.keys()
for score_data, invocation_data in zip(
score_output["data"], invocation_output["data"]
):
assert score_data.keys() == invocation_data.keys()
assert score_data["score"] == pytest.approx(
invocation_data["score"], rel=0.05
)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16
def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
def get_outputs(use_activation):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."
response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
"use_activation": use_activation,
},
)
if response.status_code != 200:
return response
outputs = response.json()
return torch.tensor([x["score"] for x in outputs["data"]])
if model["is_cross_encoder"]:
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
else:
get_outputs(use_activation=None)
# The activation parameter only works for the is_cross_encoder model
response = get_outputs(use_activation=True)
assert response.status_code == 400