Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,47 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import (
MTEB_EMBED_TASKS,
MTEB_EMBED_TOL,
OpenAIClientMtebEncoder,
run_mteb_embed_task,
)
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
# Skip the whole module on ROCm; the skip reason below states why.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.",
        allow_module_level=True,
    )

# Set at import time, i.e. before the server fixture launches anything.
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "intfloat/e5-small"
# Reference main score the vLLM result is compared against in the test below.
MAIN_SCORE = 0.7422994752439667
@pytest.fixture(scope="module")
def server():
    """Start one pooling-runner server for the module and yield it."""
    cli_args = [
        "--runner",
        "pooling",
        "--enforce-eager",
        "--disable-uvicorn-access-log",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
def test_mteb_embed(server):
    """Run the MTEB embed tasks through the server and compare to MAIN_SCORE.

    The check is one-sided: the test only fails when the vLLM score falls
    below the reference score by MTEB_EMBED_TOL or more.
    """
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, server.get_client())
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
    st_main_score = MAIN_SCORE

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    score_gap = st_main_score - vllm_main_score
    print("Difference: ", score_gap)

    # A vLLM score that beats the reference is fine, so there is
    # deliberately no lower bound on the gap.
    assert score_gap < MTEB_EMBED_TOL

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
import torch.nn.functional as F
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
# Skip the whole module on ROCm; the skip reason below states why.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.",
        allow_module_level=True,
    )

MODEL_NAME = "intfloat/multilingual-e5-small"

# Single shared prompt used by every test in this module.
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
def llm():
    """Build one LLM for the module, yield a weak proxy, then clean up."""
    engine_kwargs = dict(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )
    engine = LLM(**engine_kwargs)

    # pytest caches the fixture value, so hand out a weakref.proxy; that way
    # dropping our own reference below is enough to allow garbage collection.
    yield weakref.proxy(engine)

    del engine
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_token_embed(llm: LLM):
    """Check the shape of the `token_embed` output for the shared prompt."""
    results = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
    token_vectors = results[0].outputs.data
    expected_shape = (11, 384)
    assert token_vectors.shape == expected_shape
def test_pooling_params(llm: LLM):
    """Check how `PoolingParams(normalize=...)` affects `llm.embed` output.

    Expectations: leaving `normalize` unset matches `normalize=True`, and
    L2-normalizing the un-normalized output reproduces the normalized one.
    """

    def embed_with(normalize):
        params = PoolingParams(normalize=normalize)
        results = llm.embed(prompts, pooling_params=params, use_tqdm=False)
        return torch.tensor([item.outputs.embedding for item in results])

    default = embed_with(None)
    w_normal = embed_with(True)
    wo_normal = embed_with(False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."

    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )

    renormalized = F.normalize(wo_normal, p=2, dim=-1)
    assert torch.allclose(w_normal, renormalized, atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )

View File

@@ -0,0 +1,680 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json
import numpy as np
import openai
import pytest
import pytest_asyncio
import requests
import torch
import torch.nn.functional as F
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.platforms import current_platform
from vllm.tokenizers import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
MetadataItem,
binary2tensor,
build_metadata_items,
decode_pooling_output,
)
# Skip the whole module on ROCm; the skip reason below states why.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.",
        allow_module_level=True,
    )

MODEL_NAME = "intfloat/multilingual-e5-small"

# Minimal chat template: emits "<role>: <content>\n" for every message.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501

# Used for both the served model and the HF reference model.
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
    """Start one pooling-runner server for the module and yield it."""
    cli_args = ["--runner", "pooling"]
    # use half precision for speed and memory savings in CI environment
    cli_args += ["--dtype", DTYPE]
    cli_args += ["--enforce-eager"]
    cli_args += ["--max-model-len", "512"]
    cli_args += ["--chat-template", DUMMY_CHAT_TEMPLATE]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client for `server`, scoped by its context manager."""
    client_cm = server.get_async_client()
    async with client_cm as async_client:
        yield async_client
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield the reference model built by `hf_runner` for MODEL_NAME."""
    runner_cm = hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True)
    with runner_cm as reference_model:
        yield reference_model
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed one input, first as text and then as a list of token IDs."""

    async def embed(payload):
        # Round-trip through JSON so the result is validated as vLLM's schema.
        raw = await client.embeddings.create(
            model=model_name,
            input=payload,
            encoding_format="float",
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    def check_single(parsed, expected_tokens):
        assert parsed.id is not None
        assert len(parsed.data) == 1
        assert len(parsed.data[0].embedding) == 384
        assert parsed.usage.completion_tokens == 0
        assert parsed.usage.prompt_tokens == expected_tokens
        assert parsed.usage.total_tokens == expected_tokens

    # test single embedding
    input_texts = ["The chef prepared a delicious meal."]
    text_result = await embed(input_texts)
    check_single(text_result, 11)

    vllm_outputs = [item.embedding for item in text_result.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test using token IDs
    token_result = await embed([1, 1, 1, 1, 1])
    check_single(token_result, 5)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Embed a batch, first as list[str] and then as list[list[int]]."""

    async def embed(payload):
        raw = await client.embeddings.create(
            model=model_name,
            input=payload,
            encoding_format="float",
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    def check_batch(parsed, expected_items, expected_tokens):
        assert parsed.id is not None
        assert len(parsed.data) == expected_items
        assert len(parsed.data[0].embedding) == 384
        assert parsed.usage.completion_tokens == 0
        assert parsed.usage.prompt_tokens == expected_tokens
        assert parsed.usage.total_tokens == expected_tokens

    # test list[str]
    input_texts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_result = await embed(input_texts)
    check_batch(text_result, 3, 33)

    vllm_outputs = [item.embedding for item in text_result.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test list[list[int]]
    input_tokens = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_result = await embed(input_tokens)
    check_batch(token_result, 4, 17)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Embedding `messages` must match embedding the locally rendered prompt."""
    turns = [
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    ]
    messages = [{"role": role, "content": content} for role, content in turns]

    # Path 1: let the server apply the chat template.
    chat_payload = {
        "model": model_name,
        "messages": messages,
        "encoding_format": "float",
    }
    chat_response = requests.post(server.url_for("v1/embeddings"), json=chat_payload)
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    # Path 2: render the same template locally and embed the plain string.
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created

    # `id` and `created` differ per request; everything else must match.
    assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
        completion_embeddings.model_dump(exclude={"id", "created"})
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """Check float, base64 and default encodings against the HF reference."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    async def request(**encoding_kwargs):
        return await client.embeddings.create(
            input=input_texts, model=model_name, **encoding_kwargs
        )

    responses_float = await request(encoding_format="float")
    float_data = [item.embedding for item in responses_float.data]
    run_embedding_correctness_test(hf_model, input_texts, float_data)

    responses_base64 = await request(encoding_format="base64")
    # Each embedding arrives as a base64 string of raw float32 values.
    base64_data = [
        np.frombuffer(base64.b64decode(item.embedding), dtype="float32").tolist()
        for item in responses_base64.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, base64_data)

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await request()
    default_data = [item.embedding for item in responses_default.data]
    run_embedding_correctness_test(hf_model, input_texts, default_data)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check base64 output for every embed dtype / endianness combination.

    Each decoded result is compared (tol=1e-2) with the plain float response
    for the same input.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_base64 = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "base64",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Fail with the HTTP status instead of a KeyError on "data" below.
            responses_base64.raise_for_status()

            base64_data = []
            for data in responses_base64.json()["data"]:
                binary = base64.b64decode(data["embedding"])
                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
                base64_data.append(tensor.to(torch.float32).tolist())

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=base64_data,
                name_0="float_data",
                name_1="base64_data",
                tol=1e-2,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check `bytes` output for every embed dtype / endianness combination.

    The body is decoded using the JSON found in the response's `metadata`
    header, then compared (tol=1e-2) with the plain float response.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    # Iterating the dict yields its keys; no intermediate list is needed.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Fail with the HTTP status instead of a missing-header KeyError.
            responses_bytes.raise_for_status()

            metadata = json.loads(responses_bytes.headers["metadata"])
            body = responses_bytes.content
            items = [MetadataItem(**x) for x in metadata["data"]]

            bytes_data = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Check `bytes_only` output for every embed dtype / endianness combination.

    This format must not send a `metadata` header, so the decoding metadata
    is rebuilt locally from the known dtype, endianness, embedding size and
    number of inputs.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ] * 2

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]
    embedding_size = len(float_data[0])

    # Iterating the dict yields its keys; no intermediate list is needed.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes_only",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # Without this, an error response would also satisfy the
            # "no metadata header" assertion and fail later, less clearly.
            responses_bytes.raise_for_status()

            assert "metadata" not in responses_bytes.headers
            body = responses_bytes.content
            items = build_metadata_items(
                embed_dtype=embed_dtype,
                endianness=endianness,
                shape=(embedding_size,),
                n_request=len(input_texts),
            )

            bytes_data = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
    server: RemoteOpenAIServer, model_name: str, param_name: str
):
    """An invalid value for `param_name` must yield a 400 that names the value."""
    bad_value = f"bad_{param_name}"
    payload = {
        "model": model_name,
        "input": [
            "The best thing about vLLM is that it supports many different models",
        ],
        "encoding_format": "base64",
    }
    # When param_name is "encoding_format" this overwrites the valid value.
    payload[param_name] = bad_value

    responses_base64 = requests.post(server.url_for("/v1/embeddings"), json=payload)

    assert responses_base64.status_code == 400
    assert "literal_error" in responses_base64.json()["error"]["message"]
    assert bad_value in responses_base64.json()["error"]["message"]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
    """`truncate_prompt_tokens=10` must cap usage at 10 tokens.

    Checked once with a text input and once with a 21-element token-ID input.
    """

    async def embed_truncated(payload):
        raw = await client.embeddings.create(
            model=model_name,
            input=payload,
            extra_body={"truncate_prompt_tokens": 10},
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    def check_truncated(parsed):
        assert parsed.id is not None
        assert len(parsed.data) == 1
        assert len(parsed.data[0].embedding) == 384
        assert parsed.usage.completion_tokens == 0
        assert parsed.usage.prompt_tokens == 10
        assert parsed.usage.total_tokens == 10

    # test single embedding
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    check_truncated(await embed_truncated(input_texts))

    input_tokens = [
        1, 24428, 289, 18341, 26165, 285, 19323,
        283, 289, 26789, 3871, 28728, 9901, 340,
        2229, 385, 340, 315, 28741, 28804, 2,
    ]
    check_truncated(await embed_truncated(input_tokens))
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
    client: openai.AsyncOpenAI, model_name: str
):
    """An oversized `truncate_prompt_tokens` must be rejected as a bad request."""
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Only the raising call belongs inside the `raises` block: anything after
    # it in the block never runs. The previous version asserted on `response`
    # there, so those checks were dead code and `response` was never bound.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193},
        )

    assert exc_info.value.status_code == 400
    # NOTE(review): the dead assertions expected the error text
    # "truncate_prompt_tokens value is greater than max_model_len. Please,
    # select a smaller truncation size." They never executed, so the exact
    # server wording is unverified -- confirm it before asserting on
    # str(exc_info.value).
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """The `invocations` route must match the embeddings API for the same request."""
    request_args = {
        "model": MODEL_NAME,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
    }

    completion_response = await client.embeddings.create(**request_args)

    invocation_url = server.url_for("invocations")
    invocation_response = requests.post(invocation_url, json=request_args)
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    # Same top-level fields, then the same per-item fields and close vectors.
    assert completion_output.keys() == invocation_output.keys()
    paired_items = zip(completion_output["data"], invocation_output["data"])
    for completion_data, invocation_data in paired_items:
        assert completion_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[completion_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="completion",
            name_1="invocation",
        )
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """`invocations` must match `v1/embeddings` for a `messages` request."""
    turns = [
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    ]
    request_args = {
        "model": MODEL_NAME,
        "messages": [{"role": role, "content": content} for role, content in turns],
        "encoding_format": "float",
    }

    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
    chat_response.raise_for_status()

    invocation_url = server.url_for("invocations")
    invocation_response = requests.post(invocation_url, json=request_args)
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    # Same top-level fields, then the same per-item fields and close vectors.
    assert chat_output.keys() == invocation_output.keys()
    paired_items = zip(chat_output["data"], invocation_output["data"])
    for chat_data, invocation_data in paired_items:
        assert chat_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[chat_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="chat",
            name_1="invocation",
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """Check how the `normalize` request field affects `v1/embeddings` output.

    Expectations: sending `normalize=None` matches `normalize=True`, and
    L2-normalizing the un-normalized output reproduces the normalized one.
    """
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        request_args = {
            # Use the parametrized name; it was previously ignored in favour
            # of the module-level constant.
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize,
        }
        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
        # Fail with the HTTP status instead of a KeyError on "data" below.
        response.raise_for_status()
        outputs = response.json()
        return torch.tensor([x["embedding"] for x in outputs["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
    """`pooling` with task "embed" returns one 384-element result."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "embed",
    }
    response = requests.post(server.url_for("pooling"), json=payload)

    poolings = PoolingResponse.model_validate(response.json())
    assert len(poolings.data) == 1

    pooled = poolings.data[0].data
    assert len(pooled) == 384
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
    """`pooling` with task "token_embed" returns 11 entries of 384 elements."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "token_embed",
    }
    response = requests.post(server.url_for("pooling"), json=payload)

    poolings = PoolingResponse.model_validate(response.json())
    assert len(poolings.data) == 1

    per_token = poolings.data[0].data
    assert len(per_token) == 11
    assert len(per_token[0]) == 384
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """Each listed task must be rejected by `pooling` with a BadRequestError."""
    payload = {
        "model": model_name,
        "input": "test",
        "encoding_format": "float",
        "task": task,
    }
    response = requests.post(server.url_for("pooling"), json=payload)

    expected_prefix = f"Task {task} is not supported"
    assert response.json()["error"]["type"] == "BadRequestError"
    assert response.json()["error"]["message"].startswith(expected_prefix)

View File

@@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
"""
import openai
import pytest
from tests.conftest import HfRunner
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
from tests.models.utils import EmbedModelInfo
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform
# Skip the whole module on ROCm; the skip reason below states why.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.",
        allow_module_level=True,
    )

# One non-matryoshka model and one matryoshka model restricted to 256 dims.
MODELS = [
    EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-m-v1.5",
        is_matryoshka=True,
        matryoshka_dimensions=[256],
    ),
]

input_texts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module", params=MODELS)
def model_info(request):
    """Parametrize the module over every entry of MODELS."""
    selected_model = request.param
    return selected_model
@pytest.fixture(scope="module", params=["bfloat16"])
def dtype(request):
    """Parametrize the module over the dtypes to test (currently one)."""
    selected_dtype = request.param
    return selected_dtype
@pytest.fixture(scope="module")
def server(model_info, dtype: str):
    """Start a pooling-runner server for the current model and dtype."""
    cli_args = ["--runner", "pooling"]
    # use half precision for speed and memory savings in CI environment
    cli_args += ["--dtype", dtype]
    cli_args += ["--enforce-eager", "--max-model-len", "512"]

    needs_manual_matryoshka = (
        model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5"
    )
    if needs_manual_matryoshka:
        # Manually enable Matryoshka Embeddings
        cli_args += [
            "--trust_remote_code",
            "--hf_overrides",
            '{"matryoshka_dimensions":[256]}',
        ]

    with RemoteOpenAIServer(model_info.name, cli_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def hf_model(hf_runner, model_info, dtype: str):
    """Yield the reference model built by `hf_runner` for the current model."""
    runner_cm = hf_runner(model_info.name, dtype=dtype, is_sentence_transformer=True)
    with runner_cm as reference_model:
        yield reference_model
@pytest.mark.asyncio
async def test_matryoshka(
    model_info: EmbedModelInfo, server: RemoteOpenAIServer, hf_model: HfRunner
):
    """Check the `dimensions` request parameter for the current model.

    Matryoshka models must accept `None` and their declared dimensions and
    reject others; non-matryoshka models must accept only `None`.
    """

    async def make_request_and_correctness_test(client, dimensions):
        prompts = input_texts * 3

        embedding_response = await client.embeddings.create(
            model=model_info.name,
            input=prompts,
            dimensions=dimensions,
            encoding_format="float",
        )
        embeddings = EmbeddingResponse.model_validate(
            embedding_response.model_dump(mode="json")
        )

        assert embeddings.id is not None
        assert len(embeddings.data) == 3
        assert len(embeddings.data[0].embedding) > 0
        assert embeddings.usage.completion_tokens == 0
        assert embeddings.usage.prompt_tokens > 0
        assert embeddings.usage.total_tokens > 0

        if dimensions is not None:
            assert len(embeddings.data[0].embedding) == dimensions

        vllm_outputs = [d.embedding for d in embeddings.data]
        run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)

    if model_info.is_matryoshka:
        valid_dimensions: list[int | None] = [None]
        if model_info.matryoshka_dimensions is not None:
            valid_dimensions += model_info.matryoshka_dimensions[:2]

        invalid_dimensions: list[int | None] = [-1]
        if model_info.matryoshka_dimensions is not None:
            # 5 only counts as invalid while the model does not declare it.
            assert 5 not in model_info.matryoshka_dimensions
            invalid_dimensions.append(5)
    else:
        valid_dimensions = [None]
        invalid_dimensions = [-1, 16]

    # Use the client as a context manager so it is closed when the test
    # ends; the previous version created it and never closed it.
    async with server.get_async_client() as client:
        for dimensions in valid_dimensions:
            await make_request_and_correctness_test(client, dimensions)

        for dimensions in invalid_dimensions:
            with pytest.raises(openai.BadRequestError):
                await make_request_and_correctness_test(client, dimensions)

View File

@@ -0,0 +1,458 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test cases for long text embedding with automatic chunking mechanism.
This test suite validates vLLM's automatic chunking functionality for handling
text inputs that exceed the model's maximum token length, specifically targeting
the intfloat/multilingual-e5-small model (max token length: 512).
"""
import random
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform
# Skip the whole module on ROCm; the skip reason below states why.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.",
        allow_module_level=True,
    )
def _generate_random_text(word_count: int) -> str:
"""Generate random text with approximately the specified word count."""
# Common English words with focus on verbs and nouns for realistic text
common_words = [
# Essential articles and pronouns (minimal)
"the",
"and",
"you",
"they",
"this",
"that",
"these",
"those",
# Action verbs
"create",
"build",
"develop",
"design",
"implement",
"execute",
"analyze",
"process",
"generate",
"calculate",
"evaluate",
"optimize",
"transform",
"integrate",
"configure",
"deploy",
"monitor",
"manage",
"discover",
"explore",
"investigate",
"research",
"study",
"examine",
"improve",
"enhance",
"upgrade",
"modify",
"update",
"maintain",
"solve",
"resolve",
"handle",
"address",
"tackle",
"overcome",
"communicate",
"collaborate",
"coordinate",
"organize",
"plan",
"achieve",
"accomplish",
"complete",
"finish",
"deliver",
"provide",
# Technology and science nouns
"system",
"application",
"software",
"hardware",
"network",
"database",
"algorithm",
"model",
"framework",
"platform",
"interface",
"protocol",
"architecture",
"infrastructure",
"component",
"module",
"service",
"technology",
"innovation",
"solution",
"methodology",
"approach",
"artificial",
"intelligence",
"machine",
"learning",
"neural",
"network",
"computer",
"processor",
"memory",
"storage",
"computation",
"data",
"information",
"knowledge",
"insight",
"pattern",
"trend",
"analysis",
"research",
"development",
"engineering",
"science",
"mathematics",
"statistics",
"probability",
"optimization",
"performance",
"efficiency",
# General nouns
"project",
"team",
"organization",
"company",
"business",
"industry",
"market",
"customer",
"user",
"client",
"product",
"feature",
"function",
"requirement",
"specification",
"documentation",
"report",
"result",
"outcome",
"impact",
"benefit",
"advantage",
"challenge",
"problem",
"opportunity",
"strategy",
"goal",
"objective",
"target",
"milestone",
"process",
"procedure",
"workflow",
"pipeline",
"operation",
"task",
"activity",
"event",
"session",
"meeting",
"discussion",
"decision",
]
words = []
for _ in range(word_count):
words.append(random.choice(common_words))
# Add some punctuation for more realistic text
text = " ".join(words)
# Add periods every 10-20 words
words_list = text.split()
result = []
for i, word in enumerate(words_list):
result.append(word)
if (i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1:
result[-1] += "."
return " ".join(result)
MODEL_NAME = "intfloat/multilingual-e5-small"
DTYPE = "bfloat16"

# Long inputs are generated once at import time and shared by all tests.
# ~1500 words, intended to exceed 1024 tokens.
LONG_TEXT_1500_WORDS = _generate_random_text(1500)
# ~2500 words, intended to exceed 2048 tokens.
LONG_TEXT_2500_WORDS = _generate_random_text(2500)
@pytest.fixture(scope="module")
def server_with_chunked_processing():
    """Start server with automatic chunking processing enabled."""
    pooler_config = (
        '{"pooling_type": "MEAN", "normalize": true, '
        '"enable_chunked_processing": true, "max_embed_len": 10000}'
    )
    cli_args = ["--runner", "pooling"]
    cli_args += ["--dtype", DTYPE]
    cli_args += ["--enforce-eager"]
    # Set smaller max_model_len to trigger chunking mechanism
    cli_args += ["--max-model-len", "512"]
    cli_args += ["--pooler-config", pooler_config]
    cli_args += ["--gpu-memory-utilization", "0.8"]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client_with_chunked_processing(server_with_chunked_processing):
    """Create async client with chunking processing support."""
    client_cm = server_with_chunked_processing.get_async_client()
    async with client_cm as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_long_text_embedding_1500_chars(
    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
    """Test embedding processing for the ~1500-word long text.

    Despite the "chars" in the test name, the input is sized in words; it is
    expected to exceed the server's 512-token limit.
    """
    long_text = LONG_TEXT_1500_WORDS

    # Verify text has sufficient word count (approximately 1500 words)
    word_count = len(long_text.split())
    assert word_count >= 1400, f"Test text word count insufficient: {word_count} words"

    # Send embedding request
    raw_response = await client_with_chunked_processing.embeddings.create(
        model=model_name,
        input=[long_text],
        encoding_format="float",
    )

    # Verify response structure
    embeddings = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    # multilingual-e5-small embedding dimension
    assert len(embeddings.data[0].embedding) == 384

    usage = embeddings.usage
    assert usage.completion_tokens == 0
    # The reported token count should cover the whole input, i.e. exceed
    # the 512-token limit of a single chunk.
    assert usage.prompt_tokens > 800
    assert usage.total_tokens == usage.prompt_tokens

    # Verify embedding vector validity
    embedding_vector = embeddings.data[0].embedding
    assert all(isinstance(x, float) for x in embedding_vector), (
        "Embedding vector should contain floats"
    )
    assert any(x != 0 for x in embedding_vector), (
        "Embedding vector should not be all zeros"
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_long_text_embedding_2500_chars(
    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
    """Test embedding processing for the ~2500-word long text.

    Despite the "chars" in the test name, the input is sized in words; it is
    expected to need several chunks under the server's 512-token limit.
    """
    long_text = LONG_TEXT_2500_WORDS

    # Verify text has sufficient word count (approximately 2500 words)
    word_count = len(long_text.split())
    assert word_count >= 2300, f"Test text word count insufficient: {word_count} words"

    # Send embedding request
    raw_response = await client_with_chunked_processing.embeddings.create(
        model=model_name,
        input=[long_text],
        encoding_format="float",
    )

    # Verify response structure
    embeddings = EmbeddingResponse.model_validate(raw_response.model_dump(mode="json"))
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    # multilingual-e5-small embedding dimension
    assert len(embeddings.data[0].embedding) == 384

    usage = embeddings.usage
    assert usage.completion_tokens == 0
    # The reported token count should cover the whole input, which is
    # expected to span multiple chunks.
    assert usage.prompt_tokens > 1500
    assert usage.total_tokens == usage.prompt_tokens

    # Verify embedding vector validity
    embedding_vector = embeddings.data[0].embedding
    assert all(isinstance(x, float) for x in embedding_vector), (
        "Embedding vector should contain floats"
    )
    assert any(x != 0 for x in embedding_vector), (
        "Embedding vector should not be all zeros"
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_long_text_embedding(
    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
    """Embed two long texts and one short text in a single request.

    Checks that one embedding is returned per input, in input order, that
    each vector has 384 float components which are not all zero, and that
    the aggregated usage counts only prompt tokens.
    """
    short_text = "This is a short text test."  # Short text for comparison
    input_texts = [LONG_TEXT_1500_WORDS, LONG_TEXT_2500_WORDS, short_text]

    raw_response = await client_with_chunked_processing.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )

    # Validate the JSON form of the reply against the response schema.
    payload = raw_response.model_dump(mode="json")
    embeddings = EmbeddingResponse.model_validate(payload)

    assert embeddings.id is not None
    assert len(embeddings.data) == 3  # Three input texts

    # Every item must keep its input position and hold a usable vector.
    for expected_index, item in enumerate(embeddings.data):
        assert len(item.embedding) == 384
        assert item.index == expected_index
        vector = item.embedding
        assert all(isinstance(value, float) for value in vector)
        assert any(value != 0 for value in vector)

    usage = embeddings.usage
    assert usage.completion_tokens == 0
    # The batch contains two long texts, so the total should be large.
    assert usage.prompt_tokens > 1000
    assert usage.total_tokens == usage.prompt_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chunked_vs_normal_consistency(
    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
    """Check that a short input still embeds correctly via the chunked client.

    Only the chunk-enabled client is queried; no result from a non-chunked
    server is compared here. The test asserts that a text well under the
    512-token limit yields a valid 384-dimensional embedding and reports
    fewer than 512 prompt tokens.
    """
    # Two adjacent literals form one sentence, short enough to stay
    # within the 512-token limit.
    short_text = (
        "Artificial intelligence technology is changing our world, "
        "bringing unprecedented opportunities and challenges."
    )

    raw_response = await client_with_chunked_processing.embeddings.create(
        model=model_name,
        input=[short_text],
        encoding_format="float",
    )

    # Validate the JSON form of the reply against the response schema.
    payload = raw_response.model_dump(mode="json")
    embeddings = EmbeddingResponse.model_validate(payload)
    usage = embeddings.usage

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert usage.completion_tokens == 0
    # Short text should not require chunked processing
    assert usage.prompt_tokens < 512
    assert usage.total_tokens == usage.prompt_tokens

    # Verify the validity of the embedding vector
    vector = embeddings.data[0].embedding
    assert all(isinstance(value, float) for value in vector)
    assert any(value != 0 for value in vector)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chunked_processing_response_format(
    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
):
    """Validate the response envelope and vector norm for a long input.

    Sends ``LONG_TEXT_1500_WORDS`` (long enough that chunking is expected)
    and checks the single returned item's ``object``/``index`` fields and
    that the Euclidean norm of its vector lies strictly between 0.8 and 1.2.
    """
    import math

    raw_response = await client_with_chunked_processing.embeddings.create(
        model=model_name,
        input=[LONG_TEXT_1500_WORDS],
        encoding_format="float",
    )

    # Validate the JSON form of the reply against the response schema.
    payload = raw_response.model_dump(mode="json")
    embeddings = EmbeddingResponse.model_validate(payload)

    assert embeddings.id is not None
    assert len(embeddings.data) == 1

    first_item = embeddings.data[0]
    assert first_item.object == "embedding"
    assert first_item.index == 0

    # Euclidean (L2) norm computed by hand from the returned components.
    squared_sum = sum(component * component for component in first_item.embedding)
    vector_norm = math.sqrt(squared_sum)

    # The result is expected to be roughly unit length (normalisation is
    # the default for most embedding models); the window is kept loose.
    assert 0.8 < vector_norm < 1.2, (
        f"Vector norm should be reasonable, actual: {vector_norm}"
    )

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import requests
from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
# Model that the module-scoped server below is launched with.
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
# Per-prompt image limit, passed to the server via --limit-mm-per-prompt.
MAXIMUM_IMAGES = 2
# Chat template passed to the server via --chat-template.
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja"
# Fail at import time if the template file is missing.
assert vlm2vec_jinja_path.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA).
# Each entry is an asset name; the trailing comment is its public URL.
TEST_IMAGE_ASSETS = [
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    "Grayscale_8bits_palette_sample_image.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png"
    "1280px-Venn_diagram_rgb.svg.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png"
    "RGBA_comp.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png"
]
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME`` shared by the module.

    The server is created with the pooling runner, a 2048-token context,
    at most 5 sequences, eager mode, remote code trusted, the image limit
    from ``MAXIMUM_IMAGES`` and the VLM2Vec chat template. The context
    manager is exited once the module's tests have finished.
    """
    # Assemble the CLI flags group by group; order matches the flag/value
    # pairing the server's argument parser expects.
    cli_args = ["--runner", "pooling"]
    cli_args += ["--max-model-len", "2048", "--max-num-seqs", "5"]
    cli_args += ["--enforce-eager", "--trust-remote-code"]
    # The multimodal limit is passed as a JSON object string.
    cli_args += ["--limit-mm-per-prompt", json.dumps({"image": MAXIMUM_IMAGES})]
    cli_args += ["--chat-template", str(vlm2vec_jinja_path)]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
    """Map every entry of ``TEST_IMAGE_ASSETS`` to its base64-encoded image.

    Each image is obtained from ``local_asset_server.get_image_asset`` and
    encoded with ``encode_image_base64``; the mapping is built once per
    test session.
    """
    encoded_by_asset: dict[str, str] = {}
    for asset_name in TEST_IMAGE_ASSETS:
        image = local_asset_server.get_image_asset(asset_name)
        encoded_by_asset[asset_name] = encode_image_base64(image)
    return encoded_by_asset
def get_hf_prompt_tokens(model_name, content, image_url):
    """Return the prompt size reported by the Hugging Face processor.

    Loads the ``AutoProcessor`` for ``model_name`` (with ``num_crops=4``),
    builds a prompt of the form ``"<|image_1|> " + content`` together with
    the image fetched from ``image_url``, and returns the size of the second
    dimension of the resulting ``input_ids``.
    NOTE(review): that dimension is presumably the sequence length in
    tokens (batch-first layout) -- confirm against the processor.
    """
    processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True, num_crops=4
    )
    # The image placeholder must precede the text content.
    prompt = f"<|image_1|> {content}"

    fetched = fetch_image(image_url)
    # Unwrap MediaWithBytes if present; otherwise use the fetched object as is.
    image = fetched.media if isinstance(fetched, MediaWithBytes) else fetched

    inputs = processor(prompt, [image], return_tensors="pt")
    return inputs.input_ids.shape[1]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_image_embedding(
    server: RemoteOpenAIServer, model_name: str, image_url: str
):
    """Embed one image plus a text instruction via the chat-style endpoint.

    Posts a single user message (image part, then text part) to
    ``v1/embeddings`` and checks that one 3072-dimensional embedding is
    returned and that the reported prompt tokens equal the count computed by
    ``get_hf_prompt_tokens`` for the same text and image.
    NOTE(review): ``image_url`` is parametrized with ``indirect=True``, so
    its value is produced by an ``image_url`` fixture defined outside this
    chunk -- confirm what that fixture returns.
    """
    content_text = "Represent the given image."

    # Message content: the image first, then the textual instruction.
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    text_part = {"type": "text", "text": content_text}
    messages = [{"role": "user", "content": [image_part, text_part]}]
    request_body = {
        "model": model_name,
        "messages": messages,
        "encoding_format": "float",
    }

    response = requests.post(server.url_for("v1/embeddings"), json=request_body)
    # Turn HTTP error statuses into an exception before parsing the body.
    response.raise_for_status()
    embeddings = EmbeddingResponse.model_validate(response.json())

    # Reference token count from the Hugging Face processor.
    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 3072

    usage = embeddings.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == hf_prompt_tokens
    assert usage.total_tokens == hf_prompt_tokens