Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/entrypoints/pooling/embed/init.py
+++ b/tests/entrypoints/pooling/embed/init.py
--- a/tests/entrypoints/pooling/embed/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/embed/test_correctness_mteb.py
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
+import pytest
+
+from tests.models.language.pooling_mteb_test.mteb_utils import (
+    MTEB_EMBED_TASKS,
+    MTEB_EMBED_TOL,
+    OpenAIClientMtebEncoder,
+    run_mteb_embed_task,
+)
+from tests.utils import RemoteOpenAIServer
+from vllm.platforms import current_platform
+
+if current_platform.is_rocm():
+    pytest.skip(
+        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
+    )
+
+os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
+
+MODEL_NAME = "intfloat/e5-small"
+MAIN_SCORE = 0.7422994752439667
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+def test_mteb_embed(server):
+    client = server.get_client()
+    encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
+    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
+    st_main_score = MAIN_SCORE
+
+    print("VLLM main score: ", vllm_main_score)
+    print("SentenceTransformer main score: ", st_main_score)
+    print("Difference: ", st_main_score - vllm_main_score)
+
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_EMBED_TOL
--- a/tests/entrypoints/pooling/embed/test_offline.py
+++ b/tests/entrypoints/pooling/embed/test_offline.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import weakref
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm import LLM, PoolingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.platforms import current_platform
+
+if current_platform.is_rocm():
+    pytest.skip(
+        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
+    )
+
+MODEL_NAME = "intfloat/multilingual-e5-small"
+
+prompts = ["The chef prepared a delicious meal."]
+
+
+@pytest.fixture(scope="module")
+def llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(
+        model=MODEL_NAME,
+        max_num_batched_tokens=32768,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.75,
+        enforce_eager=True,
+        seed=0,
+    )
+
+    yield weakref.proxy(llm)
+
+    del llm
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.skip_global_cleanup
+def test_token_embed(llm: LLM):
+    outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
+    multi_vector = outputs[0].outputs.data
+    assert multi_vector.shape == (11, 384)
+
+
+def test_pooling_params(llm: LLM):
+    def get_outputs(normalize):
+        outputs = llm.embed(
+            prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False
+        )
+        return torch.tensor([x.outputs.embedding for x in outputs])
+
+    default = get_outputs(normalize=None)
+    w_normal = get_outputs(normalize=True)
+    wo_normal = get_outputs(normalize=False)
+
+    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
+    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
+        "wo_normal should not use normal."
+    )
+    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
+        "w_normal should be close to normal(wo_normal)."
+    )
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -0,0 +1,680 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import base64
+import json
+
+import numpy as np
+import openai
+import pytest
+import pytest_asyncio
+import requests
+import torch
+import torch.nn.functional as F
+
+from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
+from tests.models.utils import check_embeddings_close
+from tests.utils import RemoteOpenAIServer
+from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
+from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
+from vllm.platforms import current_platform
+from vllm.tokenizers import get_tokenizer
+from vllm.utils.serial_utils import (
+    EMBED_DTYPE_TO_TORCH_DTYPE,
+    ENDIANNESS,
+    MetadataItem,
+    binary2tensor,
+    build_metadata_items,
+    decode_pooling_output,
+)
+
+if current_platform.is_rocm():
+    pytest.skip(
+        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
+    )
+
+MODEL_NAME = "intfloat/multilingual-e5-small"
+DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
+DTYPE = "bfloat16"
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--runner",
+        "pooling",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--max-model-len",
+        "512",
+        "--chat-template",
+        DUMMY_CHAT_TEMPLATE,
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.fixture(scope="module")
+def hf_model(hf_runner):
+    with hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True) as hf_model:
+        yield hf_model
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
+    input_texts = [
+        "The chef prepared a delicious meal.",
+    ]
+
+    # test single embedding
+    embedding_response = await client.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        encoding_format="float",
+    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 384
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 11
+    assert embeddings.usage.total_tokens == 11
+
+    vllm_outputs = [d.embedding for d in embeddings.data]
+    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
+
+    # test using token IDs
+    input_tokens = [1, 1, 1, 1, 1]
+    embedding_response = await client.embeddings.create(
+        model=model_name,
+        input=input_tokens,
+        encoding_format="float",
+    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 384
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 5
+    assert embeddings.usage.total_tokens == 5
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
+    # test list[str]
+    input_texts = [
+        "The cat sat on the mat.",
+        "A feline was resting on a rug.",
+        "Stars twinkle brightly in the night sky.",
+    ]
+    embedding_response = await client.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        encoding_format="float",
+    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 3
+    assert len(embeddings.data[0].embedding) == 384
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 33
+    assert embeddings.usage.total_tokens == 33
+
+    vllm_outputs = [d.embedding for d in embeddings.data]
+    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
+
+    # test list[list[int]]
+    input_tokens = [
+        [4, 5, 7, 9, 20],
+        [15, 29, 499],
+        [24, 24, 24, 24, 24],
+        [25, 32, 64, 77],
+    ]
+    embedding_response = await client.embeddings.create(
+        model=model_name,
+        input=input_tokens,
+        encoding_format="float",
+    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 4
+    assert len(embeddings.data[0].embedding) == 384
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 17
+    assert embeddings.usage.total_tokens == 17
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_conversation_embedding(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+):
+    messages = [
+        {
+            "role": "user",
+            "content": "The cat sat on the mat.",
+        },
+        {
+            "role": "assistant",
+            "content": "A feline was resting on a rug.",
+        },
+        {
+            "role": "user",
+            "content": "Stars twinkle brightly in the night sky.",
+        },
+    ]
+
+    chat_response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "encoding_format": "float",
+        },
+    )
+    chat_response.raise_for_status()
+    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
+
+    tokenizer = get_tokenizer(tokenizer_name=model_name)
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        chat_template=DUMMY_CHAT_TEMPLATE,
+        add_generation_prompt=True,
+        continue_final_message=False,
+        tokenize=False,
+    )
+    completion_response = await client.embeddings.create(
+        model=model_name,
+        input=prompt,
+        encoding_format="float",
+        # To be consistent with chat
+        extra_body={"add_special_tokens": False},
+    )
+    completion_embeddings = EmbeddingResponse.model_validate(
+        completion_response.model_dump(mode="json")
+    )
+
+    assert chat_embeddings.id is not None
+    assert completion_embeddings.id is not None
+    assert chat_embeddings.created <= completion_embeddings.created
+    assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
+        completion_embeddings.model_dump(exclude={"id", "created"})
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_base64_embedding(
+    hf_model, client: openai.AsyncOpenAI, model_name: str
+):
+    input_texts = [
+        "Hello my name is",
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    responses_float = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float"
+    )
+    float_data = [d.embedding for d in responses_float.data]
+    run_embedding_correctness_test(hf_model, input_texts, float_data)
+
+    responses_base64 = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="base64"
+    )
+    base64_data = []
+    for data in responses_base64.data:
+        base64_data.append(
+            np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()
+        )
+
+    run_embedding_correctness_test(hf_model, input_texts, base64_data)
+
+    # Default response is float32 decoded from base64 by OpenAI Client
+    responses_default = await client.embeddings.create(
+        input=input_texts, model=model_name
+    )
+    default_data = [d.embedding for d in responses_default.data]
+    run_embedding_correctness_test(hf_model, input_texts, default_data)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_base64_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    responses_float = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float"
+    )
+    float_data = [d.embedding for d in responses_float.data]
+
+    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
+        for endianness in ENDIANNESS:
+            responses_base64 = requests.post(
+                server.url_for("/v1/embeddings"),
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "base64",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
+            base64_data = []
+            for data in responses_base64.json()["data"]:
+                binary = base64.b64decode(data["embedding"])
+                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
+                base64_data.append(tensor.to(torch.float32).tolist())
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=base64_data,
+                name_0="float_data",
+                name_1="base64_data",
+                tol=1e-2,
+            )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_bytes_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    responses_float = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float"
+    )
+    float_data = [d.embedding for d in responses_float.data]
+
+    for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
+        for endianness in ENDIANNESS:
+            responses_bytes = requests.post(
+                server.url_for("/v1/embeddings"),
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "bytes",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
+            metadata = json.loads(responses_bytes.headers["metadata"])
+            body = responses_bytes.content
+            items = [MetadataItem(**x) for x in metadata["data"]]
+
+            bytes_data = decode_pooling_output(items=items, body=body)
+            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=bytes_data,
+                name_0="float_data",
+                name_1="bytes_data",
+                tol=1e-2,
+            )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_bytes_only_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ] * 2
+
+    responses_float = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float"
+    )
+    float_data = [d.embedding for d in responses_float.data]
+    embedding_size = len(float_data[0])
+
+    for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
+        for endianness in ENDIANNESS:
+            responses_bytes = requests.post(
+                server.url_for("/v1/embeddings"),
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "bytes_only",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
+            assert "metadata" not in responses_bytes.headers
+            body = responses_bytes.content
+            items = build_metadata_items(
+                embed_dtype=embed_dtype,
+                endianness=endianness,
+                shape=(embedding_size,),
+                n_request=len(input_texts),
+            )
+
+            bytes_data = decode_pooling_output(items=items, body=body)
+            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=bytes_data,
+                name_0="float_data",
+                name_1="bytes_data",
+                tol=1e-2,
+            )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
+async def test_params_not_supported(
+    server: RemoteOpenAIServer, model_name: str, param_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
+
+    responses_base64 = requests.post(
+        server.url_for("/v1/embeddings"),
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "base64",
+            param_name: f"bad_{param_name}",
+        },
+    )
+
+    assert responses_base64.status_code == 400
+    assert "literal_error" in responses_base64.json()["error"]["message"]
+    assert f"bad_{param_name}" in responses_base64.json()["error"]["message"]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
+    input_texts = [
+        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
+    ]
+
+    # test single embedding
+    embedding_response = await client.embeddings.create(
+        model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}
+    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 384
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 10
+    assert embeddings.usage.total_tokens == 10
+
+    input_tokens = [
+        1,
+        24428,
+        289,
+        18341,
+        26165,
+        285,
+        19323,
+        283,
+        289,
+        26789,
+        3871,
+        28728,
+        9901,
+        340,
+        2229,
+        385,
+        340,
+        315,
+        28741,
+        28804,
+        2,
+    ]
+    embedding_response = await client.embeddings.create(
+        model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}
+    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 384
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 10
+    assert embeddings.usage.total_tokens == 10
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding_truncation_invalid(
+    client: openai.AsyncOpenAI, model_name: str
+):
+    input_texts = [
+        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
+    ]
+
+    with pytest.raises(openai.BadRequestError):
+        response = await client.embeddings.create(
+            model=model_name,
+            input=input_texts,
+            extra_body={"truncate_prompt_tokens": 8193},
+        )
+        assert "error" in response.object
+        assert (
+            "truncate_prompt_tokens value is greater than max_model_len. "
+            "Please, select a smaller truncation size." in response.message
+        )
+
+
+@pytest.mark.asyncio
+async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
+    input_texts = [
+        "The chef prepared a delicious meal.",
+    ]
+
+    request_args = {
+        "model": MODEL_NAME,
+        "input": input_texts,
+        "encoding_format": "float",
+    }
+
+    completion_response = await client.embeddings.create(**request_args)
+
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
+    invocation_response.raise_for_status()
+
+    completion_output = completion_response.model_dump()
+    invocation_output = invocation_response.json()
+
+    assert completion_output.keys() == invocation_output.keys()
+    for completion_data, invocation_data in zip(
+        completion_output["data"], invocation_output["data"]
+    ):
+        assert completion_data.keys() == invocation_data.keys()
+        check_embeddings_close(
+            embeddings_0_lst=[completion_data["embedding"]],
+            embeddings_1_lst=[invocation_data["embedding"]],
+            name_0="completion",
+            name_1="invocation",
+        )
+
+
+@pytest.mark.asyncio
+async def test_invocations_conversation(server: RemoteOpenAIServer):
+    messages = [
+        {
+            "role": "user",
+            "content": "The cat sat on the mat.",
+        },
+        {
+            "role": "assistant",
+            "content": "A feline was resting on a rug.",
+        },
+        {
+            "role": "user",
+            "content": "Stars twinkle brightly in the night sky.",
+        },
+    ]
+
+    request_args = {
+        "model": MODEL_NAME,
+        "messages": messages,
+        "encoding_format": "float",
+    }
+
+    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
+    chat_response.raise_for_status()
+
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
+    invocation_response.raise_for_status()
+
+    chat_output = chat_response.json()
+    invocation_output = invocation_response.json()
+
+    assert chat_output.keys() == invocation_output.keys()
+    for chat_data, invocation_data in zip(
+        chat_output["data"], invocation_output["data"]
+    ):
+        assert chat_data.keys() == invocation_data.keys()
+        check_embeddings_close(
+            embeddings_0_lst=[chat_data["embedding"]],
+            embeddings_1_lst=[invocation_data["embedding"]],
+            name_0="chat",
+            name_1="invocation",
+        )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_normalize(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["The chef prepared a delicious meal."]
+
+    async def get_outputs(normalize):
+        request_args = {
+            "model": MODEL_NAME,
+            "input": input_text,
+            "encoding_format": "float",
+            "normalize": normalize,
+        }
+
+        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
+        outputs = response.json()
+
+        return torch.tensor([x["embedding"] for x in outputs["data"]])
+
+    default = await get_outputs(normalize=None)
+    w_normal = await get_outputs(normalize=True)
+    wo_normal = await get_outputs(normalize=False)
+
+    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
+    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
+        "wo_normal should not use normal."
+    )
+    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
+        "w_normal should be close to normal(wo_normal)."
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
+    task = "embed"
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_text,
+            "encoding_format": "float",
+            "task": task,
+        },
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 384
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
+    task = "token_embed"
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_text,
+            "encoding_format": "float",
+            "task": task,
+        },
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 11
+    assert len(poolings.data[0].data[0]) == 384
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
+async def test_pooling_not_supported(
+    server: RemoteOpenAIServer, model_name: str, task: str
+):
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": "test",
+            "encoding_format": "float",
+            "task": task,
+        },
+    )
+    assert response.json()["error"]["type"] == "BadRequestError"
+    assert response.json()["error"]["message"].startswith(
+        f"Task {task} is not supported"
+    )
--- a/tests/entrypoints/pooling/embed/test_online_dimensions.py
+++ b/tests/entrypoints/pooling/embed/test_online_dimensions.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
+"""
+
+import openai
+import pytest
+
+from tests.conftest import HfRunner
+from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
+from tests.models.utils import EmbedModelInfo
+from tests.utils import RemoteOpenAIServer
+from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
+from vllm.platforms import current_platform
+
+if current_platform.is_rocm():
+    pytest.skip(
+        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
+    )
+
+MODELS = [
+    EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
+    EmbedModelInfo(
+        "Snowflake/snowflake-arctic-embed-m-v1.5",
+        is_matryoshka=True,
+        matryoshka_dimensions=[256],
+    ),
+]
+
+input_texts = [
+    "The chef prepared a delicious meal.",
+]
+
+
+@pytest.fixture(scope="module", params=MODELS)
+def model_info(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=["bfloat16"])
+def dtype(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def server(model_info, dtype: str):
+    args = [
+        "--runner",
+        "pooling",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        dtype,
+        "--enforce-eager",
+        "--max-model-len",
+        "512",
+    ]
+
+    if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5":
+        # Manually enable Matryoshka Embeddings
+        args.extend(
+            ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
+        )
+
+    with RemoteOpenAIServer(model_info.name, args) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="module")
+def hf_model(hf_runner, model_info, dtype: str):
+    with hf_runner(
+        model_info.name, dtype=dtype, is_sentence_transformer=True
+    ) as hf_model:
+        yield hf_model
+
+
+@pytest.mark.asyncio
+async def test_matryoshka(
+    model_info: EmbedModelInfo, server: RemoteOpenAIServer, hf_model: HfRunner
+):
+    client = server.get_async_client()
+
+    async def make_request_and_correctness_test(dimensions):
+        prompts = input_texts * 3
+
+        embedding_response = await client.embeddings.create(
+            model=model_info.name,
+            input=prompts,
+            dimensions=dimensions,
+            encoding_format="float",
+        )
+        embeddings = EmbeddingResponse.model_validate(
+            embedding_response.model_dump(mode="json")
+        )
+
+        assert embeddings.id is not None
+        assert len(embeddings.data) == 3
+        assert len(embeddings.data[0].embedding) > 0
+        assert embeddings.usage.completion_tokens == 0
+        assert embeddings.usage.prompt_tokens > 0
+        assert embeddings.usage.total_tokens > 0
+
+        if dimensions is not None:
+            assert len(embeddings.data[0].embedding) == dimensions
+
+        vllm_outputs = [d.embedding for d in embeddings.data]
+        run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)
+
+    if model_info.is_matryoshka:
+        valid_dimensions: list[int | None] = [None]
+        if model_info.matryoshka_dimensions is not None:
+            valid_dimensions += model_info.matryoshka_dimensions[:2]
+
+        for dimensions in valid_dimensions:
+            await make_request_and_correctness_test(dimensions)
+
+        invalid_dimensions: list[int | None] = [-1]
+        if model_info.matryoshka_dimensions is not None:
+            assert 5 not in model_info.matryoshka_dimensions
+            invalid_dimensions.append(5)
+
+        for dimensions in invalid_dimensions:
+            with pytest.raises(openai.BadRequestError):
+                await make_request_and_correctness_test(dimensions)
+
+    else:
+        for dimensions in [None]:
+            await make_request_and_correctness_test(dimensions)
+
+        for dimensions in [-1, 16]:
+            with pytest.raises(openai.BadRequestError):
+                await make_request_and_correctness_test(dimensions)
--- a/tests/entrypoints/pooling/embed/test_online_long_text.py
+++ b/tests/entrypoints/pooling/embed/test_online_long_text.py
@@ -0,0 +1,458 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test cases for long text embedding with automatic chunking mechanism.
+
+This test suite validates vLLM's automatic chunking functionality for handling
+text inputs that exceed the model's maximum token length, specifically targeting
+the intfloat/multilingual-e5-small model (max token length: 512).
+"""
+
+import random
+
+import openai
+import pytest
+import pytest_asyncio
+
+from tests.utils import RemoteOpenAIServer
+from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
+from vllm.platforms import current_platform
+
+if current_platform.is_rocm():
+    pytest.skip(
+        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
+    )
+
+
+def _generate_random_text(word_count: int) -> str:
+    """Generate random text with approximately the specified word count."""
+    # Common English words with focus on verbs and nouns for realistic text
+    common_words = [
+        # Essential articles and pronouns (minimal)
+        "the",
+        "and",
+        "you",
+        "they",
+        "this",
+        "that",
+        "these",
+        "those",
+        # Action verbs
+        "create",
+        "build",
+        "develop",
+        "design",
+        "implement",
+        "execute",
+        "analyze",
+        "process",
+        "generate",
+        "calculate",
+        "evaluate",
+        "optimize",
+        "transform",
+        "integrate",
+        "configure",
+        "deploy",
+        "monitor",
+        "manage",
+        "discover",
+        "explore",
+        "investigate",
+        "research",
+        "study",
+        "examine",
+        "improve",
+        "enhance",
+        "upgrade",
+        "modify",
+        "update",
+        "maintain",
+        "solve",
+        "resolve",
+        "handle",
+        "address",
+        "tackle",
+        "overcome",
+        "communicate",
+        "collaborate",
+        "coordinate",
+        "organize",
+        "plan",
+        "achieve",
+        "accomplish",
+        "complete",
+        "finish",
+        "deliver",
+        "provide",
+        # Technology and science nouns
+        "system",
+        "application",
+        "software",
+        "hardware",
+        "network",
+        "database",
+        "algorithm",
+        "model",
+        "framework",
+        "platform",
+        "interface",
+        "protocol",
+        "architecture",
+        "infrastructure",
+        "component",
+        "module",
+        "service",
+        "technology",
+        "innovation",
+        "solution",
+        "methodology",
+        "approach",
+        "artificial",
+        "intelligence",
+        "machine",
+        "learning",
+        "neural",
+        "network",
+        "computer",
+        "processor",
+        "memory",
+        "storage",
+        "computation",
+        "data",
+        "information",
+        "knowledge",
+        "insight",
+        "pattern",
+        "trend",
+        "analysis",
+        "research",
+        "development",
+        "engineering",
+        "science",
+        "mathematics",
+        "statistics",
+        "probability",
+        "optimization",
+        "performance",
+        "efficiency",
+        # General nouns
+        "project",
+        "team",
+        "organization",
+        "company",
+        "business",
+        "industry",
+        "market",
+        "customer",
+        "user",
+        "client",
+        "product",
+        "feature",
+        "function",
+        "requirement",
+        "specification",
+        "documentation",
+        "report",
+        "result",
+        "outcome",
+        "impact",
+        "benefit",
+        "advantage",
+        "challenge",
+        "problem",
+        "opportunity",
+        "strategy",
+        "goal",
+        "objective",
+        "target",
+        "milestone",
+        "process",
+        "procedure",
+        "workflow",
+        "pipeline",
+        "operation",
+        "task",
+        "activity",
+        "event",
+        "session",
+        "meeting",
+        "discussion",
+        "decision",
+    ]
+
+    words = []
+    for _ in range(word_count):
+        words.append(random.choice(common_words))
+
+    # Add some punctuation for more realistic text
+    text = " ".join(words)
+    # Add periods every 10-20 words
+    words_list = text.split()
+    result = []
+    for i, word in enumerate(words_list):
+        result.append(word)
+        if (i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1:
+            result[-1] += "."
+
+    return " ".join(result)
+
+
+MODEL_NAME = "intfloat/multilingual-e5-small"
+DTYPE = "bfloat16"
+
+# Test text: Generate text with approximately 1500 words to exceed 1024 tokens
+LONG_TEXT_1500_WORDS = _generate_random_text(1500)
+
+# Test text: Generate text with approximately 2500 words to exceed 2048 tokens
+LONG_TEXT_2500_WORDS = _generate_random_text(2500)
+
+
+@pytest.fixture(scope="module")
+def server_with_chunked_processing():
+    """Start server with automatic chunking processing enabled."""
+    args = [
+        "--runner",
+        "pooling",
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--max-model-len",
+        "512",  # Set smaller max_model_len to trigger chunking mechanism
+        "--pooler-config",
+        (
+            '{"pooling_type": "MEAN", "normalize": true, '
+            '"enable_chunked_processing": true, "max_embed_len": 10000}'
+        ),
+        "--gpu-memory-utilization",
+        "0.8",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client_with_chunked_processing(server_with_chunked_processing):
+    """Create async client with chunking processing support."""
+    async with server_with_chunked_processing.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_long_text_embedding_1500_chars(
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
+    """Test embedding processing for ~1500 character long text
+    (~1028 tokens, exceeding 512 token limit)."""
+
+    # Verify text length
+    # Verify text has sufficient word count (approximately 1500 words)
+    word_count = len(LONG_TEXT_1500_WORDS.split())
+    assert word_count >= 1400, f"Test text word count insufficient: {word_count} words"
+
+    # Send embedding request
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[LONG_TEXT_1500_WORDS],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert (
+        len(embeddings.data[0].embedding) == 384
+    )  # multilingual-e5-small embedding dimension
+    assert embeddings.usage.completion_tokens == 0
+    # Due to chunked processing, token count should
+    # reflect actual processed tokens
+    # With ~1500 words, we expect roughly
+    # 1024+ tokens (exceeding 512 token limit)
+    # Should exceed single chunk limit of 512
+    assert embeddings.usage.prompt_tokens > 800
+    assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
+
+    # Verify embedding vector validity
+    embedding_vector = embeddings.data[0].embedding
+    assert all(isinstance(x, float) for x in embedding_vector), (
+        "Embedding vector should contain floats"
+    )
+    assert not all(x == 0 for x in embedding_vector), (
+        "Embedding vector should not be all zeros"
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_long_text_embedding_2500_chars(
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
+    """Test embedding processing for ~2500 character long text
+    (~2048 tokens, requiring multiple chunks)."""
+
+    # Verify text length
+    # Verify text has sufficient word count (approximately 2500 words)
+    word_count = len(LONG_TEXT_2500_WORDS.split())
+    assert word_count >= 2300, f"Test text word count insufficient: {word_count} words"
+
+    # Send embedding request
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[LONG_TEXT_2500_WORDS],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert (
+        len(embeddings.data[0].embedding) == 384
+    )  # multilingual-e5-small embedding dimension
+    assert embeddings.usage.completion_tokens == 0
+    # Due to chunked processing, token count should
+    # reflect actual processed tokens
+    # With ~2500 words, we expect
+    # roughly 2048+ tokens (requiring multiple chunks)
+    # Should require multiple chunks for processing
+    assert embeddings.usage.prompt_tokens > 1500
+    assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
+
+    # Verify embedding vector validity
+    embedding_vector = embeddings.data[0].embedding
+    assert all(isinstance(x, float) for x in embedding_vector), (
+        "Embedding vector should contain floats"
+    )
+    assert not all(x == 0 for x in embedding_vector), (
+        "Embedding vector should not be all zeros"
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_long_text_embedding(
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
+    """Test batch long text embedding processing."""
+
+    input_texts = [
+        LONG_TEXT_1500_WORDS,
+        LONG_TEXT_2500_WORDS,
+        "This is a short text test.",  # Short text for comparison
+    ]
+
+    # Send batch embedding request
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 3  # Three input texts
+
+    # Verify each embedding dimension
+    for i, embedding_data in enumerate(embeddings.data):
+        assert len(embedding_data.embedding) == 384
+        assert embedding_data.index == i
+
+        # Verify embedding vector validity
+        embedding_vector = embedding_data.embedding
+        assert all(isinstance(x, float) for x in embedding_vector)
+        assert not all(x == 0 for x in embedding_vector)
+
+    # Verify token usage
+    assert embeddings.usage.completion_tokens == 0
+    # Total token count should be very substantial
+    assert embeddings.usage.prompt_tokens > 1000
+    assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chunked_vs_normal_consistency(
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
+    """Test consistency between chunked and
+    normal processing (using short text)."""
+
+    # Use a short text within the 512 token limit
+    short_text = (
+        "Artificial intelligence technology is changing our world, "
+        "bringing unprecedented opportunities and challenges."
+    )
+
+    # Send embedding request
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[short_text],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 384
+    assert embeddings.usage.completion_tokens == 0
+    # Short text should not require chunked processing
+    assert embeddings.usage.prompt_tokens < 512
+    assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
+
+    # 验证embedding向量的有效性
+    embedding_vector = embeddings.data[0].embedding
+    assert all(isinstance(x, float) for x in embedding_vector)
+    assert not all(x == 0 for x in embedding_vector)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chunked_processing_response_format(
+    client_with_chunked_processing: openai.AsyncOpenAI, model_name: str
+):
+    """Test response format and structure during chunked processing."""
+
+    # Test with long text to trigger chunking
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[LONG_TEXT_1500_WORDS],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json")
+    )
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert embeddings.data[0].object == "embedding"
+    assert embeddings.data[0].index == 0
+
+    # Verify embedding vector properties
+    embedding_vector = embeddings.data[0].embedding
+    import math
+
+    vector_norm = math.sqrt(sum(x * x for x in embedding_vector))
+    # Check that the vector is normalized
+    # (default behavior for most embedding models)
+    assert 0.8 < vector_norm < 1.2, (
+        f"Vector norm should be reasonable, actual: {vector_norm}"
+    )
--- a/tests/entrypoints/pooling/embed/test_online_vision.py
+++ b/tests/entrypoints/pooling/embed/test_online_vision.py
@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+import requests
+from transformers import AutoProcessor
+
+from tests.utils import VLLM_PATH, RemoteOpenAIServer
+from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
+from vllm.multimodal.base import MediaWithBytes
+from vllm.multimodal.utils import encode_image_base64, fetch_image
+
+MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
+MAXIMUM_IMAGES = 2
+
+vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja"
+assert vlm2vec_jinja_path.exists()
+
+# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
+TEST_IMAGE_ASSETS = [
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    "Grayscale_8bits_palette_sample_image.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
+    "1280px-Venn_diagram_rgb.svg.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
+    "RGBA_comp.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
+]
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--runner",
+        "pooling",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "5",
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--limit-mm-per-prompt",
+        json.dumps({"image": MAXIMUM_IMAGES}),
+        "--chat-template",
+        str(vlm2vec_jinja_path),
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="session")
+def base64_encoded_image(local_asset_server) -> dict[str, str]:
+    return {
+        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
+        for image_url in TEST_IMAGE_ASSETS
+    }
+
+
+def get_hf_prompt_tokens(model_name, content, image_url):
+    processor = AutoProcessor.from_pretrained(
+        model_name, trust_remote_code=True, num_crops=4
+    )
+
+    placeholder = "<|image_1|> "
+    prompt = f"{placeholder}{content}"
+    image = fetch_image(image_url)
+    # Unwrap MediaWithBytes if present
+    if isinstance(image, MediaWithBytes):
+        image = image.media
+    images = [image]
+    inputs = processor(prompt, images, return_tensors="pt")
+    return inputs.input_ids.shape[1]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
+async def test_image_embedding(
+    server: RemoteOpenAIServer, model_name: str, image_url: str
+):
+    content_text = "Represent the given image."
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "text", "text": content_text},
+            ],
+        }
+    ]
+
+    response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={"model": model_name, "messages": messages, "encoding_format": "float"},
+    )
+    response.raise_for_status()
+    embeddings = EmbeddingResponse.model_validate(response.json())
+
+    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 3072
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == hf_prompt_tokens
+    assert embeddings.usage.total_tokens == hf_prompt_tokens