Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

@@ -0,0 +1,90 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref

import pytest

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform

if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "intfloat/multilingual-e5-small"

PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

TOKEN_IDS = [
    # Using ID={0, 1, 2, 3} results in NaN values,
    # so we add this offset of 1000
    [1000],
    [1000, 1001],
    [1000, 1002, 1001],
    [1000, 1003, 1001, 1002],
]


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )

    yield weakref.proxy(llm)

    del llm
    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
    pooling_params = [
        PoolingParams(),
        PoolingParams(),
        PoolingParams(),
        PoolingParams(),
    ]

    # Multiple PoolingParams should be matched with each prompt
    outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed")
    assert len(PROMPTS) == len(outputs)

    # An exception is raised if the number of params does not match
    # the number of prompts
    with pytest.raises(ValueError):
        outputs = llm.encode(
            PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed"
        )

    # A single PoolingParams should be applied to every prompt
    single_pooling_params = PoolingParams()
    outputs = llm.encode(
        PROMPTS, pooling_params=single_pooling_params, pooling_task="embed"
    )
    assert len(PROMPTS) == len(outputs)

    # When pooling_params is None, the default params should be applied
    outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
    assert len(PROMPTS) == len(outputs)


def test_right_side_truncation(llm: LLM):
    # Embedding models should truncate from the end (right side) of the prompt
    tokenizer = llm.get_tokenizer()
    assert tokenizer.truncation_side == "right"
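
The dispatch behavior pinned down by test_multiple_pooling_params can be summarized in a few lines. The sketch below is illustrative only (resolve_pooling_params is a hypothetical helper, not vLLM's internal code): a single PoolingParams is broadcast to every prompt, a list must match the prompt count exactly, and None falls back to defaults.

from vllm import PoolingParams

def resolve_pooling_params(prompts, pooling_params):
    # None -> default params for every prompt
    if pooling_params is None:
        pooling_params = PoolingParams()
    # A single instance is broadcast to all prompts
    if isinstance(pooling_params, PoolingParams):
        return [pooling_params] * len(prompts)
    # A list must line up with the prompts one-to-one
    if len(pooling_params) != len(prompts):
        raise ValueError("The number of pooling_params must match the number of prompts.")
    return list(pooling_params)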

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any

import openai
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform

if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
max_model_len = 128

input = """Immerse yourself in the enchanting chronicle of calculus, a
mathematical domain that has radically transformed our comprehension of
change and motion. Despite its roots in ancient civilizations, the
formal birth of calculus predominantly occurred in the 17th century,
primarily under the influential guidance of Sir Isaac Newton and Gottfried
Wilhelm Leibniz. The earliest traces of calculus concepts are found in
ancient Greek mathematics, most notably in the works of Eudoxus and
Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
technique for computing areas and volumes through the use of finite sums.
This methodology laid crucial foundational work for integral calculus.
In the 17th century, both Newton and Leibniz independently pioneered
calculus, each contributing unique perspectives that would shape this new
field."""


@pytest.fixture(scope="module")
def server():
    args = [
        "--runner",
        "pooling",
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
        str(max_model_len),
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
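
# For reference, the args above roughly correspond to launching the server by
# hand (an equivalence assumed here, not asserted by the test):
#   vllm serve sentence-transformers/all-MiniLM-L12-v2 \
#       --runner pooling --dtype bfloat16 --enforce-eager --max-model-len 128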


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
    truncation_size = 10
    kwargs: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size,
    }

    response = await client.post(path="embeddings", cast_to=object, body={**kwargs})

    assert response["usage"]["prompt_tokens"] == truncation_size


@pytest.mark.asyncio
async def test_zero_truncation_size(client: openai.AsyncOpenAI):
    truncation_size = 0
    kwargs: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size,
    }

    response = await client.post(path="embeddings", cast_to=object, body={**kwargs})

    assert response["usage"]["prompt_tokens"] == truncation_size


@pytest.mark.asyncio
async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
    truncation_size = max_model_len + 1
    kwargs: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size,
    }

    with pytest.raises(openai.BadRequestError) as err:
        await client.post(path="embeddings", cast_to=object, body={**kwargs})

    assert err.value.status_code == 400
    error_details = err.value.response.json()["error"]
    assert error_details["type"] == "BadRequestError"

    expected_message = (
        "truncate_prompt_tokens value is "
        "greater than max_model_len."
        " Please, select a smaller truncation size."
    )
    assert error_details["message"] == expected_message


@pytest.mark.asyncio
async def test_max_truncation_size(client: openai.AsyncOpenAI):
    # truncate_prompt_tokens=-1 truncates to the model's maximum length
    truncation_size = -1
    kwargs: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size,
    }

    response = await client.post(path="embeddings", cast_to=object, body={**kwargs})

    assert response["usage"]["prompt_tokens"] == max_model_len
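
The same endpoint can be exercised outside the test harness. The snippet below is a minimal sketch, assuming a vLLM server is already running at localhost:8000 and serving this model; truncate_prompt_tokens is a vLLM extension rather than part of the official OpenAI embeddings API, so it is passed through the client's extra_body.

import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.embeddings.create(
    model="sentence-transformers/all-MiniLM-L12-v2",
    input="Immerse yourself in the enchanting chronicle of calculus...",
    # vLLM-specific parameter, passed via extra_body
    extra_body={"truncate_prompt_tokens": 10},
)
print(response.usage.prompt_tokens)  # expected: 10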