Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions


@@ -0,0 +1,62 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

import pytest

from tests.models.language.pooling_mteb_test.mteb_utils import (
    MTEB_RERANK_LANGS,
    MTEB_RERANK_TASKS,
    MTEB_RERANK_TOL,
    RerankClientMtebEncoder,
    ScoreClientMtebEncoder,
    run_mteb_rerank,
)
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform

if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

st_main_score = 0.33457


@pytest.fixture(scope="module")
def server():
    args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


def test_mteb_score(server):
    url = server.url_for("score")
    encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS)

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL


def test_mteb_rerank(server):
    url = server.url_for("rerank")
    encoder = RerankClientMtebEncoder(MODEL_NAME, url)
    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS)

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
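Both tests drive the server through the MTEB encoder wrappers rather than raw HTTP. A minimal sketch of the request each scored pair is assumed to reduce to; the payload keys mirror the score endpoint tests later in this sync, and score_pair itself is a hypothetical helper, not part of the test suite:

import requests

def score_pair(base_url: str, query: str, document: str) -> float:
    # Hypothetical helper: one cross-encoder relevance score per pair.
    resp = requests.post(
        f"{base_url}/score",
        json={
            "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
            "text_1": query,
            "text_2": document,
        },
    )
    resp.raise_for_status()
    return resp.json()["data"][0]["score"]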


@@ -0,0 +1,67 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref

import pytest
import torch

from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform

if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )

    yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


def test_pooling_params(llm: LLM):
    def get_outputs(use_activation):
        text_1 = "What is the capital of France?"
        text_2 = "The capital of France is Paris."

        outputs = llm.score(
            text_1,
            text_2,
            pooling_params=PoolingParams(use_activation=use_activation),
            use_tqdm=False,
        )
        return torch.tensor([x.outputs.score for x in outputs])

    default = get_outputs(use_activation=None)
    w_activation = get_outputs(use_activation=True)
    wo_activation = get_outputs(use_activation=False)

    assert torch.allclose(default, w_activation, atol=1e-2), (
        "Default should use activation."
    )
    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
        "wo_activation should not use activation."
    )
    assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
        "w_activation should be close to activation(wo_activation)."
    )
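The three assertions pin down the use_activation contract: the default path matches use_activation=True, and the activated scores are the activation function applied to the raw scores. A standalone sketch of that contract with dummy tensors in place of real model output, assuming softmax as the activation, as the import above suggests:

import torch

# Dummy raw classifier scores standing in for llm.score(use_activation=False).
wo_activation = torch.tensor([2.0, -1.0, 0.5])
# Server-side activation applied to the raw scores.
w_activation = torch.softmax(wo_activation, dim=-1)
# use_activation=None is expected to behave like use_activation=True.
default = w_activation

assert torch.allclose(default, w_activation, atol=1e-2)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2)
assert torch.allclose(torch.softmax(wo_activation, dim=-1), w_activation, atol=1e-2)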


@@ -0,0 +1,224 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
import torch
import torch.nn.functional as F

from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse
from vllm.platforms import current_platform

if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"


@pytest.fixture(scope="module")
def server():
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
    query = "What is the capital of France?"
    documents = [
        "The capital of Brazil is Brasilia.",
        "The capital of France is Paris.",
    ]

    rerank_response = requests.post(
        server.url_for("rerank"),
        json={
            "model": model_name,
            "query": query,
            "documents": documents,
        },
    )
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    assert len(rerank.results) == 2
    assert rerank.results[0].relevance_score >= 0.9
    assert rerank.results[1].relevance_score <= 0.01


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_top_n(server: RemoteOpenAIServer, model_name: str):
    query = "What is the capital of France?"
    documents = [
        "The capital of Brazil is Brasilia.",
        "The capital of France is Paris.",
        "Cross-encoder models are neat",
    ]

    rerank_response = requests.post(
        server.url_for("rerank"),
        json={"model": model_name, "query": query, "documents": documents, "top_n": 2},
    )
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    assert len(rerank.results) == 2
    assert rerank.results[0].relevance_score >= 0.9
    assert rerank.results[1].relevance_score <= 0.01


@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
    query = "What is the capital of France?" * 100
    documents = [
        "The capital of Brazil is Brasilia.",
        "The capital of France is Paris.",
    ]

    rerank_response = requests.post(
        server.url_for("rerank"),
        json={"model": model_name, "query": query, "documents": documents},
    )
    assert rerank_response.status_code == 400
    # Assert on just a small fragment of the response
    assert "Please reduce the length of the input." in rerank_response.text


def test_invocations(server: RemoteOpenAIServer):
    query = "What is the capital of France?"
    documents = [
        "The capital of Brazil is Brasilia.",
        "The capital of France is Paris.",
    ]

    request_args = {
        "model": MODEL_NAME,
        "query": query,
        "documents": documents,
    }

    rerank_response = requests.post(server.url_for("rerank"), json=request_args)
    rerank_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    rerank_output = rerank_response.json()
    invocation_output = invocation_response.json()

    assert rerank_output.keys() == invocation_output.keys()
    for rerank_result, invocations_result in zip(
        rerank_output["results"], invocation_output["results"]
    ):
        assert rerank_result.keys() == invocations_result.keys()
        assert rerank_result["relevance_score"] == pytest.approx(
            invocations_result["relevance_score"], rel=0.05
        )
        # TODO: reset this tolerance to 0.01 once we find
        # an alternative to flash_attn with bfloat16


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
    async def get_outputs(use_activation):
        query = "What is the capital of France?"
        documents = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ]

        response = requests.post(
            server.url_for("rerank"),
            json={
                "model": model_name,
                "query": query,
                "documents": documents,
                "use_activation": use_activation,
            },
        )
        outputs = response.json()
        return torch.tensor([x["relevance_score"] for x in outputs["results"]])

    default = await get_outputs(use_activation=None)
    w_activation = await get_outputs(use_activation=True)
    wo_activation = await get_outputs(use_activation=False)

    assert torch.allclose(default, w_activation, atol=1e-2), (
        "Default should use activation."
    )
    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
        "wo_activation should not use activation."
    )
    assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
        "w_activation should be close to activation(wo_activation)."
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
    input_text = "This product was excellent and exceeded my expectations"

    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": "classify",
        },
    )
    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 1


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
    input_text = ["The chef prepared a delicious meal."]

    response = requests.post(
        server.url_for("pooling"),
        json={"model": model_name, "input": input_text, "encoding_format": "float"},
    )
    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 11
    assert len(poolings.data[0].data[0]) == 1


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": "test",
            "encoding_format": "float",
            "task": task,
        },
    )
    assert response.json()["error"]["type"] == "BadRequestError"
    assert response.json()["error"]["message"].startswith(
        f"Task {task} is not supported"
    )
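For reference, the request shape these tests exercise against the rerank endpoint; the payload keys are taken verbatim from the tests, while the address is a placeholder for wherever the vLLM server is listening:

import requests

response = requests.post(
    "http://localhost:8000/rerank",  # placeholder address
    json={
        "model": "BAAI/bge-reranker-base",
        "query": "What is the capital of France?",
        "documents": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ],
        "top_n": 2,  # optional: keep only the two most relevant documents
    },
)
response.raise_for_status()
for result in response.json()["results"]:
    print(result["relevance_score"])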


@@ -0,0 +1,265 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any

import pytest
import requests
import torch
import torch.nn.functional as F
from torch import tensor

from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
from vllm.platforms import current_platform

if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODELS = [
    {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
    {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
]

DTYPE = "half"


def run_transformers(hf_model, model, text_pairs):
    if model["is_cross_encoder"]:
        return hf_model.predict(text_pairs).tolist()
    else:
        hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
        return [
            F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0)
            for pair in hf_embeddings
        ]


@pytest.fixture(scope="class", params=MODELS)
def model(request):
    yield request.param


@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

    with RemoteOpenAIServer(model["name"], args) as remote_server:
        yield remote_server


@pytest.fixture(scope="class")
def runner(model: dict[str, Any], hf_runner):
    kwargs = {
        "dtype": DTYPE,
        "is_cross_encoder"
        if model["is_cross_encoder"]
        else "is_sentence_transformer": True,
    }

    with hf_runner(model["name"], **kwargs) as hf_model:
        yield hf_model


class TestModel:
    def test_text_1_str_text_2_list(
        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
    ):
        text_1 = "What is the capital of France?"
        text_2 = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ]

        score_response = requests.post(
            server.url_for("score"),
            json={
                "model": model["name"],
                "text_1": text_1,
                "text_2": text_2,
            },
        )
        score_response.raise_for_status()
        score = ScoreResponse.model_validate(score_response.json())

        assert score.id is not None
        assert score.data is not None
        assert len(score.data) == 2

        vllm_outputs = [d.score for d in score.data]
        text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]]
        hf_outputs = run_transformers(runner, model, text_pairs)

        for i in range(len(vllm_outputs)):
            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)

    def test_text_1_list_text_2_list(
        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
    ):
        text_1 = [
            "What is the capital of the United States?",
            "What is the capital of France?",
        ]
        text_2 = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ]

        score_response = requests.post(
            server.url_for("score"),
            json={
                "model": model["name"],
                "text_1": text_1,
                "text_2": text_2,
            },
        )
        score_response.raise_for_status()
        score = ScoreResponse.model_validate(score_response.json())

        assert score.id is not None
        assert score.data is not None
        assert len(score.data) == 2

        vllm_outputs = [d.score for d in score.data]
        text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]]
        hf_outputs = run_transformers(runner, model, text_pairs)

        for i in range(len(vllm_outputs)):
            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)

    def test_text_1_str_text_2_str(
        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
    ):
        text_1 = "What is the capital of France?"
        text_2 = "The capital of France is Paris."

        score_response = requests.post(
            server.url_for("score"),
            json={
                "model": model["name"],
                "text_1": text_1,
                "text_2": text_2,
            },
        )
        score_response.raise_for_status()
        score = ScoreResponse.model_validate(score_response.json())

        assert score.id is not None
        assert score.data is not None
        assert len(score.data) == 1

        vllm_outputs = [d.score for d in score.data]
        text_pairs = [[text_1, text_2]]
        hf_outputs = run_transformers(runner, model, text_pairs)

        for i in range(len(vllm_outputs)):
            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)

    def test_score_max_model_len(
        self, server: RemoteOpenAIServer, model: dict[str, Any]
    ):
        text_1 = "What is the capital of France?" * 20
        text_2 = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ]

        score_response = requests.post(
            server.url_for("score"),
            json={
                "model": model["name"],
                "text_1": text_1,
                "text_2": text_2,
            },
        )
        assert score_response.status_code == 400
        # Assert on just a small fragment of the response
        assert "Please reduce the length of the input." in score_response.text

        # Test truncation
        score_response = requests.post(
            server.url_for("score"),
            json={
                "model": model["name"],
                "text_1": text_1,
                "text_2": text_2,
                "truncate_prompt_tokens": 101,
            },
        )
        assert score_response.status_code == 400
        assert "Please, select a smaller truncation size." in score_response.text

    def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]):
        text_1 = "What is the capital of France?"
        text_2 = "The capital of France is Paris."

        request_args = {
            "model": model["name"],
            "text_1": text_1,
            "text_2": text_2,
        }

        score_response = requests.post(server.url_for("score"), json=request_args)
        score_response.raise_for_status()

        invocation_response = requests.post(
            server.url_for("invocations"), json=request_args
        )
        invocation_response.raise_for_status()

        score_output = score_response.json()
        invocation_output = invocation_response.json()

        assert score_output.keys() == invocation_output.keys()
        for score_data, invocation_data in zip(
            score_output["data"], invocation_output["data"]
        ):
            assert score_data.keys() == invocation_data.keys()
            assert score_data["score"] == pytest.approx(
                invocation_data["score"], rel=0.05
            )
            # TODO: reset this tolerance to 0.01 once we find
            # an alternative to flash_attn with bfloat16

    def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
        def get_outputs(use_activation):
            text_1 = "What is the capital of France?"
            text_2 = "The capital of France is Paris."

            response = requests.post(
                server.url_for("score"),
                json={
                    "model": model["name"],
                    "text_1": text_1,
                    "text_2": text_2,
                    "use_activation": use_activation,
                },
            )
            if response.status_code != 200:
                return response

            outputs = response.json()
            return torch.tensor([x["score"] for x in outputs["data"]])

        if model["is_cross_encoder"]:
            default = get_outputs(use_activation=None)
            w_activation = get_outputs(use_activation=True)
            wo_activation = get_outputs(use_activation=False)

            assert torch.allclose(default, w_activation, atol=1e-2), (
                "Default should use activation."
            )
            assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
                "wo_activation should not use activation."
            )
            assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
                "w_activation should be close to activation(wo_activation)."
            )
        else:
            get_outputs(use_activation=None)

            # The activation parameter only works for cross-encoder models
            response = get_outputs(use_activation=True)
            assert response.status_code == 400
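run_transformers above supplies the reference scores the endpoint is compared against: predict for the cross-encoder, cosine similarity of sentence embeddings otherwise. A self-contained sketch of the embedding branch, assuming sentence-transformers is installed and encoding the same pair the tests use:

import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-base-en-v1.5")
embeddings = model.encode(
    ["What is the capital of France?", "The capital of France is Paris."]
)
# Cosine similarity of the two sentence embeddings, as in run_transformers.
score = F.cosine_similarity(
    torch.tensor(embeddings[0]), torch.tensor(embeddings[1]), dim=0
)
print(float(score))  # should track the /score endpoint's output for this pair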