[Model] Support pooling models (#3122)

### What this PR does / why we need it?

Support pooling models (like `bge-reranker-v2-m3`) in vllm-ascend. This
PR covers the three embedding pooling types (cls_token, mean_tokens,
lasttoken).
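
For example, an embedding model can be run offline through the pooling
runner. This is a minimal sketch based on the embedding e2e test added in
this PR (model name and prompts are taken from that test):

```python
from vllm import LLM

# Minimal sketch: offline embedding with the pooling runner
# (model and prompts mirror the new embedding e2e test).
llm = LLM(model="Qwen/Qwen3-Embedding-0.6B", runner="pooling")
outputs = llm.embed(["What is the capital of China?", "Explain gravity"])
embeddings = [o.outputs.embedding for o in outputs]
```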

Since this
[commit](17373dcd93),
vLLM supports pooling models on the v1 engine. This PR adds the
corresponding adaptations on the vllm-ascend side.
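
A reranker such as `bge-reranker-v2-m3` can then be exercised through
`LLM.score()`. This sketch mirrors the new score e2e test (model and texts
are taken from that test):

```python
from vllm import LLM

# Minimal sketch: cross-encoder reranking via LLM.score()
# (model and texts mirror the new score e2e test).
llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling", dtype="half")
outputs = llm.score("What is the capital of France?",
                    ["The capital of France is Paris.",
                     "The capital of Germany is Berlin."])
scores = [o.outputs.score for o in outputs]
```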

Fixes #1960

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
lianyibo authored 2025-12-10 11:37:57 +08:00, committed by GitHub
parent 1a7a34c5ec
commit e32014ac1d
17 changed files with 577 additions and 338 deletions


@@ -26,7 +26,7 @@ import shlex
import subprocess
import sys
import time
from typing import Any, List, Optional, Tuple, TypeVar, Union
from typing import Any, Optional, Tuple, TypeVar, Union
import httpx
import numpy as np
@@ -42,7 +42,8 @@ from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
BatchEncoding, BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm import LLM, SamplingParams
from vllm.config.model import _get_and_verify_dtype
from vllm.config.model import (ConvertOption, RunnerOption,
_get_and_verify_dtype)
from vllm.inputs import TextPrompt
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
@@ -67,7 +68,7 @@ from vllm.distributed.parallel_state import ( # noqa E402
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
_M = TypeVar("_M")
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
@@ -320,12 +321,11 @@ class VllmRunner:
def __init__(
self,
model_name: str,
runner: str = "auto",
runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
tokenizer_name: Optional[str] = None,
tokenizer_mode: str = "auto",
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
max_model_len: int = 1024,
max_model_len: Optional[int] = 1024,
dtype: str = "auto",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
@@ -339,6 +339,7 @@ class VllmRunner:
self.model = LLM(
model=model_name,
runner=runner,
convert=convert,
tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode,
trust_remote_code=True,
@@ -356,73 +357,79 @@ class VllmRunner:
def get_inputs(
self,
prompts: List[str],
prompts: Union[list[str], list[torch.Tensor], list[int]],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> List[TextPrompt]:
if images is not None:
assert len(prompts) == len(images)
) -> list[TextPrompt]:
if videos is not None:
assert len(prompts) == len(videos)
if any(x is not None and len(x) != len(prompts)
for x in [images, videos, audios]):
raise ValueError(
"All non-None multimodal inputs must have the same length as "
"prompts")
if audios is not None:
assert len(prompts) == len(audios)
inputs = []
for i, prompt in enumerate(prompts):
multi_modal_data = {}
if images is not None and (image := images[i]) is not None:
multi_modal_data["image"] = image
if videos is not None and (video := videos[i]) is not None:
multi_modal_data["video"] = video
if audios is not None and (audio := audios[i]) is not None:
multi_modal_data["audio"] = audio
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
if image is not None:
inputs[i]["multi_modal_data"] = {"image": image}
text_prompt_kwargs: dict[str, Any] = {
"multi_modal_data": multi_modal_data or None
}
if isinstance(prompt, str):
text_prompt_kwargs["prompt"] = prompt
elif isinstance(prompt, list):
text_prompt_kwargs["prompt_token_ids"] = prompt
else:
text_prompt_kwargs["prompt_embeds"] = prompt
if videos is not None:
for i, video in enumerate(videos):
if video is not None:
inputs[i]["multi_modal_data"] = {"video": video}
if audios is not None:
for i, audio in enumerate(audios):
if audio is not None:
inputs[i]["multi_modal_data"] = {"audio": audio}
inputs.append(TextPrompt(**text_prompt_kwargs))
return inputs
def generate(
self,
prompts: List[str],
prompts: Union[list[str], list[torch.Tensor]],
sampling_params: SamplingParams,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> List[Tuple[List[List[int]], List[str]]]:
**kwargs: Any,
) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
sampling_params=sampling_params,
**kwargs)
outputs: List[Tuple[List[List[int]], List[str]]] = []
outputs: list[tuple[list[list[int]], list[str]]] = []
for req_output in req_outputs:
prompt_str = req_output.prompt
prompt_ids = req_output.prompt_token_ids
req_sample_output_ids: List[List[int]] = []
req_sample_output_strs: List[str] = []
req_sample_output_ids: list[list[int]] = []
req_sample_output_strs: list[str] = []
for sample in req_output.outputs:
output_str = sample.text
output_ids = list(sample.token_ids)
req_sample_output_ids.append(prompt_ids + output_ids)
req_sample_output_strs.append(prompt_str + output_str)
req_sample_output_strs.append((prompt_str or "") + output_str)
outputs.append((req_sample_output_ids, req_sample_output_strs))
return outputs
@staticmethod
def _final_steps_generate_w_logprobs(
req_outputs: List[RequestOutput],
) -> List[TokensTextLogprobsPromptLogprobs]:
outputs: List[TokensTextLogprobsPromptLogprobs] = []
req_outputs: list[RequestOutput],
) -> list[TokensTextLogprobsPromptLogprobs]:
outputs: list[TokensTextLogprobsPromptLogprobs] = []
for req_output in req_outputs:
assert len(req_output.outputs) > 0
for sample in req_output.outputs:
@@ -435,20 +442,22 @@ class VllmRunner:
def generate_w_logprobs(
self,
prompts: List[str],
prompts: list[str],
sampling_params: SamplingParams,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
) -> Union[List[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]:
**kwargs: Any,
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
sampling_params=sampling_params,
**kwargs)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
@@ -459,34 +468,37 @@ class VllmRunner:
def generate_greedy(
self,
prompts: List[str],
prompts: Union[list[str], list[torch.Tensor]],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> List[Tuple[List[int], str]]:
**kwargs: Any,
) -> list[tuple[list[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts,
greedy_params,
images=images,
videos=videos,
audios=audios)
audios=audios,
**kwargs)
return [(output_ids[0], output_str[0])
for output_ids, output_str in outputs]
def generate_greedy_logprobs(
self,
prompts: List[str],
prompts: list[str],
max_tokens: int,
num_logprobs: int,
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int] = None,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
stop_token_ids: Optional[List[int]] = None,
stop: Optional[List[str]] = None,
) -> Union[List[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]:
stop_token_ids: Optional[list[int]] = None,
stop: Optional[list[str]] = None,
**kwargs: Any,
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
@@ -499,23 +511,46 @@ class VllmRunner:
greedy_logprobs_params,
images=images,
audios=audios,
videos=videos)
videos=videos,
**kwargs)
def encode(
self,
prompts: List[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> List[List[float]]:
def classify(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.model.classify(prompts)
return [req_output.outputs.probs for req_output in req_outputs]
def embed(self,
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
*args,
**kwargs) -> list[list[float]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
req_outputs = self.model.embed(inputs)
req_outputs = self.model.embed(inputs, *args, **kwargs)
return [req_output.outputs.embedding for req_output in req_outputs]
def encode(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.model.encode(prompts)
return [req_output.outputs.data for req_output in req_outputs]
def reward(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.model.reward(prompts)
return [req_output.outputs.data for req_output in req_outputs]
def score(
self,
text_1: Union[str, list[str]],
text_2: Union[str, list[str]],
*args,
**kwargs,
) -> list[float]:
req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
return [req_output.outputs.score for req_output in req_outputs]
def __enter__(self):
return self
@@ -635,10 +670,79 @@ class HfRunner:
if skip_tokenizer_init:
self.tokenizer = self.processor.tokenizer
def get_inputs(
self,
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> list[Union[BatchFeature, BatchEncoding]]:
if images is not None:
assert len(prompts) == len(images)
if videos is not None:
assert len(prompts) == len(videos)
if audios is not None:
assert len(prompts) == len(audios)
all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
for i, prompt in enumerate(prompts):
processor_kwargs: dict[str, Any] = {
"text": prompt,
"return_tensors": "pt",
}
if images is not None and (image := images[i]) is not None:
processor_kwargs["images"] = image
if videos is not None and (video := videos[i]) is not None:
processor_kwargs["videos"] = video
if audios is not None and (audio_inputs := audios[i]) is not None:
# HACK - not all processors take sampling_rate; we should
# clean this up in the future.
if len(audio_inputs) == 2:
audio, sr = audio_inputs
processor_kwargs["audio"] = audio
processor_kwargs["sampling_rate"] = sr
else:
processor_kwargs["audio"] = audio_inputs
inputs = self.processor(**processor_kwargs)
if isinstance(inputs, BatchFeature):
inputs = inputs.to(dtype=self.dtype)
all_inputs.append(inputs)
return all_inputs
def classify(self, prompts: list[str]) -> list[str]:
# output is final logits
all_inputs = self.get_inputs(prompts)
outputs = []
problem_type = getattr(self.config, "problem_type", "")
for inputs in all_inputs:
output = self.model(**self.wrap_device(inputs))
if problem_type == "regression":
logits = output.logits[0].tolist()
elif problem_type == "multi_label_classification":
logits = output.logits.sigmoid()[0].tolist()
else:
logits = output.logits.softmax(dim=-1)[0].tolist()
outputs.append(logits)
return outputs
def encode(self, prompts: list[str], *args,
**kwargs) -> list[list[torch.Tensor]]:
return self.model.encode(prompts, *args, **kwargs)
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
return self.model.predict(prompts,
*args,
convert_to_tensor=True,
**kwargs)
def __enter__(self):
return self
@@ -652,7 +756,7 @@ def ilama_lora_files():
return snapshot_download(repo_id="vllm-ascend/ilama-text2sql-spider")
def qwen_prompt(questions: List[str]) -> List[str]:
def qwen_prompt(questions: list[str]) -> list[str]:
placeholder = "<|image_pad|>"
return [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"


@@ -0,0 +1,34 @@
import torch
from modelscope import snapshot_download # type: ignore[import-untyped]
from transformers import AutoModelForSequenceClassification
from tests.e2e.conftest import HfRunner, VllmRunner
def test_classify_correctness() -> None:
model_name = snapshot_download("Howeee/Qwen2.5-1.5B-apeach")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is what",
]
with VllmRunner(
model_name,
runner="pooling",
max_model_len=None,
cudagraph_capture_sizes=[4],
) as vllm_runner:
vllm_outputs = vllm_runner.classify(prompts)
with HfRunner(model_name,
dtype="float32",
auto_cls=AutoModelForSequenceClassification) as hf_runner:
hf_outputs = hf_runner.classify(prompts)
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
assert torch.allclose(hf_output, vllm_output, 1e-2)


@@ -16,22 +16,32 @@
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
import pytest
from modelscope import snapshot_download # type: ignore[import-untyped]
from tests.e2e.conftest import HfRunner, VllmRunner
from tests.e2e.utils import check_embeddings_close
MODELS = [
"Qwen/Qwen3-Embedding-0.6B", # lasttoken
"BAAI/bge-small-en-v1.5", # cls_token
"intfloat/multilingual-e5-small" # mean_tokens
]
def test_embed_models_correctness():
@pytest.mark.parametrize("model", MODELS)
def test_embed_models_correctness(model: str):
queries = ['What is the capital of China?', 'Explain gravity']
model_name = snapshot_download("Qwen/Qwen3-Embedding-0.6B")
model_name = snapshot_download(model)
with VllmRunner(
model_name,
runner="pooling",
enforce_eager=False,
max_model_len=None,
cudagraph_capture_sizes=[4],
) as vllm_runner:
vllm_outputs = vllm_runner.encode(queries)
vllm_outputs = vllm_runner.embed(queries)
with HfRunner(
model_name,


@@ -0,0 +1,187 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import torch.nn.functional as F
from modelscope import snapshot_download # type: ignore[import-untyped]
from tests.e2e.conftest import HfRunner, VllmRunner
CROSS_ENCODER_MODELS = [
"dengcao/ms-marco-MiniLM-L6-v2", # Bert
"BAAI/bge-reranker-v2-m3", # Roberta
]
EMBEDDING_MODELS = [
"sentence-transformers/all-MiniLM-L12-v2",
]
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
DTYPE = "half"
@pytest.fixture(scope="module", params=CROSS_ENCODER_MODELS)
def model_name(request):
yield snapshot_download(request.param)
def test_cross_encoder_1_to_1(model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
with HfRunner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict([text_pair]).tolist()
with VllmRunner(model_name,
runner="pooling",
dtype=DTYPE,
cudagraph_capture_sizes=[4],
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
def test_cross_encoder_1_to_N(model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
with HfRunner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with VllmRunner(model_name,
runner="pooling",
dtype=DTYPE,
cudagraph_capture_sizes=[4],
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_cross_encoder_N_to_N(model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
with HfRunner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with VllmRunner(model_name,
runner="pooling",
dtype=DTYPE,
cudagraph_capture_sizes=[4],
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
def emb_model_name(request):
yield snapshot_download(request.param)
def test_embedding_1_to_1(emb_model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
with HfRunner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = hf_model.encode(text_pair)
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
]
with VllmRunner(emb_model_name,
runner="pooling",
dtype=DTYPE,
cudagraph_capture_sizes=[4],
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
def test_embedding_1_to_N(emb_model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
with HfRunner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
with VllmRunner(emb_model_name,
runner="pooling",
dtype=DTYPE,
cudagraph_capture_sizes=[4],
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_embedding_N_to_N(emb_model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
with HfRunner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
with VllmRunner(emb_model_name,
runner="pooling",
dtype=DTYPE,
cudagraph_capture_sizes=[4],
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)


@@ -1,49 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
from modelscope import snapshot_download # type: ignore[import-untyped]
from tests.e2e.conftest import HfRunner, VllmRunner
from tests.e2e.utils import check_embeddings_close
def test_bge_model_correctness():
queries = ['What is the capital of China?', 'Explain gravity']
model_name = snapshot_download("BAAI/bge-m3")
with VllmRunner(
model_name,
runner="pooling",
enforce_eager=True,
) as vllm_runner:
vllm_outputs = vllm_runner.encode(queries)
with HfRunner(
model_name,
dtype="float32",
is_sentence_transformer=True,
) as hf_runner:
hf_outputs = hf_runner.encode(queries)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)


@@ -1,55 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
import os
import pytest
from tests.e2e.conftest import VllmRunner
from tests.e2e.utils import check_embeddings_close
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
MODELS = ["BAAI/bge-m3"]
@pytest.mark.parametrize("model_name", MODELS)
def test_aclgrpah_embed_models_correctness(model_name):
queries = ['What is the capital of China?', 'Explain gravity']
with VllmRunner(
model_name,
runner="pooling",
enforce_eager=False,
) as vllm_aclgraph_runner:
vllm_aclgraph_outputs = vllm_aclgraph_runner.encode(queries)
with VllmRunner(
model_name,
runner="pooling",
enforce_eager=True,
) as vllm_runner:
vllm_outputs = vllm_runner.encode(queries)
check_embeddings_close(
embeddings_0_lst=vllm_outputs,
embeddings_1_lst=vllm_aclgraph_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)