[Feat] Supports Aclgraph for bge-m3 (#3171)

### What this PR does / why we need it? [Feat] Supports Aclgraph for bge-m3 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? ``` pytest -s tests/e2e/singlecard/test_embedding.py pytest -s tests/e2e/singlecard/test_embedding_aclgraph.py ``` To start an online server with a batch size of 10, each sequence in the batch being 8192 tokens long, we set --max-num-batched-tokens=8192*10 to ensure the encoder input is not chunked: ``` vllm serve /home/data/bge-m3 --max_model_len 1024 --served-model-name "bge-m3" --task embed --host 0.0.0.0 --port 9095 --max-num-batched-tokens 81920 --compilation-config '{"cudagraph_capture_sizes":[8192, 10240, 20480, 40960, 81920]}' ``` With a batch size of 10 and a sequence length of 8192, QPS improves from 85 to 104, a 22% improvement, because much of the host-bound overhead is eliminated. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: xuyexiong <xuyexiong@huawei.com> Co-authored-by: wangyongjun <1104133197@qq.com>
2025-10-14 23:07:45 +08:00
parent 434059e417
commit 02c26dcfc7
11 changed files with 307 additions and 21 deletions
--- a/tests/e2e/singlecard/test_bge_model.py
+++ b/tests/e2e/singlecard/test_bge_model.py
@@ -0,0 +1,49 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
+#
+from modelscope import snapshot_download  # type: ignore[import-untyped]
+
+from tests.e2e.conftest import HfRunner, VllmRunner
+from tests.e2e.utils import check_embeddings_close
+
+
+def test_bge_model_correctness():
+    """Compare bge-m3 embeddings from VllmRunner with the HfRunner ones."""
+
+    prompts = ['What is the capital of China?', 'Explain gravity']
+    bge_m3 = snapshot_download("BAAI/bge-m3")
+
+    # Candidate: VllmRunner in embed mode with enforce_eager=True.
+    vllm_kwargs = {"task": "embed", "enforce_eager": True}
+    with VllmRunner(bge_m3, **vllm_kwargs) as vllm_model:
+        vllm_embeddings = vllm_model.encode(prompts)
+
+    # Reference: HfRunner in float32 with is_sentence_transformer=True.
+    hf_kwargs = {"dtype": "float32", "is_sentence_transformer": True}
+    with HfRunner(bge_m3, **hf_kwargs) as hf_model:
+        hf_embeddings = hf_model.encode(prompts)
+
+    # Slot 0 carries the reference, slot 1 the candidate; tol is 1e-2.
+    comparison = {
+        "embeddings_0_lst": hf_embeddings,
+        "embeddings_1_lst": vllm_embeddings,
+        "name_0": "hf",
+        "name_1": "vllm",
+        "tol": 1e-2,
+    }
+    check_embeddings_close(**comparison)
--- a/tests/e2e/singlecard/test_embedding_aclgraph.py
+++ b/tests/e2e/singlecard/test_embedding_aclgraph.py
@@ -0,0 +1,55 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
+#
+import os
+
+import pytest
+
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.utils import check_embeddings_close
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # set at import time
+
+MODELS = ["BAAI/bge-m3"]  # parametrize values for model_name
+
+
+@pytest.mark.parametrize("model_name", MODELS)
+def test_aclgrpah_embed_models_correctness(model_name):
+    """Compare enforce_eager=False embeddings with enforce_eager=True ones."""
+    queries = ['What is the capital of China?', 'Explain gravity']
+    with VllmRunner(
+            model_name,
+            task="embed",
+            enforce_eager=False,
+    ) as vllm_aclgraph_runner:
+        vllm_aclgraph_outputs = vllm_aclgraph_runner.encode(queries)
+    # Baseline for the comparison: the same model with enforce_eager=True.
+    with VllmRunner(
+            model_name,
+            task="embed",
+            enforce_eager=True,
+    ) as vllm_runner:
+        vllm_outputs = vllm_runner.encode(queries)
+    # Both sides are VllmRunner outputs, so neither is labelled "hf".
+    check_embeddings_close(
+        embeddings_0_lst=vllm_outputs,
+        embeddings_1_lst=vllm_aclgraph_outputs,
+        name_0="vllm_eager",
+        name_1="vllm_aclgraph",
+        tol=1e-2,
+    )