[E2E] Refactor the e2e testcases. (#4789)
### What this PR does / why we need it?
Refactor the e2e testcases.
- tests/e2e/multicard/test_weight_loader.py: Remove the unused code.
- tests/e2e/singlecard/multi-modal/test_internvl.py: Remove it and cover the
InternVL models in the accuracy tests (tests/e2e/models/configs/) instead.
- tests/e2e/singlecard/test_aclgraph.py: Rename the file.
- tests/e2e/singlecard/test_embedding_aclgraph.py: Combine with
tests/e2e/singlecard/test_bge_model.py.
- tests/e2e/singlecard/test_completion_with_prompt_embeds.py: Remove the
eager-mode runs and change the model to Qwen3-0.6B.
- tests/e2e/singlecard/test_quantization.py: Modify model to
Qwen3-0.6B-W8A8
- tests/e2e/singlecard/test_vlm.py: Change the model to Qwen3-VL-8B-Instruct.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: menogrey <1299267905@qq.com>
This commit is contained in:
6
.github/workflows/_e2e_test.yaml
vendored
6
.github/workflows/_e2e_test.yaml
vendored
@@ -75,7 +75,7 @@ jobs:
|
|||||||
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
|
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
|
||||||
if: ${{ inputs.type == 'light' }}
|
if: ${{ inputs.type == 'light' }}
|
||||||
run: |
|
run: |
|
||||||
# pytest -sv tests/e2e/singlecard/test_aclgraph.py
|
# pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
|
||||||
# pytest -sv tests/e2e/singlecard/test_quantization.py
|
# pytest -sv tests/e2e/singlecard/test_quantization.py
|
||||||
pytest -sv tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
|
pytest -sv tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
|
||||||
pytest -sv tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness
|
pytest -sv tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness
|
||||||
@@ -91,10 +91,9 @@ jobs:
|
|||||||
# the test separately.
|
# the test separately.
|
||||||
|
|
||||||
pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
|
pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
|
||||||
pytest -sv tests/e2e/singlecard/test_aclgraph.py
|
pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
|
||||||
pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
|
pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
|
||||||
pytest -sv tests/e2e/singlecard/test_camem.py
|
pytest -sv tests/e2e/singlecard/test_camem.py
|
||||||
# pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
|
|
||||||
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
|
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
|
||||||
# torch 2.8 doesn't work with lora, fix me
|
# torch 2.8 doesn't work with lora, fix me
|
||||||
#pytest -sv tests/e2e/singlecard/test_ilama_lora.py
|
#pytest -sv tests/e2e/singlecard/test_ilama_lora.py
|
||||||
@@ -102,7 +101,6 @@ jobs:
|
|||||||
pytest -sv tests/e2e/singlecard/test_quantization.py
|
pytest -sv tests/e2e/singlecard/test_quantization.py
|
||||||
pytest -sv tests/e2e/singlecard/test_sampler.py
|
pytest -sv tests/e2e/singlecard/test_sampler.py
|
||||||
pytest -sv tests/e2e/singlecard/test_vlm.py
|
pytest -sv tests/e2e/singlecard/test_vlm.py
|
||||||
pytest -sv tests/e2e/singlecard/multi-modal/test_internvl.py
|
|
||||||
pytest -sv tests/e2e/singlecard/test_xlite.py
|
pytest -sv tests/e2e/singlecard/test_xlite.py
|
||||||
pytest -sv tests/e2e/singlecard/pooling/
|
pytest -sv tests/e2e/singlecard/pooling/
|
||||||
|
|
||||||
|
|||||||
@@ -252,7 +252,7 @@ Run nightly multi-node test cases locally refer to section of `Running Locally`
|
|||||||
|
|
||||||
- Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)
|
- Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)
|
||||||
- Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)
|
- Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)
|
||||||
- Correctness test example: [`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)
|
- Correctness test example: [`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)
|
||||||
|
|
||||||
The CI resource is limited, and you might need to reduce the number of layers of a model. Below is an example of how to generate a reduced layer model:
|
The CI resource is limited, and you might need to reduce the number of layers of a model. Below is an example of how to generate a reduced layer model:
|
||||||
1. Fork the original model repo in modelscope. All the files in the repo except for weights are required.
|
1. Fork the original model repo in modelscope. All the files in the repo except for weights are required.
|
||||||
|
|||||||
@@ -169,11 +169,11 @@ msgstr ""
|
|||||||
#: ../../developer_guide/contribution/testing.md:246
|
#: ../../developer_guide/contribution/testing.md:246
|
||||||
msgid ""
|
msgid ""
|
||||||
"Correctness test example: "
|
"Correctness test example: "
|
||||||
"[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
|
"[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
|
||||||
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
|
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
|
"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
|
||||||
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
|
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
|
||||||
|
|
||||||
#: ../../developer_guide/contribution/testing.md:247
|
#: ../../developer_guide/contribution/testing.md:247
|
||||||
msgid ""
|
msgid ""
|
||||||
|
|||||||
11
tests/e2e/models/configs/InternVL2-8B.yaml
Normal file
11
tests/e2e/models/configs/InternVL2-8B.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
model_name: "OpenGVLab/InternVL2-8B"
|
||||||
|
runner: "linux-aarch64-a2-1"
|
||||||
|
hardware: "Atlas A2 Series"
|
||||||
|
model: "vllm-vlm"
|
||||||
|
tasks:
|
||||||
|
- name: "mmmu_val"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.58
|
||||||
|
max_model_len: 32768
|
||||||
|
trust_remote_code: True
|
||||||
11
tests/e2e/models/configs/InternVL2_5-8B.yaml
Normal file
11
tests/e2e/models/configs/InternVL2_5-8B.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
model_name: "OpenGVLab/InternVL2_5-8B"
|
||||||
|
runner: "linux-aarch64-a2-1"
|
||||||
|
hardware: "Atlas A2 Series"
|
||||||
|
model: "vllm-vlm"
|
||||||
|
tasks:
|
||||||
|
- name: "mmmu_val"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.58
|
||||||
|
max_model_len: 32768
|
||||||
|
trust_remote_code: True
|
||||||
11
tests/e2e/models/configs/InternVL3-8B.yaml
Normal file
11
tests/e2e/models/configs/InternVL3-8B.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
model_name: "OpenGVLab/InternVL3-8B"
|
||||||
|
runner: "linux-aarch64-a2-1"
|
||||||
|
hardware: "Atlas A2 Series"
|
||||||
|
model: "vllm-vlm"
|
||||||
|
tasks:
|
||||||
|
- name: "mmmu_val"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.58
|
||||||
|
max_model_len: 32768
|
||||||
|
trust_remote_code: True
|
||||||
@@ -6,6 +6,9 @@ Qwen3-VL-30B-A3B-Instruct.yaml
|
|||||||
Qwen3-VL-8B-Instruct.yaml
|
Qwen3-VL-8B-Instruct.yaml
|
||||||
Qwen2.5-Omni-7B.yaml
|
Qwen2.5-Omni-7B.yaml
|
||||||
Meta-Llama-3.1-8B-Instruct.yaml
|
Meta-Llama-3.1-8B-Instruct.yaml
|
||||||
|
InternVL2-8B.yaml
|
||||||
|
InternVL2_5-8B.yaml
|
||||||
|
InternVL3-8B.yaml
|
||||||
InternVL3_5-8B.yaml
|
InternVL3_5-8B.yaml
|
||||||
ERNIE-4.5-21B-A3B-PT.yaml
|
ERNIE-4.5-21B-A3B-PT.yaml
|
||||||
gemma-3-4b-it.yaml
|
gemma-3-4b-it.yaml
|
||||||
|
|||||||
@@ -1,109 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
||||||
# Copyright 2023 The vLLM team.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
"""
|
|
||||||
Compare the outputs of vLLM with and without aclgraph.
|
|
||||||
|
|
||||||
Run `pytest tests/multicard/test_external_launcher.py`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import torch_npu
|
|
||||||
|
|
||||||
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
|
|
||||||
MODELS = ["Qwen/Qwen3-8B"]
|
|
||||||
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MOE_MODELS)
|
|
||||||
def test_external_launcher(model):
|
|
||||||
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
|
||||||
env = os.environ.copy()
|
|
||||||
# TODO: Change to 2 when ci machine has 4 cards
|
|
||||||
cmd = [
|
|
||||||
sys.executable,
|
|
||||||
str(script),
|
|
||||||
"--model",
|
|
||||||
model,
|
|
||||||
"--tp-size",
|
|
||||||
"2",
|
|
||||||
"--proc-per-node",
|
|
||||||
"2",
|
|
||||||
"--trust-remote-code",
|
|
||||||
"--enable-expert-parallel",
|
|
||||||
"--enable-sleep-mode",
|
|
||||||
"--model-weight-gib",
|
|
||||||
"20",
|
|
||||||
]
|
|
||||||
|
|
||||||
print(f"Running subprocess: {' '.join(cmd)}")
|
|
||||||
proc = subprocess.run(
|
|
||||||
cmd,
|
|
||||||
env=env,
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.STDOUT,
|
|
||||||
timeout=600,
|
|
||||||
)
|
|
||||||
output = proc.stdout.decode(errors='ignore')
|
|
||||||
|
|
||||||
print(output)
|
|
||||||
|
|
||||||
assert "TP RANKS: [0]" in output
|
|
||||||
assert "TP RANKS: [1]" in output
|
|
||||||
assert "Generated text:" in output
|
|
||||||
assert proc.returncode == 0
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
|
||||||
def test_external_launcher_dense(model):
|
|
||||||
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
|
||||||
env = os.environ.copy()
|
|
||||||
# TODO: Change to 2 when ci machine has 4 cards
|
|
||||||
cmd = [
|
|
||||||
sys.executable,
|
|
||||||
str(script),
|
|
||||||
"--model",
|
|
||||||
model,
|
|
||||||
"--tp-size",
|
|
||||||
"2",
|
|
||||||
"--proc-per-node",
|
|
||||||
"2",
|
|
||||||
"--trust-remote-code",
|
|
||||||
"--enable-sleep-mode",
|
|
||||||
"--model-weight-gib",
|
|
||||||
"20",
|
|
||||||
]
|
|
||||||
|
|
||||||
print(f"Running subprocess: {' '.join(cmd)}")
|
|
||||||
proc = subprocess.run(
|
|
||||||
cmd,
|
|
||||||
env=env,
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.STDOUT,
|
|
||||||
timeout=600,
|
|
||||||
)
|
|
||||||
output = proc.stdout.decode(errors='ignore')
|
|
||||||
|
|
||||||
print(output)
|
|
||||||
|
|
||||||
assert "TP RANKS: [0]" in output
|
|
||||||
assert "TP RANKS: [1]" in output
|
|
||||||
assert "Generated text:" in output
|
|
||||||
assert proc.returncode == 0
|
|
||||||
@@ -1,89 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
# This file is a part of the vllm-ascend project.
|
|
||||||
#
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
# Set spawn method before any torch/NPU imports to avoid fork issues
|
|
||||||
os.environ.setdefault('VLLM_WORKER_MULTIPROC_METHOD', 'spawn')
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from vllm.assets.image import ImageAsset
|
|
||||||
|
|
||||||
from tests.e2e.conftest import VllmRunner
|
|
||||||
from tests.e2e.model_utils import check_outputs_equal
|
|
||||||
|
|
||||||
MODELS = [
|
|
||||||
"OpenGVLab/InternVL2-8B",
|
|
||||||
"OpenGVLab/InternVL2_5-8B",
|
|
||||||
"OpenGVLab/InternVL3-8B",
|
|
||||||
"OpenGVLab/InternVL3_5-8B",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
|
||||||
def test_internvl_basic(model: str):
|
|
||||||
"""Test basic InternVL2 inference with single image."""
|
|
||||||
# Load test image
|
|
||||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
|
||||||
|
|
||||||
# InternVL uses chat template format
|
|
||||||
# Format: <|im_start|>user\n<image>\nQUESTION<|im_end|>\n<|im_start|>assistant\n
|
|
||||||
questions = [
|
|
||||||
"What is the content of this image?",
|
|
||||||
"Describe this image in detail.",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Build prompts with InternVL2 chat template
|
|
||||||
prompts = [
|
|
||||||
f"<|im_start|>user\n<image>\n{q}<|im_end|>\n<|im_start|>assistant\n"
|
|
||||||
for q in questions
|
|
||||||
]
|
|
||||||
images = [image] * len(prompts)
|
|
||||||
|
|
||||||
outputs = {}
|
|
||||||
for enforce_eager, mode in [(False, "eager"), (True, "graph")]:
|
|
||||||
with VllmRunner(
|
|
||||||
model,
|
|
||||||
max_model_len=8192,
|
|
||||||
limit_mm_per_prompt={"image": 4},
|
|
||||||
enforce_eager=enforce_eager,
|
|
||||||
dtype="bfloat16",
|
|
||||||
) as vllm_model:
|
|
||||||
generated_outputs = vllm_model.generate_greedy(
|
|
||||||
prompts=prompts,
|
|
||||||
images=images,
|
|
||||||
max_tokens=128,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert len(generated_outputs) == len(prompts), \
|
|
||||||
f"Expected {len(prompts)} outputs, got {len(generated_outputs)} in {mode} mode"
|
|
||||||
|
|
||||||
for i, (_, output_str) in enumerate(generated_outputs):
|
|
||||||
assert output_str, \
|
|
||||||
f"{mode.capitalize()} mode output {i} should not be empty. Prompt: {prompts[i]}"
|
|
||||||
assert len(output_str.strip()) > 0, \
|
|
||||||
f"{mode.capitalize()} mode Output {i} should have meaningful content"
|
|
||||||
|
|
||||||
outputs[mode] = generated_outputs
|
|
||||||
|
|
||||||
eager_outputs = outputs["eager"]
|
|
||||||
graph_outputs = outputs["graph"]
|
|
||||||
|
|
||||||
check_outputs_equal(outputs_0_lst=eager_outputs,
|
|
||||||
outputs_1_lst=graph_outputs,
|
|
||||||
name_0="eager mode",
|
|
||||||
name_1="graph mode")
|
|
||||||
@@ -24,7 +24,6 @@ from tests.e2e.utils import check_embeddings_close
|
|||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
"Qwen/Qwen3-Embedding-0.6B", # lasttoken
|
"Qwen/Qwen3-Embedding-0.6B", # lasttoken
|
||||||
"BAAI/bge-small-en-v1.5", # cls_token
|
|
||||||
"intfloat/multilingual-e5-small" # mean_tokens
|
"intfloat/multilingual-e5-small" # mean_tokens
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -57,3 +56,45 @@ def test_embed_models_correctness(model: str):
|
|||||||
name_1="vllm",
|
name_1="vllm",
|
||||||
tol=1e-2,
|
tol=1e-2,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bge_model_correctness():
|
||||||
|
queries = ['What is the capital of China?', 'Explain gravity']
|
||||||
|
|
||||||
|
model_name = snapshot_download("BAAI/bge-m3")
|
||||||
|
with VllmRunner(
|
||||||
|
model_name,
|
||||||
|
runner="pooling",
|
||||||
|
enforce_eager=False,
|
||||||
|
) as vllm_aclgraph_runner:
|
||||||
|
vllm_aclgraph_outputs = vllm_aclgraph_runner.embed(queries)
|
||||||
|
|
||||||
|
with VllmRunner(
|
||||||
|
model_name,
|
||||||
|
runner="pooling",
|
||||||
|
enforce_eager=True,
|
||||||
|
) as vllm_runner:
|
||||||
|
vllm_eager_outputs = vllm_runner.embed(queries)
|
||||||
|
|
||||||
|
with HfRunner(
|
||||||
|
model_name,
|
||||||
|
dtype="float32",
|
||||||
|
is_sentence_transformer=True,
|
||||||
|
) as hf_runner:
|
||||||
|
hf_outputs = hf_runner.encode(queries)
|
||||||
|
|
||||||
|
check_embeddings_close(
|
||||||
|
embeddings_0_lst=hf_outputs,
|
||||||
|
embeddings_1_lst=vllm_eager_outputs,
|
||||||
|
name_0="hf",
|
||||||
|
name_1="vllm",
|
||||||
|
tol=1e-2,
|
||||||
|
)
|
||||||
|
|
||||||
|
check_embeddings_close(
|
||||||
|
embeddings_0_lst=vllm_eager_outputs,
|
||||||
|
embeddings_1_lst=vllm_aclgraph_outputs,
|
||||||
|
name_0="eager",
|
||||||
|
name_1="aclgraph",
|
||||||
|
tol=1e-2,
|
||||||
|
)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@
|
|||||||
"""
|
"""
|
||||||
Compare the outputs of vLLM with and without aclgraph.
|
Compare the outputs of vLLM with and without aclgraph.
|
||||||
|
|
||||||
Run `pytest tests/compile/test_aclgraph.py`.
|
Run `pytest tests/compile/test_aclgraph_accuracy.py`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -36,7 +36,7 @@ MODELS = [
|
|||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("max_tokens", [32])
|
@pytest.mark.parametrize("max_tokens", [32])
|
||||||
def test_models_with_aclgraph(
|
def test_output_between_eager_and_aclgraph(
|
||||||
model: str,
|
model: str,
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -100,7 +100,7 @@ def test_models_with_aclgraph(
|
|||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("max_tokens", [32])
|
@pytest.mark.parametrize("max_tokens", [32])
|
||||||
def test_models_with_aclgraph_full_decode_only(
|
def test_output_between_eager_and_full_decode_only(
|
||||||
model: str,
|
model: str,
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -25,7 +25,7 @@ from tests.e2e.conftest import VllmRunner
|
|||||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||||
|
|
||||||
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
|
MODELS = ["Qwen/Qwen3-0.6B"]
|
||||||
|
|
||||||
|
|
||||||
def get_prompt_embeds(chat, tokenizer, embedding_layer):
|
def get_prompt_embeds(chat, tokenizer, embedding_layer):
|
||||||
@@ -37,127 +37,6 @@ def get_prompt_embeds(chat, tokenizer, embedding_layer):
|
|||||||
return prompt_embeds
|
return prompt_embeds
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_name", MODELS)
|
|
||||||
def test_single_prompt_embeds_inference(model_name):
|
|
||||||
"""Test single prompt inference with prompt embeddings."""
|
|
||||||
# Prepare prompt embeddings
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
||||||
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
|
|
||||||
embedding_layer = transformers_model.get_input_embeddings()
|
|
||||||
|
|
||||||
chat = [{
|
|
||||||
"role": "user",
|
|
||||||
"content": "Please tell me about the capital of France."
|
|
||||||
}]
|
|
||||||
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
|
|
||||||
|
|
||||||
# Run inference with prompt embeddings
|
|
||||||
with VllmRunner(
|
|
||||||
model_name,
|
|
||||||
enable_prompt_embeds=True,
|
|
||||||
enforce_eager=True,
|
|
||||||
) as vllm_runner:
|
|
||||||
outputs = vllm_runner.model.generate({
|
|
||||||
"prompt_embeds": prompt_embeds,
|
|
||||||
})
|
|
||||||
|
|
||||||
# Verify output
|
|
||||||
assert len(outputs) == 1
|
|
||||||
assert len(outputs[0].outputs) > 0
|
|
||||||
assert len(outputs[0].outputs[0].text) > 0
|
|
||||||
print(f"\n[Single Inference Output]: {outputs[0].outputs[0].text}")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_name", MODELS)
|
|
||||||
def test_batch_prompt_embeds_inference(model_name):
|
|
||||||
"""Test batch prompt inference with prompt embeddings."""
|
|
||||||
# Prepare prompt embeddings
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
||||||
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
|
|
||||||
embedding_layer = transformers_model.get_input_embeddings()
|
|
||||||
|
|
||||||
chats = [[{
|
|
||||||
"role": "user",
|
|
||||||
"content": "Please tell me about the capital of France."
|
|
||||||
}],
|
|
||||||
[{
|
|
||||||
"role": "user",
|
|
||||||
"content": "When is the day longest during the year?"
|
|
||||||
}],
|
|
||||||
[{
|
|
||||||
"role": "user",
|
|
||||||
"content": "Where is bigger, the moon or the sun?"
|
|
||||||
}]]
|
|
||||||
|
|
||||||
prompt_embeds_list = [
|
|
||||||
get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
|
|
||||||
]
|
|
||||||
|
|
||||||
# Run batch inference with prompt embeddings
|
|
||||||
with VllmRunner(
|
|
||||||
model_name,
|
|
||||||
enable_prompt_embeds=True,
|
|
||||||
enforce_eager=True,
|
|
||||||
) as vllm_runner:
|
|
||||||
outputs = vllm_runner.model.generate([{
|
|
||||||
"prompt_embeds": embeds
|
|
||||||
} for embeds in prompt_embeds_list])
|
|
||||||
|
|
||||||
# Verify outputs
|
|
||||||
assert len(outputs) == len(chats)
|
|
||||||
for i, output in enumerate(outputs):
|
|
||||||
assert len(output.outputs) > 0
|
|
||||||
assert len(output.outputs[0].text) > 0
|
|
||||||
print(f"\nQ{i+1}: {chats[i][0]['content']}")
|
|
||||||
print(f"A{i+1}: {output.outputs[0].text}")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_name", MODELS)
|
|
||||||
def test_prompt_embeds_with_aclgraph(model_name):
|
|
||||||
"""Test prompt embeddings with ACL graph enabled vs disabled."""
|
|
||||||
# Prepare prompt embeddings
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
||||||
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
|
|
||||||
embedding_layer = transformers_model.get_input_embeddings()
|
|
||||||
|
|
||||||
chat = [{"role": "user", "content": "What is the capital of China?"}]
|
|
||||||
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
|
|
||||||
|
|
||||||
# Run with ACL graph enabled (enforce_eager=False)
|
|
||||||
with VllmRunner(
|
|
||||||
model_name,
|
|
||||||
enable_prompt_embeds=True,
|
|
||||||
enforce_eager=False,
|
|
||||||
) as vllm_aclgraph_runner:
|
|
||||||
aclgraph_outputs = vllm_aclgraph_runner.model.generate({
|
|
||||||
"prompt_embeds":
|
|
||||||
prompt_embeds,
|
|
||||||
})
|
|
||||||
|
|
||||||
# Run with ACL graph disabled (enforce_eager=True)
|
|
||||||
with VllmRunner(
|
|
||||||
model_name,
|
|
||||||
enable_prompt_embeds=True,
|
|
||||||
enforce_eager=True,
|
|
||||||
) as vllm_eager_runner:
|
|
||||||
eager_outputs = vllm_eager_runner.model.generate({
|
|
||||||
"prompt_embeds":
|
|
||||||
prompt_embeds,
|
|
||||||
})
|
|
||||||
|
|
||||||
# Verify both produce valid outputs
|
|
||||||
assert len(aclgraph_outputs) == 1
|
|
||||||
assert len(eager_outputs) == 1
|
|
||||||
assert len(aclgraph_outputs[0].outputs[0].text) > 0
|
|
||||||
assert len(eager_outputs[0].outputs[0].text) > 0
|
|
||||||
|
|
||||||
print("\n[ACL Graph Output]:", aclgraph_outputs[0].outputs[0].text)
|
|
||||||
print("[Eager Output]:", eager_outputs[0].outputs[0].text)
|
|
||||||
|
|
||||||
# Note: Outputs may differ slightly due to different execution paths,
|
|
||||||
# but both should be valid responses
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_name", MODELS)
|
@pytest.mark.parametrize("model_name", MODELS)
|
||||||
def test_mixed_prompt_embeds_and_text(model_name):
|
def test_mixed_prompt_embeds_and_text(model_name):
|
||||||
"""Test mixed inputs with both prompt embeddings and text prompts."""
|
"""Test mixed inputs with both prompt embeddings and text prompts."""
|
||||||
@@ -176,7 +55,6 @@ def test_mixed_prompt_embeds_and_text(model_name):
|
|||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model_name,
|
model_name,
|
||||||
enable_prompt_embeds=True,
|
enable_prompt_embeds=True,
|
||||||
enforce_eager=True,
|
|
||||||
) as vllm_runner:
|
) as vllm_runner:
|
||||||
# Test prompt embeddings
|
# Test prompt embeddings
|
||||||
embeds_output = vllm_runner.model.generate({
|
embeds_output = vllm_runner.model.generate({
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ def test_multimodal_vl(prompt_template):
|
|||||||
]
|
]
|
||||||
images = [image] * len(img_questions)
|
images = [image] * len(img_questions)
|
||||||
prompts = prompt_template(img_questions)
|
prompts = prompt_template(img_questions)
|
||||||
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
|
with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
mm_processor_kwargs={
|
mm_processor_kwargs={
|
||||||
"min_pixels": 28 * 28,
|
"min_pixels": 28 * 28,
|
||||||
|
|||||||
Reference in New Issue
Block a user