[E2E] Refactor the e2e testcases. (#4789)

### What this PR does / why we need it?
Refactor the e2e testcases.
- tests/e2e/multicard/test_weight_loader.py: Remove the unused code.
- tests/e2e/singlecard/multi-modal/test_internvl.py: Move the InternVL
coverage to the accuracy tests.
- tests/e2e/singlecard/test_aclgraph.py: Rename the file to
test_aclgraph_accuracy.py.
- tests/e2e/singlecard/test_embedding_aclgraph.py: Combine with
tests/e2e/singlecard/test_bge_model.py.
- tests/e2e/singlecard/test_completion_with_prompt_embeds.py: Remove the
eager-mode cases and switch the model to Qwen3-0.6B.
- tests/e2e/singlecard/test_quantization.py: Switch the model to
Qwen3-0.6B-W8A8.
- tests/e2e/singlecard/test_vlm.py: Switch the model to Qwen3-VL-8B.

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: menogrey <1299267905@qq.com>
This commit is contained in:
zhangyiming
2025-12-11 10:15:00 +08:00
committed by GitHub
parent 11bebb518c
commit 66b0781840
13 changed files with 90 additions and 335 deletions

View File

@@ -75,7 +75,7 @@ jobs:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
if: ${{ inputs.type == 'light' }} if: ${{ inputs.type == 'light' }}
run: | run: |
# pytest -sv tests/e2e/singlecard/test_aclgraph.py # pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
# pytest -sv tests/e2e/singlecard/test_quantization.py # pytest -sv tests/e2e/singlecard/test_quantization.py
pytest -sv tests/e2e/singlecard/test_vlm.py::test_multimodal_vl pytest -sv tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
pytest -sv tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness pytest -sv tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness
@@ -91,10 +91,9 @@ jobs:
# the test separately. # the test separately.
pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
pytest -sv tests/e2e/singlecard/test_aclgraph.py pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
pytest -sv tests/e2e/singlecard/test_camem.py pytest -sv tests/e2e/singlecard/test_camem.py
# pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
pytest -sv tests/e2e/singlecard/test_guided_decoding.py pytest -sv tests/e2e/singlecard/test_guided_decoding.py
# torch 2.8 doesn't work with lora, fix me # torch 2.8 doesn't work with lora, fix me
#pytest -sv tests/e2e/singlecard/test_ilama_lora.py #pytest -sv tests/e2e/singlecard/test_ilama_lora.py
@@ -102,7 +101,6 @@ jobs:
pytest -sv tests/e2e/singlecard/test_quantization.py pytest -sv tests/e2e/singlecard/test_quantization.py
pytest -sv tests/e2e/singlecard/test_sampler.py pytest -sv tests/e2e/singlecard/test_sampler.py
pytest -sv tests/e2e/singlecard/test_vlm.py pytest -sv tests/e2e/singlecard/test_vlm.py
pytest -sv tests/e2e/singlecard/multi-modal/test_internvl.py
pytest -sv tests/e2e/singlecard/test_xlite.py pytest -sv tests/e2e/singlecard/test_xlite.py
pytest -sv tests/e2e/singlecard/pooling/ pytest -sv tests/e2e/singlecard/pooling/

View File

@@ -252,7 +252,7 @@ Run nightly multi-node test cases locally refer to section of `Running Locally`
- Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py) - Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)
- Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py) - Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)
- Correctness test example: [`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py) - Correctness test example: [`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)
The CI resource is limited, and you might need to reduce the number of layers of a model. Below is an example of how to generate a reduced layer model: The CI resource is limited, and you might need to reduce the number of layers of a model. Below is an example of how to generate a reduced layer model:
1. Fork the original model repo in modelscope. All the files in the repo except for weights are required. 1. Fork the original model repo in modelscope. All the files in the repo except for weights are required.

View File

@@ -169,11 +169,11 @@ msgstr ""
#: ../../developer_guide/contribution/testing.md:246 #: ../../developer_guide/contribution/testing.md:246
msgid "" msgid ""
"Correctness test example: " "Correctness test example: "
"[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-" "[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)" "project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
msgstr "" msgstr ""
"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-" "正确性测试示例:[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)" "project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
#: ../../developer_guide/contribution/testing.md:247 #: ../../developer_guide/contribution/testing.md:247
msgid "" msgid ""

View File

@@ -0,0 +1,11 @@
model_name: "OpenGVLab/InternVL2-8B"
runner: "linux-aarch64-a2-1"
hardware: "Atlas A2 Series"
model: "vllm-vlm"
tasks:
- name: "mmmu_val"
metrics:
- name: "acc,none"
value: 0.58
max_model_len: 32768
trust_remote_code: True

View File

@@ -0,0 +1,11 @@
model_name: "OpenGVLab/InternVL2_5-8B"
runner: "linux-aarch64-a2-1"
hardware: "Atlas A2 Series"
model: "vllm-vlm"
tasks:
- name: "mmmu_val"
metrics:
- name: "acc,none"
value: 0.58
max_model_len: 32768
trust_remote_code: True

View File

@@ -0,0 +1,11 @@
model_name: "OpenGVLab/InternVL3-8B"
runner: "linux-aarch64-a2-1"
hardware: "Atlas A2 Series"
model: "vllm-vlm"
tasks:
- name: "mmmu_val"
metrics:
- name: "acc,none"
value: 0.58
max_model_len: 32768
trust_remote_code: True

View File

@@ -6,6 +6,9 @@ Qwen3-VL-30B-A3B-Instruct.yaml
Qwen3-VL-8B-Instruct.yaml Qwen3-VL-8B-Instruct.yaml
Qwen2.5-Omni-7B.yaml Qwen2.5-Omni-7B.yaml
Meta-Llama-3.1-8B-Instruct.yaml Meta-Llama-3.1-8B-Instruct.yaml
InternVL2-8B.yaml
InternVL2_5-8B.yaml
InternVL3-8B.yaml
InternVL3_5-8B.yaml InternVL3_5-8B.yaml
ERNIE-4.5-21B-A3B-PT.yaml ERNIE-4.5-21B-A3B-PT.yaml
gemma-3-4b-it.yaml gemma-3-4b-it.yaml

View File

@@ -1,109 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Run the weight-loader example script in a subprocess and check its output.
Run `pytest tests/e2e/multicard/test_weight_loader.py`.
"""
import os
import subprocess
import sys
import pytest
import torch_npu
# Models that parametrize test_external_launcher, which passes
# --enable-expert-parallel to the launched script.
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
# Models that parametrize test_external_launcher_dense.
MODELS = ["Qwen/Qwen3-8B"]
# First 10 characters of the name reported for NPU device 0.
# NOTE(review): not referenced by the tests visible here -- confirm it is
# still needed.
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@pytest.mark.parametrize("model", MOE_MODELS)
def test_external_launcher(model):
    """Run the weight-loader script for ``model`` with expert parallelism.

    The script is started through ``sys.executable`` with stderr merged into
    stdout and a 600 second timeout. The test passes when both TP rank
    markers and a "Generated text:" line appear in the captured output and
    the process exits with return code 0.
    """
    # NOTE(review): absolute, machine-specific path -- confirm it is intended.
    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
    env = os.environ.copy()
    # TODO: Change to 2 when ci machine has 4 cards
    script_options = [
        "--model", model,
        "--tp-size", "2",
        "--proc-per-node", "2",
        "--trust-remote-code",
        "--enable-expert-parallel",
        "--enable-sleep-mode",
        "--model-weight-gib", "20",
    ]
    cmd = [sys.executable, str(script), *script_options]
    print(f"Running subprocess: {' '.join(cmd)}")

    completed = subprocess.run(
        cmd,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )
    output = completed.stdout.decode(errors='ignore')
    print(output)

    # Output markers are checked before the exit status, as in the original.
    for marker in ("TP RANKS: [0]", "TP RANKS: [1]", "Generated text:"):
        assert marker in output
    assert completed.returncode == 0
@pytest.mark.parametrize("model", MODELS)
def test_external_launcher_dense(model):
    """Run the weight-loader script for ``model`` without expert parallelism.

    Uses the same launch as the MoE variant minus the
    ``--enable-expert-parallel`` flag: tensor parallel size 2, two processes
    per node, sleep mode enabled. Passes when both TP rank markers and a
    "Generated text:" line are present in the merged stdout/stderr and the
    return code is 0.
    """
    # NOTE(review): absolute, machine-specific path -- confirm it is intended.
    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
    env = os.environ.copy()

    # TODO: Change to 2 when ci machine has 4 cards
    cmd = [sys.executable, str(script)]
    cmd += ["--model", model]
    cmd += ["--tp-size", "2"]
    cmd += ["--proc-per-node", "2"]
    cmd += ["--trust-remote-code", "--enable-sleep-mode"]
    cmd += ["--model-weight-gib", "20"]
    print(f"Running subprocess: {' '.join(cmd)}")

    result = subprocess.run(
        cmd,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )
    output = result.stdout.decode(errors='ignore')
    print(output)

    expected_markers = ["TP RANKS: [0]", "TP RANKS: [1]", "Generated text:"]
    for expected in expected_markers:
        assert expected in output
    assert result.returncode == 0

View File

@@ -1,89 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import os
# Set spawn method before any torch/NPU imports to avoid fork issues
os.environ.setdefault('VLLM_WORKER_MULTIPROC_METHOD', 'spawn')
import pytest
from vllm.assets.image import ImageAsset
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
# InternVL checkpoints that parametrize test_internvl_basic.
MODELS = [
"OpenGVLab/InternVL2-8B",
"OpenGVLab/InternVL2_5-8B",
"OpenGVLab/InternVL3-8B",
"OpenGVLab/InternVL3_5-8B",
]
@pytest.mark.parametrize("model", MODELS)
def test_internvl_basic(model: str):
    """Check single-image InternVL generation in graph and eager modes.

    For each checkpoint in ``MODELS`` the same two image questions are
    generated greedily twice: once with ``enforce_eager=False`` (labelled
    "graph") and once with ``enforce_eager=True`` (labelled "eager"). Every
    output must be non-empty, and the two runs are then compared with
    ``check_outputs_equal``.
    """
    # Load test image
    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

    # InternVL uses chat template format
    # Format: <|im_start|>user\n<image>\nQUESTION<|im_end|>\n<|im_start|>assistant\n
    questions = [
        "What is the content of this image?",
        "Describe this image in detail.",
    ]

    # Build prompts with the InternVL chat template
    prompts = [
        f"<|im_start|>user\n<image>\n{q}<|im_end|>\n<|im_start|>assistant\n"
        for q in questions
    ]
    images = [image] * len(prompts)

    outputs = {}
    # The labels previously were swapped relative to enforce_eager, so failure
    # messages named the wrong mode. The run order (enforce_eager=False first)
    # is kept as it was.
    for enforce_eager, mode in [(False, "graph"), (True, "eager")]:
        with VllmRunner(
                model,
                max_model_len=8192,
                limit_mm_per_prompt={"image": 4},
                enforce_eager=enforce_eager,
                dtype="bfloat16",
        ) as vllm_model:
            generated_outputs = vllm_model.generate_greedy(
                prompts=prompts,
                images=images,
                max_tokens=128,
            )

        assert len(generated_outputs) == len(prompts), \
            f"Expected {len(prompts)} outputs, got {len(generated_outputs)} in {mode} mode"

        for i, (_, output_str) in enumerate(generated_outputs):
            assert output_str, \
                f"{mode.capitalize()} mode output {i} should not be empty. Prompt: {prompts[i]}"
            # Also reject whitespace-only generations.
            assert len(output_str.strip()) > 0, \
                f"{mode.capitalize()} mode Output {i} should have meaningful content"

        outputs[mode] = generated_outputs

    eager_outputs = outputs["eager"]
    graph_outputs = outputs["graph"]

    check_outputs_equal(outputs_0_lst=eager_outputs,
                        outputs_1_lst=graph_outputs,
                        name_0="eager mode",
                        name_1="graph mode")

View File

@@ -24,7 +24,6 @@ from tests.e2e.utils import check_embeddings_close
MODELS = [ MODELS = [
"Qwen/Qwen3-Embedding-0.6B", # lasttoken "Qwen/Qwen3-Embedding-0.6B", # lasttoken
"BAAI/bge-small-en-v1.5", # cls_token
"intfloat/multilingual-e5-small" # mean_tokens "intfloat/multilingual-e5-small" # mean_tokens
] ]
@@ -57,3 +56,45 @@ def test_embed_models_correctness(model: str):
name_1="vllm", name_1="vllm",
tol=1e-2, tol=1e-2,
) )
def test_bge_model_correctness():
    """Compare BAAI/bge-m3 embeddings across aclgraph, eager and HF runs.

    The model reference comes from ``snapshot_download``. Two queries are
    embedded with the pooling runner, first with ``enforce_eager=False`` and
    then with ``enforce_eager=True``, and finally encoded through ``HfRunner``
    in float32. The HF result is checked against the eager result, and the
    eager result against the aclgraph result, both with ``tol=1e-2``.
    """
    queries = ['What is the capital of China?', 'Explain gravity']
    model_name = snapshot_download("BAAI/bge-m3")

    # Same run order as before: aclgraph (enforce_eager=False) first.
    vllm_embeddings = {}
    for label, eager_flag in (("aclgraph", False), ("eager", True)):
        with VllmRunner(
                model_name,
                runner="pooling",
                enforce_eager=eager_flag,
        ) as pooling_runner:
            vllm_embeddings[label] = pooling_runner.embed(queries)

    with HfRunner(
            model_name,
            dtype="float32",
            is_sentence_transformer=True,
    ) as reference_runner:
        reference_embeddings = reference_runner.encode(queries)

    comparisons = (
        (reference_embeddings, vllm_embeddings["eager"], "hf", "vllm"),
        (vllm_embeddings["eager"], vllm_embeddings["aclgraph"], "eager",
         "aclgraph"),
    )
    for left, right, left_name, right_name in comparisons:
        check_embeddings_close(
            embeddings_0_lst=left,
            embeddings_1_lst=right,
            name_0=left_name,
            name_1=right_name,
            tol=1e-2,
        )

View File

@@ -17,7 +17,7 @@
""" """
Compare the outputs of vLLM with and without aclgraph. Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/compile/test_aclgraph.py`. Run `pytest tests/compile/test_aclgraph_accuracy.py`.
""" """
import os import os
@@ -36,7 +36,7 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
def test_models_with_aclgraph( def test_output_between_eager_and_aclgraph(
model: str, model: str,
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
@@ -100,7 +100,7 @@ def test_models_with_aclgraph(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
def test_models_with_aclgraph_full_decode_only( def test_output_between_eager_and_full_decode_only(
model: str, model: str,
max_tokens: int, max_tokens: int,
) -> None: ) -> None:

View File

@@ -25,7 +25,7 @@ from tests.e2e.conftest import VllmRunner
os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"] MODELS = ["Qwen/Qwen3-0.6B"]
def get_prompt_embeds(chat, tokenizer, embedding_layer): def get_prompt_embeds(chat, tokenizer, embedding_layer):
@@ -37,127 +37,6 @@ def get_prompt_embeds(chat, tokenizer, embedding_layer):
return prompt_embeds return prompt_embeds
@pytest.mark.parametrize("model_name", MODELS)
def test_single_prompt_embeds_inference(model_name):
    """Generate from a single prompt-embeds input and expect non-empty text."""
    # Build the prompt embeddings from the HF tokenizer and embedding layer.
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
    hf_model = AutoModelForCausalLM.from_pretrained(model_name)
    input_embedding_layer = hf_model.get_input_embeddings()

    conversation = [{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }]
    prompt_embeds = get_prompt_embeds(conversation, hf_tokenizer,
                                      input_embedding_layer)

    # Run inference in eager mode with prompt embeddings enabled.
    request = {"prompt_embeds": prompt_embeds}
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        outputs = vllm_runner.model.generate(request)

    # Exactly one request went in, so one result with some text is expected.
    assert len(outputs) == 1
    assert len(outputs[0].outputs) > 0
    assert len(outputs[0].outputs[0].text) > 0
    print(f"\n[Single Inference Output]: {outputs[0].outputs[0].text}")
@pytest.mark.parametrize("model_name", MODELS)
def test_batch_prompt_embeds_inference(model_name):
    """Generate from a batch of three prompt-embeds inputs.

    One result per chat is expected, each with at least one completion whose
    text is non-empty.
    """
    # Build the prompt embeddings from the HF tokenizer and embedding layer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    embedding_layer = AutoModelForCausalLM.from_pretrained(
        model_name).get_input_embeddings()

    user_questions = (
        "Please tell me about the capital of France.",
        "When is the day longest during the year?",
        "Where is bigger, the moon or the sun?",
    )
    chats = [[{"role": "user", "content": text}] for text in user_questions]

    batch_requests = []
    for chat in chats:
        embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
        batch_requests.append({"prompt_embeds": embeds})

    # Run the whole batch in eager mode with prompt embeddings enabled.
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        outputs = vllm_runner.model.generate(batch_requests)

    # Verify outputs
    assert len(outputs) == len(chats)
    for i, output in enumerate(outputs):
        assert len(output.outputs) > 0
        assert len(output.outputs[0].text) > 0
        print(f"\nQ{i+1}: {chats[i][0]['content']}")
        print(f"A{i+1}: {output.outputs[0].text}")
@pytest.mark.parametrize("model_name", MODELS)
def test_prompt_embeds_with_aclgraph(model_name):
    """Run one prompt-embeds request with ``enforce_eager`` off, then on.

    Both runs must return a single result with non-empty text. The two texts
    are printed but not compared with each other.
    """
    # Build the prompt embeddings from the HF tokenizer and embedding layer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    hf_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = hf_model.get_input_embeddings()
    prompt_embeds = get_prompt_embeds(
        [{"role": "user", "content": "What is the capital of China?"}],
        tokenizer,
        embedding_layer,
    )

    # enforce_eager=False (ACL graph enabled) runs first, then
    # enforce_eager=True.
    results_by_eager_flag = {}
    for eager_flag in (False, True):
        with VllmRunner(
                model_name,
                enable_prompt_embeds=True,
                enforce_eager=eager_flag,
        ) as runner:
            results_by_eager_flag[eager_flag] = runner.model.generate({
                "prompt_embeds":
                prompt_embeds,
            })
    graph_results = results_by_eager_flag[False]
    eager_results = results_by_eager_flag[True]

    # Verify both produce valid outputs
    assert len(graph_results) == 1
    assert len(eager_results) == 1
    assert len(graph_results[0].outputs[0].text) > 0
    assert len(eager_results[0].outputs[0].text) > 0

    print("\n[ACL Graph Output]:", graph_results[0].outputs[0].text)
    print("[Eager Output]:", eager_results[0].outputs[0].text)
    # Note: Outputs may differ slightly due to different execution paths,
    # but both should be valid responses
@pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("model_name", MODELS)
def test_mixed_prompt_embeds_and_text(model_name): def test_mixed_prompt_embeds_and_text(model_name):
"""Test mixed inputs with both prompt embeddings and text prompts.""" """Test mixed inputs with both prompt embeddings and text prompts."""
@@ -176,7 +55,6 @@ def test_mixed_prompt_embeds_and_text(model_name):
with VllmRunner( with VllmRunner(
model_name, model_name,
enable_prompt_embeds=True, enable_prompt_embeds=True,
enforce_eager=True,
) as vllm_runner: ) as vllm_runner:
# Test prompt embeddings # Test prompt embeddings
embeds_output = vllm_runner.model.generate({ embeds_output = vllm_runner.model.generate({

View File

@@ -38,7 +38,7 @@ def test_multimodal_vl(prompt_template):
] ]
images = [image] * len(img_questions) images = [image] * len(img_questions)
prompts = prompt_template(img_questions) prompts = prompt_template(img_questions)
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct", with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
max_model_len=4096, max_model_len=4096,
mm_processor_kwargs={ mm_processor_kwargs={
"min_pixels": 28 * 28, "min_pixels": 28 * 28,