[E2E] Refactor the e2e testcases. (#4789)

### What this PR does / why we need it?
Refactor the e2e test cases:
- tests/e2e/multicard/test_weight_loader.py: Remove the unused code.
- tests/e2e/singlecard/multi-modal/test_internvl.py: Move to the accuracy tests.
- tests/e2e/singlecard/test_aclgraph.py: Rename the file (the shared eager-vs-aclgraph comparison pattern is sketched below).
- tests/e2e/singlecard/test_embedding_aclgraph.py: Combine with tests/e2e/singlecard/test_bge_model.py.
- tests/e2e/singlecard/test_completion_with_prompt_embeds.py: Drop the forced eager mode and switch the model to Qwen3-0.6B.
- tests/e2e/singlecard/test_quantization.py: Switch the model to Qwen3-0.6B-W8A8.
- tests/e2e/singlecard/test_vlm.py: Switch the model to Qwen3-VL-8B.
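
The refactored single-card graph tests all follow the same shape: generate greedily once in eager mode and once with ACL graph enabled, then assert the outputs match. A minimal sketch of that pattern, reusing the `VllmRunner` and `check_outputs_equal` helpers that appear in the diffs below (model name and token budget are taken from those diffs; this sketch is illustrative, not part of the change):

```python
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal

PROMPTS = ["What is the capital of France?"]

def compare_eager_and_aclgraph(model: str = "Qwen/Qwen3-0.6B",
                               max_tokens: int = 32) -> None:
    results = {}
    for mode, enforce_eager in (("eager", True), ("aclgraph", False)):
        # enforce_eager=True disables graph capture; False runs with ACL graph.
        with VllmRunner(model, enforce_eager=enforce_eager) as runner:
            results[mode] = runner.generate_greedy(PROMPTS, max_tokens)
    check_outputs_equal(outputs_0_lst=results["eager"],
                        outputs_1_lst=results["aclgraph"],
                        name_0="eager",
                        name_1="aclgraph")
```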

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: menogrey <1299267905@qq.com>
Authored by zhangyiming on 2025-12-11 10:15:00 +08:00; committed by GitHub.
Parent: 11bebb518c
Commit: 66b0781840
13 changed files with 90 additions and 335 deletions

View File

@@ -1,89 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import os
# Set spawn method before any torch/NPU imports to avoid fork issues
os.environ.setdefault('VLLM_WORKER_MULTIPROC_METHOD', 'spawn')
import pytest
from vllm.assets.image import ImageAsset
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODELS = [
"OpenGVLab/InternVL2-8B",
"OpenGVLab/InternVL2_5-8B",
"OpenGVLab/InternVL3-8B",
"OpenGVLab/InternVL3_5-8B",
]
@pytest.mark.parametrize("model", MODELS)
def test_internvl_basic(model: str):
"""Test basic InternVL2 inference with single image."""
# Load test image
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
# InternVL uses chat template format
# Format: <|im_start|>user\n<image>\nQUESTION<|im_end|>\n<|im_start|>assistant\n
questions = [
"What is the content of this image?",
"Describe this image in detail.",
]
# Build prompts with InternVL2 chat template
prompts = [
f"<|im_start|>user\n<image>\n{q}<|im_end|>\n<|im_start|>assistant\n"
for q in questions
]
images = [image] * len(prompts)
outputs = {}
for enforce_eager, mode in [(True, "eager"), (False, "graph")]:
with VllmRunner(
model,
max_model_len=8192,
limit_mm_per_prompt={"image": 4},
enforce_eager=enforce_eager,
dtype="bfloat16",
) as vllm_model:
generated_outputs = vllm_model.generate_greedy(
prompts=prompts,
images=images,
max_tokens=128,
)
assert len(generated_outputs) == len(prompts), \
f"Expected {len(prompts)} outputs, got {len(generated_outputs)} in {mode} mode"
for i, (_, output_str) in enumerate(generated_outputs):
assert output_str, \
f"{mode.capitalize()} mode output {i} should not be empty. Prompt: {prompts[i]}"
assert len(output_str.strip()) > 0, \
f"{mode.capitalize()} mode Output {i} should have meaningful content"
outputs[mode] = generated_outputs
eager_outputs = outputs["eager"]
graph_outputs = outputs["graph"]
check_outputs_equal(outputs_0_lst=eager_outputs,
outputs_1_lst=graph_outputs,
name_0="eager mode",
name_1="graph mode")

View File

@@ -24,7 +24,6 @@ from tests.e2e.utils import check_embeddings_close
MODELS = [
"Qwen/Qwen3-Embedding-0.6B", # lasttoken
"BAAI/bge-small-en-v1.5", # cls_token
"intfloat/multilingual-e5-small" # mean_tokens
]
@@ -57,3 +56,45 @@ def test_embed_models_correctness(model: str):
name_1="vllm",
tol=1e-2,
)
def test_bge_model_correctness():
queries = ['What is the capital of China?', 'Explain gravity']
model_name = snapshot_download("BAAI/bge-m3")
with VllmRunner(
model_name,
runner="pooling",
enforce_eager=False,
) as vllm_aclgraph_runner:
vllm_aclgraph_outputs = vllm_aclgraph_runner.embed(queries)
with VllmRunner(
model_name,
runner="pooling",
enforce_eager=True,
) as vllm_runner:
vllm_eager_outputs = vllm_runner.embed(queries)
with HfRunner(
model_name,
dtype="float32",
is_sentence_transformer=True,
) as hf_runner:
hf_outputs = hf_runner.encode(queries)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_eager_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
check_embeddings_close(
embeddings_0_lst=vllm_eager_outputs,
embeddings_1_lst=vllm_aclgraph_outputs,
name_0="eager",
name_1="aclgraph",
tol=1e-2,
)
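
`check_embeddings_close` (imported from `tests.e2e.utils`, see the hunk header above) compares the two embedding lists within a tolerance. A standalone sketch of that idea using cosine similarity (the metric here is an assumption; the helper's actual implementation may differ):

```python
import torch
import torch.nn.functional as F

def embeddings_close(embs_a, embs_b, tol: float = 1e-2) -> bool:
    # Pairwise cosine-similarity check between two lists of embedding vectors.
    for a, b in zip(embs_a, embs_b):
        sim = F.cosine_similarity(torch.as_tensor(a, dtype=torch.float32),
                                  torch.as_tensor(b, dtype=torch.float32),
                                  dim=0)
        if 1.0 - sim.item() > tol:
            return False
    return True
```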

View File

@@ -17,7 +17,7 @@
"""
Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/compile/test_aclgraph.py`.
Run `pytest tests/compile/test_aclgraph_accuracy.py`.
"""
import os
@@ -36,7 +36,7 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models_with_aclgraph(
def test_output_between_eager_and_aclgraph(
model: str,
max_tokens: int,
) -> None:
@@ -100,7 +100,7 @@ def test_models_with_aclgraph(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models_with_aclgraph_full_decode_only(
def test_output_between_eager_and_full_decode_only(
model: str,
max_tokens: int,
) -> None:
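
The second renamed test targets the full-graph decode-only path. In recent vLLM versions this mode is usually selected through the compilation config; a hedged sketch of how such a run might be set up (the `cudagraph_mode` knob and the assumption that `VllmRunner` forwards extra kwargs to vLLM's `LLM` constructor both depend on the vLLM/vllm-ascend versions in use):

```python
from tests.e2e.conftest import VllmRunner

# Assumption: recent vLLM exposes cudagraph_mode on the compilation config, and
# FULL_DECODE_ONLY captures a full graph for decode steps only.
with VllmRunner("Qwen/Qwen3-0.6B",
                enforce_eager=False,
                compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"}) as runner:
    outputs = runner.generate_greedy(["What is the capital of France?"], 32)
    assert outputs
```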

View File

@@ -25,7 +25,7 @@ from tests.e2e.conftest import VllmRunner
os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
MODELS = ["Qwen/Qwen3-0.6B"]
def get_prompt_embeds(chat, tokenizer, embedding_layer):
@@ -37,127 +37,6 @@ def get_prompt_embeds(chat, tokenizer, embedding_layer):
return prompt_embeds
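# Illustrative sketch only (the actual get_prompt_embeds body is collapsed by the
# diff above): such a helper typically renders the chat template, tokenizes, and
# looks the token ids up in the model's input embedding table.
def _example_prompt_embeds(chat, tokenizer, embedding_layer):
    token_ids = tokenizer.apply_chat_template(chat,
                                              add_generation_prompt=True,
                                              return_tensors="pt")
    # Shape (seq_len, hidden_size), as expected by vLLM's prompt_embeds input.
    return embedding_layer(token_ids).squeeze(0)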
@pytest.mark.parametrize("model_name", MODELS)
def test_single_prompt_embeds_inference(model_name):
"""Test single prompt inference with prompt embeddings."""
# Prepare prompt embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()
chat = [{
"role": "user",
"content": "Please tell me about the capital of France."
}]
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
# Run inference with prompt embeddings
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=True,
) as vllm_runner:
outputs = vllm_runner.model.generate({
"prompt_embeds": prompt_embeds,
})
# Verify output
assert len(outputs) == 1
assert len(outputs[0].outputs) > 0
assert len(outputs[0].outputs[0].text) > 0
print(f"\n[Single Inference Output]: {outputs[0].outputs[0].text}")
@pytest.mark.parametrize("model_name", MODELS)
def test_batch_prompt_embeds_inference(model_name):
"""Test batch prompt inference with prompt embeddings."""
# Prepare prompt embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()
chats = [[{
"role": "user",
"content": "Please tell me about the capital of France."
}],
[{
"role": "user",
"content": "When is the day longest during the year?"
}],
[{
"role": "user",
"content": "Where is bigger, the moon or the sun?"
}]]
prompt_embeds_list = [
get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
]
# Run batch inference with prompt embeddings
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=True,
) as vllm_runner:
outputs = vllm_runner.model.generate([{
"prompt_embeds": embeds
} for embeds in prompt_embeds_list])
# Verify outputs
assert len(outputs) == len(chats)
for i, output in enumerate(outputs):
assert len(output.outputs) > 0
assert len(output.outputs[0].text) > 0
print(f"\nQ{i+1}: {chats[i][0]['content']}")
print(f"A{i+1}: {output.outputs[0].text}")
@pytest.mark.parametrize("model_name", MODELS)
def test_prompt_embeds_with_aclgraph(model_name):
"""Test prompt embeddings with ACL graph enabled vs disabled."""
# Prepare prompt embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()
chat = [{"role": "user", "content": "What is the capital of China?"}]
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
# Run with ACL graph enabled (enforce_eager=False)
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=False,
) as vllm_aclgraph_runner:
aclgraph_outputs = vllm_aclgraph_runner.model.generate({
"prompt_embeds":
prompt_embeds,
})
# Run with ACL graph disabled (enforce_eager=True)
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=True,
) as vllm_eager_runner:
eager_outputs = vllm_eager_runner.model.generate({
"prompt_embeds":
prompt_embeds,
})
# Verify both produce valid outputs
assert len(aclgraph_outputs) == 1
assert len(eager_outputs) == 1
assert len(aclgraph_outputs[0].outputs[0].text) > 0
assert len(eager_outputs[0].outputs[0].text) > 0
print("\n[ACL Graph Output]:", aclgraph_outputs[0].outputs[0].text)
print("[Eager Output]:", eager_outputs[0].outputs[0].text)
# Note: Outputs may differ slightly due to different execution paths,
# but both should be valid responses
@pytest.mark.parametrize("model_name", MODELS)
def test_mixed_prompt_embeds_and_text(model_name):
"""Test mixed inputs with both prompt embeddings and text prompts."""
@@ -176,7 +55,6 @@ def test_mixed_prompt_embeds_and_text(model_name):
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=True,
) as vllm_runner:
# Test prompt embeddings
embeds_output = vllm_runner.model.generate({

View File

@@ -38,7 +38,7 @@ def test_multimodal_vl(prompt_template):
]
images = [image] * len(img_questions)
prompts = prompt_template(img_questions)
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
max_model_len=4096,
mm_processor_kwargs={
"min_pixels": 28 * 28,