[CI] Refactor e2e CI tests (#5246)

### What this PR does / why we need it?
Refactor the e2e CI tests (a minimal before/after sketch of the shared pattern follows the list):
1. tests/e2e/singlecard/pooling/test_embedding.py: remove the eager parameter and rename test cases
2. tests/e2e/singlecard/pooling/test_scoring.py: rename test cases
3. tests/e2e/singlecard/pooling/test_classification.py: rename test case
4. tests/e2e/singlecard/test_quantization.py: remove the eager parameter, change the model to vllm-ascend/Qwen3-0.6B-W8A8, and rename the test case
5. tests/e2e/multicard/test_shared_expert_dp.py: rename test cases
6. tests/e2e/singlecard/test_sampler.py: rename test cases
7. tests/e2e/singlecard/test_aclgraph_accuracy.py: rename test cases
8. tests/e2e/multicard/test_offline_inference_distributed.py: rename test cases and remove the eager parameter
9. tests/e2e/multicard/long_sequence/test_accuracy.py: rename test cases and remove the eager parameter
10. tests/e2e/multicard/long_sequence/test_basic.py: rename test cases and remove the eager parameter
11. tests/e2e/multicard/test_expert_parallel.py: remove the eager parameter
12. tests/e2e/multicard/test_full_graph_mode.py: remove the eager parameter
13. tests/e2e/multicard/test_ilama_lora_tp2.py: remove the eager parameter
14. tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py: remove the eager parameter
15. tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py: remove the eager parameter
16. tests/e2e/singlecard/test_aclgraph_accuracy.py: remove the eager parameter
17. tests/e2e/singlecard/test_camem.py: remove the eager parameter
18. tests/e2e/singlecard/test_ilama_lora.py: remove the eager parameter
19. tests/e2e/singlecard/test_multistream_overlap_shared_expert.py: remove the eager parameter
20. tests/e2e/singlecard/test_vlm.py: remove the eager parameter
21. tests/e2e/singlecard/test_xli: remove the eager parameter
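Every "remove the eager parameter" item applies the same change: drop the explicit `enforce_eager=False` argument, which matches vLLM's default and is therefore a no-op. A condensed before/after sketch of the pattern, using the quantization test as the example (`VllmRunner` is the existing helper from tests/e2e/conftest; the prompt text is shortened here for illustration):

```python
from modelscope import snapshot_download  # type: ignore[import-untyped]

from tests.e2e.conftest import VllmRunner

# Before: generic test name and a redundant argument.
#
# def test_quant_W8A8():
#     with VllmRunner(
#             snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
#             max_model_len=8192,
#             enforce_eager=False,  # no-op: False is already the default
#             gpu_memory_utilization=0.7,
#             quantization="ascend",
#     ) as vllm_model:
#         ...


# After: model-specific test name, updated weights, no redundant argument.
def test_qwen3_w8a8_quant():
    max_tokens = 5
    example_prompts = ["vLLM is a high-throughput inference engine for LLMs."]
    with VllmRunner(
            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"),
            max_model_len=8192,
            gpu_memory_utilization=0.7,
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
```

The model-specific names also make the per-test pytest selections in the CI workflow (first hunk below) self-describing.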

### Does this PR introduce _any_ user-facing change?

No. Only e2e tests and the CI workflow are changed.

### How was this patch tested?

Via the existing e2e CI jobs; this PR updates the workflow's pytest invocations to match the renamed test cases.

- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c

Signed-off-by: hfadzxy <starmoon_zhang@163.com>

View File

@@ -92,7 +92,7 @@ jobs:
           # pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
           # pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
-          pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness
+          pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_qwen_pooling_classify_correctness
       - name: Run e2e test
         env:

View File

@@ -36,7 +36,7 @@ MODELS = [
                     reason="0.12.0 is not supported for context sequence.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [10])
-def test_output_between_tp_and_cp(
+def test_models_long_sequence_output_between_tp_and_cp(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -69,7 +69,6 @@ def test_output_between_tp_and_cp(
         "tensor_parallel_size": 1,
         "decode_context_parallel_size": 1,
         "prefill_context_parallel_size": 2,
-        "enforce_eager": False,
         "compilation_config": {
             "cudagraph_mode": "FULL_DECODE_ONLY",
             "cudagraph_capture_sizes": [4, 8, 24, 48, 60]

View File

@@ -34,7 +34,7 @@ os.environ["HCCL_BUFFSIZE"] = "768"
 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
-def test_pcp_dcp_basic():
+def test_models_pcp_dcp_basic():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
@@ -69,7 +69,7 @@ def test_pcp_dcp_basic():
 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
-def test_pcp_dcp_full_graph():
+def test_models_pcp_dcp_full_graph():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
@@ -77,7 +77,6 @@ def test_pcp_dcp_full_graph():
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
@@ -93,7 +92,6 @@ def test_pcp_dcp_full_graph():
     model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
@@ -110,7 +108,7 @@ def test_pcp_dcp_full_graph():
 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
-def test_pcp_dcp_piece_wise():
+def test_models_pcp_dcp_piece_wise():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
@@ -118,7 +116,6 @@ def test_pcp_dcp_piece_wise():
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
@@ -130,7 +127,6 @@ def test_pcp_dcp_piece_wise():
     model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,

View File

@@ -15,14 +15,12 @@ def test_deepseek_correctness_ep(model_name):
     max_tokens = 5
     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(model_name, tensor_parallel_size=2,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name, tensor_parallel_size=2) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
     with VllmRunner(model_name,
                     tensor_parallel_size=2,
-                    enable_expert_parallel=True,
-                    enforce_eager=False) as vllm_model:
+                    enable_expert_parallel=True) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
     check_outputs_equal(

View File

@@ -41,7 +41,6 @@ def test_qwen3_moe_full_decode_only_tp2():
     with VllmRunner(model,
                     max_model_len=1024,
                     tensor_parallel_size=2,
-                    enforce_eager=False,
                     compilation_config={
                         "cudagraph_mode": "FULL_DECODE_ONLY",
                         "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
@@ -53,7 +52,6 @@ def test_qwen3_moe_full_decode_only_tp2():
             model,
             max_model_len=1024,
             tensor_parallel_size=2,
-            enforce_eager=False,
     ) as runner:
         vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
@@ -87,7 +85,6 @@ def test_qwen3_moe_full_graph_tp2():
     with VllmRunner(model,
                     max_model_len=1024,
                     tensor_parallel_size=2,
-                    enforce_eager=False,
                     compilation_config={
                         "cudagraph_mode": "FULL",
                         "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
@@ -99,7 +96,6 @@ def test_qwen3_moe_full_graph_tp2():
             model,
             max_model_len=1024,
             tensor_parallel_size=2,
-            enforce_eager=False,
     ) as runner:
         vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

View File

@@ -8,15 +8,16 @@ from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
 @pytest.mark.parametrize("distributed_executor_backend", ["mp"])
 def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
-    with VllmRunner(snapshot_download(MODEL_PATH),
-                    enable_lora=True,
-                    max_loras=4,
-                    dtype="half",
-                    max_model_len=1024,
-                    max_num_seqs=16,
-                    tensor_parallel_size=2,
-                    distributed_executor_backend=distributed_executor_backend,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(
+            snapshot_download(MODEL_PATH),
+            enable_lora=True,
+            max_loras=4,
+            dtype="half",
+            max_model_len=1024,
+            max_num_seqs=16,
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
         output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)
         for i in range(len(EXPECTED_LORA_OUTPUT)):

View File

@@ -189,7 +189,6 @@ def test_qwen3_dense_fc1_tp2(model):
     with VllmRunner(
             snapshot_download(model),
             max_model_len=8192,
-            enforce_eager=False,
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",
@@ -209,7 +208,6 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
     with VllmRunner(
             snapshot_download(model),
             max_model_len=8192,
-            enforce_eager=False,
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",

View File

@@ -5,7 +5,7 @@ from transformers import AutoModelForSequenceClassification
 from tests.e2e.conftest import HfRunner, VllmRunner
-def test_classify_correctness() -> None:
+def test_qwen_pooling_classify_correctness() -> None:
     model_name = snapshot_download("Howeee/Qwen2.5-1.5B-apeach")

View File

@@ -36,7 +36,6 @@ def test_embed_models_correctness(model: str):
     with VllmRunner(
             model_name,
             runner="pooling",
-            enforce_eager=False,
             max_model_len=None,
             cudagraph_capture_sizes=[4],
     ) as vllm_runner:
@@ -58,14 +57,13 @@
 )
-def test_bge_model_correctness():
+def test_bge_m3_correctness():
     queries = ['What is the capital of China?', 'Explain gravity']
     model_name = snapshot_download("BAAI/bge-m3")
     with VllmRunner(
             model_name,
             runner="pooling",
-            enforce_eager=False,
     ) as vllm_aclgraph_runner:
         vllm_aclgraph_outputs = vllm_aclgraph_runner.embed(queries)

View File

@@ -34,7 +34,7 @@ def model_name(request):
     yield snapshot_download(request.param)
-def test_cross_encoder_1_to_1(model_name):
+def test_cross_encoder_score_1_to_1(model_name):
     text_pair = [TEXTS_1[0], TEXTS_2[0]]
     with HfRunner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
@@ -53,7 +53,7 @@ def test_cross_encoder_1_to_1(model_name):
     assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
-def test_cross_encoder_1_to_N(model_name):
+def test_cross_encoder_score_1_to_N(model_name):
     text_pairs = [
         [TEXTS_1[0], TEXTS_2[0]],
         [TEXTS_1[0], TEXTS_2[1]],
@@ -76,7 +76,7 @@ def test_cross_encoder_1_to_N(model_name):
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
-def test_cross_encoder_N_to_N(model_name):
+def test_cross_encoder_score_N_to_N(model_name):
     text_pairs = [
         [TEXTS_1[0], TEXTS_2[0]],
         [TEXTS_1[1], TEXTS_2[1]],
@@ -104,7 +104,7 @@ def emb_model_name(request):
     yield snapshot_download(request.param)
-def test_embedding_1_to_1(emb_model_name):
+def test_embedding_score_1_to_1(emb_model_name):
     text_pair = [TEXTS_1[0], TEXTS_2[0]]
     with HfRunner(emb_model_name, dtype=DTYPE,
@@ -127,7 +127,7 @@ def test_embedding_1_to_1(emb_model_name):
     assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
-def test_embedding_1_to_N(emb_model_name):
+def test_embedding_score_1_to_N(emb_model_name):
     text_pairs = [
         [TEXTS_1[0], TEXTS_2[0]],
         [TEXTS_1[0], TEXTS_2[1]],
@@ -157,7 +157,7 @@ def test_embedding_1_to_N(emb_model_name):
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
-def test_embedding_N_to_N(emb_model_name):
+def test_embedding_score_N_to_N(emb_model_name):
     text_pairs = [
         [TEXTS_1[0], TEXTS_2[0]],
         [TEXTS_1[1], TEXTS_2[1]],

View File

@@ -171,4 +171,4 @@ def test_mtp2_correctness_piecewise_graph_with_pad(
     mtp_correctness(sampling_config,
                     model_name,
                     2,
-                    disable_padded_drafter_batch=False)
+                    disable_padded_drafter_batch=False)

View File

@@ -76,19 +76,22 @@ def test_ngram_correctness(
     should be the same when using ngram speculative decoding.
     '''
-    with VllmRunner(model_name, max_model_len=1024,
-                    enforce_eager=False) as ref_llm:
+    with VllmRunner(
+            model_name,
+            max_model_len=1024,
+    ) as ref_llm:
         ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
-    with VllmRunner(model_name,
-                    speculative_config={
-                        "method": "ngram",
-                        "prompt_lookup_max": 5,
-                        "prompt_lookup_min": 3,
-                        "num_speculative_tokens": 3,
-                    },
-                    max_model_len=1024,
-                    enforce_eager=False) as runner:
+    with VllmRunner(
+            model_name,
+            speculative_config={
+                "method": "ngram",
+                "prompt_lookup_max": 5,
+                "prompt_lookup_min": 3,
+                "num_speculative_tokens": 3,
+            },
+            max_model_len=1024,
+    ) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)
     matches = 0
     misses = 0
@@ -190,8 +193,7 @@ def test_suffix_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
-    with VllmRunner(model_name, max_model_len=1024,
-                    enforce_eager=False) as ref_llm:
+    with VllmRunner(model_name, max_model_len=1024) as ref_llm:
         ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
     with VllmRunner(model_name,
@@ -199,8 +201,7 @@ def test_suffix_correctness(
                         "method": "suffix",
                         "num_speculative_tokens": 8,
                     },
-                    max_model_len=1024,
-                    enforce_eager=False) as runner:
+                    max_model_len=1024) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)
     matches = 0
     misses = 0
@@ -236,8 +237,7 @@ def test_suffix_acceptance(
                         "num_speculative_tokens": 10,
                     },
                     max_model_len=1024,
-                    disable_log_stats=False,
-                    enforce_eager=False) as runner:
+                    disable_log_stats=False) as runner:
         for i in range(10):
             runner.model.chat(test_prompts[i], sampling_config)
         metrics = runner.model.get_metrics()
@@ -278,7 +278,7 @@ def test_eagle_logprobs(
                                      max_tokens=10,
                                      ignore_eos=False)
-    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
+    ref_llm = LLM(model=model_name, max_model_len=2048)
     ref_outputs = ref_llm.chat([prompt], sampling_params)
     ref_logprobs = []
     for output in ref_outputs[0].outputs:
@@ -300,7 +300,6 @@ def test_eagle_logprobs(
             "max_model_len": 128,
         },
         max_model_len=128,
-        enforce_eager=False,
     ) as runner:
         spec_outputs = runner.model.chat([prompt], sampling_params)

View File

@@ -36,7 +36,7 @@ MODELS = [
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_output_between_eager_and_aclgraph(
+def test_models_output_between_eager_and_aclgraph(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -50,7 +50,6 @@ def test_output_between_eager_and_aclgraph(
     with VllmRunner(
             model,
             max_model_len=1024,
-            enforce_eager=False,
             quantization="ascend",
     ) as runner:
         vllm_aclgraph_outputs = runner.model.generate(
@@ -68,7 +67,6 @@ def test_output_between_eager_and_aclgraph(
     with VllmRunner(
             model,
             max_model_len=1024,
-            enforce_eager=False,
     ) as runner:
         vllm_aclgraph_outputs = runner.model.generate(
             prompts, sampling_params)
@@ -100,7 +98,7 @@ def test_output_between_eager_and_aclgraph(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_output_between_eager_and_full_decode_only(
+def test_models_output_between_eager_and_full_decode_only(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -155,7 +153,6 @@ def test_output_between_eager_and_full_decode_only(
     with VllmRunner(
             model,
             max_model_len=1024,
-            enforce_eager=False,
             compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
             quantization="ascend",
     ) as runner:
@@ -166,7 +163,6 @@ def test_output_between_eager_and_full_decode_only(
     with VllmRunner(
             model,
             max_model_len=1024,
-            enforce_eager=False,
             compilation_config={
                 "cudagraph_capture_sizes": [4, 8, 32, 64],
                 "cudagraph_mode": "FULL_DECODE_ONLY"
@@ -196,7 +192,7 @@ def test_output_between_eager_and_full_decode_only(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_output_between_eager_and_fullgraph_npugraph_ex(
+def test_models_output_between_eager_and_fullgraph_npugraph_ex(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -251,7 +247,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
     with VllmRunner(
             model,
             max_model_len=1024,
-            enforce_eager=False,
             compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
             additional_config={"enable_npugraph_ex": True},
             quantization="ascend",
@@ -263,7 +258,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
     with VllmRunner(
             model,
             max_model_len=1024,
-            enforce_eager=False,
             compilation_config={
                 "cudagraph_capture_sizes": [4, 8, 32, 64],
                 "cudagraph_mode": "FULL_DECODE_ONLY"

View File

@@ -76,9 +76,7 @@ def test_end_to_end():
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
-    with VllmRunner("Qwen/Qwen3-0.6B",
-                    enforce_eager=False,
-                    enable_sleep_mode=True) as runner:
+    with VllmRunner("Qwen/Qwen3-0.6B", enable_sleep_mode=True) as runner:
         output = runner.model.generate(prompt, sampling_params)
     # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,

View File

@@ -45,13 +45,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 def test_ilama_lora(ilama_lora_files):
-    with VllmRunner(snapshot_download(MODEL_PATH),
-                    enable_lora=True,
-                    dtype="half",
-                    max_loras=4,
-                    max_model_len=1024,
-                    max_num_seqs=16,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(
+            snapshot_download(MODEL_PATH),
+            enable_lora=True,
+            dtype="half",
+            max_loras=4,
+            max_model_len=1024,
+            max_num_seqs=16,
+    ) as vllm_model:
         output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
         for i in range(len(EXPECTED_LORA_OUTPUT)):

View File

@@ -58,7 +58,6 @@ def test_models_with_multistream_overlap_shared_expert(
     with VllmRunner(
             model,
             max_model_len=1024,
-            enforce_eager=False,
             additional_config={
                 "multistream_overlap_shared_expert": True,
             },

View File

@@ -20,15 +20,14 @@ from modelscope import snapshot_download # type: ignore[import-untyped]
 from tests.e2e.conftest import VllmRunner
-def test_quant_W8A8():
+def test_qwen3_w8a8_quant():
     max_tokens = 5
     example_prompts = [
         "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
     ]
     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
+            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"),
             max_model_len=8192,
-            enforce_eager=False,
             gpu_memory_utilization=0.7,
             quantization="ascend",
     ) as vllm_model:

View File

@@ -21,7 +21,7 @@ from vllm import SamplingParams
 from tests.e2e.conftest import VllmRunner
-def test_models_topk() -> None:
+def test_qwen3_topk() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
@@ -36,7 +36,7 @@ def test_models_topk() -> None:
         runner.generate(example_prompts, sampling_params)
-def test_models_prompt_logprobs() -> None:
+def test_qwen3_prompt_logprobs() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
@@ -49,7 +49,7 @@ def test_models_prompt_logprobs() -> None:
         num_logprobs=1)
-def test_exponential_overlap() -> None:
+def test_qwen3_exponential_overlap() -> None:
     example_prompts = [
         "Hello, my name is",
     ]

View File

@@ -46,7 +46,6 @@ def test_multimodal_vl(vl_config):
     with VllmRunner(vl_config["model"],
                     mm_processor_kwargs=vl_config["mm_processor_kwargs"],
-                    enforce_eager=False,
                     max_model_len=8192,
                     limit_mm_per_prompt={"image": 1}) as vllm_model:
         outputs = vllm_model.generate_greedy(

View File

@@ -48,7 +48,6 @@ def test_models_with_xlite_decode_only(
             model,
             block_size=128,
             max_model_len=1024,
-            enforce_eager=False,
             additional_config={"xlite_graph_config": {
                 "enabled": True
             }},
@@ -97,7 +96,6 @@ def test_models_with_xlite_full_mode(
             model,
             block_size=128,
             max_model_len=1024,
-            enforce_eager=False,
             additional_config={
                 "xlite_graph_config": {
                     "enabled": True,