[CI] Refactor e2e CI tests (#5246)

### What this PR does / why we need it?
Refactor the e2e CI tests:
1. tests/e2e/singlecard/pooling/test_embedding.py: remove the eager
parameter and rename test case
2. tests/e2e/singlecard/pooling/test_scoring.py: Rename test cases
3. tests/e2e/singlecard/pooling/test_classification.py: Rename test case
4. tests/e2e/singlecard/test_quantization.py: remove the eager parameter,
change the model to vllm-ascend/Qwen3-0.6B-W8A8, and rename the test case
5. tests/e2e/multicard/test_shared_expert_dp.py: Rename test cases
6. tests/e2e/singlecard/test_sampler.py: Rename test cases
7. tests/e2e/singlecard/test_aclgraph_accuracy.py: Rename test cases
8. tests/e2e/multicard/test_offline_inference_distributed.py: Rename
test cases and remove the eager parameter
9. tests/e2e/multicard/long_sequence/test_accuracy.py: Rename test cases
and remove the eager parameter
10. tests/e2e/multicard/long_sequence/test_basic.py: Rename test cases
and remove the eager parameter
11. tests/e2e/multicard/test_expert_parallel.py: remove the eager
parameter
12. tests/e2e/multicard/test_full_graph_mode.py: remove the eager
parameter
13. tests/e2e/multicard/test_ilama_lora_tp2.py: remove the eager parameter

14. tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py: remove
the eager parameter
15. tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py: remove the
eager parameter
16. tests/e2e/singlecard/test_aclgraph_accuracy.py: remove the eager
parameter
17. tests/e2e/singlecard/test_camem.py: remove the eager parameter
18. tests/e2e/singlecard/test_ilama_lora.py: remove the eager parameter

19. tests/e2e/singlecard/test_multistream_overlap_shared_expert.py: remove
the eager parameter
20. tests/e2e/singlecard/test_vlm.py: remove the eager parameter
21. tests/e2e/singlecard/test_xlite.py: remove the eager parameter

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
zhangxinyuehfad
2025-12-23 18:42:35 +08:00
committed by GitHub
parent 5d1f6daef6
commit 8ae7fca947
20 changed files with 61 additions and 88 deletions

View File

@@ -5,7 +5,7 @@ from transformers import AutoModelForSequenceClassification
from tests.e2e.conftest import HfRunner, VllmRunner
def test_classify_correctness() -> None:
def test_qwen_pooling_classify_correctness() -> None:
model_name = snapshot_download("Howeee/Qwen2.5-1.5B-apeach")

View File

@@ -36,7 +36,6 @@ def test_embed_models_correctness(model: str):
with VllmRunner(
model_name,
runner="pooling",
enforce_eager=False,
max_model_len=None,
cudagraph_capture_sizes=[4],
) as vllm_runner:
@@ -58,14 +57,13 @@ def test_embed_models_correctness(model: str):
)
def test_bge_model_correctness():
def test_bge_m3_correctness():
queries = ['What is the capital of China?', 'Explain gravity']
model_name = snapshot_download("BAAI/bge-m3")
with VllmRunner(
model_name,
runner="pooling",
enforce_eager=False,
) as vllm_aclgraph_runner:
vllm_aclgraph_outputs = vllm_aclgraph_runner.embed(queries)

View File

@@ -34,7 +34,7 @@ def model_name(request):
yield snapshot_download(request.param)
def test_cross_encoder_1_to_1(model_name):
def test_cross_encoder_score_1_to_1(model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
with HfRunner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
@@ -53,7 +53,7 @@ def test_cross_encoder_1_to_1(model_name):
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
def test_cross_encoder_1_to_N(model_name):
def test_cross_encoder_score_1_to_N(model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
@@ -76,7 +76,7 @@ def test_cross_encoder_1_to_N(model_name):
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_cross_encoder_N_to_N(model_name):
def test_cross_encoder_score_N_to_N(model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
@@ -104,7 +104,7 @@ def emb_model_name(request):
yield snapshot_download(request.param)
def test_embedding_1_to_1(emb_model_name):
def test_embedding_score_1_to_1(emb_model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
with HfRunner(emb_model_name, dtype=DTYPE,
@@ -127,7 +127,7 @@ def test_embedding_1_to_1(emb_model_name):
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
def test_embedding_1_to_N(emb_model_name):
def test_embedding_score_1_to_N(emb_model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
@@ -157,7 +157,7 @@ def test_embedding_1_to_N(emb_model_name):
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_embedding_N_to_N(emb_model_name):
def test_embedding_score_N_to_N(emb_model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],

View File

@@ -171,4 +171,4 @@ def test_mtp2_correctness_piecewise_graph_with_pad(
mtp_correctness(sampling_config,
model_name,
2,
disable_padded_drafter_batch=False)
disable_padded_drafter_batch=False)

View File

@@ -76,19 +76,22 @@ def test_ngram_correctness(
should be the same when using ngram speculative decoding.
'''
with VllmRunner(model_name, max_model_len=1024,
enforce_eager=False) as ref_llm:
with VllmRunner(
model_name,
max_model_len=1024,
) as ref_llm:
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
with VllmRunner(model_name,
speculative_config={
"method": "ngram",
"prompt_lookup_max": 5,
"prompt_lookup_min": 3,
"num_speculative_tokens": 3,
},
max_model_len=1024,
enforce_eager=False) as runner:
with VllmRunner(
model_name,
speculative_config={
"method": "ngram",
"prompt_lookup_max": 5,
"prompt_lookup_min": 3,
"num_speculative_tokens": 3,
},
max_model_len=1024,
) as runner:
spec_outputs = runner.model.chat(test_prompts, sampling_config)
matches = 0
misses = 0
@@ -190,8 +193,7 @@ def test_suffix_correctness(
Compare the outputs of a original LLM and a speculative LLM
should be the same when using ngram speculative decoding.
'''
with VllmRunner(model_name, max_model_len=1024,
enforce_eager=False) as ref_llm:
with VllmRunner(model_name, max_model_len=1024) as ref_llm:
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
with VllmRunner(model_name,
@@ -199,8 +201,7 @@ def test_suffix_correctness(
"method": "suffix",
"num_speculative_tokens": 8,
},
max_model_len=1024,
enforce_eager=False) as runner:
max_model_len=1024) as runner:
spec_outputs = runner.model.chat(test_prompts, sampling_config)
matches = 0
misses = 0
@@ -236,8 +237,7 @@ def test_suffix_acceptance(
"num_speculative_tokens": 10,
},
max_model_len=1024,
disable_log_stats=False,
enforce_eager=False) as runner:
disable_log_stats=False) as runner:
for i in range(10):
runner.model.chat(test_prompts[i], sampling_config)
metrics = runner.model.get_metrics()
@@ -278,7 +278,7 @@ def test_eagle_logprobs(
max_tokens=10,
ignore_eos=False)
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
ref_llm = LLM(model=model_name, max_model_len=2048)
ref_outputs = ref_llm.chat([prompt], sampling_params)
ref_logprobs = []
for output in ref_outputs[0].outputs:
@@ -300,7 +300,6 @@ def test_eagle_logprobs(
"max_model_len": 128,
},
max_model_len=128,
enforce_eager=False,
) as runner:
spec_outputs = runner.model.chat([prompt], sampling_params)

View File

@@ -36,7 +36,7 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_output_between_eager_and_aclgraph(
def test_models_output_between_eager_and_aclgraph(
model: str,
max_tokens: int,
) -> None:
@@ -50,7 +50,6 @@ def test_output_between_eager_and_aclgraph(
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=False,
quantization="ascend",
) as runner:
vllm_aclgraph_outputs = runner.model.generate(
@@ -68,7 +67,6 @@ def test_output_between_eager_and_aclgraph(
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=False,
) as runner:
vllm_aclgraph_outputs = runner.model.generate(
prompts, sampling_params)
@@ -100,7 +98,7 @@ def test_output_between_eager_and_aclgraph(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_output_between_eager_and_full_decode_only(
def test_models_output_between_eager_and_full_decode_only(
model: str,
max_tokens: int,
) -> None:
@@ -155,7 +153,6 @@ def test_output_between_eager_and_full_decode_only(
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=False,
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
quantization="ascend",
) as runner:
@@ -166,7 +163,6 @@ def test_output_between_eager_and_full_decode_only(
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=False,
compilation_config={
"cudagraph_capture_sizes": [4, 8, 32, 64],
"cudagraph_mode": "FULL_DECODE_ONLY"
@@ -196,7 +192,7 @@ def test_output_between_eager_and_full_decode_only(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_output_between_eager_and_fullgraph_npugraph_ex(
def test_models_output_between_eager_and_fullgraph_npugraph_ex(
model: str,
max_tokens: int,
) -> None:
@@ -251,7 +247,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=False,
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
additional_config={"enable_npugraph_ex": True},
quantization="ascend",
@@ -263,7 +258,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=False,
compilation_config={
"cudagraph_capture_sizes": [4, 8, 32, 64],
"cudagraph_mode": "FULL_DECODE_ONLY"

View File

@@ -76,9 +76,7 @@ def test_end_to_end():
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
with VllmRunner("Qwen/Qwen3-0.6B",
enforce_eager=False,
enable_sleep_mode=True) as runner:
with VllmRunner("Qwen/Qwen3-0.6B", enable_sleep_mode=True) as runner:
output = runner.model.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,

View File

@@ -45,13 +45,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
def test_ilama_lora(ilama_lora_files):
with VllmRunner(snapshot_download(MODEL_PATH),
enable_lora=True,
dtype="half",
max_loras=4,
max_model_len=1024,
max_num_seqs=16,
enforce_eager=False) as vllm_model:
with VllmRunner(
snapshot_download(MODEL_PATH),
enable_lora=True,
dtype="half",
max_loras=4,
max_model_len=1024,
max_num_seqs=16,
) as vllm_model:
output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):

View File

@@ -58,7 +58,6 @@ def test_models_with_multistream_overlap_shared_expert(
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=False,
additional_config={
"multistream_overlap_shared_expert": True,
},

View File

@@ -20,15 +20,14 @@ from modelscope import snapshot_download # type: ignore[import-untyped]
from tests.e2e.conftest import VllmRunner
def test_quant_W8A8():
def test_qwen3_w8a8_quant():
max_tokens = 5
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
]
with VllmRunner(
snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"),
max_model_len=8192,
enforce_eager=False,
gpu_memory_utilization=0.7,
quantization="ascend",
) as vllm_model:

View File

@@ -21,7 +21,7 @@ from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
def test_models_topk() -> None:
def test_qwen3_topk() -> None:
example_prompts = [
"Hello, my name is",
]
@@ -36,7 +36,7 @@ def test_models_topk() -> None:
runner.generate(example_prompts, sampling_params)
def test_models_prompt_logprobs() -> None:
def test_qwen3_prompt_logprobs() -> None:
example_prompts = [
"Hello, my name is",
]
@@ -49,7 +49,7 @@ def test_models_prompt_logprobs() -> None:
num_logprobs=1)
def test_exponential_overlap() -> None:
def test_qwen3_exponential_overlap() -> None:
example_prompts = [
"Hello, my name is",
]

View File

@@ -46,7 +46,6 @@ def test_multimodal_vl(vl_config):
with VllmRunner(vl_config["model"],
mm_processor_kwargs=vl_config["mm_processor_kwargs"],
enforce_eager=False,
max_model_len=8192,
limit_mm_per_prompt={"image": 1}) as vllm_model:
outputs = vllm_model.generate_greedy(

View File

@@ -48,7 +48,6 @@ def test_models_with_xlite_decode_only(
model,
block_size=128,
max_model_len=1024,
enforce_eager=False,
additional_config={"xlite_graph_config": {
"enabled": True
}},
@@ -97,7 +96,6 @@ def test_models_with_xlite_full_mode(
model,
block_size=128,
max_model_len=1024,
enforce_eager=False,
additional_config={
"xlite_graph_config": {
"enabled": True,