[CI] refactor e2e ci test (#5246)

### What this PR does / why we need it?
Refactor e2e CI tests:
1. tests/e2e/singlecard/pooling/test_embedding.py: remove the eager
parameter and rename test case
2. tests/e2e/singlecard/pooling/test_scoring.py: Rename test cases
3. tests/e2e/singlecard/pooling/test_classification.py: Rename test case
4. tests/e2e/singlecard/test_quantization.py: remove the eager parameter,
change the model to vllm-ascend/Qwen2.5-0.6B-W8A8, and rename the test case
5. tests/e2e/multicard/test_shared_expert_dp.py: Rename test cases
6. tests/e2e/singlecard/test_sampler.py: Rename test cases
7. tests/e2e/singlecard/test_aclgraph_accuracy.py: Rename test cases
8. tests/e2e/multicard/test_offline_inference_distributed.py: Rename
test cases and remove the eager parameter
9. tests/e2e/multicard/long_sequence/test_accuracy.py: Rename test cases
and remove the eager parameter
10. tests/e2e/multicard/long_sequence/test_basic.py: Rename test cases
and remove the eager parameter
11.tests/e2e/multicard/test_expert_parallel.py:remove the eager
parameter
12.tests/e2e/multicard/test_full_graph_mode.py:remove the eager
parameter
13.tests/e2e/multicard/test_ilama_lora_tp2.py:remove the eager parameter

14.tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py:remove
the eager parameter
15.tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py:remove the
eager parameter
16.tests/e2e/singlecard/test_aclgraph_accuracy.py:remove the eager
parameter
17.tests/e2e/singlecard/test_camem.py:remove the eager parameter
18.tests/e2e/singlecard/test_ilama_lora.py:remove the eager parameter

19.tests/e2e/singlecard/test_multistream_overlap_shared_expert.py:remove
the eager parameter
20.tests/e2e/singlecard/test_vlm.py:remove the eager parameter
21.tests/e2e/singlecard/test_xli:remove the eager parameter

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
zhangxinyuehfad
2025-12-23 18:42:35 +08:00
committed by GitHub
parent 5d1f6daef6
commit 8ae7fca947
20 changed files with 61 additions and 88 deletions

View File

@@ -171,4 +171,4 @@ def test_mtp2_correctness_piecewise_graph_with_pad(
mtp_correctness(sampling_config,
model_name,
2,
disable_padded_drafter_batch=False)
disable_padded_drafter_batch=False)

View File

@@ -76,19 +76,22 @@ def test_ngram_correctness(
should be the same when using ngram speculative decoding.
'''
with VllmRunner(model_name, max_model_len=1024,
enforce_eager=False) as ref_llm:
with VllmRunner(
model_name,
max_model_len=1024,
) as ref_llm:
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
with VllmRunner(model_name,
speculative_config={
"method": "ngram",
"prompt_lookup_max": 5,
"prompt_lookup_min": 3,
"num_speculative_tokens": 3,
},
max_model_len=1024,
enforce_eager=False) as runner:
with VllmRunner(
model_name,
speculative_config={
"method": "ngram",
"prompt_lookup_max": 5,
"prompt_lookup_min": 3,
"num_speculative_tokens": 3,
},
max_model_len=1024,
) as runner:
spec_outputs = runner.model.chat(test_prompts, sampling_config)
matches = 0
misses = 0
@@ -190,8 +193,7 @@ def test_suffix_correctness(
Compare the outputs of an original LLM and a speculative LLM; they
should be the same when using suffix speculative decoding.
'''
with VllmRunner(model_name, max_model_len=1024,
enforce_eager=False) as ref_llm:
with VllmRunner(model_name, max_model_len=1024) as ref_llm:
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
with VllmRunner(model_name,
@@ -199,8 +201,7 @@ def test_suffix_correctness(
"method": "suffix",
"num_speculative_tokens": 8,
},
max_model_len=1024,
enforce_eager=False) as runner:
max_model_len=1024) as runner:
spec_outputs = runner.model.chat(test_prompts, sampling_config)
matches = 0
misses = 0
@@ -236,8 +237,7 @@ def test_suffix_acceptance(
"num_speculative_tokens": 10,
},
max_model_len=1024,
disable_log_stats=False,
enforce_eager=False) as runner:
disable_log_stats=False) as runner:
for i in range(10):
runner.model.chat(test_prompts[i], sampling_config)
metrics = runner.model.get_metrics()
@@ -278,7 +278,7 @@ def test_eagle_logprobs(
max_tokens=10,
ignore_eos=False)
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
ref_llm = LLM(model=model_name, max_model_len=2048)
ref_outputs = ref_llm.chat([prompt], sampling_params)
ref_logprobs = []
for output in ref_outputs[0].outputs:
@@ -300,7 +300,6 @@ def test_eagle_logprobs(
"max_model_len": 128,
},
max_model_len=128,
enforce_eager=False,
) as runner:
spec_outputs = runner.model.chat([prompt], sampling_params)