[CI] Refactor e2e CI tests (#5246)
### What this PR does / why we need it?
Refactor e2e CI tests:
1. tests/e2e/singlecard/pooling/test_embedding.py: remove the eager
parameter and rename test case
2. tests/e2e/singlecard/pooling/test_scoring.py: Rename test cases
3. tests/e2e/singlecard/pooling/test_classification.py: Rename test case
4. tests/e2e/singlecard/test_quantization.py: remove the eager parameter
and change model to vllm-ascend/Qwen2.5-0.6B-W8A8 and rename test case
5. tests/e2e/multicard/test_shared_expert_dp.py: Rename test cases
6. tests/e2e/singlecard/test_sampler.py: Rename test cases
7. tests/e2e/singlecard/test_aclgraph_accuracy.py: Rename test cases
8. tests/e2e/multicard/test_offline_inference_distributed.py: Rename
test cases and remove the eager parameter
9. tests/e2e/multicard/long_sequence/test_accuracy.py: Rename test cases
and remove the eager parameter
10. tests/e2e/multicard/long_sequence/test_basic.py: Rename test cases
and remove the eager parameter
11. tests/e2e/multicard/test_expert_parallel.py: remove the eager
parameter
12. tests/e2e/multicard/test_full_graph_mode.py: remove the eager
parameter
13. tests/e2e/multicard/test_ilama_lora_tp2.py: remove the eager parameter
14. tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py: remove
the eager parameter
15. tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py: remove the
eager parameter
16. tests/e2e/singlecard/test_aclgraph_accuracy.py: remove the eager
parameter
17. tests/e2e/singlecard/test_camem.py: remove the eager parameter
18. tests/e2e/singlecard/test_ilama_lora.py: remove the eager parameter
19. tests/e2e/singlecard/test_multistream_overlap_shared_expert.py: remove
the eager parameter
20. tests/e2e/singlecard/test_vlm.py: remove the eager parameter
21. tests/e2e/singlecard/test_xli: remove the eager parameter
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -171,4 +171,4 @@ def test_mtp2_correctness_piecewise_graph_with_pad(
|
||||
mtp_correctness(sampling_config,
|
||||
model_name,
|
||||
2,
|
||||
disable_padded_drafter_batch=False)
|
||||
disable_padded_drafter_batch=False)
|
||||
@@ -76,19 +76,22 @@ def test_ngram_correctness(
|
||||
should be the same when using ngram speculative decoding.
|
||||
'''
|
||||
|
||||
with VllmRunner(model_name, max_model_len=1024,
|
||||
enforce_eager=False) as ref_llm:
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
max_model_len=1024,
|
||||
) as ref_llm:
|
||||
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
|
||||
|
||||
with VllmRunner(model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
enforce_eager=False) as runner:
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
) as runner:
|
||||
spec_outputs = runner.model.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
@@ -190,8 +193,7 @@ def test_suffix_correctness(
|
||||
Compare the outputs of a original LLM and a speculative LLM
|
||||
should be the same when using ngram speculative decoding.
|
||||
'''
|
||||
with VllmRunner(model_name, max_model_len=1024,
|
||||
enforce_eager=False) as ref_llm:
|
||||
with VllmRunner(model_name, max_model_len=1024) as ref_llm:
|
||||
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
|
||||
|
||||
with VllmRunner(model_name,
|
||||
@@ -199,8 +201,7 @@ def test_suffix_correctness(
|
||||
"method": "suffix",
|
||||
"num_speculative_tokens": 8,
|
||||
},
|
||||
max_model_len=1024,
|
||||
enforce_eager=False) as runner:
|
||||
max_model_len=1024) as runner:
|
||||
spec_outputs = runner.model.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
@@ -236,8 +237,7 @@ def test_suffix_acceptance(
|
||||
"num_speculative_tokens": 10,
|
||||
},
|
||||
max_model_len=1024,
|
||||
disable_log_stats=False,
|
||||
enforce_eager=False) as runner:
|
||||
disable_log_stats=False) as runner:
|
||||
for i in range(10):
|
||||
runner.model.chat(test_prompts[i], sampling_config)
|
||||
metrics = runner.model.get_metrics()
|
||||
@@ -278,7 +278,7 @@ def test_eagle_logprobs(
|
||||
max_tokens=10,
|
||||
ignore_eos=False)
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048)
|
||||
ref_outputs = ref_llm.chat([prompt], sampling_params)
|
||||
ref_logprobs = []
|
||||
for output in ref_outputs[0].outputs:
|
||||
@@ -300,7 +300,6 @@ def test_eagle_logprobs(
|
||||
"max_model_len": 128,
|
||||
},
|
||||
max_model_len=128,
|
||||
enforce_eager=False,
|
||||
) as runner:
|
||||
spec_outputs = runner.model.chat([prompt], sampling_params)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user