[CI] Refactor e2e CI tests (#5246)

### What this PR does / why we need it?

Refactor e2e CI tests:

1. tests/e2e/singlecard/pooling/test_embedding.py: remove the eager parameter and rename the test case
2. tests/e2e/singlecard/pooling/test_scoring.py: rename test cases
3. tests/e2e/singlecard/pooling/test_classification.py: rename the test case
4. tests/e2e/singlecard/test_quantization.py: remove the eager parameter, change the model to vllm-ascend/Qwen2.5-0.6B-W8A8, and rename the test case
5. tests/e2e/multicard/test_shared_expert_dp.py: rename test cases
6. tests/e2e/singlecard/test_sampler.py: rename test cases
7. tests/e2e/singlecard/test_aclgraph_accuracy.py: rename test cases
8. tests/e2e/multicard/test_offline_inference_distributed.py: rename test cases and remove the eager parameter
9. tests/e2e/multicard/long_sequence/test_accuracy.py: rename test cases and remove the eager parameter
10. tests/e2e/multicard/long_sequence/test_basic.py: rename test cases and remove the eager parameter
11. tests/e2e/multicard/test_expert_parallel.py: remove the eager parameter
12. tests/e2e/multicard/test_full_graph_mode.py: remove the eager parameter
13. tests/e2e/multicard/test_ilama_lora_tp2.py: remove the eager parameter
14. tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py: remove the eager parameter
15. tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py: remove the eager parameter
16. tests/e2e/singlecard/test_aclgraph_accuracy.py: remove the eager parameter
17. tests/e2e/singlecard/test_camem.py: remove the eager parameter
18. tests/e2e/singlecard/test_ilama_lora.py: remove the eager parameter
19. tests/e2e/singlecard/test_multistream_overlap_shared_expert.py: remove the eager parameter
20. tests/e2e/singlecard/test_vlm.py: remove the eager parameter
21. tests/e2e/singlecard/test_xli: remove the eager parameter

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-12-23 18:42:35 +08:00
parent 5d1f6daef6
commit 8ae7fca947
20 changed files with 61 additions and 88 deletions
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
@@ -171,4 +171,4 @@ def test_mtp2_correctness_piecewise_graph_with_pad(
    mtp_correctness(sampling_config,
                    model_name,
                    2,
-                    disable_padded_drafter_batch=False)
+                    disable_padded_drafter_batch=False)
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -76,19 +76,22 @@ def test_ngram_correctness(
    should be the same when using ngram speculative decoding.
    '''

-    with VllmRunner(model_name, max_model_len=1024,
-                    enforce_eager=False) as ref_llm:
+    with VllmRunner(
+            model_name,
+            max_model_len=1024,
+    ) as ref_llm:
        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)

-    with VllmRunner(model_name,
-                    speculative_config={
-                        "method": "ngram",
-                        "prompt_lookup_max": 5,
-                        "prompt_lookup_min": 3,
-                        "num_speculative_tokens": 3,
-                    },
-                    max_model_len=1024,
-                    enforce_eager=False) as runner:
+    with VllmRunner(
+            model_name,
+            speculative_config={
+                "method": "ngram",
+                "prompt_lookup_max": 5,
+                "prompt_lookup_min": 3,
+                "num_speculative_tokens": 3,
+            },
+            max_model_len=1024,
+    ) as runner:
        spec_outputs = runner.model.chat(test_prompts, sampling_config)
    matches = 0
    misses = 0
@@ -190,8 +193,7 @@ def test_suffix_correctness(
    Compare the outputs of a original LLM and a speculative LLM
    should be the same when using ngram speculative decoding.
    '''
-    with VllmRunner(model_name, max_model_len=1024,
-                    enforce_eager=False) as ref_llm:
+    with VllmRunner(model_name, max_model_len=1024) as ref_llm:
        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)

    with VllmRunner(model_name,
@@ -199,8 +201,7 @@ def test_suffix_correctness(
                        "method": "suffix",
                        "num_speculative_tokens": 8,
                    },
-                    max_model_len=1024,
-                    enforce_eager=False) as runner:
+                    max_model_len=1024) as runner:
        spec_outputs = runner.model.chat(test_prompts, sampling_config)
    matches = 0
    misses = 0
@@ -236,8 +237,7 @@ def test_suffix_acceptance(
                        "num_speculative_tokens": 10,
                    },
                    max_model_len=1024,
-                    disable_log_stats=False,
-                    enforce_eager=False) as runner:
+                    disable_log_stats=False) as runner:
        for i in range(10):
            runner.model.chat(test_prompts[i], sampling_config)
            metrics = runner.model.get_metrics()
@@ -278,7 +278,7 @@ def test_eagle_logprobs(
                                     max_tokens=10,
                                     ignore_eos=False)

-    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
+    ref_llm = LLM(model=model_name, max_model_len=2048)
    ref_outputs = ref_llm.chat([prompt], sampling_params)
    ref_logprobs = []
    for output in ref_outputs[0].outputs:
@@ -300,7 +300,6 @@ def test_eagle_logprobs(
                "max_model_len": 128,
            },
            max_model_len=128,
-            enforce_eager=False,
    ) as runner:
        spec_outputs = runner.model.chat([prompt], sampling_params)