[CI] Refactor e2e CI tests (#5246)

### What this PR does / why we need it? Refactor e2e CI tests: 1. tests/e2e/singlecard/pooling/test_embedding.py: remove the eager parameter and rename the test case 2. tests/e2e/singlecard/pooling/test_scoring.py: rename test cases 3. tests/e2e/singlecard/pooling/test_classification.py: rename the test case 4. tests/e2e/singlecard/test_quantization.py: remove the eager parameter, change the model to vllm-ascend/Qwen2.5-0.6B-W8A8, and rename the test case 5. tests/e2e/multicard/test_shared_expert_dp.py: rename test cases 6. tests/e2e/singlecard/test_sampler.py: rename test cases 7. tests/e2e/singlecard/test_aclgraph_accuracy.py: rename test cases 8. tests/e2e/multicard/test_offline_inference_distributed.py: rename test cases and remove the eager parameter 9. tests/e2e/multicard/long_sequence/test_accuracy.py: rename test cases and remove the eager parameter 10. tests/e2e/multicard/long_sequence/test_basic.py: rename test cases and remove the eager parameter 11. tests/e2e/multicard/test_expert_parallel.py: remove the eager parameter 12. tests/e2e/multicard/test_full_graph_mode.py: remove the eager parameter 13. tests/e2e/multicard/test_ilama_lora_tp2.py: remove the eager parameter 14. tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py: remove the eager parameter 15. tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py: remove the eager parameter 16. tests/e2e/singlecard/test_aclgraph_accuracy.py: remove the eager parameter 17. tests/e2e/singlecard/test_camem.py: remove the eager parameter 18. tests/e2e/singlecard/test_ilama_lora.py: remove the eager parameter 19. tests/e2e/singlecard/test_multistream_overlap_shared_expert.py: remove the eager parameter 20. tests/e2e/singlecard/test_vlm.py: remove the eager parameter 21. tests/e2e/singlecard/test_xli: remove the eager parameter ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: release/v0.13.0 - vLLM main: ad32e3e19c Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-12-23 18:42:35 +08:00
parent 5d1f6daef6
commit 8ae7fca947
20 changed files with 61 additions and 88 deletions
--- a/tests/e2e/singlecard/test_aclgraph_accuracy.py
+++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -36,7 +36,7 @@ MODELS = [

@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
-def test_output_between_eager_and_aclgraph(
+def test_models_output_between_eager_and_aclgraph(
    model: str,
    max_tokens: int,
 ) -> None:
@@ -50,7 +50,6 @@ def test_output_between_eager_and_aclgraph(
        with VllmRunner(
                model,
                max_model_len=1024,
-                enforce_eager=False,
                quantization="ascend",
        ) as runner:
            vllm_aclgraph_outputs = runner.model.generate(
@@ -68,7 +67,6 @@ def test_output_between_eager_and_aclgraph(
        with VllmRunner(
                model,
                max_model_len=1024,
-                enforce_eager=False,
        ) as runner:
            vllm_aclgraph_outputs = runner.model.generate(
                prompts, sampling_params)
@@ -100,7 +98,7 @@ def test_output_between_eager_and_aclgraph(

@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
-def test_output_between_eager_and_full_decode_only(
+def test_models_output_between_eager_and_full_decode_only(
    model: str,
    max_tokens: int,
 ) -> None:
@@ -155,7 +153,6 @@ def test_output_between_eager_and_full_decode_only(
        with VllmRunner(
                model,
                max_model_len=1024,
-                enforce_eager=False,
                compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
                quantization="ascend",
        ) as runner:
@@ -166,7 +163,6 @@ def test_output_between_eager_and_full_decode_only(
        with VllmRunner(
                model,
                max_model_len=1024,
-                enforce_eager=False,
                compilation_config={
                    "cudagraph_capture_sizes": [4, 8, 32, 64],
                    "cudagraph_mode": "FULL_DECODE_ONLY"
@@ -196,7 +192,7 @@ def test_output_between_eager_and_full_decode_only(

@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
-def test_output_between_eager_and_fullgraph_npugraph_ex(
+def test_models_output_between_eager_and_fullgraph_npugraph_ex(
    model: str,
    max_tokens: int,
 ) -> None:
@@ -251,7 +247,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
        with VllmRunner(
                model,
                max_model_len=1024,
-                enforce_eager=False,
                compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
                additional_config={"enable_npugraph_ex": True},
                quantization="ascend",
@@ -263,7 +258,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
        with VllmRunner(
                model,
                max_model_len=1024,
-                enforce_eager=False,
                compilation_config={
                    "cudagraph_capture_sizes": [4, 8, 32, 64],
                    "cudagraph_mode": "FULL_DECODE_ONLY"