[E2E] Optimize the E2E test time. (#5294)

### What this PR does / why we need it?

Set an explicit `cudagraph_capture_sizes` in the E2E CI tests to reduce test time.

- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c

Signed-off-by: menogrey <1299267905@qq.com>
2025-12-26 14:17:50 +08:00
parent 29d2fe653d
commit 45c5bcd962
22 changed files with 57 additions and 5 deletions
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
@@ -41,6 +41,7 @@ def mtp_correctness(sampling_config: SamplingParams,
                    tensor_parallel_size=1,
                    gpu_memory_utilization=0.7,
                    max_model_len=256,
+                    cudagraph_capture_sizes=[12],
                    enforce_eager=enforce_eager) as ref_llm:
        ref_outputs = ref_llm.generate(example_prompts, sampling_config)

--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -79,6 +79,7 @@ def test_ngram_correctness(
    with VllmRunner(
            model_name,
            max_model_len=1024,
+            cudagraph_capture_sizes=[1, 2, 4, 8],
    ) as ref_llm:
        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)

@@ -91,6 +92,7 @@ def test_ngram_correctness(
                "num_speculative_tokens": 3,
            },
            max_model_len=1024,
+            cudagraph_capture_sizes=[1, 2, 4, 8],
    ) as runner:
        spec_outputs = runner.model.chat(test_prompts, sampling_config)
    matches = 0
@@ -193,7 +195,9 @@ def test_suffix_correctness(
    Compare the outputs of a original LLM and a speculative LLM
    should be the same when using ngram speculative decoding.
    '''
-    with VllmRunner(model_name, max_model_len=1024) as ref_llm:
+    with VllmRunner(model_name,
+                    max_model_len=1024,
+                    cudagraph_capture_sizes=[1, 2, 4, 8]) as ref_llm:
        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)

    with VllmRunner(model_name,
@@ -201,6 +205,7 @@ def test_suffix_correctness(
                        "method": "suffix",
                        "num_speculative_tokens": 8,
                    },
+                    cudagraph_capture_sizes=[1, 2, 4, 8],
                    max_model_len=1024) as runner:
        spec_outputs = runner.model.chat(test_prompts, sampling_config)
    matches = 0
@@ -237,6 +242,7 @@ def test_suffix_acceptance(
                        "num_speculative_tokens": 10,
                    },
                    max_model_len=1024,
+                    cudagraph_capture_sizes=[1, 2, 4, 8],
                    disable_log_stats=False) as runner:
        for i in range(10):
            runner.model.chat(test_prompts[i], sampling_config)
@@ -300,6 +306,7 @@ def test_eagle_logprobs(
                "max_model_len": 128,
            },
            max_model_len=128,
+            cudagraph_capture_sizes=[1, 2, 4, 8],
    ) as runner:
        spec_outputs = runner.model.chat([prompt], sampling_params)