[E2E] Optimize the E2E test time. (#5294)

### What this PR does / why we need it?
Set an explicit `cudagraph_capture_sizes` in the E2E CI tests to shorten test time.

- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c

Signed-off-by: menogrey <1299267905@qq.com>
This commit is contained in:
zhangyiming
2025-12-26 14:17:50 +08:00
committed by GitHub
parent 29d2fe653d
commit 45c5bcd962
22 changed files with 57 additions and 5 deletions

View File

@@ -41,6 +41,7 @@ def mtp_correctness(sampling_config: SamplingParams,
tensor_parallel_size=1,
gpu_memory_utilization=0.7,
max_model_len=256,
cudagraph_capture_sizes=[12],
enforce_eager=enforce_eager) as ref_llm:
ref_outputs = ref_llm.generate(example_prompts, sampling_config)

View File

@@ -79,6 +79,7 @@ def test_ngram_correctness(
with VllmRunner(
model_name,
max_model_len=1024,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as ref_llm:
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
@@ -91,6 +92,7 @@ def test_ngram_correctness(
"num_speculative_tokens": 3,
},
max_model_len=1024,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as runner:
spec_outputs = runner.model.chat(test_prompts, sampling_config)
matches = 0
@@ -193,7 +195,9 @@ def test_suffix_correctness(
Compare the outputs of a original LLM and a speculative LLM
should be the same when using ngram speculative decoding.
'''
with VllmRunner(model_name, max_model_len=1024) as ref_llm:
with VllmRunner(model_name,
max_model_len=1024,
cudagraph_capture_sizes=[1, 2, 4, 8]) as ref_llm:
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
with VllmRunner(model_name,
@@ -201,6 +205,7 @@ def test_suffix_correctness(
"method": "suffix",
"num_speculative_tokens": 8,
},
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=1024) as runner:
spec_outputs = runner.model.chat(test_prompts, sampling_config)
matches = 0
@@ -237,6 +242,7 @@ def test_suffix_acceptance(
"num_speculative_tokens": 10,
},
max_model_len=1024,
cudagraph_capture_sizes=[1, 2, 4, 8],
disable_log_stats=False) as runner:
for i in range(10):
runner.model.chat(test_prompts[i], sampling_config)
@@ -300,6 +306,7 @@ def test_eagle_logprobs(
"max_model_len": 128,
},
max_model_len=128,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as runner:
spec_outputs = runner.model.chat([prompt], sampling_params)