[E2E] Optimize the E2E test time. (#5294)
### What this PR does / why we need it?
Add cudagraph_capture_sizes for E2E CI test.
- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c
Signed-off-by: menogrey <1299267905@qq.com>
This commit is contained in:
@@ -41,6 +41,7 @@ def mtp_correctness(sampling_config: SamplingParams,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_model_len=256,
|
||||
cudagraph_capture_sizes=[12],
|
||||
enforce_eager=enforce_eager) as ref_llm:
|
||||
ref_outputs = ref_llm.generate(example_prompts, sampling_config)
|
||||
|
||||
|
||||
@@ -79,6 +79,7 @@ def test_ngram_correctness(
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
max_model_len=1024,
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||
) as ref_llm:
|
||||
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
|
||||
|
||||
@@ -91,6 +92,7 @@ def test_ngram_correctness(
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||
) as runner:
|
||||
spec_outputs = runner.model.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
@@ -193,7 +195,9 @@ def test_suffix_correctness(
|
||||
Compare the outputs of a original LLM and a speculative LLM
|
||||
should be the same when using ngram speculative decoding.
|
||||
'''
|
||||
with VllmRunner(model_name, max_model_len=1024) as ref_llm:
|
||||
with VllmRunner(model_name,
|
||||
max_model_len=1024,
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8]) as ref_llm:
|
||||
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
|
||||
|
||||
with VllmRunner(model_name,
|
||||
@@ -201,6 +205,7 @@ def test_suffix_correctness(
|
||||
"method": "suffix",
|
||||
"num_speculative_tokens": 8,
|
||||
},
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||
max_model_len=1024) as runner:
|
||||
spec_outputs = runner.model.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
@@ -237,6 +242,7 @@ def test_suffix_acceptance(
|
||||
"num_speculative_tokens": 10,
|
||||
},
|
||||
max_model_len=1024,
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||
disable_log_stats=False) as runner:
|
||||
for i in range(10):
|
||||
runner.model.chat(test_prompts[i], sampling_config)
|
||||
@@ -300,6 +306,7 @@ def test_eagle_logprobs(
|
||||
"max_model_len": 128,
|
||||
},
|
||||
max_model_len=128,
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||
) as runner:
|
||||
spec_outputs = runner.model.chat([prompt], sampling_params)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user