From 4a628f104281d448bb9059cd4eb80f4bb40575fd Mon Sep 17 00:00:00 2001 From: hucong <33891520+underfituu@users.noreply.github.com> Date: Wed, 8 Apr 2026 21:08:26 +0800 Subject: [PATCH] [UT][v0.18.0] Fix APC nightly UT and TTFT ratio (cherry-pick #7468) (#8053) ### What this PR does / why we need it? Cherry-pick from https://github.com/vllm-project/vllm-ascend/pull/7468 - Fix TTFT ratio threshold from 0.8 to 0.4 for prefix cache benchmarks - Set max_out_len to 1 for warm_up (was 2) and for the two benchmark entries that follow it (was 1500) - Both changes are applied to the DeepSeek-R1-0528-W8A8 and Qwen3-32B-Int8 Prefix-Cache configs ### Does this PR introduce _any_ user-facing change? No. Only nightly e2e test configs under tests/e2e/nightly are modified. ### How was this patch tested? Signed-off-by: underfituu --- .../configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml | 8 ++++---- .../models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml index ddfbcab6..7629a7ff 100644 --- a/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml +++ b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml @@ -42,7 +42,7 @@ test_cases: - metric: "TTFT" baseline: "prefix0" target: "prefix75" - ratio: 0.8 + ratio: 0.4 operator: "<" benchmarks: warm_up: @@ -51,7 +51,7 @@ test_cases: request_conf: vllm_api_stream_chat dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf num_prompts: 210 - max_out_len: 2 + max_out_len: 1 batch_size: 1000 baseline: 0 threshold: 0.97 @@ -61,7 +61,7 @@ test_cases: request_conf: vllm_api_stream_chat dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf num_prompts: 210 - max_out_len: 1500 + max_out_len: 1 batch_size: 18 baseline: 1 threshold: 0.97 @@ -71,7 +71,7 @@ test_cases: request_conf: vllm_api_stream_chat dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf num_prompts: 210 - max_out_len: 1500 + max_out_len: 1 
batch_size: 18 baseline: 1 threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml index 6ead3525..d57368dd 100644 --- a/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml +++ b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml @@ -35,7 +35,7 @@ test_cases: - metric: "TTFT" baseline: "prefix0" target: "prefix75" - ratio: 0.8 + ratio: 0.4 operator: "<" benchmarks: warm_up: @@ -44,7 +44,7 @@ test_cases: request_conf: vllm_api_stream_chat dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf num_prompts: 210 - max_out_len: 2 + max_out_len: 1 batch_size: 1000 baseline: 0 threshold: 0.97 @@ -54,7 +54,7 @@ test_cases: request_conf: vllm_api_stream_chat dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf num_prompts: 210 - max_out_len: 1500 + max_out_len: 1 batch_size: 48 baseline: 1 threshold: 0.97 @@ -64,7 +64,7 @@ test_cases: request_conf: vllm_api_stream_chat dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf num_prompts: 210 - max_out_len: 1500 + max_out_len: 1 batch_size: 48 baseline: 1 threshold: 0.97