[UT][v0.18.0] Fix APC nightly UT and TTFT ratio (cherry-pick #7468) (#8053)

### What this PR does / why we need it?

Cherry-pick from https://github.com/vllm-project/vllm-ascend/pull/7468

- Lower the TTFT ratio threshold from 0.8 to 0.4 for the prefix cache benchmarks
- Set `max_out_len` to 1 for the warm_up config (was 2) and the benchmark configs (was 1500)
- Applied to both the DeepSeek-R1-0528-W8A8 and Qwen3-32B-Int8 configs

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: underfituu <hzhucong@163.com>
2026-04-08 21:08:26 +08:00
parent 044d4c3974
commit 4a628f1042
2 changed files with 8 additions and 8 deletions
--- a/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml
@@ -42,7 +42,7 @@ test_cases:
      - metric: "TTFT"
        baseline: "prefix0"
        target: "prefix75"
-        ratio: 0.8
+        ratio: 0.4
        operator: "<"
    benchmarks:
      warm_up:
@@ -51,7 +51,7 @@ test_cases:
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
-        max_out_len: 2
+        max_out_len: 1
        batch_size: 1000
        baseline: 0
        threshold: 0.97
@@ -61,7 +61,7 @@ test_cases:
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
-        max_out_len: 1500
+        max_out_len: 1
        batch_size: 18
        baseline: 1
        threshold: 0.97
@@ -71,7 +71,7 @@ test_cases:
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
-        max_out_len: 1500
+        max_out_len: 1
        batch_size: 18
        baseline: 1
        threshold: 0.97
--- a/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml
@@ -35,7 +35,7 @@ test_cases:
      - metric: "TTFT"
        baseline: "prefix0"
        target: "prefix75"
-        ratio: 0.8
+        ratio: 0.4
        operator: "<"
    benchmarks:
      warm_up:
@@ -44,7 +44,7 @@ test_cases:
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
-        max_out_len: 2
+        max_out_len: 1
        batch_size: 1000
        baseline: 0
        threshold: 0.97
@@ -54,7 +54,7 @@ test_cases:
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
-        max_out_len: 1500
+        max_out_len: 1
        batch_size: 48
        baseline: 1
        threshold: 0.97
@@ -64,7 +64,7 @@ test_cases:
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
-        max_out_len: 1500
+        max_out_len: 1
        batch_size: 48
        baseline: 1
        threshold: 0.97