Cherry-pick https://github.com/vllm-project/vllm-ascend/pull/8683 ### What this PR does / why we need it? This PR relaxes the TTFT threshold from `0.4` to `0.5` to improve robustness under Data Parallel (DP) load imbalance. #### Background The current assertion enforces: prefix75 < prefix0 * 0.4 #### ❌ Nightly Failure Cases (Observed) | prefix0 | threshold (0.4x) | prefix75 | delta | |--------|------------------|----------|--------| | 4696.24 | 1878.50 | 1883.99 | +5.49 | | 4696.20 | 1878.48 | 1896.01 | +17.53 | | 4636.73 | 1854.69 | 1902.48 | +47.79 | | 4655.17 | 1862.07 | 1913.54 | +51.47 | | 4685.35 | 1874.14 | 1919.36 | +45.22 | | 4660.33 | 1864.13 | 1915.41 | +51.28 | | 4648.30 | 1859.32 | 1950.50 | +91.18 | | 4655.30 | 1862.12 | 1962.32 | +100.20 | --- #### ✅ Nightly Passing Cases (Observed) | prefix0 | threshold (0.4x) | prefix75 | margin | |--------|------------------|----------|---------| | 4685.64 | 1874.26 | 1864.46 | -9.80 | | 5520.28 | 2208.11 | 1928.97 | -279.14 | | 4639.23 | 1855.69 | 1846.86 | -8.83 | | 4651.64 | 1860.66 | 1854.30 | -6.36 | | 4640.39 | 1856.15 | 1840.32 | -15.83 | | 4677.20 | 1870.88 | 1848.35 | -22.53 | --- #### Key Observations - Failures exceed the threshold by only **~5 ms to ~100 ms (~0.3%–5%)** - Passing cases often have **very tight margins (~5–10 ms)** - There is clear **overlap between pass and fail boundaries** - Many failures are **borderline violations**, not real regressions --- #### Root Cause The instability is caused by **Data Parallel (DP) load imbalance**, which introduces systematic variance: - Uneven request distribution across workers - Queueing delays - Increased TTFT variance (especially for `prefix75`) --- #### Conclusion - The current threshold (`0.4x`) is **too strict** - Observed natural fluctuation: - Absolute: up to ~100 ms - Relative: up to ~5% over threshold - Pass/fail boundary is currently **too sensitive to runtime jitter** --- #### Change We relax the threshold: **0.4 → 0.5** This adjustment: - 
Accounts for expected runtime variance - Reduces spurious failures (false positives caused by runtime jitter, not real regressions) - Maintains a meaningful performance constraint Even with `0.5`, the requirement remains strict (`prefix75 < 50% of prefix0`) and does not mask real regressions. --- ### Does this PR introduce _any_ user-facing change? No. This change only affects internal test assertions and does not impact user-facing behavior or model performance. --- ### How was this patch tested? - Verified against existing TTFT test cases: - Previously failing cases (due to small variance) now pass - No regressions observed in other scenarios - Confirmed that failures were due to DP load imbalance rather than actual performance degradation - Ensured the updated threshold still enforces a meaningful constraint on TTFT Signed-off-by: underfituu <hzhucong@163.com>
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "prefix-cache-deepseek-r1-0528-w8a8"
    model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
    envs:
      OMP_NUM_THREADS: "10"
      OMP_PROC_BIND: "false"
      HCCL_BUFFSIZE: "1024"
      PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
      # Placeholder substituted by the test harness before launch.
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--quantization"
      - "ascend"
      - "--data-parallel-size"
      - "2"
      - "--tensor-parallel-size"
      - "8"
      - "--enable-expert-parallel"
      - "--port"
      - "$SERVER_PORT"
      - "--seed"
      - "1024"
      - "--max-model-len"
      - "5200"
      - "--max-num-batched-tokens"
      - "4096"
      - "--max-num-seqs"
      - "16"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.9"
      - "--additional-config"
      - '{"enable_weight_nz_layout": true}'
      - "--speculative-config"
      - '{"num_speculative_tokens": 1, "method": "mtp"}'
    test_content:
      - "benchmark_comparisons"
    benchmark_comparisons_args:
      # Assert: TTFT(prefix75) < TTFT(prefix0) * ratio.
      # Ratio relaxed 0.4 -> 0.5: nightly runs showed borderline violations of
      # ~5-100 ms (~0.3%-5%) caused by Data Parallel load-imbalance jitter,
      # not real regressions; 0.5 still enforces a strict 2x speedup.
      - metric: "TTFT"
        baseline: "prefix0"
        target: "prefix75"
        ratio: 0.5
        operator: "<"
    benchmarks:
      # Warm-up pass: not part of the comparison baseline (baseline: 0).
      warm_up:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in1024-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1
        batch_size: 1000
        baseline: 0
        threshold: 0.97
      # 0% prefix-cache hit rate: comparison baseline.
      prefix0:
        case_type: performance
        dataset_path: vllm-ascend/prefix0-in3500-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1
        batch_size: 18
        baseline: 1
        threshold: 0.97
      # 75% prefix-cache hit rate: comparison target.
      prefix75:
        case_type: performance
        dataset_path: vllm-ascend/prefix75-in3500-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1
        batch_size: 18
        baseline: 1
        threshold: 0.97