[doc] add Prefill-Decode Disaggregation doc for GLM5.md (#7300)

### What this PR does / why we need it? add Prefill-Decode Disaggregation doc for GLM5.md w8a8 65k-1.5k Concurrency: 80 prefixcache: 90% tps: 2054 - vLLM version: v0.17.0 - vLLM main: 4034c3d32e --------- Signed-off-by: liuhaiyang27 <liuhaiyang27@huawei.com> Co-authored-by: liuhaiyang27 <liuhaiyang27@huawei.com>
2026-03-18 17:00:31 +08:00
parent 6bc68c55d0
commit 58725b8b24
3 changed files with 926 additions and 47 deletions
--- a/tests/e2e/nightly/single_node/models/configs/GLM-5.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/GLM-5.yaml
@@ -0,0 +1,83 @@
+# ==========================================
+# Shared Configurations
+# ==========================================
+
+# Environment variables common to the test cases below; each case pulls them
+# in with `<<: *envs`.
+_envs: &envs
+  # Literal placeholder string, not a port number.
+  # NOTE(review): presumably replaced with a real port by the test runner
+  # before launch; confirm in the harness.
+  SERVER_PORT: 'DEFAULT_PORT'
+  HCCL_BUFFSIZE: '200'
+  HCCL_OP_EXPANSION_MODE: 'AIV'
+  # Quoted so the values load as strings rather than YAML booleans/integers.
+  OMP_PROC_BIND: 'false'
+  OMP_NUM_THREADS: '1'
+  PYTORCH_NPU_ALLOC_CONF: 'expandable_segments:True'
+  VLLM_ASCEND_BALANCE_SCHEDULING: '1'
+
+# Base server command-line arguments, reused by the test cases through
+# `*server_cmd`. A flag that takes a value is followed by that value as the
+# next list item, so the item order must be preserved.
+_server_cmd: &server_cmd
+  # Parallelism layout.
+  - '--enable-expert-parallel'
+  - '--tensor-parallel-size'
+  - '16'
+  - '--data-parallel-size'
+  - '1'
+  # NOTE(review): looks like a reference to the SERVER_PORT entry in the
+  # shared envs; confirm where the runner expands it.
+  - '--port'
+  - '$SERVER_PORT'
+  # Sequence-length and batching limits.
+  - '--max-model-len'
+  - '8192'
+  - '--max-num-batched-tokens'
+  - '4096'
+  - '--trust-remote-code'
+  - '--gpu-memory-utilization'
+  - '0.95'
+  - '--max-num-seqs'
+  - '8'
+  - '--quantization'
+  - 'ascend'
+  - '--async-scheduling'
+  # JSON payloads are single-quoted so the embedded double quotes stay literal.
+  - '--additional-config'
+  - '{"enable_npugraph_ex": true,"fuse_muls_add":true,"multistream_overlap_shared_expert":true}'
+  - '--speculative-config'
+  - '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
+
+# Benchmark definitions merged into each test case with `<<: *benchmarks`.
+# NOTE(review): the meaning/units of `baseline` and `threshold` are defined by
+# the benchmark runner, not by this file; confirm there before changing them.
+_benchmarks: &benchmarks
+  # Accuracy benchmark.
+  acc:
+    case_type: 'accuracy'
+    dataset_path: 'vllm-ascend/gsm8k-lite'
+    dataset_conf: 'gsm8k/gsm8k_gen_0_shot_cot_chat_prompt'
+    request_conf: 'vllm_api_general_chat'
+    batch_size: 8
+    max_out_len: 4096
+    baseline: 95
+    threshold: 5
+  # Performance benchmark.
+  perf:
+    case_type: 'performance'
+    dataset_path: 'vllm-ascend/GSM8K-in3500-bs400'
+    dataset_conf: 'gsm8k/gsm8k_gen_0_shot_cot_str_perf'
+    request_conf: 'vllm_api_stream_chat'
+    num_prompts: 16
+    batch_size: 8
+    max_out_len: 1500
+    request_rate: 0
+    baseline: 1
+    threshold: 0.97
+
+# ==========================================
+# ACTUAL TEST CASES
+# ==========================================
+
+test_cases:
+  # Graph-capture case: the shared server command plus an extra
+  # `--compilation-config` argument.
+  - name: 'GLM-5-TP16-DP1-decodegraph'
+    model: 'Eco-Tech/GLM-5-w4a8'
+    envs:
+      <<: *envs
+    server_cmd: *server_cmd
+    server_cmd_extra:
+      - '--compilation-config'
+      # The JSON keys must be vLLM CompilationConfig field names:
+      # `cudagraph_capture_sizes` and `cudagraph_mode`. Misspelled keys do not
+      # configure graph capture.
+      - '{"cudagraph_capture_sizes": [4,8,12,16,20,24,28,32], "cudagraph_mode":"FULL_DECODE_ONLY"}'
+    benchmarks:
+      <<: *benchmarks
+
+  # Case named "eager": same model, envs, server command and benchmarks as
+  # the case above, but with no `server_cmd_extra` entry.
+  - name: 'GLM-5-TP16-DP1-eager'
+    model: 'Eco-Tech/GLM-5-w4a8'
+    server_cmd: *server_cmd
+    # Merged (not aliased) so this case gets its own copy of each mapping.
+    envs:
+      <<: *envs
+    benchmarks:
+      <<: *benchmarks