Add gsm8k accuracy test for multi-node Qwen3-235B-A22B (#4802)

### What this PR does / why we need it?
Adds a gsm8k accuracy test, since there is currently no accuracy test for the Qwen3-235B-A22B model.

Test result:

| dataset | version | metric   | mode | vllm-api-general-chat |
|---------|---------|----------|------|-----------------------|
| gsm8k   | 7cd45e  | accuracy | gen  | 96.29                 |

Test case running time: about 30 minutes.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Author: Nengjun Ma
Date: 2025-12-09 23:05:41 +08:00
Committed by: GitHub
Parent: a77045f355
Commit: 863a5a5a17


```diff
@@ -23,7 +23,7 @@ deployment:
       --tensor-parallel-size 8
       --seed 1024
       --enable-expert-parallel
-      --max-num-seqs 16
+      --max-num-seqs 32
       --max-model-len 8192
       --max-num-batched-tokens 8192
       --trust-remote-code
@@ -40,7 +40,7 @@ deployment:
       --data-parallel-rpc-port 13389
       --tensor-parallel-size 8
       --seed 1024
-      --max-num-seqs 16
+      --max-num-seqs 32
       --max-model-len 8192
       --max-num-batched-tokens 8192
       --enable-expert-parallel
@@ -48,4 +48,12 @@ deployment:
       --no-enable-prefix-caching
       --gpu-memory-utilization 0.9
 benchmarks:
+  acc:
+    case_type: accuracy
+    dataset_path: vllm-ascend/gsm8k
+    request_conf: vllm_api_general_chat
+    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
+    max_out_len: 7680
+    batch_size: 512
+    baseline: 95
+    threshold: 3
```
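The `baseline: 95` / `threshold: 3` pair presumably means the run passes as long as the measured accuracy does not fall more than 3 points below the 95-point baseline. A minimal sketch of such a pass/fail check (the function name and exact pass semantics are assumptions for illustration, not the benchmark harness's actual code):

```python
def accuracy_within_threshold(measured: float, baseline: float, threshold: float) -> bool:
    """Pass if the measured accuracy is no more than `threshold` points
    below `baseline`. Semantics assumed, not taken from the harness."""
    return measured >= baseline - threshold

# The reported gsm8k accuracy from this PR's test run (96.29) passes
# against baseline=95, threshold=3 under this interpretation.
print(accuracy_within_threshold(96.29, baseline=95, threshold=3))
```

Under this reading, any accuracy at or above 92.0 would pass, so the measured 96.29 clears the gate with margin.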