From 34386c8896f9532f08e865b459940a2b1d0841e6 Mon Sep 17 00:00:00 2001
From: ZYang6263 <50876451+ZYang6263@users.noreply.github.com>
Date: Fri, 10 Apr 2026 14:22:24 +0800
Subject: [PATCH] [v0.18.0][CI] Fix and simplify the CI for Qwen3 32B (#8093)

### What this PR does / why we need it?

This PR fixes and simplifies the CI configuration for Qwen3 32B. The main
changes are:

- Remove the redundant `Qwen3-32B-Int8-A3-Feature-Stack3.yaml` config and
  consolidate the CI setup into `Qwen3-32B-Int8.yaml`.
- Improve runtime stability by adding
  `PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` and setting
  `--max-num-seqs 80` (see the sketch after this list).
- Update the accuracy benchmark from `aime2024` to `gsm8k-lite`, and adjust
  the related dataset config, output length, baseline, and threshold
  accordingly.

These changes make the Qwen3 32B CI easier to maintain and more stable in
nightly validation.
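A minimal sketch of where the two stability settings land in the
consolidated `Qwen3-32B-Int8.yaml`, excerpted from the diff below with all
surrounding keys omitted:

```yaml
_envs: &envs
  # Lets the NPU caching allocator grow memory segments on demand instead
  # of reserving fixed-size blocks, which reduces fragmentation-driven OOMs.
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"

_server_cmd: &server_cmd
  # Caps the number of concurrently scheduled sequences; the lower cap
  # trades some peak throughput for a predictable memory footprint.
  - "--max-num-seqs"
  - "80"
```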
---------

Signed-off-by: ZYang6263
---
 .../workflows/schedule_nightly_test_a3.yaml        |  3 -
 .../Qwen3-32B-Int8-A3-Feature-Stack3.yaml          | 69 -------------------
 .../models/configs/Qwen3-32B-Int8.yaml             | 33 ++++-----
 3 files changed, 17 insertions(+), 88 deletions(-)
 delete mode 100644 tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A3-Feature-Stack3.yaml

diff --git a/.github/workflows/schedule_nightly_test_a3.yaml b/.github/workflows/schedule_nightly_test_a3.yaml
index 8deb59ca..e05bc78a 100644
--- a/.github/workflows/schedule_nightly_test_a3.yaml
+++ b/.github/workflows/schedule_nightly_test_a3.yaml
@@ -214,9 +214,6 @@ jobs:
           - name: qwen2-5-vl-32b
             os: linux-aarch64-a3-4
             config_file_path: Qwen2.5-VL-32B-Instruct.yaml
-          - name: qwen3-32b-int8-a3-feature-stack3
-            os: linux-aarch64-a3-4
-            config_file_path: Qwen3-32B-Int8-A3-Feature-Stack3.yaml
           - name: qwen3-32b-int8-prefix-cache
             os: linux-aarch64-a3-4
             config_file_path: Prefix-Cache-Qwen3-32B-Int8.yaml
diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A3-Feature-Stack3.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A3-Feature-Stack3.yaml
deleted file mode 100644
index 0b396ae0..00000000
--- a/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A3-Feature-Stack3.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-# ==========================================
-# ACTUAL TEST CASES
-# ==========================================
-
-test_cases:
-  - name: "Qwen3-32B-W8A8-a3-feature-stack3"
-    model: "vllm-ascend/Qwen3-32B-W8A8"
-    envs:
-      VLLM_USE: "1"
-      TASK_QUEUE_ENABLE: "1"
-      HCCL_OP_EXPANSION_MODE: "AIV"
-      OMP_PROC_BIND: "false"
-      VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE: "1"
-      VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
-      SERVER_PORT: "DEFAULT_PORT"
-    prompts:
-      - "9.11 and 9.8, which is greater?"
-    api_keyword_args:
-      chat_template_kwargs:
-        enable_thinking: true
-    server_cmd:
-      - "--quantization"
-      - "ascend"
-      - "--tensor-parallel-size"
-      - "4"
-      - "--port"
-      - "$SERVER_PORT"
-      - "--trust-remote-code"
-      - "--reasoning-parser"
-      - "qwen3"
-      - "--distributed_executor_backend"
-      - "mp"
-      - "--gpu-memory-utilization"
-      - "0.9"
-      - "--block-size"
-      - "128"
-      - "--max-num-seqs"
-      - "256"
-      - "--enforce-eager"
-      - "--max-model-len"
-      - "35840"
-      - "--max-num-batched-tokens"
-      - "35840"
-      - "--additional-config"
-      - '{"enable_weight_nz_layout":true, "weight_prefetch_config":{"enabled": true}}'
-      - "--compilation-config"
-      - '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
-    test_content:
-      - "chat_completion"
-    benchmarks:
-      acc:
-        case_type: accuracy
-        dataset_path: vllm-ascend/gsm8k-lite
-        request_conf: vllm_api_general_chat
-        dataset_conf: gsm8k/gsm8k_gen_0_shot_noncot_chat_prompt
-        max_out_len: 10240
-        batch_size: 32
-        baseline: 96
-        threshold: 4
-      perf:
-        case_type: performance
-        dataset_path: vllm-ascend/GSM8K-in3500-bs400
-        request_conf: vllm_api_stream_chat
-        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
-        num_prompts: 240
-        max_out_len: 1500
-        batch_size: 60
-        baseline: 1
-        threshold: 0.97
diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8.yaml
index 0a3a90e5..dbd8ead7 100644
--- a/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8.yaml
@@ -4,6 +4,7 @@
 _envs: &envs
   TASK_QUEUE_ENABLE: "1"
+  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
   HCCL_OP_EXPANSION_MODE: "AIV"
   VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
   SERVER_PORT: "DEFAULT_PORT"
 
@@ -14,6 +15,8 @@ _server_cmd: &server_cmd
   - "--no-enable-prefix-caching"
   - "--tensor-parallel-size"
   - "4"
+  - "--max-num-seqs"
+  - "80"
   - "--port"
   - "$SERVER_PORT"
   - "--max-model-len"
@@ -23,8 +26,6 @@ _server_cmd: &server_cmd
   - "--block-size"
   - "128"
   - "--trust-remote-code"
-  - "--reasoning-parser"
-  - "qwen3"
   - "--gpu-memory-utilization"
   - "0.9"
   - "--async-scheduling"
@@ -34,23 +35,23 @@ _server_cmd: &server_cmd
 _benchmarks: &benchmarks
   acc:
     case_type: accuracy
-    dataset_path: vllm-ascend/aime2024
+    dataset_path: vllm-ascend/gsm8k-lite
     request_conf: vllm_api_general_chat
-    dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
-    max_out_len: 32768
+    dataset_conf: gsm8k/gsm8k_gen_0_shot_noncot_chat_prompt
+    max_out_len: 10240
     batch_size: 32
-    baseline: 83.33
-    threshold: 7
+    baseline: 96
+    threshold: 4
   perf:
-      case_type: performance
-      dataset_path: vllm-ascend/GSM8K-in3500-bs400
-      request_conf: vllm_api_stream_chat
-      dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
-      num_prompts: 304
-      max_out_len: 1500
-      batch_size: 76
-      baseline: 1
-      threshold: 0.97
+    case_type: performance
+    dataset_path: vllm-ascend/GSM8K-in3500-bs400
+    request_conf: vllm_api_stream_chat
+    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
+    num_prompts: 304
+    max_out_len: 1500
+    batch_size: 76
+    baseline: 1
+    threshold: 0.97
 
 # ==========================================
 # ACTUAL TEST CASES