From 902d1312d9170dc567cd34bc715e1b5f7e514d53 Mon Sep 17 00:00:00 2001
From: jiangmengyu18 <56633611+jiangmengyu18@users.noreply.github.com>
Date: Fri, 3 Apr 2026 11:39:28 +0800
Subject: [PATCH] [v0.18.0][CI] add nightly ci test for qwen3vl (#7913)

### What this PR does / why we need it?
Add nightly ci test for qwen3vl

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: betta18
Co-authored-by: betta18
---
 .../workflows/schedule_nightly_test_a2.yaml   |  3 +
 .../workflows/schedule_nightly_test_a3.yaml   |  3 +
 .../Qwen3-VL-235B-A22B-Instruct-W8A8.yaml     | 71 +++++++++++++++++++
 .../configs/Qwen3-VL-32B-Instruct-W8A8.yaml   | 63 ++++++++++++++++
 4 files changed, 140 insertions(+)
 create mode 100644 tests/e2e/nightly/single_node/models/configs/Qwen3-VL-235B-A22B-Instruct-W8A8.yaml
 create mode 100644 tests/e2e/nightly/single_node/models/configs/Qwen3-VL-32B-Instruct-W8A8.yaml

diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml
index 74952a1b..d7a67934 100644
--- a/.github/workflows/schedule_nightly_test_a2.yaml
+++ b/.github/workflows/schedule_nightly_test_a2.yaml
@@ -87,6 +87,9 @@ jobs:
             os: linux-aarch64-a2b3-4
             tests: tests/e2e/nightly/single_node/ops/multicard_ops_a2/
           # YAML-driven tests
+          - name: qwen3-vl-32b-instruct-w8a8
+            os: linux-aarch64-a2b3-4
+            config_file_path: Qwen3-VL-32B-Instruct-W8A8.yaml
           - name: qwen3-32b
             os: linux-aarch64-a2b3-4
             config_file_path: Qwen3-32B.yaml
diff --git a/.github/workflows/schedule_nightly_test_a3.yaml b/.github/workflows/schedule_nightly_test_a3.yaml
index 66a5818e..d59df004 100644
--- a/.github/workflows/schedule_nightly_test_a3.yaml
+++ b/.github/workflows/schedule_nightly_test_a3.yaml
@@ -157,6 +157,9 @@ jobs:
             os: linux-aarch64-a3-16
             tests: tests/e2e/nightly/single_node/ops/multicard_ops_a3/
           # YAML-driven tests
+          - name: qwen3-vl-235b-a22b-instruct-w8a8
+            os: linux-aarch64-a3-16
+            config_file_path: Qwen3-VL-235B-A22B-Instruct-W8A8.yaml
           - name: deepseek-r1-0528-w8a8
             os: linux-aarch64-a3-16
             config_file_path: DeepSeek-R1-0528-W8A8.yaml
diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-235B-A22B-Instruct-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-235B-A22B-Instruct-W8A8.yaml
new file mode 100644
index 00000000..4eb8538d
--- /dev/null
+++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-235B-A22B-Instruct-W8A8.yaml
@@ -0,0 +1,71 @@
+# ==========================================
+# Shared Configurations
+# ==========================================
+
+_envs: &envs
+  OMP_NUM_THREADS: "1"
+  OMP_PROC_BIND: "false"
+  TASK_QUEUE_ENABLE: "1"
+  HCCL_OP_EXPANSION_MODE: "AIV"
+  HCCL_BUFFSIZE: "1536"
+  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+  VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
+  VLLM_ASCEND_ENABLE_FUSED_MC2: "1"
+  VLLM_ASCEND_ENABLE_NZ: "2"
+  VLLM_ASCEND_BALANCE_SCHEDULING: "1"
+  SERVER_PORT: "DEFAULT_PORT"
+
+_server_cmd: &server_cmd
+  - "--quantization"
+  - "ascend"
+  - "--no-enable-prefix-caching"
+  - "--mm-processor-cache-gb"
+  - "0"
+  - "--tensor-parallel-size"
+  - "4"
+  - "--data-parallel-size"
+  - "4"
+  - "--enable-expert-parallel"
+  - "--port"
+  - "$SERVER_PORT"
+  - "--max-model-len"
+  - "32768"
+  - "--max-num-batched-tokens"
+  - "16384"
+  - "--max-num-seqs"
+  - "32"
+  - "--trust-remote-code"
+  - "--gpu-memory-utilization"
+  - "0.92"
+  - "--async-scheduling"
+
+_benchmarks: &benchmarks
+  acc:
+    case_type: accuracy
+    dataset_path: vllm-ascend/textvqa-lite
+    request_conf: vllm_api_stream_chat
+    dataset_conf: textvqa/textvqa_gen_base64
+    max_out_len: 2048
+    batch_size: 128
+    baseline: 80
+    temperature: 0
+    top_k: -1
+    top_p: 1
+    repetition_penalty: 1
+    threshold: 5
+
+# ==========================================
+# ACTUAL TEST CASES
+# ==========================================
+
+test_cases:
+  - name: "Qwen3-VL-235B-A22B-Instruct-W8A8"
+    model: "Eco-Tech/Qwen3-VL-235B-A22B-Instruct-w8a8-QuaRot"
+    envs:
+      <<: *envs
+    server_cmd: *server_cmd
+    server_cmd_extra:
+      - "--compilation_config"
+      - '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1,2,4,8,16,24,32]}'
+    benchmarks:
+      <<: *benchmarks
diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-32B-Instruct-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-32B-Instruct-W8A8.yaml
new file mode 100644
index 00000000..4b5c4ec4
--- /dev/null
+++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-VL-32B-Instruct-W8A8.yaml
@@ -0,0 +1,63 @@
+# ==========================================
+# Shared Configurations
+# ==========================================
+
+_envs: &envs
+  OMP_NUM_THREADS: "1"
+  OMP_PROC_BIND: "false"
+  TASK_QUEUE_ENABLE: "1"
+  HCCL_OP_EXPANSION_MODE: "AIV"
+  VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
+  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+  VLLM_ASCEND_ENABLE_PREFETCH_MLP: "1"
+  SERVER_PORT: "DEFAULT_PORT"
+
+_server_cmd: &server_cmd
+  - "--quantization"
+  - "ascend"
+  - "--no-enable-prefix-caching"
+  - "--mm-processor-cache-gb"
+  - "0"
+  - "--tensor-parallel-size"
+  - "2"
+  - "--port"
+  - "$SERVER_PORT"
+  - "--max-model-len"
+  - "20000"
+  - "--max-num-batched-tokens"
+  - "8192"
+  - "--trust-remote-code"
+  - "--gpu-memory-utilization"
+  - "0.9"
+  - "--async-scheduling"
+
+_benchmarks: &benchmarks
+  acc:
+    case_type: accuracy
+    dataset_path: vllm-ascend/textvqa-lite
+    request_conf: vllm_api_stream_chat
+    dataset_conf: textvqa/textvqa_gen_base64
+    max_out_len: 2048
+    batch_size: 128
+    baseline: 80
+    temperature: 0
+    top_k: -1
+    top_p: 1
+    repetition_penalty: 1
+    threshold: 5
+
+# ==========================================
+# ACTUAL TEST CASES
+# ==========================================
+
+test_cases:
+  - name: "Qwen3-VL-32B-Instruct-W8A8"
+    model: "Eco-Tech/Qwen3-VL-32B-Instruct-w8a8-QuaRot"
+    envs:
+      <<: *envs
+    server_cmd: *server_cmd
+    server_cmd_extra:
+      - "--compilation_config"
+      - '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1,12,16,20,24,32,48,64,68,72,76,80,128]}'
+    benchmarks:
+      <<: *benchmarks
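Both new configs run the same `acc` benchmark: `vllm-ascend/textvqa-lite` driven through `vllm_api_stream_chat`, i.e. streaming chat-completion requests carrying base64-encoded images (`textvqa_gen_base64`) against the OpenAI-compatible server launched with `server_cmd`. A minimal sketch of one such request is below; the base URL, image file, and prompt are illustrative assumptions, not part of the harness.

```python
# Rough sketch of a single benchmark-style request, assuming the server from
# `server_cmd` is reachable on localhost:8000 (the real port comes from
# SERVER_PORT). The image path and prompt are hypothetical placeholders.
import base64

from openai import OpenAI  # pip install openai

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("example.jpg", "rb") as f:  # hypothetical local image
    image_b64 = base64.b64encode(f.read()).decode()

stream = client.chat.completions.create(
    model="Eco-Tech/Qwen3-VL-32B-Instruct-w8a8-QuaRot",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            {"type": "text", "text": "What text is written on the sign?"},
        ],
    }],
    stream=True,
    max_tokens=2048,   # mirrors max_out_len
    temperature=0,     # greedy decoding, as in the benchmark
    top_p=1,
    extra_body={"top_k": -1, "repetition_penalty": 1.0},
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```

The greedy sampling settings (`temperature: 0`, `top_p: 1`, `top_k: -1`) keep the accuracy run deterministic, so the measured score can be checked against `baseline: 80` within the allowed `threshold: 5` points.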