From 7e85f2ff9793df75350622ed0c631de01fbfd448 Mon Sep 17 00:00:00 2001
From: pppeng <60355449+ppppeng@users.noreply.github.com>
Date: Sun, 15 Mar 2026 22:19:02 +0800
Subject: [PATCH] [CI] Add test_qwen3_5.py (#7133)

### What this PR does / why we need it?
Add test_qwen3_5.py to cover the base TP4 scenarios on Qwen3.5-27B and
Qwen3.5-35B-A3B.

- vLLM version: main
- vLLM main:
  https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d

---------

Signed-off-by: pppeng
Co-authored-by: Mengqing Cao
---
 .github/workflows/scripts/config.yaml       |  2 +
 tests/e2e/multicard/4-cards/test_qwen3_5.py | 75 +++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 tests/e2e/multicard/4-cards/test_qwen3_5.py

diff --git a/.github/workflows/scripts/config.yaml b/.github/workflows/scripts/config.yaml
index e3e80e4a..6b7bd2a8 100644
--- a/.github/workflows/scripts/config.yaml
+++ b/.github/workflows/scripts/config.yaml
@@ -143,6 +143,8 @@ e2e-multicard-4-cards:
   # TODO: recover skipped tests
   - name: tests/e2e/multicard/4-cards/test_qwen3_next.py
     estimated_time: 1250
+  - name: tests/e2e/multicard/4-cards/test_qwen3_5.py
+    estimated_time: 60
   - name: tests/e2e/multicard/4-cards/test_data_parallel_tp2.py
     estimated_time: 60
   - name: tests/e2e/multicard/4-cards/test_kimi_k2.py
diff --git a/tests/e2e/multicard/4-cards/test_qwen3_5.py b/tests/e2e/multicard/4-cards/test_qwen3_5.py
new file mode 100644
index 00000000..475086a0
--- /dev/null
+++ b/tests/e2e/multicard/4-cards/test_qwen3_5.py
@@ -0,0 +1,75 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
+#
+from tests.e2e.conftest import VllmRunner
+
+
+def test_qwen3_5_27b_distributed_mp_tp4():
+    example_prompts = [
+        "Hello, my name is",
+    ] * 4
+    max_tokens = 5
+    with VllmRunner("Qwen/Qwen3.5-27B",
+                    tensor_parallel_size=4,
+                    cudagraph_capture_sizes=[1, 2, 4, 8],
+                    max_model_len=4096,
+                    gpu_memory_utilization=0.90,
+                    distributed_executor_backend="mp") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
+
+
+def test_qwen3_5_35b_distributed_mp_tp4():
+    example_prompts = [
+        "Hello, my name is",
+    ] * 4
+    max_tokens = 5
+    with VllmRunner("Qwen/Qwen3.5-35B-A3B",
+                    tensor_parallel_size=4,
+                    cudagraph_capture_sizes=[1, 2, 4, 8],
+                    max_model_len=4096,
+                    gpu_memory_utilization=0.90,
+                    distributed_executor_backend="mp") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
+
+
+def test_qwen3_5_35b_distributed_mp_tp4_full_decode_only_mtp3():
+    example_prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    max_tokens = 20
+    with VllmRunner("Qwen/Qwen3.5-35B-A3B",
+                    tensor_parallel_size=4,
+                    max_model_len=4096,
+                    gpu_memory_utilization=0.90,
+                    distributed_executor_backend="mp",
+                    compilation_config={
+                        "cudagraph_mode": "FULL_DECODE_ONLY",
+                        "cudagraph_capture_sizes": [4, 8, 12, 16],
+                    },
+                    speculative_config={
+                        "method": "qwen3_5_mtp",
+                        "num_speculative_tokens": 3,
+                    }) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
\ No newline at end of file
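
The first two smoke tests above share an identical body and differ only in the model checkpoint. A parametrized sketch of the same coverage (not part of the patch; `pytest.mark.parametrize` is standard pytest, and every `VllmRunner` argument is copied from the tests above) could look like:

```python
import pytest

from tests.e2e.conftest import VllmRunner


# Hypothetical deduplication of the two TP4 smoke tests above; only the
# model checkpoint varies, all other runner arguments mirror the patch.
@pytest.mark.parametrize("model",
                         ["Qwen/Qwen3.5-27B", "Qwen/Qwen3.5-35B-A3B"])
def test_qwen3_5_distributed_mp_tp4(model):
    example_prompts = ["Hello, my name is"] * 4
    with VllmRunner(model,
                    tensor_parallel_size=4,
                    cudagraph_capture_sizes=[1, 2, 4, 8],
                    max_model_len=4096,
                    gpu_memory_utilization=0.90,
                    distributed_executor_backend="mp") as vllm_model:
        # 5 greedy tokens per prompt is enough to exercise end-to-end decode.
        vllm_model.generate_greedy(example_prompts, 5)
```

Note that all three tests generate only a handful of greedy tokens, so they serve as bring-up smoke tests for TP4 execution and graph capture rather than accuracy checks; the third test additionally exercises the FULL_DECODE_ONLY graph mode together with 3-token qwen3_5_mtp speculative decoding.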