From 1cc225711d04aa9ee6a499b7965b8193646e760f Mon Sep 17 00:00:00 2001
From: pu-zhe
Date: Sat, 7 Feb 2026 09:28:37 +0800
Subject: [PATCH] [Refactor] 310p_e2e test case update (#6539)

### What this PR does / why we need it?
This pull request reorganizes the 310P end-to-end test suite and adds new test cases for Qwen3 dense and MoE models. The goal is to exercise these models under diverse operational conditions: single-card and multi-card deployments, tensor and expert parallelism, float16, and W8A8 ("ascend") quantization. The previous offline-inference tests are split into dedicated `singlecard/` and `multicard/` suites, and the CI workflow is updated to run the new files. The patch also adds a `routed_input_transform` hook to `AscendSharedFusedMoE310` and switches the 310P worker to `jit_compile=False`.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
E2E test

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

---------

Signed-off-by: pu-zhe
---
 .github/workflows/_e2e_test.yaml              |  6 +-
 .../multicard/test_dense_model_multicard.py   | 46 +++++++++++
 .../test_moe_model_multicard.py}              | 82 +++++++++++--------
 .../singlecard/test_dense_model_singlecard.py | 46 +++++++++++
 tests/e2e/310p/test_offline_inference_310p.py | 78 ------------------
 .../310p/test_offline_inference_w8a8_310p.py  | 22 -----
 vllm_ascend/_310p/fused_moe/fused_moe.py      |  2 +
 vllm_ascend/_310p/worker_310p.py              |  4 +-
 8 files changed, 144 insertions(+), 142 deletions(-)
 create mode 100644 tests/e2e/310p/multicard/test_dense_model_multicard.py
 rename tests/e2e/310p/{test_offline_inference_parallel_310p.py => multicard/test_moe_model_multicard.py} (52%)
 create mode 100644 tests/e2e/310p/singlecard/test_dense_model_singlecard.py
 delete mode 100644 tests/e2e/310p/test_offline_inference_310p.py
 delete mode 100644 tests/e2e/310p/test_offline_inference_w8a8_310p.py

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 758b35ef..75849a56 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -403,7 +403,7 @@ jobs:
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
-          pytest -sv --durations=0 tests/e2e/310p/test_offline_inference_310p.py
+          pytest -sv --durations=0 tests/e2e/310p/singlecard/test_dense_model_singlecard.py
 
   e2e_310p-4cards:
     name: 310p multicards 4cards
@@ -462,5 +462,5 @@
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
           pytest -sv --durations=0 \
-            tests/e2e/310p/test_offline_inference_parallel_310p.py \
-            tests/e2e/310p/test_offline_inference_w8a8_310p.py
+            tests/e2e/310p/multicard/test_dense_model_multicard.py \
+            tests/e2e/310p/multicard/test_moe_model_multicard.py
diff --git a/tests/e2e/310p/multicard/test_dense_model_multicard.py b/tests/e2e/310p/multicard/test_dense_model_multicard.py
new file mode 100644
index 00000000..e964c48f
--- /dev/null
+++ b/tests/e2e/310p/multicard/test_dense_model_multicard.py
@@ -0,0 +1,46 @@
+#
+# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+
+from tests.e2e.conftest import VllmRunner
+
+
+def test_qwen3_dense_tp2_fp16():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner(
+        "Qwen/Qwen3-8B",
+        tensor_parallel_size=2,
+        enforce_eager=True,
+        dtype="float16"
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+def test_qwen3_dense_tp4_w8a8():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner(
+        "vllm-ascend/Qwen3-32B-W8A8",
+        tensor_parallel_size=4,
+        enforce_eager=True,
+        dtype="float16",
+        quantization="ascend"
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/e2e/310p/test_offline_inference_parallel_310p.py b/tests/e2e/310p/multicard/test_moe_model_multicard.py
similarity index 52%
rename from tests/e2e/310p/test_offline_inference_parallel_310p.py
rename to tests/e2e/310p/multicard/test_moe_model_multicard.py
index 2a796ad5..40dabf25 100644
--- a/tests/e2e/310p/test_offline_inference_parallel_310p.py
+++ b/tests/e2e/310p/multicard/test_moe_model_multicard.py
@@ -1,36 +1,46 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-
-import pytest
-
-from tests.e2e.conftest import VllmRunner
-
-
-@pytest.mark.parametrize("dtype", ["float16"])
-@pytest.mark.parametrize("max_tokens", [5])
-def test_models(dtype: str, max_tokens: int) -> None:
-    example_prompts = [
-        "Hello, my name is",
-        "The future of AI is",
-    ]
-
-    with VllmRunner("Qwen/Qwen3-0.6B",
-                    tensor_parallel_size=4,
-                    dtype=dtype,
-                    max_model_len=2048,
-                    enforce_eager=True) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
+#
+# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+
+from tests.e2e.conftest import VllmRunner
+
+
+def test_qwen3_moe_tp4_fp16():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner(
+        "Qwen/Qwen3-30B-A3B",
+        tensor_parallel_size=4,
+        enforce_eager=True,
+        dtype="float16"
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+def test_qwen3_moe_ep4_fp16():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner(
+        "Qwen/Qwen3-30B-A3B",
+        tensor_parallel_size=4,
+        enforce_eager=True,
+        dtype="float16",
+        enable_expert_parallel=True
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/e2e/310p/singlecard/test_dense_model_singlecard.py b/tests/e2e/310p/singlecard/test_dense_model_singlecard.py
new file mode 100644
index 00000000..a557f577
--- /dev/null
+++ b/tests/e2e/310p/singlecard/test_dense_model_singlecard.py
@@ -0,0 +1,46 @@
+#
+# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+
+from tests.e2e.conftest import VllmRunner
+
+
+def test_qwen3_dense_tp1_fp16():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner(
+        "Qwen/Qwen3-8B",
+        tensor_parallel_size=1,
+        enforce_eager=True,
+        dtype="float16"
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+def test_qwen3_dense_tp1_w8a8():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner(
+        "vllm-ascend/Qwen3-8B-W8A8",
+        tensor_parallel_size=1,
+        enforce_eager=True,
+        dtype="float16",
+        quantization="ascend"
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/e2e/310p/test_offline_inference_310p.py b/tests/e2e/310p/test_offline_inference_310p.py
deleted file mode 100644
index e62b8026..00000000
--- a/tests/e2e/310p/test_offline_inference_310p.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-import pytest
-from vllm.assets.image import ImageAsset
-
-from tests.e2e.conftest import VllmRunner
-
-
-@pytest.mark.parametrize("dtype", ["float16"])
-@pytest.mark.parametrize("max_tokens", [5])
-def test_llm_models(dtype: str, max_tokens: int) -> None:
-    example_prompts = [
-        "Hello, my name is",
-        "The future of AI is",
-    ]
-
-    with VllmRunner("Qwen/Qwen3-0.6B",
-                    tensor_parallel_size=1,
-                    dtype=dtype,
-                    max_model_len=2048,
-                    enforce_eager=True) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
-@pytest.mark.skip(reason="310P: multimodal test skipped, offline is ok")
-@pytest.mark.parametrize("dtype", ["float16"])
-def test_multimodal_vl(dtype: str):
-    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-
-    img_questions = [
-        "What is the content of this image?",
-        "Describe the content of this image in detail.",
-        "What's in the image?",
-        "Where is this image taken?",
-    ]
-
-    images = [image] * len(img_questions)
-    placeholder = "<|image_pad|>"
-    prompts = [
-        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
-         f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in img_questions
-    ]
-
-    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
-                    mm_processor_kwargs={
-                        "min_pixels": 28 * 28,
-                        "max_pixels": 1280 * 28 * 28,
-                        "fps": 1,
-                    },
-                    dtype=dtype,
-                    max_model_len=8192,
-                    enforce_eager=True,
-                    limit_mm_per_prompt={"image": 1}) as vllm_model:
-        outputs = vllm_model.generate_greedy(
-            prompts=prompts,
-            images=images,
-            max_tokens=64,
-        )
-
-    assert len(outputs) == len(prompts)
-
-    for _, output_str in outputs:
-        assert output_str, "Generated output should not be empty."
diff --git a/tests/e2e/310p/test_offline_inference_w8a8_310p.py b/tests/e2e/310p/test_offline_inference_w8a8_310p.py
deleted file mode 100644
index 84b3eb49..00000000
--- a/tests/e2e/310p/test_offline_inference_w8a8_310p.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import pytest
-
-from tests.e2e.conftest import VllmRunner
-
-
-@pytest.mark.parametrize("dtype", ["float16"])
-@pytest.mark.parametrize("max_tokens", [5])
-def test_qwen3_w8a8_e2e_310p(dtype: str, max_tokens: int) -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-    ]
-
-    with VllmRunner(
-        "vllm-ascend/Qwen3-32B-W8A8",
-        tensor_parallel_size=4,
-        dtype=dtype,
-        max_model_len=8192,
-        enforce_eager=True,
-        quantization="ascend",
-        enable_prefix_caching=False,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py
index 5cca5036..f54ac604 100644
--- a/vllm_ascend/_310p/fused_moe/fused_moe.py
+++ b/vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -251,9 +251,11 @@ class AscendSharedFusedMoE310(SharedFusedMoE, AscendFusedMoE310):
         shared_experts: torch.nn.Module,
         gate: torch.nn.Module | None = None,
         use_overlapped: bool = True,
+        routed_input_transform: torch.nn.Module | None = None,
         **kwargs,
     ):
         AscendFusedMoE310.__init__(self, **kwargs)
+        self._routed_input_transform = routed_input_transform
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
         self.shared_expert_stream = None
diff --git a/vllm_ascend/_310p/worker_310p.py b/vllm_ascend/_310p/worker_310p.py
index 8ced752b..bb0fa28d 100644
--- a/vllm_ascend/_310p/worker_310p.py
+++ b/vllm_ascend/_310p/worker_310p.py
@@ -25,9 +25,7 @@ from vllm_ascend.worker.worker import NPUWorker, init_workspace_manager
 
 class NPUWorker310(NPUWorker):
 
     def init_device(self):
         self.device = self._init_device()
-
-        # TODO: There is accuracy issue when jit_compile is disabled currently.
-        torch_npu.npu.set_compile_mode(jit_compile=True)
+        torch_npu.npu.set_compile_mode(jit_compile=False)
         init_workspace_manager(self.device, num_ubatches=1)
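
Reviewer note: every new 310P case follows the same pattern -- build a VllmRunner with the target
parallelism/dtype/quantization configuration and greedy-decode a short prompt as a smoke check.
Below is a minimal standalone sketch of that pattern, assuming a 310P host with enough NPUs and the
VllmRunner helper from tests/e2e/conftest.py; the "run_smoke" name is ours for illustration and is
not part of this patch:

    # Sketch only: mirrors the shape of the new e2e cases. "run_smoke" is a
    # hypothetical helper, not part of this patch; VllmRunner is the existing
    # helper from tests/e2e/conftest.py, and all arguments appear in the patch.
    from tests.e2e.conftest import VllmRunner

    def run_smoke(model: str, tp_size: int, **extra) -> None:
        example_prompts = ["Hello, my name is"]
        max_tokens = 5
        with VllmRunner(model,
                        tensor_parallel_size=tp_size,
                        enforce_eager=True,
                        dtype="float16",
                        **extra) as vllm_model:
            # Greedy decoding keeps the smoke check deterministic across runs.
            vllm_model.generate_greedy(example_prompts, max_tokens)

    # The matrix covered by this patch, expressed through the helper:
    run_smoke("Qwen/Qwen3-8B", tp_size=1)  # singlecard dense, fp16
    run_smoke("vllm-ascend/Qwen3-8B-W8A8", tp_size=1, quantization="ascend")  # singlecard W8A8
    run_smoke("Qwen/Qwen3-8B", tp_size=2)  # multicard dense, TP2
    run_smoke("Qwen/Qwen3-30B-A3B", tp_size=4)  # multicard MoE, TP4
    run_smoke("Qwen/Qwen3-30B-A3B", tp_size=4, enable_expert_parallel=True)  # MoE with EP

The suites are wired into CI by the workflow change above; locally they can be run the same way,
e.g. pytest -sv --durations=0 tests/e2e/310p/singlecard/test_dense_model_singlecard.py.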