diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 758b35ef..75849a56 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -403,7 +403,7 @@ jobs: PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 VLLM_WORKER_MULTIPROC_METHOD: spawn run: | - pytest -sv --durations=0 tests/e2e/310p/test_offline_inference_310p.py + pytest -sv --durations=0 tests/e2e/310p/singlecard/test_dense_model_singlecard.py e2e_310p-4cards: name: 310p multicards 4cards @@ -462,5 +462,5 @@ jobs: VLLM_WORKER_MULTIPROC_METHOD: spawn run: | pytest -sv --durations=0 \ - tests/e2e/310p/test_offline_inference_parallel_310p.py \ - tests/e2e/310p/test_offline_inference_w8a8_310p.py + tests/e2e/310p/multicard/test_dense_model_multicard.py \ + tests/e2e/310p/multicard/test_moe_model_multicard.py diff --git a/tests/e2e/310p/multicard/test_dense_model_multicard.py b/tests/e2e/310p/multicard/test_dense_model_multicard.py new file mode 100644 index 00000000..e964c48f --- /dev/null +++ b/tests/e2e/310p/multicard/test_dense_model_multicard.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +from tests.e2e.conftest import VllmRunner + + +def test_qwen3_dense_tp2_fp16(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + "Qwen/Qwen3-8B", + tensor_parallel_size=2, + enforce_eager=True, + dtype="float16" + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + +def test_qwen3_dense_tp4_w8a8(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + "vllm-ascend/Qwen3-32B-W8A8", + tensor_parallel_size=4, + enforce_eager=True, + dtype="float16", + quantization="ascend" + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/310p/test_offline_inference_parallel_310p.py b/tests/e2e/310p/multicard/test_moe_model_multicard.py similarity index 52% rename from tests/e2e/310p/test_offline_inference_parallel_310p.py rename to tests/e2e/310p/multicard/test_moe_model_multicard.py index 2a796ad5..40dabf25 100644 --- a/tests/e2e/310p/test_offline_inference_parallel_310p.py +++ b/tests/e2e/310p/multicard/test_moe_model_multicard.py @@ -1,36 +1,46 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. - -import pytest - -from tests.e2e.conftest import VllmRunner - - -@pytest.mark.parametrize("dtype", ["float16"]) -@pytest.mark.parametrize("max_tokens", [5]) -def test_models(dtype: str, max_tokens: int) -> None: - example_prompts = [ - "Hello, my name is", - "The future of AI is", - ] - - with VllmRunner("Qwen/Qwen3-0.6B", - tensor_parallel_size=4, - dtype=dtype, - max_model_len=2048, - enforce_eager=True) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) +# +# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +from tests.e2e.conftest import VllmRunner + + +def test_qwen3_moe_tp4_fp16(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + "Qwen/Qwen3-30B-A3B", + tensor_parallel_size=4, + enforce_eager=True, + dtype="float16" + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + +def test_qwen3_moe_ep4_fp16(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + "Qwen/Qwen3-30B-A3B", + tensor_parallel_size=4, + enforce_eager=True, + dtype="float16", + enable_expert_parallel=True + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/310p/singlecard/test_dense_model_singlecard.py b/tests/e2e/310p/singlecard/test_dense_model_singlecard.py new file mode 100644 index 00000000..a557f577 --- /dev/null +++ b/tests/e2e/310p/singlecard/test_dense_model_singlecard.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +from tests.e2e.conftest import VllmRunner + + +def test_qwen3_dense_tp1_fp16(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + "Qwen/Qwen3-8B", + tensor_parallel_size=1, + enforce_eager=True, + dtype="float16" + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + +def test_qwen3_dense_tp1_w8a8(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + "vllm-ascend/Qwen3-8B-W8A8", + tensor_parallel_size=1, + enforce_eager=True, + dtype="float16", + quantization="ascend" + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/310p/test_offline_inference_310p.py b/tests/e2e/310p/test_offline_inference_310p.py deleted file mode 100644 index e62b8026..00000000 --- a/tests/e2e/310p/test_offline_inference_310p.py +++ /dev/null @@ -1,78 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -import pytest -from vllm.assets.image import ImageAsset - -from tests.e2e.conftest import VllmRunner - - -@pytest.mark.parametrize("dtype", ["float16"]) -@pytest.mark.parametrize("max_tokens", [5]) -def test_llm_models(dtype: str, max_tokens: int) -> None: - example_prompts = [ - "Hello, my name is", - "The future of AI is", - ] - - with VllmRunner("Qwen/Qwen3-0.6B", - tensor_parallel_size=1, - dtype=dtype, - max_model_len=2048, - enforce_eager=True) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - -@pytest.mark.skip(reason="310P: multimodal test skipped, offline is ok") -@pytest.mark.parametrize("dtype", ["float16"]) -def test_multimodal_vl(dtype: str): - image = ImageAsset("cherry_blossom").pil_image.convert("RGB") - - img_questions = [ - "What is the content of this image?", - "Describe the content of this image in detail.", - "What's in the image?", - "Where is this image taken?", - ] - - images = [image] * len(img_questions) - placeholder = "<|image_pad|>" - prompts = [ - ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" - f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in img_questions - ] - - with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - "fps": 1, - }, - dtype=dtype, - max_model_len=8192, - enforce_eager=True, - limit_mm_per_prompt={"image": 1}) as vllm_model: - outputs = vllm_model.generate_greedy( - prompts=prompts, - images=images, - max_tokens=64, - ) - - assert len(outputs) == len(prompts) - - for _, output_str in outputs: - assert output_str, "Generated output should not be empty." diff --git a/tests/e2e/310p/test_offline_inference_w8a8_310p.py b/tests/e2e/310p/test_offline_inference_w8a8_310p.py deleted file mode 100644 index 84b3eb49..00000000 --- a/tests/e2e/310p/test_offline_inference_w8a8_310p.py +++ /dev/null @@ -1,22 +0,0 @@ -import pytest - -from tests.e2e.conftest import VllmRunner - - -@pytest.mark.parametrize("dtype", ["float16"]) -@pytest.mark.parametrize("max_tokens", [5]) -def test_qwen3_w8a8_e2e_310p(dtype: str, max_tokens: int) -> None: - example_prompts = [ - "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", - ] - - with VllmRunner( - "vllm-ascend/Qwen3-32B-W8A8", - tensor_parallel_size=4, - dtype=dtype, - max_model_len=8192, - enforce_eager=True, - quantization="ascend", - enable_prefix_caching=False, - ) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py index 5cca5036..f54ac604 100644 --- a/vllm_ascend/_310p/fused_moe/fused_moe.py +++ b/vllm_ascend/_310p/fused_moe/fused_moe.py @@ -251,9 +251,11 @@ class AscendSharedFusedMoE310(SharedFusedMoE, AscendFusedMoE310): shared_experts: torch.nn.Module, gate: torch.nn.Module | None = None, use_overlapped: bool = True, + routed_input_transform: torch.nn.Module | None = None, **kwargs, ): AscendFusedMoE310.__init__(self, **kwargs) + self._routed_input_transform = routed_input_transform self._shared_experts = shared_experts self.use_overlapped = use_overlapped self.shared_expert_stream = None diff --git a/vllm_ascend/_310p/worker_310p.py b/vllm_ascend/_310p/worker_310p.py index 8ced752b..bb0fa28d 100644 --- a/vllm_ascend/_310p/worker_310p.py +++ b/vllm_ascend/_310p/worker_310p.py @@ -25,9 +25,7 @@ from vllm_ascend.worker.worker import NPUWorker, init_workspace_manager class NPUWorker310(NPUWorker): def init_device(self): self.device = self._init_device() - - # TODO: There is accuracy issue when jit_compile is disabled currently. - torch_npu.npu.set_compile_mode(jit_compile=True) + torch_npu.npu.set_compile_mode(jit_compile=False) init_workspace_manager(self.device, num_ubatches=1)