diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 9af889eb..cda75bae 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -189,11 +189,8 @@ jobs:
           #pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
           # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W8A8
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC_new_version
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC_old_version
+          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_fc2_for_qwen3_moe
@@ -272,7 +269,6 @@ jobs:
           # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
           # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_W8A8_WITH_EP
           pytest -sv tests/e2e/multicard/test_data_parallel_tp2.py
-
       - name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
        shell: bash -l {0}
        run: |
diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py
index 4375e825..e81b5615 100644
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -134,7 +134,7 @@ def _run_worker_process(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [4, 36])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
-def test_aclgraph_capture_replay_dp2(
+def test_aclgraph_capture_replay_metrics_dp2(
     model: str,
     max_tokens: int,
     monkeypatch: pytest.MonkeyPatch,
diff --git a/tests/e2e/multicard/test_data_parallel.py b/tests/e2e/multicard/test_data_parallel.py
index e6959b02..cb3c6048 100644
--- a/tests/e2e/multicard/test_data_parallel.py
+++ b/tests/e2e/multicard/test_data_parallel.py
@@ -38,7 +38,7 @@ MODELS = [
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
-def test_data_parallel_inference(model, max_tokens):
+def test_qwen_inference_dp2(model, max_tokens):
     moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
     quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
     script = "examples/offline_data_parallel.py"
diff --git a/tests/e2e/multicard/test_data_parallel_tp2.py b/tests/e2e/multicard/test_data_parallel_tp2.py
index 6b0bdabe..202eaa9c 100644
--- a/tests/e2e/multicard/test_data_parallel_tp2.py
+++ b/tests/e2e/multicard/test_data_parallel_tp2.py
@@ -15,7 +15,7 @@ MODELS = ["Qwen/Qwen3-0.6B"]
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
-def test_data_parallel_inference(model, max_tokens):
+def test_qwen_inference_dp2_tp2(model, max_tokens):
     script = "examples/offline_data_parallel.py"
     env = os.environ.copy()
diff --git a/tests/e2e/multicard/test_expert_parallel.py b/tests/e2e/multicard/test_expert_parallel.py
index b8f03d5f..762ca6d2 100644
--- a/tests/e2e/multicard/test_expert_parallel.py
+++ b/tests/e2e/multicard/test_expert_parallel.py
@@ -5,7 +5,7 @@ from tests.e2e.model_utils import check_outputs_equal
 
 
 @pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
-def test_e2e_ep_correctness(model_name):
+def test_deepseek_correctness_ep(model_name):
     example_prompts = [
         "Hello, my name is",
         "The president of the United States is",
diff --git a/tests/e2e/multicard/test_external_launcher.py b/tests/e2e/multicard/test_external_launcher.py
index ece35def..4a4a17ec 100644
--- a/tests/e2e/multicard/test_external_launcher.py
+++ b/tests/e2e/multicard/test_external_launcher.py
@@ -37,7 +37,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
-def test_external_launcher(model):
+def test_qwen_external_launcher(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -78,7 +78,7 @@ def test_external_launcher(model):
 
 
 @pytest.mark.parametrize("model", MOE_MODELS)
-def test_moe_external_launcher(model):
+def test_qwen_moe_external_launcher_ep(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -109,7 +109,7 @@ def test_moe_external_launcher(model):
 
 
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_external_launcher_and_sleepmode():
+def test_qwen_external_launcher_with_sleepmode():
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -154,7 +154,7 @@ def test_external_launcher_and_sleepmode():
 
 
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_external_launcher_and_sleepmode_level2():
+def test_qwen_external_launcher_with_sleepmode_level2():
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -210,7 +210,7 @@ def test_external_launcher_and_sleepmode_level2():
         "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
         "HCCL_BUFFSIZE": "500"
     })
-def test_mm_allreduce(model):
+def test_qwen_external_launcher_with_matmul_allreduce(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
diff --git a/tests/e2e/multicard/test_full_graph_mode.py b/tests/e2e/multicard/test_full_graph_mode.py
index 3ccbf823..c788e9da 100644
--- a/tests/e2e/multicard/test_full_graph_mode.py
+++ b/tests/e2e/multicard/test_full_graph_mode.py
@@ -29,7 +29,7 @@ from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal
 
 
-def test_models_distributed_Qwen3_MOE_TP2_WITH_FULL_DECODE_ONLY():
+def test_qwen_moe_with_full_decode_only():
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
     prompts = [
@@ -75,7 +75,7 @@ def test_models_distributed_Qwen3_MOE_TP2_WITH_FULL_DECODE_ONLY():
     )
 
 
-def test_models_distributed_Qwen3_MOE_TP2_WITH_FULL():
+def test_qwen_moe_with_full():
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
     prompts = [
diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
index 85d246e5..4fa111ce 100644
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -41,7 +41,7 @@ from tests.e2e.conftest import VllmRunner
         "TASK_QUEUE_ENABLE": "1",
         "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
     })
-def test_generate_with_allgather():
+def test_deepseek_moe_fused_allgather_ep():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 
@@ -62,7 +62,7 @@ def test_generate_with_allgather():
         "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
         "TASK_QUEUE_ENABLE": "1"
     })
-def test_generate_with_alltoall():
+def test_deepseek_moe_fused_alltoall_ep():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index a13276bb..f4cf5a25 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -33,20 +33,15 @@ os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 QWEN_DENSE_MODELS = [
-    "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+    "vllm-ascend/Qwen3-8B-W8A8",
 ]
 
-QWEN_W4A8_OLD_VERSION_MODELS = [
-    "vllm-ascend/Qwen3-8B-W4A8",
-]
-
-QWEN_W4A8_NEW_VERSION_MODELS = [
+QWEN_W4A8_MODELS = [
     "vllm-ascend/Qwen3-1.7B-W4A8-V1",
 ]
 
 DEEPSEEK_W4A8_MODELS = [
-    "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
-    "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
+    "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
 ]
 
 KIMI_W4A16_MODELS = [
@@ -54,22 +49,6 @@ KIMI_W4A16_MODELS = [
 ]
-
-def test_models_distributed_QwQ():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with VllmRunner(
-            "Qwen/QwQ-32B",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-            enforce_eager=False,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
 def test_models_distributed_DeepSeek_multistream_moe():
     example_prompts = [
         "Hello, my name is",
@@ -89,40 +68,8 @@ def test_models_distributed_DeepSeek_multistream_moe():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-def test_models_distributed_Qwen3_W8A8():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-
-    with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-8B-W8A8"),
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
-@pytest.mark.parametrize("model", QWEN_W4A8_OLD_VERSION_MODELS)
-def test_models_distributed_Qwen3_W4A8DYNAMIC_old_version(model):
-    prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            snapshot_download(model),
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(prompts, max_tokens)
-
-
-@pytest.mark.parametrize("model", QWEN_W4A8_NEW_VERSION_MODELS)
-def test_models_distributed_Qwen3_W4A8DYNAMIC_new_version(model):
+@pytest.mark.parametrize("model", QWEN_W4A8_MODELS)
+def test_models_distributed_Qwen3_W4A8DYNAMIC(model):
     prompts = [
         "Hello, my name is",
     ]
diff --git a/tests/e2e/multicard/test_offline_weight_load.py b/tests/e2e/multicard/test_offline_weight_load.py
index 0e9ba95c..dd0ac01f 100644
--- a/tests/e2e/multicard/test_offline_weight_load.py
+++ b/tests/e2e/multicard/test_offline_weight_load.py
@@ -31,7 +31,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]
 
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_offline_weight_load_and_sleepmode(model):
+def test_qwen_offline_weight_load_and_sleepmode(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py
index fa21fe8d..855724ea 100644
--- a/tests/e2e/multicard/test_pipeline_parallel.py
+++ b/tests/e2e/multicard/test_pipeline_parallel.py
@@ -1,47 +1,47 @@
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-import pytest
-
-from tests.e2e.conftest import VllmRunner
-
-MODELS = [
-    "Qwen/Qwen3-0.6B",
-    "deepseek-ai/DeepSeek-V2-Lite-Chat",
-]
-
-TENSOR_PARALLELS = [1]
-PIPELINE_PARALLELS = [2]
-DIST_EXECUTOR_BACKEND = ["mp", "ray"]
-
-prompts = [
-    "Hello, my name is",
-    "The future of AI is",
-]
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
-@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
-@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
-def test_models(model: str, tp_size: int, pp_size: int,
-                distributed_executor_backend: str) -> None:
-    with VllmRunner(model,
-                    tensor_parallel_size=tp_size,
-                    pipeline_parallel_size=pp_size,
-                    distributed_executor_backend=distributed_executor_backend,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_model.generate_greedy(prompts, 64)
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+import pytest
+
+from tests.e2e.conftest import VllmRunner
+
+MODELS = [
+    "Qwen/Qwen3-0.6B",
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
+]
+
+TENSOR_PARALLELS = [1]
+PIPELINE_PARALLELS = [2]
+DIST_EXECUTOR_BACKEND = ["mp", "ray"]
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
+@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
+@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
+def test_models_pp2(model: str, tp_size: int, pp_size: int,
+                    distributed_executor_backend: str) -> None:
+    with VllmRunner(model,
+                    tensor_parallel_size=tp_size,
+                    pipeline_parallel_size=pp_size,
+                    distributed_executor_backend=distributed_executor_backend,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy(prompts, 64)