diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 95a2bdf2..94456091 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -209,13 +209,13 @@ jobs:
           #pytest -sv --durations=0 tests/e2e/multicard/test_ilama_lora_tp2.py

           # To avoid oom, we need to run the test in a single process.
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_fc2_for_qwen3_moe
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_w4a8_dynamic_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_moe_sp_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_moe_fc2_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy_tp2

           pytest -sv --durations=0 tests/e2e/multicard/test_prefix_caching.py
           pytest -sv --durations=0 tests/e2e/multicard/test_pipeline_parallel.py
@@ -288,8 +288,8 @@ jobs:
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Kimi_K2_Thinking_W4A16
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_kimi_k2_thinking_w4a16_tp4
           pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel_tp2.py
           pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_basic.py
           pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_accuracy.py
diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py
index a5e3b846..847d3aaa 100644
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -134,7 +134,7 @@ def _run_worker_process(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [4, 36])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
-def test_models_aclgraph_capture_replay_metrics_dp2(
+def test_aclgraph_capture_replay_metrics_dp2(
     model: str,
     max_tokens: int,
     monkeypatch: pytest.MonkeyPatch,
diff --git a/tests/e2e/multicard/test_data_parallel.py b/tests/e2e/multicard/test_data_parallel.py
index cb3c6048..6edeeb37 100644
--- a/tests/e2e/multicard/test_data_parallel.py
+++ b/tests/e2e/multicard/test_data_parallel.py
@@ -38,7 +38,7 @@ MODELS = [
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
-def test_qwen_inference_dp2(model, max_tokens):
+def test_qwen3_inference_dp2(model, max_tokens):
     moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
     quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
     script = "examples/offline_data_parallel.py"
diff --git a/tests/e2e/multicard/test_data_parallel_tp2.py b/tests/e2e/multicard/test_data_parallel_tp2.py
index 202eaa9c..03b2d665 100644
--- a/tests/e2e/multicard/test_data_parallel_tp2.py
+++ b/tests/e2e/multicard/test_data_parallel_tp2.py
@@ -15,7 +15,7 @@ MODELS = ["Qwen/Qwen3-0.6B"]
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
-def test_qwen_inference_dp2_tp2(model, max_tokens):
+def test_qwen3_inference_dp2_tp2(model, max_tokens):
     script = "examples/offline_data_parallel.py"
     env = os.environ.copy()

diff --git a/tests/e2e/multicard/test_external_launcher.py b/tests/e2e/multicard/test_external_launcher.py
index 4a4a17ec..3c2d7521 100644
--- a/tests/e2e/multicard/test_external_launcher.py
+++ b/tests/e2e/multicard/test_external_launcher.py
@@ -37,7 +37,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]

 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
-def test_qwen_external_launcher(model):
+def test_qwen3_external_launcher(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -78,7 +78,7 @@


 @pytest.mark.parametrize("model", MOE_MODELS)
-def test_qwen_moe_external_launcher_ep(model):
+def test_qwen3_moe_external_launcher_ep_tp2(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -109,7 +109,7 @@


 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_qwen_external_launcher_with_sleepmode():
+def test_qwen3_external_launcher_with_sleepmode():
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -154,7 +154,7 @@


 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_qwen_external_launcher_with_sleepmode_level2():
+def test_qwen3_external_launcher_with_sleepmode_level2():
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -210,7 +210,7 @@
         "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
         "HCCL_BUFFSIZE": "500"
     })
-def test_qwen_external_launcher_with_matmul_allreduce(model):
+def test_qwen3_external_launcher_with_matmul_allreduce(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
diff --git a/tests/e2e/multicard/test_full_graph_mode.py b/tests/e2e/multicard/test_full_graph_mode.py
index c788e9da..362f8be7 100644
--- a/tests/e2e/multicard/test_full_graph_mode.py
+++ b/tests/e2e/multicard/test_full_graph_mode.py
@@ -29,7 +29,7 @@ from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal


-def test_qwen_moe_with_full_decode_only():
+def test_qwen3_moe_full_decode_only_tp2():
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
     prompts = [
@@ -75,7 +75,7 @@
     )


-def test_qwen_moe_with_full():
+def test_qwen3_moe_full_graph_tp2():
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
     prompts = [
diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
index 4fa111ce..8404fe7c 100644
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -41,7 +41,7 @@ from tests.e2e.conftest import VllmRunner
         "TASK_QUEUE_ENABLE": "1",
         "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
     })
-def test_deepseek_moe_fused_allgather_ep():
+def test_deepseek_v3_moe_fused_allgather_ep_tp2():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

@@ -62,7 +62,7 @@
         "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
         "TASK_QUEUE_ENABLE": "1"
     })
-def test_deepseek_moe_fused_alltoall_ep():
+def test_deepseek_v3_moe_fused_alltoall_ep_tp2():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index a1e24ecf..558067ce 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -34,7 +34,7 @@ os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

 QWEN_DENSE_MODELS = [
-    "vllm-ascend/Qwen3-8B-W8A8",
+    "vllm-ascend/Qwen3-0.6B-W8A8",
 ]

 QWEN_W4A8_MODELS = [
@@ -50,7 +50,7 @@ KIMI_W4A16_MODELS = [
 ]


-def test_models_distributed_DeepSeek_multistream_moe():
+def test_deepseek_multistream_moe_tp2():
     example_prompts = [
         "Hello, my name is",
     ]
@@ -70,7 +70,7 @@


 @pytest.mark.parametrize("model", QWEN_W4A8_MODELS)
-def test_models_distributed_Qwen3_W4A8DYNAMIC(model):
+def test_qwen3_w4a8_dynamic_tp2(model):
     prompts = [
         "Hello, my name is",
     ]
@@ -85,7 +85,7 @@
         vllm_model.generate_greedy(prompts, max_tokens)


-def test_sp_for_qwen3_moe() -> None:
+def test_qwen3_moe_sp_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
@@ -108,7 +108,7 @@

 @pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
-def test_deepseek_w4a8_accuracy(model):
+def test_deepseek_w4a8_accuracy_tp2(model):
     prompts = [
         "Hello, my name is", "The president of the United States is",
         "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
@@ -140,7 +140,7 @@

 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"})
-def test_fc2_for_qwen3_moe() -> None:
+def test_qwen3_moe_fc2_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
@@ -159,7 +159,7 @@


 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
-def test_models_distributed_deepseek_v2_lite_with_flashcomm_v1() -> None:
+def test_deepseek_v2_lite_fc1_tp2() -> None:
     example_prompts = [
         "test" * 1001,
     ]
@@ -180,7 +180,7 @@
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
-def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model):
+def test_qwen3_dense_fc1_tp2(model):
     example_prompts = [
         "Hello, my name is",
     ]
@@ -200,7 +200,7 @@
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
-def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(model):
+def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
     example_prompts = [
         "Hello, my name is",
     ]
@@ -218,7 +218,7 @@


 @pytest.mark.parametrize("model", KIMI_W4A16_MODELS)
-def test_models_distributed_Kimi_K2_Thinking_W4A16(model):
+def test_kimi_k2_thinking_w4a16_tp4(model):
     example_prompts = [
         "Hello, my name is",
     ]
diff --git a/tests/e2e/multicard/test_offline_weight_load.py b/tests/e2e/multicard/test_offline_weight_load.py
index dd0ac01f..95deebe8 100644
--- a/tests/e2e/multicard/test_offline_weight_load.py
+++ b/tests/e2e/multicard/test_offline_weight_load.py
@@ -31,7 +31,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]

 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_qwen_offline_weight_load_and_sleepmode(model):
+def test_qwen3_offline_load_and_sleepmode_tp2(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py
index 5153ca24..855724ea 100644
--- a/tests/e2e/multicard/test_pipeline_parallel.py
+++ b/tests/e2e/multicard/test_pipeline_parallel.py
@@ -44,4 +44,4 @@ def test_models_pp2(model: str, tp_size: int, pp_size: int,
             pipeline_parallel_size=pp_size,
             distributed_executor_backend=distributed_executor_backend,
             gpu_memory_utilization=0.7) as vllm_model:
-        vllm_model.generate_greedy(prompts, 64)
\ No newline at end of file
+        vllm_model.generate_greedy(prompts, 64)