diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index cda75bae..370504e2 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -197,8 +197,8 @@ jobs:
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight
-          pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
           pytest -sv tests/e2e/multicard/test_prefix_caching.py
+          pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
           pytest -sv tests/e2e/multicard/test_qwen3_moe.py
           pytest -sv tests/e2e/multicard/test_offline_weight_load.py
@@ -266,9 +266,8 @@ jobs:
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Kimi_K2_Thinking_W4A16
-          # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
-          # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_W8A8_WITH_EP
           pytest -sv tests/e2e/multicard/test_data_parallel_tp2.py
+
       - name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
         shell: bash -l {0}
         run: |
diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py
index 855724ea..5153ca24 100644
--- a/tests/e2e/multicard/test_pipeline_parallel.py
+++ b/tests/e2e/multicard/test_pipeline_parallel.py
@@ -44,4 +44,4 @@ def test_models_pp2(model: str, tp_size: int, pp_size: int,
                     pipeline_parallel_size=pp_size,
                     distributed_executor_backend=distributed_executor_backend,
                     gpu_memory_utilization=0.7) as vllm_model:
-        vllm_model.generate_greedy(prompts, 64)
+        vllm_model.generate_greedy(prompts, 64)
\ No newline at end of file
diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py
index 114d5d72..272efc2a 100644
--- a/tests/e2e/multicard/test_prefix_caching.py
+++ b/tests/e2e/multicard/test_prefix_caching.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
+"""Compare the with and without prefix caching."""
 
 import pytest
 
@@ -9,7 +9,7 @@ from tests.e2e.model_utils import check_outputs_equal
 
 MODELS = [
     # for MHA
-    "Qwen/Qwen3-8B-Base",
+    "Qwen/Qwen3-8B",
     # for MLA
     "deepseek-ai/DeepSeek-V2-Lite-Chat"
 ]
@@ -60,9 +60,8 @@ INPUT_PROMPTS = [
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
+def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=2048,
                     tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
@@ -71,7 +70,6 @@
 
     with VllmRunner(model,
                     enable_prefix_caching=False,
-                    enforce_eager=False,
                     max_model_len=2048,
                     tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py
index 67c57daf..c37bfa5a 100644
--- a/tests/e2e/multicard/test_quantization.py
+++ b/tests/e2e/multicard/test_quantization.py
@@ -25,16 +25,17 @@ from modelscope import snapshot_download  # type: ignore
 from tests.e2e.conftest import VllmRunner
 
 
-def test_models_distributed_quantized_W8A8():
+def test_qwen2_5_w8a8_external_quantized_tp2():
     example_prompts = [
         "The president of the United States is",
     ]
     max_tokens = 5
-    with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
-                    tensor_parallel_size=2,
-                    max_model_len=4096,
-                    gpu_memory_utilization=0.8,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(
+            snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
+            tensor_parallel_size=2,
+            max_model_len=4096,
+            gpu_memory_utilization=0.8,
+    ) as vllm_model:
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     golden_results = [
diff --git a/tests/e2e/multicard/test_qwen3_moe.py b/tests/e2e/multicard/test_qwen3_moe.py
index e3758cfc..1d8c51ab 100644
--- a/tests/e2e/multicard/test_qwen3_moe.py
+++ b/tests/e2e/multicard/test_qwen3_moe.py
@@ -29,21 +29,8 @@ from modelscope import snapshot_download  # type: ignore
 from tests.e2e.conftest import VllmRunner
 
 
-def test_models_distributed_Qwen3_MOE_TP2():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            "Qwen/Qwen3-30B-A3B",
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
-def test_models_distributed_Qwen3_MOE_TP2_WITH_EP():
+def test_qwen3_moe_distributed_mp_tp2_ep():
     example_prompts = [
         "Hello, my name is",
     ]
@@ -53,12 +40,11 @@
             tensor_parallel_size=2,
             enable_expert_parallel=True,
             distributed_executor_backend="mp",
-            enforce_eager=False,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-def test_models_distributed_Qwen3_MOE_W8A8():
+def test_qwen3_moe_w8a8_distributed_tp2():
     example_prompts = [
         "Hello, my name is",
     ]
@@ -73,7 +59,7 @@
 
 
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
-def test_models_distributed_Qwen3_MOE_W8A8_WITH_EP():
+def test_qwen3_moe_w8a8_distributed_tp2_ep():
     example_prompts = [
         "Hello, my name is",
     ]
@@ -88,7 +74,7 @@
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH_AIV():
+def test_qwen3_moe_distributed_aiv_tp2():
     os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
     example_prompts = [
         "Hello, my name is",
@@ -99,23 +85,5 @@
             "Qwen/Qwen3-30B-A3B",
             dtype=dtype,
             tensor_parallel_size=2,
-            enforce_eager=False,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
-def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH():
-    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
-        del os.environ['HCCL_OP_EXPANSION_MODE']
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    dtype = "auto"
-    max_tokens = 5
-    with VllmRunner(
-            "Qwen/Qwen3-30B-A3B",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            enforce_eager=False,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/e2e/multicard/test_qwen3_next.py b/tests/e2e/multicard/test_qwen3_next.py
index b406a792..7a7fe64b 100644
--- a/tests/e2e/multicard/test_qwen3_next.py
+++ b/tests/e2e/multicard/test_qwen3_next.py
@@ -29,7 +29,7 @@ from modelscope import snapshot_download  # type: ignore
 from tests.e2e.conftest import VllmRunner
 
 
-def test_models_distributed_Qwen3_NEXT_TP4():
+def test_qwen3_next_distributed_mp_tp4():
     example_prompts = [
         "Hello, my name is",
     ] * 4
@@ -38,13 +38,12 @@
                     tensor_parallel_size=4,
                     max_model_len=4096,
                     gpu_memory_utilization=0.8,
-                    distributed_executor_backend="mp",
-                    enforce_eager=True) as vllm_model:
+                    distributed_executor_backend="mp") as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
         del vllm_model
 
 
-def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
+def test_qwen3_next_distributed_mp_full_decode_only_tp4():
     example_prompts = [
         "Hello, my name is",
     ] * 4
@@ -54,7 +53,6 @@
                     max_model_len=4096,
                     gpu_memory_utilization=0.8,
                     distributed_executor_backend="mp",
-                    enforce_eager=False,
                     compilation_config={
                         "cudagraph_mode": "FULL_DECODE_ONLY",
                         "cudagraph_capture_sizes": [1, 8, 24, 48, 60]
@@ -64,7 +62,7 @@
 
 
 # TODO: Fix the accuary of batch chunked prefill
-def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
+def test_qwen3_next_distributed_mp_eager_mtp_similarity_tp4():
     example_prompts = ["Hello, my name is"]
     max_tokens = 20
 
@@ -110,16 +108,15 @@
 
 # TODO: will conduct accuracy verification after the subsequent version becomes stable
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
-def test_models_distributed_Qwen3_NEXT_W8A8DYNAMIC_WITH_EP():
+def test_qwen3_next_w8a8dynamic_distributed_tp4_ep():
     example_prompts = [
         "Hello, my name is",
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(
-                "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8-Pruning"),
+            snapshot_download("vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8"),
             max_model_len=4096,
-            tensor_parallel_size=2,
+            tensor_parallel_size=4,
             gpu_memory_utilization=0.4,
             max_num_seqs=1,
             enable_expert_parallel=True,
diff --git a/tests/e2e/multicard/test_shared_expert_dp.py b/tests/e2e/multicard/test_shared_expert_dp.py
index 867d3ab6..db95653b 100644
--- a/tests/e2e/multicard/test_shared_expert_dp.py
+++ b/tests/e2e/multicard/test_shared_expert_dp.py
@@ -7,13 +7,13 @@ from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal
 
 MODELS = [
-    "vllm-ascend/DeepSeek-V2-Lite",
+    "deepseek-ai/DeepSeek-V2-Lite",
 ]
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 
 @pytest.mark.parametrize("model", MODELS)
-def test_models_with_enable_shared_expert_dp(model: str) -> None:
+def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
@@ -51,7 +51,7 @@
             model,
             max_model_len=1024,
             tensor_parallel_size=2,
-            enforce_eager=False,
+            enable_expert_parallel=True,
             compilation_config={
                 "cudagraph_capture_sizes": [1, 4, 8, 16],
                 "cudagraph_mode": "FULL_DECODE_ONLY",
diff --git a/tests/e2e/multicard/test_single_request_aclgraph.py b/tests/e2e/multicard/test_single_request_aclgraph.py
index 9e9c1c77..90b5d7e5 100644
--- a/tests/e2e/multicard/test_single_request_aclgraph.py
+++ b/tests/e2e/multicard/test_single_request_aclgraph.py
@@ -23,7 +23,7 @@ from vllm.utils.network_utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
 
-MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
+MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
 DATA_PARALLELS = [2]
 
@@ -39,7 +39,8 @@ api_keyword_args = {
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dp_size", DATA_PARALLELS)
-async def test_single_request_aclgraph(model: str, dp_size: int) -> None:
+async def test_models_single_request_aclgraph_dp2(model: str,
+                                                  dp_size: int) -> None:
     port = get_open_port()
     env_dict = {
         "TASK_QUEUE_ENABLE": "1",