From 27b09ca9b9515a271683a626cec85d33534520b4 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Mon, 1 Dec 2025 20:33:50 +0800
Subject: [PATCH] [CI] drop ascend scheduler test (#4582)

Let's drop the ascend scheduler test first to ensure all functions work
without it.

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

Signed-off-by: wangxiyuan
---
 .github/workflows/_e2e_test.yaml              |  1 -
 .../test_offline_inference_parallel_310p.py   |  3 -
 tests/e2e/multicard/test_expert_parallel.py   | 21 ++---
 .../multicard/test_fused_moe_allgather_ep.py  | 16 +---
 .../test_offline_inference_distributed.py     | 14 +---
 tests/e2e/multicard/test_prefix_caching.py    | 64 ---------------
 tests/e2e/multicard/test_qwen3_next.py        |  6 --
 .../e2e/multicard/test_torchair_graph_mode.py | 12 ---
 .../test_mtpx_deepseek_r1_0528_w8a8.py        |  6 +-
 ...test_prefix_cache_deepseek_r1_0528_w8a8.py |  3 -
 .../test_prefix_cache_qwen3_32b_int8.py       |  7 +-
 .../test_qwen3_32b_int8_a3_feature_stack3.py  |  3 +-
 .../models/test_deepseek_r1_0528_w8a8.py      | 10 +--
 .../models/test_deepseek_r1_w8a8_eplb.py      |  3 -
 .../models/test_deepseek_v3_2_exp_w8a8.py     |  3 +-
 .../e2e/nightly/models/test_qwen2_5_vl_32b.py |  5 +-
 .../models/test_qwen3_235b_a22b_w8a8_eplb.py  |  6 +-
 .../nightly/models/test_qwen3_235b_w8a8.py    |  6 --
 tests/e2e/nightly/models/test_qwq_32b.py      |  2 -
 .../models/DeepSeek-R1-W8A8-A2-torchair.yaml  |  4 +-
 .../config/models/DeepSeek-R1-W8A8-A2.yaml    |  4 +-
 .../config/models/DeepSeek-R1-W8A8-EPLB.yaml  |  8 +-
 .../config/models/DeepSeek-R1-W8A8.yaml       |  8 +-
 .../config/models/DeepSeek-V3_2-Exp-bf16.yaml |  4 +-
 .../spec_decode_v1/test_v1_mtp_correctness.py | 41 +++++-----
 tests/e2e/singlecard/test_ascend_scheduler.py | 52 ------------
 tests/e2e/singlecard/test_chunked.py          | 82 -------------------
 tests/e2e/singlecard/test_vlm.py              | 35 --------
 28 files changed, 53 insertions(+), 376 deletions(-)
 delete mode 100644 tests/e2e/singlecard/test_chunked.py

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index c07906ba..c7e883a0 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -94,7 +94,6 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
           pytest -sv tests/e2e/singlecard/test_bge_model.py
           pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_chunked.py
           pytest -sv tests/e2e/singlecard/test_embedding.py
           # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
diff --git a/tests/e2e/310p/test_offline_inference_parallel_310p.py b/tests/e2e/310p/test_offline_inference_parallel_310p.py
index 6bf33568..7ba7ef73 100644
--- a/tests/e2e/310p/test_offline_inference_parallel_310p.py
+++ b/tests/e2e/310p/test_offline_inference_parallel_310p.py
@@ -29,9 +29,6 @@ ADDITIONAL_CONFIG = [{
     "additional_config": {
         "torchair_graph_config": {
             "enabled": True
-        },
-        "ascend_scheduler_config": {
-            "enabled": True,
         }
     }
 }]
diff --git a/tests/e2e/multicard/test_expert_parallel.py b/tests/e2e/multicard/test_expert_parallel.py
index f1076013..b8f03d5f 100644
--- a/tests/e2e/multicard/test_expert_parallel.py
+++ b/tests/e2e/multicard/test_expert_parallel.py
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
     max_tokens = 5

     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name, tensor_parallel_size=2,
+                    enforce_eager=False) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True,
+                    enforce_eager=False) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(
diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
index 9335e19a..85d246e5 100644
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)


@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index 320c3bdf..1380c49e 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
                 "enabled": True,
             },
             "enable_multistream_moe": True,
-            "ascend_scheduler_config": {
-                "enabled": True,
-            },
             "refresh": True,
         },
     ) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
         quantization="ascend",
         enforce_eager=True,
         enable_expert_parallel=True,
-        additional_config={
-            "torchair_graph_config": {
-                "enabled": False,
-            },
-            "ascend_scheduler_config": {
-                "enabled": True,
-            }
-        },
+        additional_config={"torchair_graph_config": {
+            "enabled": False,
+        }},
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens)
diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py
index e2991662..f16c94b1 100644
--- a/tests/e2e/multicard/test_prefix_caching.py
+++ b/tests/e2e/multicard/test_prefix_caching.py
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
         name_0="vllm_output",
         name_1="prefix_cache_output",
     )
-
-
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_ascend_scheduler(model: str,
-                                            max_tokens: int) -> None:
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_prefix_caching': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
-
-    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
-    # Disable it now. Fix it or drop the ascend scheduler in the future.
-    # with VllmRunner(model,
-    #                 additional_config={
-    #                     'ascend_scheduler_config': {
-    #                         'enabled': True,
-    #                         'enable_prefix_caching': True,
-    #                         "enable_chunked_prefill": True,
-    #                     },
-    #                 },
-    #                 enforce_eager=True,
-    #                 max_model_len=2048,
-    #                 tensor_parallel_size=2,
-    #                 gpu_memory_utilization=0.7) as vllm_model:
-    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
-    #         INPUT_PROMPTS, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=prefix_cache_output,
-        name_0="vllm_output",
-        name_1="prefix_cache_output",
-    )
-
-    # check_outputs_equal(
-    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
-    #     outputs_1_lst=prefix_cache_output,
-    #     name_0="chunk_prefill_prefix_cache_output",
-    #     name_1="prefix_cache_output",
-    # )
diff --git a/tests/e2e/multicard/test_qwen3_next.py b/tests/e2e/multicard/test_qwen3_next.py
index e51748ea..eaacd838 100644
--- a/tests/e2e/multicard/test_qwen3_next.py
+++ b/tests/e2e/multicard/test_qwen3_next.py
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
         gpu_memory_utilization=0.8,
         distributed_executor_backend="mp",
         enforce_eager=True,
-        additional_config={
-            "ascend_scheduler_config": {
-                "enabled": True,
-                "enable_chunked_prefill": False
-            }
-        },
         speculative_config={
             "method": "qwen3_next_mtp",
             "num_speculative_tokens": 1
diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py
index 6a488782..3472051e 100644
--- a/tests/e2e/multicard/test_torchair_graph_mode.py
+++ b/tests/e2e/multicard/test_torchair_graph_mode.py
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enabled": True,
-            },
             "refresh": True,
         }
     additional_config.update(**kwargs)
@@ -121,9 +118,6 @@ def _pangu_torchair_test_fixture(

     # torchair is only work without chunked-prefill now
     kwargs = {
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }
     additional_config.update(**kwargs)
@@ -186,9 +180,6 @@ def _qwen_torchair_test_fixture(
         "torchair_graph_config": {
             "enabled": False,
         },
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }

@@ -245,9 +236,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enable": True,
-            },
             "refresh": True,
         }
     additional_config.update(**kwargs)
diff --git a/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
index 65d01b21..880b44ae 100644
--- a/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
         "VLLM_RPC_TIMEOUT": "3600000",
         "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
     }
-    additional_config: dict[str, Any] = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-    }
+    additional_config: dict[str, Any] = {}
     speculative_config = {
         "num_speculative_tokens": 2,
         "method": "deepseek_mtp"
     }
diff --git a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
index 8ac1883d..80157588 100644
--- a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
         "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
     }
     additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
         "torchair_graph_config": {
             "enabled": True,
             "enable_multistream_moe": False,
diff --git a/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py b/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
index 3ee23287..fdf7167b 100644
--- a/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
@@ -68,12 +68,7 @@ aisbench_cases75 = [{
 async def test_models(model: str) -> None:
     port = get_open_port()
     env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
-    additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-        "enable_weight_nz_layout": True
-    }
+    additional_config = {"enable_weight_nz_layout": True}
     server_args = [
         "--quantization", "ascend", "--reasoning-parser", "qwen3",
         "--tensor-parallel-size", "4", "--port",
diff --git a/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py b/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
index 17a7f4b6..9fa2d1e5 100644
--- a/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
+++ b/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
         "0.9", "--block-size", "128", "--max-num-seqs", "256",
         "--enforce-eager", "--max-model-len", "35840",
         "--max-num-batched-tokens", "35840", "--additional-config",
-        '{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
-        "--compilation-config",
+        '{"enable_weight_nz_layout":true}', "--compilation-config",
         '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
     ]
     with RemoteOpenAIServer(model,
diff --git a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
index c9126577..35082edb 100644
--- a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
@@ -33,7 +33,6 @@ MODES = [
     "single",
     "aclgraph",
     "aclgraph_mlapo",
-    "no_chunkprefill",
 ]

 prompts = [
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
         "method": "deepseek_mtp"
     }
     additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
         "torchair_graph_config": {
             "enabled": True,
             "enable_multistream_moe": False,
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
     if mode == "aclgraph_mlapo":
         env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
         additional_config["torchair_graph_config"] = {"enabled": False}
-    if mode == "no_chunkprefill":
-        additional_config["ascend_scheduler_config"] = {"enabled": True}
-        i = server_args.index("--max-num-batched-tokens") + 1
-        server_args[i] = "36864"
     server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
@@ -134,7 +126,7 @@ async def test_models(model: str, mode: str) -> None:
         choices: list[openai.types.CompletionChoice] = batch.choices
"empty response" print(choices) - if mode in ["single", "no_chunkprefill"]: + if mode in ["single"]: return # aisbench test run_aisbench_cases(model, diff --git a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py index bca2baf0..6413aba0 100644 --- a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py +++ b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py @@ -71,9 +71,6 @@ async def test_models(model: str) -> None: "cudagraph_mode": "FULL_DECODE_ONLY" } additional_config: dict[str, Any] = { - "ascend_scheduler_config": { - "enabled": False - }, "torchair_graph_config": { "enabled": True }, diff --git a/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py b/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py index 217b2786..9d5b78f0 100644 --- a/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py +++ b/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py @@ -92,8 +92,7 @@ async def test_models(model: str, tp_size: int, dp_size: int, "--gpu-memory-utilization", "0.9", "--additional-config", - '{"ascend_scheduler_config":{"enabled":true},' - '"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}', + '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}', ] if full_graph: server_args += [ diff --git a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py index fe6bbedf..77c1a7e1 100644 --- a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py +++ b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py @@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None: str(tp_size), "--port", str(port), "--max-model-len", "30000", "--max-num-batched-tokens", "40000", "--max-num-seqs", "400", "--trust-remote-code", - "--gpu-memory-utilization", "0.8", "--additional-config", - '{"ascend_scheduler_config":{"enabled":false}}', - "--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}' + "--gpu-memory-utilization", "0.8", "--compilation_config", + '{"cudagraph_mode": "FULL_DECODE_ONLY"}' ] request_keyword_args: dict[str, Any] = { **api_keyword_args, diff --git a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py index 945d7cae..efbf77d2 100644 --- a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py +++ b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py @@ -60,11 +60,7 @@ async def test_models(model: str) -> None: "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1" } - additional_config: dict[str, Any] = { - "ascend_scheduler_config": { - "enabled": False - }, - } + additional_config: dict[str, Any] = {} compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"} server_args = [ "--quantization", "ascend", "--async-scheduling", diff --git a/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py b/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py index 8220e4d5..055a452e 100644 --- a/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py +++ b/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py @@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None: "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1" } - additional_config: dict[str, Any] = { - "ascend_scheduler_config": { - "enabled": False - }, - } compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"} server_args = [ "--quantization", "ascend", "--async-scheduling", @@ -82,7 +77,6 @@ async def test_models(model: str, 
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:

     server_args.extend(
         ["--compilation-config", json.dumps(compilation_config)])
-    server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
diff --git a/tests/e2e/nightly/models/test_qwq_32b.py b/tests/e2e/nightly/models/test_qwq_32b.py
index a60eff22..824651ba 100644
--- a/tests/e2e/nightly/models/test_qwq_32b.py
+++ b/tests/e2e/nightly/models/test_qwq_32b.py
@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
         server_args.remove(
             '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
         )
-        server_args.append("--additional-config")
-        server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
         server_args.append("--enforce-eager")
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
index 42b70f76..7bfe3f5e 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
@@ -30,7 +30,7 @@ deployment:
       --quantization ascend
       --gpu-memory-utilization 0.9
       --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'

   -
     server_cmd: >
@@ -51,7 +51,7 @@ deployment:
       --quantization ascend
       --gpu-memory-utilization 0.9
       --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
   acc:
     case_type: accuracy
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
index cf44bc8f..01100f29 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
@@ -31,7 +31,7 @@ deployment:
       --gpu-memory-utilization 0.9
       --enforce-eager
       --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'

   -
     server_cmd: >
@@ -53,5 +53,5 @@ deployment:
       --gpu-memory-utilization 0.9
       --enforce-eager
       --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
index 9a4c3d94..6ca189c4 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
@@ -50,7 +50,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'

   -
     server_cmd: >
@@ -80,7 +80,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
   -
     server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -111,7 +111,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
   -
     server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -141,7 +141,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
      }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
 benchmarks:
   perf:
     case_type: performance
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
index a8e49290..37a024b9 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
@@ -49,7 +49,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+      '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'

   -
     server_cmd: >
@@ -79,7 +79,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+      '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
   -
     server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -110,7 +110,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
+      '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
   -
     server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -140,7 +140,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
+      '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
 benchmarks:
   perf:
     case_type: performance
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
index 6dafd3cc..40ac6476 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
@@ -29,7 +29,7 @@ deployment:
       --trust-remote-code
       --no-enable-prefix-caching
       --gpu-memory-utilization 0.9
-      --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'

   -
     server_cmd: >
@@ -49,5 +49,5 @@ deployment:
       --trust-remote-code
       --no-enable-prefix-caching
       --gpu-memory-utilization 0.92
-      --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
 benchmarks:
diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
index 2f56d9d2..6b90ec36 100644
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
@@ -48,27 +48,26 @@ def mtp_correctness(sampling_config: SamplingParams,
     if graph_mode == CUDAGraphMode.FULL:
         graph_mode_str = "FULL_DECODE_ONLY"

-    with VllmRunner(
-        model_name,
-            tensor_parallel_size=1,
-            max_num_seqs=256,
-            gpu_memory_utilization=0.7,
-            distributed_executor_backend="mp",
-            enable_expert_parallel=True,
-            speculative_config={
-                "method": "deepseek_mtp",
-                "num_speculative_tokens": num_speculative_tokens,
-                "disable_padded_drafter_batch": disable_padded_drafter_batch,
-            },
-            enforce_eager=enforce_eager,
-            max_model_len=2000,
-            compilation_config=CompilationConfig(
-                cudagraph_mode=graph_mode_str,
-                cudagraph_capture_sizes=[12],
-            ),
-            additional_config={"ascend_scheduler_config": {
-                "enabled": False
-            }}) as spec_llm:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=1,
+                    max_num_seqs=256,
+                    gpu_memory_utilization=0.7,
+                    distributed_executor_backend="mp",
+                    enable_expert_parallel=True,
+                    speculative_config={
+                        "method":
+                        "deepseek_mtp",
+                        "num_speculative_tokens":
+                        num_speculative_tokens,
+                        "disable_padded_drafter_batch":
+                        disable_padded_drafter_batch,
+                    },
+                    enforce_eager=enforce_eager,
+                    max_model_len=2000,
+                    compilation_config=CompilationConfig(
+                        cudagraph_mode=graph_mode_str,
+                        cudagraph_capture_sizes=[12],
+                    )) as spec_llm:
         spec_outputs = spec_llm.generate(example_prompts, sampling_config)

     matches = 0
diff --git a/tests/e2e/singlecard/test_ascend_scheduler.py b/tests/e2e/singlecard/test_ascend_scheduler.py
index 502a8103..0c996e4e 100644
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/test_ascend_scheduler.py
@@ -12,11 +12,6 @@ MODEL = "Qwen/Qwen3-0.6B"
 @pytest.mark.parametrize("enforce_eager", [True, False])
 def test_concurrent_partial_prefill(enforce_eager):
     with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
                     max_num_seqs=3,
                     max_num_batched_tokens=8192,
                     enforce_eager=enforce_eager,
@@ -31,11 +26,6 @@ def test_concurrent_partial_prefill(enforce_eager):
 @pytest.mark.parametrize("enforce_eager", [True, False])
 def test_prefix_cache_stats_is_recorded(enforce_eager):
     with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
                     max_num_seqs=3,
                     max_num_batched_tokens=8192,
                     enforce_eager=enforce_eager,
@@ -47,48 +37,6 @@ def test_prefix_cache_stats_is_recorded(enforce_eager):
         assert outputs[0].num_cached_tokens == 128
-
-
-@pytest.mark.parametrize("max_tokens",
-                         [4])  # cannot align results when max_tokens > 4
-@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
-def test_chunked_prefill_with_ascend_scheduler(
-        max_tokens: int, chunked_prefill_token_size: int) -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
-    ]
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_chunked_prefill': True,
-                        },
-                    },
-                    max_num_seqs=max_num_seqs,
-                    max_num_batched_tokens=max_num_batched_tokens,
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        chunked_prefill_output = vllm_model.generate_greedy(
-            example_prompts, max_tokens)
-
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=chunked_prefill_output,
-        name_0="vllm_output",
-        name_1="chunked_prefill_output",
-    )
-
-
 @pytest.mark.parametrize("max_tokens",
                          [4])  # cannot align results when max_tokens > 4
 @pytest.mark.parametrize("chunked_prefill_token_size", [2048])
diff --git a/tests/e2e/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked.py
deleted file mode 100644
index f6eacb71..00000000
--- a/tests/e2e/singlecard/test_chunked.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""
-Compare the outputs of vLLM with and without aclgraph.
-
-Run `pytest tests/compile/test_aclgraph.py`.
-""" -import gc - -import pytest -import torch -from vllm import SamplingParams - -from tests.e2e.conftest import VllmRunner - -MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("max_tokens", [1]) -def test_models( - model: str, - max_tokens: int, -) -> None: - prompts = ["The president of the United States is"] - - sampling_params = SamplingParams( - max_tokens=max_tokens, - temperature=0.0, - ) - - with VllmRunner(model, - long_prefill_token_threshold=20, - enforce_eager=False) as vllm_model: - output1 = vllm_model.generate(prompts, sampling_params) - - with VllmRunner(model, - enforce_eager=False, - additional_config={ - 'ascend_scheduler_config': { - 'enabled': True - }, - }) as vllm_model: - output2 = vllm_model.generate(prompts, sampling_params) - - # Extract the generated token IDs for comparison - token_ids1 = output1[0][0][0] - token_ids2 = output2[0][0][0] - - print(f"Token IDs 1: {token_ids1}") - print(f"Token IDs 2: {token_ids2}") - - # Convert token IDs to tensors and calculate cosine similarity - # Take the length of a shorter sequence to ensure consistent dimensions - min_len = min(len(token_ids1), len(token_ids2)) - - tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32) - tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32) - - # Calculate similarity using torch.cosine_similarity - similarity = torch.cosine_similarity(tensor1, tensor2, dim=0) - print(f"Token IDs cosine similarity: {similarity.item()}") - - assert similarity > 0.95 - - gc.collect() - torch.npu.empty_cache() - torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py index cc3d50f8..95456679 100644 --- a/tests/e2e/singlecard/test_vlm.py +++ b/tests/e2e/singlecard/test_vlm.py @@ -20,7 +20,6 @@ Run `pytest tests/test_offline_inference.py`. """ -import pytest from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset @@ -55,40 +54,6 @@ def test_multimodal_vl(prompt_template): assert output_str, "Generated output should not be empty." -@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. " - "Add this back after fixing the issue.") -def test_multimodal_ascend_scheduler(prompt_template): - image = ImageAsset("cherry_blossom") \ - .pil_image.convert("RGB") - img_questions = [ - "What is the content of this image?", - "Describe the content of this image in detail.", - "What's in the image?", - "Where is this image taken?", - ] - images = [image] * len(img_questions) - prompts = prompt_template(img_questions) - with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct", - max_model_len=4096, - additional_config={ - 'ascend_scheduler_config': { - 'enabled': True, - }, - }, - mm_processor_kwargs={ - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - "fps": 1, - }, - enforce_eager=True) as vllm_model: - outputs = vllm_model.generate_greedy(prompts=prompts, - images=images, - max_tokens=64) - assert len(outputs) == len(prompts) - for _, output_str in outputs: - assert output_str, "Generated output should not be empty." - - def test_multimodal_audio(): audio_prompt = "".join([ f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"