From 1b424fb7f12a0320186f3c2b47df3589e664c907 Mon Sep 17 00:00:00 2001
From: lilinsiman
Date: Fri, 17 Oct 2025 17:15:19 +0800
Subject: [PATCH] ACLgraph enable: Test case revisions for all features (#3388)

### What this PR does / why we need it?
This PR revises the test cases of various features in this repository, enabling ACL graph (aclgraph) in those tests.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: lilinsiman
---
 .../e2e/models/configs/DeepSeek-V2-Lite.yaml | 2 +-
 tests/e2e/multicard/test_data_parallel.py | 2 +-
 tests/e2e/multicard/test_expert_parallel.py | 4 +-
 tests/e2e/multicard/test_ilama_lora_tp2.py | 2 +-
 .../test_offline_inference_distributed.py | 13 ++-
 tests/e2e/multicard/test_prefix_caching.py | 8 +-
 tests/e2e/multicard/test_weight_loader.py | 81 +------------------
 .../e2e/pd_disaggreate/run_edge_case_test.sh | 2 -
 tests/e2e/pd_disaggreate/setup_pd.sh | 2 -
 .../spec_decode_v1/test_v1_spec_decode.py | 8 +-
 tests/e2e/singlecard/test_ascend_scheduler.py | 10 ++-
 tests/e2e/singlecard/test_camem.py | 2 +-
 tests/e2e/singlecard/test_chunked.py | 7 +-
 tests/e2e/singlecard/test_embedding.py | 2 +-
 tests/e2e/singlecard/test_ilama_lora.py | 2 +-
 tests/e2e/singlecard/test_quantization.py | 2 +-
 tests/e2e/singlecard/test_vlm.py | 2 +-
 17 files changed, 34 insertions(+), 117 deletions(-)
diff --git a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml index 58af318..848a491 100644 --- a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml +++ b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml @@ -14,4 +14,4 @@ gpu_memory_utilization: 0.7 apply_chat_template: False fewshot_as_multiturn: False trust_remote_code: True -enforce_eager: True +enforce_eager: False
diff --git a/tests/e2e/multicard/test_data_parallel.py b/tests/e2e/multicard/test_data_parallel.py index 11b7681..2e8ba38 100644 --- a/tests/e2e/multicard/test_data_parallel.py +++ b/tests/e2e/multicard/test_data_parallel.py @@ -52,8 +52,8 @@ def test_data_parallel_inference(model, max_tokens): "--node-rank", "0", "--trust-remote-code", - "--enforce-eager", ] + if model == "Qwen/Qwen3-30B-A3B": cmd.append("--enable-expert-parallel")
diff --git a/tests/e2e/multicard/test_expert_parallel.py b/tests/e2e/multicard/test_expert_parallel.py index 288afdd..f107601 100644 --- a/tests/e2e/multicard/test_expert_parallel.py +++ b/tests/e2e/multicard/test_expert_parallel.py @@ -21,7 +21,7 @@ def test_e2e_ep_correctness(model_name): additional_config={"ascend_scheduler_config": { "enabled": True }}, - enforce_eager=True) as vllm_model: + enforce_eager=False) as vllm_model: tp_output = vllm_model.generate_greedy(example_prompts, max_tokens) with VllmRunner( @@ -31,7 +31,7 @@ def test_e2e_ep_correctness(model_name): additional_config={"ascend_scheduler_config": { "enabled": True }}, - enforce_eager=True) as vllm_model: + enforce_eager=False) as vllm_model: ep_output = vllm_model.generate_greedy(example_prompts, max_tokens) check_outputs_equal(
diff --git a/tests/e2e/multicard/test_ilama_lora_tp2.py b/tests/e2e/multicard/test_ilama_lora_tp2.py index 9fca8ae..750039c 100644 --- a/tests/e2e/multicard/test_ilama_lora_tp2.py +++ b/tests/e2e/multicard/test_ilama_lora_tp2.py @@ -16,7 +16,7 @@ def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files): max_num_seqs=16, tensor_parallel_size=2, distributed_executor_backend=distributed_executor_backend, -
enforce_eager=True) as vllm_model: + enforce_eager=False) as vllm_model: output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2) for i in range(len(EXPECTED_LORA_OUTPUT)): diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index be62e1b..60f3c1b 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -52,7 +52,7 @@ def test_models_distributed_QwQ(): dtype=dtype, tensor_parallel_size=2, distributed_executor_backend="mp", - enforce_eager=True, + enforce_eager=False, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) @@ -163,11 +163,10 @@ def test_sp_for_qwen3_moe() -> None: vllm_model.generate(example_prompts, sampling_params) -@pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.parametrize("model", QWEN_DENSE_MODELS) @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"}) @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"}) -def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager): +def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model): example_prompts = [ "Hello, my name is", ] @@ -176,7 +175,7 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager): with VllmRunner( snapshot_download(model), max_model_len=8192, - enforce_eager=enforce_eager, + enforce_eager=False, dtype="auto", tensor_parallel_size=2, quantization="ascend", @@ -184,12 +183,10 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager): vllm_model.generate_greedy(example_prompts, max_tokens) -@pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.parametrize("model", QWEN_DENSE_MODELS) @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"}) @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"}) -def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight( - model, enforce_eager): +def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(model): example_prompts = [ "Hello, my name is", ] @@ -198,7 +195,7 @@ def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight( with VllmRunner( snapshot_download(model), max_model_len=8192, - enforce_eager=enforce_eager, + enforce_eager=False, dtype="auto", tensor_parallel_size=2, quantization="ascend", diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py index e563488..713cbb4 100644 --- a/tests/e2e/multicard/test_prefix_caching.py +++ b/tests/e2e/multicard/test_prefix_caching.py @@ -62,7 +62,7 @@ INPUT_PROMPTS = [ @pytest.mark.parametrize("max_tokens", [50]) def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: with VllmRunner(model, - enforce_eager=True, + enforce_eager=False, max_model_len=2048, tensor_parallel_size=2, gpu_memory_utilization=0.7) as vllm_model: @@ -71,7 +71,7 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: with VllmRunner(model, enable_prefix_caching=False, - enforce_eager=True, + enforce_eager=False, max_model_len=2048, tensor_parallel_size=2, gpu_memory_utilization=0.7) as vllm_model: @@ -96,7 +96,7 @@ def test_prefix_cache_with_ascend_scheduler(model: str, 'enabled': True, }, }, - enforce_eager=True, + enforce_eager=False, max_model_len=2048, tensor_parallel_size=2, gpu_memory_utilization=0.7) as vllm_model: @@ -109,7 +109,7 @@ def test_prefix_cache_with_ascend_scheduler(model: str, 
'enable_prefix_caching': True, }, }, - enforce_eager=True, + enforce_eager=False, max_model_len=2048, tensor_parallel_size=2, gpu_memory_utilization=0.7) as vllm_model: diff --git a/tests/e2e/multicard/test_weight_loader.py b/tests/e2e/multicard/test_weight_loader.py index f59cd1f..2150a44 100644 --- a/tests/e2e/multicard/test_weight_loader.py +++ b/tests/e2e/multicard/test_weight_loader.py @@ -33,47 +33,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @pytest.mark.parametrize("model", MOE_MODELS) -def test_external_launcher_eager(model): - script = script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py" - env = os.environ.copy() - # TODO: Change to 2 when ci machine has 4 cards - cmd = [ - sys.executable, - str(script), - "--model", - model, - "--tp-size", - "2", - "--proc-per-node", - "2", - "--trust-remote-code", - "--enforce-eager", - "--enable-expert-parallel", - "--enable-sleep-mode", - "--model-weight-gib", - "20", - ] - - print(f"Running subprocess: {' '.join(cmd)}") - proc = subprocess.run( - cmd, - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - timeout=600, - ) - output = proc.stdout.decode() - - print(output) - - assert "TP RANKS: [0]" in output - assert "TP RANKS: [1]" in output - assert "Generated text:" in output - assert proc.returncode == 0 - - -@pytest.mark.parametrize("model", MOE_MODELS) -def test_external_launcher_aclgraph(model): +def test_external_launcher(model): script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py" env = os.environ.copy() # TODO: Change to 2 when ci machine has 4 cards @@ -147,42 +107,3 @@ def test_external_launcher_dense(model): assert "TP RANKS: [1]" in output assert "Generated text:" in output assert proc.returncode == 0 - - -@pytest.mark.parametrize("model", MODELS) -def test_external_launcher_dense_eager(model): - script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py" - env = os.environ.copy() - # TODO: Change to 2 when ci machine has 4 cards - cmd = [ - sys.executable, - str(script), - "--model", - model, - "--tp-size", - "2", - "--proc-per-node", - "2", - "--trust-remote-code", - "--enforce-eager", - "--enable-sleep-mode", - "--model-weight-gib", - "20", - ] - - print(f"Running subprocess: {' '.join(cmd)}") - proc = subprocess.run( - cmd, - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - timeout=600, - ) - output = proc.stdout.decode() - - print(output) - - assert "TP RANKS: [0]" in output - assert "TP RANKS: [1]" in output - assert "Generated text:" in output - assert proc.returncode == 0 diff --git a/tests/e2e/pd_disaggreate/run_edge_case_test.sh b/tests/e2e/pd_disaggreate/run_edge_case_test.sh index 49e06e5..f0e7ace 100644 --- a/tests/e2e/pd_disaggreate/run_edge_case_test.sh +++ b/tests/e2e/pd_disaggreate/run_edge_case_test.sh @@ -73,7 +73,6 @@ run_tests_for_model() { BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_ASCEND_LLMDD_RPC_PORT=5559 vllm serve $model_name \ --port $PREFILL_PORT \ --seed 1024 \ - --enforce-eager \ --disable-log-requests \ --gpu-memory-utilization 0.8 \ --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'" @@ -93,7 +92,6 @@ run_tests_for_model() { BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 
VLLM_ASCEND_LLMDD_RPC_PORT=6000 vllm serve $model_name \ --port $DECODE_PORT \ --seed 1024 \ - --enforce-eager \ --disable-log-requests \ --gpu-memory-utilization 0.8 \ --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'" diff --git a/tests/e2e/pd_disaggreate/setup_pd.sh b/tests/e2e/pd_disaggreate/setup_pd.sh index c15f109..675bee4 100644 --- a/tests/e2e/pd_disaggreate/setup_pd.sh +++ b/tests/e2e/pd_disaggreate/setup_pd.sh @@ -66,7 +66,6 @@ function run_prefill_instance() { --served-model-name Deepseek \ --max-model-len 2000 \ --trust-remote-code \ - --enforce-eager \ --kv-transfer-config "$KV_CONFIG" } @@ -120,7 +119,6 @@ function run_decode_instance() { --max-num-batched-tokens 2000 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --enforce-eager \ --kv-transfer-config "$KV_CONFIG" } diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py index 0c1546d..a6b6f16 100644 --- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py @@ -71,7 +71,7 @@ def test_ngram_correctness( should be the same when using ngram speculative decoding. ''' pytest.skip("Not current support for the test.") - ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True) + ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm with VllmRunner(model_name, @@ -82,7 +82,7 @@ def test_ngram_correctness( "num_speculative_tokens": 3, }, max_model_len=1024, - enforce_eager=True) as runner: + enforce_eager=False) as runner: spec_outputs = runner.model.chat(test_prompts, sampling_config) matches = 0 misses = 0 @@ -111,7 +111,7 @@ def test_eagle_correctness( should be the same when using eagle speculative decoding. 
''' - ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True) + ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm @@ -129,7 +129,7 @@ def test_eagle_correctness( "max_model_len": 128, }, max_model_len=128, - enforce_eager=True, + enforce_eager=False, ) as runner: spec_outputs = runner.model.chat(test_prompts, sampling_config) diff --git a/tests/e2e/singlecard/test_ascend_scheduler.py b/tests/e2e/singlecard/test_ascend_scheduler.py index b6ab3f3..916db51 100644 --- a/tests/e2e/singlecard/test_ascend_scheduler.py +++ b/tests/e2e/singlecard/test_ascend_scheduler.py @@ -9,7 +9,8 @@ from tests.e2e.model_utils import check_outputs_equal MODEL = "Qwen/Qwen3-0.6B" -def test_concurrent_partial_prefill(): +@pytest.mark.parametrize("enforce_eager", [True, False]) +def test_concurrent_partial_prefill(enforce_eager): with VllmRunner(MODEL, additional_config={ 'ascend_scheduler_config': { @@ -18,7 +19,7 @@ def test_concurrent_partial_prefill(): }, max_num_seqs=3, max_num_batched_tokens=2048, - enforce_eager=True, + enforce_eager=enforce_eager, max_model_len=2048, gpu_memory_utilization=0.7) as vllm_model: outputs = vllm_model.model.generate(["Hello my name is Robert and I"] * @@ -28,7 +29,8 @@ def test_concurrent_partial_prefill(): assert len(output.outputs) == 1 -def test_prefix_cache_stats_is_recorded(): +@pytest.mark.parametrize("enforce_eager", [True, False]) +def test_prefix_cache_stats_is_recorded(enforce_eager): with VllmRunner(MODEL, additional_config={ 'ascend_scheduler_config': { @@ -37,7 +39,7 @@ def test_prefix_cache_stats_is_recorded(): }, max_num_seqs=3, max_num_batched_tokens=2048, - enforce_eager=True, + enforce_eager=enforce_eager, max_model_len=2048, gpu_memory_utilization=0.7) as vllm_model: # 17 tokens will make sure first 16 tokens are cached in a block diff --git a/tests/e2e/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py index 2ca8a1b..3f1f92b 100644 --- a/tests/e2e/singlecard/test_camem.py +++ b/tests/e2e/singlecard/test_camem.py @@ -74,7 +74,7 @@ def test_end_to_end(): sampling_params = SamplingParams(temperature=0, max_tokens=10) with VllmRunner("Qwen/Qwen3-0.6B", - enforce_eager=True, + enforce_eager=False, enable_sleep_mode=True) as runner: output = runner.model.generate(prompt, sampling_params) diff --git a/tests/e2e/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked.py index 40df8f8..f6eacb7 100644 --- a/tests/e2e/singlecard/test_chunked.py +++ b/tests/e2e/singlecard/test_chunked.py @@ -43,12 +43,13 @@ def test_models( temperature=0.0, ) - with VllmRunner(model, long_prefill_token_threshold=20, - enforce_eager=True) as vllm_model: + with VllmRunner(model, + long_prefill_token_threshold=20, + enforce_eager=False) as vllm_model: output1 = vllm_model.generate(prompts, sampling_params) with VllmRunner(model, - enforce_eager=True, + enforce_eager=False, additional_config={ 'ascend_scheduler_config': { 'enabled': True diff --git a/tests/e2e/singlecard/test_embedding.py b/tests/e2e/singlecard/test_embedding.py index 4f85dd7..8c63a98 100644 --- a/tests/e2e/singlecard/test_embedding.py +++ b/tests/e2e/singlecard/test_embedding.py @@ -29,7 +29,7 @@ def test_embed_models_correctness(): with VllmRunner( model_name, task="embed", - enforce_eager=True, + enforce_eager=False, ) as vllm_runner: vllm_outputs = vllm_runner.encode(queries) diff --git a/tests/e2e/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py index 499e46f..daeac4a 
100644 --- a/tests/e2e/singlecard/test_ilama_lora.py +++ b/tests/e2e/singlecard/test_ilama_lora.py @@ -51,7 +51,7 @@ def test_ilama_lora(ilama_lora_files): max_loras=4, max_model_len=1024, max_num_seqs=16, - enforce_eager=True) as vllm_model: + enforce_eager=False) as vllm_model: output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): diff --git a/tests/e2e/singlecard/test_quantization.py b/tests/e2e/singlecard/test_quantization.py index 4ec3198..95f26ee 100644 --- a/tests/e2e/singlecard/test_quantization.py +++ b/tests/e2e/singlecard/test_quantization.py @@ -28,7 +28,7 @@ def test_quant_W8A8(): with VllmRunner( snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"), max_model_len=8192, - enforce_eager=True, + enforce_eager=False, gpu_memory_utilization=0.7, quantization="ascend", ) as vllm_model: diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py index 59fb10e..654078e 100644 --- a/tests/e2e/singlecard/test_vlm.py +++ b/tests/e2e/singlecard/test_vlm.py @@ -46,7 +46,7 @@ def test_multimodal_vl(prompt_template): "max_pixels": 1280 * 28 * 28, "fps": 1, }, - enforce_eager=True) as vllm_model: + enforce_eager=False) as vllm_model: outputs = vllm_model.generate_greedy(prompts=prompts, images=images, max_tokens=64)
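The revised tests share one pattern: remove `--enforce-eager` / switch `enforce_eager=True` to `False` (or parametrize over both) so the engine captures the model into an ACL graph instead of always running eagerly. Below is a minimal sketch of that pattern, mirroring the parametrized form added to `test_ascend_scheduler.py`; the import path for the `VllmRunner` e2e helper is an assumption, as it is not shown in this patch.

```python
# Minimal sketch, not part of the patch above. The import path for the
# vllm-ascend e2e helper VllmRunner is assumed, not taken from this patch.
import pytest

from tests.e2e.conftest import VllmRunner


@pytest.mark.parametrize("enforce_eager", [True, False])
def test_generation_with_and_without_aclgraph(enforce_eager):
    prompts = ["Hello, my name is"]
    # enforce_eager=False keeps graph mode on, so the model is captured into an
    # ACL graph on Ascend; enforce_eager=True forces plain eager execution.
    with VllmRunner("Qwen/Qwen3-0.6B",
                    max_model_len=2048,
                    gpu_memory_utilization=0.7,
                    enforce_eager=enforce_eager) as vllm_model:
        vllm_model.generate_greedy(prompts, 32)
```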