From 44b58b86659a55a690ffa9c0362f02ba3645d3da Mon Sep 17 00:00:00 2001
From: jiangyunfan1
Date: Tue, 4 Nov 2025 16:47:48 +0800
Subject: [PATCH] [TEST] Add full graph for multimodal nightly tests (#3968)

### What this PR does / why we need it?
This PR enables full graph mode for the multimodal nightly tests by passing
`--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'` to the test
servers; we need to keep this scenario covered.

### How was this patch tested?
By running the modified nightly tests.

- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac

Signed-off-by: jiangyunfan1
---
 .../e2e/nightly/models/test_qwen2_5_vl_32b.py |  3 ++-
 .../e2e/nightly/models/test_qwen2_5_vl_7b.py  | 20 ++++++-------------
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
index 09c0d4ee..760f8dee 100644
--- a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
+++ b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
@@ -86,7 +86,8 @@ async def test_models(model: str, tp_size: int) -> None:
         str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
         "40000", "--max-num-seqs", "400", "--trust-remote-code",
         "--gpu-memory-utilization", "0.8", "--additional-config",
-        '{"ascend_scheduler_config":{"enabled":false}}'
+        '{"ascend_scheduler_config":{"enabled":false}}',
+        "--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
     ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
diff --git a/tests/e2e/nightly/models/test_qwen2_5_vl_7b.py b/tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
index 578ae1db..bc35ff88 100644
--- a/tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
+++ b/tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
@@ -72,21 +72,13 @@ async def test_models(model: str, tp_size: int) -> None:
         "HCCL_OP_EXPANSION_MODE": "AIV"
     }
     server_args = [
-        "--no-enable-prefix-caching",
-        "--disable-mm-preprocessor-cache",
+        "--no-enable-prefix-caching", "--disable-mm-preprocessor-cache",
         "--tensor-parallel-size",
-        str(tp_size),
-        "--port",
-        str(port),
-        "--max-model-len",
-        "30000",
-        "--max-num-batched-tokens",
-        "40000",
-        "--max-num-seqs",
-        "400",
-        "--trust-remote-code",
-        "--gpu-memory-utilization",
-        "0.8",
+        str(tp_size), "--port",
+        str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
+        "40000", "--max-num-seqs", "400", "--trust-remote-code",
+        "--gpu-memory-utilization", "0.8", "--compilation_config",
+        '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
     ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
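
For reference, the sketch below shows one way to launch a standalone server with the same full-graph setting outside the CI harness. It is a minimal sketch, not part of this patch: the model name and tensor-parallel size are illustrative assumptions (the nightly tests parametrize both), and only the `--compilation_config` value is taken from the diff above.

```python
# Minimal sketch (not part of this patch): spawn a vLLM OpenAI-compatible
# server with the full-graph decode setting these tests exercise.
import subprocess

server_cmd = [
    "vllm", "serve",
    "Qwen/Qwen2.5-VL-7B-Instruct",  # illustrative; the tests parametrize `model`
    "--tensor-parallel-size", "2",  # illustrative; the tests pass `tp_size`
    "--max-model-len", "30000",
    "--trust-remote-code",
    "--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}',
]
subprocess.run(server_cmd, check=True)  # blocks for as long as the server runs
```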