diff --git a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py index 09c0d4ee..760f8dee 100644 --- a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py +++ b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py @@ -86,7 +86,8 @@ async def test_models(model: str, tp_size: int) -> None: str(port), "--max-model-len", "30000", "--max-num-batched-tokens", "40000", "--max-num-seqs", "400", "--trust-remote-code", "--gpu-memory-utilization", "0.8", "--additional-config", - '{"ascend_scheduler_config":{"enabled":false}}' + '{"ascend_scheduler_config":{"enabled":false}}', + "--compilation-config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}' ] request_keyword_args: dict[str, Any] = { **api_keyword_args, diff --git a/tests/e2e/nightly/models/test_qwen2_5_vl_7b.py b/tests/e2e/nightly/models/test_qwen2_5_vl_7b.py index 578ae1db..bc35ff88 100644 --- a/tests/e2e/nightly/models/test_qwen2_5_vl_7b.py +++ b/tests/e2e/nightly/models/test_qwen2_5_vl_7b.py @@ -72,21 +72,13 @@ async def test_models(model: str, tp_size: int) -> None: "HCCL_OP_EXPANSION_MODE": "AIV" } server_args = [ - "--no-enable-prefix-caching", - "--disable-mm-preprocessor-cache", + "--no-enable-prefix-caching", "--disable-mm-preprocessor-cache", "--tensor-parallel-size", - str(tp_size), - "--port", - str(port), - "--max-model-len", - "30000", - "--max-num-batched-tokens", - "40000", - "--max-num-seqs", - "400", - "--trust-remote-code", - "--gpu-memory-utilization", - "0.8", + str(tp_size), "--port", + str(port), "--max-model-len", "30000", "--max-num-batched-tokens", + "40000", "--max-num-seqs", "400", "--trust-remote-code", + "--gpu-memory-utilization", "0.8", "--compilation-config", + '{"cudagraph_mode": "FULL_DECODE_ONLY"}' ] request_keyword_args: dict[str, Any] = { **api_keyword_args,