[TEST]Add full graph for multimodal nightly tests (#3968)

### What this PR does / why we need it?
This PR adds full graph mode (`FULL_DECODE_ONLY` cudagraph mode) to the multimodal nightly tests; we need to maintain coverage for this scenario.

### How was this patch tested?
By running the updated nightly tests.
- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: jiangyunfan1 <jiangyunfan1@h-partners.com>
Author: jiangyunfan1
Date: 2025-11-04 16:47:48 +08:00 (committed by GitHub)
Parent: 15bb5098ad
Commit: 44b58b8665
2 changed files with 8 additions and 15 deletions


```diff
@@ -86,7 +86,8 @@ async def test_models(model: str, tp_size: int) -> None:
         str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
         "40000", "--max-num-seqs", "400", "--trust-remote-code",
         "--gpu-memory-utilization", "0.8", "--additional-config",
-        '{"ascend_scheduler_config":{"enabled":false}}'
+        '{"ascend_scheduler_config":{"enabled":false}}',
+        "--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
     ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
```
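
For context, the new flag carries a JSON payload that the server parses into its compilation config. Below is a minimal sketch of how a test could assemble and sanity-check these arguments before launching the server; the `build_server_args` helper is hypothetical and not part of this PR, but the flag values mirror the diff above:

```python
import json


def build_server_args(port: int, tp_size: int) -> list[str]:
    """Assemble CLI args for the multimodal nightly server (hypothetical helper)."""
    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
    args = [
        "--no-enable-prefix-caching", "--disable-mm-preprocessor-cache",
        "--tensor-parallel-size", str(tp_size),
        "--port", str(port),
        "--max-model-len", "30000",
        "--max-num-batched-tokens", "40000",
        "--max-num-seqs", "400",
        "--trust-remote-code",
        "--gpu-memory-utilization", "0.8",
        "--compilation_config", json.dumps(compilation_config),
    ]
    # Sanity-check: the JSON payload must round-trip cleanly so the
    # server can parse the cudagraph mode out of it.
    assert json.loads(args[-1])["cudagraph_mode"] == "FULL_DECODE_ONLY"
    return args
```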


```diff
@@ -72,21 +72,13 @@ async def test_models(model: str, tp_size: int) -> None:
         "HCCL_OP_EXPANSION_MODE": "AIV"
     }
     server_args = [
-        "--no-enable-prefix-caching",
-        "--disable-mm-preprocessor-cache",
+        "--no-enable-prefix-caching", "--disable-mm-preprocessor-cache",
         "--tensor-parallel-size",
-        str(tp_size),
-        "--port",
-        str(port),
-        "--max-model-len",
-        "30000",
-        "--max-num-batched-tokens",
-        "40000",
-        "--max-num-seqs",
-        "400",
-        "--trust-remote-code",
-        "--gpu-memory-utilization",
-        "0.8",
+        str(tp_size), "--port",
+        str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
+        "40000", "--max-num-seqs", "400", "--trust-remote-code",
+        "--gpu-memory-utilization", "0.8", "--compilation_config",
+        '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
     ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
```
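
The hunks do not show how `server_args` is consumed. A rough sketch of the surrounding harness, assuming a plain `vllm serve` subprocess launch with a `/health` readiness poll (the model name, timeout, and polling logic here are illustrative, not taken from the test):

```python
import subprocess
import time
import urllib.request


def start_server(model: str, server_args: list[str], port: int) -> subprocess.Popen:
    """Launch `vllm serve` with the test's args and wait until it answers (sketch)."""
    proc = subprocess.Popen(["vllm", "serve", model, *server_args])
    deadline = time.time() + 600  # large multimodal models can take a while to load
    while time.time() < deadline:
        try:
            # The OpenAI-compatible server exposes a /health endpoint once ready.
            urllib.request.urlopen(f"http://localhost:{port}/health", timeout=5)
            return proc
        except Exception:
            time.sleep(5)
    proc.terminate()
    raise RuntimeError("server did not become ready in time")
```

Running with `FULL_DECODE_ONLY` captures graphs only for decode batches, so the nightly run exercises the graph-capture path without paying the capture cost for prefill shapes.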