[v0.11.0][Test] Add new test model for aclgraph single_request v0.11.0 (#3889)

### What this PR does / why we need it?
Add a new test model (vllm-ascend/DeepSeek-V2-Lite-W8A8) to the aclgraph single_request e2e test for v0.11.0.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
Unit tests (the updated e2e aclgraph single-request test).

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
This commit is contained in:
lilinsiman
2025-10-31 11:23:55 +08:00
committed by GitHub
parent 90aca84e60
commit ee2e55e602

View File

@@ -23,9 +23,7 @@ from vllm.utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
MODELS = [
"Qwen/Qwen3-30B-A3B",
]
MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
DATA_PARALLELS = [2]
@@ -47,12 +45,21 @@ async def test_single_request_aclgraph(model: str, dp_size: int) -> None:
"TASK_QUEUE_ENABLE": "1",
"HCCL_OP_EXPANSION_MODE": "AIV",
}
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--data-parallel-size",
str(dp_size), "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
]
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--data-parallel-size",
str(dp_size), "--quantization", "ascend", "--max-model-len",
"1024", "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
]
else:
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--data-parallel-size",
str(dp_size), "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}