diff --git a/tests/e2e/multicard/test_single_request_aclgraph.py b/tests/e2e/multicard/test_single_request_aclgraph.py
index f7ef5d3e..5172f72a 100644
--- a/tests/e2e/multicard/test_single_request_aclgraph.py
+++ b/tests/e2e/multicard/test_single_request_aclgraph.py
@@ -28,9 +28,7 @@ if vllm_version_is("0.11.0"):
 else:
     from vllm.utils.network_utils import get_open_port
 
-MODELS = [
-    "Qwen/Qwen3-30B-A3B",
-]
+MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
 
 DATA_PARALLELS = [2]
 
@@ -52,12 +50,21 @@ async def test_single_request_aclgraph(model: str, dp_size: int) -> None:
         "TASK_QUEUE_ENABLE": "1",
         "HCCL_OP_EXPANSION_MODE": "AIV",
     }
-    server_args = [
-        "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
-        "--data-parallel-size",
-        str(dp_size), "--port",
-        str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
-    ]
+    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
+        server_args = [
+            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--data-parallel-size",
+            str(dp_size), "--quantization", "ascend", "--max-model-len",
+            "1024", "--port",
+            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+        ]
+    else:
+        server_args = [
+            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--data-parallel-size",
+            str(dp_size), "--port",
+            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+        ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
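
Note (not part of the patch): the two server_args branches above differ only in the extra "--quantization ascend" and "--max-model-len 1024" flags needed for the W8A8 checkpoint. A minimal sketch of an equivalent factoring that avoids the duplication; build_server_args is a hypothetical helper name, not code from the repository:

# Hypothetical refactor sketch, not part of the patch: both branches share
# every flag except the quantization settings, so the list can be built once.
def build_server_args(model: str, dp_size: int, port: int) -> list[str]:
    args = [
        "--no-enable-prefix-caching",
        "--tensor-parallel-size", "1",
        "--data-parallel-size", str(dp_size),
    ]
    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
        # The quantized checkpoint additionally needs Ascend quantization
        # and a capped context length, per the diff above.
        args += ["--quantization", "ascend", "--max-model-len", "1024"]
    args += [
        "--port", str(port),
        "--trust-remote-code",
        "--gpu-memory-utilization", "0.9",
    ]
    return args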