Provide an argument to set the maximum batch size for CUDA graphs (#1809)

This commit is contained in:
Lianmin Zheng
2024-10-26 15:09:33 -07:00
committed by GitHub
parent 9d6fb08457
commit 2b80978859
4 changed files with 25 additions and 10 deletions

View File

@@ -120,6 +120,7 @@ class ServerArgs:
enable_mixed_chunk: bool = False
enable_torch_compile: bool = False
max_torch_compile_bs: int = 32
max_cuda_graph_bs: int = 160
torchao_config: str = ""
enable_p2p_check: bool = False
triton_attention_reduce_in_fp32: bool = False
@@ -624,6 +625,12 @@ class ServerArgs:
default=ServerArgs.max_torch_compile_bs,
help="Set the maximum batch size when using torch compile.",
)
parser.add_argument(
"--max-cuda-graph-bs",
type=int,
default=ServerArgs.max_cuda_graph_bs,
help="Set the maximum batch size for cuda graph.",
)
parser.add_argument(
"--torchao-config",
type=str,