Log if cuda graph is used & extend cuda graph capture to cuda-graph-max-bs (#6201)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
Lianmin Zheng
2025-05-12 00:17:33 -07:00
committed by GitHub
parent 7d3a3d4510
commit fba8eccd7e
27 changed files with 293 additions and 121 deletions

View File

@@ -395,12 +395,12 @@ def popen_launch_server(
other_args: list[str] = (),
env: Optional[dict] = None,
return_stdout_stderr: Optional[tuple] = None,
-pd_seperated: bool = False,
+pd_separated: bool = False,
):
_, host, port = base_url.split(":")
host = host[2:]
-if pd_seperated:
+if pd_separated:
command = "sglang.launch_pd_server"
else:
command = "sglang.launch_server"
@@ -414,7 +414,7 @@ def popen_launch_server(
*[str(x) for x in other_args],
]
-if pd_seperated:
+if pd_separated:
command.extend(
[
"--lb-host",
@@ -656,7 +656,7 @@ def get_benchmark_args(
disable_stream=False,
disable_ignore_eos=False,
seed: int = 0,
-pd_seperated: bool = False,
+pd_separated: bool = False,
):
return SimpleNamespace(
backend="sglang",
@@ -686,7 +686,7 @@ def get_benchmark_args(
profile=None,
lora_name=None,
prompt_suffix="",
-pd_seperated=pd_seperated,
+pd_separated=pd_separated,
)
@@ -750,7 +750,7 @@ def run_bench_serving_multi(
other_server_args,
benchmark_args,
need_warmup=False,
-pd_seperated=False,
+pd_separated=False,
):
# Launch the server
process = popen_launch_server(
@@ -758,7 +758,7 @@ def run_bench_serving_multi(
base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=other_server_args,
-pd_seperated=pd_seperated,
+pd_separated=pd_separated,
)
# run benchmark for all