Log whether CUDA graph is used & extend CUDA graph capture to cuda-graph-max-bs (#6201)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
Lianmin Zheng
2025-05-12 00:17:33 -07:00
committed by GitHub
parent 7d3a3d4510
commit fba8eccd7e
27 changed files with 293 additions and 121 deletions

View File

@@ -1103,7 +1103,7 @@ async def benchmark(
lora_names: List[str],
extra_request_body: Dict[str, Any],
profile: bool,
pd_seperated: bool = False,
pd_separated: bool = False,
flush_cache: bool = False,
warmup_requests: int = 1,
):
@@ -1239,12 +1239,14 @@ async def benchmark(
if "sglang" in backend:
server_info = requests.get(base_url + "/get_server_info")
if pd_seperated:
accept_length = server_info.json()["decode"][0].get(
if pd_separated:
accept_length = server_info.json()["decode"][0]["internal_states"][0].get(
"avg_spec_accept_length", None
)
else:
accept_length = server_info.json().get("avg_spec_accept_length", None)
accept_length = server_info.json()["internal_states"][0].get(
"avg_spec_accept_length", None
)
else:
accept_length = None
@@ -1541,7 +1543,7 @@ def run_benchmark(args_: argparse.Namespace):
lora_names=args.lora_name,
extra_request_body=extra_request_body,
profile=args.profile,
pd_seperated=args.pd_seperated,
pd_separated=args.pd_separated,
flush_cache=args.flush_cache,
)
)