From 8f157893141ea24ebb581c9e48c27a8eeb9b81fb Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Fri, 10 Jan 2025 07:30:44 -0800 Subject: [PATCH] Add more metrics to serving benchmark. (#2819) --- python/sglang/bench_serving.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 4744ad338..941507705 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -514,6 +514,8 @@ class BenchmarkMetrics: p99_itl_ms: float mean_e2e_latency_ms: float median_e2e_latency_ms: float + std_e2e_latency_ms: float + p99_e2e_latency_ms: float SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json" @@ -873,6 +875,8 @@ def calculate_metrics( p99_itl_ms=np.percentile(itls or 0, 99) * 1000, mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000, median_e2e_latency_ms=np.median(e2e_latencies) * 1000, + std_e2e_latency_ms=np.std(e2e_latencies) * 1000, + p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000, ) return metrics, output_lens @@ -1064,10 +1068,20 @@ async def benchmark( "total_output_tokens_retokenized": metrics.total_output_retokenized, "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms, "median_e2e_latency_ms": metrics.median_e2e_latency_ms, + "std_e2e_latency_ms": metrics.std_e2e_latency_ms, + "p99_e2e_latency_ms": metrics.p99_e2e_latency_ms, "mean_ttft_ms": metrics.mean_ttft_ms, "median_ttft_ms": metrics.median_ttft_ms, + "std_ttft_ms": metrics.std_ttft_ms, + "p99_ttft_ms": metrics.p99_ttft_ms, + "mean_tpot_ms": metrics.mean_tpot_ms, + "median_tpot_ms": metrics.median_tpot_ms, + "std_tpot_ms": metrics.std_tpot_ms, + "p99_tpot_ms": metrics.p99_tpot_ms, "mean_itl_ms": metrics.mean_itl_ms, "median_itl_ms": metrics.median_itl_ms, + "std_itl_ms": metrics.std_itl_ms, + "p99_itl_ms": metrics.p99_itl_ms, "input_throughput": metrics.input_throughput, "output_throughput": metrics.output_throughput, "sharegpt_output_len": args.sharegpt_output_len,