bench_serving: report P95 TTFT and TPOT percentiles

Add p95_ttft_ms and p95_tpot_ms to BenchmarkMetrics, compute them in
calculate_metrics() with np.percentile(..., 95), and print them in
benchmark() together with a new "Time per Output Token (excl. 1st token)"
section. Percentile rows are printed in ascending order (P95, then P99),
matching the field order used in BenchmarkMetrics and calculate_metrics().

diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index 5e4e2c8ef..32a6d2692 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -839,10 +839,12 @@ class BenchmarkMetrics:
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
+    p95_ttft_ms: float
     p99_ttft_ms: float
     mean_tpot_ms: float
     median_tpot_ms: float
     std_tpot_ms: float
+    p95_tpot_ms: float
     p99_tpot_ms: float
     mean_itl_ms: float
     median_itl_ms: float
@@ -1665,10 +1667,12 @@ def calculate_metrics(
         * 1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
         std_ttft_ms=np.std(ttfts or 0) * 1000,
+        p95_ttft_ms=np.percentile(ttfts or 0, 95) * 1000,
         p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
         mean_tpot_ms=np.mean(tpots or 0) * 1000,
         median_tpot_ms=np.median(tpots or 0) * 1000,
         std_tpot_ms=np.std(tpots or 0) * 1000,
+        p95_tpot_ms=np.percentile(tpots or 0, 95) * 1000,
         p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
         mean_itl_ms=np.mean(itls or 0) * 1000,
         median_itl_ms=np.median(itls or 0) * 1000,
@@ -1974,6 +1978,12 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
     print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P95 TTFT (ms):", metrics.p95_ttft_ms))
     print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
+    print("{s:{c}^{n}}".format(s="Time per Output Token (excl. 1st token)", n=50, c="-"))
+    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
+    print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P95 TPOT (ms):", metrics.p95_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
     print("{s:{c}^{n}}".format(s="Inter-Token Latency", n=50, c="-"))
     print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
     print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))