From a7000a765041cf870bb9964ee533dd0fb7cebcdf Mon Sep 17 00:00:00 2001 From: Yudi Xue <10211+binarycrayon@users.noreply.github.com> Date: Mon, 3 Mar 2025 05:03:58 -0800 Subject: [PATCH] Update metrics documentation (#3264) --- docs/references/production_metrics.md | 212 ++-- examples/monitoring/grafana.json | 1394 ++++++------------------- 2 files changed, 443 insertions(+), 1163 deletions(-) diff --git a/docs/references/production_metrics.md b/docs/references/production_metrics.md index 20a34e54b..7183158e1 100644 --- a/docs/references/production_metrics.md +++ b/docs/references/production_metrics.md @@ -8,129 +8,133 @@ Here is an example of the metrics: ``` $ curl http://localhost:30000/metrics - # HELP sglang:prompt_tokens_total Number of prefill tokens processed. # TYPE sglang:prompt_tokens_total counter -sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0 +sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.128902e+06 # HELP sglang:generation_tokens_total Number of generation tokens processed. # TYPE sglang:generation_tokens_total counter -sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.0 +sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.557572e+06 +# HELP sglang:token_usage The token usage +# TYPE sglang:token_usage gauge +sglang:token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.28 +# HELP sglang:cache_hit_rate The cache hit rate +# TYPE sglang:cache_hit_rate gauge +sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.007507552643049313 # HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds. 
# TYPE sglang:time_to_first_token_seconds histogram -sglang:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.30457592010498047 +sglang:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2.3518979474117756e+06 sglang:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 sglang:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 sglang:time_to_first_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 sglang:time_to_first_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_to_first_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_to_first_token_seconds_bucket{le="0.06",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_to_first_token_seconds_bucket{le="0.08",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_to_first_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_to_first_token_seconds_bucket{le="0.25",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_to_first_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="7.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 
-sglang:time_to_first_token_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="25.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 +sglang:time_to_first_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 +sglang:time_to_first_token_seconds_bucket{le="0.06",model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.0 +sglang:time_to_first_token_seconds_bucket{le="0.08",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:time_to_first_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:time_to_first_token_seconds_bucket{le="0.25",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:time_to_first_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:time_to_first_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:time_to_first_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 27.0 +sglang:time_to_first_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 140.0 +sglang:time_to_first_token_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 314.0 +sglang:time_to_first_token_seconds_bucket{le="7.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 941.0 +sglang:time_to_first_token_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1330.0 +sglang:time_to_first_token_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1970.0 
+sglang:time_to_first_token_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2326.0 +sglang:time_to_first_token_seconds_bucket{le="25.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2417.0 +sglang:time_to_first_token_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2513.0 +sglang:time_to_first_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0 +sglang:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0 # HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds # TYPE sglang:e2e_request_latency_seconds histogram -sglang:e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.30521273612976074 +sglang:e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.116093850019932e+06 sglang:e2e_request_latency_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:e2e_request_latency_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="0.8",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="1.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="2.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 
-sglang:e2e_request_latency_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="40.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="50.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="60.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 +sglang:e2e_request_latency_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:e2e_request_latency_seconds_bucket{le="0.8",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:e2e_request_latency_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:e2e_request_latency_seconds_bucket{le="1.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:e2e_request_latency_seconds_bucket{le="2.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:e2e_request_latency_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0 +sglang:e2e_request_latency_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0 +sglang:e2e_request_latency_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 10.0 +sglang:e2e_request_latency_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11.0 +sglang:e2e_request_latency_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 14.0 +sglang:e2e_request_latency_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 247.0 +sglang:e2e_request_latency_seconds_bucket{le="40.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 486.0 
+sglang:e2e_request_latency_seconds_bucket{le="50.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 845.0 +sglang:e2e_request_latency_seconds_bucket{le="60.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1513.0 +sglang:e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0 +sglang:e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0 # HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds. # TYPE sglang:time_per_output_token_seconds histogram -sglang:time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0381757915019989 -sglang:time_per_output_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_per_output_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_per_output_token_seconds_bucket{le="0.015",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_per_output_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_per_output_token_seconds_bucket{le="0.025",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_per_output_token_seconds_bucket{le="0.03",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -sglang:time_per_output_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="0.05",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="0.075",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="0.15",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="0.2",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 
-sglang:time_per_output_token_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="0.4",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 -sglang:time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 +sglang:time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 866964.5791549598 +sglang:time_per_output_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0 +sglang:time_per_output_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 73.0 +sglang:time_per_output_token_seconds_bucket{le="0.015",model_name="meta-llama/Llama-3.1-8B-Instruct"} 382.0 +sglang:time_per_output_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 593.0 +sglang:time_per_output_token_seconds_bucket{le="0.025",model_name="meta-llama/Llama-3.1-8B-Instruct"} 855.0 +sglang:time_per_output_token_seconds_bucket{le="0.03",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1035.0 +sglang:time_per_output_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1815.0 +sglang:time_per_output_token_seconds_bucket{le="0.05",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11685.0 +sglang:time_per_output_token_seconds_bucket{le="0.075",model_name="meta-llama/Llama-3.1-8B-Instruct"} 433413.0 
+sglang:time_per_output_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 4.950195e+06 +sglang:time_per_output_token_seconds_bucket{le="0.15",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.039435e+06 +sglang:time_per_output_token_seconds_bucket{le="0.2",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.171662e+06 +sglang:time_per_output_token_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.266055e+06 +sglang:time_per_output_token_seconds_bucket{le="0.4",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.296752e+06 +sglang:time_per_output_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.312226e+06 +sglang:time_per_output_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.339675e+06 +sglang:time_per_output_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.357747e+06 +sglang:time_per_output_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.389414e+06 +sglang:time_per_output_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06 +sglang:time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06 # HELP sglang:func_latency_seconds Function latency in seconds # TYPE sglang:func_latency_seconds histogram -sglang:func_latency_seconds_sum{name="generate_request"} 0.3061351010110229 -sglang:func_latency_seconds_bucket{le="0.05",name="generate_request"} 0.0 -sglang:func_latency_seconds_bucket{le="0.07500000000000001",name="generate_request"} 0.0 -sglang:func_latency_seconds_bucket{le="0.1125",name="generate_request"} 0.0 -sglang:func_latency_seconds_bucket{le="0.16875",name="generate_request"} 0.0 -sglang:func_latency_seconds_bucket{le="0.253125",name="generate_request"} 0.0 -sglang:func_latency_seconds_bucket{le="0.3796875",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="0.56953125",name="generate_request"} 1.0 
-sglang:func_latency_seconds_bucket{le="0.8542968750000001",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="1.2814453125",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="1.9221679687500002",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="2.8832519531250003",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="4.3248779296875",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="6.487316894531251",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="9.730975341796876",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="14.596463012695313",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="21.89469451904297",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="32.84204177856446",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="49.26306266784668",name="generate_request"} 1.0 -sglang:func_latency_seconds_bucket{le="+Inf",name="generate_request"} 1.0 -sglang:func_latency_seconds_count{name="generate_request"} 1.0 +sglang:func_latency_seconds_sum{name="generate_request"} 4.514771912145079 +sglang:func_latency_seconds_bucket{le="0.05",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="0.07500000000000001",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="0.1125",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="0.16875",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="0.253125",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="0.3796875",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="0.56953125",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="0.8542968750000001",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="1.2814453125",name="generate_request"} 14006.0 
+sglang:func_latency_seconds_bucket{le="1.9221679687500002",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="2.8832519531250003",name="generate_request"} 14006.0 +sglang:func_latency_seconds_bucket{le="4.3248779296875",name="generate_request"} 14007.0 +sglang:func_latency_seconds_bucket{le="6.487316894531251",name="generate_request"} 14007.0 +sglang:func_latency_seconds_bucket{le="9.730975341796876",name="generate_request"} 14007.0 +sglang:func_latency_seconds_bucket{le="14.596463012695313",name="generate_request"} 14007.0 +sglang:func_latency_seconds_bucket{le="21.89469451904297",name="generate_request"} 14007.0 +sglang:func_latency_seconds_bucket{le="32.84204177856446",name="generate_request"} 14007.0 +sglang:func_latency_seconds_bucket{le="49.26306266784668",name="generate_request"} 14007.0 +sglang:func_latency_seconds_bucket{le="+Inf",name="generate_request"} 14007.0 +sglang:func_latency_seconds_count{name="generate_request"} 14007.0 # HELP sglang:num_running_reqs The number of running requests # TYPE sglang:num_running_reqs gauge -sglang:num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 +sglang:num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 162.0 # HELP sglang:num_used_tokens The number of used tokens # TYPE sglang:num_used_tokens gauge -sglang:num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 +sglang:num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct"} 123859.0 # HELP sglang:gen_throughput The generate throughput (token/s) # TYPE sglang:gen_throughput gauge -sglang:gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 +sglang:gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct"} 86.50814177726902 # HELP sglang:num_queue_reqs The number of requests in the waiting queue # TYPE sglang:num_queue_reqs gauge -sglang:num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0 -# HELP sglang:token_usage The token usage -# TYPE sglang:token_usage gauge 
-sglang:token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
-# HELP sglang:cache_hit_rate The cache hit rate
-# TYPE sglang:cache_hit_rate gauge
-sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2826.0
 ```
 
 ## Setup Guide
 
 To setup a monitoring dashboard, you can use the following docker compose file: [examples/monitoring/docker-compose.yaml](../examples/monitoring/docker-compose.yaml).
 
-Assume you have sglang server running at `localhost:30000`.
+Assume you have an sglang server running at `localhost:30000`. To start the server, ensure you have the `--enable-metrics` flag enabled:
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+--port 30000 --host 0.0.0.0 --enable-metrics
+```
 
 To start the monitoring dashboard (prometheus + grafana), cd to `examples/monitoring` and run:
 
@@ -142,4 +146,28 @@ Then you can access the Grafana dashboard at http://localhost:3000.
 
 ### Grafana Dashboard
 
+In a new Grafana setup, ensure that you have the `Prometheus` data source enabled. To check that, go to `http://localhost:3000/connections/datasources` and ensure that `Prometheus` is enabled.
+
+If not, click `Add data source` -> `Prometheus`, set Prometheus URL to `http://localhost:9090`, and click `Save & Test`.
+
 To import the Grafana dashboard, click `+` -> `Import` -> `Upload JSON file` -> `Upload` and select [grafana.json](../examples/monitoring/grafana.json).
+
+### Troubleshooting
+
+#### Check if the variables are created
+
+The example dashboard assumes you have the following variables available:
+- `model_name` (name: `model_name`, label: `model name`, Data source: `Prometheus`, Type: `Label values`)
+- `instance` (name: `instance`, label: `instance`, Data source: `Prometheus`, Type: `Label values`)
+
+If you don't have these variables, you can create them manually.
+ +To create a variable, go to dashboard settings, `Variables` -> `New variable`. + +You should be able to see the preview the values (e.g. `meta-llama/Llama-3.1-8B-Instruct` for `model_name`). + +#### Check if the metrics are being collected + +Run `python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5` to generate some requests. + +Then you should be able to see the metrics in the Grafana dashboard. diff --git a/examples/monitoring/grafana.json b/examples/monitoring/grafana.json index e7d436de2..871bcf822 100644 --- a/examples/monitoring/grafana.json +++ b/examples/monitoring/grafana.json @@ -25,432 +25,7 @@ "datasource": { "default": true, "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "description": "max-running-requests from server argument", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 0, - "y": 0 - }, - "id": 2, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "last" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sglang:max_running_requests{name=\"$name\", instance=\"$instance\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Max Running Requests", - "type": "stat" - }, - { - "datasource": { - 
"default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "description": "Supported context length with loaded model", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 3, - "y": 0 - }, - "id": 1, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "last" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sglang:context_len{instance=\"$instance\", name=\"$name\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Max Context Length", - "type": "stat" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "description": "max_total_tokens", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 6, - "y": 0 - }, - "id": 4, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "last" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.0", - 
"targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sglang:max_total_num_tokens{instance=\"$instance\", name=\"$name\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Max Total Num Tokens", - "type": "stat" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "description": "max_prefill_tokens from server args", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 9, - "y": 0 - }, - "id": 3, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "last" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sglang:max_prefill_tokens{instance=\"$instance\", name=\"$name\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Max Prefill Tokens", - "type": "stat" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - 
"overrides": [] - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 12, - "y": 0 - }, - "id": 6, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sglang:cached_token{instance=\"$instance\", name=\"$name\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{__name__}}", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Cached Tokens", - "type": "stat" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 18, - "y": 0 - }, - "id": 5, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sglang:cache_hit_rate{instance=\"$instance\", name=\"$name\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{__name__}}", - "range": 
true, - "refId": "A", - "useBackend": false - } - ], - "title": "Cache Hit Rate (%)", - "type": "stat" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" + "uid": "aeboq3sqk89vkd" }, "fieldConfig": { "defaults": { @@ -511,7 +86,7 @@ "h": 8, "w": 12, "x": 0, - "y": 3 + "y": 0 }, "id": 14, "options": { @@ -534,7 +109,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le) (rate(sglang:e2e_request_latency_seconds_bucket[$__rate_interval])))\r\n", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -550,7 +125,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "histogram_quantile(0.9, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by (le) (rate(sglang:e2e_request_latency_seconds_bucket[$__rate_interval])))\r\n", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -566,8 +141,8 @@ "uid": "ddyfngn31dg5cf" }, "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(sglang:e2e_request_latency_seconds_bucket[$__rate_interval])))\r\n", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -583,8 +158,8 @@ "uid": "ddyfngn31dg5cf" }, "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "editorMode": "code", + "expr": 
"avg(rate(sglang:e2e_request_latency_seconds_sum[$__rate_interval]) / rate(sglang:e2e_request_latency_seconds_count[$__rate_interval]))\r\n", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -593,85 +168,28 @@ "range": true, "refId": "D", "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "rate(sglang:e2e_request_latency_seconds_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval]) / rate(sglang:e2e_request_latency_seconds_count[$__rate_interval])", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Average", - "range": true, - "refId": "E", - "useBackend": false } ], - "title": "E2E Request Latency (S)", + "title": "End-to-End Request Latency", "type": "timeseries" }, { "datasource": { "default": true, "type": "prometheus", - "uid": "ee2vha8w6f5kwf" + "uid": "aeboq3sqk89vkd" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, "scaleDistribution": { "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] } }, "overrides": [] @@ -680,43 +198,70 @@ "h": 8, "w": 12, "x": 12, - "y": 3 + "y": 0 }, - "id": 18, + "id": 17, "options": { + "calculate": true, + 
"cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "show": true + }, + "rowsFrame": { + "layout": "auto" }, "tooltip": { "mode": "single", - "sort": "none" + "showColorScale": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false } }, + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "ddyfngn31dg5cf" }, + "disableTextWrap": false, "editorMode": "code", - "expr": "sglang:gen_throughput{instance=\"$instance\", name=\"$name\"}", + "expr": "rate(sglang:e2e_request_latency_seconds_bucket{model_name=~\"$model_name\"}[$__rate_interval])\r\n", + "fullMetaSearch": false, + "includeNullMetadata": true, "instant": false, - "legendFormat": "__auto", + "legendFormat": "{{le}}", "range": true, - "refId": "A" + "refId": "A", + "useBackend": false } ], - "title": "Generation Throughput (Token / S)", - "type": "timeseries" + "title": "End-to-End Request Latency Heatmap", + "type": "heatmap" }, { "datasource": { "default": true, "type": "prometheus", - "uid": "ee2vha8w6f5kwf" + "uid": "aeboq3sqk89vkd" }, "fieldConfig": { "defaults": { @@ -777,7 +322,244 @@ "h": 8, "w": 12, "x": 0, - "y": 11 + "y": 8 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(sglang:time_to_first_token_seconds_bucket[$__rate_interval])))\r\n", + "fullMetaSearch": false, + 
"includeNullMetadata": true, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(sglang:time_to_first_token_seconds_bucket[$__rate_interval])))\r\n", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(sglang:time_to_first_token_seconds_bucket[$__rate_interval])))\r\n", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(sglang:time_to_first_token_seconds_bucket[$__rate_interval])))\r\n", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + } + ], + "title": "Time-To-First-Token Seconds", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "aeboq3sqk89vkd" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 19, + "options": { + "calculate": true, + "cellGap": 1, + "color": { + "exponent": 0.5, + 
"fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(sglang:time_to_first_token_seconds_bucket{model_name=~\"$model_name\"}[$__rate_interval])\r\n", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Time-To-First-Token Seconds Heatmap", + "type": "heatmap" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "aeboq3sqk89vkd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 }, "id": 7, "options": { @@ -800,24 +582,25 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "sglang:num_requests_running{instance=\"$instance\", name=\"$name\"}", + "expr": "sglang:num_running_reqs", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "{{__name__}}", + "interval": "", + "legendFormat": "{{instance}}", "range": true, "refId": "A", "useBackend": false } ], - "title": "Num Requests Running", + "title": "Num Running Requests", "type": "timeseries" }, { "datasource": { "default": true, "type": "prometheus", - "uid": "ee2vha8w6f5kwf" + "uid": "aeboq3sqk89vkd" }, "fieldConfig": { "defaults": { @@ -878,280 +661,9 @@ "h": 8, "w": 12, "x": 12, - "y": 11 + "y": 16 }, - "id": 8, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sglang:num_requests_waiting{instance=\"$instance\", name=\"$name\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{__name__}}", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Number of Requests Waiting", - "type": "timeseries" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - 
"lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 19 - }, - "id": 16, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "histogram_quantile(0.99, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "P99", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "histogram_quantile(0.9, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "P90", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "histogram_quantile(0.95, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "P95", - "range": true, - 
"refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "histogram_quantile(0.5, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "P50", - "range": true, - "refId": "D", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "rate(sglang:e2e_request_latency_seconds_sum{name=\"$name\"}[$__rate_interval]) / rate(sglang:e2e_request_latency_seconds_count{name=\"$name\"}[$__rate_interval])", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Average", - "range": true, - "refId": "E", - "useBackend": false - } - ], - "title": "Time Request Decoding (S)", - "type": "timeseries" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "description": "Time requests waiting before added to batch", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": 
"green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 19 - }, - "id": 15, + "id": 18, "options": { "legend": { "calcs": [], @@ -1171,73 +683,21 @@ "uid": "ddyfngn31dg5cf" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", + "expr": "sglang:gen_throughput", "instant": false, - "legendFormat": "P99", + "legendFormat": "{{instance}}", "range": true, "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "editorMode": "code", - "expr": "histogram_quantile(0.95, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", - "hide": false, - "instant": false, - "legendFormat": "P95", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "editorMode": "code", - "expr": "histogram_quantile(0.9, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", - "hide": false, - "instant": false, - "legendFormat": "P90", - "range": true, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "editorMode": "code", - "expr": "histogram_quantile(0.5, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", - "hide": false, - "instant": false, - "legendFormat": "P50", - "range": true, - "refId": "D" - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "editorMode": "code", - "expr": "rate(sglang:waiting_request_latency_seconds_sum{name=\"$name\"}[$__rate_interval])\r\n/\r\nrate(sglang:waiting_request_latency_seconds_count{name=\"$name\"}[$__rate_interval])", - "hide": false, - "instant": false, - "legendFormat": "Average", - "range": true, - "refId": "E" } ], - 
"title": "Time Request Waiting (S)", + "title": "Token Generation Throughput (Tokens / S)", "type": "timeseries" }, { "datasource": { "default": true, "type": "prometheus", - "uid": "ee2vha8w6f5kwf" + "uid": "aeboq3sqk89vkd" }, "fieldConfig": { "defaults": { @@ -1298,7 +758,7 @@ "h": 8, "w": 12, "x": 0, - "y": 27 + "y": 24 }, "id": 11, "options": { @@ -1321,41 +781,24 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "sum(rate(sglang:request_prompt_tokens_sum{instance=\"$instance\", name=\"$name\"}[$__rate_interval])) by (instance, name)", + "expr": "sglang:cache_hit_rate", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "{{__name__}}", + "legendFormat": "{{instance}}", "range": true, "refId": "A", "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false } ], - "title": "Prompt Tokens", + "title": "Cache Hit Rate", "type": "timeseries" }, { "datasource": { "default": true, "type": "prometheus", - "uid": "ee2vha8w6f5kwf" + "uid": "aeboq3sqk89vkd" }, "fieldConfig": { "defaults": { @@ -1416,9 +859,9 @@ "h": 8, "w": 12, "x": 12, - "y": 27 + "y": 24 }, - "id": 17, + "id": 8, "options": { "legend": { "calcs": [], @@ -1439,213 +882,18 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "sum(rate(sglang:request_generation_tokens_sum{instance=\"$instance\", name=\"$name\"}[$__rate_interval])) by (instance, name)", + "expr": "sglang:num_queue_reqs", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "{{__name__}}", + "legendFormat": "{{instance}}", "range": true, "refId": "A", "useBackend": false } ], - "title": "Generated Tokens", + "title": "Number Queued Requests", "type": 
"timeseries" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 35 - }, - "id": 13, - "options": { - "calculate": false, - "calculation": { - "yBuckets": { - "scale": { - "log": 2, - "type": "log" - } - } - }, - "cellGap": 1, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Oranges", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": false - }, - "yAxis": { - "axisPlacement": "left", - "reverse": false - } - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sum by(le) (increase(sglang:request_prompt_tokens_bucket{name=\"$name\", instance=\"$instance\"}[$__rate_interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{__name__}}", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Request Prompt Tokens", - "type": "heatmap" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "ee2vha8w6f5kwf" - }, - "description": "", - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 35 - }, - "id": 12, - "options": { - "calculate": false, - 
"calculation": { - "xBuckets": { - "mode": "size", - "value": "" - }, - "yBuckets": { - "mode": "size", - "scale": { - "log": 2, - "type": "log" - }, - "value": "" - } - }, - "cellGap": 1, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "min": 0, - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Spectral", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto", - "value": "Request count" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisLabel": "Generation Length", - "axisPlacement": "left", - "reverse": false, - "unit": "none" - } - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "ddyfngn31dg5cf" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sum by(le) (increase(sglang:request_generation_tokens_bucket{name=\"$name\", instance=\"$instance\"}[$__rate_interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{__name__}}", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Request Generation Tokens", - "type": "heatmap" } ], "refresh": "5s", @@ -1661,7 +909,7 @@ }, "datasource": { "type": "prometheus", - "uid": "ddyfngn31dg5cf" + "uid": "aeboq3sqk89vkd" }, "definition": "label_values(instance)", "hide": 0, @@ -1683,20 +931,24 @@ }, { "current": { - "selected": true, - "text": "google/gemma-2-9b-it", - "value": "google/gemma-2-9b-it" + "selected": false, + "text": "meta-llama/Llama-3.1-8B-Instruct", + "value": "meta-llama/Llama-3.1-8B-Instruct" }, - "definition": "label_values(name)", - "hide": 1, + "datasource": { + "type": "prometheus", + "uid": "aeboq3sqk89vkd" + }, + "definition": "label_values(model_name)", + "hide": 0, "includeAll": false, - "label": "name", + "label": "model name", 
"multi": false, - "name": "name", + "name": "model_name", "options": [], "query": { "qryType": 1, - "query": "label_values(name)", + "query": "label_values(model_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1715,6 +967,6 @@ "timezone": "browser", "title": "SGLang Dashboard", "uid": "ddyp55uq7brpcc", - "version": 3, + "version": 5, "weekStart": "" }