# Production Metrics

SGLang exposes the following metrics via Prometheus. The metrics are namespaced by `$name` (the model name).

An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](../examples/monitoring/grafana.json).

Here is an example of the metrics:

```
$ curl http://localhost:30000/metrics

# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
# TYPE sglang:prompt_tokens_total counter
sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0
# HELP sglang:generation_tokens_total Number of generation tokens processed.
# TYPE sglang:generation_tokens_total counter
sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.0
# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE sglang:time_to_first_token_seconds histogram
sglang:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.30457592010498047
sglang:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.06",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.08",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.25",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="7.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="25.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
# HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds
# TYPE sglang:e2e_request_latency_seconds histogram
sglang:e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.30521273612976074
sglang:e2e_request_latency_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:e2e_request_latency_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="0.8",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="1.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="2.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="40.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="50.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="60.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE sglang:time_per_output_token_seconds histogram
sglang:time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0381757915019989
sglang:time_per_output_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_per_output_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_per_output_token_seconds_bucket{le="0.015",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_per_output_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_per_output_token_seconds_bucket{le="0.025",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_per_output_token_seconds_bucket{le="0.03",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_per_output_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.05",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.075",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.15",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.2",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.4",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
# HELP sglang:func_latency_seconds Function latency in seconds
# TYPE sglang:func_latency_seconds histogram
sglang:func_latency_seconds_sum{name="generate_request"} 0.3061351010110229
sglang:func_latency_seconds_bucket{le="0.05",name="generate_request"} 0.0
sglang:func_latency_seconds_bucket{le="0.07500000000000001",name="generate_request"} 0.0
sglang:func_latency_seconds_bucket{le="0.1125",name="generate_request"} 0.0
sglang:func_latency_seconds_bucket{le="0.16875",name="generate_request"} 0.0
sglang:func_latency_seconds_bucket{le="0.253125",name="generate_request"} 0.0
sglang:func_latency_seconds_bucket{le="0.3796875",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="0.56953125",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="0.8542968750000001",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="1.2814453125",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="1.9221679687500002",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="2.8832519531250003",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="4.3248779296875",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="6.487316894531251",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="9.730975341796876",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="14.596463012695313",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="21.89469451904297",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="32.84204177856446",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="49.26306266784668",name="generate_request"} 1.0
sglang:func_latency_seconds_bucket{le="+Inf",name="generate_request"} 1.0
sglang:func_latency_seconds_count{name="generate_request"} 1.0
# HELP sglang:num_running_reqs The number of running requests
# TYPE sglang:num_running_reqs gauge
sglang:num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
# HELP sglang:num_used_tokens The number of used tokens
# TYPE sglang:num_used_tokens gauge
sglang:num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
# HELP sglang:gen_throughput The generate throughput (token/s)
# TYPE sglang:gen_throughput gauge
sglang:gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
# HELP sglang:num_queue_reqs The number of requests in the waiting queue
# TYPE sglang:num_queue_reqs gauge
sglang:num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
# HELP sglang:token_usage The token usage
# TYPE sglang:token_usage gauge
sglang:token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
# HELP sglang:cache_hit_rate The cache hit rate
# TYPE sglang:cache_hit_rate gauge
sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
```

## Setup Guide

To set up a monitoring dashboard, you can use the following Docker Compose file: [examples/monitoring/docker-compose.yaml](../examples/monitoring/docker-compose.yaml).

Assume you have an SGLang server running at `localhost:30000`.

To start the monitoring dashboard (Prometheus + Grafana), `cd` to `examples/monitoring` and run:

```bash
docker compose -f docker-compose.yaml -p monitoring up
```

Then you can access the Grafana dashboard at http://localhost:3000.

### Grafana Dashboard

To import the Grafana dashboard, click `+` -> `Import` -> `Upload JSON file` -> `Upload` and select [grafana.json](../examples/monitoring/grafana.json).