Sync from v0.13
This commit is contained in:
48
examples/online_serving/dashboards/perses/README.md
Normal file
48
examples/online_serving/dashboards/perses/README.md
Normal file
@@ -0,0 +1,48 @@
|
||||
# Perses Dashboards for vLLM Monitoring
|
||||
|
||||
This directory contains Perses dashboard configurations designed to monitor vLLM
|
||||
performance and metrics.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Perses instance (standalone or via operator)
|
||||
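- Prometheus data source configured in Perses under the name `accelerators-thanos-querier-datasource` (the name referenced by every query and variable in these dashboards)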
|
||||
- vLLM deployment with Prometheus metrics enabled
|
||||
|
||||
## Dashboard Format
|
||||
|
||||
We provide dashboards in the **native Perses YAML format** that works across all
|
||||
deployment methods:
|
||||
|
||||
- **Files**: `*.yaml` (native Perses dashboard specifications)
|
||||
- **Format**: Pure dashboard specifications that work everywhere
|
||||
- **Usage**: Works with standalone Perses, API imports, CLI, and file provisioning
|
||||
- **Kubernetes**: Directly compatible with Perses Operator
|
||||
|
||||
## Dashboard Descriptions
|
||||
|
||||
- **performance_statistics.yaml**: Performance metrics with aggregated latency
|
||||
statistics
|
||||
- **query_statistics.yaml**: Query performance and deployment metrics
|
||||
|
||||
## Deployment Options
|
||||
|
||||
### Direct Import to Perses
|
||||
|
||||
Import the dashboard specifications via Perses API or CLI:
|
||||
|
||||
```bash
|
||||
percli apply -f performance_statistics.yaml
|
||||
```
|
||||
|
||||
### Perses Operator (Kubernetes)
|
||||
|
||||
The native YAML format works directly with the Perses Operator:
|
||||
|
||||
```bash
|
||||
kubectl apply -f performance_statistics.yaml -n <namespace>
|
||||
```
|
||||
|
||||
### File Provisioning
|
||||
|
||||
Place the YAML files in a Perses provisioning folder for automatic loading.
|
||||
@@ -0,0 +1,764 @@
|
||||
kind: PersesDashboard
|
||||
metadata:
|
||||
name: performance-statistics
|
||||
createdAt: 0001-01-01T00:00:00Z
|
||||
updatedAt: 0001-01-01T00:00:00Z
|
||||
version: 0
|
||||
project: ""
|
||||
spec:
|
||||
display:
|
||||
name: Performance Statistics
|
||||
|
||||
variables:
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
display:
|
||||
name: Deployment_ID
|
||||
hidden: false
|
||||
name: Deployment_id
|
||||
allowAllValue: true
|
||||
allowMultiple: true
|
||||
defaultValue:
|
||||
- $__all
|
||||
sort: alphabetical-asc
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
labelName: model_name
|
||||
matchers:
|
||||
# Any one vllm metric that always carries model_name
|
||||
- vllm:generation_tokens_total{}
|
||||
|
||||
panels:
|
||||
"1":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# avg latency by model = sum(rate(sum)) / sum(rate(count))
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
|
||||
/
|
||||
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}}'
|
||||
|
||||
"2":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (Avg)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
|
||||
/
|
||||
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
||||
|
||||
"3":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (P50)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"4":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (P90)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"5":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (P99)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"6":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
|
||||
/
|
||||
sum by (model_name) (rate(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}}'
|
||||
|
||||
"7":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (Avg)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
|
||||
/
|
||||
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
||||
|
||||
"8":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (P50)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"9":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (P90)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"10":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (P99)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"11":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (Time per Output Token) over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
|
||||
/
|
||||
sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}}'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
seriesNameFormat: '{{model_name}} p50'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
seriesNameFormat: '{{model_name}} p90'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
seriesNameFormat: '{{model_name}} p99'
|
||||
|
||||
"12":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (Avg)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
|
||||
/
|
||||
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
||||
|
||||
"13":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (P50)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"14":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (P90)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"15":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (P99)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"16":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TPS (Tokens/sec) over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}} generation'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}} prompt'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# Overall token throughput across all engines, if the metric is exposed.
# vllm:iteration_tokens_total is a histogram of tokens per engine step, so
# tokens/sec is the rate of its _sum (the _count series only counts steps).
# sum() collapses the per-engine series into the single "overall" line.
|
||||
query: >
|
||||
sum(rate(vllm:iteration_tokens_total_sum[$__interval]))
|
||||
seriesNameFormat: 'iteration overall'
|
||||
|
||||
"17":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: KV Cache Usage (avg %)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
|
||||
query: >
|
||||
100 * avg(vllm:kv_cache_usage_perc)
|
||||
|
||||
"18":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Running Requests by Pod
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (pod) (vllm:num_requests_running)
|
||||
seriesNameFormat: '{{pod}}'
|
||||
|
||||
"19":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Waiting Requests by Pod
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (pod) (vllm:num_requests_waiting)
|
||||
seriesNameFormat: '{{pod}}'
|
||||
|
||||
"20":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Running Requests (sum)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: sum(vllm:num_requests_running)
|
||||
|
||||
"21":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Waiting Requests (sum)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: sum(vllm:num_requests_waiting)
|
||||
|
||||
layouts:
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: Overview
|
||||
items:
|
||||
- x: 0
|
||||
y: 0
|
||||
width: 6
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/17' } # KV cache %
|
||||
- x: 6
|
||||
y: 0
|
||||
width: 6
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/20' } # running sum
|
||||
- x: 12
|
||||
y: 0
|
||||
width: 6
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/21' } # waiting sum
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: E2E Latency
|
||||
items:
|
||||
- x: 0
|
||||
y: 1
|
||||
width: 10
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/1' }
|
||||
- x: 10
|
||||
y: 1
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/2' }
|
||||
- x: 17
|
||||
y: 1
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/3' }
|
||||
- x: 10
|
||||
y: 4
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/4' }
|
||||
- x: 17
|
||||
y: 4
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/5' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: TTFT
|
||||
items:
|
||||
- x: 0
|
||||
y: 8
|
||||
width: 10
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/6' }
|
||||
- x: 10
|
||||
y: 8
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/7' }
|
||||
- x: 17
|
||||
y: 8
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/8' }
|
||||
- x: 10
|
||||
y: 11
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/9' }
|
||||
- x: 17
|
||||
y: 11
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/10' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: ITL (Time per Output Token)
|
||||
items:
|
||||
- x: 0
|
||||
y: 15
|
||||
width: 10
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/11' }
|
||||
- x: 10
|
||||
y: 15
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/12' }
|
||||
- x: 17
|
||||
y: 15
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/13' }
|
||||
- x: 10
|
||||
y: 18
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/14' }
|
||||
- x: 17
|
||||
y: 18
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/15' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: TPS (Prompt / Generation / Iteration)
|
||||
items:
|
||||
- x: 0
|
||||
y: 22
|
||||
width: 14
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/16' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: Per-Pod Request State
|
||||
items:
|
||||
- x: 0
|
||||
y: 28
|
||||
width: 12
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/18' }
|
||||
- x: 12
|
||||
y: 28
|
||||
width: 12
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/19' }
|
||||
|
||||
392
examples/online_serving/dashboards/perses/query_statistics.yaml
Normal file
392
examples/online_serving/dashboards/perses/query_statistics.yaml
Normal file
@@ -0,0 +1,392 @@
|
||||
kind: PersesDashboard
|
||||
metadata:
|
||||
name: query-statistics
|
||||
createdAt: 0001-01-01T00:00:00Z
|
||||
updatedAt: 0001-01-01T00:00:00Z
|
||||
version: 0
|
||||
project: ""
|
||||
spec:
|
||||
display:
|
||||
# Dashboard display title; kept in line with the file name and metadata.name
name: Query Statistics
|
||||
|
||||
variables:
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
name: NS
|
||||
display: { name: Namespace }
|
||||
allowMultiple: false
|
||||
defaultValue: llm-d
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
labelName: namespace
|
||||
matchers:
|
||||
- up{service=~".*vllm.*"}
|
||||
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
name: SVC
|
||||
display: { name: Service }
|
||||
allowMultiple: false
|
||||
defaultValue: vllm-qwen2-0-5b-sim
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
labelName: service
|
||||
matchers:
|
||||
- up{namespace="$NS",service=~".*vllm.*"}
|
||||
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
name: MODEL
|
||||
display: { name: Model (real vLLM) }
|
||||
allowAllValue: true
|
||||
allowMultiple: true
|
||||
defaultValue: ["$__all"]
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
labelName: model_name
|
||||
matchers:
|
||||
- vllm:request_success_total{namespace="$NS",service="$SVC"}
|
||||
|
||||
panels:
|
||||
|
||||
# --- Core (works on Simulator & Real) ---
|
||||
core_running_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Running Requests (now) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum(vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_waiting_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Waiting Requests (now) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum(vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_kv_usage_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache Usage (0–1) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_running_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Running Over Time }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series;
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-service one.
query: sum by (service) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_waiting_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Waiting Over Time }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series;
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-service one.
query: sum by (service) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_targets_up:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Scrape Targets Up }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: count(up{namespace="$NS",service="$SVC"} == 1) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- KV Cache as Percent (works on Simulator & Real) ---
|
||||
core_kv_usage_pct_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache Usage (%) – now }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# multiply by 100 to present percentage; omit format.unit to avoid schema conflicts
|
||||
query: (avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_kv_usage_pct_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache Usage (%) – over time }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series;
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-service one.
query: (avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- Per-Pod breakdowns (works on Simulator & Real) ---
|
||||
per_pod_running_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Running by Pod }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series;
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-pod ones.
query: sum by (pod) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
per_pod_waiting_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Waiting by Pod }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series;
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-pod ones.
query: sum by (pod) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
per_pod_kv_pct_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache (%) by Pod }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty
|
||||
# "on()" applies the zero fallback only when the query returns no series;
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-pod ones.
query: (avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- Real vLLM only (zeros on simulator) ---
|
||||
real_req_rate_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Request Rate (real vLLM) }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series (e.g. on the simulator);
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-model ones.
query: sum by (model_name) (rate(vllm:request_success_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_p50:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: p50 Latency (real vLLM) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series (e.g. on the simulator);
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-model quantile.
query: histogram_quantile(0.50, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_p90:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: p90 Latency (real vLLM) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series (e.g. on the simulator);
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-model quantile.
query: histogram_quantile(0.90, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_p99:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: p99 Latency (real vLLM) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series (e.g. on the simulator);
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-model quantile.
query: histogram_quantile(0.99, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_input_tokens_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Input Tokens / sec (real vLLM) }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series (e.g. on the simulator);
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-model ones.
query: sum by (model_name) (rate(vllm:prompt_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_output_tokens_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Output Tokens / sec (real vLLM) }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# "on()" applies the zero fallback only when the query returns no series (e.g. on the simulator);
# a bare "or vector(0)" would always add an unlabeled 0 series beside the per-model ones.
query: sum by (model_name) (rate(vllm:generation_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or on() vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
layouts:
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: Core (Sim & Real) }
|
||||
items:
|
||||
- { x: 0, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_running_now' } }
|
||||
- { x: 6, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_waiting_now' } }
|
||||
- { x: 12, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_now' } }
|
||||
- { x: 18, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_targets_up' } }
|
||||
- { x: 0, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_running_ts' } }
|
||||
- { x: 12, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_waiting_ts' } }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: KV Cache (%) }
|
||||
items:
|
||||
- { x: 0, y: 9, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_pct_now' } }
|
||||
- { x: 6, y: 9, width: 18, height: 6, content: { $ref: '#/spec/panels/core_kv_usage_pct_ts' } }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: Per-Pod breakdowns }
|
||||
items:
|
||||
- { x: 0, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_running_ts' } }
|
||||
- { x: 12, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_waiting_ts' } }
|
||||
- { x: 0, y: 21, width: 24, height: 6, content: { $ref: '#/spec/panels/per_pod_kv_pct_ts' } }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: Real vLLM only (shows 0 on simulator) }
|
||||
items:
|
||||
- { x: 0, y: 27, width: 12, height: 6, content: { $ref: '#/spec/panels/real_req_rate_ts' } }
|
||||
- { x: 12, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p50' } }
|
||||
- { x: 16, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p90' } }
|
||||
- { x: 20, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p99' } }
|
||||
- { x: 0, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_input_tokens_ts' } }
|
||||
- { x: 12, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_output_tokens_ts' } }
|
||||
|
||||
Reference in New Issue
Block a user