Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/examples/online_serving/dashboards/perses/README.md
+++ b/examples/online_serving/dashboards/perses/README.md
@@ -0,0 +1,48 @@
+# Perses Dashboards for vLLM Monitoring
+
+This directory contains Perses dashboard configurations designed to monitor vLLM
+performance and metrics.
+
+## Requirements
+
+- Perses instance (standalone or via operator)
+- Prometheus data source configured in Perses (the dashboards reference one named `accelerators-thanos-querier-datasource`)
+- vLLM deployment with Prometheus metrics enabled
+
+## Dashboard Format
+
+We provide dashboards in the **native Perses YAML format** that works across all
+deployment methods:
+
+- **Files**: `*.yaml` (native Perses dashboard specifications)
+- **Format**: Pure dashboard specifications that work everywhere
+- **Usage**: Works with standalone Perses, API imports, CLI, and file provisioning
+- **Kubernetes**: Directly compatible with Perses Operator
+
+## Dashboard Descriptions
+
+- **performance_statistics.yaml**: Performance metrics with aggregated latency
+  statistics
+- **query_statistics.yaml**: Query performance and deployment metrics
+
+## Deployment Options
+
+### Direct Import to Perses
+
+Import the dashboard specifications via the Perses API or CLI (repeat for each dashboard file):
+
+```bash
+percli apply -f performance_statistics.yaml
+```
+
+### Perses Operator (Kubernetes)
+
+The native YAML format works directly with the Perses Operator:
+
+```bash
+kubectl apply -f performance_statistics.yaml -n <namespace>
+```
+
+### File Provisioning
+
+Place the YAML files in a Perses provisioning folder for automatic loading.
--- a/examples/online_serving/dashboards/perses/performance_statistics.yaml
+++ b/examples/online_serving/dashboards/perses/performance_statistics.yaml
@@ -0,0 +1,764 @@
+kind: PersesDashboard  # vLLM latency, throughput and queue-depth dashboard
+metadata:
+  name: performance-statistics
+  createdAt: "0001-01-01T00:00:00Z"  # quoted so loaders keep a string, not a timestamp
+  updatedAt: "0001-01-01T00:00:00Z"  # quoted for the same reason as createdAt
+  version: 0
+  project: ""
+spec:
+  display:
+    name: Performance Statistics
+
+  variables:
+    - kind: ListVariable
+      spec:
+        # Referenced as $Deployment_id by the panel queries in this dashboard.
+        name: Deployment_id
+        display:
+          hidden: false
+          name: Deployment_ID
+        allowMultiple: true
+        allowAllValue: true
+        sort: alphabetical-asc
+        defaultValue: ["$__all"]
+        plugin:
+          kind: PrometheusLabelValuesVariable
+          spec:
+            labelName: model_name
+            # Any vLLM series works here as long as it always has model_name.
+            matchers:
+              - 'vllm:generation_tokens_total{}'
+            datasource:
+              name: accelerators-thanos-querier-datasource
+              kind: PrometheusDatasource
+
+  panels:
+    "1":  # time series: mean end-to-end request latency, one line per model
+      kind: Panel
+      spec:
+        display:
+          name: E2E Latency over Time
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend:
+              position: bottom
+              mode: table
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  seriesNameFormat: "{{model_name}}"
+                  # Mean latency per model: rate of the histogram _sum over rate of its _count.
+                  query: >
+                    sum by (model_name) (rate(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
+                    /
+                    sum by (model_name) (rate(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "2":  # stat: mean E2E latency per model, increase(sum) / increase(count) over $__range
+      kind: Panel
+      spec:
+        display:
+          name: E2E Latency (Avg)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    (sum by (model_name) (increase(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
+                    /
+                    (sum by (model_name) (increase(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "3":  # stat: median (0.50 quantile) E2E latency per model from the histogram buckets
+      kind: Panel
+      spec:
+        display:
+          name: E2E Latency (P50)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    histogram_quantile(
+                      0.50,
+                      sum by (le, model_name) (
+                        rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "4":  # stat: 0.90 quantile of E2E latency per model from the histogram buckets
+      kind: Panel
+      spec:
+        display:
+          name: E2E Latency (P90)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    histogram_quantile(
+                      0.90,
+                      sum by (le, model_name) (
+                        rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "5":  # stat: 0.99 quantile of E2E latency per model from the histogram buckets
+      kind: Panel
+      spec:
+        display:
+          name: E2E Latency (P99)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    histogram_quantile(
+                      0.99,
+                      sum by (le, model_name) (
+                        rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "6":  # time series: mean time-to-first-token per model, rate(sum) / rate(count)
+      kind: Panel
+      spec:
+        display:
+          name: TTFT over Time
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend:
+              position: bottom
+              mode: table
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  seriesNameFormat: "{{model_name}}"
+                  query: >
+                    sum by (model_name) (rate(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
+                    /
+                    sum by (model_name) (rate(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "7":  # stat: mean TTFT per model, increase(sum) / increase(count) over $__range
+      kind: Panel
+      spec:
+        display:
+          name: TTFT (Avg)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    (sum by (model_name) (increase(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
+                    /
+                    (sum by (model_name) (increase(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "8":  # stat: median (0.50 quantile) TTFT per model from the histogram buckets
+      kind: Panel
+      spec:
+        display:
+          name: TTFT (P50)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    histogram_quantile(
+                      0.50,
+                      sum by (le, model_name) (
+                        rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "9":  # stat: 0.90 quantile of TTFT per model from the histogram buckets
+      kind: Panel
+      spec:
+        display:
+          name: TTFT (P90)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    histogram_quantile(
+                      0.90,
+                      sum by (le, model_name) (
+                        rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "10":  # stat: 0.99 quantile of TTFT per model from the histogram buckets
+      kind: Panel
+      spec:
+        display:
+          name: TTFT (P99)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    histogram_quantile(
+                      0.99,
+                      sum by (le, model_name) (
+                        rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "11":  # time series: time-per-output-token mean plus p50/p90/p99 lines per model
+      kind: Panel
+      spec:
+        display:
+          name: ITL (Time per Output Token) over Time
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend:
+              position: bottom
+              mode: table
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  seriesNameFormat: "{{model_name}}"
+                  query: >
+                    sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
+                    /
+                    sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  seriesNameFormat: "{{model_name}} p50"
+                  query: >
+                    histogram_quantile(
+                      0.50,
+                      sum by (le, model_name) (
+                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  seriesNameFormat: "{{model_name}} p90"
+                  query: >
+                    histogram_quantile(
+                      0.90,
+                      sum by (le, model_name) (
+                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  seriesNameFormat: "{{model_name}} p99"
+                  query: >
+                    histogram_quantile(
+                      0.99,
+                      sum by (le, model_name) (
+                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "12":  # stat: mean time per output token per model, increase(sum) / increase(count) over $__range
+      kind: Panel
+      spec:
+        display:
+          name: ITL (Avg)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    (sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
+                    /
+                    (sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "13":  # stat: median (0.50 quantile) time per output token per model
+      kind: Panel
+      spec:
+        display:
+          name: ITL (P50)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    histogram_quantile(
+                      0.50,
+                      sum by (le, model_name) (
+                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "14":  # stat: 0.90 quantile of time per output token per model
+      kind: Panel
+      spec:
+        display:
+          name: ITL (P90)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    histogram_quantile(
+                      0.90,
+                      sum by (le, model_name) (
+                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "15":  # stat: 0.99 quantile of time per output token per model
+      kind: Panel
+      spec:
+        display:
+          name: ITL (P99)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  query: >
+                    histogram_quantile(
+                      0.99,
+                      sum by (le, model_name) (
+                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                      )
+                    )
+                  datasource:
+                    name: accelerators-thanos-querier-datasource
+                    kind: PrometheusDatasource
+
+    "16":  # time series: generation, prompt and combined per-iteration token throughput
+      kind: Panel
+      spec:
+        display:
+          name: TPS (Tokens/sec) over Time
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend:
+              mode: table
+              position: bottom
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  query: >
+                    sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
+                  seriesNameFormat: '{{model_name}} generation'
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  query: >
+                    sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
+                  seriesNameFormat: '{{model_name}} prompt'
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  # Tokens/sec comes from the histogram _sum; _count would be iterations/sec. Summed to one line.
+                  query: >
+                    sum(rate(vllm:iteration_tokens_total_sum{model_name=~"$Deployment_id"}[$__interval]))
+                  seriesNameFormat: 'iteration overall'
+
+    "17":  # stat: average KV cache usage across the series matching $Deployment_id
+      kind: Panel
+      spec:
+        display:
+          name: KV Cache Usage (avg %)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  # Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
+                  query: >
+                    100 * avg(vllm:kv_cache_usage_perc{model_name=~"$Deployment_id"})
+
+    "18":  # time series: running requests per pod, limited to the $Deployment_id selection
+      kind: Panel
+      spec:
+        display:
+          name: Running Requests by Pod
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend:
+              mode: table
+              position: bottom
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  query: >
+                    sum by (pod) (vllm:num_requests_running{model_name=~"$Deployment_id"})
+                  seriesNameFormat: '{{pod}}'
+
+    "19":  # time series: waiting requests per pod, limited to the $Deployment_id selection
+      kind: Panel
+      spec:
+        display:
+          name: Waiting Requests by Pod
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend:
+              mode: table
+              position: bottom
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  query: >
+                    sum by (pod) (vllm:num_requests_waiting{model_name=~"$Deployment_id"})
+                  seriesNameFormat: '{{pod}}'
+
+    "20":  # stat: total running requests across the series matching $Deployment_id
+      kind: Panel
+      spec:
+        display:
+          name: Running Requests (sum)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  query: 'sum(vllm:num_requests_running{model_name=~"$Deployment_id"})'
+
+    "21":  # stat: total waiting requests across the series matching $Deployment_id
+      kind: Panel
+      spec:
+        display:
+          name: Waiting Requests (sum)
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  query: 'sum(vllm:num_requests_waiting{model_name=~"$Deployment_id"})'
+
+  layouts:
+    - kind: Grid  # one row of three equally sized tiles
+      spec:
+        display:
+          title: Overview
+        items:
+          - content: { "$ref": "#/spec/panels/17" }  # KV cache %
+            width: 6
+            height: 3
+            x: 0
+            y: 0
+          - content: { "$ref": "#/spec/panels/20" }  # running sum
+            width: 6
+            height: 3
+            x: 6
+            y: 0
+          - content: { "$ref": "#/spec/panels/21" }  # waiting sum
+            width: 6
+            height: 3
+            x: 12
+            y: 0
+
+    - kind: Grid  # wide chart on the left, 2x2 block of tiles on the right
+      spec:
+        display:
+          title: E2E Latency
+        items:
+          - content: { "$ref": "#/spec/panels/1" }
+            width: 10
+            height: 6
+            x: 0
+            y: 1
+          - content: { "$ref": "#/spec/panels/2" }
+            width: 7
+            height: 3
+            x: 10
+            y: 1
+          - content: { "$ref": "#/spec/panels/3" }
+            width: 7
+            height: 3
+            x: 17
+            y: 1
+          - content: { "$ref": "#/spec/panels/4" }
+            width: 7
+            height: 3
+            x: 10
+            y: 4
+          - content: { "$ref": "#/spec/panels/5" }
+            width: 7
+            height: 3
+            x: 17
+            y: 4
+
+    - kind: Grid  # wide chart on the left, 2x2 block of tiles on the right
+      spec:
+        display:
+          title: TTFT
+        items:
+          - content: { "$ref": "#/spec/panels/6" }
+            width: 10
+            height: 6
+            x: 0
+            y: 8
+          - content: { "$ref": "#/spec/panels/7" }
+            width: 7
+            height: 3
+            x: 10
+            y: 8
+          - content: { "$ref": "#/spec/panels/8" }
+            width: 7
+            height: 3
+            x: 17
+            y: 8
+          - content: { "$ref": "#/spec/panels/9" }
+            width: 7
+            height: 3
+            x: 10
+            y: 11
+          - content: { "$ref": "#/spec/panels/10" }
+            width: 7
+            height: 3
+            x: 17
+            y: 11
+
+    - kind: Grid  # wide chart on the left, 2x2 block of tiles on the right
+      spec:
+        display:
+          title: ITL (Time per Output Token)
+        items:
+          - content: { "$ref": "#/spec/panels/11" }
+            width: 10
+            height: 6
+            x: 0
+            y: 15
+          - content: { "$ref": "#/spec/panels/12" }
+            width: 7
+            height: 3
+            x: 10
+            y: 15
+          - content: { "$ref": "#/spec/panels/13" }
+            width: 7
+            height: 3
+            x: 17
+            y: 15
+          - content: { "$ref": "#/spec/panels/14" }
+            width: 7
+            height: 3
+            x: 10
+            y: 18
+          - content: { "$ref": "#/spec/panels/15" }
+            width: 7
+            height: 3
+            x: 17
+            y: 18
+
+    - kind: Grid  # single chart
+      spec:
+        display:
+          title: TPS (Prompt / Generation / Iteration)
+        items:
+          - content: { "$ref": "#/spec/panels/16" }
+            width: 14
+            height: 6
+            x: 0
+            y: 22
+
+    - kind: Grid  # two equal-width charts side by side
+      spec:
+        display:
+          title: Per-Pod Request State
+        items:
+          - content: { "$ref": "#/spec/panels/18" }
+            width: 12
+            height: 6
+            x: 0
+            y: 28
+          - content: { "$ref": "#/spec/panels/19" }
+            width: 12
+            height: 6
+            x: 12
+            y: 28
+
--- a/examples/online_serving/dashboards/perses/query_statistics.yaml
+++ b/examples/online_serving/dashboards/perses/query_statistics.yaml
@@ -0,0 +1,392 @@
+# Perses dashboard with vLLM query statistics: request state, KV cache usage,
+# request rate, end-to-end latency quantiles and token throughput.
+kind: PersesDashboard
+metadata:
+  name: query-statistics
+  # Placeholder timestamps. Quoted so that YAML 1.1 loaders keep them as plain
+  # strings instead of implicitly converting them to native timestamp values.
+  createdAt: '0001-01-01T00:00:00Z'
+  updatedAt: '0001-01-01T00:00:00Z'
+  version: 0
+  # Left empty in this example; presumably set to the target Perses project at
+  # import time - confirm against your deployment.
+  project: ""
+spec:
+  display:
+    # NOTE(review): the "_New" suffix looks like a leftover from iterating on
+    # the dashboard - confirm the intended title.
+    name: Query Statistics_New
+
+  variables:
+    # NS: single-select namespace picker. Options are the `namespace` label
+    # values of `up` series whose `service` label contains "vllm".
+    - kind: ListVariable
+      spec:
+        name: NS
+        display:
+          name: Namespace
+        allowMultiple: false
+        # Preselected namespace; adjust to match your deployment.
+        defaultValue: llm-d
+        plugin:
+          kind: PrometheusLabelValuesVariable
+          spec:
+            datasource:
+              kind: PrometheusDatasource
+              name: accelerators-thanos-querier-datasource
+            labelName: namespace
+            matchers:
+              - 'up{service=~".*vllm.*"}'
+
+    # SVC: single-select service picker, scoped to the namespace chosen in $NS.
+    # Options are the `service` label values of `up` series whose service name
+    # contains "vllm".
+    - kind: ListVariable
+      spec:
+        name: SVC
+        display:
+          name: Service
+        allowMultiple: false
+        # Preselected service; adjust to match your deployment.
+        defaultValue: vllm-qwen2-0-5b-sim
+        plugin:
+          kind: PrometheusLabelValuesVariable
+          spec:
+            datasource:
+              kind: PrometheusDatasource
+              name: accelerators-thanos-querier-datasource
+            labelName: service
+            matchers:
+              - 'up{namespace="$NS",service=~".*vllm.*"}'
+
+    # MODEL: multi-select model picker with an "all" option, selected by
+    # default. Options are the `model_name` label values found on
+    # `vllm:request_success_total` for the chosen namespace and service.
+    - kind: ListVariable
+      spec:
+        name: MODEL
+        display:
+          name: 'Model (real vLLM)'
+        allowAllValue: true
+        allowMultiple: true
+        defaultValue:
+          - '$__all'
+        plugin:
+          kind: PrometheusLabelValuesVariable
+          spec:
+            datasource:
+              kind: PrometheusDatasource
+              name: accelerators-thanos-querier-datasource
+            labelName: model_name
+            matchers:
+              - 'vllm:request_success_total{namespace="$NS",service="$SVC"}'
+
+  panels:
+
+    # --- Core (works on Simulator & Real) ---
+    # Stat tile: most recent value of `vllm:num_requests_running`, summed over
+    # all series of the selected namespace and service.
+    core_running_now:
+      kind: Panel
+      spec:
+        display:
+          name: 'Running Requests (now)'
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  # `or vector(0)` makes the tile read 0 when no series match.
+                  query: 'sum(vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)'
+                  minStep: '15s'
+
+    # Stat tile: most recent value of `vllm:num_requests_waiting`, summed over
+    # all series of the selected namespace and service.
+    core_waiting_now:
+      kind: Panel
+      spec:
+        display:
+          name: 'Waiting Requests (now)'
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  # `or vector(0)` makes the tile read 0 when no series match.
+                  query: 'sum(vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)'
+                  minStep: '15s'
+
+    # Stat tile: most recent average of `vllm:kv_cache_usage_perc` across the
+    # matching series, shown unscaled (the title labels it as a 0-1 value).
+    core_kv_usage_now:
+      kind: Panel
+      spec:
+        display:
+          name: 'KV Cache Usage (0–1)'
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  # `or vector(0)` makes the tile read 0 when no series match.
+                  query: 'avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)'
+                  minStep: '15s'
+
+    # Line chart of running requests over time, one series per `service`.
+    core_running_ts:
+      kind: Panel
+      spec:
+        display: { name: Running Over Time }
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend: { mode: table, position: bottom }
+            visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # `or on() vector(0)` yields a single 0 series only when the
+                  # grouped query returns nothing. A bare `or vector(0)` would
+                  # always add a label-less constant 0 line, because the
+                  # grouped series carry a `service` label and never match it.
+                  query: 'sum by (service) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or on() vector(0)'
+                  minStep: "15s"
+
+    # Line chart of waiting requests over time, one series per `service`.
+    core_waiting_ts:
+      kind: Panel
+      spec:
+        display: { name: Waiting Over Time }
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend: { mode: table, position: bottom }
+            visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # `or on() vector(0)` yields a single 0 series only when the
+                  # grouped query returns nothing. A bare `or vector(0)` would
+                  # always add a label-less constant 0 line, because the
+                  # grouped series carry a `service` label and never match it.
+                  query: 'sum by (service) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or on() vector(0)'
+                  minStep: "15s"
+
+    # Stat tile: number of `up` series for the selected namespace and service
+    # whose latest value equals 1.
+    core_targets_up:
+      kind: Panel
+      spec:
+        display:
+          name: 'Scrape Targets Up'
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  # `count` over an empty set returns nothing, so fall back to 0.
+                  query: 'count(up{namespace="$NS",service="$SVC"} == 1) or vector(0)'
+                  minStep: '15s'
+
+    # --- KV Cache as Percent (works on Simulator & Real) ---
+    # Stat tile: most recent average KV cache usage, scaled to a percentage.
+    core_kv_usage_pct_now:
+      kind: Panel
+      spec:
+        display:
+          name: 'KV Cache Usage (%) – now'
+        plugin:
+          kind: StatChart
+          spec:
+            calculation: last-number
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource:
+                    kind: PrometheusDatasource
+                    name: accelerators-thanos-querier-datasource
+                  # The x100 scaling happens in the query itself; no format.unit
+                  # is set on the chart, which the original author notes avoids
+                  # schema conflicts.
+                  query: '(avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)'
+                  minStep: '15s'
+
+    # Line chart of average KV cache usage as a percentage, one series per
+    # `service`.
+    core_kv_usage_pct_ts:
+      kind: Panel
+      spec:
+        display: { name: KV Cache Usage (%) – over time }
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend: { mode: table, position: bottom }
+            visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # `or on() vector(0)` yields a single 0 series only when the
+                  # grouped query returns nothing. A bare `or vector(0)` would
+                  # always add a label-less constant 0 line, because the
+                  # grouped series carry a `service` label and never match it.
+                  query: '(avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or on() vector(0)'
+                  minStep: "15s"
+
+    # --- Per-Pod breakdowns (works on Simulator & Real) ---
+    # Line chart of running requests over time, one series per `pod`.
+    per_pod_running_ts:
+      kind: Panel
+      spec:
+        display: { name: Running by Pod }
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend: { mode: table, position: bottom }
+            visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # `or on() vector(0)` yields a single 0 series only when the
+                  # grouped query returns nothing. A bare `or vector(0)` would
+                  # always add a label-less constant 0 line next to the per-pod
+                  # series, because those carry a `pod` label and never match it.
+                  query: 'sum by (pod) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or on() vector(0)'
+                  minStep: "15s"
+
+    # Line chart of waiting requests over time, one series per `pod`.
+    per_pod_waiting_ts:
+      kind: Panel
+      spec:
+        display: { name: Waiting by Pod }
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend: { mode: table, position: bottom }
+            visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # `or on() vector(0)` yields a single 0 series only when the
+                  # grouped query returns nothing. A bare `or vector(0)` would
+                  # always add a label-less constant 0 line next to the per-pod
+                  # series, because those carry a `pod` label and never match it.
+                  query: 'sum by (pod) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or on() vector(0)'
+                  minStep: "15s"
+
+    # Line chart of average KV cache usage as a percentage, one series per
+    # `pod`.
+    per_pod_kv_pct_ts:
+      kind: Panel
+      spec:
+        display: { name: KV Cache (%) by Pod }
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend: { mode: table, position: bottom }
+            visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # Needs the KV-cache series to carry a `pod` label (per the
+                  # original note, the simulator's exporter does). Without that
+                  # label all series collapse into one unlabeled average.
+                  # `or on() vector(0)` yields a single 0 series only when the
+                  # grouped query returns nothing; a bare `or vector(0)` would
+                  # always add a label-less constant 0 line.
+                  query: '(avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or on() vector(0)'
+                  minStep: "15s"
+
+    # --- Real vLLM only (zeros on simulator) ---
+    # Line chart of successful requests per second, one series per
+    # `model_name`, filtered by the $MODEL selection.
+    real_req_rate_ts:
+      kind: Panel
+      spec:
+        display: { name: Request Rate (real vLLM) }
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend: { mode: table, position: bottom }
+            visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # $__rate_interval keeps the rate() window wide enough to
+                  # hold at least two samples; with $__interval the window can
+                  # shrink to the step size and rate() then returns nothing.
+                  # `or on() vector(0)` adds the 0 fallback only when the
+                  # grouped query is empty, rather than as an extra series.
+                  query: 'sum by (model_name) (rate(vllm:request_success_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval])) or on() vector(0)'
+                  minStep: "15s"
+
+    # Stat tile(s): most recent 50th-percentile end-to-end request latency,
+    # computed from the latency histogram buckets, one value per `model_name`.
+    real_p50:
+      kind: Panel
+      spec:
+        display: { name: p50 Latency (real vLLM) }
+        plugin: { kind: StatChart, spec: { calculation: last-number } }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # $__rate_interval keeps the rate() window wide enough to
+                  # hold at least two samples; with $__interval the window can
+                  # shrink to the step size and rate() then returns nothing.
+                  # `or on() vector(0)` adds the 0 fallback only when the
+                  # quantile query is empty, rather than as an extra stat.
+                  query: 'histogram_quantile(0.50, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval]))) or on() vector(0)'
+                  minStep: "15s"
+
+    # Stat tile(s): most recent 90th-percentile end-to-end request latency,
+    # computed from the latency histogram buckets, one value per `model_name`.
+    real_p90:
+      kind: Panel
+      spec:
+        display: { name: p90 Latency (real vLLM) }
+        plugin: { kind: StatChart, spec: { calculation: last-number } }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # $__rate_interval keeps the rate() window wide enough to
+                  # hold at least two samples; with $__interval the window can
+                  # shrink to the step size and rate() then returns nothing.
+                  # `or on() vector(0)` adds the 0 fallback only when the
+                  # quantile query is empty, rather than as an extra stat.
+                  query: 'histogram_quantile(0.90, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval]))) or on() vector(0)'
+                  minStep: "15s"
+
+    # Stat tile(s): most recent 99th-percentile end-to-end request latency,
+    # computed from the latency histogram buckets, one value per `model_name`.
+    real_p99:
+      kind: Panel
+      spec:
+        display: { name: p99 Latency (real vLLM) }
+        plugin: { kind: StatChart, spec: { calculation: last-number } }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # $__rate_interval keeps the rate() window wide enough to
+                  # hold at least two samples; with $__interval the window can
+                  # shrink to the step size and rate() then returns nothing.
+                  # `or on() vector(0)` adds the 0 fallback only when the
+                  # quantile query is empty, rather than as an extra stat.
+                  query: 'histogram_quantile(0.99, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval]))) or on() vector(0)'
+                  minStep: "15s"
+
+    # Line chart of prompt (input) tokens per second, one series per
+    # `model_name`, filtered by the $MODEL selection.
+    real_input_tokens_ts:
+      kind: Panel
+      spec:
+        display: { name: Input Tokens / sec (real vLLM) }
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend: { mode: table, position: bottom }
+            visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # $__rate_interval keeps the rate() window wide enough to
+                  # hold at least two samples; with $__interval the window can
+                  # shrink to the step size and rate() then returns nothing.
+                  # `or on() vector(0)` adds the 0 fallback only when the
+                  # grouped query is empty, rather than as an extra series.
+                  query: 'sum by (model_name) (rate(vllm:prompt_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval])) or on() vector(0)'
+                  minStep: "15s"
+
+    # Line chart of generated (output) tokens per second, one series per
+    # `model_name`, filtered by the $MODEL selection.
+    real_output_tokens_ts:
+      kind: Panel
+      spec:
+        display: { name: Output Tokens / sec (real vLLM) }
+        plugin:
+          kind: TimeSeriesChart
+          spec:
+            legend: { mode: table, position: bottom }
+            visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
+        queries:
+          - kind: TimeSeriesQuery
+            spec:
+              plugin:
+                kind: PrometheusTimeSeriesQuery
+                spec:
+                  datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
+                  # $__rate_interval keeps the rate() window wide enough to
+                  # hold at least two samples; with $__interval the window can
+                  # shrink to the step size and rate() then returns nothing.
+                  # `or on() vector(0)` adds the 0 fallback only when the
+                  # grouped query is empty, rather than as an extra series.
+                  query: 'sum by (model_name) (rate(vllm:generation_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval])) or on() vector(0)'
+                  minStep: "15s"
+
+  layouts:
+    # Core row: four 6x3 stat tiles across the top, two 12x6 charts beneath.
+    - kind: Grid
+      spec:
+        display:
+          title: 'Core (Sim & Real)'
+        items:
+          # Stat tiles (y = 0).
+          - x: 0
+            y: 0
+            width: 6
+            height: 3
+            content:
+              $ref: '#/spec/panels/core_running_now'
+          - x: 6
+            y: 0
+            width: 6
+            height: 3
+            content:
+              $ref: '#/spec/panels/core_waiting_now'
+          - x: 12
+            y: 0
+            width: 6
+            height: 3
+            content:
+              $ref: '#/spec/panels/core_kv_usage_now'
+          - x: 18
+            y: 0
+            width: 6
+            height: 3
+            content:
+              $ref: '#/spec/panels/core_targets_up'
+          # Time-series charts (y = 3).
+          - x: 0
+            y: 3
+            width: 12
+            height: 6
+            content:
+              $ref: '#/spec/panels/core_running_ts'
+          - x: 12
+            y: 3
+            width: 12
+            height: 6
+            content:
+              $ref: '#/spec/panels/core_waiting_ts'
+
+    # KV cache row: a 6x3 stat tile on the left, an 18x6 chart to its right.
+    - kind: Grid
+      spec:
+        display:
+          title: 'KV Cache (%)'
+        items:
+          - x: 0
+            y: 9
+            width: 6
+            height: 3
+            content:
+              $ref: '#/spec/panels/core_kv_usage_pct_now'
+          - x: 6
+            y: 9
+            width: 18
+            height: 6
+            content:
+              $ref: '#/spec/panels/core_kv_usage_pct_ts'
+
+    # Per-pod row: two 12x6 charts side by side, then one full-width 24x6 chart.
+    - kind: Grid
+      spec:
+        display:
+          title: 'Per-Pod breakdowns'
+        items:
+          - x: 0
+            y: 15
+            width: 12
+            height: 6
+            content:
+              $ref: '#/spec/panels/per_pod_running_ts'
+          - x: 12
+            y: 15
+            width: 12
+            height: 6
+            content:
+              $ref: '#/spec/panels/per_pod_waiting_ts'
+          # Full width.
+          - x: 0
+            y: 21
+            width: 24
+            height: 6
+            content:
+              $ref: '#/spec/panels/per_pod_kv_pct_ts'
+
+    # Real-vLLM row: request-rate chart with three 4x3 latency tiles beside it,
+    # followed by the two token-throughput charts.
+    - kind: Grid
+      spec:
+        display:
+          title: 'Real vLLM only (shows 0 on simulator)'
+        items:
+          # Request rate (left half).
+          - x: 0
+            y: 27
+            width: 12
+            height: 6
+            content:
+              $ref: '#/spec/panels/real_req_rate_ts'
+          # Latency quantile tiles (right half, top).
+          - x: 12
+            y: 27
+            width: 4
+            height: 3
+            content:
+              $ref: '#/spec/panels/real_p50'
+          - x: 16
+            y: 27
+            width: 4
+            height: 3
+            content:
+              $ref: '#/spec/panels/real_p90'
+          - x: 20
+            y: 27
+            width: 4
+            height: 3
+            content:
+              $ref: '#/spec/panels/real_p99'
+          # Token throughput charts.
+          - x: 0
+            y: 33
+            width: 12
+            height: 6
+            content:
+              $ref: '#/spec/panels/real_input_tokens_ts'
+          - x: 12
+            y: 33
+            width: 12
+            height: 6
+            content:
+              $ref: '#/spec/panels/real_output_tokens_ts'
+