Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions


@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for `vllm.entrypoints.api_server`
Start the demo server:
python -m vllm.entrypoints.api_server --model <model_name>
NOTE: The API server is used only for demonstration and simple performance
benchmarks. It is not intended for production use.
For production use, we recommend `vllm serve` and the OpenAI client API.
"""
import argparse
import json
from argparse import Namespace
from collections.abc import Iterable

import requests


def clear_line(n: int = 1) -> None:
    LINE_UP = "\033[1A"
    LINE_CLEAR = "\x1b[2K"
    for _ in range(n):
        print(LINE_UP, end=LINE_CLEAR, flush=True)


def post_http_request(
    prompt: str, api_url: str, n: int = 1, stream: bool = False
) -> requests.Response:
    headers = {"User-Agent": "Test Client"}
    pload = {
        "prompt": prompt,
        "n": n,
        "temperature": 0.0,
        "max_tokens": 16,
        "stream": stream,
    }
    response = requests.post(api_url, headers=headers, json=pload, stream=stream)
    return response


def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
    for chunk in response.iter_lines(
        chunk_size=8192, decode_unicode=False, delimiter=b"\n"
    ):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"]
            yield output


def get_response(response: requests.Response) -> list[str]:
    data = json.loads(response.content)
    output = data["text"]
    return output


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--n", type=int, default=1)
    parser.add_argument("--prompt", type=str, default="San Francisco is a")
    parser.add_argument("--stream", action="store_true")
    return parser.parse_args()


def main(args: Namespace):
    prompt = args.prompt
    api_url = f"http://{args.host}:{args.port}/generate"
    n = args.n
    stream = args.stream
    print(f"Prompt: {prompt!r}\n", flush=True)
    response = post_http_request(prompt, api_url, n, stream)

    if stream:
        num_printed_lines = 0
        for h in get_streaming_response(response):
            clear_line(num_printed_lines)
            num_printed_lines = 0
            for i, line in enumerate(h):
                num_printed_lines += 1
                print(f"Beam candidate {i}: {line!r}", flush=True)
    else:
        output = get_response(response)
        for i, line in enumerate(output):
            print(f"Beam candidate {i}: {line!r}", flush=True)


if __name__ == "__main__":
    args = parse_args()
    main(args)
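For reference, a typical demo run might look like the following; the client filename `api_client.py` and the model `facebook/opt-125m` are assumptions used for illustration only:

```bash
# Start the demo API server (any model name works here)
python -m vllm.entrypoints.api_server --model facebook/opt-125m

# In a second shell, stream beam candidates from the /generate endpoint
python api_client.py --host localhost --port 8000 --prompt "San Francisco is a" --stream
```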


@@ -0,0 +1,6 @@
*.png
.git/
ct.yaml
lintconf.yaml
values.schema.json
/workflows


@@ -0,0 +1,21 @@
apiVersion: v2
name: chart-vllm
description: Chart vllm
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.0.1
maintainers:
- name: mfournioux


@@ -0,0 +1,33 @@
# Helm Charts
This directory contains a Helm chart for deploying the vLLM application. The chart includes configurations for deployment, autoscaling, resource management, and more.
## Files
- Chart.yaml: Defines the chart metadata including name, version, and maintainers.
- ct.yaml: Configuration for chart testing.
- lintconf.yaml: Linting rules for YAML files.
- values.schema.json: JSON schema for validating values.yaml.
- values.yaml: Default values for the Helm chart.
- templates/_helpers.tpl: Helper templates for defining common configurations.
- templates/configmap.yaml: Template for creating ConfigMaps.
- templates/custom-objects.yaml: Template for custom Kubernetes objects.
- templates/deployment.yaml: Template for creating Deployments.
- templates/hpa.yaml: Template for Horizontal Pod Autoscaler.
- templates/job.yaml: Template for Kubernetes Jobs.
- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget.
- templates/pvc.yaml: Template for Persistent Volume Claims.
- templates/secrets.yaml: Template for Kubernetes Secrets.
- templates/service.yaml: Template for creating Services.
## Running Tests
This chart includes unit tests using [helm-unittest](https://github.com/helm-unittest/helm-unittest). Install the plugin and run tests:
```bash
# Install plugin
helm plugin install https://github.com/helm-unittest/helm-unittest
# Run tests
helm unittest .
```
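To try the chart end to end, here is a minimal install sketch; the release name, namespace, and any overrides are illustrative rather than part of the chart:

```bash
# Render the templates locally to sanity-check values.yaml
helm template test-vllm .

# Install into a dedicated namespace
helm install test-vllm . -f values.yaml --namespace vllm --create-namespace
```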


@@ -0,0 +1,3 @@
chart-dirs:
- charts
validate-maintainers: false


@@ -0,0 +1,42 @@
---
rules:
braces:
min-spaces-inside: 0
max-spaces-inside: 0
min-spaces-inside-empty: -1
max-spaces-inside-empty: -1
brackets:
min-spaces-inside: 0
max-spaces-inside: 0
min-spaces-inside-empty: -1
max-spaces-inside-empty: -1
colons:
max-spaces-before: 0
max-spaces-after: 1
commas:
max-spaces-before: 0
min-spaces-after: 1
max-spaces-after: 1
comments:
require-starting-space: true
min-spaces-from-content: 2
document-end: disable
document-start: disable # No --- to start a file
empty-lines:
max: 2
max-start: 0
max-end: 0
hyphens:
max-spaces-after: 1
indentation:
spaces: consistent
    indent-sequences: whatever # accept both indented and non-indented list sequences
check-multi-line-strings: false
key-duplicates: enable
line-length: disable # Lines can be any length
new-line-at-end-of-file: disable
new-lines:
type: unix
trailing-spaces: enable
truthy:
level: warning


@@ -0,0 +1,165 @@
{{/*
Define ports for the pods
*/}}
{{- define "chart.container-port" -}}
{{- default "8000" .Values.containerPort }}
{{- end }}
{{/*
Define service name
*/}}
{{- define "chart.service-name" -}}
{{- if .Values.serviceName }}
{{- .Values.serviceName | lower | trim }}
{{- else }}
"{{ .Release.Name }}-service"
{{- end }}
{{- end }}
{{/*
Define service port
*/}}
{{- define "chart.service-port" -}}
{{- if .Values.servicePort }}
{{- .Values.servicePort }}
{{- else }}
{{- include "chart.container-port" . }}
{{- end }}
{{- end }}
{{/*
Define service port name
*/}}
{{- define "chart.service-port-name" -}}
"service-port"
{{- end }}
{{/*
Define container port name
*/}}
{{- define "chart.container-port-name" -}}
"container-port"
{{- end }}
{{/*
Define deployment strategy
*/}}
{{- define "chart.strategy" -}}
strategy:
{{- if not .Values.deploymentStrategy }}
rollingUpdate:
maxSurge: 100%
maxUnavailable: 0
{{- else }}
{{ toYaml .Values.deploymentStrategy | indent 2 }}
{{- end }}
{{- end }}
{{/*
Define additional ports
*/}}
{{- define "chart.extraPorts" }}
{{- with .Values.extraPorts }}
{{ toYaml . }}
{{- end }}
{{- end }}
{{/*
Define chart external ConfigMaps and Secrets
*/}}
{{- define "chart.externalConfigs" -}}
{{- with .Values.externalConfigs -}}
{{ toYaml . }}
{{- end }}
{{- end }}
{{/*
Define liveness and readiness probes
*/}}
{{- define "chart.probes" -}}
{{- if .Values.readinessProbe }}
readinessProbe:
{{- with .Values.readinessProbe }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- if .Values.livenessProbe }}
livenessProbe:
{{- with .Values.livenessProbe }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Define resources
*/}}
{{- define "chart.resources" -}}
requests:
  memory: {{ required "Value 'resources.requests.memory' must be defined!" .Values.resources.requests.memory | quote }}
  cpu: {{ required "Value 'resources.requests.cpu' must be defined!" .Values.resources.requests.cpu | quote }}
  {{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
  nvidia.com/gpu: {{ required "Value 'resources.requests.nvidia.com/gpu' must be defined!" (index .Values.resources.requests "nvidia.com/gpu") | quote }}
  {{- end }}
limits:
  memory: {{ required "Value 'resources.limits.memory' must be defined!" .Values.resources.limits.memory | quote }}
  cpu: {{ required "Value 'resources.limits.cpu' must be defined!" .Values.resources.limits.cpu | quote }}
  {{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
  nvidia.com/gpu: {{ required "Value 'resources.limits.nvidia.com/gpu' must be defined!" (index .Values.resources.limits "nvidia.com/gpu") | quote }}
{{- end }}
{{- end }}
{{/*
Define the user for the main container
*/}}
{{- define "chart.user" }}
{{- if .Values.image.runAsUser }}
runAsUser:
{{- with .Values.runAsUser }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- end }}
{{- define "chart.extraInitEnv" -}}
- name: S3_ENDPOINT_URL
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3endpoint
- name: S3_BUCKET_NAME
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3bucketname
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3accesskeyid
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3accesskey
{{- if .Values.extraInit.s3modelpath }}
- name: S3_PATH
value: "{{ .Values.extraInit.s3modelpath }}"
{{- end }}
{{- if hasKey .Values.extraInit "awsEc2MetadataDisabled" }}
- name: AWS_EC2_METADATA_DISABLED
value: "{{ .Values.extraInit.awsEc2MetadataDisabled }}"
{{- end }}
{{- end }}
{{/*
Define chart labels
*/}}
{{- define "chart.labels" -}}
{{- with .Values.labels -}}
{{ toYaml . }}
{{- end }}
{{- end }}
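For reference, a hedged example of supplying the values that the `chart.extraInitEnv` helper expects; the secret key names (`s3endpoint`, `s3bucketname`, `s3accesskeyid`, `s3accesskey`) come from the `secretKeyRef` entries above, while the endpoint, bucket, and credential values are placeholders:

```bash
helm install test-vllm . \
  --set secrets.s3endpoint="https://s3.example.com" \
  --set secrets.s3bucketname="my-model-bucket" \
  --set secrets.s3accesskeyid="EXAMPLEKEYID" \
  --set secrets.s3accesskey="EXAMPLESECRETKEY" \
  --set extraInit.s3modelpath="relative_s3_model_path/opt-125m"
```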


@@ -0,0 +1,11 @@
{{- if .Values.configs -}}
apiVersion: v1
kind: ConfigMap
metadata:
name: "{{ .Release.Name }}-configs"
namespace: {{ .Release.Namespace }}
data:
{{- with .Values.configs }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end -}}


@@ -0,0 +1,6 @@
{{- if .Values.customObjects }}
{{- range .Values.customObjects }}
{{- tpl (. | toYaml) $ }}
---
{{- end }}
{{- end }}


@@ -0,0 +1,131 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: "{{ .Release.Name }}-deployment-vllm"
namespace: {{ .Release.Namespace }}
labels:
{{- include "chart.labels" . | nindent 4 }}
spec:
replicas: {{ .Values.replicaCount }}
{{- include "chart.strategy" . | nindent 2 }}
selector:
matchLabels:
environment: "test"
release: "test"
progressDeadlineSeconds: 1200
template:
metadata:
labels:
environment: "test"
release: "test"
spec:
containers:
- name: "vllm"
image: "{{ required "Required value 'image.repository' must be defined !" .Values.image.repository }}:{{ required "Required value 'image.tag' must be defined !" .Values.image.tag }}"
{{- if .Values.image.command }}
command :
{{- with .Values.image.command }}
{{- toYaml . | nindent 10 }}
{{- end }}
{{- end }}
securityContext:
{{- if .Values.image.securityContext }}
{{- with .Values.image.securityContext }}
{{- toYaml . | nindent 12 }}
{{- end }}
{{- else }}
runAsNonRoot: false
{{- include "chart.user" . | indent 12 }}
{{- end }}
imagePullPolicy: IfNotPresent
{{- if .Values.image.env }}
env :
{{- with .Values.image.env }}
{{- toYaml . | nindent 10 }}
{{- end }}
{{- else }}
env: []
{{- end }}
{{- if or .Values.externalConfigs .Values.configs .Values.secrets }}
envFrom:
{{- if .Values.configs }}
- configMapRef:
name: "{{ .Release.Name }}-configs"
{{- end }}
{{- if .Values.secrets}}
- secretRef:
name: "{{ .Release.Name }}-secrets"
{{- end }}
{{- include "chart.externalConfigs" . | nindent 12 }}
{{- end }}
ports:
- name: {{ include "chart.container-port-name" . }}
containerPort: {{ include "chart.container-port" . }}
{{- include "chart.extraPorts" . | nindent 12 }}
{{- include "chart.probes" . | indent 10 }}
resources: {{- include "chart.resources" . | nindent 12 }}
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data
{{- with .Values.extraContainers }}
{{ toYaml . | nindent 8 }}
{{- end }}
{{- if and .Values.extraInit (or .Values.extraInit.modelDownload.enabled .Values.extraInit.initContainers) }}
initContainers:
{{- if .Values.extraInit.modelDownload.enabled }}
- name: wait-download-model
image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
command: {{ .Values.extraInit.modelDownload.waitContainer.command | toJson }}
args:
{{- toYaml .Values.extraInit.modelDownload.waitContainer.args | nindent 10 }}
env:
{{- if .Values.extraInit.modelDownload.waitContainer.env }}
{{- toYaml .Values.extraInit.modelDownload.waitContainer.env | nindent 10 }}
{{- else }}
{{- include "chart.extraInitEnv" . | nindent 10 }}
{{- end }}
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 2Gi
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data
{{- end }}
{{- with .Values.extraInit.initContainers }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- end }}
volumes:
- name: {{ .Release.Name }}-storage
persistentVolumeClaim:
claimName: {{ .Release.Name }}-storage-claim
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
runtimeClassName: nvidia
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu.product
operator: In
{{- with .Values.gpuModels }}
values:
{{- toYaml . | nindent 20 }}
{{- end }}
{{- end }}


@@ -0,0 +1,31 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: "{{ .Release.Name }}-hpa"
namespace: {{ .Release.Namespace }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: vllm
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}


@@ -0,0 +1,41 @@
{{- if and .Values.extraInit .Values.extraInit.modelDownload.enabled }}
apiVersion: batch/v1
kind: Job
metadata:
name: "{{ .Release.Name }}-init-vllm"
namespace: {{ .Release.Namespace }}
spec:
ttlSecondsAfterFinished: 100
template:
metadata:
name: init-vllm
spec:
containers:
- name: job-download-model
image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
command: {{ .Values.extraInit.modelDownload.downloadJob.command | toJson }}
args:
{{- toYaml .Values.extraInit.modelDownload.downloadJob.args | nindent 8 }}
env:
{{- if .Values.extraInit.modelDownload.downloadJob.env }}
{{- toYaml .Values.extraInit.modelDownload.downloadJob.env | nindent 8 }}
{{- else }}
{{- include "chart.extraInitEnv" . | nindent 8 }}
{{- end }}
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 2Gi
restartPolicy: OnFailure
volumes:
- name: {{ .Release.Name }}-storage
persistentVolumeClaim:
claimName: "{{ .Release.Name }}-storage-claim"
{{- end }}


@@ -0,0 +1,7 @@
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: "{{ .Release.Name }}-pdb"
namespace: {{ .Release.Namespace }}
spec:
maxUnavailable: {{ default 1 .Values.maxUnavailablePodDisruptionBudget }}


@@ -0,0 +1,13 @@
{{- if .Values.extraInit }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: "{{ .Release.Name }}-storage-claim"
namespace: {{ .Release.Namespace }}
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.extraInit.pvcStorage }}
{{- end }}


@@ -0,0 +1,10 @@
apiVersion: v1
kind: Secret
metadata:
name: "{{ .Release.Name }}-secrets"
namespace: {{ .Release.Namespace }}
type: Opaque
data:
{{- range $key, $val := .Values.secrets }}
{{ $key }}: {{ $val | b64enc | quote }}
{{- end }}


@@ -0,0 +1,14 @@
apiVersion: v1
kind: Service
metadata:
name: "{{ .Release.Name }}-service"
namespace: {{ .Release.Namespace }}
spec:
type: ClusterIP
ports:
- name: {{ include "chart.service-port-name" . }}
port: {{ include "chart.service-port" . }}
targetPort: {{ include "chart.container-port-name" . }}
protocol: TCP
selector:
{{- include "chart.labels" . | nindent 4 }}


@@ -0,0 +1,135 @@
suite: test deployment
templates:
- deployment.yaml
tests:
- it: should create wait-download-model init container when modelDownload is enabled
set:
extraInit:
modelDownload:
enabled: true
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
downloadJob:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
initContainers: [ ]
pvcStorage: "1Gi"
s3modelpath: "relative_s3_model_path/opt-125m"
awsEc2MetadataDisabled: true
asserts:
- hasDocuments:
count: 1
- isKind:
of: Deployment
- isNotEmpty:
path: spec.template.spec.initContainers
- equal:
path: spec.template.spec.initContainers[0].name
value: wait-download-model
- equal:
path: spec.template.spec.initContainers[0].image
value: amazon/aws-cli:2.6.4
- equal:
path: spec.template.spec.initContainers[0].imagePullPolicy
value: IfNotPresent
- it: should only create custom init containers when modelDownload is disabled
set:
extraInit:
modelDownload:
enabled: false
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args: [ "-c", "echo test" ]
downloadJob:
command: [ "/bin/bash" ]
args: [ "-c", "echo test" ]
initContainers:
- name: llm-d-routing-proxy
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8080
name: proxy
pvcStorage: "10Gi"
asserts:
- hasDocuments:
count: 1
- isKind:
of: Deployment
- lengthEqual:
path: spec.template.spec.initContainers
count: 1
- equal:
path: spec.template.spec.initContainers[0].name
value: llm-d-routing-proxy
- equal:
path: spec.template.spec.initContainers[0].image
value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
- equal:
path: spec.template.spec.initContainers[0].ports[0].containerPort
value: 8080
- it: should create both wait-download-model and custom init containers when both are enabled
set:
extraInit:
modelDownload:
enabled: true
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
downloadJob:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
initContainers:
- name: llm-d-routing-proxy
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8080
name: proxy
pvcStorage: "10Gi"
asserts:
- hasDocuments:
count: 1
- isKind:
of: Deployment
- lengthEqual:
path: spec.template.spec.initContainers
count: 2
- equal:
path: spec.template.spec.initContainers[0].name
value: wait-download-model
- equal:
path: spec.template.spec.initContainers[0].image
value: amazon/aws-cli:2.6.4
- equal:
path: spec.template.spec.initContainers[1].name
value: llm-d-routing-proxy
- equal:
path: spec.template.spec.initContainers[1].image
value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
- equal:
path: spec.template.spec.initContainers[1].ports[0].containerPort
value: 8080


@@ -0,0 +1,61 @@
suite: test job
templates:
- job.yaml
tests:
- it: should create job when modelDownload is enabled
set:
extraInit:
modelDownload:
enabled: true
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args: [ "-c", "wait" ]
downloadJob:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
pvcStorage: "1Gi"
s3modelpath: "relative_s3_model_path/opt-125m"
awsEc2MetadataDisabled: true
asserts:
- hasDocuments:
count: 1
- isKind:
of: Job
- equal:
path: spec.template.spec.containers[0].name
value: job-download-model
- equal:
path: spec.template.spec.containers[0].image
value: amazon/aws-cli:2.6.4
- equal:
path: spec.template.spec.restartPolicy
value: OnFailure
- it: should not create job when modelDownload is disabled
set:
extraInit:
modelDownload:
enabled: false
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args: [ "-c", "wait" ]
downloadJob:
command: [ "/bin/bash" ]
args: [ "-c", "download" ]
initContainers:
- name: llm-d-routing-proxy
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
pvcStorage: "10Gi"
asserts:
- hasDocuments:
count: 0


@@ -0,0 +1,32 @@
suite: test pvc
templates:
- pvc.yaml
tests:
# Test Case: PVC Created When extraInit Defined
- it: should create pvc when extraInit is defined
set:
extraInit:
modelDownload:
enabled: true
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: ["/bin/bash"]
args: ["-c", "wait"]
downloadJob:
command: ["/bin/bash"]
args: ["-c", "download"]
pvcStorage: "10Gi"
asserts:
- hasDocuments:
count: 1
- isKind:
of: PersistentVolumeClaim
- equal:
path: spec.accessModes[0]
value: ReadWriteOnce
- equal:
path: spec.resources.requests.storage
value: 10Gi


@@ -0,0 +1,329 @@
{
"$schema": "http://json-schema.org/schema#",
"type": "object",
"properties": {
"image": {
"type": "object",
"properties": {
"repository": {
"type": "string"
},
"tag": {
"type": "string"
},
"command": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [
"command",
"repository",
"tag"
]
},
"containerPort": {
"type": "integer"
},
"serviceName": {
"type": "null"
},
"servicePort": {
"type": "integer"
},
"extraPorts": {
"type": "array"
},
"replicaCount": {
"type": "integer"
},
"deploymentStrategy": {
"type": "object"
},
"resources": {
"type": "object",
"properties": {
"requests": {
"type": "object",
"properties": {
"cpu": {
"type": "integer"
},
"memory": {
"type": "string"
},
"nvidia.com/gpu": {
"type": "integer"
}
},
"required": [
"cpu",
"memory",
"nvidia.com/gpu"
]
},
"limits": {
"type": "object",
"properties": {
"cpu": {
"type": "integer"
},
"memory": {
"type": "string"
},
"nvidia.com/gpu": {
"type": "integer"
}
},
"required": [
"cpu",
"memory",
"nvidia.com/gpu"
]
}
},
"required": [
"limits",
"requests"
]
},
"gpuModels": {
"type": "array",
"items": {
"type": "string"
}
},
"autoscaling": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"minReplicas": {
"type": "integer"
},
"maxReplicas": {
"type": "integer"
},
"targetCPUUtilizationPercentage": {
"type": "integer"
}
},
"required": [
"enabled",
"maxReplicas",
"minReplicas",
"targetCPUUtilizationPercentage"
]
},
"configs": {
"type": "object"
},
"secrets": {
"type": "object"
},
"externalConfigs": {
"type": "array"
},
"customObjects": {
"type": "array"
},
"maxUnavailablePodDisruptionBudget": {
"type": "string"
},
"extraInit": {
"type": "object",
"properties": {
"modelDownload": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"image": {
"type": "object",
"properties": {
"repository": {
"type": "string"
},
"tag": {
"type": "string"
},
"pullPolicy": {
"type": "string"
}
},
"required": ["repository", "tag", "pullPolicy"]
},
"waitContainer": {
"type": "object",
"properties": {
"command": {
"type": "array",
"items": {"type": "string"}
},
"args": {
"type": "array",
"items": {"type": "string"}
},
"env": {
"type": "array",
"items": {"type": "object"}
}
},
"required": ["command", "args"]
},
"downloadJob": {
"type": "object",
"properties": {
"command": {
"type": "array",
"items": {"type": "string"}
},
"args": {
"type": "array",
"items": {"type": "string"}
},
"env": {
"type": "array",
"items": {"type": "object"}
}
},
"required": ["command", "args"]
}
},
"required": ["enabled", "image", "waitContainer", "downloadJob"]
},
"initContainers": {
"type": "array",
"items": {"type": "object"}
},
"s3modelpath": {
"type": "string"
},
"pvcStorage": {
"type": "string"
},
"awsEc2MetadataDisabled": {
"type": "boolean"
}
},
"required": [
"modelDownload",
"initContainers",
"pvcStorage"
]
},
"extraContainers": {
"type": "array"
},
"readinessProbe": {
"type": "object",
"properties": {
"initialDelaySeconds": {
"type": "integer"
},
"periodSeconds": {
"type": "integer"
},
"failureThreshold": {
"type": "integer"
},
"httpGet": {
"type": "object",
"properties": {
"path": {
"type": "string"
},
"port": {
"type": "integer"
}
},
"required": [
"path",
"port"
]
}
},
"required": [
"failureThreshold",
"httpGet",
"initialDelaySeconds",
"periodSeconds"
]
},
"livenessProbe": {
"type": "object",
"properties": {
"initialDelaySeconds": {
"type": "integer"
},
"failureThreshold": {
"type": "integer"
},
"periodSeconds": {
"type": "integer"
},
"httpGet": {
"type": "object",
"properties": {
"path": {
"type": "string"
},
"port": {
"type": "integer"
}
},
"required": [
"path",
"port"
]
}
},
"required": [
"failureThreshold",
"httpGet",
"initialDelaySeconds",
"periodSeconds"
]
},
"labels": {
"type": "object",
"properties": {
"environment": {
"type": "string"
},
"release": {
"type": "string"
}
},
"required": [
"environment",
"release"
]
}
},
"required": [
"autoscaling",
"configs",
"containerPort",
"customObjects",
"deploymentStrategy",
"externalConfigs",
"extraContainers",
"extraInit",
"extraPorts",
"gpuModels",
"image",
"labels",
"livenessProbe",
"maxUnavailablePodDisruptionBudget",
"readinessProbe",
"replicaCount",
"resources",
"secrets",
"servicePort"
]
}


@@ -0,0 +1,174 @@
# -- Default values for chart vllm
# -- Declare variables to be passed into your templates.
# -- Image configuration
image:
# -- Image repository
repository: "vllm/vllm-openai"
# -- Image tag
tag: "latest"
# -- Container launch command
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
# -- Container port
containerPort: 8000
# -- Service name
serviceName:
# -- Service port
servicePort: 80
# -- Additional ports configuration
extraPorts: []
# -- Number of replicas
replicaCount: 1
# -- Deployment strategy configuration
deploymentStrategy: {}
# -- Resource configuration
resources:
requests:
# -- Number of CPUs
cpu: 4
# -- CPU memory configuration
memory: 16Gi
# -- Number of gpus used
nvidia.com/gpu: 1
limits:
# -- Number of CPUs
cpu: 4
# -- CPU memory configuration
memory: 16Gi
# -- Number of gpus used
nvidia.com/gpu: 1
# -- Type of gpu used
gpuModels:
- "TYPE_GPU_USED"
# -- Autoscaling configuration
autoscaling:
# -- Enable autoscaling
enabled: false
# -- Minimum replicas
minReplicas: 1
# -- Maximum replicas
maxReplicas: 100
# -- Target CPU utilization for autoscaling
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
# -- Configmap
configs: {}
# -- Secrets configuration
secrets: {}
# -- External configuration
externalConfigs: []
# -- Custom Objects configuration
customObjects: []
# -- Disruption Budget Configuration
maxUnavailablePodDisruptionBudget: ""
# -- Additional configuration for the init container
extraInit:
# -- Model download functionality (optional)
modelDownload:
# -- Enable model download job and wait container
enabled: true
# -- Image configuration for model download operations
image:
# -- Image repository
repository: "amazon/aws-cli"
# -- Image tag
tag: "2.6.4"
# -- Image pull policy
pullPolicy: "IfNotPresent"
# -- Wait container configuration (init container that waits for model to be ready)
waitContainer:
# -- Command to execute
command: ["/bin/bash"]
# -- Arguments for the wait container
args:
- "-eucx"
- "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
# -- Environment variables (optional, overrides S3 defaults entirely if specified)
# env:
# - name: HUGGING_FACE_HUB_TOKEN
# value: "your-token"
# - name: MODEL_ID
# value: "meta-llama/Llama-2-7b"
# -- Download job configuration (job that actually downloads the model)
downloadJob:
# -- Command to execute
command: ["/bin/bash"]
# -- Arguments for the download job
args:
- "-eucx"
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
# -- Environment variables (optional, overrides S3 defaults entirely if specified)
# env:
# - name: HUGGING_FACE_HUB_TOKEN
# value: "your-token"
# - name: MODEL_ID
# value: "meta-llama/Llama-2-7b"
# -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
initContainers: []
# Example for llm-d sidecar:
# initContainers:
# - name: llm-d-routing-proxy
# image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
# imagePullPolicy: IfNotPresent
# ports:
# - containerPort: 8080
# name: proxy
# securityContext:
# runAsUser: 1000
  # -- Path of the model in the S3 bucket that hosts the model weights and config files
s3modelpath: "relative_s3_model_path/opt-125m"
# -- Storage size for the PVC
pvcStorage: "1Gi"
# -- Disable AWS EC2 metadata service
awsEc2MetadataDisabled: true
# -- Additional containers configuration
extraContainers: []
# -- Readiness probe configuration
readinessProbe:
# -- Number of seconds after the container has started before readiness probe is initiated
initialDelaySeconds: 5
# -- How often (in seconds) to perform the readiness probe
periodSeconds: 5
  # -- Number of consecutive probe failures after which Kubernetes considers the overall check failed and marks the container as not ready
failureThreshold: 3
# -- Configuration of the Kubelet http request on the server
httpGet:
# -- Path to access on the HTTP server
path: /health
# -- Name or number of the port to access on the container, on which the server is listening
port: 8000
# -- Liveness probe configuration
livenessProbe:
# -- Number of seconds after the container has started before liveness probe is initiated
initialDelaySeconds: 15
  # -- Number of consecutive probe failures after which Kubernetes considers the overall check failed and treats the container as not alive
failureThreshold: 3
# -- How often (in seconds) to perform the liveness probe
periodSeconds: 10
# -- Configuration of the Kubelet http request on the server
httpGet:
# -- Path to access on the HTTP server
path: /health
# -- Name or number of the port to access on the container, on which the server is listening
port: 8000
labels:
environment: "test"
release: "test"


@@ -0,0 +1,87 @@
# Monitoring Dashboards
This directory contains monitoring dashboard configurations for vLLM, providing
comprehensive observability for your vLLM deployments.
## Dashboard Platforms
We provide dashboards for two popular observability platforms:
- **[Grafana](https://grafana.com)**
- **[Perses](https://perses.dev)**
## Dashboard Format Approach
All dashboards are provided in **native formats** that work across different
deployment methods:
### Grafana (JSON)
- ✅ Works with any Grafana instance (cloud, self-hosted, Docker)
- ✅ Direct import via Grafana UI or API
- ✅ Can be wrapped in Kubernetes operators when needed
- ✅ No vendor lock-in or deployment dependencies
### Perses (YAML)
- ✅ Works with standalone Perses instances
- ✅ Compatible with Perses API and CLI
- ✅ Supports Dashboard-as-Code workflows
- ✅ Can be wrapped in Kubernetes operators when needed
## Dashboard Contents
Both platforms provide equivalent monitoring capabilities:
| Dashboard | Description |
|-----------|-------------|
| **Performance Statistics** | Tracks latency, throughput, and performance metrics |
| **Query Statistics** | Monitors request volume, query performance, and KPIs |
## Quick Start
First, navigate to this example's directory:
```bash
cd examples/online_serving/dashboards
```
### Grafana
Import the JSON directly into the Grafana UI, or use the API:
```bash
curl -X POST http://grafana/api/dashboards/db \
-H "Content-Type: application/json" \
-d @grafana/performance_statistics.json
```
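A real Grafana instance will normally require authentication; assuming a service-account token is available in `$GRAFANA_TOKEN`, the same call looks like:

```bash
curl -X POST http://grafana/api/dashboards/db \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d @grafana/performance_statistics.json
```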
### Perses
Import via the Perses CLI:
```bash
percli apply -f perses/performance_statistics.yaml
```
## Requirements
- **Prometheus** metrics from your vLLM deployment
- **Data source** configured in your monitoring platform
- **vLLM metrics** enabled and accessible
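A quick way to confirm the last requirement is to hit the metrics endpoint of a running vLLM server (the default port 8000 is assumed here) and look for the `vllm:`-prefixed series these dashboards query:

```bash
curl -s http://localhost:8000/metrics | grep "^vllm:" | head
```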
## Platform-Specific Documentation
For detailed deployment instructions and platform-specific options, see:
- **[Grafana Documentation](./grafana)** - JSON dashboards, operator usage, manual import
- **[Perses Documentation](./perses)** - YAML specs, CLI usage, operator wrapping
## Contributing
When adding new dashboards, please:
1. Provide native formats (JSON for Grafana, YAML specs for Perses)
2. Update platform-specific README files
3. Ensure dashboards work across deployment methods
4. Test with the latest platform versions


@@ -0,0 +1,59 @@
# Grafana Dashboards for vLLM Monitoring
This directory contains Grafana dashboard configurations (as JSON) designed to monitor
vLLM performance and metrics.
## Requirements
- Grafana 8.0+
- Prometheus data source configured in Grafana
- vLLM deployment with Prometheus metrics enabled
## Dashboard Descriptions
- **performance_statistics.json**: Tracks performance metrics including latency and
throughput for your vLLM service.
- **query_statistics.json**: Tracks query performance, request volume, and key
performance indicators for your vLLM service.
## Deployment Options
### Manual Import (Recommended)
The easiest way to use these dashboards is to manually import the JSON configurations
directly into your Grafana instance:
1. Navigate to your Grafana instance
2. Click the '+' icon in the sidebar
3. Select 'Import'
4. Copy and paste the JSON content from the dashboard files, or upload the JSON files
directly
### Grafana Operator
If you're using the [Grafana Operator](https://github.com/grafana-operator/grafana-operator)
in Kubernetes, you can wrap these JSON configurations in a `GrafanaDashboard` custom
resource:
```yaml
# Note: Adjust the instanceSelector to match your Grafana instance's labels
# You can check with: kubectl get grafana -o yaml
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: vllm-performance-dashboard
spec:
instanceSelector:
matchLabels:
dashboards: grafana # Adjust to match your Grafana instance labels
folder: "vLLM Monitoring"
json: |
# Replace this comment with the complete JSON content from
# performance_statistics.json - The JSON should start with { and end with }
```
Then apply to your cluster:
```bash
kubectl apply -f your-dashboard.yaml -n <namespace>
```
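If the operator is running, the dashboard resource should be picked up shortly after; assuming a standard grafana-operator install, it can be listed with:

```bash
kubectl get grafanadashboards -n <namespace>
```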

File diff suppressed because it is too large.


@@ -0,0 +1,760 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "High-level overview of VLLM model deployment behavior and key performance indicators. Designed for Data Scientists and Product Managers to monitor request volume, token throughput, and latency",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 47,
"links": [],
"panels": [
{
"collapsed": true,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 20,
"panels": [],
"title": "Request Over Time",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "req/s"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 1 },
"id": 1,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"editorMode": "code",
"expr": "sum by (model_name) (\n rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval])\n)",
"interval": "1",
"legendFormat": "{{model_name}}",
"range": true,
"refId": "A"
}
],
"title": "Successful Requests Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "req/s"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Requests Avg Rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultaions": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "p50 Latency",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 4 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "p90 Latency",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 4 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "p99 Latency",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
"id": 19,
"panels": [],
"title": "Size Distribution",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"fillOpacity": 80,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineWidth": 1,
"stacking": { "group": "A", "mode": "none" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 8 },
"id": 6,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum by (le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "{{model_name}} le={{le}}",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size Distribution",
"type": "histogram"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "calculation ": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 8 },
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size p90",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 8 },
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size p50",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultaion": { "index": 0, "text": "mean" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 11 },
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))\n/\nsum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size Avg",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 11 },
"id": 10,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size p99",
"type": "stat"
},
{
"collapsed": true,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 18,
"panels": [],
"title": "Input Token Over Time",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 15 },
"id": 11,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "{{model_name}}",
"range": true,
"refId": "A"
}
],
"title": "Input Tokens Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 15 },
"id": 12,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Tokens/Sec Avg",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
"id": 17,
"panels": [],
"title": "Output Token Over Time",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 22 },
"id": 13,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "{{model_name}}",
"range": true,
"refId": "A"
}
],
"title": "Output Tokens Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 22 },
"id": 14,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Output Tokens/Sec Avg",
"type": "stat"
}
],
"preload": false,
"schemaVersion": 40,
"tags": [],
"templating": {
"list": [
{
"current": { "text": "Prometheus", "value": "4184fc20-68a7-483a-8d9b-7caa59c680dd" },
"label": "datasource",
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"refresh": 1,
"type": "datasource"
},
{
"current": { "text": ["All"], "value": ["$__all"] },
"definition": "label_values(vllm:request_success_total,model_name)",
"includeAll": true,
"label": "Deployment_ID",
"multi": true,
"name": "Deployment_id",
"options": [],
"query": {
"qryType": 1,
"query": "label_values(vllm:request_success_total,model_name)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": { "text": "All hours", "value": "All hours" },
"hide": 2,
"label": "Rush Hours Only",
"name": "rush_hours",
"options": [
{ "selected": true, "text": "false", "value": "All hours" },
{ "selected": false, "text": "true", "value": "Rush hours" }
],
"query": "false : All hours, true : Rush hours",
"type": "custom"
},
{
"current": { "text": "All", "value": "All" },
"hide": 2,
"label": "Rush Hours Type",
"name": "rush_hours_type",
"options": [
{ "selected": true, "text": "^All__.*$", "value": "All" },
{ "selected": false, "text": "^Static__.*$", "value": "Static" },
{ "selected": false, "text": "^Dynamic__.*$", "value": "Dynamic" }
],
"query": "^All__.*$ : All, ^Static__.*$ : Static, ^Dynamic__.*$ : Dynamic",
"type": "custom"
},
{
"current": { "text": "", "value": "" },
"hide": 2,
"name": "query0",
"options": [],
"query": "",
"refresh": 1,
"regex": "",
"type": "query"
}
]
},
"time": { "from": "now-12h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Query Statistics_New4",
"uid": "query-statistics4",
"version": 2,
"weekStart": ""
}


@@ -0,0 +1,48 @@
# Perses Dashboards for vLLM Monitoring
This directory contains Perses dashboard configurations designed to monitor vLLM
performance and metrics.
## Requirements
- Perses instance (standalone or via operator)
- Prometheus data source configured in Perses
- vLLM deployment with Prometheus metrics enabled
## Dashboard Format
We provide dashboards in the **native Perses YAML format** that works across all
deployment methods:
- **Files**: `*.yaml` (native Perses dashboard specifications)
- **Format**: Pure dashboard specifications that work everywhere
- **Usage**: Works with standalone Perses, API imports, CLI, and file provisioning
- **Kubernetes**: Directly compatible with Perses Operator
## Dashboard Descriptions
- **performance_statistics.yaml**: Performance metrics with aggregated latency
statistics
- **query_statistics.yaml**: Query performance and deployment metrics
## Deployment Options
### Direct Import to Perses
Import the dashboard specifications via Perses API or CLI:
```bash
percli apply -f performance_statistics.yaml
```
### Perses Operator (Kubernetes)
The native YAML format works directly with the Perses Operator:
```bash
kubectl apply -f performance_statistics.yaml -n <namespace>
```
### File Provisioning
Place the YAML files in a Perses provisioning folder for automatic loading.


@@ -0,0 +1,764 @@
kind: PersesDashboard
metadata:
name: performance-statistics
createdAt: 0001-01-01T00:00:00Z
updatedAt: 0001-01-01T00:00:00Z
version: 0
project: ""
spec:
display:
name: Performance Statistics
variables:
- kind: ListVariable
spec:
display:
name: Deployment_ID
hidden: false
name: Deployment_id
allowAllValue: true
allowMultiple: true
defaultValue:
- $__all
sort: alphabetical-asc
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
labelName: model_name
matchers:
# Any one vllm metric that always carries model_name
- vllm:generation_tokens_total{}
panels:
"1":
kind: Panel
spec:
display:
name: E2E Latency over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
# avg latency by model = sum(rate(sum)) / sum(rate(count))
query: >
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
"2":
kind: Panel
spec:
display:
name: E2E Latency (Avg)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"3":
kind: Panel
spec:
display:
name: E2E Latency (P50)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"4":
kind: Panel
spec:
display:
name: E2E Latency (P90)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"5":
kind: Panel
spec:
display:
name: E2E Latency (P99)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"6":
kind: Panel
spec:
display:
name: TTFT over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
"7":
kind: Panel
spec:
display:
name: TTFT (Avg)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"8":
kind: Panel
spec:
display:
name: TTFT (P50)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"9":
kind: Panel
spec:
display:
name: TTFT (P90)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"10":
kind: Panel
spec:
display:
name: TTFT (P99)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"11":
kind: Panel
spec:
display:
name: ITL (Time per Output Token) over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p50'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p90'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p99'
"12":
kind: Panel
spec:
display:
name: ITL (Avg)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"13":
kind: Panel
spec:
display:
name: ITL (P50)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"14":
kind: Panel
spec:
display:
name: ITL (P90)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"15":
kind: Panel
spec:
display:
name: ITL (P99)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"16":
kind: Panel
spec:
display:
name: TPS (Tokens/sec) over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}} generation'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}} prompt'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
# overall iteration tokens/sec if exposed
query: >
rate(vllm:iteration_tokens_total_count[$__interval])
seriesNameFormat: 'iteration overall'
"17":
kind: Panel
spec:
display:
name: KV Cache Usage (avg %)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
# Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
query: >
100 * avg(vllm:kv_cache_usage_perc)
"18":
kind: Panel
spec:
display:
name: Running Requests by Pod
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (pod) (vllm:num_requests_running)
seriesNameFormat: '{{pod}}'
"19":
kind: Panel
spec:
display:
name: Waiting Requests by Pod
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (pod) (vllm:num_requests_waiting)
seriesNameFormat: '{{pod}}'
"20":
kind: Panel
spec:
display:
name: Running Requests (sum)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: sum(vllm:num_requests_running)
"21":
kind: Panel
spec:
display:
name: Waiting Requests (sum)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: sum(vllm:num_requests_waiting)
layouts:
- kind: Grid
spec:
display:
title: Overview
items:
- x: 0
y: 0
width: 6
height: 3
content: { $ref: '#/spec/panels/17' } # KV cache %
- x: 6
y: 0
width: 6
height: 3
content: { $ref: '#/spec/panels/20' } # running sum
- x: 12
y: 0
width: 6
height: 3
content: { $ref: '#/spec/panels/21' } # waiting sum
- kind: Grid
spec:
display:
title: E2E Latency
items:
- x: 0
y: 1
width: 10
height: 6
content: { $ref: '#/spec/panels/1' }
- x: 10
y: 1
width: 7
height: 3
content: { $ref: '#/spec/panels/2' }
- x: 17
y: 1
width: 7
height: 3
content: { $ref: '#/spec/panels/3' }
- x: 10
y: 4
width: 7
height: 3
content: { $ref: '#/spec/panels/4' }
- x: 17
y: 4
width: 7
height: 3
content: { $ref: '#/spec/panels/5' }
- kind: Grid
spec:
display:
title: TTFT
items:
- x: 0
y: 8
width: 10
height: 6
content: { $ref: '#/spec/panels/6' }
- x: 10
y: 8
width: 7
height: 3
content: { $ref: '#/spec/panels/7' }
- x: 17
y: 8
width: 7
height: 3
content: { $ref: '#/spec/panels/8' }
- x: 10
y: 11
width: 7
height: 3
content: { $ref: '#/spec/panels/9' }
- x: 17
y: 11
width: 7
height: 3
content: { $ref: '#/spec/panels/10' }
- kind: Grid
spec:
display:
title: ITL (Time per Output Token)
items:
- x: 0
y: 15
width: 10
height: 6
content: { $ref: '#/spec/panels/11' }
- x: 10
y: 15
width: 7
height: 3
content: { $ref: '#/spec/panels/12' }
- x: 17
y: 15
width: 7
height: 3
content: { $ref: '#/spec/panels/13' }
- x: 10
y: 18
width: 7
height: 3
content: { $ref: '#/spec/panels/14' }
- x: 17
y: 18
width: 7
height: 3
content: { $ref: '#/spec/panels/15' }
- kind: Grid
spec:
display:
title: TPS (Prompt / Generation / Iteration)
items:
- x: 0
y: 22
width: 14
height: 6
content: { $ref: '#/spec/panels/16' }
- kind: Grid
spec:
display:
title: Per-Pod Request State
items:
- x: 0
y: 28
width: 12
height: 6
content: { $ref: '#/spec/panels/18' }
- x: 12
y: 28
width: 12
height: 6
content: { $ref: '#/spec/panels/19' }

View File

@@ -0,0 +1,392 @@
kind: PersesDashboard
metadata:
name: query-statistics
createdAt: 0001-01-01T00:00:00Z
updatedAt: 0001-01-01T00:00:00Z
version: 0
project: ""
spec:
display:
name: Query Statistics_New
variables:
- kind: ListVariable
spec:
name: NS
display: { name: Namespace }
allowMultiple: false
defaultValue: llm-d
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
labelName: namespace
matchers:
- up{service=~".*vllm.*"}
- kind: ListVariable
spec:
name: SVC
display: { name: Service }
allowMultiple: false
defaultValue: vllm-qwen2-0-5b-sim
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
labelName: service
matchers:
- up{namespace="$NS",service=~".*vllm.*"}
- kind: ListVariable
spec:
name: MODEL
display: { name: Model (real vLLM) }
allowAllValue: true
allowMultiple: true
defaultValue: ["$__all"]
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
labelName: model_name
matchers:
- vllm:request_success_total{namespace="$NS",service="$SVC"}
panels:
# --- Core (works on Simulator & Real) ---
core_running_now:
kind: Panel
spec:
display: { name: Running Requests (now) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum(vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_waiting_now:
kind: Panel
spec:
display: { name: Waiting Requests (now) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum(vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_kv_usage_now:
kind: Panel
spec:
        display: { name: KV Cache Usage (0-1) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_running_ts:
kind: Panel
spec:
display: { name: Running Over Time }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (service) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_waiting_ts:
kind: Panel
spec:
display: { name: Waiting Over Time }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (service) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_targets_up:
kind: Panel
spec:
display: { name: Scrape Targets Up }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: count(up{namespace="$NS",service="$SVC"} == 1) or vector(0)
minStep: "15s"
# --- KV Cache as Percent (works on Simulator & Real) ---
core_kv_usage_pct_now:
kind: Panel
spec:
display: { name: KV Cache Usage (%) now }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
# multiply by 100 to present percentage; omit format.unit to avoid schema conflicts
query: (avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
minStep: "15s"
core_kv_usage_pct_ts:
kind: Panel
spec:
display: { name: KV Cache Usage (%) over time }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: (avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
minStep: "15s"
# --- Per-Pod breakdowns (works on Simulator & Real) ---
per_pod_running_ts:
kind: Panel
spec:
display: { name: Running by Pod }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (pod) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
per_pod_waiting_ts:
kind: Panel
spec:
display: { name: Waiting by Pod }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (pod) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
per_pod_kv_pct_ts:
kind: Panel
spec:
display: { name: KV Cache (%) by Pod }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
# if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty
query: (avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
minStep: "15s"
# --- Real vLLM only (zeros on simulator) ---
real_req_rate_ts:
kind: Panel
spec:
display: { name: Request Rate (real vLLM) }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (model_name) (rate(vllm:request_success_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
minStep: "15s"
real_p50:
kind: Panel
spec:
display: { name: p50 Latency (real vLLM) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: histogram_quantile(0.50, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
minStep: "15s"
real_p90:
kind: Panel
spec:
display: { name: p90 Latency (real vLLM) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: histogram_quantile(0.90, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
minStep: "15s"
real_p99:
kind: Panel
spec:
display: { name: p99 Latency (real vLLM) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: histogram_quantile(0.99, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
minStep: "15s"
real_input_tokens_ts:
kind: Panel
spec:
display: { name: Input Tokens / sec (real vLLM) }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (model_name) (rate(vllm:prompt_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
minStep: "15s"
real_output_tokens_ts:
kind: Panel
spec:
display: { name: Output Tokens / sec (real vLLM) }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (model_name) (rate(vllm:generation_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
minStep: "15s"
layouts:
- kind: Grid
spec:
display: { title: Core (Sim & Real) }
items:
- { x: 0, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_running_now' } }
- { x: 6, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_waiting_now' } }
- { x: 12, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_now' } }
- { x: 18, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_targets_up' } }
- { x: 0, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_running_ts' } }
- { x: 12, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_waiting_ts' } }
- kind: Grid
spec:
display: { title: KV Cache (%) }
items:
- { x: 0, y: 9, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_pct_now' } }
- { x: 6, y: 9, width: 18, height: 6, content: { $ref: '#/spec/panels/core_kv_usage_pct_ts' } }
- kind: Grid
spec:
display: { title: Per-Pod breakdowns }
items:
- { x: 0, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_running_ts' } }
- { x: 12, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_waiting_ts' } }
- { x: 0, y: 21, width: 24, height: 6, content: { $ref: '#/spec/panels/per_pod_kv_pct_ts' } }
- kind: Grid
spec:
display: { title: Real vLLM only (shows 0 on simulator) }
items:
- { x: 0, y: 27, width: 12, height: 6, content: { $ref: '#/spec/panels/real_req_rate_ts' } }
- { x: 12, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p50' } }
- { x: 16, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p90' } }
- { x: 20, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p99' } }
- { x: 0, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_input_tokens_ts' } }
- { x: 12, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_output_tokens_ts' } }

View File

@@ -0,0 +1,119 @@
# Disaggregated Encoder
This directory contains example scripts that demonstrate the disaggregated encoder (EPD) features of vLLM.
For a detailed explanation of the EPD features, please refer to the [Disaggregated Encoder Feature Documentation](../../../docs/features/disagg_encoder.md).
## Files
- `disagg_epd_proxy.py` - Proxy script that demonstrates the XeYpZd setup (X encode instances, Y prefill instances, Z decode instances). Currently stable for the 1e1p1d configuration.
- `disagg_1e1p1d_example.sh` - Sets up the 1e1p1d configuration, runs the VisionArena benchmark, and processes a single request with a local image.
- `disagg_1e1pd_example.sh` - Sets up the 1e1pd configuration, runs the VisionArena benchmark, and processes a single request with a local image.
### Custom Configuration
```bash
# Use specific GPUs
GPU_E=0 GPU_PD=1 GPU_P=1 GPU_D=2 bash disagg_1e1p1d_example.sh
# Use specific ports
ENDPOINT_PORT=10001 bash disagg_1e1p1d_example.sh
# Use specific model
MODEL="Qwen/Qwen2.5-VL-3B-Instruct" bash disagg_1e1p1d_example.sh
# Use specific storage path
EC_SHARED_STORAGE_PATH="/tmp/my_ec_cache" bash disagg_1e1p1d_example.sh
```
## Encoder Instances
Encoder engines should be launched with the following flags (a minimal launch sketch follows this list):
- `--enforce-eager` **(required)** The current EPD implementation is only compatible with encoder instances running in this mode.
- `--no-enable-prefix-caching` **(required)** Encoder instances do not consume KV cache; prefix caching is disabled to avoid conflicts with other features.
- `--max-num-batched-tokens=<large value>` **(default: 2048)** This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager.
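Putting the flags together, a minimal encoder-only launch could look like the sketch below; the model, port, and paths are placeholders, and the complete commands are in the example scripts:
```bash
# Minimal sketch of an encoder-only instance using the flags above.
# Model, port, media path, and cache path are placeholders.
vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
    --port 19534 \
    --enforce-eager \
    --no-enable-prefix-caching \
    --max-num-batched-tokens 114688 \
    --allowed-local-media-path /path/to/media \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_producer",
        "ec_connector_extra_config": {"shared_storage_path": "/tmp/ec_cache"}
    }'
```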
## Local media inputs
To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance:
```bash
--allowed-local-media-path $MEDIA_PATH
```
The vLLM instances and `disagg_epd_proxy.py` support local URIs such as ```{"url": "file://'"$MEDIA_PATH_FILENAME"'"}``` as multimodal inputs. Each URI is passed unchanged from the proxy to the encoder instance so that the encoder can load the media locally.
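For example, a request sent through the proxy that references a local image might look like the following; the proxy port and image path are placeholders:
```bash
# Placeholder proxy port and image path -- adjust to your setup.
curl http://localhost:10001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-VL-3B-Instruct",
    "messages": [
      {"role": "user", "content": [
        {"type": "image_url", "image_url": {"url": "file:///path/to/media/example.jpg"}},
        {"type": "text", "text": "What is in this image?"}
      ]}
    ]
  }'
```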
## EC connector and KV transfer
The `ECExampleConnector` is used to store the encoder cache on local disk and facilitate its transfer. To enable the encoder disaggregation feature, add the following configuration:
```bash
# Add to encoder instance:
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}'
# Add to prefill/prefill+decode instance:
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}'
```
`$EC_SHARED_STORAGE_PATH` is the path where the EC connector temporarily stores the cache.
If you enable a dedicated prefill instance (i.e. `--prefill-servers-urls` is not disabled), you also need a `--kv-transfer-config` to enable PD disaggregation. Currently, we use the `NixlConnector` for this purpose. Refer to `tests/v1/kv_connector/nixl_integration` for more examples of PD disaggregation with Nixl.
```bash
# Add to prefill instance:
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_producer"
}'
# Add to decode instance:
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_consumer"
}'
```
## Proxy Instance Flags (`disagg_epd_proxy.py`)
| Flag | Description |
|------|-------------|
| `--encode-servers-urls` | Comma-separated list of encoder endpoints. Every multimodal item extracted from the request is fanned out to one of these URLs in a round-robin fashion. |
| `--prefill-servers-urls` | Comma-separated list of prefill endpoints. Set to `disable`, `none`, or `""` to skip the dedicated prefill phase and run E+PD (encoder + combined prefill/decode). |
| `--decode-servers-urls` | Comma-separated list of decode endpoints. Non-stream and stream paths both round-robin over this list. |
| `--host`, `--port` | Bind address for the proxy itself (defaults: `0.0.0.0:8000`). |
Example usage:
For E + PD setup:
```bash
$ python disagg_epd_proxy.py \
--encode-servers-urls "http://e1:8001,http://e2:8002" \
--prefill-servers-urls "disable" \
--decode-servers-urls "http://pd1:8003,http://pd2:8004"
```
For E + P + D setup:
```bash
$ python disagg_epd_proxy.py \
--encode-servers-urls "http://e1:8001,http://e2:8001" \
--prefill-servers-urls "http://p1:8003,http://p2:8004" \
--decode-servers-urls "http://d1:8005,http://d2:8006"
```
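Once the proxy is up, its `/health` endpoint aggregates the health of the encode, prefill, and decode clusters, which is a convenient way to verify the setup (the port is the proxy's, default 8000):
```bash
# The proxy reports per-cluster health; a disabled prefill cluster is
# reported as "empty". The port below is a placeholder for the proxy's port.
curl -s http://localhost:8000/health
```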

View File

@@ -0,0 +1,221 @@
#!/bin/bash
set -euo pipefail
declare -a PIDS=()
###############################################################################
# Configuration -- override via env before running
###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH
ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_PORT="${PREFILL_PORT:-19535}"
DECODE_PORT="${DECODE_PORT:-19536}"
PROXY_PORT="${PROXY_PORT:-10001}"
GPU_E="${GPU_E:-2}"
GPU_P="${GPU_P:-2}"
GPU_D="${GPU_D:-3}"
EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout
NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark
export UCX_TLS=all
export UCX_NET_DEVICES=all
###############################################################################
# Helpers
###############################################################################
# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)
START_TIME=$(date +"%Y%m%d_%H%M%S")
ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log
P_LOG=$LOG_PATH/p_${START_TIME}.log
D_LOG=$LOG_PATH/d_${START_TIME}.log
PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log
wait_for_server() {
local port=$1
timeout "$TIMEOUT_SECONDS" bash -c "
until curl -s localhost:$port/v1/chat/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
# Cleanup function
cleanup() {
echo "Stopping everything…"
trap - INT TERM USR1 # prevent re-entrancy
# Kill all tracked PIDs
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
echo "Killing process $pid"
kill "$pid" 2>/dev/null
fi
done
# Wait a moment for graceful shutdown
sleep 2
# Force kill any remaining processes
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
echo "Force killing process $pid"
kill -9 "$pid" 2>/dev/null
fi
done
# Kill the entire process group as backup
kill -- -$$ 2>/dev/null
echo "All processes stopped."
exit 0
}
trap cleanup INT
trap cleanup USR1
trap cleanup TERM
# clear previous cache
echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH
echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH
###############################################################################
# Encoder worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \
--port "$ENCODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
>"${ENC_LOG}" 2>&1 &
PIDS+=($!)
###############################################################################
# Prefill worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_P" \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--port "$PREFILL_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_producer"
}' \
>"${P_LOG}" 2>&1 &
PIDS+=($!)
###############################################################################
# Decode worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_D" \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--port "$DECODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_consumer"
}' \
>"${D_LOG}" 2>&1 &
PIDS+=($!)
# Wait for workers
wait_for_server $ENCODE_PORT
wait_for_server $PREFILL_PORT
wait_for_server $DECODE_PORT
###############################################################################
# Proxy
###############################################################################
python disagg_epd_proxy.py \
--host "0.0.0.0" \
--port "$PROXY_PORT" \
--encode-servers-urls "http://localhost:$ENCODE_PORT" \
--prefill-servers-urls "http://localhost:$PREFILL_PORT" \
--decode-servers-urls "http://localhost:$DECODE_PORT" \
>"${PROXY_LOG}" 2>&1 &
PIDS+=($!)
wait_for_server $PROXY_PORT
echo "All services are up!"
###############################################################################
# Benchmark
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--port $PROXY_PORT
PIDS+=($!)
###############################################################################
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
{"type": "text", "text": "What is in this image?"}
]}
]
}'
# cleanup
echo "cleanup..."
cleanup

View File

@@ -0,0 +1,186 @@
#!/bin/bash
set -euo pipefail
declare -a PIDS=()
###############################################################################
# Configuration -- override via env before running
###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH
ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
PROXY_PORT="${PROXY_PORT:-10001}"
GPU_E="${GPU_E:-0}"
GPU_PD="${GPU_PD:-1}"
EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout
NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark
###############################################################################
# Helpers
###############################################################################
# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)
START_TIME=$(date +"%Y%m%d_%H%M%S")
ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log
PD_LOG=$LOG_PATH/pd_${START_TIME}.log
PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log
wait_for_server() {
local port=$1
timeout "$TIMEOUT_SECONDS" bash -c "
until curl -s localhost:$port/v1/chat/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
# Cleanup function
cleanup() {
echo "Stopping everything…"
trap - INT TERM USR1 # prevent re-entrancy
# Kill all tracked PIDs
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
echo "Killing process $pid"
kill "$pid" 2>/dev/null
fi
done
# Wait a moment for graceful shutdown
sleep 2
# Force kill any remaining processes
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
echo "Force killing process $pid"
kill -9 "$pid" 2>/dev/null
fi
done
# Kill the entire process group as backup
kill -- -$$ 2>/dev/null
echo "All processes stopped."
exit 0
}
trap cleanup INT
trap cleanup USR1
trap cleanup TERM
# clear previous cache
echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH
echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH
###############################################################################
# Encoder worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \
--port "$ENCODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
>"${ENC_LOG}" 2>&1 &
PIDS+=($!)
###############################################################################
# Prefill+Decode worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--port "$PREFILL_DECODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
>"${PD_LOG}" 2>&1 &
PIDS+=($!)
# Wait for workers
wait_for_server $ENCODE_PORT
wait_for_server $PREFILL_DECODE_PORT
###############################################################################
# Proxy
###############################################################################
python disagg_epd_proxy.py \
--host "0.0.0.0" \
--port "$PROXY_PORT" \
--encode-servers-urls "http://localhost:$ENCODE_PORT" \
--prefill-servers-urls "disable" \
--decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
>"${PROXY_LOG}" 2>&1 &
PIDS+=($!)
wait_for_server $PROXY_PORT
echo "All services are up!"
###############################################################################
# Benchmark
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--port $PROXY_PORT
PIDS+=($!)
###############################################################################
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
{"type": "text", "text": "What is in this image?"}
]}
]
}'
# cleanup
echo "cleanup..."
cleanup

View File

@@ -0,0 +1,606 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
disagg_epd_proxy.py
Proxy that routes OpenAI-compatible “/v1/chat/completions” requests across up
to three clusters:
  • encode  (multimodal feature extraction)
  • prefill (optional; KV-cache producer for disaggregated prefill)
  • decode  (language-model inference)
For MM input we:
1. Extract *every* image/audio item.
2. Fire N concurrent requests to the encoder cluster
(one request per item, with **all text removed**).
3. Wait for all of them to succeed.
4. Forward the *original* request to a decode server.
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import os
import random
import uuid
from collections.abc import AsyncIterator
import aiohttp
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse
###############################################################################
# FastAPI app & global state
###############################################################################
logging.basicConfig(
level=logging.DEBUG, format="%(asctime)s %(levelname)s: %(message)s"
)
logger = logging.getLogger("proxy")
app = FastAPI()
encode_session: aiohttp.ClientSession | None = None
prefill_session: aiohttp.ClientSession | None = None
decode_session: aiohttp.ClientSession | None = None
###############################################################################
# Utils
###############################################################################
MM_TYPES = {"image_url", "audio_url", "input_audio"}
def extract_mm_items(request_data: dict) -> list[dict]:
"""
Return *all* image/audio items that appear anywhere in `messages`.
Each returned dict looks like:
{ "type": "image_url", "image_url": {...} }
"""
items: list[dict] = []
for msg in request_data.get("messages", []):
content = msg.get("content")
if not isinstance(content, list):
continue
for item in content:
if item.get("type") in MM_TYPES:
items.append(item)
return items
async def fanout_encoder_primer(
orig_request: dict,
e_urls: list[str],
req_id: str,
) -> None:
"""
1. Build one request *per MM item* with all text removed.
2. Send them concurrently to the encode cluster.
3. Raise if any of them fails.
"""
logger.info("[%s] Processing multimodal items...", req_id)
mm_items = extract_mm_items(orig_request)
if not mm_items:
logger.info("[%s] No multimodal items, skipping encoder", req_id)
return # nothing to do
logger.info("[%s] got %d multimodal items...", req_id, len(mm_items))
tasks = []
# Round-robin over encode servers to distribute load a bit
url_cycle = (e_urls[i % len(e_urls)] for i in range(len(mm_items)))
for idx, (item, target_url) in enumerate(zip(mm_items, url_cycle)):
# Derive a *child* request id: <parent>:<index>:<random-short>
child_req_id = f"{req_id}:{idx}:{uuid.uuid4().hex[:6]}"
headers = {"x-request-id": child_req_id}
encoder_req = {
# You *may* need to keep additional fields
"model": orig_request.get("model"),
"messages": [
{"role": "user", "content": [item]},
],
# Only need 1 token so the server actually runs the encoder path
"max_tokens": 1,
"stream": False,
}
tasks.append(
encode_session.post(
f"{target_url}/v1/chat/completions",
json=encoder_req,
headers=headers,
)
)
results = await asyncio.gather(*tasks, return_exceptions=True)
# Fail fast if any sub-request failed
for idx, r in enumerate(results):
if isinstance(r, Exception):
logger.error(
"[%s] Encoder request #%d raised exception: %s",
req_id,
idx,
r,
exc_info=r,
)
raise HTTPException(
status_code=502, detail=f"Encoder request failed: {str(r)}"
)
if r.status != 200:
try:
detail = await r.text()
except Exception:
detail = "<unable to read body>"
logger.error(
"[%s] Encoder request #%d returned status %s: %s",
req_id,
idx,
r.status,
detail,
)
raise HTTPException(
status_code=r.status,
detail=f"Encoder request failed: {detail}",
)
logger.info(
"[%s] All %d encoder requests completed successfully", req_id, len(mm_items)
)
async def maybe_prefill(
req_data: dict,
p_url: str,
req_id: str,
) -> dict:
"""
    - If p_url exists, run the prefill-only stage;
    - Return the request data augmented with kv_transfer_params (for the Nixl connector);
    - Otherwise, skip prefill and return the original request data for decode.
"""
if p_url:
logger.info("[%s] Processing through prefill: %s", req_id, p_url)
prefill_response = await process_prefill_stage(req_data, p_url, req_id)
# for nixl connector to facilitate kv transfer...
prefill_response_json = await prefill_response.json()
kv_transfer_params = prefill_response_json.get("kv_transfer_params", {})
if kv_transfer_params:
req_data["kv_transfer_params"] = kv_transfer_params
return req_data
else:
return req_data
async def process_prefill_stage(
req_data: dict,
p_url: str,
req_id: str,
) -> aiohttp.ClientResponse:
    """Run the prefill stage and return the prefill response (its body carries kv_transfer_params)."""
logger.info("[%s] Sending prefill request to: %s", req_id, p_url)
prefill_request = req_data.copy()
prefill_request["kv_transfer_params"] = {
"do_remote_decode": True,
"do_remote_prefill": False,
"remote_engine_id": None,
"remote_block_ids": None,
"remote_host": None,
"remote_port": None,
}
prefill_request["stream"] = False
prefill_request["max_tokens"] = 1
if "max_completion_tokens" in prefill_request:
prefill_request["max_completion_tokens"] = 1
if "stream_options" in prefill_request:
del prefill_request["stream_options"]
headers = {"x-request-id": req_id}
try:
prefill_response = await prefill_session.post(
f"{p_url}/v1/chat/completions", json=prefill_request, headers=headers
)
prefill_response.raise_for_status()
if prefill_response.status != 200:
error_text = await prefill_response.text()
logger.error(
"[%s] Prefill request failed with status %d: %s",
req_id,
prefill_response.status,
error_text,
)
raise HTTPException(
status_code=prefill_response.status,
detail={"error": "Prefill request failed", "message": error_text},
)
logger.info("[%s] Prefill request completed successfully", req_id)
return prefill_response
except Exception as e:
logger.error("Prefill processing failed: %s", str(e))
raise HTTPException(
status_code=500,
detail={"error": "Prefill processing error", "message": str(e)},
) from e
###############################################################################
# Middleware for request/response logging
###############################################################################
@app.middleware("http")
async def log_requests(request: Request, call_next):
"""Middleware to log all incoming requests and responses"""
req_id = request.headers.get("x-request-id", str(uuid.uuid4()))
# Log incoming request
logger.info(
">>> [%s] %s %s from %s",
req_id,
request.method,
request.url.path,
request.client.host if request.client else "unknown",
)
try:
# Process request
response = await call_next(request)
# Log response
logger.info(
"<<< [%s] %s %s completed with status %d",
req_id,
request.method,
request.url.path,
response.status_code,
)
return response
except Exception as e:
# Log errors
logger.exception(
"!!! [%s] %s %s failed with error: %s",
req_id,
request.method,
request.url.path,
str(e),
)
raise
###############################################################################
# FastAPI lifecycle
###############################################################################
@app.on_event("startup")
async def on_startup() -> None:
global encode_session, prefill_session, decode_session
timeout = aiohttp.ClientTimeout(total=100_000)
connector = aiohttp.TCPConnector(limit=0, force_close=False)
encode_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
if app.state.p_urls:
# only setup if prefill instance(s) exist
prefill_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
decode_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
@app.on_event("shutdown")
async def on_shutdown() -> None:
global encode_session, prefill_session, decode_session
if encode_session:
await encode_session.close()
if prefill_session:
await prefill_session.close()
if decode_session:
await decode_session.close()
###############################################################################
# Core forwarding
###############################################################################
async def forward_non_stream(
req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str
) -> dict:
try:
# Step 1: Process through Encoder instance (if has MM input)
await fanout_encoder_primer(req_data, e_urls, req_id)
# Step 2: Process through Prefill instance
req_data = await maybe_prefill(req_data, p_url, req_id)
# Step 3: Process through Decode instance
logger.info("[%s] Forwarding to decode: %s", req_id, d_url)
headers = {"x-request-id": req_id}
# Non-streaming response
async with decode_session.post(
f"{d_url}/v1/chat/completions", json=req_data, headers=headers
) as resp:
resp.raise_for_status()
return await resp.json()
except HTTPException:
raise
except Exception as e:
logger.exception("[%s] Error in forward_non_stream: %s", req_id, str(e))
raise HTTPException(status_code=500, detail=f"Proxy error: {str(e)}") from e
async def forward_stream(
req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str
) -> AsyncIterator[str]:
try:
# Step 1: Process through Encoder instance (if has MM input)
await fanout_encoder_primer(req_data, e_urls, req_id)
# Step 2: Process through Prefill instance
req_data = await maybe_prefill(req_data, p_url, req_id)
# Step 3: Process through Decode instance
logger.info("[%s] Starting streaming from decode: %s", req_id, d_url)
headers = {"x-request-id": req_id}
# Streaming response
async with decode_session.post(
f"{d_url}/v1/chat/completions",
json=req_data,
headers=headers,
) as resp:
resp.raise_for_status()
async for chunk in resp.content.iter_chunked(1024):
if chunk:
yield chunk.decode("utf-8", errors="ignore")
logger.info("[%s] Streaming completed", req_id)
except HTTPException:
logger.exception("[%s] HTTPException in forward_stream", req_id)
raise
except Exception as e:
logger.exception("[%s] Error in forward_stream: %s", req_id, str(e))
raise HTTPException(
status_code=500, detail=f"Proxy streaming error: {str(e)}"
) from e
###############################################################################
# Public routes
###############################################################################
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
try:
req_data = await request.json()
req_id = request.headers.get("x-request-id", str(uuid.uuid4()))
e_urls = app.state.e_urls # we want the full list for fan-out
p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
d_url = random.choice(app.state.d_urls)
is_streaming = req_data.get("stream", False)
if is_streaming:
return StreamingResponse(
forward_stream(req_data, req_id, e_urls, p_url, d_url),
media_type="text/event-stream",
)
result = await forward_non_stream(req_data, req_id, e_urls, p_url, d_url)
return JSONResponse(content=result)
except HTTPException:
raise
except Exception as e:
logger.exception("Error in chat_completions endpoint: %s", str(e))
raise HTTPException(
status_code=500, detail=f"Request processing error: {str(e)}"
) from e
@app.get("/v1/models")
async def list_models():
async with decode_session.get(f"{app.state.d_urls[0]}/v1/models") as resp:
resp.raise_for_status()
return await resp.json()
@app.get("/health")
async def health_check():
async def healthy(urls):
if not urls:
return "empty"
for u in urls:
try:
async with encode_session.get(f"{u}/health") as resp:
resp.raise_for_status()
except Exception:
return "unhealthy"
return "healthy"
e_status, p_status, d_status = await asyncio.gather(
healthy(app.state.e_urls), healthy(app.state.p_urls), healthy(app.state.d_urls)
)
overall_healthy = all(
status != "unhealthy" for status in (e_status, p_status, d_status)
)
status_code = 200 if overall_healthy else 503
return JSONResponse(
{
"proxy": "healthy",
"encode_cluster": e_status,
"prefill_cluster": p_status,
"decode_cluster": d_status,
},
status_code=status_code,
)
###############################################################################
# Simple profiler fan-out (unchanged except for sessions)
###############################################################################
async def _post_if_available(
session: aiohttp.ClientSession,
url: str,
payload: dict,
headers: dict,
) -> dict | None:
"""
POST `payload` to `url`.
Returns
-------
• The decoded JSON body on success (2xx)
• None if the endpoint does not exist (404)
• Raises for anything else.
"""
try:
resp = await session.post(url, json=payload, headers=headers)
if resp.status == 404: # profiling disabled on that server
logger.warning("Profiling endpoint missing on %s", url)
return None
resp.raise_for_status()
return await resp.json(content_type=None)
except aiohttp.ClientResponseError as exc:
# Pass 404 through the branch above, re-raise everything else
if exc.status == 404:
logger.warning("Profiling endpoint missing on %s", url)
return None
raise
except Exception:
# Network errors etc.: propagate
raise
async def _profile_cmd(cmd: str, payload: dict, e_url: str, p_url: str, d_url: str):
"""
Fire & forget to both clusters, tolerate 404.
"""
headers = {"Authorization": f"Bearer {os.getenv('OPENAI_API_KEY', '')}"}
encode_task = _post_if_available(
encode_session, f"{e_url}/{cmd}_profile", payload, headers
)
prefill_task = (
_post_if_available(prefill_session, f"{p_url}/{cmd}_profile", payload, headers)
if p_url is not None
else asyncio.sleep(0)
)
decode_task = _post_if_available(
decode_session, f"{d_url}/{cmd}_profile", payload, headers
)
encode_res, prefill_res, decode_res = await asyncio.gather(
encode_task, prefill_task, decode_task
)
    # If *all* clusters said “I don't have that route”, surface an error
if encode_res is prefill_res is decode_res is None:
raise HTTPException(
status_code=503,
detail="Profiling endpoints are disabled on all clusters",
)
return {
"encode": encode_res, # may be None
"prefill": prefill_res, # may be None
"decode": decode_res, # may be None
}
@app.post("/start_profile")
async def start_profile(request: Request):
body = await request.json()
# TODO: handle multi urls properly
e_url = random.choice(app.state.e_urls)
p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
d_url = random.choice(app.state.d_urls)
return await _profile_cmd("start", body, e_url, p_url, d_url)
@app.post("/stop_profile")
async def stop_profile(request: Request):
body = await request.json()
# TODO: handle multi urls properly
e_url = random.choice(app.state.e_urls)
p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
d_url = random.choice(app.state.d_urls)
return await _profile_cmd("stop", body, e_url, p_url, d_url)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument(
"--encode-servers-urls",
required=True,
help='Comma-separated encode URLs ("http://e1:8001,http://e2:8001")',
)
parser.add_argument(
"--prefill-servers-urls",
required=True,
help=(
            'Comma-separated prefill URLs ("http://p1:8003,http://p2:8004") '
            'to enable E->P->D; set "disable" or "none" to enable E->PD'
),
)
parser.add_argument(
"--decode-servers-urls",
required=True,
help='Comma-separated decode URLs ("http://d1:8005,http://d2:8006")',
)
args = parser.parse_args()
app.state.e_urls = [
u.strip() for u in args.encode_servers_urls.split(",") if u.strip()
]
app.state.d_urls = [
u.strip() for u in args.decode_servers_urls.split(",") if u.strip()
]
# handle prefill instances
if args.prefill_servers_urls.lower() in ("disable", "none", ""):
app.state.p_urls = []
logger.info(
"Disaggregated prefill phase explicitly disabled by user. Running E + PD..."
)
else:
app.state.p_urls = [
u.strip() for u in args.prefill_servers_urls.split(",") if u.strip()
]
logger.info("Disaggregated prefill phase is enabled. Running E + P + D...")
logger.info("Proxy listening on %s:%s", args.host, args.port)
logger.info("Encode servers: %s", app.state.e_urls)
logger.info("Prefill instances %s", app.state.p_urls)
logger.info("Decode servers: %s", app.state.d_urls)
uvicorn.run(
app,
host=args.host,
port=args.port,
log_level="info",
loop="uvloop",
access_log=True,
)

View File

@@ -0,0 +1,125 @@
#!/bin/bash
# This file demonstrates the example usage of disaggregated prefilling
# We will launch 2 vllm instances (1 for prefill and 1 for decode),
# and then transfer the KV cache between them.
set -xe
echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1
# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT
# Cleanup function
cleanup() {
echo "Caught Ctrl+C, cleaning up..."
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo "Cleanup complete. Exiting."
exit 0
}
if [[ -z "${VLLM_HOST_IP:-}" ]]; then
export VLLM_HOST_IP=127.0.0.1
echo "Using default VLLM_HOST_IP=127.0.0.1 (override by exporting VLLM_HOST_IP before running this script)"
else
echo "Using provided VLLM_HOST_IP=${VLLM_HOST_IP}"
fi
# install quart first -- required for disagg prefill proxy serve
if python3 -c "import quart" &> /dev/null; then
echo "Quart is already installed."
else
echo "Quart is not installed. Installing..."
python3 -m pip install quart
fi
# a function that waits for the vLLM server to start
wait_for_server() {
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/models > /dev/null; do
sleep 1
done" && return 0 || return 1
}
# You can also adjust --kv-ip and --kv-port for distributed inference.
# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
--host 0.0.0.0 \
--port 8100 \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' &
# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
--host 0.0.0.0 \
--port 8200 \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":"1e10","kv_port":"14580","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8200","send_type":"PUT_ASYNC"}}' &
# wait until prefill and decode instances are ready
wait_for_server 8100
wait_for_server 8200
# launch a proxy server that opens the service at port 8000
# the workflow of this proxy:
# - send the request to prefill vLLM instance (port 8100), change max_tokens
# to 1
# - after the prefill vLLM finishes prefill, send the request to decode vLLM
# instance
# NOTE: the usage of this API is subject to change --- in the future we will
# introduce "vllm connect" to connect between prefill and decode instances
python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py &
sleep 1
# serve two example requests
output1=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
}')
output2=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Santa Clara is a",
"max_tokens": 10,
"temperature": 0
}')
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo ""
sleep 1
# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"
echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""

View File

@@ -0,0 +1,8 @@
# Disaggregated Serving
This example contains scripts that demonstrate the disaggregated serving features of vLLM.
## Files
- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances); a sample launch command is shown below.
- `kv_events.sh` - Demonstrates KV cache event publishing.
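With prefill and decode vLLM instances already running, `disagg_proxy_demo.py` can be launched roughly as follows; the model name and host:port pairs are illustrative and should match your own deployment:

```bash
python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --prefill localhost:8100 localhost:8101 \
    --decode localhost:8200 localhost:8201 \
    --port 8000
```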

View File

@@ -0,0 +1,452 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling.
We can launch multiple vllm instances (2 for prefill and 2 for decode), and
launch this proxy demo through:
python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \
--model $model_name \
--prefill localhost:8100 localhost:8101 \
--decode localhost:8200 localhost:8201 \
--port 8000
Note: This demo will be removed once the PDController implemented in PR 15343
(https://github.com/vllm-project/vllm/pull/15343) supports XpYd.
"""
import argparse
import ipaddress
import itertools
import json
import logging
import os
import sys
from abc import ABC, abstractmethod
from collections.abc import Callable
import aiohttp
import requests
import uvicorn
from fastapi import APIRouter, Depends, FastAPI, Header, HTTPException, Request, status
from fastapi.responses import JSONResponse, StreamingResponse
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
class SchedulingPolicy(ABC):
@abstractmethod
def schedule(self, cycler: itertools.cycle):
raise NotImplementedError("Scheduling policy is not implemented.")
class Proxy:
def __init__(
self,
prefill_instances: list[str],
decode_instances: list[str],
model: str,
scheduling_policy: SchedulingPolicy,
custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
custom_create_chat_completion: Callable[[Request], StreamingResponse]
| None = None,
):
self.prefill_instances = prefill_instances
self.decode_instances = decode_instances
self.prefill_cycler = itertools.cycle(prefill_instances)
self.decode_cycler = itertools.cycle(decode_instances)
self.model = model
self.scheduling_policy = scheduling_policy
self.custom_create_completion = custom_create_completion
self.custom_create_chat_completion = custom_create_chat_completion
self.router = APIRouter()
self.setup_routes()
def setup_routes(self):
self.router.post(
"/v1/completions", dependencies=[Depends(self.validate_json_request)]
)(
self.custom_create_completion
if self.custom_create_completion
else self.create_completion
)
self.router.post(
"/v1/chat/completions", dependencies=[Depends(self.validate_json_request)]
)(
self.custom_create_chat_completion
if self.custom_create_chat_completion
else self.create_chat_completion
)
self.router.get("/status", response_class=JSONResponse)(self.get_status)
self.router.post(
"/instances/add", dependencies=[Depends(self.api_key_authenticate)]
)(self.add_instance_endpoint)
async def validate_json_request(self, raw_request: Request):
content_type = raw_request.headers.get("content-type", "").lower()
if content_type != "application/json":
raise HTTPException(
status_code=415,
detail="Unsupported Media Type: Only 'application/json' is allowed",
)
def api_key_authenticate(self, x_api_key: str = Header(...)):
expected_api_key = os.environ.get("ADMIN_API_KEY")
if not expected_api_key:
logger.error("ADMIN_API_KEY is not set in the environment.")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Server configuration error.",
)
if x_api_key != expected_api_key:
logger.warning("Unauthorized access attempt with API Key: %s", x_api_key)
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Forbidden: Invalid API Key.",
)
async def validate_instance(self, instance: str) -> bool:
url = f"http://{instance}/v1/models"
try:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as client:
logger.info("Verifying %s ...", instance)
async with client.get(url) as response:
if response.status == 200:
data = await response.json()
if "data" in data and len(data["data"]) > 0:
model_cur = data["data"][0].get("id", "")
if model_cur == self.model:
logger.info("Instance: %s could be added.", instance)
return True
else:
logger.warning(
"Mismatch model %s : %s != %s",
instance,
model_cur,
self.model,
)
return False
else:
return False
else:
return False
except aiohttp.ClientError as e:
logger.error(str(e))
return False
except Exception as e:
logger.error(str(e))
return False
async def add_instance_endpoint(self, request: Request):
try:
data = await request.json()
logger.warning(str(data))
instance_type = data.get("type")
instance = data.get("instance")
if instance_type not in ["prefill", "decode"]:
raise HTTPException(status_code=400, detail="Invalid instance type.")
if not instance or ":" not in instance:
raise HTTPException(status_code=400, detail="Invalid instance format.")
host, port_str = instance.split(":")
try:
if host != "localhost":
ipaddress.ip_address(host)
port = int(port_str)
if not (0 < port < 65536):
raise HTTPException(status_code=400, detail="Invalid port number.")
except Exception as e:
raise HTTPException(
status_code=400, detail="Invalid instance address."
) from e
is_valid = await self.validate_instance(instance)
if not is_valid:
raise HTTPException(
status_code=400, detail="Instance validation failed."
)
if instance_type == "prefill":
if instance not in self.prefill_instances:
self.prefill_instances.append(instance)
self.prefill_cycler = itertools.cycle(self.prefill_instances)
else:
raise HTTPException(
status_code=400, detail="Instance already exists."
)
else:
if instance not in self.decode_instances:
self.decode_instances.append(instance)
self.decode_cycler = itertools.cycle(self.decode_instances)
else:
raise HTTPException(
status_code=400, detail="Instance already exists."
)
return JSONResponse(
content={"message": f"Added {instance} to {instance_type}_instances."}
)
except HTTPException as http_exc:
raise http_exc
except Exception as e:
logger.error("Error in add_instance_endpoint: %s", str(e))
raise HTTPException(status_code=500, detail=str(e)) from e
async def forward_request(self, url, data, use_chunked=True):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
try:
async with session.post(
url=url, json=data, headers=headers
) as response:
if 200 <= response.status < 300 or 400 <= response.status < 500:
if use_chunked:
async for chunk_bytes in response.content.iter_chunked(
1024
):
yield chunk_bytes
else:
content = await response.read()
yield content
else:
error_content = await response.text()
try:
error_content = json.loads(error_content)
except json.JSONDecodeError:
error_content = error_content
logger.error(
"Request failed with status %s: %s",
response.status,
error_content,
)
raise HTTPException(
status_code=response.status,
detail=f"Request failed with status {response.status}: "
f"{error_content}",
)
except aiohttp.ClientError as e:
logger.error("ClientError occurred: %s", str(e))
raise HTTPException(
status_code=502,
detail="Bad Gateway: Error communicating with upstream server.",
) from e
except Exception as e:
logger.error("Unexpected error: %s", str(e))
raise HTTPException(status_code=500, detail=str(e)) from e
def schedule(self, cycler: itertools.cycle) -> str:
return self.scheduling_policy.schedule(cycler)
async def get_status(self):
status = {
"prefill_node_count": len(self.prefill_instances),
"decode_node_count": len(self.decode_instances),
"prefill_nodes": self.prefill_instances,
"decode_nodes": self.decode_instances,
}
return status
async def create_completion(self, raw_request: Request):
try:
request = await raw_request.json()
kv_prepare_request = request.copy()
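# Limit the prefill request to a single generated token so the prefill
# instance only performs prefill and produces the KV cache.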
kv_prepare_request["max_tokens"] = 1
prefill_instance = self.schedule(self.prefill_cycler)
try:
async for _ in self.forward_request(
f"http://{prefill_instance}/v1/completions", kv_prepare_request
):
continue
except HTTPException as http_exc:
self.remove_instance_endpoint("prefill", prefill_instance)
raise http_exc
# Perform kv recv and decoding stage
decode_instance = self.schedule(self.decode_cycler)
try:
generator = self.forward_request(
f"http://{decode_instance}/v1/completions", request
)
except HTTPException as http_exc:
self.remove_instance_endpoint("decode", decode_instance)
raise http_exc
response = StreamingResponse(generator)
return response
except Exception:
exc_info = sys.exc_info()
print("Error occurred in disagg proxy server")
print(exc_info)
async def create_chat_completion(self, raw_request: Request):
try:
request = await raw_request.json()
# add params to request
kv_prepare_request = request.copy()
kv_prepare_request["max_tokens"] = 1
if "max_completion_tokens" in kv_prepare_request:
kv_prepare_request["max_completion_tokens"] = 1
# prefill stage
prefill_instance = self.schedule(self.prefill_cycler)
try:
async for _ in self.forward_request(
f"http://{prefill_instance}/v1/chat/completions", kv_prepare_request
):
continue
except HTTPException as http_exc:
self.remove_instance_endpoint("prefill", prefill_instance)
raise http_exc
# Perform kv recv and decoding stage
decode_instance = self.schedule(self.decode_cycler)
try:
generator = self.forward_request(
"http://" + decode_instance + "/v1/chat/completions", request
)
except HTTPException as http_exc:
self.remove_instance_endpoint("decode", decode_instance)
raise http_exc
response = StreamingResponse(content=generator)
return response
except Exception:
exc_info = sys.exc_info()
error_messages = [str(e) for e in exc_info if e]
print("Error occurred in disagg proxy server")
print(error_messages)
return StreamingResponse(
content=iter(error_messages), media_type="text/event-stream"
)
def remove_instance_endpoint(self, instance_type, instance):
if instance_type == "decode" and instance in self.decode_instances:
self.decode_instances.remove(instance)
self.decode_cycler = itertools.cycle(self.decode_instances)
if instance_type == "prefill" and instance in self.decode_instances:
self.prefill_instances.remove(instance)
self.prefill_cycler = itertools.cycle(self.decode_instances)
class RoundRobinSchedulingPolicy(SchedulingPolicy):
def __init__(self):
super().__init__()
def schedule(self, cycler: itertools.cycle) -> str:
return next(cycler)
class ProxyServer:
def __init__(
self,
args: argparse.Namespace,
scheduling_policy: SchedulingPolicy | None = None,
create_completion: Callable[[Request], StreamingResponse] | None = None,
create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
):
self.validate_parsed_serve_args(args)
self.port = args.port
self.proxy_instance = Proxy(
prefill_instances=[] if args.prefill is None else args.prefill,
decode_instances=[] if args.decode is None else args.decode,
model=args.model,
scheduling_policy=(
scheduling_policy
if scheduling_policy is not None
else RoundRobinSchedulingPolicy()
),
custom_create_completion=create_completion,
custom_create_chat_completion=create_chat_completion,
)
def validate_parsed_serve_args(self, args: argparse.Namespace):
if not args.prefill:
raise ValueError("Please specify at least one prefill node.")
if not args.decode:
raise ValueError("Please specify at least one decode node.")
self.validate_instances(args.prefill)
self.validate_instances(args.decode)
self.verify_model_config(args.prefill, args.model)
self.verify_model_config(args.decode, args.model)
def validate_instances(self, instances: list):
for instance in instances:
if len(instance.split(":")) != 2:
raise ValueError(f"Invalid instance format: {instance}")
host, port = instance.split(":")
try:
if host != "localhost":
ipaddress.ip_address(host)
port = int(port)
if not (0 < port < 65536):
raise ValueError(f"Invalid port number in instance: {instance}")
except Exception as e:
raise ValueError(f"Invalid instance {instance}: {str(e)}") from e
def verify_model_config(self, instances: list, model: str) -> None:
model_suffix = model.split("/")[-1]
for instance in instances:
try:
response = requests.get(f"http://{instance}/v1/models")
if response.status_code == 200:
model_cur = response.json()["data"][0]["id"]
model_cur_suffix = model_cur.split("/")[-1]
if model_cur_suffix != model_suffix:
raise ValueError(
f"{instance} serves a different model: "
f"{model_cur} != {model}"
)
else:
raise ValueError(f"Cannot get model id from {instance}!")
except requests.RequestException as e:
raise ValueError(
f"Error communicating with {instance}: {str(e)}"
) from e
def run_server(self):
app = FastAPI()
app.include_router(self.proxy_instance.router)
config = uvicorn.Config(app, port=self.port, loop="uvloop")
server = uvicorn.Server(config)
server.run()
def parse_args():
# Todo: allow more config
parser = argparse.ArgumentParser("vLLM disaggregated proxy server.")
parser.add_argument("--model", "-m", type=str, required=True, help="Model name")
parser.add_argument(
"--prefill",
"-p",
type=str,
nargs="+",
help="List of prefill node URLs (host:port)",
)
parser.add_argument(
"--decode",
"-d",
type=str,
nargs="+",
help="List of decode node URLs (host:port)",
)
parser.add_argument(
"--port",
type=int,
default=8000,
help="Server port number",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
proxy_server = ProxyServer(args=args)
proxy_server.run_server()

View File

@@ -0,0 +1,86 @@
#!/bin/bash
# This file demonstrates the KV cache event publishing
# We will launch a vLLM instance configured to publish KV cache
# events and launch a simple subscriber to log those events.
set -xe
echo "🚧🚧 Warning: The usage of KV cache events is experimental and subject to change 🚧🚧"
sleep 1
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT
# Cleanup function
cleanup() {
echo "Caught Ctrl+C, cleaning up..."
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo "Cleanup complete. Exiting."
exit 0
}
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# a function that waits for the vLLM server to start
wait_for_server() {
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
vllm serve $MODEL_NAME \
--port 8100 \
--max-model-len 100 \
--enforce-eager \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-events-config \
'{"enable_kv_cache_events": true, "publisher": "zmq", "topic": "kv-events"}' &
wait_for_server 8100
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
python3 "$SCRIPT_DIR/kv_events_subscriber.py" &
sleep 1
# serve two example requests
output1=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 5-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')
output2=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 50-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')
# Cleanup commands
pkill -9 -u "$USER" -f python
pkill -9 -u "$USER" -f vllm
sleep 1
echo "Cleaned up"
# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"
echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""

View File

@@ -0,0 +1,244 @@
#!/bin/bash
# =============================================================================
# vLLM Disaggregated Serving Script - P2P NCCL XpYd Architecture
# =============================================================================
# This script demonstrates disaggregated prefill and decode serving using
# P2P NCCL communication. The architecture supports various XpYd configurations:
#
# - 1P3D: 1 Prefill server + 3 Decode servers (current default)
# - 3P1D: 3 Prefill servers + 1 Decode server
# - etc.
#
# Configuration can be customized via environment variables:
# MODEL: Model to serve
# PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
# DECODE_GPUS: Comma-separated GPU IDs for decode servers
# PREFILL_PORTS: Comma-separated ports for prefill servers
# DECODE_PORTS: Comma-separated ports for decode servers
#   PROXY_PORT: Proxy server port used to set up the XpYd connection.
# TIMEOUT_SECONDS: Server startup timeout
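#
# Example (hypothetical 3P1D override: three prefill servers, one decode server):
#   export PREFILL_GPUS=0,1,2 PREFILL_PORTS=20003,20005,20007
#   export DECODE_GPUS=3 DECODE_PORTS=20009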
# =============================================================================
# Configuration - can be overridden via environment variables
MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
PROXY_PORT=${PROXY_PORT:-30001}
# Default 1P3D configuration (1 Prefill + 3 Decode)
PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo ""
echo "Architecture Configuration:"
echo " Model: $MODEL"
echo " Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS"
echo " Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
echo " Proxy Port: $PROXY_PORT"
echo " Timeout: ${TIMEOUT_SECONDS}s"
echo ""
PIDS=()
# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"
check_required_files() {
local files=("disagg_proxy_p2p_nccl_xpyd.py")
for file in "${files[@]}"; do
if [[ ! -f "$file" ]]; then
echo "Required file $file not found in $(pwd)"
exit 1
fi
done
}
check_hf_token() {
if [ -z "$HF_TOKEN" ]; then
echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
echo "Example: export HF_TOKEN=your_token_here"
exit 1
fi
if [[ "$HF_TOKEN" != hf_* ]]; then
echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
exit 1
fi
echo "HF_TOKEN is set and valid."
}
check_num_gpus() {
# Check that the number of GPUs is >= 2 via nvidia-smi
num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
if [ "$num_gpus" -lt 2 ]; then
echo "You need at least 2 GPUs to run disaggregated prefill."
exit 1
else
echo "Found $num_gpus GPUs."
fi
}
ensure_python_library_installed() {
echo "Checking if $1 is installed..."
if ! python3 -c "import $1" > /dev/null 2>&1; then
echo "$1 is not installed. Please install it via pip install $1."
exit 1
else
echo "$1 is installed."
fi
}
cleanup() {
echo "Stopping everything…"
trap - INT TERM # prevent re-entrancy
pkill -9 -f "disagg_proxy_p2p_nccl_xpyd.py"
kill -- -$$ # negative PID == "this whole process-group"
wait # reap children so we don't leave zombies
exit 0
}
wait_for_server() {
local port=$1
local timeout_seconds=$TIMEOUT_SECONDS
local start_time=$(date +%s)
echo "Waiting for server on port $port..."
while true; do
if curl -s "localhost:${port}/v1/completions" > /dev/null; then
echo "Server on port $port is ready."
return 0
fi
local now=$(date +%s)
if (( now - start_time >= timeout_seconds )); then
echo "Timeout waiting for server on port $port"
return 1
fi
sleep 1
done
}
main() {
check_required_files
check_hf_token
check_num_gpus
ensure_python_library_installed pandas
ensure_python_library_installed datasets
ensure_python_library_installed vllm
ensure_python_library_installed quart
trap cleanup INT
trap cleanup USR1
trap cleanup TERM
echo "Launching disaggregated serving components..."
echo "Please check the log files for detailed output:"
echo " - prefill*.log: Prefill server logs"
echo " - decode*.log: Decode server logs"
echo " - proxy.log: Proxy server log"
# =============================================================================
# Launch Proxy Server
# =============================================================================
echo ""
echo "Starting proxy server on port $PROXY_PORT..."
python3 disagg_proxy_p2p_nccl_xpyd.py &
PIDS+=($!)
# Parse GPU and port arrays
IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS"
IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS"
IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS"
IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"
# =============================================================================
# Launch Prefill Servers (X Producers)
# =============================================================================
echo ""
echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."
for i in "${!PREFILL_GPU_ARRAY[@]}"; do
local gpu_id=${PREFILL_GPU_ARRAY[$i]}
local port=${PREFILL_PORT_ARRAY[$i]}
local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--kv-transfer-config \
"{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 &
PIDS+=($!)
done
# =============================================================================
# Launch Decode Servers (Y Decoders)
# =============================================================================
echo ""
echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."
for i in "${!DECODE_GPU_ARRAY[@]}"; do
local gpu_id=${DECODE_GPU_ARRAY[$i]}
local port=${DECODE_PORT_ARRAY[$i]}
local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--kv-transfer-config \
"{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 &
PIDS+=($!)
done
# =============================================================================
# Wait for All Servers to Start
# =============================================================================
echo ""
echo "Waiting for all servers to start..."
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
if ! wait_for_server $port; then
echo "Failed to start server on port $port"
cleanup
exit 1
fi
done
echo ""
echo "All servers are up. Starting benchmark..."
# =============================================================================
# Run Benchmark
# =============================================================================
cd ../../../benchmarks/
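# Port 10001 is the proxy's HTTP endpoint (see disagg_proxy_p2p_nccl_xpyd.py).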
vllm bench serve --port 10001 --seed $(date +%s) \
--model $MODEL \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
echo "Benchmarking done. Cleaning up..."
cleanup
}
main

View File

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import socket
import threading
import time
import uuid
from typing import Any
import aiohttp
import msgpack
import zmq
from quart import Quart, make_response, request
count = 0
prefill_instances: dict[str, Any] = {} # http_address: (zmq_address, stamp)
decode_instances: dict[str, Any] = {} # http_address: (zmq_address, stamp)
prefill_cv = threading.Condition()
decode_cv = threading.Condition()
DEFAULT_PING_SECONDS = 5
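# Evict registrations whose keep-alive stamp (time.time() + DEFAULT_PING_SECONDS)
# has already passed; instances are expected to re-register periodically.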
def _remove_oldest_instances(instances: dict[str, Any]) -> None:
oldest_key = next(iter(instances), None)
while oldest_key is not None:
value = instances[oldest_key]
if value[1] > time.time():
break
print(f"🔴Remove [HTTP:{oldest_key}, ZMQ:{value[0]}, stamp:{value[1]}]")
instances.pop(oldest_key, None)
oldest_key = next(iter(instances), None)
def _listen_for_register(poller, router_socket):
while True:
socks = dict(poller.poll())
if router_socket in socks:
remote_address, message = router_socket.recv_multipart()
# data: {"type": "P", "http_address": "ip:port",
# "zmq_address": "ip:port"}
data = msgpack.loads(message)
if data["type"] == "P":
global prefill_instances
global prefill_cv
with prefill_cv:
node = prefill_instances.get(data["http_address"], None)
prefill_instances[data["http_address"]] = (
data["zmq_address"],
time.time() + DEFAULT_PING_SECONDS,
)
_remove_oldest_instances(prefill_instances)
elif data["type"] == "D":
global decode_instances
global decode_cv
with decode_cv:
node = decode_instances.get(data["http_address"], None)
decode_instances[data["http_address"]] = (
data["zmq_address"],
time.time() + DEFAULT_PING_SECONDS,
)
_remove_oldest_instances(decode_instances)
else:
print(
f"Unexpected message received from {remote_address}, data: {data}"
)
return
if node is None:
print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['zmq_address']}]")
def start_service_discovery(hostname, port):
if not hostname:
hostname = socket.gethostname()
if port == 0:
raise ValueError("Port cannot be 0")
context = zmq.Context()
router_socket = context.socket(zmq.ROUTER)
router_socket.bind(f"tcp://{hostname}:{port}")
poller = zmq.Poller()
poller.register(router_socket, zmq.POLLIN)
_listener_thread = threading.Thread(
target=_listen_for_register, args=[poller, router_socket], daemon=True
)
_listener_thread.start()
return _listener_thread
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
app = Quart(__name__)
def random_uuid() -> str:
return uuid.uuid4().hex
async def forward_request(url, data, request_id):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
"X-Request-Id": request_id,
}
async with session.post(url=url, json=data, headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content.iter_chunked(1024):
yield chunk_bytes
@app.route("/v1/completions", methods=["POST"])
@app.route("/v1/chat/completions", methods=["POST"])
async def handle_request():
try:
original_request_data = await request.get_json()
prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill
prefill_request["max_tokens"] = 1
if "max_completion_tokens" in prefill_request:
prefill_request["max_completion_tokens"] = 1
global count
global prefill_instances
global prefill_cv
with prefill_cv:
prefill_list = list(prefill_instances.items())
prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
prefill_zmq_addr = prefill_zmq_addr[0]
global decode_instances
global decode_cv
with decode_cv:
decode_list = list(decode_instances.items())
decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
decode_zmq_addr = decode_zmq_addr[0]
print(
f"handle_request count: {count}, [HTTP:{prefill_addr}, "
f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
f"ZMQ:{decode_zmq_addr}]"
)
count += 1
request_id = (
f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
f"{decode_zmq_addr}_{random_uuid()}"
)
# finish prefill
async for _ in forward_request(
f"http://{prefill_addr}{request.path}", prefill_request, request_id
):
continue
# return decode
generator = forward_request(
f"http://{decode_addr}{request.path}", original_request_data, request_id
)
response = await make_response(generator)
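# Disable Quart's per-response timeout so long streaming responses are not cut off.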
response.timeout = None
return response
except Exception as e:
import sys
import traceback
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server")
print(e)
print("".join(traceback.format_exception(*exc_info)))
if __name__ == "__main__":
t = start_service_discovery("0.0.0.0", 30001)
app.run(host="0.0.0.0", port=10001)
t.join()

View File

@@ -0,0 +1,57 @@
#!/bin/bash
MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
HOST="localhost"
PORT=8006
NUM_PROMPTS=20
REQUEST_RATE=5
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL_NAME="$2"
shift 2
;;
--local-model)
MODEL_NAME=$LOCAL_MODEL_PATH
shift
;;
--host)
HOST="$2"
shift 2
;;
--port)
PORT="$2"
shift 2
;;
--num-prompts)
NUM_PROMPTS="$2"
shift 2
;;
--request-rate)
REQUEST_RATE="$2"
shift 2
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --model MODEL_NAME Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)"
echo " --local-model Use local model path (convenience option)"
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use -h or --help for usage information"
exit 1
;;
esac
done
vllm bench serve \
--model $MODEL_NAME \
--host $HOST \
--port $PORT \
--num-prompts $NUM_PROMPTS \
--request-rate $REQUEST_RATE

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import sys
import requests
def scale(host, port, new_dp_size):
url = f"http://{host}:{port}/scale_elastic_ep"
payload = {"new_data_parallel_size": new_dp_size}
headers = {"Content-Type": "application/json"}
print(f"Sending scale request to {url}")
print(f"Payload: {json.dumps(payload, indent=2)}")
try:
response = requests.post(url, json=payload, headers=headers, timeout=300)
print(f"Status Code: {response.status_code}")
print(f"Response: {response.text}")
if response.status_code == 200:
print("Scale up/down request successful!")
return True
else:
print("Scale up/down request failed!")
return False
except requests.exceptions.RequestException as e:
print(f"Request failed: {e}")
return False
def main():
parser = argparse.ArgumentParser(description="Test scale up/down functionality")
parser.add_argument("--host", default="localhost", help="API server host")
parser.add_argument("--port", type=int, default=8006, help="API server port")
parser.add_argument(
"--new-dp-size", type=int, default=2, help="New data parallel size"
)
args = parser.parse_args()
success = scale(args.host, args.port, args.new_dp_size)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,71 @@
#!/bin/bash
HOST="0.0.0.0"
PORT=8006
DATA_PARALLEL_SIZE=4
REDUNDANT_EXPERTS=0
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
while [[ $# -gt 0 ]]; do
case $1 in
--dp)
DATA_PARALLEL_SIZE="$2"
shift 2
;;
--re)
REDUNDANT_EXPERTS="$2"
shift 2
;;
--host)
HOST="$2"
shift 2
;;
--port)
PORT="$2"
shift 2
;;
--model)
MODEL_NAME="$2"
shift 2
;;
--local-model)
MODEL_NAME=$LOCAL_MODEL_PATH
shift
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --dp SIZE Set data parallel size (default: 4)"
echo " --re SIZE Set redundant experts (default: 0)"
echo " --host HOST Set host address (default: 0.0.0.0)"
echo " --port PORT Set port number (default: 8006)"
echo " --model MODEL_NAME Set model name or path"
echo " -h, --help Show this help message"
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use -h or --help for usage information"
exit 1
;;
esac
done
echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
export RAY_DEDUP_LOGS=0
export VLLM_ALL2ALL_BACKEND="pplx"
export VLLM_USE_DEEP_GEMM=1
vllm serve $MODEL_NAME \
--data-parallel-size $DATA_PARALLEL_SIZE \
--data-parallel-size-local $DATA_PARALLEL_SIZE \
--data-parallel-backend ray \
--enforce-eager \
--enable-expert-parallel \
--enable-eplb \
--num-redundant-experts $REDUNDANT_EXPERTS \
--trust-remote-code \
--host $HOST \
--port $PORT

View File

@@ -0,0 +1,112 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
Start Gradio OpenAI Chatbot Webserver:
python examples/online_serving/gradio_openai_chatbot_webserver.py \
-m meta-llama/Llama-2-7b-chat-hf
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import gradio as gr
from openai import OpenAI
def predict(message, history, client, model_name, temp, stop_token_ids):
messages = [
{"role": "system", "content": "You are a great AI assistant."},
*history,
{"role": "user", "content": message},
]
# Send request to OpenAI API (vLLM server)
stream = client.chat.completions.create(
model=model_name,
messages=messages,
temperature=temp,
stream=True,
extra_body={
"repetition_penalty": 1,
"stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")]
if stop_token_ids
else [],
},
)
# Collect all chunks and concatenate them into a full message
full_message = ""
for chunk in stream:
full_message += chunk.choices[0].delta.content or ""
# Return the full message as a single response
return full_message
def parse_args():
parser = argparse.ArgumentParser(
description="Chatbot Interface with Customizable Parameters"
)
parser.add_argument(
"--model-url", type=str, default="http://localhost:8000/v1", help="Model URL"
)
parser.add_argument(
"-m", "--model", type=str, required=True, help="Model name for the chatbot"
)
parser.add_argument(
"--temp", type=float, default=0.8, help="Temperature for text generation"
)
parser.add_argument(
"--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs"
)
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
return parser.parse_args()
def build_gradio_interface(client, model_name, temp, stop_token_ids):
def chat_predict(message, history):
return predict(message, history, client, model_name, temp, stop_token_ids)
return gr.ChatInterface(
fn=chat_predict,
title="Chatbot Interface",
description="A simple chatbot powered by vLLM",
)
def main():
# Parse the arguments
args = parse_args()
# Set OpenAI's API key and API base to use vLLM's API server
openai_api_key = "EMPTY"
openai_api_base = args.model_url
# Create an OpenAI client
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
# Define the Gradio chatbot interface using the predict function
gradio_interface = build_gradio_interface(
client, args.model, args.temp, args.stop_token_ids
)
gradio_interface.queue().launch(
server_name=args.host, server_port=args.port, share=True
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio Webserver
Start vLLM API server:
python -m vllm.entrypoints.api_server \
--model meta-llama/Llama-2-7b-chat-hf
Start Webserver:
python examples/online_serving/gradio_webserver.py
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import json
import gradio as gr
import requests
def http_bot(prompt):
headers = {"User-Agent": "vLLM Client"}
pload = {
"prompt": prompt,
"stream": True,
"max_tokens": 128,
}
response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"\n"
):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"][0]
yield output
def build_demo():
with gr.Blocks() as demo:
gr.Markdown("# vLLM text completion demo\n")
inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
outputbox = gr.Textbox(
label="Output", placeholder="Generated result from the model"
)
inputbox.submit(http_bot, [inputbox], [outputbox])
return demo
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
parser.add_argument(
"--model-url", type=str, default="http://localhost:8000/generate"
)
return parser.parse_args()
def main(args):
demo = build_demo()
demo.queue().launch(server_name=args.host, server_port=args.port, share=True)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -0,0 +1,117 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import msgspec
import zmq
from msgspec.msgpack import Decoder
from vllm.v1.core.kv_cache_utils import ExternalBlockHash
#
# Types copied from vllm.distributed.kv_events
#
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
ts: float
events: list[Any]
class KVCacheEvent(
msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True
):
"""Base class for all KV cache-related events"""
class BlockStored(KVCacheEvent):
block_hashes: list[ExternalBlockHash]
parent_block_hash: ExternalBlockHash | None
token_ids: list[int]
block_size: int
lora_id: int | None
medium: str | None
class BlockRemoved(KVCacheEvent):
block_hashes: list[ExternalBlockHash]
medium: str | None
class AllBlocksCleared(KVCacheEvent):
pass
class KVEventBatch(EventBatch):
events: list[BlockStored | BlockRemoved | AllBlocksCleared]
def process_event(event_batch):
print(f"Received event batch at {event_batch.ts}:")
for event in event_batch.events:
print(f" - {event}")
def main():
decoder = Decoder(type=KVEventBatch)
last_seq = -1
context = zmq.Context()
# Set up the main subscription socket
sub = context.socket(zmq.SUB)
sub.connect("tcp://localhost:5557")
topic = "kv-events"
sub.setsockopt_string(zmq.SUBSCRIBE, topic)
# Initialize replay socket
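# (REQ socket used to ask the publisher to re-send events we missed, addressed by sequence number)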
replay = context.socket(zmq.REQ)
replay.connect("tcp://localhost:5558")
poller = zmq.Poller()
poller.register(replay, zmq.POLLIN)
print("Listening for KV cache events on topic:", topic)
while True:
try:
if sub.poll(50):
_, seq_bytes, payload = sub.recv_multipart()
seq = int.from_bytes(seq_bytes, "big")
if last_seq >= 0 and seq > last_seq + 1:
missed = seq - last_seq - 1
print(
f"Missed {missed} messages (last: {last_seq}, current: {seq})"
)
replay.send((last_seq + 1).to_bytes(8, "big"))
while poller.poll(timeout=200):
seq_bytes, replay_payload = replay.recv_multipart()
if not replay_payload:
# End of replay marker is sent as an empty frame
# for the payload
break
replay_seq = int.from_bytes(seq_bytes, "big")
if replay_seq > last_seq:
event_batch = decoder.decode(replay_payload)
process_event(event_batch)
last_seq = replay_seq
if replay_seq >= seq - 1:
break
event_batch = decoder.decode(payload)
process_event(event_batch)
# ... do other periodic work or check for shutdown ...
except KeyboardInterrupt:
print("Interrupted")
break
except Exception as e:
print("Error decoding message:", e)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,119 @@
#!/bin/bash
#
# Helper script to manually start or join a Ray cluster for online serving of vLLM models.
# This script is first executed on the head node, and then on each worker node with the IP address
# of the head node.
#
# Subcommands:
# leader: Launches a Ray head node and blocks until the cluster reaches the expected size (head + workers).
# worker: Starts a worker node that connects to an existing Ray head node.
#
# Example usage:
# On the head node machine, start the Ray head node process and run a vLLM server.
# ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \
# vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2
#
# On each worker node, start the Ray worker node process.
# ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
#
# About Ray:
# Ray is an open-source distributed execution framework that simplifies
# distributed computing. Learn more:
# https://ray.io/
subcommand=$1 # Either "leader" or "worker".
shift # Remove the subcommand from the argument list.
ray_port=6379 # Port used by the Ray head node.
ray_init_timeout=300 # Seconds to wait before timing out.
declare -a start_params # Parameters forwarded to the underlying 'ray start' command.
# Handle the worker subcommand.
case "$subcommand" in
worker)
ray_address=""
while [ $# -gt 0 ]; do
case "$1" in
--ray_address=*)
ray_address="${1#*=}"
;;
--ray_port=*)
ray_port="${1#*=}"
;;
--ray_init_timeout=*)
ray_init_timeout="${1#*=}"
;;
*)
start_params+=("$1")
esac
shift
done
if [ -z "$ray_address" ]; then
echo "Error: Missing argument --ray_address"
exit 1
fi
# Retry until the worker node connects to the head node or the timeout expires.
for (( i=0; i < $ray_init_timeout; i+=5 )); do
ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
if [ $? -eq 0 ]; then
echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
exit 0
fi
echo "Waiting until the ray worker is active..."
sleep 5s;
done
echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
exit 1
;;
# Handle the leader subcommand.
leader)
ray_cluster_size=""
while [ $# -gt 0 ]; do
case "$1" in
--ray_port=*)
ray_port="${1#*=}"
;;
--ray_cluster_size=*)
ray_cluster_size="${1#*=}"
;;
--ray_init_timeout=*)
ray_init_timeout="${1#*=}"
;;
*)
start_params+=("$1")
esac
shift
done
if [ -z "$ray_cluster_size" ]; then
echo "Error: Missing argument --ray_cluster_size"
exit 1
fi
# Start the Ray head node.
ray start --head --port=$ray_port "${start_params[@]}"
# Poll Ray until every worker node is active.
for (( i=0; i < $ray_init_timeout; i+=5 )); do
active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
if [ $active_nodes -eq $ray_cluster_size ]; then
echo "All ray workers are active and the ray cluster is initialized successfully."
exit 0
fi
echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
sleep 5s;
done
echo "Waiting for all ray workers to be active timed out."
exit 1
;;
*)
echo "unknown subcommand: $subcommand"
exit 1
;;
esac

View File

@@ -0,0 +1,87 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import threading
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.v1.metrics.loggers import AggregatedLoggingStatLogger
"""
To run this example, run the following commands simultaneously with
different CUDA_VISIBLE_DEVICES:
python examples/online_serving/multi_instance_data_parallel.py
vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
--data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
--data-parallel-size-local 1 --enforce-eager --headless
Once both instances have completed the handshake, this example will
send a request to the instance with DP rank 1.
"""
def _do_background_logging(engine, interval, stop_event):
try:
while not stop_event.is_set():
asyncio.run(engine.do_log_stats())
stop_event.wait(interval)
except Exception as e:
print(f"vLLM background logging shutdown: {e}")
pass
async def main():
engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
data_parallel_size=2,
tensor_parallel_size=1,
dtype="auto",
max_model_len=2048,
data_parallel_address="127.0.0.1",
data_parallel_rpc_port=62300,
data_parallel_size_local=1,
enforce_eager=True,
enable_log_requests=True,
disable_custom_all_reduce=True,
)
engine_client = AsyncLLMEngine.from_engine_args(
engine_args,
# Example: Using aggregated logger
stat_loggers=[AggregatedLoggingStatLogger],
)
stop_logging_event = threading.Event()
logging_thread = threading.Thread(
target=_do_background_logging,
args=(engine_client, 5, stop_logging_event),
daemon=True,
)
logging_thread.start()
sampling_params = SamplingParams(
temperature=0.7,
top_p=0.9,
max_tokens=100,
)
num_prompts = 10
for i in range(num_prompts):
prompt = "Who won the 2004 World Series?"
final_output: RequestOutput | None = None
async for output in engine_client.generate(
prompt=prompt,
sampling_params=sampling_params,
request_id=f"abcdef-{i}",
data_parallel_rank=1,
):
final_output = output
if final_output:
print(final_output.outputs[0].text)
stop_logging_event.set()
logging_thread.join()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
"""
import argparse
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"},
{
"role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020.",
},
{"role": "user", "content": "Where was it played?"},
]
def parse_args():
parser = argparse.ArgumentParser(description="Client for vLLM API server")
parser.add_argument(
"--stream", action="store_true", help="Enable streaming response"
)
return parser.parse_args()
def main(args):
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Chat Completion API
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
stream=args.stream,
)
print("-" * 50)
print("Chat completion results:")
if args.stream:
for c in chat_completion:
print(c)
else:
print(chat_completion)
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -0,0 +1,353 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
--max-model-len 4096 --trust-remote-code
run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
"""
import base64
import requests
from openai import OpenAI
from utils import get_first_model
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
headers = {"User-Agent": "vLLM Example Client"}
def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url, headers=headers) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode("utf-8")
return result
# Text-only inference
def run_text_only(model: str, max_completion_tokens: int) -> None:
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": "What's the capital of France?"}],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion.choices[0].message.content
print("Chat completion output:\n", result)
# Single-image input inference
def run_single_image(model: str, max_completion_tokens: int) -> None:
## Use image url in the payload
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:\n", result)
## Use base64 encoded image in the payload
image_base64 = encode_base64_content_from_url(image_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded image:", result)
# Multi-image input inference
def run_multi_image(model: str, max_completion_tokens: int) -> None:
image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What are the animals in these images?"},
{
"type": "image_url",
"image_url": {"url": image_url_duck},
},
{
"type": "image_url",
"image_url": {"url": image_url_lion},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:\n", result)
# Video input inference
def run_video(model: str, max_completion_tokens: int) -> None:
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
video_base64 = encode_base64_content_from_url(video_url)
## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this video?"},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:\n", result)
## Use base64 encoded video in the payload
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this video?"},
{
"type": "video_url",
"video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded video:\n", result)
# Audio input inference
def run_audio(model: str, max_completion_tokens: int) -> None:
from vllm.assets.audio import AudioAsset
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)
# OpenAI-compatible schema (`input_audio`)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "input_audio",
"input_audio": {
# Any format supported by librosa is supported
"data": audio_base64,
"format": "wav",
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:\n", result)
# HTTP URL
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": audio_url
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:\n", result)
# base64 URL
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": f"data:audio/ogg;base64,{audio_base64}"
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded audio:\n", result)
def run_multi_audio(model: str, max_completion_tokens: int) -> None:
from vllm.assets.audio import AudioAsset
# Two different audios to showcase batched inference.
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)
audio_url2 = AudioAsset("azacinto_foscolo").url
audio_base64_2 = encode_base64_content_from_url(audio_url2)
# OpenAI-compatible schema (`input_audio`)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Are these two audios the same?"},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64,
"format": "wav",
},
},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64_2,
"format": "wav",
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:\n", result)
example_function_map = {
"text-only": run_text_only,
"single-image": run_single_image,
"multi-image": run_multi_image,
"multi-audio": run_multi_audio,
"video": run_video,
"audio": run_audio,
}
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using OpenAI client for online serving with "
"multimodal language models served with vLLM."
)
parser.add_argument(
"--chat-type",
"-c",
type=str,
default="single-image",
choices=list(example_function_map.keys()),
help="Conversation type with multimodal data.",
)
parser.add_argument(
"--max-completion-tokens",
"-n",
type=int,
default=128,
help="Maximum number of tokens to generate for each completion.",
)
return parser.parse_args()
def main(args) -> None:
chat_type = args.chat_type
model = get_first_model(client)
example_function_map[chat_type](model, args.max_completion_tokens)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -0,0 +1,195 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example:
IMPORTANT: for Mistral, you must use one of the provided Mistral tool call
templates, or your own - the model's default template does not work for tool
calls with vLLM. See the vLLM docs on the OpenAI-compatible server & tool
calling for more details.
vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
--chat-template examples/tool_chat_template_mistral.jinja \
--enable-auto-tool-choice --tool-call-parser mistral
OR
vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B \
--chat-template examples/tool_chat_template_hermes.jinja \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import json
from typing import Any
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
properties = {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
}
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": properties,
"required": ["city", "state", "unit"],
},
},
}
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
def get_current_weather(city: str, state: str, unit: str):
return (
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's."
)
def handle_tool_calls_stream(
client: OpenAI,
messages: list[dict[str, str]],
model: str,
tools: list[dict[str, Any]],
) -> list[Any]:
tool_calls_stream = client.chat.completions.create(
messages=messages, model=model, tools=tools, stream=True
)
chunks = []
print("chunks: ")
for chunk in tool_calls_stream:
chunks.append(chunk)
if chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls[0])
else:
print(chunk.choices[0].delta)
return chunks
def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
arguments = []
tool_call_idx = -1
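    # Streamed tool-call arguments arrive as small string fragments; accumulate
    # them per tool-call index so each complete argument string can be reported.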
print("arguments: ")
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
if tool_call_idx >= 0:
print(f"streamed tool call arguments: {arguments[tool_call_idx]}")
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
if tool_call.id:
print(f"streamed tool call id: {tool_call.id} ")
if tool_call.function:
if tool_call.function.name:
print(f"streamed tool call name: {tool_call.function.name}")
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
return arguments
def main():
# Initialize OpenAI client
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
# Get available models and select one
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages, model=model, tools=tools
)
print("-" * 70)
print("Chat completion results:")
print(chat_completion)
print("-" * 70)
# Stream tool calls
chunks = handle_tool_calls_stream(client, messages, model, tools)
print("-" * 70)
# Handle arguments from streamed tool calls
arguments = handle_tool_calls_arguments(chunks)
if len(arguments):
print(f"streamed tool call arguments: {arguments[-1]}\n")
print("-" * 70)
# Add tool call results to the conversation
messages.append(
{
"role": "assistant",
"tool_calls": chat_completion.choices[0].message.tool_calls,
"reasoning": chat_completion.choices[0].message.reasoning,
}
)
# Now, simulate a tool call
available_tools = {"get_current_weather": get_current_weather}
completion_tool_calls = chat_completion.choices[0].message.tool_calls
for call in completion_tool_calls:
tool_to_call = available_tools[call.function.name]
args = json.loads(call.function.arguments)
result = tool_to_call(**args)
print("tool_to_call result: ", result)
messages.append(
{
"role": "tool",
"content": result,
"tool_call_id": call.id,
"name": call.function.name,
}
)
chat_completion_2 = client.chat.completions.create(
messages=messages, model=model, tools=tools, stream=False
)
print("Chat completion2 results:")
print(chat_completion_2)
print("-" * 70)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,130 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
To run this example, you can start the vLLM server
without any specific flags:
```bash
vllm serve unsloth/Llama-3.2-1B-Instruct \
--structured-outputs-config.backend outlines
```
This example demonstrates how to generate chat completions
using the OpenAI Python client library.
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for"
", e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": (
"the two-letter abbreviation for the state that the "
"city is in, e.g. 'CA' which would mean 'California'"
),
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "get_forecast",
"description": "Get the weather forecast for a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": (
"The city to get the forecast for, e.g. 'New York'"
),
},
"state": {
"type": "string",
"description": (
"The two-letter abbreviation for the state, e.g. 'NY'"
),
},
"days": {
"type": "integer",
"description": "Number of days to get the forecast for (1-7)",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "days", "unit"],
},
},
},
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the current weather is in Dallas \
and the forecast for the next 5 days, in fahrenheit?",
},
]
def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True, # Enable streaming response
)
for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)
chat_completion = client.chat.completions.create(
messages=messages, model=model, tools=tools, tool_choice="required"
)
print(chat_completion.choices[0].message.tool_calls)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,245 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
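        # NOTE: eval() is used here only to keep the demo short; never evaluate
        # untrusted input like this in production code.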
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_response(response, tool_functions, original_query):
"""Process a non-streaming response with possible tool calls"""
print("\n--- Response Output ---")
# Check if the response has content
if response.choices[0].message.content:
print(f"Content: {response.choices[0].message.content}")
# Check if the response has tool calls
if response.choices[0].message.tool_calls:
print("--------------------------------")
print(f"Tool calls: {response.choices[0].message.tool_calls}")
print("--------------------------------")
# Collect all tool calls and results before making follow-up request
tool_results = []
assistant_message = {"role": "assistant"}
if response.choices[0].message.content:
assistant_message["content"] = response.choices[0].message.content
assistant_tool_calls = []
# Process each tool call
for tool_call in response.choices[0].message.tool_calls:
function_name = tool_call.function.name
function_args = tool_call.function.arguments
function_id = tool_call.id
print(f"Function called: {function_name}")
print(f"Arguments: {function_args}")
print(f"Function ID: {function_id}")
# Execute the function
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(f"\n--- Function Result ---\n{function_result}\n")
# Add tool call to assistant message
assistant_tool_calls.append(
{
"id": function_id,
"type": "function",
"function": {"name": function_name, "arguments": function_args},
}
)
# Add tool result to tool_results
tool_results.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Add tool_calls to assistant message
assistant_message["tool_calls"] = assistant_tool_calls
# Create a follow-up message with all function results
follow_up_messages = [
{"role": "user", "content": original_query},
assistant_message,
]
# Add all tool results to the messages
follow_up_messages.extend(tool_results)
# Get completion with all tool results in a single follow-up
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=False,
)
print("\n--- Follow-up Response ---")
print(follow_up_response.choices[0].message.content)
print("--- End Follow-up ---\n")
print("--- End Response ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create non-streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=False,
)
# Process the non-streaming response, passing the original query
process_response(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,273 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
This example demonstrates streaming tool calls with xLAM models.
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid Python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_stream(response, tool_functions, original_query):
"""Process a streaming response with possible tool calls"""
# Track multiple tool calls
tool_calls = {} # Dictionary to store tool calls by ID
current_id = None
print("\n--- Stream Output ---")
for chunk in response:
# Handle tool calls in the stream
if chunk.choices[0].delta.tool_calls:
for tool_call_chunk in chunk.choices[0].delta.tool_calls:
# Get the tool call ID
if hasattr(tool_call_chunk, "id") and tool_call_chunk.id:
current_id = tool_call_chunk.id
if current_id not in tool_calls:
tool_calls[current_id] = {
"function_name": None,
"function_args": "",
"function_id": current_id,
}
# Extract function information as it comes in chunks
if (
hasattr(tool_call_chunk, "function")
and current_id
and current_id in tool_calls
):
if (
hasattr(tool_call_chunk.function, "name")
and tool_call_chunk.function.name
):
tool_calls[current_id]["function_name"] = (
tool_call_chunk.function.name
)
print(f"Function called: {tool_call_chunk.function.name}")
if (
hasattr(tool_call_chunk.function, "arguments")
and tool_call_chunk.function.arguments
):
tool_calls[current_id]["function_args"] += (
tool_call_chunk.function.arguments
)
print(f"Arguments chunk: {tool_call_chunk.function.arguments}")
# Handle regular content in the stream
elif chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Stream ---\n")
# Execute each function call and build messages for follow-up
follow_up_messages = [{"role": "user", "content": original_query}]
for tool_id, tool_data in tool_calls.items():
function_name = tool_data["function_name"]
function_args = tool_data["function_args"]
function_id = tool_data["function_id"]
if function_name and function_args:
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(
f"\n--- Function Result ({function_name}) ---\n{function_result}\n"
)
# Add the assistant message with tool call
follow_up_messages.append(
{
"role": "assistant",
"tool_calls": [
{
"id": function_id,
"type": "function",
"function": {
"name": function_name,
"arguments": function_args,
},
}
],
}
)
# Add the tool message with function result
follow_up_messages.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Only send follow-up if we have results to process
if len(follow_up_messages) > 1:
# Create a follow-up message with all the function results
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=True,
)
print("\n--- Follow-up Response ---")
for chunk in follow_up_response:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Follow-up ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=True,
)
# Process the streaming response
process_stream(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,170 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example demonstrates how to use tool calling with reasoning models
such as QwQ-32B. The reasoning content is not parsed by the tool
calling process; only the final output is parsed.
To run this example, you need to start the vLLM server with both
the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B \
--reasoning-parser deepseek_r1 \
--enable-auto-tool-choice --tool-call-parser hermes
```
"""
from openai import OpenAI
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: str):
return (
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's."
)
available_tools = {"get_current_weather": get_current_weather}
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
properties = {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
}
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": properties,
"required": ["city", "state", "unit"],
},
},
}
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
def extract_reasoning_and_calls(chunks: list):
reasoning = ""
tool_call_idx = -1
arguments = []
function_names = []
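    # Accumulate reasoning text plus per-tool-call name and argument fragments
    # from the streamed chunks.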
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
function_names.append("")
if tool_call.function:
if tool_call.function.name:
function_names[tool_call_idx] = tool_call.function.name
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
else:
if hasattr(chunk.choices[0].delta, "reasoning"):
reasoning += chunk.choices[0].delta.reasoning
return reasoning, arguments, function_names
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
print("---------Full Generate With Automatic Function Calling-------------")
tool_calls = client.chat.completions.create(
messages=messages, model=model, tools=tools
)
print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}")
print(
f"function arguments: "
f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}"
)
print("----------Stream Generate With Automatic Function Calling-----------")
tool_calls_stream = client.chat.completions.create(
messages=messages, model=model, tools=tools, stream=True
)
chunks = list(tool_calls_stream)
reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
print(f"reasoning: {reasoning}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("----------Full Generate With Named Function Calling-----------------")
tool_calls = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
)
tool_call = tool_calls.choices[0].message.tool_calls[0].function
print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
print(f"function name: {tool_call.name}")
print(f"function arguments: {tool_call.arguments}")
print("----------Stream Generate With Named Function Calling--------------")
tool_calls_stream = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
stream=True,
)
chunks = list(tool_calls_stream)
reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
print(f"reasoning: {reasoning}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to generate chat completions from reasoning models
such as DeepSeek-R1.
To run this example, you need to start the vLLM server
with the reasoning parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--reasoning-parser deepseek_r1
```
This example demonstrates how to generate chat completions from reasoning models
using the OpenAI Python client library.
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
reasoning = response.choices[0].message.reasoning
content = response.choices[0].message.content
print("reasoning for Round 1:", reasoning)
print("content for Round 1:", content)
# Round 2
messages.append({"role": "assistant", "content": content})
messages.append(
{
"role": "user",
"content": "How many Rs are there in the word 'strawberry'?",
}
)
response = client.chat.completions.create(model=model, messages=messages)
reasoning = response.choices[0].message.reasoning
content = response.choices[0].message.content
print("reasoning for Round 2:", reasoning)
print("content for Round 2:", content)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to generate chat completions from reasoning models
such as DeepSeek-R1.
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--reasoning-parser deepseek_r1
```
Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
streaming chat completions feature.
The streaming chat completions feature allows you to receive chat completions
in real time as they are generated by the model. This is useful when you want
to display output to the user as it is produced.
Remember to check that `content` and `reasoning` exist in `ChatCompletionChunk`;
`content` may not be present, leading to errors if you try to access it.
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# ruff: noqa: E501
# For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model, messages=messages, stream=True)
print("client: Start streaming chat completions...")
printed_reasoning = False
printed_content = False
for chunk in stream:
# Safely extract reasoning and content from delta,
# defaulting to None if attributes don't exist or are empty strings
reasoning = getattr(chunk.choices[0].delta, "reasoning", None) or None
content = getattr(chunk.choices[0].delta, "content", None) or None
if reasoning is not None:
if not printed_reasoning:
printed_reasoning = True
print("reasoning:", end="", flush=True)
print(reasoning, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def parse_args():
parser = argparse.ArgumentParser(description="Client for vLLM API server")
parser.add_argument(
"--stream", action="store_true", help="Enable streaming response"
)
return parser.parse_args()
def main(args):
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Completion API
completion = client.completions.create(
model=model,
prompt="A robot may not injure a human being",
echo=False,
n=2,
stream=args.stream,
logprobs=3,
)
print("-" * 50)
print("Completion results:")
if args.stream:
for c in completion:
print(c)
else:
print(completion)
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -0,0 +1,44 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server.
Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses
For example:
vllm serve Qwen/Qwen3-8B --reasoning-parser qwen3
"""
from openai import OpenAI
input_messages = [{"role": "user", "content": "What model are you?"}]
def main():
base_url = "http://localhost:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = "Qwen/Qwen3-8B" # get_first_model(client)
response = client.responses.create(
model=model,
input=input_messages,
)
for message in response.output:
if message.type == "reasoning":
# append reasoning message
input_messages.append(message)
response_2 = client.responses.create(
model=model,
input=input_messages,
)
print(response_2.output_text)
# I am Qwen, a large language model developed by Alibaba Cloud.
# I am designed to assist with a wide range of tasks, including
# answering questions, creating content, coding, and engaging in
# conversations. I can help with various topics and provide
# information or support in multiple languages. How can I assist you today?
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,184 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example demonstrating MCP (Model Context Protocol) tools with the Responses API.
This example shows how to use MCP tools with different allowed_tools configurations:
1. No filter (allows all tools from the MCP server)
2. Wildcard "*" (explicitly allows all tools)
3. Specific tool names (filters to only those tools)
Set up this example by starting a vLLM OpenAI-compatible server with MCP tools enabled.
For example:
vllm serve openai/gpt-oss-20b --enforce-eager --tool-server demo
Environment variables:
- VLLM_ENABLE_RESPONSES_API_STORE=1
- VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=code_interpreter,container
- VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS=1
"""
from openai import OpenAI
from utils import get_first_model
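# NOTE: get_first_model comes from the local examples `utils` module; it is
# assumed to simply return the first model ID reported by `client.models.list()`.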
def example_no_filter():
"""Example with no allowed_tools filter - allows all tools."""
print("=" * 60)
print("Example 1: No allowed_tools filter (allows all tools)")
print("=" * 60)
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model,
input="Execute this code: print('Hello from Python!')",
instructions="Use the Python tool to execute code.",
tools=[
{
"type": "mcp",
"server_label": "code_interpreter",
"server_url": "http://localhost:8888",
# No allowed_tools specified - all tools are available
}
],
)
print(f"Status: {response.status}")
print(f"Output: {response.output_text}")
print()
def example_wildcard():
"""Example with allowed_tools=['*'] - explicitly allows all tools."""
print("=" * 60)
print("Example 2: allowed_tools=['*'] (select all tools)")
print("=" * 60)
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model,
input="Execute this code: print('Hello from Python with wildcard!')",
instructions="Use the Python tool to execute code.",
tools=[
{
"type": "mcp",
"server_label": "code_interpreter",
"server_url": "http://localhost:8888",
# Using "*" to explicitly allow all tools from this MCP server
# This is equivalent to not specifying allowed_tools
"allowed_tools": ["*"],
}
],
)
print(f"Status: {response.status}")
print(f"Output: {response.output_text}")
print()
def example_specific_tools():
"""Example with specific allowed_tools list - filters available tools.
Note: This example uses 'web_search_preview' (browser) which has multiple
sub-tools: 'search', 'open', 'find'. The code_interpreter (python) doesn't
have sub-tools, so filtering doesn't apply there.
"""
print("=" * 60)
print("Example 3: allowed_tools=['search'] (filter browser to specific tools)")
print("=" * 60)
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model,
input="Search for 'Python programming tutorials'",
instructions="Use the browser tool to search.",
tools=[
{
"type": "mcp",
"server_label": "web_search_preview",
"server_url": "http://localhost:8888",
# Browser has tools: 'search', 'open', 'find'
# Only allow 'search' - blocks 'open' and 'find'
"allowed_tools": ["search"],
}
],
)
print(f"Status: {response.status}")
print(f"Output: {response.output_text}")
print()
def example_object_format():
"""Example using object format for allowed_tools with browser tools."""
print("=" * 60)
print("Example 4: allowed_tools with object format")
print("=" * 60)
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model,
input="Search for 'machine learning' and open the first result",
instructions="Use the browser tool.",
tools=[
{
"type": "mcp",
"server_label": "web_search_preview",
"server_url": "http://localhost:8888",
# Object format with tool_names field
# Can also include read_only and other fields
# Browser has tools: 'search', 'open', 'find'
"allowed_tools": {
"tool_names": [
"search",
"open",
], # Allow search and open, block find
"read_only": False,
},
}
],
)
print(f"Status: {response.status}")
print(f"Output: {response.output_text}")
print()
def main():
"""Run all examples."""
print("\n" + "=" * 60)
print("MCP Tools with allowed_tools Examples")
print("=" * 60 + "\n")
# Run all examples
example_no_filter()
example_wildcard()
example_specific_tools()
example_object_format()
print("=" * 60)
print("Summary:")
print(" - No filter or '*' → All tools available from server")
print(" - Specific list → Only those sub-tools available")
print(" - Object format → More control with tool_names field")
print("")
print("Note: allowed_tools filters SUB-TOOLS within an MCP server:")
print(" - code_interpreter (python): No sub-tools to filter")
print(" - web_search_preview (browser): Has 'search', 'open', 'find'")
print("=" * 60)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,83 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled.
Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses
For example:
vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
--structured-outputs-config.backend xgrammar \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import json
from openai import OpenAI
from utils import get_first_model
def get_weather(latitude: float, longitude: float) -> str:
"""
Mock function to simulate getting weather data.
In a real application, this would call an external weather API.
"""
return f"Current temperature at ({latitude}, {longitude}) is 20°C."
tools = [
{
"type": "function",
"name": "get_weather",
"description": "Get current temperature for provided coordinates in celsius.",
"parameters": {
"type": "object",
"properties": {
"latitude": {"type": "number"},
"longitude": {"type": "number"},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
}
]
input_messages = [
{"role": "user", "content": "What's the weather like in Paris today?"}
]
def main():
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model, input=input_messages, tools=tools, tool_choice="required"
)
for out in response.output:
if out.type == "function_call":
print("Function call:", out.name, out.arguments)
tool_call = out
args = json.loads(tool_call.arguments)
result = get_weather(args["latitude"], args["longitude"])
input_messages.append(tool_call) # append model's function call message
input_messages.append(
{ # append result message
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}
)
response_2 = client.responses.create(
model=model,
input=input_messages,
tools=tools,
)
print(response_2.output_text)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to use the vLLM API server to perform audio
transcription with the `openai/whisper-large-v3` model.
Before running this script, you must start the vLLM server with the following command:
vllm serve openai/whisper-large-v3
Requirements:
- vLLM with audio support
- openai Python SDK
The script performs:
1. Synchronous transcription using the OpenAI-compatible API.
2. Streaming transcription using the asynchronous OpenAI client.
"""
import asyncio
from openai import AsyncOpenAI, OpenAI
from vllm.assets.audio import AudioAsset
def sync_openai(audio_path: str, client: OpenAI):
"""
Perform synchronous transcription using OpenAI-compatible API.
"""
with open(audio_path, "rb") as f:
transcription = client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
language="en",
response_format="json",
temperature=0.0,
# Additional sampling params not provided by OpenAI API.
extra_body=dict(
seed=4419,
repetition_penalty=1.3,
),
)
print("transcription result:", transcription.text)
async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
"""
Perform asynchronous transcription using OpenAI-compatible API.
"""
print("\ntranscription result:", end=" ")
with open(audio_path, "rb") as f:
transcription = await client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
language="en",
response_format="json",
temperature=0.0,
# Additional sampling params not provided by OpenAI API.
extra_body=dict(
seed=420,
top_p=0.6,
),
stream=True,
)
async for chunk in transcription:
if chunk.choices:
content = chunk.choices[0].get("delta", {}).get("content")
print(content, end="", flush=True)
print() # Final newline after stream ends
def main():
mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
winning_call = str(AudioAsset("winning_call").get_local_path())
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
sync_openai(mary_had_lamb, client)
# Run the asynchronous function
client = AsyncOpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
asyncio.run(stream_openai_response(winning_call, client))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
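"""
This script demonstrates audio translation with the `openai/whisper-large-v3`
model served by vLLM. It performs:
1. A synchronous translation request via the OpenAI-compatible API.
2. A streaming translation request issued directly over HTTP with httpx,
   since the OpenAI translations client does not support streaming.
"""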
import asyncio
import json
import httpx
from openai import OpenAI
from vllm.assets.audio import AudioAsset
def sync_openai(audio_path: str, client: OpenAI):
with open(audio_path, "rb") as f:
translation = client.audio.translations.create(
file=f,
model="openai/whisper-large-v3",
response_format="json",
temperature=0.0,
# Additional params not provided by OpenAI API.
extra_body=dict(
language="it",
seed=4419,
repetition_penalty=1.3,
),
)
print("translation result:", translation.text)
async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
data = {
"language": "it",
"stream": True,
"model": "openai/whisper-large-v3",
}
url = base_url + "/audio/translations"
headers = {"Authorization": f"Bearer {api_key}"}
print("translation result:", end=" ")
# OpenAI translation API client does not support streaming.
async with httpx.AsyncClient() as client:
with open(audio_path, "rb") as f:
async with client.stream(
"POST", url, files={"file": f}, data=data, headers=headers
) as response:
async for line in response.aiter_lines():
# Each line is a JSON object prefixed with 'data: '
if line:
if line.startswith("data: "):
line = line[len("data: ") :]
# Last chunk, stream ends
if line.strip() == "[DONE]":
break
# Parse the JSON response
chunk = json.loads(line)
# Extract and print the content
content = chunk["choices"][0].get("delta", {}).get("content")
print(content, end="")
def main():
foscolo = str(AudioAsset("azacinto_foscolo").get_local_path())
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
sync_openai(foscolo, client)
# Run the asynchronous function
asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,94 @@
# Setup OpenTelemetry POC
1. Install OpenTelemetry packages:
```bash
pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'
```
1. Start Jaeger in a docker container:
```bash
# From: https://www.jaegertracing.io/docs/1.57/getting-started/
docker run --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-p 16686:16686 \
-p 4317:4317 \
-p 4318:4318 \
-p 14250:14250 \
-p 14268:14268 \
-p 14269:14269 \
-p 9411:9411 \
jaegertracing/all-in-one:1.57
```
1. In a new shell, export Jaeger IP:
```bash
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
```
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
```bash
export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
1. In a new shell, send requests with trace context from a dummy client
```bash
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
export OTEL_SERVICE_NAME="client-service"
python dummy_client.py
```
1. Open Jaeger webui: <http://localhost:16686/>
In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
![Traces](https://i.imgur.com/GYHhFjo.png)
1. Clicking on a trace will show its spans and their tags. In this demo, each trace has two spans: one from the dummy client containing the prompt text, and one from vLLM containing metadata about the request.
![Spans details](https://i.imgur.com/OPf6CBL.png)
## Exporter Protocol
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
```bash
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
## Instrumentation of FastAPI
OpenTelemetry allows automatic instrumentation of FastAPI.
1. Install the instrumentation library
```bash
pip install opentelemetry-instrumentation-fastapi
```
1. Run vLLM with `opentelemetry-instrument`
```bash
opentelemetry-instrument vllm serve facebook/opt-125m
```
1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import requests
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
from opentelemetry.trace import SpanKind, set_tracer_provider
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
trace_provider = TracerProvider()
set_tracer_provider(trace_provider)
trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
tracer = trace_provider.get_tracer("dummy-client")
url = "http://localhost:8000/v1/completions"
with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
prompt = "San Francisco is a"
span.set_attribute("prompt", prompt)
headers = {}
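    # Inject the current trace context (the W3C traceparent header) so vLLM can
    # link its server-side span to this client span in Jaeger.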
TraceContextTextMapPropagator().inject(headers)
payload = {
"model": "facebook/opt-125m",
"prompt": prompt,
"max_tokens": 10,
"n": 3,
"use_beam_search": "true",
"temperature": 0.0,
# "stream": True,
}
response = requests.post(url, headers=headers, json=payload)

View File

@@ -0,0 +1,57 @@
# Prometheus and Grafana
This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can check out other deployment methods on the [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites.
Install:
- [`docker`](https://docs.docker.com/engine/install/)
- [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
## Launch
Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
```bash
vllm serve mistralai/Mistral-7B-v0.1 \
--max-model-len 2048
```
Launch Prometheus and Grafana servers with `docker compose`:
```bash
docker compose up
```
Submit some sample requests to the server:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve \
--model mistralai/Mistral-7B-v0.1 \
--tokenizer mistralai/Mistral-7B-v0.1 \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
--request-rate 3.0
```
Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM.
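For a quick command-line check, you can also inspect the endpoint directly (in recent vLLM releases the metric names are prefixed with `vllm:`):
```bash
curl -s http://localhost:8000/metrics | grep "^vllm:" | head
```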
## Grafana Dashboard
Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`).
### Add Prometheus Data Source
Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus.
On the Prometheus configuration page, add the `Prometheus Server URL` under `Connection`. In this setup, Grafana and Prometheus run in separate containers, but Docker creates a DNS name for each container, so you can simply use `http://prometheus:9090`.
Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
### Import Dashboard
Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:
![Grafana Dashboard Image](https://i.imgur.com/R2vH9VW.png)

View File

@@ -0,0 +1,19 @@
# docker-compose.yaml
version: "3"
services:
prometheus:
image: prom/prometheus:latest
extra_hosts:
- "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine
ports:
- "9090:9090" # the default port used by Prometheus
volumes:
- ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
grafana:
image: grafana/grafana:latest
depends_on:
- prometheus
ports:
- "3000:3000" # the default port used by Grafana

File diff suppressed because it is too large

View File

@@ -0,0 +1,10 @@
# prometheus.yaml
global:
scrape_interval: 5s
evaluation_interval: 30s
scrape_configs:
- job_name: vllm
static_configs:
- targets:
- 'host.docker.internal:8000'

View File

@@ -0,0 +1,79 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
vLLM OpenAI-Compatible Client with Prompt Embeddings
This script demonstrates how to:
1. Generate prompt embeddings using Hugging Face Transformers
2. Encode them in base64 format
3. Send them to a vLLM server via the OpenAI-compatible Completions API
Run the vLLM server first:
vllm serve meta-llama/Llama-3.2-1B-Instruct \
--runner generate \
--max-model-len 4096 \
--enable-prompt-embeds
Run the client:
python examples/online_serving/prompt_embed_inference_with_openai_client.py
Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
You must request access to use it:
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
Dependencies:
- transformers
- torch
- openai
"""
import transformers
from openai import OpenAI
from vllm.utils.serial_utils import tensor2base64
def main():
client = OpenAI(
api_key="EMPTY",
base_url="http://localhost:8000/v1",
)
model_name = "meta-llama/Llama-3.2-1B-Instruct"
# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(
chat, add_generation_prompt=True, return_tensors="pt"
)
embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)
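    # token_ids has shape (1, seq_len); after the embedding lookup and squeeze,
    # prompt_embeds has shape (seq_len, hidden_size).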
# Prompt embeddings
encoded_embeds = tensor2base64(prompt_embeds)
completion = client.completions.create(
model=model_name,
# NOTE: The OpenAI client does not allow `None` as an input to
# `prompt`. Use an empty string if you have no text prompts.
prompt="",
max_tokens=5,
temperature=0.0,
# NOTE: The OpenAI client allows passing in extra JSON body via the
# `extra_body` argument.
extra_body={"prompt_embeds": encoded_embeds},
)
print("-" * 30)
print(completion.choices[0].text)
print("-" * 30)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,55 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Deploy DeepSeek R1 or V3 with Ray Serve LLM.
Ray Serve LLM is a scalable and production-grade model serving library built
on the Ray distributed computing framework, with first-class support for the vLLM engine.
Key features:
- Automatic scaling, back-pressure, and load balancing across a Ray cluster.
- Unified multi-node multi-model deployment.
- Exposes an OpenAI-compatible HTTP API.
- Multi-LoRA support with shared base models.
Run `python3 ray_serve_deepseek.py` to launch an endpoint.
Learn more in the official Ray Serve LLM documentation:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html
"""
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
llm_config = LLMConfig(
model_loading_config={
"model_id": "deepseek",
# Pre-downloading the model to local storage is recommended since
# the model is large. Set model_source="/path/to/the/model".
"model_source": "deepseek-ai/DeepSeek-R1",
},
deployment_config={
"autoscaling_config": {
"min_replicas": 1,
"max_replicas": 1,
}
},
# Set to the node's accelerator type.
accelerator_type="H100",
# Customize engine arguments as required (for example, vLLM engine kwargs).
engine_kwargs={
"tensor_parallel_size": 8,
"pipeline_parallel_size": 2,
"gpu_memory_utilization": 0.92,
"dtype": "auto",
"max_num_seqs": 40,
"max_model_len": 16384,
"enable_chunked_prefill": True,
"enable_prefix_caching": True,
"trust_remote_code": True,
},
)
# Deploy the application.
llm_app = build_openai_app({"llm_configs": [llm_config]})
serve.run(llm_app)
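# NOTE (assumption): once the deployment is running, Ray Serve exposes the
# OpenAI-compatible routes (e.g. /v1/chat/completions) on its HTTP proxy,
# which listens on port 8000 by default, so the endpoint can be queried with
# the standard OpenAI client.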

View File

@@ -0,0 +1,257 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Retrieval Augmented Generation (RAG) Implementation with Langchain
==================================================================
This script demonstrates a RAG implementation using LangChain, Milvus
and vLLM. RAG enhances LLM responses by retrieving relevant context
from a document collection.
Features:
- Web content loading and chunking
- Vector storage with Milvus
- Embedding generation with vLLM
- Question answering with context
Prerequisites:
1. Install dependencies:
pip install -U vllm \
langchain_milvus langchain_openai \
langchain_community beautifulsoup4 \
langchain-text-splitters
2. Start services:
# Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
# Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
Usage:
python retrieval_augmented_generation_with_langchain.py
Notes:
- Ensure both vLLM services are running before executing
- Default ports: 8000 (embedding), 8001 (chat)
- First run may take time to download models
"""
import argparse
from argparse import Namespace
from typing import Any
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_milvus import Milvus
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
def load_and_split_documents(config: dict[str, Any]):
"""
Load and split documents from web URL
"""
try:
loader = WebBaseLoader(web_paths=(config["url"],))
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config["chunk_size"],
chunk_overlap=config["chunk_overlap"],
)
return text_splitter.split_documents(docs)
except Exception as e:
print(f"Error loading document from {config['url']}: {str(e)}")
raise
def init_vectorstore(config: dict[str, Any], documents: list[Document]):
"""
Initialize vector store with documents
"""
return Milvus.from_documents(
documents=documents,
embedding=OpenAIEmbeddings(
model=config["embedding_model"],
openai_api_key=config["vllm_api_key"],
openai_api_base=config["vllm_embedding_endpoint"],
),
connection_args={"uri": config["uri"]},
drop_old=True,
)
def init_llm(config: dict[str, Any]):
"""
Initialize llm
"""
return ChatOpenAI(
model=config["chat_model"],
openai_api_key=config["vllm_api_key"],
openai_api_base=config["vllm_chat_endpoint"],
)
def get_qa_prompt():
"""
Get question answering prompt template
"""
template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
return PromptTemplate.from_template(template)
def format_docs(docs: list[Document]):
"""
Format documents for prompt
"""
return "\n\n".join(doc.page_content for doc in docs)
def create_qa_chain(retriever: Any, llm: ChatOpenAI, prompt: PromptTemplate):
"""
Set up question answering chain
"""
return (
{
"context": retriever | format_docs,
"question": RunnablePassthrough(),
}
| prompt
| llm
| StrOutputParser()
)
def get_parser() -> argparse.ArgumentParser:
"""
Parse command line arguments
"""
parser = argparse.ArgumentParser(description="RAG with vLLM and langchain")
# Add command line arguments
parser.add_argument(
"--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
)
parser.add_argument(
"--vllm-embedding-endpoint",
default="http://localhost:8000/v1",
help="Base URL for embedding service",
)
parser.add_argument(
"--vllm-chat-endpoint",
default="http://localhost:8001/v1",
help="Base URL for chat service",
)
parser.add_argument("--uri", default="./milvus.db", help="URI for Milvus database")
parser.add_argument(
"--url",
default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"),
help="URL of the document to process",
)
parser.add_argument(
"--embedding-model",
default="ssmits/Qwen2-7B-Instruct-embed-base",
help="Model name for embeddings",
)
parser.add_argument(
"--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
)
parser.add_argument(
"-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
)
parser.add_argument(
"-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
)
parser.add_argument(
"-c",
"--chunk-size",
type=int,
default=1000,
help="Chunk size for document splitting",
)
parser.add_argument(
"-o",
"--chunk-overlap",
type=int,
default=200,
help="Chunk overlap for document splitting",
)
return parser
def init_config(args: Namespace):
"""
Initialize configuration settings from command line arguments
"""
return {
"vllm_api_key": args.vllm_api_key,
"vllm_embedding_endpoint": args.vllm_embedding_endpoint,
"vllm_chat_endpoint": args.vllm_chat_endpoint,
"uri": args.uri,
"embedding_model": args.embedding_model,
"chat_model": args.chat_model,
"url": args.url,
"chunk_size": args.chunk_size,
"chunk_overlap": args.chunk_overlap,
"top_k": args.top_k,
}
def main():
# Parse command line arguments
args = get_parser().parse_args()
# Initialize configuration
config = init_config(args)
# Load and split documents
documents = load_and_split_documents(config)
# Initialize vector store and retriever
vectorstore = init_vectorstore(config, documents)
retriever = vectorstore.as_retriever(search_kwargs={"k": config["top_k"]})
# Initialize llm and prompt
llm = init_llm(config)
prompt = get_qa_prompt()
# Set up QA chain
qa_chain = create_qa_chain(retriever, llm, prompt)
# Interactive mode
if args.interactive:
print("\nWelcome to Interactive Q&A System!")
print("Enter 'q' or 'quit' to exit.")
while True:
question = input("\nPlease enter your question: ")
if question.lower() in ["q", "quit"]:
print("\nThank you for using! Goodbye!")
break
output = qa_chain.invoke(question)
print(output)
else:
# Default single question mode
question = "How to install vLLM?"
output = qa_chain.invoke(question)
print("-" * 50)
print(output)
print("-" * 50)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,225 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
RAG (Retrieval Augmented Generation) Implementation with LlamaIndex
================================================================
This script demonstrates a RAG system using:
- LlamaIndex: For document indexing and retrieval
- Milvus: As vector store backend
- vLLM: For embedding and text generation
Features:
1. Document Loading & Processing
2. Embedding & Storage
3. Query Processing
Requirements:
1. Install dependencies:
pip install llama-index llama-index-readers-web \
llama-index-llms-openai-like \
llama-index-embeddings-openai-like \
llama-index-vector-stores-milvus \
2. Start services:
# Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
# Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
Usage:
python retrieval_augmented_generation_with_llamaindex.py
Notes:
- Ensure both vLLM services are running before executing
- Default ports: 8000 (embedding), 8001 (chat)
- First run may take time to download models
"""
import argparse
from argparse import Namespace
from typing import Any
from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
from llama_index.llms.openai_like import OpenAILike
from llama_index.readers.web import SimpleWebPageReader
from llama_index.vector_stores.milvus import MilvusVectorStore
def init_config(args: Namespace):
"""Initialize configuration with command line arguments"""
return {
"url": args.url,
"embedding_model": args.embedding_model,
"chat_model": args.chat_model,
"vllm_api_key": args.vllm_api_key,
"embedding_endpoint": args.embedding_endpoint,
"chat_endpoint": args.chat_endpoint,
"db_path": args.db_path,
"chunk_size": args.chunk_size,
"chunk_overlap": args.chunk_overlap,
"top_k": args.top_k,
}
def load_documents(url: str) -> list:
"""Load and process web documents"""
return SimpleWebPageReader(html_to_text=True).load_data([url])
def setup_models(config: dict[str, Any]):
"""Configure embedding and chat models"""
Settings.embed_model = OpenAILikeEmbedding(
api_base=config["embedding_endpoint"],
api_key=config["vllm_api_key"],
model_name=config["embedding_model"],
)
Settings.llm = OpenAILike(
model=config["chat_model"],
api_key=config["vllm_api_key"],
api_base=config["chat_endpoint"],
context_window=128000,
is_chat_model=True,
is_function_calling_model=False,
)
Settings.transformations = [
SentenceSplitter(
chunk_size=config["chunk_size"],
chunk_overlap=config["chunk_overlap"],
)
]
def setup_vector_store(db_path: str) -> MilvusVectorStore:
"""Initialize vector store"""
sample_emb = Settings.embed_model.get_text_embedding("test")
print(f"Embedding dimension: {len(sample_emb)}")
return MilvusVectorStore(uri=db_path, dim=len(sample_emb), overwrite=True)
def create_index(documents: list, vector_store: MilvusVectorStore):
"""Create document index"""
storage_context = StorageContext.from_defaults(vector_store=vector_store)
return VectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
)
def query_document(index: VectorStoreIndex, question: str, top_k: int):
"""Query document with given question"""
query_engine = index.as_query_engine(similarity_top_k=top_k)
return query_engine.query(question)
def get_parser() -> argparse.ArgumentParser:
"""Parse command line arguments"""
parser = argparse.ArgumentParser(description="RAG with vLLM and LlamaIndex")
# Add command line arguments
parser.add_argument(
"--url",
default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"),
help="URL of the document to process",
)
parser.add_argument(
"--embedding-model",
default="ssmits/Qwen2-7B-Instruct-embed-base",
help="Model name for embeddings",
)
parser.add_argument(
"--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
)
parser.add_argument(
"--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
)
parser.add_argument(
"--embedding-endpoint",
default="http://localhost:8000/v1",
help="Base URL for embedding service",
)
parser.add_argument(
"--chat-endpoint",
default="http://localhost:8001/v1",
help="Base URL for chat service",
)
parser.add_argument(
"--db-path", default="./milvus_demo.db", help="Path to Milvus database"
)
parser.add_argument(
"-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
)
parser.add_argument(
"-c",
"--chunk-size",
type=int,
default=1000,
help="Chunk size for document splitting",
)
parser.add_argument(
"-o",
"--chunk-overlap",
type=int,
default=200,
help="Chunk overlap for document splitting",
)
parser.add_argument(
"-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
)
return parser
def main():
# Parse command line arguments
args = get_parser().parse_args()
# Initialize configuration
config = init_config(args)
# Load documents
documents = load_documents(config["url"])
# Setup models
setup_models(config)
# Setup vector store
vector_store = setup_vector_store(config["db_path"])
# Create index
index = create_index(documents, vector_store)
if args.interactive:
print("\nEntering interactive mode. Type 'quit' to exit.")
while True:
# Get user question
question = input("\nEnter your question: ")
# Check for exit command
if question.lower() in ["quit", "exit", "q"]:
print("Exiting interactive mode...")
break
# Get and print response
print("\n" + "-" * 50)
print("Response:\n")
response = query_document(index, question, config["top_k"])
print(response)
print("-" * 50)
else:
# Single query mode
question = "How to install vLLM?"
response = query_document(index, question, config["top_k"])
print("-" * 50)
print("Response:\n")
print(response)
print("-" * 50)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,131 @@
#!/bin/bash
#
# Launch a Ray cluster inside Docker for vLLM inference.
#
# This script can start either a head node or a worker node, depending on the
# --head or --worker flag provided as the third positional argument.
#
# Usage:
# 1. Designate one machine as the head node and execute:
# bash run_cluster.sh \
# vllm/vllm-openai \
# <head_node_ip> \
# --head \
# /abs/path/to/huggingface/cache \
# -e VLLM_HOST_IP=<head_node_ip>
#
# 2. On every worker machine, execute:
# bash run_cluster.sh \
# vllm/vllm-openai \
# <head_node_ip> \
# --worker \
# /abs/path/to/huggingface/cache \
# -e VLLM_HOST_IP=<worker_node_ip>
#
# Each worker requires a unique VLLM_HOST_IP value.
# Keep each terminal session open. Closing a session stops the associated Ray
# node and thereby shuts down the entire cluster.
# Every machine must be reachable at the supplied IP address.
#
# The container is named "node-<random_suffix>". To open a shell inside
# a container after launch, use:
# docker exec -it node-<random_suffix> /bin/bash
#
# Then, you can execute vLLM commands on the Ray cluster as if it were a
# single machine, e.g. vllm serve ...
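# For example (model name and parallel sizes below are illustrative only),
# inside the head node's container you might run:
#   vllm serve deepseek-ai/DeepSeek-R1 \
#     --tensor-parallel-size 8 \
#     --pipeline-parallel-size 2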
#
# To stop the container, use:
# docker stop node-<random_suffix>
# Check for minimum number of required arguments.
if [ $# -lt 4 ]; then
echo "Usage: $0 docker_image head_node_ip --head|--worker path_to_hf_home [additional_args...]"
exit 1
fi
# Extract the mandatory positional arguments and remove them from $@.
DOCKER_IMAGE="$1"
HEAD_NODE_ADDRESS="$2"
NODE_TYPE="$3" # Should be --head or --worker.
PATH_TO_HF_HOME="$4"
shift 4
# Preserve any extra arguments so they can be forwarded to Docker.
ADDITIONAL_ARGS=("$@")
# Validate the NODE_TYPE argument.
if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
echo "Error: Node type must be --head or --worker"
exit 1
fi
# Extract VLLM_HOST_IP from ADDITIONAL_ARGS (e.g. "-e VLLM_HOST_IP=...").
VLLM_HOST_IP=""
for ((i = 0; i < ${#ADDITIONAL_ARGS[@]}; i++)); do
arg="${ADDITIONAL_ARGS[$i]}"
case "${arg}" in
-e)
next="${ADDITIONAL_ARGS[$((i + 1))]:-}"
if [[ "${next}" == VLLM_HOST_IP=* ]]; then
VLLM_HOST_IP="${next#VLLM_HOST_IP=}"
break
fi
;;
-eVLLM_HOST_IP=* | VLLM_HOST_IP=*)
VLLM_HOST_IP="${arg#*=}"
break
;;
esac
done
# For the head node, HEAD_NODE_ADDRESS and VLLM_HOST_IP should be consistent.
if [[ "${NODE_TYPE}" == "--head" && -n "${VLLM_HOST_IP}" ]]; then
if [[ "${VLLM_HOST_IP}" != "${HEAD_NODE_ADDRESS}" ]]; then
echo "Warning: VLLM_HOST_IP (${VLLM_HOST_IP}) differs from head_node_ip (${HEAD_NODE_ADDRESS})."
echo "Using VLLM_HOST_IP as the head node address."
HEAD_NODE_ADDRESS="${VLLM_HOST_IP}"
fi
fi
# Generate a unique container name with random suffix.
# Docker container names must be unique on each host.
# The random suffix allows multiple Ray containers to run simultaneously on the same machine,
# for example, on a multi-GPU machine.
CONTAINER_NAME="node-${RANDOM}"
# Define a cleanup routine that removes the container when the script exits.
# This prevents orphaned containers from accumulating if the script is interrupted.
cleanup() {
docker stop "${CONTAINER_NAME}"
docker rm "${CONTAINER_NAME}"
}
trap cleanup EXIT
# Build the Ray start command based on the node role.
# The head node manages the cluster and accepts connections on port 6379,
# while workers connect to the head's address.
RAY_START_CMD="ray start --block"
if [ "${NODE_TYPE}" == "--head" ]; then
RAY_START_CMD+=" --head --node-ip-address=${HEAD_NODE_ADDRESS} --port=6379"
else
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
if [ -n "${VLLM_HOST_IP}" ]; then
RAY_START_CMD+=" --node-ip-address=${VLLM_HOST_IP}"
fi
fi
# Launch the container with the assembled parameters.
# --network host: Allows Ray nodes to communicate directly via host networking
# --shm-size 10.24g: Increases shared memory
# --gpus all: Gives container access to all GPUs on the host
# -v HF_HOME: Mounts HuggingFace cache to avoid re-downloading models
docker run \
--entrypoint /bin/bash \
--network host \
--name "${CONTAINER_NAME}" \
--shm-size 10.24g \
--gpus all \
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
"${ADDITIONAL_ARGS[@]}" \
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}"

View File

@@ -0,0 +1,24 @@
#!/bin/bash
# Define the prefix for environment variables to look for
PREFIX="SM_VLLM_"
ARG_PREFIX="--"
# Initialize an array for storing the arguments
# Port 8080 is required by SageMaker: https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response
ARGS=(--port 8080)
# Loop through all environment variables
while IFS='=' read -r key value; do
# Remove the prefix from the key, convert to lowercase, and replace underscores with dashes
arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
# Add the argument name and value to the ARGS array
ARGS+=("${ARG_PREFIX}${arg_name}")
if [ -n "$value" ]; then
ARGS+=("$value")
fi
done < <(env | grep "^${PREFIX}")
# Pass the collected arguments to the main entrypoint
exec vllm serve "${ARGS[@]}"

View File

@@ -0,0 +1,311 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
vLLM Chat Assistant - A Streamlit Web Interface
A streamlined chat interface that integrates quickly
with the vLLM API server.
Features:
- Management of multiple chat sessions
- Streaming response display
- Configurable API endpoint
- Real-time chat history
- Reasoning Display: Optional thinking process visualization
Requirements:
pip install streamlit openai
Usage:
# Start the app with default settings
streamlit run streamlit_openai_chatbot_webserver.py
# Start with custom vLLM API endpoint
VLLM_API_BASE="http://your-server:8000/v1" \
streamlit run streamlit_openai_chatbot_webserver.py
# Enable debug mode
streamlit run streamlit_openai_chatbot_webserver.py \
--logger.level=debug
"""
import os
from datetime import datetime
import streamlit as st
from openai import OpenAI
# Get command line arguments from environment variables
openai_api_key = os.getenv("VLLM_API_KEY", "EMPTY")
openai_api_base = os.getenv("VLLM_API_BASE", "http://localhost:8000/v1")
# Initialize session states for managing chat sessions
if "sessions" not in st.session_state:
st.session_state.sessions = {}
if "current_session" not in st.session_state:
st.session_state.current_session = None
if "messages" not in st.session_state:
st.session_state.messages = []
if "active_session" not in st.session_state:
st.session_state.active_session = None
# Add new session state for reasoning
if "show_reasoning" not in st.session_state:
st.session_state.show_reasoning = {}
# Initialize session state for API base URL
if "api_base_url" not in st.session_state:
st.session_state.api_base_url = openai_api_base
def create_new_chat_session():
"""Create a new chat session with timestamp as unique identifier.
This function initializes a new chat session by:
1. Generating a timestamp-based session ID
2. Creating an empty message list for the new session
3. Setting the new session as both current and active session
4. Resetting the messages list for the new session
Returns:
None
Session State Updates:
- sessions: Adds new empty message list with timestamp key
- current_session: Sets to new session ID
- active_session: Sets to new session ID
- messages: Resets to empty list
"""
session_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
st.session_state.sessions[session_id] = []
st.session_state.current_session = session_id
st.session_state.active_session = session_id
st.session_state.messages = []
def switch_to_chat_session(session_id):
"""Switch the active chat context to a different session.
Args:
session_id (str): The timestamp ID of the session to switch to
This function handles chat session switching by:
1. Setting the specified session as current
2. Updating the active session marker
3. Loading the messages history from the specified session
Session State Updates:
- current_session: Updated to specified session_id
- active_session: Updated to specified session_id
- messages: Loaded from sessions[session_id]
"""
st.session_state.current_session = session_id
st.session_state.active_session = session_id
st.session_state.messages = st.session_state.sessions[session_id]
def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None):
"""Generate and stream LLM response with optional reasoning process.
Args:
messages (list): List of conversation message dicts with 'role' and 'content'
model (str): The model identifier to use for generation
reason (bool): Whether to enable and display reasoning process
content_ph (streamlit.empty): Placeholder for streaming response content
reasoning_ph (streamlit.empty): Placeholder for streaming reasoning process
Returns:
tuple: (str, str)
- First string contains the complete response text
- Second string contains the complete reasoning text (if enabled)
Features:
- Streams both reasoning and response text in real-time
- Handles model API errors gracefully
- Supports live updating of thinking process
- Maintains separate content and reasoning displays
Raises:
Exception: Wrapped in error message if API call fails
Note:
The function uses streamlit placeholders for live updates.
When reason=True, the reasoning process appears above the response.
"""
full_text = ""
think_text = ""
live_think = None
# Build request parameters
params = {"model": model, "messages": messages, "stream": True}
if reason:
params["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}
try:
response = client.chat.completions.create(**params)
if isinstance(response, str):
if content_ph:
content_ph.markdown(response)
return response, ""
# Prepare reasoning expander above content
if reason and reasoning_ph:
exp = reasoning_ph.expander("💭 Thinking Process (live)", expanded=True)
live_think = exp.empty()
# Stream chunks
for chunk in response:
delta = chunk.choices[0].delta
# Stream reasoning first
if reason and hasattr(delta, "reasoning") and live_think:
rc = delta.reasoning
if rc:
think_text += rc
live_think.markdown(think_text + "▌")  # "▌" renders a typing cursor while streaming
# Then stream content
if hasattr(delta, "content") and delta.content and content_ph:
full_text += delta.content
content_ph.markdown(full_text + "▌")
# Finalize displays: reasoning remains above, content below
if reason and live_think:
live_think.markdown(think_text)
if content_ph:
content_ph.markdown(full_text)
return full_text, think_text
except Exception as e:
st.error(f"Error details: {str(e)}")
return f"Error: {str(e)}", ""
# Sidebar - API Settings first
st.sidebar.title("API Settings")
new_api_base = st.sidebar.text_input(
"API Base URL:", value=st.session_state.api_base_url
)
if new_api_base != st.session_state.api_base_url:
st.session_state.api_base_url = new_api_base
st.rerun()
st.sidebar.divider()
# Sidebar - Session Management
st.sidebar.title("Chat Sessions")
if st.sidebar.button("New Session"):
create_new_chat_session()
# Display all sessions in reverse chronological order
for session_id in sorted(st.session_state.sessions.keys(), reverse=True):
# Mark the active session with a pinned button
if session_id == st.session_state.active_session:
st.sidebar.button(
f"📍 {session_id}",
key=session_id,
type="primary",
on_click=switch_to_chat_session,
args=(session_id,),
)
else:
st.sidebar.button(
f"Session {session_id}",
key=session_id,
on_click=switch_to_chat_session,
args=(session_id,),
)
# Main interface
st.title("vLLM Chat Assistant")
# Initialize OpenAI client with API settings
client = OpenAI(api_key=openai_api_key, base_url=st.session_state.api_base_url)
# Get and display current model id
models = client.models.list()
model = models.data[0].id
st.markdown(f"**Model**: {model}")
# Initialize first session if none exists
if st.session_state.current_session is None:
create_new_chat_session()
st.session_state.active_session = st.session_state.current_session
# Update the chat history display section
for idx, msg in enumerate(st.session_state.messages):
# Render user messages normally
if msg["role"] == "user":
with st.chat_message("user"):
st.write(msg["content"])
# Render assistant messages with reasoning above
else:
# If reasoning exists for this assistant message, show it above the content
if idx in st.session_state.show_reasoning:
with st.expander("💭 Thinking Process", expanded=False):
st.markdown(st.session_state.show_reasoning[idx])
with st.chat_message("assistant"):
st.write(msg["content"])
# Setup & Cache reasoning support check
@st.cache_data(show_spinner=False)
def server_supports_reasoning():
"""Check if the current model supports reasoning capability.
Returns:
bool: True if the model supports reasoning, False otherwise
"""
resp = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": "Hi"}],
stream=False,
)
return hasattr(resp.choices[0].message, "reasoning") and bool(
resp.choices[0].message.reasoning
)
# Check support
supports_reasoning = server_supports_reasoning()
# Add reasoning toggle in sidebar if supported
reason = False # Default to False
if supports_reasoning:
reason = st.sidebar.checkbox("Enable Reasoning", value=False)
else:
st.sidebar.markdown(
"<span style='color:gray;'>Reasoning unavailable for this model.</span>",
unsafe_allow_html=True,
)
# reason remains False
# Update the input handling section
if prompt := st.chat_input("Type your message here..."):
# Save and display user message
st.session_state.messages.append({"role": "user", "content": prompt})
st.session_state.sessions[st.session_state.current_session] = (
st.session_state.messages
)
with st.chat_message("user"):
st.write(prompt)
# Prepare LLM messages
msgs = [
{"role": m["role"], "content": m["content"]} for m in st.session_state.messages
]
# Stream assistant response
with st.chat_message("assistant"):
# Placeholders: reasoning above, content below
reason_ph = st.empty()
content_ph = st.empty()
full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph)
# Determine index for this new assistant message
message_index = len(st.session_state.messages)
# Save assistant reply
st.session_state.messages.append({"role": "assistant", "content": full})
# Persist reasoning in session state if any
if reason and think:
st.session_state.show_reasoning[message_index] = think

View File

@@ -0,0 +1,58 @@
# Structured Outputs
This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server.
It can run an individual constraint type or all of them.
It supports both streaming responses and concurrent non-streaming requests.
To use this example, you must start a vLLM server with any model of your choice.
```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```
To serve a reasoning model, you can use the following command:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
--reasoning-parser deepseek_r1
```
If you want to run this script standalone with `uv`, you can use the following:
```bash
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
structured-outputs
```
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
!!! tip
If vLLM is running remotely, then set `OPENAI_BASE_URL=<remote_url>` before running the script.
## Usage
Run all constraints, non-streaming:
```bash
uv run structured_outputs.py
```
Run all constraints, streaming:
```bash
uv run structured_outputs.py --stream
```
Run certain constraints, for example `structural_tag` and `regex`, streaming:
```bash
uv run structured_outputs.py \
--constraint structural_tag regex \
--stream
```
Run all constraints, with reasoning models and streaming:
```bash
uv run structured_outputs.py --reasoning --stream
```
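Run all constraints against a remote server (the address below is a placeholder; the script reads `OPENAI_BASE_URL`):

```bash
OPENAI_BASE_URL=http://<remote_host>:8000/v1 uv run structured_outputs.py
```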

View File

@@ -0,0 +1,8 @@
[project]
name = "examples-online-structured-outputs"
requires-python = ">=3.10, <3.14"
dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
version = "0.0.0"
[project.scripts]
structured-outputs = "structured_outputs:main"

View File

@@ -0,0 +1,268 @@
# ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import asyncio
import enum
import os
from typing import Any, Literal
import openai
import pydantic
from openai.types.chat import ChatCompletionChunk
ConstraintsFormat = Literal[
"choice",
"regex",
"json",
"grammar",
"structural_tag",
]
async def print_stream_response(
stream_response: openai.AsyncStream[ChatCompletionChunk],
title: str,
args: argparse.Namespace,
):
print(f"\n\n{title} (Streaming):")
local_reasoning_header_printed = False
local_content_header_printed = False
async for chunk in stream_response:
delta = chunk.choices[0].delta
reasoning_chunk_text: str | None = getattr(delta, "reasoning", None)
content_chunk_text = delta.content
if args.reasoning:
if reasoning_chunk_text:
if not local_reasoning_header_printed:
print(" Reasoning: ", end="")
local_reasoning_header_printed = True
print(reasoning_chunk_text, end="", flush=True)
if content_chunk_text:
if not local_content_header_printed:
if local_reasoning_header_printed:
print()
print(" Content: ", end="")
local_content_header_printed = True
print(content_chunk_text, end="", flush=True)
else:
if content_chunk_text:
if not local_content_header_printed:
print(" Content: ", end="")
local_content_header_printed = True
print(content_chunk_text, end="", flush=True)
print()
class CarType(str, enum.Enum):
SEDAN = "SEDAN"
SUV = "SUV"
TRUCK = "TRUCK"
COUPE = "COUPE"
class CarDescription(pydantic.BaseModel):
brand: str
model: str
car_type: CarType
PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"choice": {
"messages": [
{
"role": "user",
"content": "Classify this sentiment: vLLM is wonderful!",
}
],
"extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
},
"regex": {
"messages": [
{
"role": "user",
"content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
}
],
"extra_body": {
"structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
},
},
"json": {
"messages": [
{
"role": "user",
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "car-description",
"schema": CarDescription.model_json_schema(),
},
},
},
"grammar": {
"messages": [
{
"role": "user",
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
}
],
"extra_body": {
"structured_outputs": {
"grammar": """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
""",
}
},
},
"structural_tag": {
"messages": [
{
"role": "user",
"content": """
You have access to the following function to retrieve the weather in a city:
{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}
If you choose to call a function, ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.
Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?""",
},
],
"response_format": {
"type": "structural_tag",
"structures": [
{
"begin": "<function=get_weather>",
"schema": {
"type": "object",
"properties": {"city": {"type": "string"}},
"required": ["city"],
},
"end": "</function>",
}
],
"triggers": ["<function="],
},
},
}
async def cli():
parser = argparse.ArgumentParser(
description="Run OpenAI Chat Completion with various structured outputs capabilities",
)
_ = parser.add_argument(
"--constraint",
type=str,
nargs="+",
choices=[*list(PARAMS), "*"],
default=["*"],
help="Specify which constraint(s) to run.",
)
_ = parser.add_argument(
"--stream",
action=argparse.BooleanOptionalAction,
default=False,
help="Enable streaming output",
)
_ = parser.add_argument(
"--reasoning",
action=argparse.BooleanOptionalAction,
default=False,
help="Enable printing of reasoning traces if available.",
)
args = parser.parse_args()
base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")
constraints = list(PARAMS) if "*" in args.constraint else list(set(args.constraint))
model = (await client.models.list()).data[0].id
if args.stream:
results = await asyncio.gather(
*[
client.chat.completions.create(
model=model,
max_tokens=1024,
stream=True,
**PARAMS[name],
)
for name in constraints
]
)
for constraint, stream in zip(constraints, results):
await print_stream_response(stream, constraint, args)
else:
results = await asyncio.gather(
*[
client.chat.completions.create(
model=model,
max_tokens=1024,
stream=False,
**PARAMS[name],
)
for name in constraints
]
)
for constraint, response in zip(constraints, results):
print(f"\n\n{constraint}:")
message = response.choices[0].message
if args.reasoning and hasattr(message, "reasoning"):
print(f" Reasoning: {message.reasoning or ''}")
print(f" Content: {message.content!r}")
def main():
asyncio.run(cli())
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import httpx
from transformers import AutoTokenizer
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
DUMMY_API_KEY = "empty"
MODEL_NAME = "Qwen/Qwen3-0.6B"
transport = httpx.HTTPTransport()
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
client = httpx.Client(
transport=transport,
base_url=GEN_ENDPOINT,
timeout=600,
headers=headers,
)
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "How many countries are in the EU?"},
]
def main(client):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
token_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
enable_thinking=False,
)
payload = {
"model": MODEL_NAME,
"token_ids": token_ids,
"sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False},
"stream": False,
}
resp = client.post(GEN_ENDPOINT, json=payload)
resp.raise_for_status()
data = resp.json()
print(data)
print("-" * 50)
print("Token generation results:")
res = tokenizer.decode(data["choices"][0]["token_ids"])
print(res)
print("-" * 50)
if __name__ == "__main__":
main(client)

View File

@@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai import APIConnectionError, OpenAI
from openai.pagination import SyncPage
from openai.types.model import Model
def get_first_model(client: OpenAI) -> str:
"""
Get the first model from the vLLM server.
"""
try:
models: SyncPage[Model] = client.models.list()
except APIConnectionError as e:
raise RuntimeError(
"Failed to get the list of models from the vLLM server at "
f"{client.base_url} with API key {client.api_key}. Check\n"
"1. the server is running\n"
"2. the server URL is correct\n"
"3. the API key is correct"
) from e
if len(models.data) == 0:
raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")
return models.data[0].id
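# Minimal usage sketch (the endpoint and API key below are placeholders for a
# locally running server):
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
#   print(get_first_model(client))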