Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions


@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for `vllm.entrypoints.api_server`
Start the demo server:
python -m vllm.entrypoints.api_server --model <model_name>
NOTE: The API server is used only for demonstration and simple performance
benchmarks. It is not intended for production use.
For production use, we recommend `vllm serve` and the OpenAI client API.
"""
import argparse
import json
from argparse import Namespace
from collections.abc import Iterable

import requests


def clear_line(n: int = 1) -> None:
    LINE_UP = "\033[1A"
    LINE_CLEAR = "\x1b[2K"
    for _ in range(n):
        print(LINE_UP, end=LINE_CLEAR, flush=True)


def post_http_request(
    prompt: str, api_url: str, n: int = 1, stream: bool = False
) -> requests.Response:
    headers = {"User-Agent": "Test Client"}
    pload = {
        "prompt": prompt,
        "n": n,
        "temperature": 0.0,
        "max_tokens": 16,
        "stream": stream,
    }
    response = requests.post(api_url, headers=headers, json=pload, stream=stream)
    return response


def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
    for chunk in response.iter_lines(
        chunk_size=8192, decode_unicode=False, delimiter=b"\n"
    ):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"]
            yield output


def get_response(response: requests.Response) -> list[str]:
    data = json.loads(response.content)
    output = data["text"]
    return output


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--n", type=int, default=1)
    parser.add_argument("--prompt", type=str, default="San Francisco is a")
    parser.add_argument("--stream", action="store_true")
    return parser.parse_args()


def main(args: Namespace):
    prompt = args.prompt
    api_url = f"http://{args.host}:{args.port}/generate"
    n = args.n
    stream = args.stream
    print(f"Prompt: {prompt!r}\n", flush=True)
    response = post_http_request(prompt, api_url, n, stream)

    if stream:
        num_printed_lines = 0
        for h in get_streaming_response(response):
            clear_line(num_printed_lines)
            num_printed_lines = 0
            for i, line in enumerate(h):
                num_printed_lines += 1
                print(f"Beam candidate {i}: {line!r}", flush=True)
    else:
        output = get_response(response)
        for i, line in enumerate(output):
            print(f"Beam candidate {i}: {line!r}", flush=True)


if __name__ == "__main__":
    args = parse_args()
    main(args)
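For reference, a typical demo run might look like the following; the client filename `api_client.py` and the model `facebook/opt-125m` are assumptions used for illustration only:

```bash
# Start the demo API server (any model name works here)
python -m vllm.entrypoints.api_server --model facebook/opt-125m

# In a second shell, stream beam candidates from the /generate endpoint
python api_client.py --host localhost --port 8000 --prompt "San Francisco is a" --stream
```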


@@ -0,0 +1,6 @@
*.png
.git/
ct.yaml
lintconf.yaml
values.schema.json
/workflows


@@ -0,0 +1,21 @@
apiVersion: v2
name: chart-vllm
description: Chart vllm
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.0.1
maintainers:
- name: mfournioux


@@ -0,0 +1,33 @@
# Helm Charts
This directory contains a Helm chart for deploying the vLLM application. The chart includes configurations for deployment, autoscaling, resource management, and more.
## Files
- Chart.yaml: Defines the chart metadata including name, version, and maintainers.
- ct.yaml: Configuration for chart testing.
- lintconf.yaml: Linting rules for YAML files.
- values.schema.json: JSON schema for validating values.yaml.
- values.yaml: Default values for the Helm chart.
- templates/_helpers.tpl: Helper templates for defining common configurations.
- templates/configmap.yaml: Template for creating ConfigMaps.
- templates/custom-objects.yaml: Template for custom Kubernetes objects.
- templates/deployment.yaml: Template for creating Deployments.
- templates/hpa.yaml: Template for Horizontal Pod Autoscaler.
- templates/job.yaml: Template for Kubernetes Jobs.
- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget.
- templates/pvc.yaml: Template for Persistent Volume Claims.
- templates/secrets.yaml: Template for Kubernetes Secrets.
- templates/service.yaml: Template for creating Services.
## Running Tests
This chart includes unit tests using [helm-unittest](https://github.com/helm-unittest/helm-unittest). Install the plugin and run tests:
```bash
# Install plugin
helm plugin install https://github.com/helm-unittest/helm-unittest
# Run tests
helm unittest .
```
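To try the chart end to end, here is a minimal install sketch; the release name, namespace, and any overrides are illustrative rather than part of the chart:

```bash
# Render the templates locally to sanity-check values.yaml
helm template test-vllm .

# Install into a dedicated namespace
helm install test-vllm . -f values.yaml --namespace vllm --create-namespace
```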


@@ -0,0 +1,3 @@
chart-dirs:
- charts
validate-maintainers: false


@@ -0,0 +1,42 @@
---
rules:
braces:
min-spaces-inside: 0
max-spaces-inside: 0
min-spaces-inside-empty: -1
max-spaces-inside-empty: -1
brackets:
min-spaces-inside: 0
max-spaces-inside: 0
min-spaces-inside-empty: -1
max-spaces-inside-empty: -1
colons:
max-spaces-before: 0
max-spaces-after: 1
commas:
max-spaces-before: 0
min-spaces-after: 1
max-spaces-after: 1
comments:
require-starting-space: true
min-spaces-from-content: 2
document-end: disable
document-start: disable # No --- to start a file
empty-lines:
max: 2
max-start: 0
max-end: 0
hyphens:
max-spaces-after: 1
indentation:
spaces: consistent
    indent-sequences: whatever # accept both indented and non-indented list sequences
check-multi-line-strings: false
key-duplicates: enable
line-length: disable # Lines can be any length
new-line-at-end-of-file: disable
new-lines:
type: unix
trailing-spaces: enable
truthy:
level: warning


@@ -0,0 +1,165 @@
{{/*
Define ports for the pods
*/}}
{{- define "chart.container-port" -}}
{{- default "8000" .Values.containerPort }}
{{- end }}
{{/*
Define service name
*/}}
{{- define "chart.service-name" -}}
{{- if .Values.serviceName }}
{{- .Values.serviceName | lower | trim }}
{{- else }}
"{{ .Release.Name }}-service"
{{- end }}
{{- end }}
{{/*
Define service port
*/}}
{{- define "chart.service-port" -}}
{{- if .Values.servicePort }}
{{- .Values.servicePort }}
{{- else }}
{{- include "chart.container-port" . }}
{{- end }}
{{- end }}
{{/*
Define service port name
*/}}
{{- define "chart.service-port-name" -}}
"service-port"
{{- end }}
{{/*
Define container port name
*/}}
{{- define "chart.container-port-name" -}}
"container-port"
{{- end }}
{{/*
Define deployment strategy
*/}}
{{- define "chart.strategy" -}}
strategy:
{{- if not .Values.deploymentStrategy }}
rollingUpdate:
maxSurge: 100%
maxUnavailable: 0
{{- else }}
{{ toYaml .Values.deploymentStrategy | indent 2 }}
{{- end }}
{{- end }}
{{/*
Define additional ports
*/}}
{{- define "chart.extraPorts" }}
{{- with .Values.extraPorts }}
{{ toYaml . }}
{{- end }}
{{- end }}
{{/*
Define chart external ConfigMaps and Secrets
*/}}
{{- define "chart.externalConfigs" -}}
{{- with .Values.externalConfigs -}}
{{ toYaml . }}
{{- end }}
{{- end }}
{{/*
Define liveness and readiness probes
*/}}
{{- define "chart.probes" -}}
{{- if .Values.readinessProbe }}
readinessProbe:
{{- with .Values.readinessProbe }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- if .Values.livenessProbe }}
livenessProbe:
{{- with .Values.livenessProbe }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Define resources
*/}}
{{- define "chart.resources" -}}
requests:
  memory: {{ required "Value 'resources.requests.memory' must be defined!" .Values.resources.requests.memory | quote }}
  cpu: {{ required "Value 'resources.requests.cpu' must be defined!" .Values.resources.requests.cpu | quote }}
  {{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
  nvidia.com/gpu: {{ required "Value 'resources.requests.nvidia.com/gpu' must be defined!" (index .Values.resources.requests "nvidia.com/gpu") | quote }}
  {{- end }}
limits:
  memory: {{ required "Value 'resources.limits.memory' must be defined!" .Values.resources.limits.memory | quote }}
  cpu: {{ required "Value 'resources.limits.cpu' must be defined!" .Values.resources.limits.cpu | quote }}
  {{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
  nvidia.com/gpu: {{ required "Value 'resources.limits.nvidia.com/gpu' must be defined!" (index .Values.resources.limits "nvidia.com/gpu") | quote }}
{{- end }}
{{- end }}
{{/*
Define the user for the main container
*/}}
{{- define "chart.user" }}
{{- if .Values.image.runAsUser }}
runAsUser:
{{- with .Values.runAsUser }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- end }}
{{- define "chart.extraInitEnv" -}}
- name: S3_ENDPOINT_URL
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3endpoint
- name: S3_BUCKET_NAME
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3bucketname
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3accesskeyid
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3accesskey
{{- if .Values.extraInit.s3modelpath }}
- name: S3_PATH
value: "{{ .Values.extraInit.s3modelpath }}"
{{- end }}
{{- if hasKey .Values.extraInit "awsEc2MetadataDisabled" }}
- name: AWS_EC2_METADATA_DISABLED
value: "{{ .Values.extraInit.awsEc2MetadataDisabled }}"
{{- end }}
{{- end }}
{{/*
Define chart labels
*/}}
{{- define "chart.labels" -}}
{{- with .Values.labels -}}
{{ toYaml . }}
{{- end }}
{{- end }}
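For reference, a hedged example of supplying the values that the `chart.extraInitEnv` helper expects; the secret key names (`s3endpoint`, `s3bucketname`, `s3accesskeyid`, `s3accesskey`) come from the `secretKeyRef` entries above, while the endpoint, bucket, and credential values are placeholders:

```bash
helm install test-vllm . \
  --set secrets.s3endpoint="https://s3.example.com" \
  --set secrets.s3bucketname="my-model-bucket" \
  --set secrets.s3accesskeyid="EXAMPLEKEYID" \
  --set secrets.s3accesskey="EXAMPLESECRETKEY" \
  --set extraInit.s3modelpath="relative_s3_model_path/opt-125m"
```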


@@ -0,0 +1,11 @@
{{- if .Values.configs -}}
apiVersion: v1
kind: ConfigMap
metadata:
name: "{{ .Release.Name }}-configs"
namespace: {{ .Release.Namespace }}
data:
{{- with .Values.configs }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end -}}


@@ -0,0 +1,6 @@
{{- if .Values.customObjects }}
{{- range .Values.customObjects }}
{{- tpl (. | toYaml) $ }}
---
{{- end }}
{{- end }}


@@ -0,0 +1,131 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: "{{ .Release.Name }}-deployment-vllm"
namespace: {{ .Release.Namespace }}
labels:
{{- include "chart.labels" . | nindent 4 }}
spec:
replicas: {{ .Values.replicaCount }}
{{- include "chart.strategy" . | nindent 2 }}
selector:
matchLabels:
environment: "test"
release: "test"
progressDeadlineSeconds: 1200
template:
metadata:
labels:
environment: "test"
release: "test"
spec:
containers:
- name: "vllm"
image: "{{ required "Required value 'image.repository' must be defined !" .Values.image.repository }}:{{ required "Required value 'image.tag' must be defined !" .Values.image.tag }}"
{{- if .Values.image.command }}
command :
{{- with .Values.image.command }}
{{- toYaml . | nindent 10 }}
{{- end }}
{{- end }}
securityContext:
{{- if .Values.image.securityContext }}
{{- with .Values.image.securityContext }}
{{- toYaml . | nindent 12 }}
{{- end }}
{{- else }}
runAsNonRoot: false
{{- include "chart.user" . | indent 12 }}
{{- end }}
imagePullPolicy: IfNotPresent
{{- if .Values.image.env }}
env :
{{- with .Values.image.env }}
{{- toYaml . | nindent 10 }}
{{- end }}
{{- else }}
env: []
{{- end }}
{{- if or .Values.externalConfigs .Values.configs .Values.secrets }}
envFrom:
{{- if .Values.configs }}
- configMapRef:
name: "{{ .Release.Name }}-configs"
{{- end }}
{{- if .Values.secrets}}
- secretRef:
name: "{{ .Release.Name }}-secrets"
{{- end }}
{{- include "chart.externalConfigs" . | nindent 12 }}
{{- end }}
ports:
- name: {{ include "chart.container-port-name" . }}
containerPort: {{ include "chart.container-port" . }}
{{- include "chart.extraPorts" . | nindent 12 }}
{{- include "chart.probes" . | indent 10 }}
resources: {{- include "chart.resources" . | nindent 12 }}
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data
{{- with .Values.extraContainers }}
{{ toYaml . | nindent 8 }}
{{- end }}
{{- if and .Values.extraInit (or .Values.extraInit.modelDownload.enabled .Values.extraInit.initContainers) }}
initContainers:
{{- if .Values.extraInit.modelDownload.enabled }}
- name: wait-download-model
image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
command: {{ .Values.extraInit.modelDownload.waitContainer.command | toJson }}
args:
{{- toYaml .Values.extraInit.modelDownload.waitContainer.args | nindent 10 }}
env:
{{- if .Values.extraInit.modelDownload.waitContainer.env }}
{{- toYaml .Values.extraInit.modelDownload.waitContainer.env | nindent 10 }}
{{- else }}
{{- include "chart.extraInitEnv" . | nindent 10 }}
{{- end }}
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 2Gi
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data
{{- end }}
{{- with .Values.extraInit.initContainers }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- end }}
volumes:
- name: {{ .Release.Name }}-storage
persistentVolumeClaim:
claimName: {{ .Release.Name }}-storage-claim
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
runtimeClassName: nvidia
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu.product
operator: In
{{- with .Values.gpuModels }}
values:
{{- toYaml . | nindent 20 }}
{{- end }}
{{- end }}


@@ -0,0 +1,31 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: "{{ .Release.Name }}-hpa"
namespace: {{ .Release.Namespace }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: vllm
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}


@@ -0,0 +1,41 @@
{{- if and .Values.extraInit .Values.extraInit.modelDownload.enabled }}
apiVersion: batch/v1
kind: Job
metadata:
name: "{{ .Release.Name }}-init-vllm"
namespace: {{ .Release.Namespace }}
spec:
ttlSecondsAfterFinished: 100
template:
metadata:
name: init-vllm
spec:
containers:
- name: job-download-model
image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
command: {{ .Values.extraInit.modelDownload.downloadJob.command | toJson }}
args:
{{- toYaml .Values.extraInit.modelDownload.downloadJob.args | nindent 8 }}
env:
{{- if .Values.extraInit.modelDownload.downloadJob.env }}
{{- toYaml .Values.extraInit.modelDownload.downloadJob.env | nindent 8 }}
{{- else }}
{{- include "chart.extraInitEnv" . | nindent 8 }}
{{- end }}
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 2Gi
restartPolicy: OnFailure
volumes:
- name: {{ .Release.Name }}-storage
persistentVolumeClaim:
claimName: "{{ .Release.Name }}-storage-claim"
{{- end }}


@@ -0,0 +1,7 @@
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: "{{ .Release.Name }}-pdb"
namespace: {{ .Release.Namespace }}
spec:
maxUnavailable: {{ default 1 .Values.maxUnavailablePodDisruptionBudget }}


@@ -0,0 +1,13 @@
{{- if .Values.extraInit }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: "{{ .Release.Name }}-storage-claim"
namespace: {{ .Release.Namespace }}
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.extraInit.pvcStorage }}
{{- end }}


@@ -0,0 +1,10 @@
apiVersion: v1
kind: Secret
metadata:
name: "{{ .Release.Name }}-secrets"
namespace: {{ .Release.Namespace }}
type: Opaque
data:
{{- range $key, $val := .Values.secrets }}
{{ $key }}: {{ $val | b64enc | quote }}
{{- end }}


@@ -0,0 +1,14 @@
apiVersion: v1
kind: Service
metadata:
name: "{{ .Release.Name }}-service"
namespace: {{ .Release.Namespace }}
spec:
type: ClusterIP
ports:
- name: {{ include "chart.service-port-name" . }}
port: {{ include "chart.service-port" . }}
targetPort: {{ include "chart.container-port-name" . }}
protocol: TCP
selector:
{{- include "chart.labels" . | nindent 4 }}


@@ -0,0 +1,135 @@
suite: test deployment
templates:
- deployment.yaml
tests:
- it: should create wait-download-model init container when modelDownload is enabled
set:
extraInit:
modelDownload:
enabled: true
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
downloadJob:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
initContainers: [ ]
pvcStorage: "1Gi"
s3modelpath: "relative_s3_model_path/opt-125m"
awsEc2MetadataDisabled: true
asserts:
- hasDocuments:
count: 1
- isKind:
of: Deployment
- isNotEmpty:
path: spec.template.spec.initContainers
- equal:
path: spec.template.spec.initContainers[0].name
value: wait-download-model
- equal:
path: spec.template.spec.initContainers[0].image
value: amazon/aws-cli:2.6.4
- equal:
path: spec.template.spec.initContainers[0].imagePullPolicy
value: IfNotPresent
- it: should only create custom init containers when modelDownload is disabled
set:
extraInit:
modelDownload:
enabled: false
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args: [ "-c", "echo test" ]
downloadJob:
command: [ "/bin/bash" ]
args: [ "-c", "echo test" ]
initContainers:
- name: llm-d-routing-proxy
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8080
name: proxy
pvcStorage: "10Gi"
asserts:
- hasDocuments:
count: 1
- isKind:
of: Deployment
- lengthEqual:
path: spec.template.spec.initContainers
count: 1
- equal:
path: spec.template.spec.initContainers[0].name
value: llm-d-routing-proxy
- equal:
path: spec.template.spec.initContainers[0].image
value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
- equal:
path: spec.template.spec.initContainers[0].ports[0].containerPort
value: 8080
- it: should create both wait-download-model and custom init containers when both are enabled
set:
extraInit:
modelDownload:
enabled: true
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
downloadJob:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
initContainers:
- name: llm-d-routing-proxy
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8080
name: proxy
pvcStorage: "10Gi"
asserts:
- hasDocuments:
count: 1
- isKind:
of: Deployment
- lengthEqual:
path: spec.template.spec.initContainers
count: 2
- equal:
path: spec.template.spec.initContainers[0].name
value: wait-download-model
- equal:
path: spec.template.spec.initContainers[0].image
value: amazon/aws-cli:2.6.4
- equal:
path: spec.template.spec.initContainers[1].name
value: llm-d-routing-proxy
- equal:
path: spec.template.spec.initContainers[1].image
value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
- equal:
path: spec.template.spec.initContainers[1].ports[0].containerPort
value: 8080


@@ -0,0 +1,61 @@
suite: test job
templates:
- job.yaml
tests:
- it: should create job when modelDownload is enabled
set:
extraInit:
modelDownload:
enabled: true
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args: [ "-c", "wait" ]
downloadJob:
command: [ "/bin/bash" ]
args:
- "-eucx"
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
pvcStorage: "1Gi"
s3modelpath: "relative_s3_model_path/opt-125m"
awsEc2MetadataDisabled: true
asserts:
- hasDocuments:
count: 1
- isKind:
of: Job
- equal:
path: spec.template.spec.containers[0].name
value: job-download-model
- equal:
path: spec.template.spec.containers[0].image
value: amazon/aws-cli:2.6.4
- equal:
path: spec.template.spec.restartPolicy
value: OnFailure
- it: should not create job when modelDownload is disabled
set:
extraInit:
modelDownload:
enabled: false
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: [ "/bin/bash" ]
args: [ "-c", "wait" ]
downloadJob:
command: [ "/bin/bash" ]
args: [ "-c", "download" ]
initContainers:
- name: llm-d-routing-proxy
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
pvcStorage: "10Gi"
asserts:
- hasDocuments:
count: 0


@@ -0,0 +1,32 @@
suite: test pvc
templates:
- pvc.yaml
tests:
# Test Case: PVC Created When extraInit Defined
- it: should create pvc when extraInit is defined
set:
extraInit:
modelDownload:
enabled: true
image:
repository: "amazon/aws-cli"
tag: "2.6.4"
pullPolicy: "IfNotPresent"
waitContainer:
command: ["/bin/bash"]
args: ["-c", "wait"]
downloadJob:
command: ["/bin/bash"]
args: ["-c", "download"]
pvcStorage: "10Gi"
asserts:
- hasDocuments:
count: 1
- isKind:
of: PersistentVolumeClaim
- equal:
path: spec.accessModes[0]
value: ReadWriteOnce
- equal:
path: spec.resources.requests.storage
value: 10Gi


@@ -0,0 +1,329 @@
{
"$schema": "http://json-schema.org/schema#",
"type": "object",
"properties": {
"image": {
"type": "object",
"properties": {
"repository": {
"type": "string"
},
"tag": {
"type": "string"
},
"command": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [
"command",
"repository",
"tag"
]
},
"containerPort": {
"type": "integer"
},
"serviceName": {
"type": "null"
},
"servicePort": {
"type": "integer"
},
"extraPorts": {
"type": "array"
},
"replicaCount": {
"type": "integer"
},
"deploymentStrategy": {
"type": "object"
},
"resources": {
"type": "object",
"properties": {
"requests": {
"type": "object",
"properties": {
"cpu": {
"type": "integer"
},
"memory": {
"type": "string"
},
"nvidia.com/gpu": {
"type": "integer"
}
},
"required": [
"cpu",
"memory",
"nvidia.com/gpu"
]
},
"limits": {
"type": "object",
"properties": {
"cpu": {
"type": "integer"
},
"memory": {
"type": "string"
},
"nvidia.com/gpu": {
"type": "integer"
}
},
"required": [
"cpu",
"memory",
"nvidia.com/gpu"
]
}
},
"required": [
"limits",
"requests"
]
},
"gpuModels": {
"type": "array",
"items": {
"type": "string"
}
},
"autoscaling": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"minReplicas": {
"type": "integer"
},
"maxReplicas": {
"type": "integer"
},
"targetCPUUtilizationPercentage": {
"type": "integer"
}
},
"required": [
"enabled",
"maxReplicas",
"minReplicas",
"targetCPUUtilizationPercentage"
]
},
"configs": {
"type": "object"
},
"secrets": {
"type": "object"
},
"externalConfigs": {
"type": "array"
},
"customObjects": {
"type": "array"
},
"maxUnavailablePodDisruptionBudget": {
"type": "string"
},
"extraInit": {
"type": "object",
"properties": {
"modelDownload": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"image": {
"type": "object",
"properties": {
"repository": {
"type": "string"
},
"tag": {
"type": "string"
},
"pullPolicy": {
"type": "string"
}
},
"required": ["repository", "tag", "pullPolicy"]
},
"waitContainer": {
"type": "object",
"properties": {
"command": {
"type": "array",
"items": {"type": "string"}
},
"args": {
"type": "array",
"items": {"type": "string"}
},
"env": {
"type": "array",
"items": {"type": "object"}
}
},
"required": ["command", "args"]
},
"downloadJob": {
"type": "object",
"properties": {
"command": {
"type": "array",
"items": {"type": "string"}
},
"args": {
"type": "array",
"items": {"type": "string"}
},
"env": {
"type": "array",
"items": {"type": "object"}
}
},
"required": ["command", "args"]
}
},
"required": ["enabled", "image", "waitContainer", "downloadJob"]
},
"initContainers": {
"type": "array",
"items": {"type": "object"}
},
"s3modelpath": {
"type": "string"
},
"pvcStorage": {
"type": "string"
},
"awsEc2MetadataDisabled": {
"type": "boolean"
}
},
"required": [
"modelDownload",
"initContainers",
"pvcStorage"
]
},
"extraContainers": {
"type": "array"
},
"readinessProbe": {
"type": "object",
"properties": {
"initialDelaySeconds": {
"type": "integer"
},
"periodSeconds": {
"type": "integer"
},
"failureThreshold": {
"type": "integer"
},
"httpGet": {
"type": "object",
"properties": {
"path": {
"type": "string"
},
"port": {
"type": "integer"
}
},
"required": [
"path",
"port"
]
}
},
"required": [
"failureThreshold",
"httpGet",
"initialDelaySeconds",
"periodSeconds"
]
},
"livenessProbe": {
"type": "object",
"properties": {
"initialDelaySeconds": {
"type": "integer"
},
"failureThreshold": {
"type": "integer"
},
"periodSeconds": {
"type": "integer"
},
"httpGet": {
"type": "object",
"properties": {
"path": {
"type": "string"
},
"port": {
"type": "integer"
}
},
"required": [
"path",
"port"
]
}
},
"required": [
"failureThreshold",
"httpGet",
"initialDelaySeconds",
"periodSeconds"
]
},
"labels": {
"type": "object",
"properties": {
"environment": {
"type": "string"
},
"release": {
"type": "string"
}
},
"required": [
"environment",
"release"
]
}
},
"required": [
"autoscaling",
"configs",
"containerPort",
"customObjects",
"deploymentStrategy",
"externalConfigs",
"extraContainers",
"extraInit",
"extraPorts",
"gpuModels",
"image",
"labels",
"livenessProbe",
"maxUnavailablePodDisruptionBudget",
"readinessProbe",
"replicaCount",
"resources",
"secrets",
"servicePort"
]
}


@@ -0,0 +1,174 @@
# -- Default values for chart vllm
# -- Declare variables to be passed into your templates.
# -- Image configuration
image:
# -- Image repository
repository: "vllm/vllm-openai"
# -- Image tag
tag: "latest"
# -- Container launch command
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
# -- Container port
containerPort: 8000
# -- Service name
serviceName:
# -- Service port
servicePort: 80
# -- Additional ports configuration
extraPorts: []
# -- Number of replicas
replicaCount: 1
# -- Deployment strategy configuration
deploymentStrategy: {}
# -- Resource configuration
resources:
requests:
# -- Number of CPUs
cpu: 4
# -- CPU memory configuration
memory: 16Gi
# -- Number of gpus used
nvidia.com/gpu: 1
limits:
# -- Number of CPUs
cpu: 4
# -- CPU memory configuration
memory: 16Gi
# -- Number of gpus used
nvidia.com/gpu: 1
# -- Type of gpu used
gpuModels:
- "TYPE_GPU_USED"
# -- Autoscaling configuration
autoscaling:
# -- Enable autoscaling
enabled: false
# -- Minimum replicas
minReplicas: 1
# -- Maximum replicas
maxReplicas: 100
# -- Target CPU utilization for autoscaling
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
# -- Configmap
configs: {}
# -- Secrets configuration
secrets: {}
# -- External configuration
externalConfigs: []
# -- Custom Objects configuration
customObjects: []
# -- Disruption Budget Configuration
maxUnavailablePodDisruptionBudget: ""
# -- Additional configuration for the init container
extraInit:
# -- Model download functionality (optional)
modelDownload:
# -- Enable model download job and wait container
enabled: true
# -- Image configuration for model download operations
image:
# -- Image repository
repository: "amazon/aws-cli"
# -- Image tag
tag: "2.6.4"
# -- Image pull policy
pullPolicy: "IfNotPresent"
# -- Wait container configuration (init container that waits for model to be ready)
waitContainer:
# -- Command to execute
command: ["/bin/bash"]
# -- Arguments for the wait container
args:
- "-eucx"
- "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
# -- Environment variables (optional, overrides S3 defaults entirely if specified)
# env:
# - name: HUGGING_FACE_HUB_TOKEN
# value: "your-token"
# - name: MODEL_ID
# value: "meta-llama/Llama-2-7b"
# -- Download job configuration (job that actually downloads the model)
downloadJob:
# -- Command to execute
command: ["/bin/bash"]
# -- Arguments for the download job
args:
- "-eucx"
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
# -- Environment variables (optional, overrides S3 defaults entirely if specified)
# env:
# - name: HUGGING_FACE_HUB_TOKEN
# value: "your-token"
# - name: MODEL_ID
# value: "meta-llama/Llama-2-7b"
# -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
initContainers: []
# Example for llm-d sidecar:
# initContainers:
# - name: llm-d-routing-proxy
# image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
# imagePullPolicy: IfNotPresent
# ports:
# - containerPort: 8080
# name: proxy
# securityContext:
# runAsUser: 1000
  # -- Path of the model in the S3 bucket that hosts the model weights and config files
s3modelpath: "relative_s3_model_path/opt-125m"
# -- Storage size for the PVC
pvcStorage: "1Gi"
# -- Disable AWS EC2 metadata service
awsEc2MetadataDisabled: true
# -- Additional containers configuration
extraContainers: []
# -- Readiness probe configuration
readinessProbe:
# -- Number of seconds after the container has started before readiness probe is initiated
initialDelaySeconds: 5
# -- How often (in seconds) to perform the readiness probe
periodSeconds: 5
  # -- Number of consecutive probe failures after which Kubernetes considers the overall check failed and marks the container as not ready
failureThreshold: 3
# -- Configuration of the Kubelet http request on the server
httpGet:
# -- Path to access on the HTTP server
path: /health
# -- Name or number of the port to access on the container, on which the server is listening
port: 8000
# -- Liveness probe configuration
livenessProbe:
# -- Number of seconds after the container has started before liveness probe is initiated
initialDelaySeconds: 15
  # -- Number of consecutive probe failures after which Kubernetes considers the overall check failed and treats the container as not alive
failureThreshold: 3
# -- How often (in seconds) to perform the liveness probe
periodSeconds: 10
# -- Configuration of the Kubelet http request on the server
httpGet:
# -- Path to access on the HTTP server
path: /health
# -- Name or number of the port to access on the container, on which the server is listening
port: 8000
labels:
environment: "test"
release: "test"


@@ -0,0 +1,87 @@
# Monitoring Dashboards
This directory contains monitoring dashboard configurations for vLLM, providing
comprehensive observability for your vLLM deployments.
## Dashboard Platforms
We provide dashboards for two popular observability platforms:
- **[Grafana](https://grafana.com)**
- **[Perses](https://perses.dev)**
## Dashboard Format Approach
All dashboards are provided in **native formats** that work across different
deployment methods:
### Grafana (JSON)
- ✅ Works with any Grafana instance (cloud, self-hosted, Docker)
- ✅ Direct import via Grafana UI or API
- ✅ Can be wrapped in Kubernetes operators when needed
- ✅ No vendor lock-in or deployment dependencies
### Perses (YAML)
- ✅ Works with standalone Perses instances
- ✅ Compatible with Perses API and CLI
- ✅ Supports Dashboard-as-Code workflows
- ✅ Can be wrapped in Kubernetes operators when needed
## Dashboard Contents
Both platforms provide equivalent monitoring capabilities:
| Dashboard | Description |
|-----------|-------------|
| **Performance Statistics** | Tracks latency, throughput, and performance metrics |
| **Query Statistics** | Monitors request volume, query performance, and KPIs |
## Quick Start
First, navigate to this example's directory:
```bash
cd examples/online_serving/dashboards
```
### Grafana
Import the JSON directly into the Grafana UI, or use the API:
```bash
curl -X POST http://grafana/api/dashboards/db \
-H "Content-Type: application/json" \
-d @grafana/performance_statistics.json
```
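A real Grafana instance will normally require authentication; assuming a service-account token is available in `$GRAFANA_TOKEN`, the same call looks like:

```bash
curl -X POST http://grafana/api/dashboards/db \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d @grafana/performance_statistics.json
```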
### Perses
Import via the Perses CLI:
```bash
percli apply -f perses/performance_statistics.yaml
```
## Requirements
- **Prometheus** metrics from your vLLM deployment
- **Data source** configured in your monitoring platform
- **vLLM metrics** enabled and accessible
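A quick way to confirm the last requirement is to hit the metrics endpoint of a running vLLM server (the default port 8000 is assumed here) and look for the `vllm:`-prefixed series these dashboards query:

```bash
curl -s http://localhost:8000/metrics | grep "^vllm:" | head
```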
## Platform-Specific Documentation
For detailed deployment instructions and platform-specific options, see:
- **[Grafana Documentation](./grafana)** - JSON dashboards, operator usage, manual import
- **[Perses Documentation](./perses)** - YAML specs, CLI usage, operator wrapping
## Contributing
When adding new dashboards, please:
1. Provide native formats (JSON for Grafana, YAML specs for Perses)
2. Update platform-specific README files
3. Ensure dashboards work across deployment methods
4. Test with the latest platform versions


@@ -0,0 +1,59 @@
# Grafana Dashboards for vLLM Monitoring
This directory contains Grafana dashboard configurations (as JSON) designed to monitor
vLLM performance and metrics.
## Requirements
- Grafana 8.0+
- Prometheus data source configured in Grafana
- vLLM deployment with Prometheus metrics enabled
## Dashboard Descriptions
- **performance_statistics.json**: Tracks performance metrics including latency and
throughput for your vLLM service.
- **query_statistics.json**: Tracks query performance, request volume, and key
performance indicators for your vLLM service.
## Deployment Options
### Manual Import (Recommended)
The easiest way to use these dashboards is to manually import the JSON configurations
directly into your Grafana instance:
1. Navigate to your Grafana instance
2. Click the '+' icon in the sidebar
3. Select 'Import'
4. Copy and paste the JSON content from the dashboard files, or upload the JSON files
directly
### Grafana Operator
If you're using the [Grafana Operator](https://github.com/grafana-operator/grafana-operator)
in Kubernetes, you can wrap these JSON configurations in a `GrafanaDashboard` custom
resource:
```yaml
# Note: Adjust the instanceSelector to match your Grafana instance's labels
# You can check with: kubectl get grafana -o yaml
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: vllm-performance-dashboard
spec:
instanceSelector:
matchLabels:
dashboards: grafana # Adjust to match your Grafana instance labels
folder: "vLLM Monitoring"
json: |
# Replace this comment with the complete JSON content from
# performance_statistics.json - The JSON should start with { and end with }
```
Then apply to your cluster:
```bash
kubectl apply -f your-dashboard.yaml -n <namespace>
```
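If the operator is running, the dashboard resource should be picked up shortly after; assuming a standard grafana-operator install, it can be listed with:

```bash
kubectl get grafanadashboards -n <namespace>
```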

File diff suppressed because it is too large.


@@ -0,0 +1,760 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "High-level overview of VLLM model deployment behavior and key performance indicators. Designed for Data Scientists and Product Managers to monitor request volume, token throughput, and latency",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 47,
"links": [],
"panels": [
{
"collapsed": true,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 20,
"panels": [],
"title": "Request Over Time",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "req/s"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 1 },
"id": 1,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"editorMode": "code",
"expr": "sum by (model_name) (\n rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval])\n)",
"interval": "1",
"legendFormat": "{{model_name}}",
"range": true,
"refId": "A"
}
],
"title": "Successful Requests Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "req/s"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Requests Avg Rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultaions": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "p50 Latency",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 4 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "p90 Latency",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 4 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "p99 Latency",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
"id": 19,
"panels": [],
"title": "Size Distribution",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"fillOpacity": 80,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineWidth": 1,
"stacking": { "group": "A", "mode": "none" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 8 },
"id": 6,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum by (le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "{{model_name}} le={{le}}",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size Distribution",
"type": "histogram"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "calculation ": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 8 },
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size p90",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 8 },
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size p50",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultaion": { "index": 0, "text": "mean" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 11 },
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))\n/\nsum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size Avg",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 11 },
"id": 10,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size p99",
"type": "stat"
},
{
"collapsed": true,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 18,
"panels": [],
"title": "Input Token Over Time",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 15 },
"id": 11,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "{{model_name}}",
"range": true,
"refId": "A"
}
],
"title": "Input Tokens Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 15 },
"id": 12,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Tokens/Sec Avg",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
"id": 17,
"panels": [],
"title": "Output Token Over Time",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 22 },
"id": 13,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "{{model_name}}",
"range": true,
"refId": "A"
}
],
"title": "Output Tokens Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 22 },
"id": 14,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Output Tokens/Sec Avg",
"type": "stat"
}
],
"preload": false,
"schemaVersion": 40,
"tags": [],
"templating": {
"list": [
{
"current": { "text": "Prometheus", "value": "4184fc20-68a7-483a-8d9b-7caa59c680dd" },
"label": "datasource",
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"refresh": 1,
"type": "datasource"
},
{
"current": { "text": ["All"], "value": ["$__all"] },
"definition": "label_values(vllm:request_success_total,model_name)",
"includeAll": true,
"label": "Deployment_ID",
"multi": true,
"name": "Deployment_id",
"options": [],
"query": {
"qryType": 1,
"query": "label_values(vllm:request_success_total,model_name)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": { "text": "All hours", "value": "All hours" },
"hide": 2,
"label": "Rush Hours Only",
"name": "rush_hours",
"options": [
{ "selected": true, "text": "false", "value": "All hours" },
{ "selected": false, "text": "true", "value": "Rush hours" }
],
"query": "false : All hours, true : Rush hours",
"type": "custom"
},
{
"current": { "text": "All", "value": "All" },
"hide": 2,
"label": "Rush Hours Type",
"name": "rush_hours_type",
"options": [
{ "selected": true, "text": "^All__.*$", "value": "All" },
{ "selected": false, "text": "^Static__.*$", "value": "Static" },
{ "selected": false, "text": "^Dynamic__.*$", "value": "Dynamic" }
],
"query": "^All__.*$ : All, ^Static__.*$ : Static, ^Dynamic__.*$ : Dynamic",
"type": "custom"
},
{
"current": { "text": "", "value": "" },
"hide": 2,
"name": "query0",
"options": [],
"query": "",
"refresh": 1,
"regex": "",
"type": "query"
}
]
},
"time": { "from": "now-12h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Query Statistics_New4",
"uid": "query-statistics4",
"version": 2,
"weekStart": ""
}


@@ -0,0 +1,48 @@
# Perses Dashboards for vLLM Monitoring
This directory contains Perses dashboard configurations designed to monitor vLLM
performance and metrics.
## Requirements
- Perses instance (standalone or via operator)
- Prometheus data source configured in Perses
- vLLM deployment with Prometheus metrics enabled
## Dashboard Format
We provide dashboards in the **native Perses YAML format** that works across all
deployment methods:
- **Files**: `*.yaml` (native Perses dashboard specifications)
- **Format**: Pure dashboard specifications that work everywhere
- **Usage**: Works with standalone Perses, API imports, CLI, and file provisioning
- **Kubernetes**: Directly compatible with Perses Operator
## Dashboard Descriptions
- **performance_statistics.yaml**: Performance metrics with aggregated latency
statistics
- **query_statistics.yaml**: Query performance and deployment metrics
## Deployment Options
### Direct Import to Perses
Import the dashboard specifications via Perses API or CLI:
```bash
percli apply -f performance_statistics.yaml
```
### Perses Operator (Kubernetes)
The native YAML format works directly with the Perses Operator:
```bash
kubectl apply -f performance_statistics.yaml -n <namespace>
```
### File Provisioning
Place the YAML files in a Perses provisioning folder for automatic loading.


@@ -0,0 +1,764 @@
kind: PersesDashboard
metadata:
name: performance-statistics
createdAt: 0001-01-01T00:00:00Z
updatedAt: 0001-01-01T00:00:00Z
version: 0
project: ""
spec:
display:
name: Performance Statistics
variables:
- kind: ListVariable
spec:
display:
name: Deployment_ID
hidden: false
name: Deployment_id
allowAllValue: true
allowMultiple: true
defaultValue:
- $__all
sort: alphabetical-asc
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
labelName: model_name
matchers:
# Any one vllm metric that always carries model_name
- vllm:generation_tokens_total{}
panels:
"1":
kind: Panel
spec:
display:
name: E2E Latency over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
# avg latency by model = sum(rate(sum)) / sum(rate(count))
query: >
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
"2":
kind: Panel
spec:
display:
name: E2E Latency (Avg)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"3":
kind: Panel
spec:
display:
name: E2E Latency (P50)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"4":
kind: Panel
spec:
display:
name: E2E Latency (P90)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"5":
kind: Panel
spec:
display:
name: E2E Latency (P99)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"6":
kind: Panel
spec:
display:
name: TTFT over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
"7":
kind: Panel
spec:
display:
name: TTFT (Avg)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"8":
kind: Panel
spec:
display:
name: TTFT (P50)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"9":
kind: Panel
spec:
display:
name: TTFT (P90)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"10":
kind: Panel
spec:
display:
name: TTFT (P99)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"11":
kind: Panel
spec:
display:
name: ITL (Time per Output Token) over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p50'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p90'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p99'
"12":
kind: Panel
spec:
display:
name: ITL (Avg)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"13":
kind: Panel
spec:
display:
name: ITL (P50)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"14":
kind: Panel
spec:
display:
name: ITL (P90)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"15":
kind: Panel
spec:
display:
name: ITL (P99)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"16":
kind: Panel
spec:
display:
name: TPS (Tokens/sec) over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}} generation'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}} prompt'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
# overall iteration tokens/sec if exposed
query: >
rate(vllm:iteration_tokens_total_count[$__interval])
seriesNameFormat: 'iteration overall'
"17":
kind: Panel
spec:
display:
name: KV Cache Usage (avg %)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
# Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
query: >
100 * avg(vllm:kv_cache_usage_perc)
"18":
kind: Panel
spec:
display:
name: Running Requests by Pod
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (pod) (vllm:num_requests_running)
seriesNameFormat: '{{pod}}'
"19":
kind: Panel
spec:
display:
name: Waiting Requests by Pod
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (pod) (vllm:num_requests_waiting)
seriesNameFormat: '{{pod}}'
"20":
kind: Panel
spec:
display:
name: Running Requests (sum)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: sum(vllm:num_requests_running)
"21":
kind: Panel
spec:
display:
name: Waiting Requests (sum)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: sum(vllm:num_requests_waiting)
layouts:
- kind: Grid
spec:
display:
title: Overview
items:
- x: 0
y: 0
width: 6
height: 3
content: { $ref: '#/spec/panels/17' } # KV cache %
- x: 6
y: 0
width: 6
height: 3
content: { $ref: '#/spec/panels/20' } # running sum
- x: 12
y: 0
width: 6
height: 3
content: { $ref: '#/spec/panels/21' } # waiting sum
- kind: Grid
spec:
display:
title: E2E Latency
items:
- x: 0
y: 1
width: 10
height: 6
content: { $ref: '#/spec/panels/1' }
- x: 10
y: 1
width: 7
height: 3
content: { $ref: '#/spec/panels/2' }
- x: 17
y: 1
width: 7
height: 3
content: { $ref: '#/spec/panels/3' }
- x: 10
y: 4
width: 7
height: 3
content: { $ref: '#/spec/panels/4' }
- x: 17
y: 4
width: 7
height: 3
content: { $ref: '#/spec/panels/5' }
- kind: Grid
spec:
display:
title: TTFT
items:
- x: 0
y: 8
width: 10
height: 6
content: { $ref: '#/spec/panels/6' }
- x: 10
y: 8
width: 7
height: 3
content: { $ref: '#/spec/panels/7' }
- x: 17
y: 8
width: 7
height: 3
content: { $ref: '#/spec/panels/8' }
- x: 10
y: 11
width: 7
height: 3
content: { $ref: '#/spec/panels/9' }
- x: 17
y: 11
width: 7
height: 3
content: { $ref: '#/spec/panels/10' }
- kind: Grid
spec:
display:
title: ITL (Time per Output Token)
items:
- x: 0
y: 15
width: 10
height: 6
content: { $ref: '#/spec/panels/11' }
- x: 10
y: 15
width: 7
height: 3
content: { $ref: '#/spec/panels/12' }
- x: 17
y: 15
width: 7
height: 3
content: { $ref: '#/spec/panels/13' }
- x: 10
y: 18
width: 7
height: 3
content: { $ref: '#/spec/panels/14' }
- x: 17
y: 18
width: 7
height: 3
content: { $ref: '#/spec/panels/15' }
- kind: Grid
spec:
display:
title: TPS (Prompt / Generation / Iteration)
items:
- x: 0
y: 22
width: 14
height: 6
content: { $ref: '#/spec/panels/16' }
- kind: Grid
spec:
display:
title: Per-Pod Request State
items:
- x: 0
y: 28
width: 12
height: 6
content: { $ref: '#/spec/panels/18' }
- x: 12
y: 28
width: 12
height: 6
content: { $ref: '#/spec/panels/19' }

View File

@@ -0,0 +1,392 @@
kind: PersesDashboard
metadata:
name: query-statistics
createdAt: 0001-01-01T00:00:00Z
updatedAt: 0001-01-01T00:00:00Z
version: 0
project: ""
spec:
display:
name: Query Statistics_New
variables:
- kind: ListVariable
spec:
name: NS
display: { name: Namespace }
allowMultiple: false
defaultValue: llm-d
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
labelName: namespace
matchers:
- up{service=~".*vllm.*"}
- kind: ListVariable
spec:
name: SVC
display: { name: Service }
allowMultiple: false
defaultValue: vllm-qwen2-0-5b-sim
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
labelName: service
matchers:
- up{namespace="$NS",service=~".*vllm.*"}
- kind: ListVariable
spec:
name: MODEL
display: { name: Model (real vLLM) }
allowAllValue: true
allowMultiple: true
defaultValue: ["$__all"]
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
labelName: model_name
matchers:
- vllm:request_success_total{namespace="$NS",service="$SVC"}
panels:
# --- Core (works on Simulator & Real) ---
core_running_now:
kind: Panel
spec:
display: { name: Running Requests (now) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum(vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_waiting_now:
kind: Panel
spec:
display: { name: Waiting Requests (now) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum(vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_kv_usage_now:
kind: Panel
spec:
        display: { name: KV Cache Usage (0-1) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_running_ts:
kind: Panel
spec:
display: { name: Running Over Time }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (service) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_waiting_ts:
kind: Panel
spec:
display: { name: Waiting Over Time }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (service) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_targets_up:
kind: Panel
spec:
display: { name: Scrape Targets Up }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: count(up{namespace="$NS",service="$SVC"} == 1) or vector(0)
minStep: "15s"
# --- KV Cache as Percent (works on Simulator & Real) ---
core_kv_usage_pct_now:
kind: Panel
spec:
display: { name: KV Cache Usage (%) now }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
# multiply by 100 to present percentage; omit format.unit to avoid schema conflicts
query: (avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
minStep: "15s"
core_kv_usage_pct_ts:
kind: Panel
spec:
display: { name: KV Cache Usage (%) over time }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: (avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
minStep: "15s"
# --- Per-Pod breakdowns (works on Simulator & Real) ---
per_pod_running_ts:
kind: Panel
spec:
display: { name: Running by Pod }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (pod) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
per_pod_waiting_ts:
kind: Panel
spec:
display: { name: Waiting by Pod }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (pod) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
per_pod_kv_pct_ts:
kind: Panel
spec:
display: { name: KV Cache (%) by Pod }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
# if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty
query: (avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
minStep: "15s"
# --- Real vLLM only (zeros on simulator) ---
real_req_rate_ts:
kind: Panel
spec:
display: { name: Request Rate (real vLLM) }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (model_name) (rate(vllm:request_success_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
minStep: "15s"
real_p50:
kind: Panel
spec:
display: { name: p50 Latency (real vLLM) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: histogram_quantile(0.50, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
minStep: "15s"
real_p90:
kind: Panel
spec:
display: { name: p90 Latency (real vLLM) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: histogram_quantile(0.90, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
minStep: "15s"
real_p99:
kind: Panel
spec:
display: { name: p99 Latency (real vLLM) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: histogram_quantile(0.99, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
minStep: "15s"
real_input_tokens_ts:
kind: Panel
spec:
display: { name: Input Tokens / sec (real vLLM) }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (model_name) (rate(vllm:prompt_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
minStep: "15s"
real_output_tokens_ts:
kind: Panel
spec:
display: { name: Output Tokens / sec (real vLLM) }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (model_name) (rate(vllm:generation_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
minStep: "15s"
layouts:
- kind: Grid
spec:
display: { title: Core (Sim & Real) }
items:
- { x: 0, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_running_now' } }
- { x: 6, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_waiting_now' } }
- { x: 12, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_now' } }
- { x: 18, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_targets_up' } }
- { x: 0, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_running_ts' } }
- { x: 12, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_waiting_ts' } }
- kind: Grid
spec:
display: { title: KV Cache (%) }
items:
- { x: 0, y: 9, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_pct_now' } }
- { x: 6, y: 9, width: 18, height: 6, content: { $ref: '#/spec/panels/core_kv_usage_pct_ts' } }
- kind: Grid
spec:
display: { title: Per-Pod breakdowns }
items:
- { x: 0, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_running_ts' } }
- { x: 12, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_waiting_ts' } }
- { x: 0, y: 21, width: 24, height: 6, content: { $ref: '#/spec/panels/per_pod_kv_pct_ts' } }
- kind: Grid
spec:
display: { title: Real vLLM only (shows 0 on simulator) }
items:
- { x: 0, y: 27, width: 12, height: 6, content: { $ref: '#/spec/panels/real_req_rate_ts' } }
- { x: 12, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p50' } }
- { x: 16, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p90' } }
- { x: 20, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p99' } }
- { x: 0, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_input_tokens_ts' } }
- { x: 12, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_output_tokens_ts' } }

View File

@@ -0,0 +1,119 @@
# Disaggregated Encoder
This directory contains example scripts that demonstrate the disaggregated encoder (EPD) features of vLLM.
For a detailed explanation of the EPD features, please refer to the [Disaggregated Encoder Feature Documentation](../../../docs/features/disagg_encoder.md).
## Files
- `disagg_epd_proxy.py` - Proxy script that demonstrates the XeYpZd setup (X encode instances, Y prefill instances, Z decode instances). Currently stable for the 1e1p1d configuration.
- `disagg_1e1p1d_example.sh` - Sets up the 1e1p1d configuration, runs the VisionArena benchmark, and processes a single request with a local image.
- `disagg_1e1pd_example.sh` - Sets up the 1e1pd configuration, runs the VisionArena benchmark, and processes a single request with a local image.
### Custom Configuration
```bash
# Use specific GPUs
GPU_E=0 GPU_PD=1 GPU_P=1 GPU_D=2 bash disagg_1e1p1d_example.sh
# Use specific ports
ENDPOINT_PORT=10001 bash disagg_1e1p1d_example.sh
# Use specific model
MODEL="Qwen/Qwen2.5-VL-3B-Instruct" bash disagg_1e1p1d_example.sh
# Use specific storage path
EC_SHARED_STORAGE_PATH="/tmp/my_ec_cache" bash disagg_1e1p1d_example.sh
```
## Encoder Instances
Encoder engines should be launched with the following flags (a minimal launch sketch follows this list):
- `--enforce-eager` **(required)** The current EPD implementation is only compatible with encoder instances running in this mode.
- `--no-enable-prefix-caching` **(required)** Encoder instances do not consume KV cache; prefix caching is disabled to avoid conflicts with other features.
- `--max-num-batched-tokens=<large value>` **(default: 2048)** This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager.
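Putting the flags together, a minimal encoder-only launch could look like the sketch below; the model, port, and paths are placeholders, and the complete commands are in the example scripts:
```bash
# Minimal sketch of an encoder-only instance using the flags above.
# Model, port, media path, and cache path are placeholders.
vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
    --port 19534 \
    --enforce-eager \
    --no-enable-prefix-caching \
    --max-num-batched-tokens 114688 \
    --allowed-local-media-path /path/to/media \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_producer",
        "ec_connector_extra_config": {"shared_storage_path": "/tmp/ec_cache"}
    }'
```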
## Local media inputs
To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance:
```bash
--allowed-local-media-path $MEDIA_PATH
```
The vLLM instances and `disagg_epd_proxy.py` support local URIs such as ```{"url": "file://'"$MEDIA_PATH_FILENAME"'"}``` as multimodal inputs. Each URI is passed unchanged from the proxy to the encoder instance so that the encoder can load the media locally.
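For example, a request sent through the proxy that references a local image might look like the following; the proxy port and image path are placeholders:
```bash
# Placeholder proxy port and image path -- adjust to your setup.
curl http://localhost:10001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-VL-3B-Instruct",
    "messages": [
      {"role": "user", "content": [
        {"type": "image_url", "image_url": {"url": "file:///path/to/media/example.jpg"}},
        {"type": "text", "text": "What is in this image?"}
      ]}
    ]
  }'
```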
## EC connector and KV transfer
The `ECExampleConnector` is used to store the encoder cache on local disk and facilitate its transfer. To enable the encoder disaggregation feature, add the following configuration:
```bash
# Add to encoder instance:
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}'
# Add to prefill/prefill+decode instance:
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}'
```
`$EC_SHARED_STORAGE_PATH` is the path where the EC connector temporarily stores the cache.
If you enable a dedicated prefill instance (i.e. `--prefill-servers-urls` is not disabled), you also need a `--kv-transfer-config` to enable PD disaggregation. Currently, we use the `NixlConnector` for this purpose. Refer to `tests/v1/kv_connector/nixl_integration` for more examples of PD disaggregation with Nixl.
```bash
# Add to prefill instance:
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_producer"
}'
# Add to decode instance:
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_consumer"
}'
```
## Proxy Instance Flags (`disagg_epd_proxy.py`)
| Flag | Description |
|------|-------------|
| `--encode-servers-urls` | Comma-separated list of encoder endpoints. Every multimodal item extracted from the request is fanned out to one of these URLs in a round-robin fashion. |
| `--prefill-servers-urls` | Comma-separated list of prefill endpoints. Set to `disable`, `none`, or `""` to skip the dedicated prefill phase and run E+PD (encoder + combined prefill/decode). |
| `--decode-servers-urls` | Comma-separated list of decode endpoints. Non-stream and stream paths both round-robin over this list. |
| `--host`, `--port` | Bind address for the proxy itself (defaults: `0.0.0.0:8000`). |
Example usage:
For E + PD setup:
```bash
$ python disagg_epd_proxy.py \
--encode-servers-urls "http://e1:8001,http://e2:8002" \
--prefill-servers-urls "disable" \
--decode-servers-urls "http://pd1:8003,http://pd2:8004"
```
For E + P + D setup:
```bash
$ python disagg_epd_proxy.py \
--encode-servers-urls "http://e1:8001,http://e2:8001" \
--prefill-servers-urls "http://p1:8003,http://p2:8004" \
--decode-servers-urls "http://d1:8005,http://d2:8006"
```
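Once the proxy is up, its `/health` endpoint aggregates the health of the encode, prefill, and decode clusters, which is a convenient way to verify the setup (the port is the proxy's, default 8000):
```bash
# The proxy reports per-cluster health; a disabled prefill cluster is
# reported as "empty". The port below is a placeholder for the proxy's port.
curl -s http://localhost:8000/health
```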

View File

@@ -0,0 +1,221 @@
#!/bin/bash
set -euo pipefail
declare -a PIDS=()
###############################################################################
# Configuration -- override via env before running
###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH
ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_PORT="${PREFILL_PORT:-19535}"
DECODE_PORT="${DECODE_PORT:-19536}"
PROXY_PORT="${PROXY_PORT:-10001}"
GPU_E="${GPU_E:-2}"
GPU_P="${GPU_P:-2}"
GPU_D="${GPU_D:-3}"
EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout
NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark
export UCX_TLS=all
export UCX_NET_DEVICES=all
###############################################################################
# Helpers
###############################################################################
# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)
START_TIME=$(date +"%Y%m%d_%H%M%S")
ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log
P_LOG=$LOG_PATH/p_${START_TIME}.log
D_LOG=$LOG_PATH/d_${START_TIME}.log
PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log
wait_for_server() {
local port=$1
timeout "$TIMEOUT_SECONDS" bash -c "
until curl -s localhost:$port/v1/chat/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
# Cleanup function
cleanup() {
echo "Stopping everything…"
trap - INT TERM USR1 # prevent re-entrancy
# Kill all tracked PIDs
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
echo "Killing process $pid"
kill "$pid" 2>/dev/null
fi
done
# Wait a moment for graceful shutdown
sleep 2
# Force kill any remaining processes
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
echo "Force killing process $pid"
kill -9 "$pid" 2>/dev/null
fi
done
# Kill the entire process group as backup
kill -- -$$ 2>/dev/null
echo "All processes stopped."
exit 0
}
trap cleanup INT
trap cleanup USR1
trap cleanup TERM
# clear previous cache
echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH
echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH
###############################################################################
# Encoder worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \
--port "$ENCODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
>"${ENC_LOG}" 2>&1 &
PIDS+=($!)
###############################################################################
# Prefill worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_P" \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--port "$PREFILL_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_producer"
}' \
>"${P_LOG}" 2>&1 &
PIDS+=($!)
###############################################################################
# Decode worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_D" \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--port "$DECODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_consumer"
}' \
>"${D_LOG}" 2>&1 &
PIDS+=($!)
# Wait for workers
wait_for_server $ENCODE_PORT
wait_for_server $PREFILL_PORT
wait_for_server $DECODE_PORT
###############################################################################
# Proxy
###############################################################################
python disagg_epd_proxy.py \
--host "0.0.0.0" \
--port "$PROXY_PORT" \
--encode-servers-urls "http://localhost:$ENCODE_PORT" \
--prefill-servers-urls "http://localhost:$PREFILL_PORT" \
--decode-servers-urls "http://localhost:$DECODE_PORT" \
>"${PROXY_LOG}" 2>&1 &
PIDS+=($!)
wait_for_server $PROXY_PORT
echo "All services are up!"
###############################################################################
# Benchmark
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--port $PROXY_PORT
PIDS+=($!)
###############################################################################
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
{"type": "text", "text": "What is in this image?"}
]}
]
}'
# cleanup
echo "cleanup..."
cleanup

View File

@@ -0,0 +1,186 @@
#!/bin/bash
set -euo pipefail
declare -a PIDS=()
###############################################################################
# Configuration -- override via env before running
###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH
ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
PROXY_PORT="${PROXY_PORT:-10001}"
GPU_E="${GPU_E:-0}"
GPU_PD="${GPU_PD:-1}"
EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout
NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark
###############################################################################
# Helpers
###############################################################################
# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)
START_TIME=$(date +"%Y%m%d_%H%M%S")
ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log
PD_LOG=$LOG_PATH/pd_${START_TIME}.log
PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log
wait_for_server() {
local port=$1
timeout "$TIMEOUT_SECONDS" bash -c "
until curl -s localhost:$port/v1/chat/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
# Cleanup function
cleanup() {
echo "Stopping everything…"
trap - INT TERM USR1 # prevent re-entrancy
# Kill all tracked PIDs
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
echo "Killing process $pid"
kill "$pid" 2>/dev/null
fi
done
# Wait a moment for graceful shutdown
sleep 2
# Force kill any remaining processes
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
echo "Force killing process $pid"
kill -9 "$pid" 2>/dev/null
fi
done
# Kill the entire process group as backup
kill -- -$$ 2>/dev/null
echo "All processes stopped."
exit 0
}
trap cleanup INT
trap cleanup USR1
trap cleanup TERM
# clear previous cache
echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH
echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH
###############################################################################
# Encoder worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \
--port "$ENCODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
>"${ENC_LOG}" 2>&1 &
PIDS+=($!)
###############################################################################
# Prefill+Decode worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--port "$PREFILL_DECODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
>"${PD_LOG}" 2>&1 &
PIDS+=($!)
# Wait for workers
wait_for_server $ENCODE_PORT
wait_for_server $PREFILL_DECODE_PORT
###############################################################################
# Proxy
###############################################################################
python disagg_epd_proxy.py \
--host "0.0.0.0" \
--port "$PROXY_PORT" \
--encode-servers-urls "http://localhost:$ENCODE_PORT" \
--prefill-servers-urls "disable" \
--decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
>"${PROXY_LOG}" 2>&1 &
PIDS+=($!)
wait_for_server $PROXY_PORT
echo "All services are up!"
###############################################################################
# Benchmark
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--port $PROXY_PORT
PIDS+=($!)
###############################################################################
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
{"type": "text", "text": "What is in this image?"}
]}
]
}'
# cleanup
echo "cleanup..."
cleanup

View File

@@ -0,0 +1,606 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
disagg_epd_proxy.py
Proxy that routes OpenAI-compatible “/v1/chat/completions” requests across up
to three clusters:
  • encode  (multimodal feature extraction)
  • prefill (optional; KV-cache producer for disaggregated prefill)
  • decode  (language-model inference)
For MM input we:
1. Extract *every* image/audio item.
2. Fire N concurrent requests to the encoder cluster
(one request per item, with **all text removed**).
3. Wait for all of them to succeed.
4. Forward the *original* request to a decode server.
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import os
import random
import uuid
from collections.abc import AsyncIterator
import aiohttp
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse
###############################################################################
# FastAPI app & global state
###############################################################################
logging.basicConfig(
level=logging.DEBUG, format="%(asctime)s %(levelname)s: %(message)s"
)
logger = logging.getLogger("proxy")
app = FastAPI()
encode_session: aiohttp.ClientSession | None = None
prefill_session: aiohttp.ClientSession | None = None
decode_session: aiohttp.ClientSession | None = None
###############################################################################
# Utils
###############################################################################
MM_TYPES = {"image_url", "audio_url", "input_audio"}
def extract_mm_items(request_data: dict) -> list[dict]:
"""
Return *all* image/audio items that appear anywhere in `messages`.
Each returned dict looks like:
{ "type": "image_url", "image_url": {...} }
"""
items: list[dict] = []
for msg in request_data.get("messages", []):
content = msg.get("content")
if not isinstance(content, list):
continue
for item in content:
if item.get("type") in MM_TYPES:
items.append(item)
return items
async def fanout_encoder_primer(
orig_request: dict,
e_urls: list[str],
req_id: str,
) -> None:
"""
1. Build one request *per MM item* with all text removed.
2. Send them concurrently to the encode cluster.
3. Raise if any of them fails.
"""
logger.info("[%s] Processing multimodal items...", req_id)
mm_items = extract_mm_items(orig_request)
if not mm_items:
logger.info("[%s] No multimodal items, skipping encoder", req_id)
return # nothing to do
logger.info("[%s] got %d multimodal items...", req_id, len(mm_items))
tasks = []
# Round-robin over encode servers to distribute load a bit
url_cycle = (e_urls[i % len(e_urls)] for i in range(len(mm_items)))
for idx, (item, target_url) in enumerate(zip(mm_items, url_cycle)):
# Derive a *child* request id: <parent>:<index>:<random-short>
child_req_id = f"{req_id}:{idx}:{uuid.uuid4().hex[:6]}"
headers = {"x-request-id": child_req_id}
encoder_req = {
# You *may* need to keep additional fields
"model": orig_request.get("model"),
"messages": [
{"role": "user", "content": [item]},
],
# Only need 1 token so the server actually runs the encoder path
"max_tokens": 1,
"stream": False,
}
tasks.append(
encode_session.post(
f"{target_url}/v1/chat/completions",
json=encoder_req,
headers=headers,
)
)
results = await asyncio.gather(*tasks, return_exceptions=True)
# Fail fast if any sub-request failed
for idx, r in enumerate(results):
if isinstance(r, Exception):
logger.error(
"[%s] Encoder request #%d raised exception: %s",
req_id,
idx,
r,
exc_info=r,
)
raise HTTPException(
status_code=502, detail=f"Encoder request failed: {str(r)}"
)
if r.status != 200:
try:
detail = await r.text()
except Exception:
detail = "<unable to read body>"
logger.error(
"[%s] Encoder request #%d returned status %s: %s",
req_id,
idx,
r.status,
detail,
)
raise HTTPException(
status_code=r.status,
detail=f"Encoder request failed: {detail}",
)
logger.info(
"[%s] All %d encoder requests completed successfully", req_id, len(mm_items)
)
async def maybe_prefill(
req_data: dict,
p_url: str,
req_id: str,
) -> dict:
"""
    - If p_url exists, run the prefill-only stage;
    - Return the request data augmented with kv_transfer_params (for the Nixl connector);
    - Otherwise, skip prefill and return the original request data for decode.
"""
if p_url:
logger.info("[%s] Processing through prefill: %s", req_id, p_url)
prefill_response = await process_prefill_stage(req_data, p_url, req_id)
# for nixl connector to facilitate kv transfer...
prefill_response_json = await prefill_response.json()
kv_transfer_params = prefill_response_json.get("kv_transfer_params", {})
if kv_transfer_params:
req_data["kv_transfer_params"] = kv_transfer_params
return req_data
else:
return req_data
async def process_prefill_stage(
req_data: dict,
p_url: str,
req_id: str,
) -> aiohttp.ClientResponse:
    """Run the prefill stage and return the prefill response (its body carries kv_transfer_params)."""
logger.info("[%s] Sending prefill request to: %s", req_id, p_url)
prefill_request = req_data.copy()
prefill_request["kv_transfer_params"] = {
"do_remote_decode": True,
"do_remote_prefill": False,
"remote_engine_id": None,
"remote_block_ids": None,
"remote_host": None,
"remote_port": None,
}
prefill_request["stream"] = False
prefill_request["max_tokens"] = 1
if "max_completion_tokens" in prefill_request:
prefill_request["max_completion_tokens"] = 1
if "stream_options" in prefill_request:
del prefill_request["stream_options"]
headers = {"x-request-id": req_id}
try:
prefill_response = await prefill_session.post(
f"{p_url}/v1/chat/completions", json=prefill_request, headers=headers
)
prefill_response.raise_for_status()
if prefill_response.status != 200:
error_text = await prefill_response.text()
logger.error(
"[%s] Prefill request failed with status %d: %s",
req_id,
prefill_response.status,
error_text,
)
raise HTTPException(
status_code=prefill_response.status,
detail={"error": "Prefill request failed", "message": error_text},
)
logger.info("[%s] Prefill request completed successfully", req_id)
return prefill_response
except Exception as e:
logger.error("Prefill processing failed: %s", str(e))
raise HTTPException(
status_code=500,
detail={"error": "Prefill processing error", "message": str(e)},
) from e
###############################################################################
# Middleware for request/response logging
###############################################################################
@app.middleware("http")
async def log_requests(request: Request, call_next):
"""Middleware to log all incoming requests and responses"""
req_id = request.headers.get("x-request-id", str(uuid.uuid4()))
# Log incoming request
logger.info(
">>> [%s] %s %s from %s",
req_id,
request.method,
request.url.path,
request.client.host if request.client else "unknown",
)
try:
# Process request
response = await call_next(request)
# Log response
logger.info(
"<<< [%s] %s %s completed with status %d",
req_id,
request.method,
request.url.path,
response.status_code,
)
return response
except Exception as e:
# Log errors
logger.exception(
"!!! [%s] %s %s failed with error: %s",
req_id,
request.method,
request.url.path,
str(e),
)
raise
###############################################################################
# FastAPI lifecycle
###############################################################################
@app.on_event("startup")
async def on_startup() -> None:
global encode_session, prefill_session, decode_session
timeout = aiohttp.ClientTimeout(total=100_000)
connector = aiohttp.TCPConnector(limit=0, force_close=False)
encode_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
if app.state.p_urls:
# only setup if prefill instance(s) exist
prefill_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
decode_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
@app.on_event("shutdown")
async def on_shutdown() -> None:
global encode_session, prefill_session, decode_session
if encode_session:
await encode_session.close()
if prefill_session:
await prefill_session.close()
if decode_session:
await decode_session.close()
###############################################################################
# Core forwarding
###############################################################################
async def forward_non_stream(
req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str
) -> dict:
try:
# Step 1: Process through Encoder instance (if has MM input)
await fanout_encoder_primer(req_data, e_urls, req_id)
# Step 2: Process through Prefill instance
req_data = await maybe_prefill(req_data, p_url, req_id)
# Step 3: Process through Decode instance
logger.info("[%s] Forwarding to decode: %s", req_id, d_url)
headers = {"x-request-id": req_id}
# Non-streaming response
async with decode_session.post(
f"{d_url}/v1/chat/completions", json=req_data, headers=headers
) as resp:
resp.raise_for_status()
return await resp.json()
except HTTPException:
raise
except Exception as e:
logger.exception("[%s] Error in forward_non_stream: %s", req_id, str(e))
raise HTTPException(status_code=500, detail=f"Proxy error: {str(e)}") from e
async def forward_stream(
req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str
) -> AsyncIterator[str]:
try:
# Step 1: Process through Encoder instance (if has MM input)
await fanout_encoder_primer(req_data, e_urls, req_id)
# Step 2: Process through Prefill instance
req_data = await maybe_prefill(req_data, p_url, req_id)
# Step 3: Process through Decode instance
logger.info("[%s] Starting streaming from decode: %s", req_id, d_url)
headers = {"x-request-id": req_id}
# Streaming response
async with decode_session.post(
f"{d_url}/v1/chat/completions",
json=req_data,
headers=headers,
) as resp:
resp.raise_for_status()
async for chunk in resp.content.iter_chunked(1024):
if chunk:
yield chunk.decode("utf-8", errors="ignore")
logger.info("[%s] Streaming completed", req_id)
except HTTPException:
logger.exception("[%s] HTTPException in forward_stream", req_id)
raise
except Exception as e:
logger.exception("[%s] Error in forward_stream: %s", req_id, str(e))
raise HTTPException(
status_code=500, detail=f"Proxy streaming error: {str(e)}"
) from e
###############################################################################
# Public routes
###############################################################################
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
try:
req_data = await request.json()
req_id = request.headers.get("x-request-id", str(uuid.uuid4()))
e_urls = app.state.e_urls # we want the full list for fan-out
p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
d_url = random.choice(app.state.d_urls)
is_streaming = req_data.get("stream", False)
if is_streaming:
return StreamingResponse(
forward_stream(req_data, req_id, e_urls, p_url, d_url),
media_type="text/event-stream",
)
result = await forward_non_stream(req_data, req_id, e_urls, p_url, d_url)
return JSONResponse(content=result)
except HTTPException:
raise
except Exception as e:
logger.exception("Error in chat_completions endpoint: %s", str(e))
raise HTTPException(
status_code=500, detail=f"Request processing error: {str(e)}"
) from e
@app.get("/v1/models")
async def list_models():
async with decode_session.get(f"{app.state.d_urls[0]}/v1/models") as resp:
resp.raise_for_status()
return await resp.json()
@app.get("/health")
async def health_check():
async def healthy(urls):
if not urls:
return "empty"
for u in urls:
try:
async with encode_session.get(f"{u}/health") as resp:
resp.raise_for_status()
except Exception:
return "unhealthy"
return "healthy"
e_status, p_status, d_status = await asyncio.gather(
healthy(app.state.e_urls), healthy(app.state.p_urls), healthy(app.state.d_urls)
)
overall_healthy = all(
status != "unhealthy" for status in (e_status, p_status, d_status)
)
status_code = 200 if overall_healthy else 503
return JSONResponse(
{
"proxy": "healthy",
"encode_cluster": e_status,
"prefill_cluster": p_status,
"decode_cluster": d_status,
},
status_code=status_code,
)
###############################################################################
# Simple profiler fan-out (unchanged except for sessions)
###############################################################################
async def _post_if_available(
session: aiohttp.ClientSession,
url: str,
payload: dict,
headers: dict,
) -> dict | None:
"""
POST `payload` to `url`.
Returns
-------
• The decoded JSON body on success (2xx)
• None if the endpoint does not exist (404)
• Raises for anything else.
"""
try:
resp = await session.post(url, json=payload, headers=headers)
if resp.status == 404: # profiling disabled on that server
logger.warning("Profiling endpoint missing on %s", url)
return None
resp.raise_for_status()
return await resp.json(content_type=None)
except aiohttp.ClientResponseError as exc:
# Pass 404 through the branch above, re-raise everything else
if exc.status == 404:
logger.warning("Profiling endpoint missing on %s", url)
return None
raise
except Exception:
# Network errors etc.: propagate
raise
async def _profile_cmd(cmd: str, payload: dict, e_url: str, p_url: str, d_url: str):
"""
Fire & forget to both clusters, tolerate 404.
"""
headers = {"Authorization": f"Bearer {os.getenv('OPENAI_API_KEY', '')}"}
encode_task = _post_if_available(
encode_session, f"{e_url}/{cmd}_profile", payload, headers
)
prefill_task = (
_post_if_available(prefill_session, f"{p_url}/{cmd}_profile", payload, headers)
if p_url is not None
else asyncio.sleep(0)
)
decode_task = _post_if_available(
decode_session, f"{d_url}/{cmd}_profile", payload, headers
)
encode_res, prefill_res, decode_res = await asyncio.gather(
encode_task, prefill_task, decode_task
)
    # If *all* clusters said “I don't have that route”, surface an error
if encode_res is prefill_res is decode_res is None:
raise HTTPException(
status_code=503,
detail="Profiling endpoints are disabled on all clusters",
)
return {
"encode": encode_res, # may be None
"prefill": prefill_res, # may be None
"decode": decode_res, # may be None
}
@app.post("/start_profile")
async def start_profile(request: Request):
body = await request.json()
# TODO: handle multi urls properly
e_url = random.choice(app.state.e_urls)
p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
d_url = random.choice(app.state.d_urls)
return await _profile_cmd("start", body, e_url, p_url, d_url)
@app.post("/stop_profile")
async def stop_profile(request: Request):
body = await request.json()
# TODO: handle multi urls properly
e_url = random.choice(app.state.e_urls)
p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
d_url = random.choice(app.state.d_urls)
return await _profile_cmd("stop", body, e_url, p_url, d_url)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument(
"--encode-servers-urls",
required=True,
help='Comma-separated encode URLs ("http://e1:8001,http://e2:8001")',
)
parser.add_argument(
"--prefill-servers-urls",
required=True,
help=(
            'Comma-separated prefill URLs ("http://p1:8003,http://p2:8004") '
            'to enable E->P->D; set "disable" or "none" to enable E->PD'
),
)
parser.add_argument(
"--decode-servers-urls",
required=True,
help='Comma-separated decode URLs ("http://d1:8005,http://d2:8006")',
)
args = parser.parse_args()
app.state.e_urls = [
u.strip() for u in args.encode_servers_urls.split(",") if u.strip()
]
app.state.d_urls = [
u.strip() for u in args.decode_servers_urls.split(",") if u.strip()
]
# handle prefill instances
if args.prefill_servers_urls.lower() in ("disable", "none", ""):
app.state.p_urls = []
logger.info(
"Disaggregated prefill phase explicitly disabled by user. Running E + PD..."
)
else:
app.state.p_urls = [
u.strip() for u in args.prefill_servers_urls.split(",") if u.strip()
]
logger.info("Disaggregated prefill phase is enabled. Running E + P + D...")
logger.info("Proxy listening on %s:%s", args.host, args.port)
logger.info("Encode servers: %s", app.state.e_urls)
logger.info("Prefill instances %s", app.state.p_urls)
logger.info("Decode servers: %s", app.state.d_urls)
uvicorn.run(
app,
host=args.host,
port=args.port,
log_level="info",
loop="uvloop",
access_log=True,
)

View File

@@ -0,0 +1,125 @@
#!/bin/bash
# This file demonstrates the example usage of disaggregated prefilling
# We will launch 2 vllm instances (1 for prefill and 1 for decode),
# and then transfer the KV cache between them.
set -xe
echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1
# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT
# Cleanup function
cleanup() {
echo "Caught Ctrl+C, cleaning up..."
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo "Cleanup complete. Exiting."
exit 0
}
if [[ -z "${VLLM_HOST_IP:-}" ]]; then
export VLLM_HOST_IP=127.0.0.1
echo "Using default VLLM_HOST_IP=127.0.0.1 (override by exporting VLLM_HOST_IP before running this script)"
else
echo "Using provided VLLM_HOST_IP=${VLLM_HOST_IP}"
fi
# install quart first -- required for disagg prefill proxy serve
if python3 -c "import quart" &> /dev/null; then
echo "Quart is already installed."
else
echo "Quart is not installed. Installing..."
python3 -m pip install quart
fi
# a function that waits for the vLLM server to start
wait_for_server() {
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/models > /dev/null; do
sleep 1
done" && return 0 || return 1
}
# You can also adjust --kv-ip and --kv-port for distributed inference.
# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
--host 0.0.0.0 \
--port 8100 \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' &
# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
--host 0.0.0.0 \
--port 8200 \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":"1e10","kv_port":"14580","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8200","send_type":"PUT_ASYNC"}}' &
# wait until prefill and decode instances are ready
wait_for_server 8100
wait_for_server 8200
# launch a proxy server that opens the service at port 8000
# the workflow of this proxy:
# - send the request to prefill vLLM instance (port 8100), change max_tokens
# to 1
# - after the prefill vLLM finishes prefill, send the request to decode vLLM
# instance
# NOTE: the usage of this API is subject to change --- in the future we will
# introduce "vllm connect" to connect between prefill and decode instances
python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py &
sleep 1
# serve two example requests
output1=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
}')
output2=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Santa Clara is a",
"max_tokens": 10,
"temperature": 0
}')
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo ""
sleep 1
# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"
echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""

View File

@@ -0,0 +1,8 @@
# Disaggregated Serving
This example contains scripts that demonstrate the disaggregated serving features of vLLM.
## Files
- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances); a sample launch command is shown below.
- `kv_events.sh` - Demonstrates KV cache event publishing.
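With prefill and decode vLLM instances already running, `disagg_proxy_demo.py` can be launched roughly as follows; the model name and host:port pairs are illustrative and should match your own deployment:

```bash
python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --prefill localhost:8100 localhost:8101 \
    --decode localhost:8200 localhost:8201 \
    --port 8000
```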

View File

@@ -0,0 +1,452 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling.
We can launch multiple vllm instances (2 for prefill and 2 for decode), and
launch this proxy demo through:
python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \
--model $model_name \
--prefill localhost:8100 localhost:8101 \
--decode localhost:8200 localhost:8201 \
--port 8000
Note: This demo will be removed once the PDController implemented in PR 15343
(https://github.com/vllm-project/vllm/pull/15343) supports XpYd.
"""
import argparse
import ipaddress
import itertools
import json
import logging
import os
import sys
from abc import ABC, abstractmethod
from collections.abc import Callable
import aiohttp
import requests
import uvicorn
from fastapi import APIRouter, Depends, FastAPI, Header, HTTPException, Request, status
from fastapi.responses import JSONResponse, StreamingResponse
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
class SchedulingPolicy(ABC):
@abstractmethod
def schedule(self, cycler: itertools.cycle):
raise NotImplementedError("Scheduling policy is not implemented.")
class Proxy:
def __init__(
self,
prefill_instances: list[str],
decode_instances: list[str],
model: str,
scheduling_policy: SchedulingPolicy,
custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
custom_create_chat_completion: Callable[[Request], StreamingResponse]
| None = None,
):
self.prefill_instances = prefill_instances
self.decode_instances = decode_instances
self.prefill_cycler = itertools.cycle(prefill_instances)
self.decode_cycler = itertools.cycle(decode_instances)
self.model = model
self.scheduling_policy = scheduling_policy
self.custom_create_completion = custom_create_completion
self.custom_create_chat_completion = custom_create_chat_completion
self.router = APIRouter()
self.setup_routes()
def setup_routes(self):
self.router.post(
"/v1/completions", dependencies=[Depends(self.validate_json_request)]
)(
self.custom_create_completion
if self.custom_create_completion
else self.create_completion
)
self.router.post(
"/v1/chat/completions", dependencies=[Depends(self.validate_json_request)]
)(
self.custom_create_chat_completion
if self.custom_create_chat_completion
else self.create_chat_completion
)
self.router.get("/status", response_class=JSONResponse)(self.get_status)
self.router.post(
"/instances/add", dependencies=[Depends(self.api_key_authenticate)]
)(self.add_instance_endpoint)
async def validate_json_request(self, raw_request: Request):
content_type = raw_request.headers.get("content-type", "").lower()
if content_type != "application/json":
raise HTTPException(
status_code=415,
detail="Unsupported Media Type: Only 'application/json' is allowed",
)
def api_key_authenticate(self, x_api_key: str = Header(...)):
expected_api_key = os.environ.get("ADMIN_API_KEY")
if not expected_api_key:
logger.error("ADMIN_API_KEY is not set in the environment.")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Server configuration error.",
)
if x_api_key != expected_api_key:
logger.warning("Unauthorized access attempt with API Key: %s", x_api_key)
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Forbidden: Invalid API Key.",
)
async def validate_instance(self, instance: str) -> bool:
url = f"http://{instance}/v1/models"
try:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as client:
logger.info("Verifying %s ...", instance)
async with client.get(url) as response:
if response.status == 200:
data = await response.json()
if "data" in data and len(data["data"]) > 0:
model_cur = data["data"][0].get("id", "")
if model_cur == self.model:
logger.info("Instance: %s could be added.", instance)
return True
else:
logger.warning(
"Mismatch model %s : %s != %s",
instance,
model_cur,
self.model,
)
return False
else:
return False
else:
return False
except aiohttp.ClientError as e:
logger.error(str(e))
return False
except Exception as e:
logger.error(str(e))
return False
async def add_instance_endpoint(self, request: Request):
try:
data = await request.json()
logger.warning(str(data))
instance_type = data.get("type")
instance = data.get("instance")
if instance_type not in ["prefill", "decode"]:
raise HTTPException(status_code=400, detail="Invalid instance type.")
if not instance or ":" not in instance:
raise HTTPException(status_code=400, detail="Invalid instance format.")
host, port_str = instance.split(":")
try:
if host != "localhost":
ipaddress.ip_address(host)
port = int(port_str)
if not (0 < port < 65536):
raise HTTPException(status_code=400, detail="Invalid port number.")
except Exception as e:
raise HTTPException(
status_code=400, detail="Invalid instance address."
) from e
is_valid = await self.validate_instance(instance)
if not is_valid:
raise HTTPException(
status_code=400, detail="Instance validation failed."
)
if instance_type == "prefill":
if instance not in self.prefill_instances:
self.prefill_instances.append(instance)
self.prefill_cycler = itertools.cycle(self.prefill_instances)
else:
raise HTTPException(
status_code=400, detail="Instance already exists."
)
else:
if instance not in self.decode_instances:
self.decode_instances.append(instance)
self.decode_cycler = itertools.cycle(self.decode_instances)
else:
raise HTTPException(
status_code=400, detail="Instance already exists."
)
return JSONResponse(
content={"message": f"Added {instance} to {instance_type}_instances."}
)
except HTTPException as http_exc:
raise http_exc
except Exception as e:
logger.error("Error in add_instance_endpoint: %s", str(e))
raise HTTPException(status_code=500, detail=str(e)) from e
async def forward_request(self, url, data, use_chunked=True):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
try:
async with session.post(
url=url, json=data, headers=headers
) as response:
if 200 <= response.status < 300 or 400 <= response.status < 500:
if use_chunked:
async for chunk_bytes in response.content.iter_chunked(
1024
):
yield chunk_bytes
else:
content = await response.read()
yield content
else:
error_content = await response.text()
try:
error_content = json.loads(error_content)
except json.JSONDecodeError:
error_content = error_content
logger.error(
"Request failed with status %s: %s",
response.status,
error_content,
)
raise HTTPException(
status_code=response.status,
detail=f"Request failed with status {response.status}: "
f"{error_content}",
)
except aiohttp.ClientError as e:
logger.error("ClientError occurred: %s", str(e))
raise HTTPException(
status_code=502,
detail="Bad Gateway: Error communicating with upstream server.",
) from e
except Exception as e:
logger.error("Unexpected error: %s", str(e))
raise HTTPException(status_code=500, detail=str(e)) from e
def schedule(self, cycler: itertools.cycle) -> str:
return self.scheduling_policy.schedule(cycler)
async def get_status(self):
status = {
"prefill_node_count": len(self.prefill_instances),
"decode_node_count": len(self.decode_instances),
"prefill_nodes": self.prefill_instances,
"decode_nodes": self.decode_instances,
}
return status
async def create_completion(self, raw_request: Request):
try:
request = await raw_request.json()
kv_prepare_request = request.copy()
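# Limit the prefill request to a single generated token so the prefill
# instance only performs prefill and produces the KV cache.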
kv_prepare_request["max_tokens"] = 1
prefill_instance = self.schedule(self.prefill_cycler)
try:
async for _ in self.forward_request(
f"http://{prefill_instance}/v1/completions", kv_prepare_request
):
continue
except HTTPException as http_exc:
self.remove_instance_endpoint("prefill", prefill_instance)
raise http_exc
# Perform kv recv and decoding stage
decode_instance = self.schedule(self.decode_cycler)
try:
generator = self.forward_request(
f"http://{decode_instance}/v1/completions", request
)
except HTTPException as http_exc:
self.remove_instance_endpoint("decode", decode_instance)
raise http_exc
response = StreamingResponse(generator)
return response
except Exception:
exc_info = sys.exc_info()
print("Error occurred in disagg proxy server")
print(exc_info)
async def create_chat_completion(self, raw_request: Request):
try:
request = await raw_request.json()
# add params to request
kv_prepare_request = request.copy()
kv_prepare_request["max_tokens"] = 1
if "max_completion_tokens" in kv_prepare_request:
kv_prepare_request["max_completion_tokens"] = 1
# prefill stage
prefill_instance = self.schedule(self.prefill_cycler)
try:
async for _ in self.forward_request(
f"http://{prefill_instance}/v1/chat/completions", kv_prepare_request
):
continue
except HTTPException as http_exc:
self.remove_instance_endpoint("prefill", prefill_instance)
raise http_exc
# Perform kv recv and decoding stage
decode_instance = self.schedule(self.decode_cycler)
try:
generator = self.forward_request(
"http://" + decode_instance + "/v1/chat/completions", request
)
except HTTPException as http_exc:
self.remove_instance_endpoint("decode", decode_instance)
raise http_exc
response = StreamingResponse(content=generator)
return response
except Exception:
exc_info = sys.exc_info()
error_messages = [str(e) for e in exc_info if e]
print("Error occurred in disagg proxy server")
print(error_messages)
return StreamingResponse(
content=iter(error_messages), media_type="text/event-stream"
)
def remove_instance_endpoint(self, instance_type, instance):
if instance_type == "decode" and instance in self.decode_instances:
self.decode_instances.remove(instance)
self.decode_cycler = itertools.cycle(self.decode_instances)
if instance_type == "prefill" and instance in self.decode_instances:
self.prefill_instances.remove(instance)
self.prefill_cycler = itertools.cycle(self.decode_instances)
class RoundRobinSchedulingPolicy(SchedulingPolicy):
def __init__(self):
super().__init__()
def schedule(self, cycler: itertools.cycle) -> str:
return next(cycler)
class ProxyServer:
def __init__(
self,
args: argparse.Namespace,
scheduling_policy: SchedulingPolicy | None = None,
create_completion: Callable[[Request], StreamingResponse] | None = None,
create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
):
self.validate_parsed_serve_args(args)
self.port = args.port
self.proxy_instance = Proxy(
prefill_instances=[] if args.prefill is None else args.prefill,
decode_instances=[] if args.decode is None else args.decode,
model=args.model,
scheduling_policy=(
scheduling_policy
if scheduling_policy is not None
else RoundRobinSchedulingPolicy()
),
custom_create_completion=create_completion,
custom_create_chat_completion=create_chat_completion,
)
def validate_parsed_serve_args(self, args: argparse.Namespace):
if not args.prefill:
raise ValueError("Please specify at least one prefill node.")
if not args.decode:
raise ValueError("Please specify at least one decode node.")
self.validate_instances(args.prefill)
self.validate_instances(args.decode)
self.verify_model_config(args.prefill, args.model)
self.verify_model_config(args.decode, args.model)
def validate_instances(self, instances: list):
for instance in instances:
if len(instance.split(":")) != 2:
raise ValueError(f"Invalid instance format: {instance}")
host, port = instance.split(":")
try:
if host != "localhost":
ipaddress.ip_address(host)
port = int(port)
if not (0 < port < 65536):
raise ValueError(f"Invalid port number in instance: {instance}")
except Exception as e:
raise ValueError(f"Invalid instance {instance}: {str(e)}") from e
def verify_model_config(self, instances: list, model: str) -> None:
model_suffix = model.split("/")[-1]
for instance in instances:
try:
response = requests.get(f"http://{instance}/v1/models")
if response.status_code == 200:
model_cur = response.json()["data"][0]["id"]
model_cur_suffix = model_cur.split("/")[-1]
if model_cur_suffix != model_suffix:
raise ValueError(
f"{instance} serves a different model: "
f"{model_cur} != {model}"
)
else:
raise ValueError(f"Cannot get model id from {instance}!")
except requests.RequestException as e:
raise ValueError(
f"Error communicating with {instance}: {str(e)}"
) from e
def run_server(self):
app = FastAPI()
app.include_router(self.proxy_instance.router)
config = uvicorn.Config(app, port=self.port, loop="uvloop")
server = uvicorn.Server(config)
server.run()
def parse_args():
# Todo: allow more config
parser = argparse.ArgumentParser("vLLM disaggregated proxy server.")
parser.add_argument("--model", "-m", type=str, required=True, help="Model name")
parser.add_argument(
"--prefill",
"-p",
type=str,
nargs="+",
help="List of prefill node URLs (host:port)",
)
parser.add_argument(
"--decode",
"-d",
type=str,
nargs="+",
help="List of decode node URLs (host:port)",
)
parser.add_argument(
"--port",
type=int,
default=8000,
help="Server port number",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
proxy_server = ProxyServer(args=args)
proxy_server.run_server()

View File

@@ -0,0 +1,86 @@
#!/bin/bash
# This file demonstrates the KV cache event publishing
# We will launch a vLLM instance configured to publish KV cache
# events and launch a simple subscriber to log those events.
set -xe
echo "🚧🚧 Warning: The usage of KV cache events is experimental and subject to change 🚧🚧"
sleep 1
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT
# Cleanup function
cleanup() {
echo "Caught Ctrl+C, cleaning up..."
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo "Cleanup complete. Exiting."
exit 0
}
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# a function that waits for the vLLM server to start
wait_for_server() {
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
vllm serve $MODEL_NAME \
--port 8100 \
--max-model-len 100 \
--enforce-eager \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-events-config \
'{"enable_kv_cache_events": true, "publisher": "zmq", "topic": "kv-events"}' &
wait_for_server 8100
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
python3 "$SCRIPT_DIR/kv_events_subscriber.py" &
sleep 1
# serve two example requests
output1=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 5-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')
output2=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 50-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')
# Cleanup commands
pkill -9 -u "$USER" -f python
pkill -9 -u "$USER" -f vllm
sleep 1
echo "Cleaned up"
# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"
echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""

View File

@@ -0,0 +1,244 @@
#!/bin/bash
# =============================================================================
# vLLM Disaggregated Serving Script - P2P NCCL XpYd Architecture
# =============================================================================
# This script demonstrates disaggregated prefill and decode serving using
# P2P NCCL communication. The architecture supports various XpYd configurations:
#
# - 1P3D: 1 Prefill server + 3 Decode servers (current default)
# - 3P1D: 3 Prefill servers + 1 Decode server
# - etc.
#
# Configuration can be customized via environment variables:
# MODEL: Model to serve
# PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
# DECODE_GPUS: Comma-separated GPU IDs for decode servers
# PREFILL_PORTS: Comma-separated ports for prefill servers
# DECODE_PORTS: Comma-separated ports for decode servers
#   PROXY_PORT: Proxy server port used to set up the XpYd connection.
# TIMEOUT_SECONDS: Server startup timeout
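#
# Example (hypothetical 3P1D override: three prefill servers, one decode server):
#   export PREFILL_GPUS=0,1,2 PREFILL_PORTS=20003,20005,20007
#   export DECODE_GPUS=3 DECODE_PORTS=20009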
# =============================================================================
# Configuration - can be overridden via environment variables
MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
PROXY_PORT=${PROXY_PORT:-30001}
# Default 1P3D configuration (1 Prefill + 3 Decode)
PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo ""
echo "Architecture Configuration:"
echo " Model: $MODEL"
echo " Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS"
echo " Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
echo " Proxy Port: $PROXY_PORT"
echo " Timeout: ${TIMEOUT_SECONDS}s"
echo ""
PIDS=()
# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"
check_required_files() {
local files=("disagg_proxy_p2p_nccl_xpyd.py")
for file in "${files[@]}"; do
if [[ ! -f "$file" ]]; then
echo "Required file $file not found in $(pwd)"
exit 1
fi
done
}
check_hf_token() {
if [ -z "$HF_TOKEN" ]; then
echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
echo "Example: export HF_TOKEN=your_token_here"
exit 1
fi
if [[ "$HF_TOKEN" != hf_* ]]; then
echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
exit 1
fi
echo "HF_TOKEN is set and valid."
}
check_num_gpus() {
# Check that the number of GPUs is >= 2 via nvidia-smi
num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
if [ "$num_gpus" -lt 2 ]; then
echo "You need at least 2 GPUs to run disaggregated prefill."
exit 1
else
echo "Found $num_gpus GPUs."
fi
}
ensure_python_library_installed() {
echo "Checking if $1 is installed..."
if ! python3 -c "import $1" > /dev/null 2>&1; then
echo "$1 is not installed. Please install it via pip install $1."
exit 1
else
echo "$1 is installed."
fi
}
cleanup() {
echo "Stopping everything…"
trap - INT TERM # prevent re-entrancy
pkill -9 -f "disagg_proxy_p2p_nccl_xpyd.py"
kill -- -$$ # negative PID == "this whole process-group"
wait # reap children so we don't leave zombies
exit 0
}
wait_for_server() {
local port=$1
local timeout_seconds=$TIMEOUT_SECONDS
local start_time=$(date +%s)
echo "Waiting for server on port $port..."
while true; do
if curl -s "localhost:${port}/v1/completions" > /dev/null; then
echo "Server on port $port is ready."
return 0
fi
local now=$(date +%s)
if (( now - start_time >= timeout_seconds )); then
echo "Timeout waiting for server on port $port"
return 1
fi
sleep 1
done
}
main() {
check_required_files
check_hf_token
check_num_gpus
ensure_python_library_installed pandas
ensure_python_library_installed datasets
ensure_python_library_installed vllm
ensure_python_library_installed quart
trap cleanup INT
trap cleanup USR1
trap cleanup TERM
echo "Launching disaggregated serving components..."
echo "Please check the log files for detailed output:"
echo " - prefill*.log: Prefill server logs"
echo " - decode*.log: Decode server logs"
echo " - proxy.log: Proxy server log"
# =============================================================================
# Launch Proxy Server
# =============================================================================
echo ""
echo "Starting proxy server on port $PROXY_PORT..."
python3 disagg_proxy_p2p_nccl_xpyd.py &
PIDS+=($!)
# Parse GPU and port arrays
IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS"
IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS"
IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS"
IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"
# =============================================================================
# Launch Prefill Servers (X Producers)
# =============================================================================
echo ""
echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."
for i in "${!PREFILL_GPU_ARRAY[@]}"; do
local gpu_id=${PREFILL_GPU_ARRAY[$i]}
local port=${PREFILL_PORT_ARRAY[$i]}
local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--kv-transfer-config \
"{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 &
PIDS+=($!)
done
# =============================================================================
# Launch Decode Servers (Y Decoders)
# =============================================================================
echo ""
echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."
for i in "${!DECODE_GPU_ARRAY[@]}"; do
local gpu_id=${DECODE_GPU_ARRAY[$i]}
local port=${DECODE_PORT_ARRAY[$i]}
local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--kv-transfer-config \
"{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 &
PIDS+=($!)
done
# =============================================================================
# Wait for All Servers to Start
# =============================================================================
echo ""
echo "Waiting for all servers to start..."
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
if ! wait_for_server $port; then
echo "Failed to start server on port $port"
cleanup
exit 1
fi
done
echo ""
echo "All servers are up. Starting benchmark..."
# =============================================================================
# Run Benchmark
# =============================================================================
cd ../../../benchmarks/
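# Port 10001 is the proxy's HTTP endpoint (see disagg_proxy_p2p_nccl_xpyd.py).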
vllm bench serve --port 10001 --seed $(date +%s) \
--model $MODEL \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
echo "Benchmarking done. Cleaning up..."
cleanup
}
main

View File

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import socket
import threading
import time
import uuid
from typing import Any
import aiohttp
import msgpack
import zmq
from quart import Quart, make_response, request
count = 0
prefill_instances: dict[str, Any] = {} # http_address: (zmq_address, stamp)
decode_instances: dict[str, Any] = {} # http_address: (zmq_address, stamp)
prefill_cv = threading.Condition()
decode_cv = threading.Condition()
DEFAULT_PING_SECONDS = 5
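# Evict registrations whose keep-alive stamp (time.time() + DEFAULT_PING_SECONDS)
# has already passed; instances are expected to re-register periodically.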
def _remove_oldest_instances(instances: dict[str, Any]) -> None:
oldest_key = next(iter(instances), None)
while oldest_key is not None:
value = instances[oldest_key]
if value[1] > time.time():
break
print(f"🔴Remove [HTTP:{oldest_key}, ZMQ:{value[0]}, stamp:{value[1]}]")
instances.pop(oldest_key, None)
oldest_key = next(iter(instances), None)
def _listen_for_register(poller, router_socket):
while True:
socks = dict(poller.poll())
if router_socket in socks:
remote_address, message = router_socket.recv_multipart()
# data: {"type": "P", "http_address": "ip:port",
# "zmq_address": "ip:port"}
data = msgpack.loads(message)
if data["type"] == "P":
global prefill_instances
global prefill_cv
with prefill_cv:
node = prefill_instances.get(data["http_address"], None)
prefill_instances[data["http_address"]] = (
data["zmq_address"],
time.time() + DEFAULT_PING_SECONDS,
)
_remove_oldest_instances(prefill_instances)
elif data["type"] == "D":
global decode_instances
global decode_cv
with decode_cv:
node = decode_instances.get(data["http_address"], None)
decode_instances[data["http_address"]] = (
data["zmq_address"],
time.time() + DEFAULT_PING_SECONDS,
)
_remove_oldest_instances(decode_instances)
else:
print(
f"Unexpected message received from {remote_address}, data: {data}"
)
return
if node is None:
print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['zmq_address']}]")
def start_service_discovery(hostname, port):
if not hostname:
hostname = socket.gethostname()
if port == 0:
raise ValueError("Port cannot be 0")
context = zmq.Context()
router_socket = context.socket(zmq.ROUTER)
router_socket.bind(f"tcp://{hostname}:{port}")
poller = zmq.Poller()
poller.register(router_socket, zmq.POLLIN)
_listener_thread = threading.Thread(
target=_listen_for_register, args=[poller, router_socket], daemon=True
)
_listener_thread.start()
return _listener_thread
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
app = Quart(__name__)
def random_uuid() -> str:
return uuid.uuid4().hex
async def forward_request(url, data, request_id):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
"X-Request-Id": request_id,
}
async with session.post(url=url, json=data, headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content.iter_chunked(1024):
yield chunk_bytes
@app.route("/v1/completions", methods=["POST"])
@app.route("/v1/chat/completions", methods=["POST"])
async def handle_request():
try:
original_request_data = await request.get_json()
prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill
prefill_request["max_tokens"] = 1
if "max_completion_tokens" in prefill_request:
prefill_request["max_completion_tokens"] = 1
global count
global prefill_instances
global prefill_cv
with prefill_cv:
prefill_list = list(prefill_instances.items())
prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
prefill_zmq_addr = prefill_zmq_addr[0]
global decode_instances
global decode_cv
with decode_cv:
decode_list = list(decode_instances.items())
decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
decode_zmq_addr = decode_zmq_addr[0]
print(
f"handle_request count: {count}, [HTTP:{prefill_addr}, "
f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
f"ZMQ:{decode_zmq_addr}]"
)
count += 1
request_id = (
f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
f"{decode_zmq_addr}_{random_uuid()}"
)
# finish prefill
async for _ in forward_request(
f"http://{prefill_addr}{request.path}", prefill_request, request_id
):
continue
# return decode
generator = forward_request(
f"http://{decode_addr}{request.path}", original_request_data, request_id
)
response = await make_response(generator)
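# Disable Quart's per-response timeout so long streaming responses are not cut off.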
response.timeout = None
return response
except Exception as e:
import sys
import traceback
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server")
print(e)
print("".join(traceback.format_exception(*exc_info)))
if __name__ == "__main__":
t = start_service_discovery("0.0.0.0", 30001)
app.run(host="0.0.0.0", port=10001)
t.join()

View File

@@ -0,0 +1,57 @@
#!/bin/bash
MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
HOST="localhost"
PORT=8006
NUM_PROMPTS=20
REQUEST_RATE=5
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL_NAME="$2"
shift 2
;;
--local-model)
MODEL_NAME=$LOCAL_MODEL_PATH
shift
;;
--host)
HOST="$2"
shift 2
;;
--port)
PORT="$2"
shift 2
;;
--num-prompts)
NUM_PROMPTS="$2"
shift 2
;;
--request-rate)
REQUEST_RATE="$2"
shift 2
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --model MODEL_NAME Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)"
echo " --local-model Use local model path (convenience option)"
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use -h or --help for usage information"
exit 1
;;
esac
done
vllm bench serve \
--model $MODEL_NAME \
--host $HOST \
--port $PORT \
--num-prompts $NUM_PROMPTS \
--request-rate $REQUEST_RATE

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import sys
import requests
def scale(host, port, new_dp_size):
url = f"http://{host}:{port}/scale_elastic_ep"
payload = {"new_data_parallel_size": new_dp_size}
headers = {"Content-Type": "application/json"}
print(f"Sending scale request to {url}")
print(f"Payload: {json.dumps(payload, indent=2)}")
try:
response = requests.post(url, json=payload, headers=headers, timeout=300)
print(f"Status Code: {response.status_code}")
print(f"Response: {response.text}")
if response.status_code == 200:
print("Scale up/down request successful!")
return True
else:
print("Scale up/down request failed!")
return False
except requests.exceptions.RequestException as e:
print(f"Request failed: {e}")
return False
def main():
parser = argparse.ArgumentParser(description="Test scale up/down functionality")
parser.add_argument("--host", default="localhost", help="API server host")
parser.add_argument("--port", type=int, default=8006, help="API server port")
parser.add_argument(
"--new-dp-size", type=int, default=2, help="New data parallel size"
)
args = parser.parse_args()
success = scale(args.host, args.port, args.new_dp_size)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,71 @@
#!/bin/bash
HOST="0.0.0.0"
PORT=8006
DATA_PARALLEL_SIZE=4
REDUNDANT_EXPERTS=0
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
while [[ $# -gt 0 ]]; do
case $1 in
--dp)
DATA_PARALLEL_SIZE="$2"
shift 2
;;
--re)
REDUNDANT_EXPERTS="$2"
shift 2
;;
--host)
HOST="$2"
shift 2
;;
--port)
PORT="$2"
shift 2
;;
--model)
MODEL_NAME="$2"
shift 2
;;
--local-model)
MODEL_NAME=$LOCAL_MODEL_PATH
shift
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --dp SIZE Set data parallel size (default: 4)"
echo " --re SIZE Set redundant experts (default: 0)"
echo " --host HOST Set host address (default: 0.0.0.0)"
echo " --port PORT Set port number (default: 8006)"
echo " --model MODEL_NAME Set model name or path"
echo " -h, --help Show this help message"
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use -h or --help for usage information"
exit 1
;;
esac
done
echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
export RAY_DEDUP_LOGS=0
export VLLM_ALL2ALL_BACKEND="pplx"
export VLLM_USE_DEEP_GEMM=1
vllm serve $MODEL_NAME \
--data-parallel-size $DATA_PARALLEL_SIZE \
--data-parallel-size-local $DATA_PARALLEL_SIZE \
--data-parallel-backend ray \
--enforce-eager \
--enable-expert-parallel \
--enable-eplb \
--num-redundant-experts $REDUNDANT_EXPERTS \
--trust-remote-code \
--host $HOST \
--port $PORT

View File

@@ -0,0 +1,112 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
Start Gradio OpenAI Chatbot Webserver:
python examples/online_serving/gradio_openai_chatbot_webserver.py \
-m meta-llama/Llama-2-7b-chat-hf
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import gradio as gr
from openai import OpenAI
def predict(message, history, client, model_name, temp, stop_token_ids):
messages = [
{"role": "system", "content": "You are a great AI assistant."},
*history,
{"role": "user", "content": message},
]
# Send request to OpenAI API (vLLM server)
stream = client.chat.completions.create(
model=model_name,
messages=messages,
temperature=temp,
stream=True,
extra_body={
"repetition_penalty": 1,
"stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")]
if stop_token_ids
else [],
},
)
# Collect all chunks and concatenate them into a full message
full_message = ""
for chunk in stream:
full_message += chunk.choices[0].delta.content or ""
# Return the full message as a single response
return full_message
def parse_args():
parser = argparse.ArgumentParser(
description="Chatbot Interface with Customizable Parameters"
)
parser.add_argument(
"--model-url", type=str, default="http://localhost:8000/v1", help="Model URL"
)
parser.add_argument(
"-m", "--model", type=str, required=True, help="Model name for the chatbot"
)
parser.add_argument(
"--temp", type=float, default=0.8, help="Temperature for text generation"
)
parser.add_argument(
"--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs"
)
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
return parser.parse_args()
def build_gradio_interface(client, model_name, temp, stop_token_ids):
def chat_predict(message, history):
return predict(message, history, client, model_name, temp, stop_token_ids)
return gr.ChatInterface(
fn=chat_predict,
title="Chatbot Interface",
description="A simple chatbot powered by vLLM",
)
def main():
# Parse the arguments
args = parse_args()
# Set OpenAI's API key and API base to use vLLM's API server
openai_api_key = "EMPTY"
openai_api_base = args.model_url
# Create an OpenAI client
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
# Define the Gradio chatbot interface using the predict function
gradio_interface = build_gradio_interface(
client, args.model, args.temp, args.stop_token_ids
)
gradio_interface.queue().launch(
server_name=args.host, server_port=args.port, share=True
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio Webserver
Start vLLM API server:
python -m vllm.entrypoints.api_server \
--model meta-llama/Llama-2-7b-chat-hf
Start Webserver:
python examples/online_serving/gradio_webserver.py
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import json
import gradio as gr
import requests
def http_bot(prompt):
headers = {"User-Agent": "vLLM Client"}
pload = {
"prompt": prompt,
"stream": True,
"max_tokens": 128,
}
response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"\n"
):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"][0]
yield output
def build_demo():
with gr.Blocks() as demo:
gr.Markdown("# vLLM text completion demo\n")
inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
outputbox = gr.Textbox(
label="Output", placeholder="Generated result from the model"
)
inputbox.submit(http_bot, [inputbox], [outputbox])
return demo
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
parser.add_argument(
"--model-url", type=str, default="http://localhost:8000/generate"
)
return parser.parse_args()
def main(args):
demo = build_demo()
demo.queue().launch(server_name=args.host, server_port=args.port, share=True)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -0,0 +1,117 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import msgspec
import zmq
from msgspec.msgpack import Decoder
from vllm.v1.core.kv_cache_utils import ExternalBlockHash
#
# Types copied from vllm.distributed.kv_events
#
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
ts: float
events: list[Any]
class KVCacheEvent(
msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True
):
"""Base class for all KV cache-related events"""
class BlockStored(KVCacheEvent):
block_hashes: list[ExternalBlockHash]
parent_block_hash: ExternalBlockHash | None
token_ids: list[int]
block_size: int
lora_id: int | None
medium: str | None
class BlockRemoved(KVCacheEvent):
block_hashes: list[ExternalBlockHash]
medium: str | None
class AllBlocksCleared(KVCacheEvent):
pass
class KVEventBatch(EventBatch):
events: list[BlockStored | BlockRemoved | AllBlocksCleared]
def process_event(event_batch):
print(f"Received event batch at {event_batch.ts}:")
for event in event_batch.events:
print(f" - {event}")
def main():
decoder = Decoder(type=KVEventBatch)
last_seq = -1
context = zmq.Context()
# Set up the main subscription socket
sub = context.socket(zmq.SUB)
sub.connect("tcp://localhost:5557")
topic = "kv-events"
sub.setsockopt_string(zmq.SUBSCRIBE, topic)
# Initialize replay socket
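# (REQ socket used to ask the publisher to re-send events we missed, addressed by sequence number)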
replay = context.socket(zmq.REQ)
replay.connect("tcp://localhost:5558")
poller = zmq.Poller()
poller.register(replay, zmq.POLLIN)
print("Listening for KV cache events on topic:", topic)
while True:
try:
if sub.poll(50):
_, seq_bytes, payload = sub.recv_multipart()
seq = int.from_bytes(seq_bytes, "big")
if last_seq >= 0 and seq > last_seq + 1:
missed = seq - last_seq - 1
print(
f"Missed {missed} messages (last: {last_seq}, current: {seq})"
)
replay.send((last_seq + 1).to_bytes(8, "big"))
while poller.poll(timeout=200):
seq_bytes, replay_payload = replay.recv_multipart()
if not replay_payload:
# End of replay marker is sent as an empty frame
# for the payload
break
replay_seq = int.from_bytes(seq_bytes, "big")
if replay_seq > last_seq:
event_batch = decoder.decode(replay_payload)
process_event(event_batch)
last_seq = replay_seq
if replay_seq >= seq - 1:
break
event_batch = decoder.decode(payload)
process_event(event_batch)
# ... do other periodic work or check for shutdown ...
except KeyboardInterrupt:
print("Interrupted")
break
except Exception as e:
print("Error decoding message:", e)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,119 @@
#!/bin/bash
#
# Helper script to manually start or join a Ray cluster for online serving of vLLM models.
# This script is first executed on the head node, and then on each worker node with the IP address
# of the head node.
#
# Subcommands:
# leader: Launches a Ray head node and blocks until the cluster reaches the expected size (head + workers).
# worker: Starts a worker node that connects to an existing Ray head node.
#
# Example usage:
# On the head node machine, start the Ray head node process and run a vLLM server.
# ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \
# vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2
#
# On each worker node, start the Ray worker node process.
# ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
#
# About Ray:
# Ray is an open-source distributed execution framework that simplifies
# distributed computing. Learn more:
# https://ray.io/
subcommand=$1 # Either "leader" or "worker".
shift # Remove the subcommand from the argument list.
ray_port=6379 # Port used by the Ray head node.
ray_init_timeout=300 # Seconds to wait before timing out.
declare -a start_params # Parameters forwarded to the underlying 'ray start' command.
# Handle the worker subcommand.
case "$subcommand" in
worker)
ray_address=""
while [ $# -gt 0 ]; do
case "$1" in
--ray_address=*)
ray_address="${1#*=}"
;;
--ray_port=*)
ray_port="${1#*=}"
;;
--ray_init_timeout=*)
ray_init_timeout="${1#*=}"
;;
*)
start_params+=("$1")
esac
shift
done
if [ -z "$ray_address" ]; then
echo "Error: Missing argument --ray_address"
exit 1
fi
# Retry until the worker node connects to the head node or the timeout expires.
for (( i=0; i < $ray_init_timeout; i+=5 )); do
ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
if [ $? -eq 0 ]; then
echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
exit 0
fi
echo "Waiting until the ray worker is active..."
sleep 5s;
done
echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
exit 1
;;
# Handle the leader subcommand.
leader)
ray_cluster_size=""
while [ $# -gt 0 ]; do
case "$1" in
--ray_port=*)
ray_port="${1#*=}"
;;
--ray_cluster_size=*)
ray_cluster_size="${1#*=}"
;;
--ray_init_timeout=*)
ray_init_timeout="${1#*=}"
;;
*)
start_params+=("$1")
esac
shift
done
if [ -z "$ray_cluster_size" ]; then
echo "Error: Missing argument --ray_cluster_size"
exit 1
fi
# Start the Ray head node.
ray start --head --port=$ray_port "${start_params[@]}"
# Poll Ray until every worker node is active.
for (( i=0; i < $ray_init_timeout; i+=5 )); do
active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
if [ $active_nodes -eq $ray_cluster_size ]; then
echo "All ray workers are active and the ray cluster is initialized successfully."
exit 0
fi
echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
sleep 5s;
done
echo "Waiting for all ray workers to be active timed out."
exit 1
;;
*)
echo "unknown subcommand: $subcommand"
exit 1
;;
esac

View File

@@ -0,0 +1,87 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import threading
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.v1.metrics.loggers import AggregatedLoggingStatLogger
"""
To run this example, run the following commands simultaneously with
different CUDA_VISIBLE_DEVICES:
python examples/online_serving/multi_instance_data_parallel.py
vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
--data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
--data-parallel-size-local 1 --enforce-eager --headless
Once both instances have completed the handshake, this example will
send a request to the instance with DP rank 1.
"""
def _do_background_logging(engine, interval, stop_event):
try:
while not stop_event.is_set():
asyncio.run(engine.do_log_stats())
stop_event.wait(interval)
except Exception as e:
print(f"vLLM background logging shutdown: {e}")
pass
async def main():
engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
data_parallel_size=2,
tensor_parallel_size=1,
dtype="auto",
max_model_len=2048,
data_parallel_address="127.0.0.1",
data_parallel_rpc_port=62300,
data_parallel_size_local=1,
enforce_eager=True,
enable_log_requests=True,
disable_custom_all_reduce=True,
)
engine_client = AsyncLLMEngine.from_engine_args(
engine_args,
# Example: Using aggregated logger
stat_loggers=[AggregatedLoggingStatLogger],
)
stop_logging_event = threading.Event()
logging_thread = threading.Thread(
target=_do_background_logging,
args=(engine_client, 5, stop_logging_event),
daemon=True,
)
logging_thread.start()
sampling_params = SamplingParams(
temperature=0.7,
top_p=0.9,
max_tokens=100,
)
num_prompts = 10
for i in range(num_prompts):
prompt = "Who won the 2004 World Series?"
final_output: RequestOutput | None = None
async for output in engine_client.generate(
prompt=prompt,
sampling_params=sampling_params,
request_id=f"abcdef-{i}",
data_parallel_rank=1,
):
final_output = output
if final_output:
print(final_output.outputs[0].text)
stop_logging_event.set()
logging_thread.join()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
"""
import argparse
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"},
{
"role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020.",
},
{"role": "user", "content": "Where was it played?"},
]
def parse_args():
parser = argparse.ArgumentParser(description="Client for vLLM API server")
parser.add_argument(
"--stream", action="store_true", help="Enable streaming response"
)
return parser.parse_args()
def main(args):
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Chat Completion API
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
stream=args.stream,
)
print("-" * 50)
print("Chat completion results:")
if args.stream:
for c in chat_completion:
print(c)
else:
print(chat_completion)
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -0,0 +1,353 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
--max-model-len 4096 --trust-remote-code
run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
"""
import base64
import requests
from openai import OpenAI
from utils import get_first_model
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
headers = {"User-Agent": "vLLM Example Client"}
def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url, headers=headers) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode("utf-8")
return result
# Text-only inference
def run_text_only(model: str, max_completion_tokens: int) -> None:
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": "What's the capital of France?"}],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion.choices[0].message.content
print("Chat completion output:\n", result)
# Single-image input inference
def run_single_image(model: str, max_completion_tokens: int) -> None:
## Use image url in the payload
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:\n", result)
## Use base64 encoded image in the payload
image_base64 = encode_base64_content_from_url(image_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded image:", result)
# Multi-image input inference
def run_multi_image(model: str, max_completion_tokens: int) -> None:
image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What are the animals in these images?"},
{
"type": "image_url",
"image_url": {"url": image_url_duck},
},
{
"type": "image_url",
"image_url": {"url": image_url_lion},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:\n", result)
# Video input inference
def run_video(model: str, max_completion_tokens: int) -> None:
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
video_base64 = encode_base64_content_from_url(video_url)
## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this video?"},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:\n", result)
## Use base64 encoded video in the payload
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this video?"},
{
"type": "video_url",
"video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded video:\n", result)
# Audio input inference
def run_audio(model: str, max_completion_tokens: int) -> None:
from vllm.assets.audio import AudioAsset
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)
# OpenAI-compatible schema (`input_audio`)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "input_audio",
"input_audio": {
# Any format supported by librosa is supported
"data": audio_base64,
"format": "wav",
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:\n", result)
# HTTP URL
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": audio_url
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:\n", result)
# base64 URL
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": f"data:audio/ogg;base64,{audio_base64}"
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded audio:\n", result)
def run_multi_audio(model: str, max_completion_tokens: int) -> None:
from vllm.assets.audio import AudioAsset
# Two different audios to showcase batched inference.
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)
audio_url2 = AudioAsset("azacinto_foscolo").url
audio_base64_2 = encode_base64_content_from_url(audio_url2)
# OpenAI-compatible schema (`input_audio`)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Are these two audios the same?"},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64,
"format": "wav",
},
},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64_2,
"format": "wav",
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:\n", result)
example_function_map = {
"text-only": run_text_only,
"single-image": run_single_image,
"multi-image": run_multi_image,
"multi-audio": run_multi_audio,
"video": run_video,
"audio": run_audio,
}
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using OpenAI client for online serving with "
"multimodal language models served with vLLM."
)
parser.add_argument(
"--chat-type",
"-c",
type=str,
default="single-image",
choices=list(example_function_map.keys()),
help="Conversation type with multimodal data.",
)
parser.add_argument(
"--max-completion-tokens",
"-n",
type=int,
default=128,
help="Maximum number of tokens to generate for each completion.",
)
return parser.parse_args()
def main(args) -> None:
chat_type = args.chat_type
model = get_first_model(client)
example_function_map[chat_type](model, args.max_completion_tokens)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -0,0 +1,195 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example:
IMPORTANT: for Mistral, you must use one of the provided Mistral tool call
templates, or your own - the model's default template does not work for tool
calls with vLLM. See the vLLM docs on the OpenAI-compatible server & tool
calling for more details.
vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
--chat-template examples/tool_chat_template_mistral.jinja \
--enable-auto-tool-choice --tool-call-parser mistral
OR
vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B \
--chat-template examples/tool_chat_template_hermes.jinja \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import json
from typing import Any
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
properties = {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
}
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": properties,
"required": ["city", "state", "unit"],
},
},
}
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
def get_current_weather(city: str, state: str, unit: str):
return (
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's."
)
def handle_tool_calls_stream(
client: OpenAI,
messages: list[dict[str, str]],
model: str,
tools: list[dict[str, Any]],
) -> list[Any]:
tool_calls_stream = client.chat.completions.create(
messages=messages, model=model, tools=tools, stream=True
)
chunks = []
print("chunks: ")
for chunk in tool_calls_stream:
chunks.append(chunk)
if chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls[0])
else:
print(chunk.choices[0].delta)
return chunks
def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
arguments = []
tool_call_idx = -1
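    # Streamed tool-call arguments arrive as small string fragments; accumulate
    # them per tool-call index so each complete argument string can be reported.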
print("arguments: ")
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
if tool_call_idx >= 0:
print(f"streamed tool call arguments: {arguments[tool_call_idx]}")
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
if tool_call.id:
print(f"streamed tool call id: {tool_call.id} ")
if tool_call.function:
if tool_call.function.name:
print(f"streamed tool call name: {tool_call.function.name}")
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
return arguments
def main():
# Initialize OpenAI client
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
# Get available models and select one
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages, model=model, tools=tools
)
print("-" * 70)
print("Chat completion results:")
print(chat_completion)
print("-" * 70)
# Stream tool calls
chunks = handle_tool_calls_stream(client, messages, model, tools)
print("-" * 70)
# Handle arguments from streamed tool calls
arguments = handle_tool_calls_arguments(chunks)
if len(arguments):
print(f"streamed tool call arguments: {arguments[-1]}\n")
print("-" * 70)
# Add tool call results to the conversation
messages.append(
{
"role": "assistant",
"tool_calls": chat_completion.choices[0].message.tool_calls,
"reasoning": chat_completion.choices[0].message.reasoning,
}
)
# Now, simulate a tool call
available_tools = {"get_current_weather": get_current_weather}
completion_tool_calls = chat_completion.choices[0].message.tool_calls
for call in completion_tool_calls:
tool_to_call = available_tools[call.function.name]
args = json.loads(call.function.arguments)
result = tool_to_call(**args)
print("tool_to_call result: ", result)
messages.append(
{
"role": "tool",
"content": result,
"tool_call_id": call.id,
"name": call.function.name,
}
)
chat_completion_2 = client.chat.completions.create(
messages=messages, model=model, tools=tools, stream=False
)
print("Chat completion2 results:")
print(chat_completion_2)
print("-" * 70)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,130 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
To run this example, you can start the vLLM server
without any specific flags:
```bash
vllm serve unsloth/Llama-3.2-1B-Instruct \
--structured-outputs-config.backend outlines
```
This example demonstrates how to generate chat completions
using the OpenAI Python client library.
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for"
", e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": (
"the two-letter abbreviation for the state that the "
"city is in, e.g. 'CA' which would mean 'California'"
),
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "get_forecast",
"description": "Get the weather forecast for a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": (
"The city to get the forecast for, e.g. 'New York'"
),
},
"state": {
"type": "string",
"description": (
"The two-letter abbreviation for the state, e.g. 'NY'"
),
},
"days": {
"type": "integer",
"description": "Number of days to get the forecast for (1-7)",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "days", "unit"],
},
},
},
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the current weather is in Dallas \
and the forecast for the next 5 days, in fahrenheit?",
},
]
def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True, # Enable streaming response
)
for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)
chat_completion = client.chat.completions.create(
messages=messages, model=model, tools=tools, tool_choice="required"
)
print(chat_completion.choices[0].message.tool_calls)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,245 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
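        # NOTE: eval() is used here only to keep the demo short; never evaluate
        # untrusted input like this in production code.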
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_response(response, tool_functions, original_query):
"""Process a non-streaming response with possible tool calls"""
print("\n--- Response Output ---")
# Check if the response has content
if response.choices[0].message.content:
print(f"Content: {response.choices[0].message.content}")
# Check if the response has tool calls
if response.choices[0].message.tool_calls:
print("--------------------------------")
print(f"Tool calls: {response.choices[0].message.tool_calls}")
print("--------------------------------")
# Collect all tool calls and results before making follow-up request
tool_results = []
assistant_message = {"role": "assistant"}
if response.choices[0].message.content:
assistant_message["content"] = response.choices[0].message.content
assistant_tool_calls = []
# Process each tool call
for tool_call in response.choices[0].message.tool_calls:
function_name = tool_call.function.name
function_args = tool_call.function.arguments
function_id = tool_call.id
print(f"Function called: {function_name}")
print(f"Arguments: {function_args}")
print(f"Function ID: {function_id}")
# Execute the function
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(f"\n--- Function Result ---\n{function_result}\n")
# Add tool call to assistant message
assistant_tool_calls.append(
{
"id": function_id,
"type": "function",
"function": {"name": function_name, "arguments": function_args},
}
)
# Add tool result to tool_results
tool_results.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Add tool_calls to assistant message
assistant_message["tool_calls"] = assistant_tool_calls
# Create a follow-up message with all function results
follow_up_messages = [
{"role": "user", "content": original_query},
assistant_message,
]
# Add all tool results to the messages
follow_up_messages.extend(tool_results)
# Get completion with all tool results in a single follow-up
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=False,
)
print("\n--- Follow-up Response ---")
print(follow_up_response.choices[0].message.content)
print("--- End Follow-up ---\n")
print("--- End Response ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create non-streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=False,
)
# Process the non-streaming response, passing the original query
process_response(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,273 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
This example demonstrates streaming tool calls with xLAM models.
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid Python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_stream(response, tool_functions, original_query):
"""Process a streaming response with possible tool calls"""
# Track multiple tool calls
tool_calls = {} # Dictionary to store tool calls by ID
current_id = None
print("\n--- Stream Output ---")
for chunk in response:
# Handle tool calls in the stream
if chunk.choices[0].delta.tool_calls:
for tool_call_chunk in chunk.choices[0].delta.tool_calls:
# Get the tool call ID
if hasattr(tool_call_chunk, "id") and tool_call_chunk.id:
current_id = tool_call_chunk.id
if current_id not in tool_calls:
tool_calls[current_id] = {
"function_name": None,
"function_args": "",
"function_id": current_id,
}
# Extract function information as it comes in chunks
if (
hasattr(tool_call_chunk, "function")
and current_id
and current_id in tool_calls
):
if (
hasattr(tool_call_chunk.function, "name")
and tool_call_chunk.function.name
):
tool_calls[current_id]["function_name"] = (
tool_call_chunk.function.name
)
print(f"Function called: {tool_call_chunk.function.name}")
if (
hasattr(tool_call_chunk.function, "arguments")
and tool_call_chunk.function.arguments
):
tool_calls[current_id]["function_args"] += (
tool_call_chunk.function.arguments
)
print(f"Arguments chunk: {tool_call_chunk.function.arguments}")
# Handle regular content in the stream
elif chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Stream ---\n")
# Execute each function call and build messages for follow-up
follow_up_messages = [{"role": "user", "content": original_query}]
for tool_id, tool_data in tool_calls.items():
function_name = tool_data["function_name"]
function_args = tool_data["function_args"]
function_id = tool_data["function_id"]
if function_name and function_args:
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(
f"\n--- Function Result ({function_name}) ---\n{function_result}\n"
)
# Add the assistant message with tool call
follow_up_messages.append(
{
"role": "assistant",
"tool_calls": [
{
"id": function_id,
"type": "function",
"function": {
"name": function_name,
"arguments": function_args,
},
}
],
}
)
# Add the tool message with function result
follow_up_messages.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Only send follow-up if we have results to process
if len(follow_up_messages) > 1:
# Create a follow-up message with all the function results
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=True,
)
print("\n--- Follow-up Response ---")
for chunk in follow_up_response:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Follow-up ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=True,
)
# Process the streaming response
process_stream(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,170 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example demonstrates how to use tool calling with reasoning models
such as QwQ-32B. The reasoning content is not parsed by the tool
calling process; only the final output is parsed.
To run this example, you need to start the vLLM server with both
the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B \
--reasoning-parser deepseek_r1 \
--enable-auto-tool-choice --tool-call-parser hermes
```
"""
from openai import OpenAI
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: str):
return (
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's."
)
available_tools = {"get_current_weather": get_current_weather}
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
properties = {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
}
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": properties,
"required": ["city", "state", "unit"],
},
},
}
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
def extract_reasoning_and_calls(chunks: list):
reasoning = ""
tool_call_idx = -1
arguments = []
function_names = []
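    # Accumulate reasoning text plus per-tool-call name and argument fragments
    # from the streamed chunks.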
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
function_names.append("")
if tool_call.function:
if tool_call.function.name:
function_names[tool_call_idx] = tool_call.function.name
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
else:
if hasattr(chunk.choices[0].delta, "reasoning"):
reasoning += chunk.choices[0].delta.reasoning
return reasoning, arguments, function_names
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
print("---------Full Generate With Automatic Function Calling-------------")
tool_calls = client.chat.completions.create(
messages=messages, model=model, tools=tools
)
print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}")
print(
f"function arguments: "
f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}"
)
print("----------Stream Generate With Automatic Function Calling-----------")
tool_calls_stream = client.chat.completions.create(
messages=messages, model=model, tools=tools, stream=True
)
chunks = list(tool_calls_stream)
reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
print(f"reasoning: {reasoning}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("----------Full Generate With Named Function Calling-----------------")
tool_calls = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
)
tool_call = tool_calls.choices[0].message.tool_calls[0].function
print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
print(f"function name: {tool_call.name}")
print(f"function arguments: {tool_call.arguments}")
print("----------Stream Generate With Named Function Calling--------------")
tool_calls_stream = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
stream=True,
)
chunks = list(tool_calls_stream)
reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
print(f"reasoning: {reasoning}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to generate chat completions from reasoning models
such as DeepSeek-R1.
To run this example, you need to start the vLLM server
with the reasoning parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--reasoning-parser deepseek_r1
```
This example demonstrates how to generate chat completions from reasoning models
using the OpenAI Python client library.
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
reasoning = response.choices[0].message.reasoning
content = response.choices[0].message.content
print("reasoning for Round 1:", reasoning)
print("content for Round 1:", content)
# Round 2
messages.append({"role": "assistant", "content": content})
messages.append(
{
"role": "user",
"content": "How many Rs are there in the word 'strawberry'?",
}
)
response = client.chat.completions.create(model=model, messages=messages)
reasoning = response.choices[0].message.reasoning
content = response.choices[0].message.content
print("reasoning for Round 2:", reasoning)
print("content for Round 2:", content)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to generate chat completions from reasoning models
such as DeepSeek-R1.
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--reasoning-parser deepseek_r1
```
Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
streaming chat completions feature.
The streaming chat completions feature allows you to receive chat completions
in real time as they are generated by the model. This is useful when you want
to display output to the user as it is produced.
Remember to check that `content` and `reasoning` exist in `ChatCompletionChunk`;
`content` may not be present, leading to errors if you try to access it.
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# ruff: noqa: E501
# For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model, messages=messages, stream=True)
print("client: Start streaming chat completions...")
printed_reasoning = False
printed_content = False
for chunk in stream:
# Safely extract reasoning and content from delta,
# defaulting to None if attributes don't exist or are empty strings
reasoning = getattr(chunk.choices[0].delta, "reasoning", None) or None
content = getattr(chunk.choices[0].delta, "content", None) or None
if reasoning is not None:
if not printed_reasoning:
printed_reasoning = True
print("reasoning:", end="", flush=True)
print(reasoning, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def parse_args():
parser = argparse.ArgumentParser(description="Client for vLLM API server")
parser.add_argument(
"--stream", action="store_true", help="Enable streaming response"
)
return parser.parse_args()
def main(args):
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Completion API
completion = client.completions.create(
model=model,
prompt="A robot may not injure a human being",
echo=False,
n=2,
stream=args.stream,
logprobs=3,
)
print("-" * 50)
print("Completion results:")
if args.stream:
for c in completion:
print(c)
else:
print(completion)
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -0,0 +1,44 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server.
Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses
For example:
vllm serve Qwen/Qwen3-8B --reasoning-parser qwen3
"""
from openai import OpenAI
input_messages = [{"role": "user", "content": "What model are you?"}]
def main():
base_url = "http://localhost:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = "Qwen/Qwen3-8B" # get_first_model(client)
response = client.responses.create(
model=model,
input=input_messages,
)
for message in response.output:
if message.type == "reasoning":
# append reasoning message
input_messages.append(message)
response_2 = client.responses.create(
model=model,
input=input_messages,
)
print(response_2.output_text)
# I am Qwen, a large language model developed by Alibaba Cloud.
# I am designed to assist with a wide range of tasks, including
# answering questions, creating content, coding, and engaging in
# conversations. I can help with various topics and provide
# information or support in multiple languages. How can I assist you today?
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,184 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example demonstrating MCP (Model Context Protocol) tools with the Responses API.
This example shows how to use MCP tools with different allowed_tools configurations:
1. No filter (allows all tools from the MCP server)
2. Wildcard "*" (explicitly allows all tools)
3. Specific tool names (filters to only those tools)
Set up this example by starting a vLLM OpenAI-compatible server with MCP tools enabled.
For example:
vllm serve openai/gpt-oss-20b --enforce-eager --tool-server demo
Environment variables:
- VLLM_ENABLE_RESPONSES_API_STORE=1
- VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=code_interpreter,container
- VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS=1
"""
from openai import OpenAI
from utils import get_first_model
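# NOTE: get_first_model comes from the local examples `utils` module; it is
# assumed to simply return the first model ID reported by `client.models.list()`.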
def example_no_filter():
"""Example with no allowed_tools filter - allows all tools."""
print("=" * 60)
print("Example 1: No allowed_tools filter (allows all tools)")
print("=" * 60)
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model,
input="Execute this code: print('Hello from Python!')",
instructions="Use the Python tool to execute code.",
tools=[
{
"type": "mcp",
"server_label": "code_interpreter",
"server_url": "http://localhost:8888",
# No allowed_tools specified - all tools are available
}
],
)
print(f"Status: {response.status}")
print(f"Output: {response.output_text}")
print()
def example_wildcard():
"""Example with allowed_tools=['*'] - explicitly allows all tools."""
print("=" * 60)
print("Example 2: allowed_tools=['*'] (select all tools)")
print("=" * 60)
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model,
input="Execute this code: print('Hello from Python with wildcard!')",
instructions="Use the Python tool to execute code.",
tools=[
{
"type": "mcp",
"server_label": "code_interpreter",
"server_url": "http://localhost:8888",
# Using "*" to explicitly allow all tools from this MCP server
# This is equivalent to not specifying allowed_tools
"allowed_tools": ["*"],
}
],
)
print(f"Status: {response.status}")
print(f"Output: {response.output_text}")
print()
def example_specific_tools():
"""Example with specific allowed_tools list - filters available tools.
Note: This example uses 'web_search_preview' (browser) which has multiple
sub-tools: 'search', 'open', 'find'. The code_interpreter (python) doesn't
have sub-tools, so filtering doesn't apply there.
"""
print("=" * 60)
print("Example 3: allowed_tools=['search'] (filter browser to specific tools)")
print("=" * 60)
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model,
input="Search for 'Python programming tutorials'",
instructions="Use the browser tool to search.",
tools=[
{
"type": "mcp",
"server_label": "web_search_preview",
"server_url": "http://localhost:8888",
# Browser has tools: 'search', 'open', 'find'
# Only allow 'search' - blocks 'open' and 'find'
"allowed_tools": ["search"],
}
],
)
print(f"Status: {response.status}")
print(f"Output: {response.output_text}")
print()
def example_object_format():
"""Example using object format for allowed_tools with browser tools."""
print("=" * 60)
print("Example 4: allowed_tools with object format")
print("=" * 60)
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model,
input="Search for 'machine learning' and open the first result",
instructions="Use the browser tool.",
tools=[
{
"type": "mcp",
"server_label": "web_search_preview",
"server_url": "http://localhost:8888",
# Object format with tool_names field
# Can also include read_only and other fields
# Browser has tools: 'search', 'open', 'find'
"allowed_tools": {
"tool_names": [
"search",
"open",
], # Allow search and open, block find
"read_only": False,
},
}
],
)
print(f"Status: {response.status}")
print(f"Output: {response.output_text}")
print()
def main():
"""Run all examples."""
print("\n" + "=" * 60)
print("MCP Tools with allowed_tools Examples")
print("=" * 60 + "\n")
# Run all examples
example_no_filter()
example_wildcard()
example_specific_tools()
example_object_format()
print("=" * 60)
print("Summary:")
print(" - No filter or '*' → All tools available from server")
print(" - Specific list → Only those sub-tools available")
print(" - Object format → More control with tool_names field")
print("")
print("Note: allowed_tools filters SUB-TOOLS within an MCP server:")
print(" - code_interpreter (python): No sub-tools to filter")
print(" - web_search_preview (browser): Has 'search', 'open', 'find'")
print("=" * 60)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,83 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled.
Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses
For example:
vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
--structured-outputs-config.backend xgrammar \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import json
from openai import OpenAI
from utils import get_first_model
def get_weather(latitude: float, longitude: float) -> str:
"""
Mock function to simulate getting weather data.
In a real application, this would call an external weather API.
"""
return f"Current temperature at ({latitude}, {longitude}) is 20°C."
tools = [
{
"type": "function",
"name": "get_weather",
"description": "Get current temperature for provided coordinates in celsius.",
"parameters": {
"type": "object",
"properties": {
"latitude": {"type": "number"},
"longitude": {"type": "number"},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
}
]
input_messages = [
{"role": "user", "content": "What's the weather like in Paris today?"}
]
def main():
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model, input=input_messages, tools=tools, tool_choice="required"
)
for out in response.output:
if out.type == "function_call":
print("Function call:", out.name, out.arguments)
tool_call = out
args = json.loads(tool_call.arguments)
result = get_weather(args["latitude"], args["longitude"])
input_messages.append(tool_call) # append model's function call message
input_messages.append(
{ # append result message
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}
)
response_2 = client.responses.create(
model=model,
input=input_messages,
tools=tools,
)
print(response_2.output_text)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to use the vLLM API server to perform audio
transcription with the `openai/whisper-large-v3` model.
Before running this script, you must start the vLLM server with the following command:
vllm serve openai/whisper-large-v3
Requirements:
- vLLM with audio support
- openai Python SDK
The script performs:
1. Synchronous transcription using the OpenAI-compatible API.
2. Streaming transcription using the asynchronous OpenAI client.
"""
import asyncio
from openai import AsyncOpenAI, OpenAI
from vllm.assets.audio import AudioAsset
def sync_openai(audio_path: str, client: OpenAI):
"""
Perform synchronous transcription using OpenAI-compatible API.
"""
with open(audio_path, "rb") as f:
transcription = client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
language="en",
response_format="json",
temperature=0.0,
# Additional sampling params not provided by OpenAI API.
extra_body=dict(
seed=4419,
repetition_penalty=1.3,
),
)
print("transcription result:", transcription.text)
async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
"""
Perform asynchronous transcription using OpenAI-compatible API.
"""
print("\ntranscription result:", end=" ")
with open(audio_path, "rb") as f:
transcription = await client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
language="en",
response_format="json",
temperature=0.0,
# Additional sampling params not provided by OpenAI API.
extra_body=dict(
seed=420,
top_p=0.6,
),
stream=True,
)
async for chunk in transcription:
if chunk.choices:
content = chunk.choices[0].get("delta", {}).get("content")
print(content, end="", flush=True)
print() # Final newline after stream ends
def main():
mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
winning_call = str(AudioAsset("winning_call").get_local_path())
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
sync_openai(mary_had_lamb, client)
# Run the asynchronous function
client = AsyncOpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
asyncio.run(stream_openai_response(winning_call, client))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
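"""
This script demonstrates audio translation with the `openai/whisper-large-v3`
model served by vLLM. It performs:
1. A synchronous translation request via the OpenAI-compatible API.
2. A streaming translation request issued directly over HTTP with httpx,
   since the OpenAI translations client does not support streaming.
"""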
import asyncio
import json
import httpx
from openai import OpenAI
from vllm.assets.audio import AudioAsset
def sync_openai(audio_path: str, client: OpenAI):
with open(audio_path, "rb") as f:
translation = client.audio.translations.create(
file=f,
model="openai/whisper-large-v3",
response_format="json",
temperature=0.0,
# Additional params not provided by OpenAI API.
extra_body=dict(
language="it",
seed=4419,
repetition_penalty=1.3,
),
)
print("translation result:", translation.text)
async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
data = {
"language": "it",
"stream": True,
"model": "openai/whisper-large-v3",
}
url = base_url + "/audio/translations"
headers = {"Authorization": f"Bearer {api_key}"}
print("translation result:", end=" ")
# OpenAI translation API client does not support streaming.
async with httpx.AsyncClient() as client:
with open(audio_path, "rb") as f:
async with client.stream(
"POST", url, files={"file": f}, data=data, headers=headers
) as response:
async for line in response.aiter_lines():
# Each line is a JSON object prefixed with 'data: '
if line:
if line.startswith("data: "):
line = line[len("data: ") :]
# Last chunk, stream ends
if line.strip() == "[DONE]":
break
# Parse the JSON response
chunk = json.loads(line)
# Extract and print the content
content = chunk["choices"][0].get("delta", {}).get("content")
print(content, end="")
def main():
foscolo = str(AudioAsset("azacinto_foscolo").get_local_path())
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
sync_openai(foscolo, client)
# Run the asynchronous function
asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,94 @@
# Setup OpenTelemetry POC
1. Install OpenTelemetry packages:
```bash
pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'
```
1. Start Jaeger in a docker container:
```bash
# From: https://www.jaegertracing.io/docs/1.57/getting-started/
docker run --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-p 16686:16686 \
-p 4317:4317 \
-p 4318:4318 \
-p 14250:14250 \
-p 14268:14268 \
-p 14269:14269 \
-p 9411:9411 \
jaegertracing/all-in-one:1.57
```
1. In a new shell, export Jaeger IP:
```bash
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
```
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
```bash
export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
1. In a new shell, send requests with trace context from a dummy client
```bash
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
export OTEL_SERVICE_NAME="client-service"
python dummy_client.py
```
1. Open Jaeger webui: <http://localhost:16686/>
In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
![Traces](https://i.imgur.com/GYHhFjo.png)
1. Clicking on a trace will show its spans and their tags. In this demo, each trace has two spans: one from the dummy client containing the prompt text, and one from vLLM containing metadata about the request.
![Spans details](https://i.imgur.com/OPf6CBL.png)
## Exporter Protocol
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
```bash
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
## Instrumentation of FastAPI
OpenTelemetry allows automatic instrumentation of FastAPI.
1. Install the instrumentation library
```bash
pip install opentelemetry-instrumentation-fastapi
```
1. Run vLLM with `opentelemetry-instrument`
```bash
opentelemetry-instrument vllm serve facebook/opt-125m
```
1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import requests
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
from opentelemetry.trace import SpanKind, set_tracer_provider
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
trace_provider = TracerProvider()
set_tracer_provider(trace_provider)
trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
tracer = trace_provider.get_tracer("dummy-client")
url = "http://localhost:8000/v1/completions"
with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
prompt = "San Francisco is a"
span.set_attribute("prompt", prompt)
headers = {}
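    # Inject the current trace context (the W3C traceparent header) so vLLM can
    # link its server-side span to this client span in Jaeger.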
TraceContextTextMapPropagator().inject(headers)
payload = {
"model": "facebook/opt-125m",
"prompt": prompt,
"max_tokens": 10,
"n": 3,
"use_beam_search": "true",
"temperature": 0.0,
# "stream": True,
}
response = requests.post(url, headers=headers, json=payload)

View File

@@ -0,0 +1,57 @@
# Prometheus and Grafana
This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can check out other deployment methods on the [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites.
Install:
- [`docker`](https://docs.docker.com/engine/install/)
- [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
## Launch
Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
```bash
vllm serve mistralai/Mistral-7B-v0.1 \
--max-model-len 2048
```
Launch Prometheus and Grafana servers with `docker compose`:
```bash
docker compose up
```
Submit some sample requests to the server:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve \
--model mistralai/Mistral-7B-v0.1 \
--tokenizer mistralai/Mistral-7B-v0.1 \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
--request-rate 3.0
```
Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM.
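For a quick command-line check, you can also inspect the endpoint directly (in recent vLLM releases the metric names are prefixed with `vllm:`):
```bash
curl -s http://localhost:8000/metrics | grep "^vllm:" | head
```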
## Grafana Dashboard
Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`).
### Add Prometheus Data Source
Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus.
On the Prometheus configuration page, add the `Prometheus Server URL` under `Connection`. In this setup, Grafana and Prometheus run in separate containers, but Docker creates a DNS name for each container, so you can simply use `http://prometheus:9090`.
Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
### Import Dashboard
Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:
![Grafana Dashboard Image](https://i.imgur.com/R2vH9VW.png)

View File

@@ -0,0 +1,19 @@
# docker-compose.yaml
version: "3"
services:
prometheus:
image: prom/prometheus:latest
extra_hosts:
- "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine
ports:
- "9090:9090" # the default port used by Prometheus
volumes:
- ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
grafana:
image: grafana/grafana:latest
depends_on:
- prometheus
ports:
- "3000:3000" # the default port used by Grafana

File diff suppressed because it is too large

View File

@@ -0,0 +1,10 @@
# prometheus.yaml
global:
scrape_interval: 5s
evaluation_interval: 30s
scrape_configs:
- job_name: vllm
static_configs:
- targets:
- 'host.docker.internal:8000'

View File

@@ -0,0 +1,79 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
vLLM OpenAI-Compatible Client with Prompt Embeddings
This script demonstrates how to:
1. Generate prompt embeddings using Hugging Face Transformers
2. Encode them in base64 format
3. Send them to a vLLM server via the OpenAI-compatible Completions API
Run the vLLM server first:
vllm serve meta-llama/Llama-3.2-1B-Instruct \
--runner generate \
--max-model-len 4096 \
--enable-prompt-embeds
Run the client:
python examples/online_serving/prompt_embed_inference_with_openai_client.py
Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
You must request access to use it:
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
Dependencies:
- transformers
- torch
- openai
"""
import transformers
from openai import OpenAI
from vllm.utils.serial_utils import tensor2base64
def main():
client = OpenAI(
api_key="EMPTY",
base_url="http://localhost:8000/v1",
)
model_name = "meta-llama/Llama-3.2-1B-Instruct"
# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(
chat, add_generation_prompt=True, return_tensors="pt"
)
embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)
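    # token_ids has shape (1, seq_len); after the embedding lookup and squeeze,
    # prompt_embeds has shape (seq_len, hidden_size).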
# Prompt embeddings
encoded_embeds = tensor2base64(prompt_embeds)
completion = client.completions.create(
model=model_name,
# NOTE: The OpenAI client does not allow `None` as an input to
# `prompt`. Use an empty string if you have no text prompts.
prompt="",
max_tokens=5,
temperature=0.0,
# NOTE: The OpenAI client allows passing in extra JSON body via the
# `extra_body` argument.
extra_body={"prompt_embeds": encoded_embeds},
)
print("-" * 30)
print(completion.choices[0].text)
print("-" * 30)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,55 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Deploy DeepSeek R1 or V3 with Ray Serve LLM.
Ray Serve LLM is a scalable and production-grade model serving library built
on the Ray distributed computing framework, with first-class support for the vLLM engine.
Key features:
- Automatic scaling, back-pressure, and load balancing across a Ray cluster.
- Unified multi-node multi-model deployment.
- Exposes an OpenAI-compatible HTTP API.
- Multi-LoRA support with shared base models.
Run `python3 ray_serve_deepseek.py` to launch an endpoint.
Learn more in the official Ray Serve LLM documentation:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html
"""
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
llm_config = LLMConfig(
model_loading_config={
"model_id": "deepseek",
# Pre-downloading the model to local storage is recommended since
# the model is large. Set model_source="/path/to/the/model".
"model_source": "deepseek-ai/DeepSeek-R1",
},
deployment_config={
"autoscaling_config": {
"min_replicas": 1,
"max_replicas": 1,
}
},
# Set to the node's accelerator type.
accelerator_type="H100",
# Customize engine arguments as required (for example, vLLM engine kwargs).
engine_kwargs={
"tensor_parallel_size": 8,
"pipeline_parallel_size": 2,
"gpu_memory_utilization": 0.92,
"dtype": "auto",
"max_num_seqs": 40,
"max_model_len": 16384,
"enable_chunked_prefill": True,
"enable_prefix_caching": True,
"trust_remote_code": True,
},
)
# Deploy the application.
llm_app = build_openai_app({"llm_configs": [llm_config]})
serve.run(llm_app)
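# NOTE (assumption): once the deployment is running, Ray Serve exposes the
# OpenAI-compatible routes (e.g. /v1/chat/completions) on its HTTP proxy,
# which listens on port 8000 by default, so the endpoint can be queried with
# the standard OpenAI client.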

View File

@@ -0,0 +1,257 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Retrieval Augmented Generation (RAG) Implementation with Langchain
==================================================================
This script demonstrates a RAG implementation using LangChain, Milvus
and vLLM. RAG enhances LLM responses by retrieving relevant context
from a document collection.
Features:
- Web content loading and chunking
- Vector storage with Milvus
- Embedding generation with vLLM
- Question answering with context
Prerequisites:
1. Install dependencies:
pip install -U vllm \
langchain_milvus langchain_openai \
langchain_community beautifulsoup4 \
langchain-text-splitters
2. Start services:
# Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
# Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
Usage:
python retrieval_augmented_generation_with_langchain.py
Notes:
- Ensure both vLLM services are running before executing
- Default ports: 8000 (embedding), 8001 (chat)
- First run may take time to download models
"""
import argparse
from argparse import Namespace
from typing import Any
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_milvus import Milvus
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
def load_and_split_documents(config: dict[str, Any]):
"""
Load and split documents from web URL
"""
try:
loader = WebBaseLoader(web_paths=(config["url"],))
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config["chunk_size"],
chunk_overlap=config["chunk_overlap"],
)
return text_splitter.split_documents(docs)
except Exception as e:
print(f"Error loading document from {config['url']}: {str(e)}")
raise
def init_vectorstore(config: dict[str, Any], documents: list[Document]):
"""
Initialize vector store with documents
"""
return Milvus.from_documents(
documents=documents,
embedding=OpenAIEmbeddings(
model=config["embedding_model"],
openai_api_key=config["vllm_api_key"],
openai_api_base=config["vllm_embedding_endpoint"],
),
connection_args={"uri": config["uri"]},
drop_old=True,
)
def init_llm(config: dict[str, Any]):
"""
Initialize llm
"""
return ChatOpenAI(
model=config["chat_model"],
openai_api_key=config["vllm_api_key"],
openai_api_base=config["vllm_chat_endpoint"],
)
def get_qa_prompt():
"""
Get question answering prompt template
"""
template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
return PromptTemplate.from_template(template)
def format_docs(docs: list[Document]):
"""
Format documents for prompt
"""
return "\n\n".join(doc.page_content for doc in docs)
def create_qa_chain(retriever: Any, llm: ChatOpenAI, prompt: PromptTemplate):
"""
Set up question answering chain
"""
return (
{
"context": retriever | format_docs,
"question": RunnablePassthrough(),
}
| prompt
| llm
| StrOutputParser()
)
def get_parser() -> argparse.ArgumentParser:
"""
Parse command line arguments
"""
parser = argparse.ArgumentParser(description="RAG with vLLM and langchain")
# Add command line arguments
parser.add_argument(
"--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
)
parser.add_argument(
"--vllm-embedding-endpoint",
default="http://localhost:8000/v1",
help="Base URL for embedding service",
)
parser.add_argument(
"--vllm-chat-endpoint",
default="http://localhost:8001/v1",
help="Base URL for chat service",
)
parser.add_argument("--uri", default="./milvus.db", help="URI for Milvus database")
parser.add_argument(
"--url",
default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"),
help="URL of the document to process",
)
parser.add_argument(
"--embedding-model",
default="ssmits/Qwen2-7B-Instruct-embed-base",
help="Model name for embeddings",
)
parser.add_argument(
"--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
)
parser.add_argument(
"-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
)
parser.add_argument(
"-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
)
parser.add_argument(
"-c",
"--chunk-size",
type=int,
default=1000,
help="Chunk size for document splitting",
)
parser.add_argument(
"-o",
"--chunk-overlap",
type=int,
default=200,
help="Chunk overlap for document splitting",
)
return parser
def init_config(args: Namespace):
"""
Initialize configuration settings from command line arguments
"""
return {
"vllm_api_key": args.vllm_api_key,
"vllm_embedding_endpoint": args.vllm_embedding_endpoint,
"vllm_chat_endpoint": args.vllm_chat_endpoint,
"uri": args.uri,
"embedding_model": args.embedding_model,
"chat_model": args.chat_model,
"url": args.url,
"chunk_size": args.chunk_size,
"chunk_overlap": args.chunk_overlap,
"top_k": args.top_k,
}
def main():
# Parse command line arguments
args = get_parser().parse_args()
# Initialize configuration
config = init_config(args)
# Load and split documents
documents = load_and_split_documents(config)
# Initialize vector store and retriever
vectorstore = init_vectorstore(config, documents)
retriever = vectorstore.as_retriever(search_kwargs={"k": config["top_k"]})
# Initialize llm and prompt
llm = init_llm(config)
prompt = get_qa_prompt()
# Set up QA chain
qa_chain = create_qa_chain(retriever, llm, prompt)
# Interactive mode
if args.interactive:
print("\nWelcome to Interactive Q&A System!")
print("Enter 'q' or 'quit' to exit.")
while True:
question = input("\nPlease enter your question: ")
if question.lower() in ["q", "quit"]:
print("\nThank you for using! Goodbye!")
break
output = qa_chain.invoke(question)
print(output)
else:
# Default single question mode
question = "How to install vLLM?"
output = qa_chain.invoke(question)
print("-" * 50)
print(output)
print("-" * 50)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,225 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
RAG (Retrieval Augmented Generation) Implementation with LlamaIndex
================================================================
This script demonstrates a RAG system using:
- LlamaIndex: For document indexing and retrieval
- Milvus: As vector store backend
- vLLM: For embedding and text generation
Features:
1. Document Loading & Processing
2. Embedding & Storage
3. Query Processing
Requirements:
1. Install dependencies:
pip install llama-index llama-index-readers-web \
llama-index-llms-openai-like \
llama-index-embeddings-openai-like \
llama-index-vector-stores-milvus \
2. Start services:
# Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
# Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
Usage:
python retrieval_augmented_generation_with_llamaindex.py
Notes:
- Ensure both vLLM services are running before executing
- Default ports: 8000 (embedding), 8001 (chat)
- First run may take time to download models
"""
import argparse
from argparse import Namespace
from typing import Any
from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
from llama_index.llms.openai_like import OpenAILike
from llama_index.readers.web import SimpleWebPageReader
from llama_index.vector_stores.milvus import MilvusVectorStore
def init_config(args: Namespace):
"""Initialize configuration with command line arguments"""
return {
"url": args.url,
"embedding_model": args.embedding_model,
"chat_model": args.chat_model,
"vllm_api_key": args.vllm_api_key,
"embedding_endpoint": args.embedding_endpoint,
"chat_endpoint": args.chat_endpoint,
"db_path": args.db_path,
"chunk_size": args.chunk_size,
"chunk_overlap": args.chunk_overlap,
"top_k": args.top_k,
}
def load_documents(url: str) -> list:
"""Load and process web documents"""
return SimpleWebPageReader(html_to_text=True).load_data([url])
def setup_models(config: dict[str, Any]):
"""Configure embedding and chat models"""
Settings.embed_model = OpenAILikeEmbedding(
api_base=config["embedding_endpoint"],
api_key=config["vllm_api_key"],
model_name=config["embedding_model"],
)
Settings.llm = OpenAILike(
model=config["chat_model"],
api_key=config["vllm_api_key"],
api_base=config["chat_endpoint"],
context_window=128000,
is_chat_model=True,
is_function_calling_model=False,
)
Settings.transformations = [
SentenceSplitter(
chunk_size=config["chunk_size"],
chunk_overlap=config["chunk_overlap"],
)
]
def setup_vector_store(db_path: str) -> MilvusVectorStore:
"""Initialize vector store"""
sample_emb = Settings.embed_model.get_text_embedding("test")
print(f"Embedding dimension: {len(sample_emb)}")
return MilvusVectorStore(uri=db_path, dim=len(sample_emb), overwrite=True)
def create_index(documents: list, vector_store: MilvusVectorStore):
"""Create document index"""
storage_context = StorageContext.from_defaults(vector_store=vector_store)
return VectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
)
def query_document(index: VectorStoreIndex, question: str, top_k: int):
"""Query document with given question"""
query_engine = index.as_query_engine(similarity_top_k=top_k)
return query_engine.query(question)
def get_parser() -> argparse.ArgumentParser:
"""Parse command line arguments"""
parser = argparse.ArgumentParser(description="RAG with vLLM and LlamaIndex")
# Add command line arguments
parser.add_argument(
"--url",
default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"),
help="URL of the document to process",
)
parser.add_argument(
"--embedding-model",
default="ssmits/Qwen2-7B-Instruct-embed-base",
help="Model name for embeddings",
)
parser.add_argument(
"--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
)
parser.add_argument(
"--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
)
parser.add_argument(
"--embedding-endpoint",
default="http://localhost:8000/v1",
help="Base URL for embedding service",
)
parser.add_argument(
"--chat-endpoint",
default="http://localhost:8001/v1",
help="Base URL for chat service",
)
parser.add_argument(
"--db-path", default="./milvus_demo.db", help="Path to Milvus database"
)
parser.add_argument(
"-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
)
parser.add_argument(
"-c",
"--chunk-size",
type=int,
default=1000,
help="Chunk size for document splitting",
)
parser.add_argument(
"-o",
"--chunk-overlap",
type=int,
default=200,
help="Chunk overlap for document splitting",
)
parser.add_argument(
"-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
)
return parser
def main():
# Parse command line arguments
args = get_parser().parse_args()
# Initialize configuration
config = init_config(args)
# Load documents
documents = load_documents(config["url"])
# Setup models
setup_models(config)
# Setup vector store
vector_store = setup_vector_store(config["db_path"])
# Create index
index = create_index(documents, vector_store)
if args.interactive:
print("\nEntering interactive mode. Type 'quit' to exit.")
while True:
# Get user question
question = input("\nEnter your question: ")
# Check for exit command
if question.lower() in ["quit", "exit", "q"]:
print("Exiting interactive mode...")
break
# Get and print response
print("\n" + "-" * 50)
print("Response:\n")
response = query_document(index, question, config["top_k"])
print(response)
print("-" * 50)
else:
# Single query mode
question = "How to install vLLM?"
response = query_document(index, question, config["top_k"])
print("-" * 50)
print("Response:\n")
print(response)
print("-" * 50)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,131 @@
#!/bin/bash
#
# Launch a Ray cluster inside Docker for vLLM inference.
#
# This script can start either a head node or a worker node, depending on the
# --head or --worker flag provided as the third positional argument.
#
# Usage:
# 1. Designate one machine as the head node and execute:
# bash run_cluster.sh \
# vllm/vllm-openai \
# <head_node_ip> \
# --head \
# /abs/path/to/huggingface/cache \
# -e VLLM_HOST_IP=<head_node_ip>
#
# 2. On every worker machine, execute:
# bash run_cluster.sh \
# vllm/vllm-openai \
# <head_node_ip> \
# --worker \
# /abs/path/to/huggingface/cache \
# -e VLLM_HOST_IP=<worker_node_ip>
#
# Each worker requires a unique VLLM_HOST_IP value.
# Keep each terminal session open. Closing a session stops the associated Ray
# node and thereby shuts down the entire cluster.
# Every machine must be reachable at the supplied IP address.
#
# The container is named "node-<random_suffix>". To open a shell inside
# a container after launch, use:
# docker exec -it node-<random_suffix> /bin/bash
#
# Then, you can execute vLLM commands on the Ray cluster as if it were a
# single machine, e.g. vllm serve ...
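# For example (model name and parallel sizes below are illustrative only),
# inside the head node's container you might run:
#   vllm serve deepseek-ai/DeepSeek-R1 \
#     --tensor-parallel-size 8 \
#     --pipeline-parallel-size 2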
#
# To stop the container, use:
# docker stop node-<random_suffix>
# Check for minimum number of required arguments.
if [ $# -lt 4 ]; then
echo "Usage: $0 docker_image head_node_ip --head|--worker path_to_hf_home [additional_args...]"
exit 1
fi
# Extract the mandatory positional arguments and remove them from $@.
DOCKER_IMAGE="$1"
HEAD_NODE_ADDRESS="$2"
NODE_TYPE="$3" # Should be --head or --worker.
PATH_TO_HF_HOME="$4"
shift 4
# Preserve any extra arguments so they can be forwarded to Docker.
ADDITIONAL_ARGS=("$@")
# Validate the NODE_TYPE argument.
if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
echo "Error: Node type must be --head or --worker"
exit 1
fi
# Extract VLLM_HOST_IP from ADDITIONAL_ARGS (e.g. "-e VLLM_HOST_IP=...").
VLLM_HOST_IP=""
for ((i = 0; i < ${#ADDITIONAL_ARGS[@]}; i++)); do
arg="${ADDITIONAL_ARGS[$i]}"
case "${arg}" in
-e)
next="${ADDITIONAL_ARGS[$((i + 1))]:-}"
if [[ "${next}" == VLLM_HOST_IP=* ]]; then
VLLM_HOST_IP="${next#VLLM_HOST_IP=}"
break
fi
;;
-eVLLM_HOST_IP=* | VLLM_HOST_IP=*)
VLLM_HOST_IP="${arg#*=}"
break
;;
esac
done
# For the head node, HEAD_NODE_ADDRESS and VLLM_HOST_IP should be consistent.
if [[ "${NODE_TYPE}" == "--head" && -n "${VLLM_HOST_IP}" ]]; then
if [[ "${VLLM_HOST_IP}" != "${HEAD_NODE_ADDRESS}" ]]; then
echo "Warning: VLLM_HOST_IP (${VLLM_HOST_IP}) differs from head_node_ip (${HEAD_NODE_ADDRESS})."
echo "Using VLLM_HOST_IP as the head node address."
HEAD_NODE_ADDRESS="${VLLM_HOST_IP}"
fi
fi
# Generate a unique container name with random suffix.
# Docker container names must be unique on each host.
# The random suffix allows multiple Ray containers to run simultaneously on the same machine,
# for example, on a multi-GPU machine.
CONTAINER_NAME="node-${RANDOM}"
# Define a cleanup routine that removes the container when the script exits.
# This prevents orphaned containers from accumulating if the script is interrupted.
cleanup() {
docker stop "${CONTAINER_NAME}"
docker rm "${CONTAINER_NAME}"
}
trap cleanup EXIT
# Build the Ray start command based on the node role.
# The head node manages the cluster and accepts connections on port 6379,
# while workers connect to the head's address.
RAY_START_CMD="ray start --block"
if [ "${NODE_TYPE}" == "--head" ]; then
RAY_START_CMD+=" --head --node-ip-address=${HEAD_NODE_ADDRESS} --port=6379"
else
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
if [ -n "${VLLM_HOST_IP}" ]; then
RAY_START_CMD+=" --node-ip-address=${VLLM_HOST_IP}"
fi
fi
# Launch the container with the assembled parameters.
# --network host: Allows Ray nodes to communicate directly via host networking
# --shm-size 10.24g: Increases shared memory
# --gpus all: Gives container access to all GPUs on the host
# -v HF_HOME: Mounts HuggingFace cache to avoid re-downloading models
docker run \
--entrypoint /bin/bash \
--network host \
--name "${CONTAINER_NAME}" \
--shm-size 10.24g \
--gpus all \
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
"${ADDITIONAL_ARGS[@]}" \
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}"

View File

@@ -0,0 +1,24 @@
#!/bin/bash
# Define the prefix for environment variables to look for
PREFIX="SM_VLLM_"
ARG_PREFIX="--"
# Initialize an array for storing the arguments
# Port 8080 is required by SageMaker: https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response
ARGS=(--port 8080)
# Loop through all environment variables
while IFS='=' read -r key value; do
# Remove the prefix from the key, convert to lowercase, and replace underscores with dashes
arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
# Add the argument name and value to the ARGS array
ARGS+=("${ARG_PREFIX}${arg_name}")
if [ -n "$value" ]; then
ARGS+=("$value")
fi
done < <(env | grep "^${PREFIX}")
# Pass the collected arguments to the main entrypoint
exec vllm serve "${ARGS[@]}"

View File

@@ -0,0 +1,311 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
vLLM Chat Assistant - A Streamlit Web Interface
A streamlined chat interface that integrates quickly
with the vLLM API server.
Features:
- Management of multiple chat sessions
- Streaming response display
- Configurable API endpoint
- Real-time chat history
- Reasoning Display: Optional thinking process visualization
Requirements:
pip install streamlit openai
Usage:
# Start the app with default settings
streamlit run streamlit_openai_chatbot_webserver.py
# Start with custom vLLM API endpoint
VLLM_API_BASE="http://your-server:8000/v1" \
streamlit run streamlit_openai_chatbot_webserver.py
# Enable debug mode
streamlit run streamlit_openai_chatbot_webserver.py \
--logger.level=debug
"""
import os
from datetime import datetime
import streamlit as st
from openai import OpenAI
# Get command line arguments from environment variables
openai_api_key = os.getenv("VLLM_API_KEY", "EMPTY")
openai_api_base = os.getenv("VLLM_API_BASE", "http://localhost:8000/v1")
# Initialize session states for managing chat sessions
if "sessions" not in st.session_state:
st.session_state.sessions = {}
if "current_session" not in st.session_state:
st.session_state.current_session = None
if "messages" not in st.session_state:
st.session_state.messages = []
if "active_session" not in st.session_state:
st.session_state.active_session = None
# Add new session state for reasoning
if "show_reasoning" not in st.session_state:
st.session_state.show_reasoning = {}
# Initialize session state for API base URL
if "api_base_url" not in st.session_state:
st.session_state.api_base_url = openai_api_base
def create_new_chat_session():
"""Create a new chat session with timestamp as unique identifier.
This function initializes a new chat session by:
1. Generating a timestamp-based session ID
2. Creating an empty message list for the new session
3. Setting the new session as both current and active session
4. Resetting the messages list for the new session
Returns:
None
Session State Updates:
- sessions: Adds new empty message list with timestamp key
- current_session: Sets to new session ID
- active_session: Sets to new session ID
- messages: Resets to empty list
"""
session_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
st.session_state.sessions[session_id] = []
st.session_state.current_session = session_id
st.session_state.active_session = session_id
st.session_state.messages = []
def switch_to_chat_session(session_id):
"""Switch the active chat context to a different session.
Args:
session_id (str): The timestamp ID of the session to switch to
This function handles chat session switching by:
1. Setting the specified session as current
2. Updating the active session marker
3. Loading the messages history from the specified session
Session State Updates:
- current_session: Updated to specified session_id
- active_session: Updated to specified session_id
- messages: Loaded from sessions[session_id]
"""
st.session_state.current_session = session_id
st.session_state.active_session = session_id
st.session_state.messages = st.session_state.sessions[session_id]
def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None):
"""Generate and stream LLM response with optional reasoning process.
Args:
messages (list): List of conversation message dicts with 'role' and 'content'
model (str): The model identifier to use for generation
reason (bool): Whether to enable and display reasoning process
content_ph (streamlit.empty): Placeholder for streaming response content
reasoning_ph (streamlit.empty): Placeholder for streaming reasoning process
Returns:
tuple: (str, str)
- First string contains the complete response text
- Second string contains the complete reasoning text (if enabled)
Features:
- Streams both reasoning and response text in real-time
- Handles model API errors gracefully
- Supports live updating of thinking process
- Maintains separate content and reasoning displays
Raises:
Exception: Wrapped in error message if API call fails
Note:
The function uses streamlit placeholders for live updates.
When reason=True, the reasoning process appears above the response.
"""
full_text = ""
think_text = ""
live_think = None
# Build request parameters
params = {"model": model, "messages": messages, "stream": True}
if reason:
params["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}
try:
response = client.chat.completions.create(**params)
if isinstance(response, str):
if content_ph:
content_ph.markdown(response)
return response, ""
# Prepare reasoning expander above content
if reason and reasoning_ph:
exp = reasoning_ph.expander("💭 Thinking Process (live)", expanded=True)
live_think = exp.empty()
# Stream chunks
for chunk in response:
delta = chunk.choices[0].delta
# Stream reasoning first
if reason and hasattr(delta, "reasoning") and live_think:
rc = delta.reasoning
if rc:
think_text += rc
live_think.markdown(think_text + "▌")  # "▌" renders a typing cursor while streaming
# Then stream content
if hasattr(delta, "content") and delta.content and content_ph:
full_text += delta.content
content_ph.markdown(full_text + "▌")
# Finalize displays: reasoning remains above, content below
if reason and live_think:
live_think.markdown(think_text)
if content_ph:
content_ph.markdown(full_text)
return full_text, think_text
except Exception as e:
st.error(f"Error details: {str(e)}")
return f"Error: {str(e)}", ""
# Sidebar - API Settings first
st.sidebar.title("API Settings")
new_api_base = st.sidebar.text_input(
"API Base URL:", value=st.session_state.api_base_url
)
if new_api_base != st.session_state.api_base_url:
st.session_state.api_base_url = new_api_base
st.rerun()
st.sidebar.divider()
# Sidebar - Session Management
st.sidebar.title("Chat Sessions")
if st.sidebar.button("New Session"):
create_new_chat_session()
# Display all sessions in reverse chronological order
for session_id in sorted(st.session_state.sessions.keys(), reverse=True):
# Mark the active session with a pinned button
if session_id == st.session_state.active_session:
st.sidebar.button(
f"📍 {session_id}",
key=session_id,
type="primary",
on_click=switch_to_chat_session,
args=(session_id,),
)
else:
st.sidebar.button(
f"Session {session_id}",
key=session_id,
on_click=switch_to_chat_session,
args=(session_id,),
)
# Main interface
st.title("vLLM Chat Assistant")
# Initialize OpenAI client with API settings
client = OpenAI(api_key=openai_api_key, base_url=st.session_state.api_base_url)
# Get and display current model id
models = client.models.list()
model = models.data[0].id
st.markdown(f"**Model**: {model}")
# Initialize first session if none exists
if st.session_state.current_session is None:
create_new_chat_session()
st.session_state.active_session = st.session_state.current_session
# Update the chat history display section
for idx, msg in enumerate(st.session_state.messages):
# Render user messages normally
if msg["role"] == "user":
with st.chat_message("user"):
st.write(msg["content"])
# Render assistant messages with reasoning above
else:
# If reasoning exists for this assistant message, show it above the content
if idx in st.session_state.show_reasoning:
with st.expander("💭 Thinking Process", expanded=False):
st.markdown(st.session_state.show_reasoning[idx])
with st.chat_message("assistant"):
st.write(msg["content"])
# Setup & Cache reasoning support check
@st.cache_data(show_spinner=False)
def server_supports_reasoning():
"""Check if the current model supports reasoning capability.
Returns:
bool: True if the model supports reasoning, False otherwise
"""
resp = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": "Hi"}],
stream=False,
)
return hasattr(resp.choices[0].message, "reasoning") and bool(
resp.choices[0].message.reasoning
)
# Check support
supports_reasoning = server_supports_reasoning()
# Add reasoning toggle in sidebar if supported
reason = False # Default to False
if supports_reasoning:
reason = st.sidebar.checkbox("Enable Reasoning", value=False)
else:
st.sidebar.markdown(
"<span style='color:gray;'>Reasoning unavailable for this model.</span>",
unsafe_allow_html=True,
)
# reason remains False
# Update the input handling section
if prompt := st.chat_input("Type your message here..."):
# Save and display user message
st.session_state.messages.append({"role": "user", "content": prompt})
st.session_state.sessions[st.session_state.current_session] = (
st.session_state.messages
)
with st.chat_message("user"):
st.write(prompt)
# Prepare LLM messages
msgs = [
{"role": m["role"], "content": m["content"]} for m in st.session_state.messages
]
# Stream assistant response
with st.chat_message("assistant"):
# Placeholders: reasoning above, content below
reason_ph = st.empty()
content_ph = st.empty()
full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph)
# Determine index for this new assistant message
message_index = len(st.session_state.messages)
# Save assistant reply
st.session_state.messages.append({"role": "assistant", "content": full})
# Persist reasoning in session state if any
if reason and think:
st.session_state.show_reasoning[message_index] = think

View File

@@ -0,0 +1,58 @@
# Structured Outputs
This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server.
It can run an individual constraint type or all of them.
It supports both streaming responses and concurrent non-streaming requests.
To use this example, you must start a vLLM server with any model of your choice.
```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```
To serve a reasoning model, you can use the following command:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
--reasoning-parser deepseek_r1
```
If you want to run this script standalone with `uv`, you can use the following:
```bash
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
structured-outputs
```
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
!!! tip
If vLLM is running remotely, then set `OPENAI_BASE_URL=<remote_url>` before running the script.
## Usage
Run all constraints, non-streaming:
```bash
uv run structured_outputs.py
```
Run all constraints, streaming:
```bash
uv run structured_outputs.py --stream
```
Run certain constraints, for example `structural_tag` and `regex`, streaming:
```bash
uv run structured_outputs.py \
--constraint structural_tag regex \
--stream
```
Run all constraints, with reasoning models and streaming:
```bash
uv run structured_outputs.py --reasoning --stream
```
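Run all constraints against a remote server (the address below is a placeholder; the script reads `OPENAI_BASE_URL`):

```bash
OPENAI_BASE_URL=http://<remote_host>:8000/v1 uv run structured_outputs.py
```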

View File

@@ -0,0 +1,8 @@
[project]
name = "examples-online-structured-outputs"
requires-python = ">=3.10, <3.14"
dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
version = "0.0.0"
[project.scripts]
structured-outputs = "structured_outputs:main"

View File

@@ -0,0 +1,268 @@
# ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import asyncio
import enum
import os
from typing import Any, Literal
import openai
import pydantic
from openai.types.chat import ChatCompletionChunk
ConstraintsFormat = Literal[
"choice",
"regex",
"json",
"grammar",
"structural_tag",
]
async def print_stream_response(
stream_response: openai.AsyncStream[ChatCompletionChunk],
title: str,
args: argparse.Namespace,
):
print(f"\n\n{title} (Streaming):")
local_reasoning_header_printed = False
local_content_header_printed = False
async for chunk in stream_response:
delta = chunk.choices[0].delta
reasoning_chunk_text: str | None = getattr(delta, "reasoning", None)
content_chunk_text = delta.content
if args.reasoning:
if reasoning_chunk_text:
if not local_reasoning_header_printed:
print(" Reasoning: ", end="")
local_reasoning_header_printed = True
print(reasoning_chunk_text, end="", flush=True)
if content_chunk_text:
if not local_content_header_printed:
if local_reasoning_header_printed:
print()
print(" Content: ", end="")
local_content_header_printed = True
print(content_chunk_text, end="", flush=True)
else:
if content_chunk_text:
if not local_content_header_printed:
print(" Content: ", end="")
local_content_header_printed = True
print(content_chunk_text, end="", flush=True)
print()
class CarType(str, enum.Enum):
SEDAN = "SEDAN"
SUV = "SUV"
TRUCK = "TRUCK"
COUPE = "COUPE"
class CarDescription(pydantic.BaseModel):
brand: str
model: str
car_type: CarType
PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"choice": {
"messages": [
{
"role": "user",
"content": "Classify this sentiment: vLLM is wonderful!",
}
],
"extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
},
"regex": {
"messages": [
{
"role": "user",
"content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
}
],
"extra_body": {
"structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
},
},
"json": {
"messages": [
{
"role": "user",
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "car-description",
"schema": CarDescription.model_json_schema(),
},
},
},
"grammar": {
"messages": [
{
"role": "user",
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
}
],
"extra_body": {
"structured_outputs": {
"grammar": """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
""",
}
},
},
"structural_tag": {
"messages": [
{
"role": "user",
"content": """
You have access to the following function to retrieve the weather in a city:
{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}
If you choose to call a function, ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.
Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?""",
},
],
"response_format": {
"type": "structural_tag",
"structures": [
{
"begin": "<function=get_weather>",
"schema": {
"type": "object",
"properties": {"city": {"type": "string"}},
"required": ["city"],
},
"end": "</function>",
}
],
"triggers": ["<function="],
},
},
}
async def cli():
parser = argparse.ArgumentParser(
description="Run OpenAI Chat Completion with various structured outputs capabilities",
)
_ = parser.add_argument(
"--constraint",
type=str,
nargs="+",
choices=[*list(PARAMS), "*"],
default=["*"],
help="Specify which constraint(s) to run.",
)
_ = parser.add_argument(
"--stream",
action=argparse.BooleanOptionalAction,
default=False,
help="Enable streaming output",
)
_ = parser.add_argument(
"--reasoning",
action=argparse.BooleanOptionalAction,
default=False,
help="Enable printing of reasoning traces if available.",
)
args = parser.parse_args()
base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")
constraints = list(PARAMS) if "*" in args.constraint else list(set(args.constraint))
model = (await client.models.list()).data[0].id
if args.stream:
results = await asyncio.gather(
*[
client.chat.completions.create(
model=model,
max_tokens=1024,
stream=True,
**PARAMS[name],
)
for name in constraints
]
)
for constraint, stream in zip(constraints, results):
await print_stream_response(stream, constraint, args)
else:
results = await asyncio.gather(
*[
client.chat.completions.create(
model=model,
max_tokens=1024,
stream=False,
**PARAMS[name],
)
for name in constraints
]
)
for constraint, response in zip(constraints, results):
print(f"\n\n{constraint}:")
message = response.choices[0].message
if args.reasoning and hasattr(message, "reasoning"):
print(f" Reasoning: {message.reasoning or ''}")
print(f" Content: {message.content!r}")
def main():
asyncio.run(cli())
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import httpx
from transformers import AutoTokenizer
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
DUMMY_API_KEY = "empty"
MODEL_NAME = "Qwen/Qwen3-0.6B"
transport = httpx.HTTPTransport()
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
client = httpx.Client(
transport=transport,
base_url=GEN_ENDPOINT,
timeout=600,
headers=headers,
)
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "How many countries are in the EU?"},
]
def main(client):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
token_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
enable_thinking=False,
)
payload = {
"model": MODEL_NAME,
"token_ids": token_ids,
"sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False},
"stream": False,
}
resp = client.post(GEN_ENDPOINT, json=payload)
resp.raise_for_status()
data = resp.json()
print(data)
print("-" * 50)
print("Token generation results:")
res = tokenizer.decode(data["choices"][0]["token_ids"])
print(res)
print("-" * 50)
if __name__ == "__main__":
main(client)

View File

@@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai import APIConnectionError, OpenAI
from openai.pagination import SyncPage
from openai.types.model import Model
def get_first_model(client: OpenAI) -> str:
"""
Get the first model from the vLLM server.
"""
try:
models: SyncPage[Model] = client.models.list()
except APIConnectionError as e:
raise RuntimeError(
"Failed to get the list of models from the vLLM server at "
f"{client.base_url} with API key {client.api_key}. Check\n"
"1. the server is running\n"
"2. the server URL is correct\n"
"3. the API key is correct"
) from e
if len(models.data) == 0:
raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")
return models.data[0].id
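# Minimal usage sketch (the endpoint and API key below are placeholders for a
# locally running server):
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
#   print(get_first_model(client))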