Sync from v0.13
This commit is contained in:
93
examples/online_serving/api_client.py
Normal file
93
examples/online_serving/api_client.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Example Python client for `vllm.entrypoints.api_server`
|
||||
Start the demo server:
|
||||
python -m vllm.entrypoints.api_server --model <model_name>
|
||||
|
||||
NOTE: The API server is used only for demonstration and simple performance
|
||||
benchmarks. It is not intended for production use.
|
||||
For production use, we recommend `vllm serve` and the OpenAI client API.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from argparse import Namespace
|
||||
from collections.abc import Iterable
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def clear_line(n: int = 1) -> None:
    """Erase the previous *n* lines of terminal output using ANSI escapes."""
    cursor_up = "\033[1A"
    erase_line = "\x1b[2K"
    for _ in range(n):
        print(cursor_up, end=erase_line, flush=True)
|
||||
|
||||
|
||||
def post_http_request(
    prompt: str,
    api_url: str,
    n: int = 1,
    stream: bool = False,
    timeout=None,
) -> requests.Response:
    """POST a generation request to the demo API server.

    Args:
        prompt: Text prompt to complete.
        api_url: Full URL of the server's ``/generate`` endpoint.
        n: Number of beam candidates to request.
        stream: Whether to stream the response incrementally.
        timeout: Optional per-request timeout in seconds. ``None`` (the
            default) keeps the previous behavior of waiting indefinitely.

    Returns:
        The ``requests.Response`` for the request (streaming if ``stream``).
    """
    headers = {"User-Agent": "Test Client"}
    pload = {
        "prompt": prompt,
        "n": n,
        "temperature": 0.0,
        "max_tokens": 16,
        "stream": stream,
    }
    # Without a timeout, requests.post can block forever on an unresponsive
    # server; expose it as an opt-in parameter to stay backward compatible.
    response = requests.post(
        api_url, headers=headers, json=pload, stream=stream, timeout=timeout
    )
    return response
|
||||
|
||||
|
||||
def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
    """Yield the candidate texts from each chunk of a streaming response.

    Each non-empty, newline-delimited chunk is a JSON object whose ``"text"``
    field holds the list of beam candidates generated so far.
    """
    chunks = response.iter_lines(
        chunk_size=8192, decode_unicode=False, delimiter=b"\n"
    )
    for raw_chunk in chunks:
        if not raw_chunk:
            continue
        payload = json.loads(raw_chunk.decode("utf-8"))
        yield payload["text"]
|
||||
|
||||
|
||||
def get_response(response: requests.Response) -> list[str]:
    """Return the list of generated beam candidates from a completed response."""
    payload = json.loads(response.content)
    return payload["text"]
|
||||
|
||||
|
||||
def parse_args():
    """Build and parse the command-line arguments for the demo client."""
    parser = argparse.ArgumentParser()
    # (flag, value type, default) — registered in display order.
    option_specs = [
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--n", int, 1),
        ("--prompt", str, "San Francisco is a"),
    ]
    for flag, value_type, default in option_specs:
        parser.add_argument(flag, type=value_type, default=default)
    parser.add_argument("--stream", action="store_true")
    return parser.parse_args()
|
||||
|
||||
|
||||
def main(args: Namespace):
    """Send one request to the demo server and print the beam candidates.

    In streaming mode, previously printed candidates are erased and redrawn
    as each incremental update arrives.
    """
    prompt = args.prompt
    api_url = f"http://{args.host}:{args.port}/generate"

    print(f"Prompt: {prompt!r}\n", flush=True)
    response = post_http_request(prompt, api_url, args.n, args.stream)

    if not args.stream:
        # Non-streaming: a single, final set of candidates.
        for i, line in enumerate(get_response(response)):
            print(f"Beam candidate {i}: {line!r}", flush=True)
        return

    printed_lines = 0
    for candidates in get_streaming_response(response):
        # Replace the previous snapshot with the latest one.
        clear_line(printed_lines)
        printed_lines = 0
        for i, line in enumerate(candidates):
            printed_lines += 1
            print(f"Beam candidate {i}: {line!r}", flush=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: parse CLI arguments, then run the demo client.
    main(parse_args())
|
||||
6
examples/online_serving/chart-helm/.helmignore
Normal file
6
examples/online_serving/chart-helm/.helmignore
Normal file
@@ -0,0 +1,6 @@
|
||||
*.png
|
||||
.git/
|
||||
ct.yaml
|
||||
lintconf.yaml
|
||||
values.schema.json
|
||||
/workflows
|
||||
21
examples/online_serving/chart-helm/Chart.yaml
Normal file
21
examples/online_serving/chart-helm/Chart.yaml
Normal file
@@ -0,0 +1,21 @@
|
||||
apiVersion: v2
|
||||
name: chart-vllm
|
||||
description: Chart vllm
|
||||
|
||||
# A chart can be either an 'application' or a 'library' chart.
|
||||
#
|
||||
# Application charts are a collection of templates that can be packaged into versioned archives
|
||||
# to be deployed.
|
||||
#
|
||||
# Library charts provide useful utilities or functions for the chart developer. They're included as
|
||||
# a dependency of application charts to inject those utilities and functions into the rendering
|
||||
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
|
||||
type: application
|
||||
|
||||
# This is the chart version. This version number should be incremented each time you make changes
|
||||
# to the chart and its templates, including the app version.
|
||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||
version: 0.0.1
|
||||
|
||||
maintainers:
|
||||
- name: mfournioux
|
||||
33
examples/online_serving/chart-helm/README.md
Normal file
33
examples/online_serving/chart-helm/README.md
Normal file
@@ -0,0 +1,33 @@
|
||||
# Helm Charts
|
||||
|
||||
This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more.
|
||||
|
||||
## Files
|
||||
|
||||
- Chart.yaml: Defines the chart metadata including name, version, and maintainers.
|
||||
- ct.yaml: Configuration for chart testing.
|
||||
- lintconf.yaml: Linting rules for YAML files.
|
||||
- values.schema.json: JSON schema for validating values.yaml.
|
||||
- values.yaml: Default values for the Helm chart.
|
||||
- templates/_helpers.tpl: Helper templates for defining common configurations.
|
||||
- templates/configmap.yaml: Template for creating ConfigMaps.
|
||||
- templates/custom-objects.yaml: Template for custom Kubernetes objects.
|
||||
- templates/deployment.yaml: Template for creating Deployments.
|
||||
- templates/hpa.yaml: Template for Horizontal Pod Autoscaler.
|
||||
- templates/job.yaml: Template for Kubernetes Jobs.
|
||||
- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget.
|
||||
- templates/pvc.yaml: Template for Persistent Volume Claims.
|
||||
- templates/secrets.yaml: Template for Kubernetes Secrets.
|
||||
- templates/service.yaml: Template for creating Services.
|
||||
|
||||
## Running Tests
|
||||
|
||||
This chart includes unit tests using [helm-unittest](https://github.com/helm-unittest/helm-unittest). Install the plugin and run tests:
|
||||
|
||||
```bash
|
||||
# Install plugin
|
||||
helm plugin install https://github.com/helm-unittest/helm-unittest
|
||||
|
||||
# Run tests
|
||||
helm unittest .
|
||||
```
|
||||
3
examples/online_serving/chart-helm/ct.yaml
Normal file
3
examples/online_serving/chart-helm/ct.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
chart-dirs:
|
||||
- charts
|
||||
validate-maintainers: false
|
||||
42
examples/online_serving/chart-helm/lintconf.yaml
Normal file
42
examples/online_serving/chart-helm/lintconf.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
---
|
||||
rules:
|
||||
braces:
|
||||
min-spaces-inside: 0
|
||||
max-spaces-inside: 0
|
||||
min-spaces-inside-empty: -1
|
||||
max-spaces-inside-empty: -1
|
||||
brackets:
|
||||
min-spaces-inside: 0
|
||||
max-spaces-inside: 0
|
||||
min-spaces-inside-empty: -1
|
||||
max-spaces-inside-empty: -1
|
||||
colons:
|
||||
max-spaces-before: 0
|
||||
max-spaces-after: 1
|
||||
commas:
|
||||
max-spaces-before: 0
|
||||
min-spaces-after: 1
|
||||
max-spaces-after: 1
|
||||
comments:
|
||||
require-starting-space: true
|
||||
min-spaces-from-content: 2
|
||||
document-end: disable
|
||||
document-start: disable # No --- to start a file
|
||||
empty-lines:
|
||||
max: 2
|
||||
max-start: 0
|
||||
max-end: 0
|
||||
hyphens:
|
||||
max-spaces-after: 1
|
||||
indentation:
|
||||
spaces: consistent
|
||||
indent-sequences: whatever # - list indentation will handle both indentation and without
|
||||
check-multi-line-strings: false
|
||||
key-duplicates: enable
|
||||
line-length: disable # Lines can be any length
|
||||
new-line-at-end-of-file: disable
|
||||
new-lines:
|
||||
type: unix
|
||||
trailing-spaces: enable
|
||||
truthy:
|
||||
level: warning
|
||||
165
examples/online_serving/chart-helm/templates/_helpers.tpl
Normal file
165
examples/online_serving/chart-helm/templates/_helpers.tpl
Normal file
@@ -0,0 +1,165 @@
|
||||
{{/*
|
||||
Define ports for the pods
|
||||
*/}}
|
||||
{{- define "chart.container-port" -}}
|
||||
{{- default "8000" .Values.containerPort }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Define service name
|
||||
*/}}
|
||||
{{- define "chart.service-name" -}}
|
||||
{{- if .Values.serviceName }}
|
||||
{{- .Values.serviceName | lower | trim }}
|
||||
{{- else }}
|
||||
"{{ .Release.Name }}-service"
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Define service port
|
||||
*/}}
|
||||
{{- define "chart.service-port" -}}
|
||||
{{- if .Values.servicePort }}
|
||||
{{- .Values.servicePort }}
|
||||
{{- else }}
|
||||
{{- include "chart.container-port" . }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Define service port name
|
||||
*/}}
|
||||
{{- define "chart.service-port-name" -}}
|
||||
"service-port"
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Define container port name
|
||||
*/}}
|
||||
{{- define "chart.container-port-name" -}}
|
||||
"container-port"
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Define deployment strategy
|
||||
*/}}
|
||||
{{- define "chart.strategy" -}}
|
||||
strategy:
|
||||
{{- if not .Values.deploymentStrategy }}
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 0
|
||||
{{- else }}
|
||||
{{ toYaml .Values.deploymentStrategy | indent 2 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Define additional ports
|
||||
*/}}
|
||||
{{- define "chart.extraPorts" }}
|
||||
{{- with .Values.extraPorts }}
|
||||
{{ toYaml . }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Define chart external ConfigMaps and Secrets
|
||||
*/}}
|
||||
{{- define "chart.externalConfigs" -}}
|
||||
{{- with .Values.externalConfigs -}}
|
||||
{{ toYaml . }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
|
||||
{{/*
|
||||
Define liveness and readiness probes
|
||||
*/}}
|
||||
{{- define "chart.probes" -}}
|
||||
{{- if .Values.readinessProbe }}
|
||||
readinessProbe:
|
||||
{{- with .Values.readinessProbe }}
|
||||
{{- toYaml . | nindent 2 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.livenessProbe }}
|
||||
livenessProbe:
|
||||
{{- with .Values.livenessProbe }}
|
||||
{{- toYaml . | nindent 2 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Define resources
|
||||
*/}}
|
||||
{{- define "chart.resources" -}}
|
||||
requests:
|
||||
memory: {{ required "Value 'resources.requests.memory' must be defined !" .Values.resources.requests.memory | quote }}
|
||||
cpu: {{ required "Value 'resources.requests.cpu' must be defined !" .Values.resources.requests.cpu | quote }}
|
||||
{{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
|
||||
nvidia.com/gpu: {{ required "Value 'resources.requests.nvidia.com/gpu' must be defined !" (index .Values.resources.requests "nvidia.com/gpu") | quote }}
|
||||
{{- end }}
|
||||
limits:
|
||||
memory: {{ required "Value 'resources.limits.memory' must be defined !" .Values.resources.limits.memory | quote }}
|
||||
cpu: {{ required "Value 'resources.limits.cpu' must be defined !" .Values.resources.limits.cpu | quote }}
|
||||
{{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
|
||||
nvidia.com/gpu: {{ required "Value 'resources.limits.nvidia.com/gpu' must be defined !" (index .Values.resources.limits "nvidia.com/gpu") | quote }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
|
||||
{{/*
|
||||
Define User used for the main container
|
||||
*/}}
|
||||
{{- define "chart.user" }}
|
||||
{{- if .Values.image.runAsUser }}
|
||||
runAsUser:
|
||||
{{- with .Values.runAsUser }}
|
||||
{{- toYaml . | nindent 2 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
|
||||
{{- define "chart.extraInitEnv" -}}
|
||||
- name: S3_ENDPOINT_URL
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ .Release.Name }}-secrets
|
||||
key: s3endpoint
|
||||
- name: S3_BUCKET_NAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ .Release.Name }}-secrets
|
||||
key: s3bucketname
|
||||
- name: AWS_ACCESS_KEY_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ .Release.Name }}-secrets
|
||||
key: s3accesskeyid
|
||||
- name: AWS_SECRET_ACCESS_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ .Release.Name }}-secrets
|
||||
key: s3accesskey
|
||||
{{- if .Values.extraInit.s3modelpath }}
|
||||
- name: S3_PATH
|
||||
value: "{{ .Values.extraInit.s3modelpath }}"
|
||||
{{- end }}
|
||||
{{- if hasKey .Values.extraInit "awsEc2MetadataDisabled" }}
|
||||
- name: AWS_EC2_METADATA_DISABLED
|
||||
value: "{{ .Values.extraInit.awsEc2MetadataDisabled }}"
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Define chart labels
|
||||
*/}}
|
||||
{{- define "chart.labels" -}}
|
||||
{{- with .Values.labels -}}
|
||||
{{ toYaml . }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
11
examples/online_serving/chart-helm/templates/configmap.yaml
Normal file
11
examples/online_serving/chart-helm/templates/configmap.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
{{- if .Values.configs -}}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: "{{ .Release.Name }}-configs"
|
||||
namespace: {{ .Release.Namespace }}
|
||||
data:
|
||||
{{- with .Values.configs }}
|
||||
{{- toYaml . | nindent 2 }}
|
||||
{{- end }}
|
||||
{{- end -}}
|
||||
@@ -0,0 +1,6 @@
|
||||
{{- if .Values.customObjects }}
|
||||
{{- range .Values.customObjects }}
|
||||
{{- tpl (. | toYaml) $ }}
|
||||
---
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
131
examples/online_serving/chart-helm/templates/deployment.yaml
Normal file
131
examples/online_serving/chart-helm/templates/deployment.yaml
Normal file
@@ -0,0 +1,131 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: "{{ .Release.Name }}-deployment-vllm"
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "chart.labels" . | nindent 4 }}
|
||||
spec:
|
||||
replicas: {{ .Values.replicaCount }}
|
||||
{{- include "chart.strategy" . | nindent 2 }}
|
||||
selector:
|
||||
matchLabels:
|
||||
environment: "test"
|
||||
release: "test"
|
||||
progressDeadlineSeconds: 1200
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
environment: "test"
|
||||
release: "test"
|
||||
spec:
|
||||
containers:
|
||||
- name: "vllm"
|
||||
image: "{{ required "Required value 'image.repository' must be defined !" .Values.image.repository }}:{{ required "Required value 'image.tag' must be defined !" .Values.image.tag }}"
|
||||
{{- if .Values.image.command }}
|
||||
command :
|
||||
{{- with .Values.image.command }}
|
||||
{{- toYaml . | nindent 10 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
securityContext:
|
||||
{{- if .Values.image.securityContext }}
|
||||
{{- with .Values.image.securityContext }}
|
||||
{{- toYaml . | nindent 12 }}
|
||||
{{- end }}
|
||||
{{- else }}
|
||||
runAsNonRoot: false
|
||||
{{- include "chart.user" . | indent 12 }}
|
||||
{{- end }}
|
||||
imagePullPolicy: IfNotPresent
|
||||
{{- if .Values.image.env }}
|
||||
env :
|
||||
{{- with .Values.image.env }}
|
||||
{{- toYaml . | nindent 10 }}
|
||||
{{- end }}
|
||||
{{- else }}
|
||||
env: []
|
||||
{{- end }}
|
||||
{{- if or .Values.externalConfigs .Values.configs .Values.secrets }}
|
||||
envFrom:
|
||||
{{- if .Values.configs }}
|
||||
- configMapRef:
|
||||
name: "{{ .Release.Name }}-configs"
|
||||
{{- end }}
|
||||
{{- if .Values.secrets}}
|
||||
- secretRef:
|
||||
name: "{{ .Release.Name }}-secrets"
|
||||
{{- end }}
|
||||
{{- include "chart.externalConfigs" . | nindent 12 }}
|
||||
{{- end }}
|
||||
ports:
|
||||
- name: {{ include "chart.container-port-name" . }}
|
||||
containerPort: {{ include "chart.container-port" . }}
|
||||
{{- include "chart.extraPorts" . | nindent 12 }}
|
||||
{{- include "chart.probes" . | indent 10 }}
|
||||
resources: {{- include "chart.resources" . | nindent 12 }}
|
||||
volumeMounts:
|
||||
- name: {{ .Release.Name }}-storage
|
||||
mountPath: /data
|
||||
|
||||
{{- with .Values.extraContainers }}
|
||||
{{ toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
|
||||
{{- if and .Values.extraInit (or .Values.extraInit.modelDownload.enabled .Values.extraInit.initContainers) }}
|
||||
initContainers:
|
||||
{{- if .Values.extraInit.modelDownload.enabled }}
|
||||
- name: wait-download-model
|
||||
image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
|
||||
imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
|
||||
command: {{ .Values.extraInit.modelDownload.waitContainer.command | toJson }}
|
||||
args:
|
||||
{{- toYaml .Values.extraInit.modelDownload.waitContainer.args | nindent 10 }}
|
||||
env:
|
||||
{{- if .Values.extraInit.modelDownload.waitContainer.env }}
|
||||
{{- toYaml .Values.extraInit.modelDownload.waitContainer.env | nindent 10 }}
|
||||
{{- else }}
|
||||
{{- include "chart.extraInitEnv" . | nindent 10 }}
|
||||
{{- end }}
|
||||
resources:
|
||||
requests:
|
||||
cpu: 200m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
volumeMounts:
|
||||
- name: {{ .Release.Name }}-storage
|
||||
mountPath: /data
|
||||
{{- end }}
|
||||
{{- with .Values.extraInit.initContainers }}
|
||||
{{- toYaml . | nindent 6 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
volumes:
|
||||
- name: {{ .Release.Name }}-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ .Release.Name }}-storage-claim
|
||||
|
||||
{{- with .Values.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.tolerations }}
|
||||
tolerations:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
|
||||
runtimeClassName: nvidia
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: nvidia.com/gpu.product
|
||||
operator: In
|
||||
{{- with .Values.gpuModels }}
|
||||
values:
|
||||
{{- toYaml . | nindent 20 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
31
examples/online_serving/chart-helm/templates/hpa.yaml
Normal file
31
examples/online_serving/chart-helm/templates/hpa.yaml
Normal file
@@ -0,0 +1,31 @@
|
||||
{{- if .Values.autoscaling.enabled }}
|
||||
apiVersion: autoscaling/v2
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: "{{ .Release.Name }}-hpa"
|
||||
namespace: {{ .Release.Namespace }}
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: vllm
|
||||
minReplicas: {{ .Values.autoscaling.minReplicas }}
|
||||
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
|
||||
metrics:
|
||||
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
|
||||
{{- end }}
|
||||
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
|
||||
- type: Resource
|
||||
resource:
|
||||
name: memory
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
41
examples/online_serving/chart-helm/templates/job.yaml
Normal file
41
examples/online_serving/chart-helm/templates/job.yaml
Normal file
@@ -0,0 +1,41 @@
|
||||
{{- if and .Values.extraInit .Values.extraInit.modelDownload.enabled }}
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: "{{ .Release.Name }}-init-vllm"
|
||||
namespace: {{ .Release.Namespace }}
|
||||
spec:
|
||||
ttlSecondsAfterFinished: 100
|
||||
template:
|
||||
metadata:
|
||||
name: init-vllm
|
||||
spec:
|
||||
containers:
|
||||
- name: job-download-model
|
||||
image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
|
||||
imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
|
||||
command: {{ .Values.extraInit.modelDownload.downloadJob.command | toJson }}
|
||||
args:
|
||||
{{- toYaml .Values.extraInit.modelDownload.downloadJob.args | nindent 8 }}
|
||||
env:
|
||||
{{- if .Values.extraInit.modelDownload.downloadJob.env }}
|
||||
{{- toYaml .Values.extraInit.modelDownload.downloadJob.env | nindent 8 }}
|
||||
{{- else }}
|
||||
{{- include "chart.extraInitEnv" . | nindent 8 }}
|
||||
{{- end }}
|
||||
volumeMounts:
|
||||
- name: {{ .Release.Name }}-storage
|
||||
mountPath: /data
|
||||
resources:
|
||||
requests:
|
||||
cpu: 200m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
restartPolicy: OnFailure
|
||||
volumes:
|
||||
- name: {{ .Release.Name }}-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: "{{ .Release.Name }}-storage-claim"
|
||||
{{- end }}
|
||||
@@ -0,0 +1,7 @@
|
||||
apiVersion: policy/v1
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
name: "{{ .Release.Name }}-pdb"
|
||||
namespace: {{ .Release.Namespace }}
|
||||
spec:
|
||||
maxUnavailable: {{ default 1 .Values.maxUnavailablePodDisruptionBudget }}
|
||||
13
examples/online_serving/chart-helm/templates/pvc.yaml
Normal file
13
examples/online_serving/chart-helm/templates/pvc.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
{{- if .Values.extraInit }}
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: "{{ .Release.Name }}-storage-claim"
|
||||
namespace: {{ .Release.Namespace }}
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.extraInit.pvcStorage }}
|
||||
{{- end }}
|
||||
10
examples/online_serving/chart-helm/templates/secrets.yaml
Normal file
10
examples/online_serving/chart-helm/templates/secrets.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: "{{ .Release.Name }}-secrets"
|
||||
namespace: {{ .Release.Namespace }}
|
||||
type: Opaque
|
||||
data:
|
||||
{{- range $key, $val := .Values.secrets }}
|
||||
{{ $key }}: {{ $val | b64enc | quote }}
|
||||
{{- end }}
|
||||
14
examples/online_serving/chart-helm/templates/service.yaml
Normal file
14
examples/online_serving/chart-helm/templates/service.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: "{{ .Release.Name }}-service"
|
||||
namespace: {{ .Release.Namespace }}
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: {{ include "chart.service-port-name" . }}
|
||||
port: {{ include "chart.service-port" . }}
|
||||
targetPort: {{ include "chart.container-port-name" . }}
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "chart.labels" . | nindent 4 }}
|
||||
135
examples/online_serving/chart-helm/tests/deployment_test.yaml
Normal file
135
examples/online_serving/chart-helm/tests/deployment_test.yaml
Normal file
@@ -0,0 +1,135 @@
|
||||
suite: test deployment
|
||||
templates:
|
||||
- deployment.yaml
|
||||
tests:
|
||||
- it: should create wait-download-model init container when modelDownload is enabled
|
||||
set:
|
||||
extraInit:
|
||||
modelDownload:
|
||||
enabled: true
|
||||
image:
|
||||
repository: "amazon/aws-cli"
|
||||
tag: "2.6.4"
|
||||
pullPolicy: "IfNotPresent"
|
||||
waitContainer:
|
||||
command: [ "/bin/bash" ]
|
||||
args:
|
||||
- "-eucx"
|
||||
- "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
|
||||
downloadJob:
|
||||
command: [ "/bin/bash" ]
|
||||
args:
|
||||
- "-eucx"
|
||||
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
|
||||
initContainers: [ ]
|
||||
pvcStorage: "1Gi"
|
||||
s3modelpath: "relative_s3_model_path/opt-125m"
|
||||
awsEc2MetadataDisabled: true
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- isKind:
|
||||
of: Deployment
|
||||
- isNotEmpty:
|
||||
path: spec.template.spec.initContainers
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[0].name
|
||||
value: wait-download-model
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[0].image
|
||||
value: amazon/aws-cli:2.6.4
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[0].imagePullPolicy
|
||||
value: IfNotPresent
|
||||
|
||||
- it: should only create custom init containers when modelDownload is disabled
|
||||
set:
|
||||
extraInit:
|
||||
modelDownload:
|
||||
enabled: false
|
||||
image:
|
||||
repository: "amazon/aws-cli"
|
||||
tag: "2.6.4"
|
||||
pullPolicy: "IfNotPresent"
|
||||
waitContainer:
|
||||
command: [ "/bin/bash" ]
|
||||
args: [ "-c", "echo test" ]
|
||||
downloadJob:
|
||||
command: [ "/bin/bash" ]
|
||||
args: [ "-c", "echo test" ]
|
||||
initContainers:
|
||||
- name: llm-d-routing-proxy
|
||||
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: proxy
|
||||
pvcStorage: "10Gi"
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- isKind:
|
||||
of: Deployment
|
||||
- lengthEqual:
|
||||
path: spec.template.spec.initContainers
|
||||
count: 1
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[0].name
|
||||
value: llm-d-routing-proxy
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[0].image
|
||||
value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[0].ports[0].containerPort
|
||||
value: 8080
|
||||
|
||||
- it: should create both wait-download-model and custom init containers when both are enabled
|
||||
set:
|
||||
extraInit:
|
||||
modelDownload:
|
||||
enabled: true
|
||||
image:
|
||||
repository: "amazon/aws-cli"
|
||||
tag: "2.6.4"
|
||||
pullPolicy: "IfNotPresent"
|
||||
waitContainer:
|
||||
command: [ "/bin/bash" ]
|
||||
args:
|
||||
- "-eucx"
|
||||
- "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
|
||||
downloadJob:
|
||||
command: [ "/bin/bash" ]
|
||||
args:
|
||||
- "-eucx"
|
||||
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
|
||||
initContainers:
|
||||
- name: llm-d-routing-proxy
|
||||
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: proxy
|
||||
pvcStorage: "10Gi"
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- isKind:
|
||||
of: Deployment
|
||||
- lengthEqual:
|
||||
path: spec.template.spec.initContainers
|
||||
count: 2
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[0].name
|
||||
value: wait-download-model
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[0].image
|
||||
value: amazon/aws-cli:2.6.4
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[1].name
|
||||
value: llm-d-routing-proxy
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[1].image
|
||||
value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
|
||||
- equal:
|
||||
path: spec.template.spec.initContainers[1].ports[0].containerPort
|
||||
value: 8080
|
||||
61
examples/online_serving/chart-helm/tests/job_test.yaml
Normal file
61
examples/online_serving/chart-helm/tests/job_test.yaml
Normal file
@@ -0,0 +1,61 @@
|
||||
suite: test job
|
||||
templates:
|
||||
- job.yaml
|
||||
tests:
|
||||
- it: should create job when modelDownload is enabled
|
||||
set:
|
||||
extraInit:
|
||||
modelDownload:
|
||||
enabled: true
|
||||
image:
|
||||
repository: "amazon/aws-cli"
|
||||
tag: "2.6.4"
|
||||
pullPolicy: "IfNotPresent"
|
||||
waitContainer:
|
||||
command: [ "/bin/bash" ]
|
||||
args: [ "-c", "wait" ]
|
||||
downloadJob:
|
||||
command: [ "/bin/bash" ]
|
||||
args:
|
||||
- "-eucx"
|
||||
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
|
||||
pvcStorage: "1Gi"
|
||||
s3modelpath: "relative_s3_model_path/opt-125m"
|
||||
awsEc2MetadataDisabled: true
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- isKind:
|
||||
of: Job
|
||||
- equal:
|
||||
path: spec.template.spec.containers[0].name
|
||||
value: job-download-model
|
||||
- equal:
|
||||
path: spec.template.spec.containers[0].image
|
||||
value: amazon/aws-cli:2.6.4
|
||||
- equal:
|
||||
path: spec.template.spec.restartPolicy
|
||||
value: OnFailure
|
||||
|
||||
- it: should not create job when modelDownload is disabled
|
||||
set:
|
||||
extraInit:
|
||||
modelDownload:
|
||||
enabled: false
|
||||
image:
|
||||
repository: "amazon/aws-cli"
|
||||
tag: "2.6.4"
|
||||
pullPolicy: "IfNotPresent"
|
||||
waitContainer:
|
||||
command: [ "/bin/bash" ]
|
||||
args: [ "-c", "wait" ]
|
||||
downloadJob:
|
||||
command: [ "/bin/bash" ]
|
||||
args: [ "-c", "download" ]
|
||||
initContainers:
|
||||
- name: llm-d-routing-proxy
|
||||
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
|
||||
pvcStorage: "10Gi"
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 0
|
||||
32
examples/online_serving/chart-helm/tests/pvc_test.yaml
Normal file
32
examples/online_serving/chart-helm/tests/pvc_test.yaml
Normal file
@@ -0,0 +1,32 @@
|
||||
suite: test pvc
|
||||
templates:
|
||||
- pvc.yaml
|
||||
tests:
|
||||
# Test Case: PVC Created When extraInit Defined
|
||||
- it: should create pvc when extraInit is defined
|
||||
set:
|
||||
extraInit:
|
||||
modelDownload:
|
||||
enabled: true
|
||||
image:
|
||||
repository: "amazon/aws-cli"
|
||||
tag: "2.6.4"
|
||||
pullPolicy: "IfNotPresent"
|
||||
waitContainer:
|
||||
command: ["/bin/bash"]
|
||||
args: ["-c", "wait"]
|
||||
downloadJob:
|
||||
command: ["/bin/bash"]
|
||||
args: ["-c", "download"]
|
||||
pvcStorage: "10Gi"
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- isKind:
|
||||
of: PersistentVolumeClaim
|
||||
- equal:
|
||||
path: spec.accessModes[0]
|
||||
value: ReadWriteOnce
|
||||
- equal:
|
||||
path: spec.resources.requests.storage
|
||||
value: 10Gi
|
||||
329
examples/online_serving/chart-helm/values.schema.json
Normal file
329
examples/online_serving/chart-helm/values.schema.json
Normal file
@@ -0,0 +1,329 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/schema#",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"image": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"repository": {
|
||||
"type": "string"
|
||||
},
|
||||
"tag": {
|
||||
"type": "string"
|
||||
},
|
||||
"command": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"command",
|
||||
"repository",
|
||||
"tag"
|
||||
]
|
||||
},
|
||||
"containerPort": {
|
||||
"type": "integer"
|
||||
},
|
||||
"serviceName": {
|
||||
"type": "null"
|
||||
},
|
||||
"servicePort": {
|
||||
"type": "integer"
|
||||
},
|
||||
"extraPorts": {
|
||||
"type": "array"
|
||||
},
|
||||
"replicaCount": {
|
||||
"type": "integer"
|
||||
},
|
||||
"deploymentStrategy": {
|
||||
"type": "object"
|
||||
},
|
||||
"resources": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"requests": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"cpu": {
|
||||
"type": "integer"
|
||||
},
|
||||
"memory": {
|
||||
"type": "string"
|
||||
},
|
||||
"nvidia.com/gpu": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"cpu",
|
||||
"memory",
|
||||
"nvidia.com/gpu"
|
||||
]
|
||||
},
|
||||
"limits": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"cpu": {
|
||||
"type": "integer"
|
||||
},
|
||||
"memory": {
|
||||
"type": "string"
|
||||
},
|
||||
"nvidia.com/gpu": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"cpu",
|
||||
"memory",
|
||||
"nvidia.com/gpu"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"limits",
|
||||
"requests"
|
||||
]
|
||||
},
|
||||
"gpuModels": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"autoscaling": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"minReplicas": {
|
||||
"type": "integer"
|
||||
},
|
||||
"maxReplicas": {
|
||||
"type": "integer"
|
||||
},
|
||||
"targetCPUUtilizationPercentage": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"enabled",
|
||||
"maxReplicas",
|
||||
"minReplicas",
|
||||
"targetCPUUtilizationPercentage"
|
||||
]
|
||||
},
|
||||
"configs": {
|
||||
"type": "object"
|
||||
},
|
||||
"secrets": {
|
||||
"type": "object"
|
||||
},
|
||||
"externalConfigs": {
|
||||
"type": "array"
|
||||
},
|
||||
"customObjects": {
|
||||
"type": "array"
|
||||
},
|
||||
"maxUnavailablePodDisruptionBudget": {
|
||||
"type": "string"
|
||||
},
|
||||
"extraInit": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"modelDownload": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"image": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"repository": {
|
||||
"type": "string"
|
||||
},
|
||||
"tag": {
|
||||
"type": "string"
|
||||
},
|
||||
"pullPolicy": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["repository", "tag", "pullPolicy"]
|
||||
},
|
||||
"waitContainer": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"command": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"args": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"env": {
|
||||
"type": "array",
|
||||
"items": {"type": "object"}
|
||||
}
|
||||
},
|
||||
"required": ["command", "args"]
|
||||
},
|
||||
"downloadJob": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"command": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"args": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"env": {
|
||||
"type": "array",
|
||||
"items": {"type": "object"}
|
||||
}
|
||||
},
|
||||
"required": ["command", "args"]
|
||||
}
|
||||
},
|
||||
"required": ["enabled", "image", "waitContainer", "downloadJob"]
|
||||
},
|
||||
"initContainers": {
|
||||
"type": "array",
|
||||
"items": {"type": "object"}
|
||||
},
|
||||
"s3modelpath": {
|
||||
"type": "string"
|
||||
},
|
||||
"pvcStorage": {
|
||||
"type": "string"
|
||||
},
|
||||
"awsEc2MetadataDisabled": {
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"modelDownload",
|
||||
"initContainers",
|
||||
"pvcStorage"
|
||||
]
|
||||
},
|
||||
"extraContainers": {
|
||||
"type": "array"
|
||||
},
|
||||
"readinessProbe": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"initialDelaySeconds": {
|
||||
"type": "integer"
|
||||
},
|
||||
"periodSeconds": {
|
||||
"type": "integer"
|
||||
},
|
||||
"failureThreshold": {
|
||||
"type": "integer"
|
||||
},
|
||||
"httpGet": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string"
|
||||
},
|
||||
"port": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"path",
|
||||
"port"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"failureThreshold",
|
||||
"httpGet",
|
||||
"initialDelaySeconds",
|
||||
"periodSeconds"
|
||||
]
|
||||
},
|
||||
"livenessProbe": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"initialDelaySeconds": {
|
||||
"type": "integer"
|
||||
},
|
||||
"failureThreshold": {
|
||||
"type": "integer"
|
||||
},
|
||||
"periodSeconds": {
|
||||
"type": "integer"
|
||||
},
|
||||
"httpGet": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string"
|
||||
},
|
||||
"port": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"path",
|
||||
"port"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"failureThreshold",
|
||||
"httpGet",
|
||||
"initialDelaySeconds",
|
||||
"periodSeconds"
|
||||
]
|
||||
},
|
||||
"labels": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"environment": {
|
||||
"type": "string"
|
||||
},
|
||||
"release": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"environment",
|
||||
"release"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"autoscaling",
|
||||
"configs",
|
||||
"containerPort",
|
||||
"customObjects",
|
||||
"deploymentStrategy",
|
||||
"externalConfigs",
|
||||
"extraContainers",
|
||||
"extraInit",
|
||||
"extraPorts",
|
||||
"gpuModels",
|
||||
"image",
|
||||
"labels",
|
||||
"livenessProbe",
|
||||
"maxUnavailablePodDisruptionBudget",
|
||||
"readinessProbe",
|
||||
"replicaCount",
|
||||
"resources",
|
||||
"secrets",
|
||||
"servicePort"
|
||||
]
|
||||
}
|
||||
174
examples/online_serving/chart-helm/values.yaml
Normal file
174
examples/online_serving/chart-helm/values.yaml
Normal file
@@ -0,0 +1,174 @@
|
||||
# -- Default values for chart vllm
|
||||
# -- Declare variables to be passed into your templates.
|
||||
|
||||
# -- Image configuration
|
||||
image:
|
||||
# -- Image repository
|
||||
repository: "vllm/vllm-openai"
|
||||
# -- Image tag
|
||||
tag: "latest"
|
||||
# -- Container launch command
|
||||
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
|
||||
|
||||
# -- Container port
|
||||
containerPort: 8000
|
||||
# -- Service name
|
||||
serviceName:
|
||||
# -- Service port
|
||||
servicePort: 80
|
||||
# -- Additional ports configuration
|
||||
extraPorts: []
|
||||
|
||||
# -- Number of replicas
|
||||
replicaCount: 1
|
||||
|
||||
# -- Deployment strategy configuration
|
||||
deploymentStrategy: {}
|
||||
|
||||
# -- Resource configuration
|
||||
resources:
|
||||
requests:
|
||||
# -- Number of CPUs
|
||||
cpu: 4
|
||||
# -- CPU memory configuration
|
||||
memory: 16Gi
|
||||
# -- Number of GPUs used
|
||||
nvidia.com/gpu: 1
|
||||
limits:
|
||||
# -- Number of CPUs
|
||||
cpu: 4
|
||||
# -- CPU memory configuration
|
||||
memory: 16Gi
|
||||
# -- Number of GPUs used
|
||||
nvidia.com/gpu: 1
|
||||
|
||||
# -- Type of GPU used
|
||||
gpuModels:
|
||||
- "TYPE_GPU_USED"
|
||||
|
||||
# -- Autoscaling configuration
|
||||
autoscaling:
|
||||
# -- Enable autoscaling
|
||||
enabled: false
|
||||
# -- Minimum replicas
|
||||
minReplicas: 1
|
||||
# -- Maximum replicas
|
||||
maxReplicas: 100
|
||||
# -- Target CPU utilization for autoscaling
|
||||
targetCPUUtilizationPercentage: 80
|
||||
# targetMemoryUtilizationPercentage: 80
|
||||
|
||||
# -- Configmap
|
||||
configs: {}
|
||||
|
||||
# -- Secrets configuration
|
||||
secrets: {}
|
||||
|
||||
# -- External configuration
|
||||
externalConfigs: []
|
||||
|
||||
# -- Custom Objects configuration
|
||||
customObjects: []
|
||||
|
||||
# -- Disruption Budget Configuration
|
||||
maxUnavailablePodDisruptionBudget: ""
|
||||
|
||||
# -- Additional configuration for the init container
|
||||
extraInit:
|
||||
# -- Model download functionality (optional)
|
||||
modelDownload:
|
||||
# -- Enable model download job and wait container
|
||||
enabled: true
|
||||
# -- Image configuration for model download operations
|
||||
image:
|
||||
# -- Image repository
|
||||
repository: "amazon/aws-cli"
|
||||
# -- Image tag
|
||||
tag: "2.6.4"
|
||||
# -- Image pull policy
|
||||
pullPolicy: "IfNotPresent"
|
||||
# -- Wait container configuration (init container that waits for model to be ready)
|
||||
waitContainer:
|
||||
# -- Command to execute
|
||||
command: ["/bin/bash"]
|
||||
# -- Arguments for the wait container
|
||||
args:
|
||||
- "-eucx"
|
||||
- "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
|
||||
# -- Environment variables (optional, overrides S3 defaults entirely if specified)
|
||||
# env:
|
||||
# - name: HUGGING_FACE_HUB_TOKEN
|
||||
# value: "your-token"
|
||||
# - name: MODEL_ID
|
||||
# value: "meta-llama/Llama-2-7b"
|
||||
# -- Download job configuration (job that actually downloads the model)
|
||||
downloadJob:
|
||||
# -- Command to execute
|
||||
command: ["/bin/bash"]
|
||||
# -- Arguments for the download job
|
||||
args:
|
||||
- "-eucx"
|
||||
- "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
|
||||
# -- Environment variables (optional, overrides S3 defaults entirely if specified)
|
||||
# env:
|
||||
# - name: HUGGING_FACE_HUB_TOKEN
|
||||
# value: "your-token"
|
||||
# - name: MODEL_ID
|
||||
# value: "meta-llama/Llama-2-7b"
|
||||
|
||||
# -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
|
||||
initContainers: []
|
||||
# Example for llm-d sidecar:
|
||||
# initContainers:
|
||||
# - name: llm-d-routing-proxy
|
||||
# image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
|
||||
# imagePullPolicy: IfNotPresent
|
||||
# ports:
|
||||
# - containerPort: 8080
|
||||
# name: proxy
|
||||
# securityContext:
|
||||
# runAsUser: 1000
|
||||
|
||||
# -- Path of the model on the s3 which hosts model weights and config files
|
||||
s3modelpath: "relative_s3_model_path/opt-125m"
|
||||
# -- Storage size for the PVC
|
||||
pvcStorage: "1Gi"
|
||||
# -- Disable AWS EC2 metadata service
|
||||
awsEc2MetadataDisabled: true
|
||||
|
||||
# -- Additional containers configuration
|
||||
extraContainers: []
|
||||
|
||||
# -- Readiness probe configuration
|
||||
readinessProbe:
|
||||
# -- Number of seconds after the container has started before readiness probe is initiated
|
||||
initialDelaySeconds: 5
|
||||
# -- How often (in seconds) to perform the readiness probe
|
||||
periodSeconds: 5
|
||||
# -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
|
||||
failureThreshold: 3
|
||||
# -- Configuration of the Kubelet http request on the server
|
||||
httpGet:
|
||||
# -- Path to access on the HTTP server
|
||||
path: /health
|
||||
# -- Name or number of the port to access on the container, on which the server is listening
|
||||
port: 8000
|
||||
|
||||
# -- Liveness probe configuration
|
||||
livenessProbe:
|
||||
# -- Number of seconds after the container has started before liveness probe is initiated
|
||||
initialDelaySeconds: 15
|
||||
# -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
|
||||
failureThreshold: 3
|
||||
# -- How often (in seconds) to perform the liveness probe
|
||||
periodSeconds: 10
|
||||
# -- Configuration of the Kubelet http request on the server
|
||||
httpGet:
|
||||
# -- Path to access on the HTTP server
|
||||
path: /health
|
||||
# -- Name or number of the port to access on the container, on which the server is listening
|
||||
port: 8000
|
||||
|
||||
labels:
|
||||
environment: "test"
|
||||
release: "test"
|
||||
87
examples/online_serving/dashboards/README.md
Normal file
87
examples/online_serving/dashboards/README.md
Normal file
@@ -0,0 +1,87 @@
|
||||
# Monitoring Dashboards
|
||||
|
||||
This directory contains monitoring dashboard configurations for vLLM, providing
|
||||
comprehensive observability for your vLLM deployments.
|
||||
|
||||
## Dashboard Platforms
|
||||
|
||||
We provide dashboards for two popular observability platforms:
|
||||
|
||||
- **[Grafana](https://grafana.com)**
|
||||
- **[Perses](https://perses.dev)**
|
||||
|
||||
## Dashboard Format Approach
|
||||
|
||||
All dashboards are provided in **native formats** that work across different
|
||||
deployment methods:
|
||||
|
||||
### Grafana (JSON)
|
||||
|
||||
- ✅ Works with any Grafana instance (cloud, self-hosted, Docker)
|
||||
- ✅ Direct import via Grafana UI or API
|
||||
- ✅ Can be wrapped in Kubernetes operators when needed
|
||||
- ✅ No vendor lock-in or deployment dependencies
|
||||
|
||||
### Perses (YAML)
|
||||
|
||||
- ✅ Works with standalone Perses instances
|
||||
- ✅ Compatible with Perses API and CLI
|
||||
- ✅ Supports Dashboard-as-Code workflows
|
||||
- ✅ Can be wrapped in Kubernetes operators when needed
|
||||
|
||||
## Dashboard Contents
|
||||
|
||||
Both platforms provide equivalent monitoring capabilities:
|
||||
|
||||
| Dashboard | Description |
|
||||
|-----------|-------------|
|
||||
| **Performance Statistics** | Tracks latency, throughput, and performance metrics |
|
||||
| **Query Statistics** | Monitors request volume, query performance, and KPIs |
|
||||
|
||||
## Quick Start
|
||||
|
||||
First, navigate to this example's directory:
|
||||
|
||||
```bash
|
||||
cd examples/online_serving/dashboards
|
||||
```
|
||||
|
||||
### Grafana
|
||||
|
||||
Import the JSON directly into the Grafana UI, or use the API:
|
||||
|
||||
```bash
|
||||
curl -X POST http://grafana/api/dashboards/db \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @grafana/performance_statistics.json
|
||||
```
|
||||
|
||||
### Perses
|
||||
|
||||
Import via the Perses CLI:
|
||||
|
||||
```bash
|
||||
percli apply -f perses/performance_statistics.yaml
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- **Prometheus** metrics from your vLLM deployment
|
||||
- **Data source** configured in your monitoring platform
|
||||
- **vLLM metrics** enabled and accessible
|
||||
|
||||
## Platform-Specific Documentation
|
||||
|
||||
For detailed deployment instructions and platform-specific options, see:
|
||||
|
||||
- **[Grafana Documentation](./grafana)** - JSON dashboards, operator usage, manual import
|
||||
- **[Perses Documentation](./perses)** - YAML specs, CLI usage, operator wrapping
|
||||
|
||||
## Contributing
|
||||
|
||||
When adding new dashboards, please:
|
||||
|
||||
1. Provide native formats (JSON for Grafana, YAML specs for Perses)
|
||||
2. Update platform-specific README files
|
||||
3. Ensure dashboards work across deployment methods
|
||||
4. Test with the latest platform versions
|
||||
59
examples/online_serving/dashboards/grafana/README.md
Normal file
59
examples/online_serving/dashboards/grafana/README.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# Grafana Dashboards for vLLM Monitoring
|
||||
|
||||
This directory contains Grafana dashboard configurations (as JSON) designed to monitor
|
||||
vLLM performance and metrics.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Grafana 8.0+
|
||||
- Prometheus data source configured in Grafana
|
||||
- vLLM deployment with Prometheus metrics enabled
|
||||
|
||||
## Dashboard Descriptions
|
||||
|
||||
- **performance_statistics.json**: Tracks performance metrics including latency and
|
||||
throughput for your vLLM service.
|
||||
- **query_statistics.json**: Tracks query performance, request volume, and key
|
||||
performance indicators for your vLLM service.
|
||||
|
||||
## Deployment Options
|
||||
|
||||
### Manual Import (Recommended)
|
||||
|
||||
The easiest way to use these dashboards is to manually import the JSON configurations
|
||||
directly into your Grafana instance:
|
||||
|
||||
1. Navigate to your Grafana instance
|
||||
2. Click the '+' icon in the sidebar
|
||||
3. Select 'Import'
|
||||
4. Copy and paste the JSON content from the dashboard files, or upload the JSON files
|
||||
directly
|
||||
|
||||
### Grafana Operator
|
||||
|
||||
If you're using the [Grafana Operator](https://github.com/grafana-operator/grafana-operator)
|
||||
in Kubernetes, you can wrap these JSON configurations in a `GrafanaDashboard` custom
|
||||
resource:
|
||||
|
||||
```yaml
|
||||
# Note: Adjust the instanceSelector to match your Grafana instance's labels
|
||||
# You can check with: kubectl get grafana -o yaml
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: vllm-performance-dashboard
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: grafana # Adjust to match your Grafana instance labels
|
||||
folder: "vLLM Monitoring"
|
||||
json: |
|
||||
# Replace this comment with the complete JSON content from
|
||||
# performance_statistics.json - The JSON should start with { and end with }
|
||||
```
|
||||
|
||||
Then apply to your cluster:
|
||||
|
||||
```bash
|
||||
kubectl apply -f your-dashboard.yaml -n <namespace>
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
760
examples/online_serving/dashboards/grafana/query_statistics.json
Normal file
760
examples/online_serving/dashboards/grafana/query_statistics.json
Normal file
@@ -0,0 +1,760 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "High-level overview of vLLM model deployment behavior and key performance indicators. Designed for Data Scientists and Product Managers to monitor request volume, token throughput, and latency.",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": 47,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": true,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 20,
|
||||
"panels": [],
|
||||
"title": "Request Over Time",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "req/s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 1 },
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (model_name) (\n rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval])\n)",
|
||||
"interval": "1",
|
||||
"legendFormat": "{{model_name}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Successful Requests Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "req/s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 1 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Requests Avg Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculations": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 1 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "p50 Latency",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 4 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "p90 Latency",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 4 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "p99 Latency",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
|
||||
"id": 19,
|
||||
"panels": [],
|
||||
"title": "Size Distribution",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"fillOpacity": 80,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineWidth": 1,
|
||||
"stacking": { "group": "A", "mode": "none" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 8 },
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "{{model_name}} le={{le}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size Distribution",
|
||||
"type": "histogram"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 8 },
|
||||
"id": 9,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size p90",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 8 },
|
||||
"id": 8,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size p50",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 11 },
|
||||
"id": 7,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))\n/\nsum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size Avg",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 11 },
|
||||
"id": 10,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size p99",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 18,
|
||||
"panels": [],
|
||||
"title": "Input Token Over Time",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 15 },
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "{{model_name}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Tokens Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 15 },
|
||||
"id": 12,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Tokens/Sec Avg",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
|
||||
"id": 17,
|
||||
"panels": [],
|
||||
"title": "Output Token Over Time",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 22 },
|
||||
"id": 13,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "{{model_name}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Output Tokens Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 22 },
|
||||
"id": 14,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Output Tokens/Sec Avg",
|
||||
"type": "stat"
|
||||
}
|
||||
],
|
||||
"preload": false,
|
||||
"schemaVersion": 40,
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "text": "Prometheus", "value": "4184fc20-68a7-483a-8d9b-7caa59c680dd" },
|
||||
"label": "datasource",
|
||||
"name": "DS_PROMETHEUS",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"current": { "text": ["All"], "value": ["$__all"] },
|
||||
"definition": "label_values(vllm:request_success_total,model_name)",
|
||||
"includeAll": true,
|
||||
"label": "Deployment_ID",
|
||||
"multi": true,
|
||||
"name": "Deployment_id",
|
||||
"options": [],
|
||||
"query": {
|
||||
"qryType": 1,
|
||||
"query": "label_values(vllm:request_success_total,model_name)",
|
||||
"refId": "PrometheusVariableQueryEditor-VariableQuery"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"current": { "text": "All hours", "value": "All hours" },
|
||||
"hide": 2,
|
||||
"label": "Rush Hours Only",
|
||||
"name": "rush_hours",
|
||||
"options": [
|
||||
{ "selected": true, "text": "false", "value": "All hours" },
|
||||
{ "selected": false, "text": "true", "value": "Rush hours" }
|
||||
],
|
||||
"query": "false : All hours, true : Rush hours",
|
||||
"type": "custom"
|
||||
},
|
||||
{
|
||||
"current": { "text": "All", "value": "All" },
|
||||
"hide": 2,
|
||||
"label": "Rush Hours Type",
|
||||
"name": "rush_hours_type",
|
||||
"options": [
|
||||
{ "selected": true, "text": "^All__.*$", "value": "All" },
|
||||
{ "selected": false, "text": "^Static__.*$", "value": "Static" },
|
||||
{ "selected": false, "text": "^Dynamic__.*$", "value": "Dynamic" }
|
||||
],
|
||||
"query": "^All__.*$ : All, ^Static__.*$ : Static, ^Dynamic__.*$ : Dynamic",
|
||||
"type": "custom"
|
||||
},
|
||||
{
|
||||
"current": { "text": "", "value": "" },
|
||||
"hide": 2,
|
||||
"name": "query0",
|
||||
"options": [],
|
||||
"query": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-12h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Query Statistics_New4",
|
||||
"uid": "query-statistics4",
|
||||
"version": 2,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
||||
48
examples/online_serving/dashboards/perses/README.md
Normal file
48
examples/online_serving/dashboards/perses/README.md
Normal file
@@ -0,0 +1,48 @@
|
||||
# Perses Dashboards for vLLM Monitoring
|
||||
|
||||
This directory contains Perses dashboard configurations designed to monitor vLLM
|
||||
performance and metrics.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Perses instance (standalone or via operator)
|
||||
- Prometheus data source configured in Perses
|
||||
- vLLM deployment with Prometheus metrics enabled
|
||||
|
||||
## Dashboard Format
|
||||
|
||||
We provide dashboards in the **native Perses YAML format** that works across all
|
||||
deployment methods:
|
||||
|
||||
- **Files**: `*.yaml` (native Perses dashboard specifications)
|
||||
- **Format**: Pure dashboard specifications that work everywhere
|
||||
- **Usage**: Works with standalone Perses, API imports, CLI, and file provisioning
|
||||
- **Kubernetes**: Directly compatible with Perses Operator
|
||||
|
||||
## Dashboard Descriptions
|
||||
|
||||
- **performance_statistics.yaml**: Performance metrics with aggregated latency
|
||||
statistics
|
||||
- **query_statistics.yaml**: Query performance and deployment metrics
|
||||
|
||||
## Deployment Options
|
||||
|
||||
### Direct Import to Perses
|
||||
|
||||
Import the dashboard specifications via Perses API or CLI:
|
||||
|
||||
```bash
|
||||
percli apply -f performance_statistics.yaml
|
||||
```
|
||||
|
||||
### Perses Operator (Kubernetes)
|
||||
|
||||
The native YAML format works directly with the Perses Operator:
|
||||
|
||||
```bash
|
||||
kubectl apply -f performance_statistics.yaml -n <namespace>
|
||||
```
|
||||
|
||||
### File Provisioning
|
||||
|
||||
Place the YAML files in a Perses provisioning folder for automatic loading.
|
||||
@@ -0,0 +1,764 @@
|
||||
kind: PersesDashboard
|
||||
metadata:
|
||||
name: performance-statistics
|
||||
createdAt: 0001-01-01T00:00:00Z
|
||||
updatedAt: 0001-01-01T00:00:00Z
|
||||
version: 0
|
||||
project: ""
|
||||
spec:
|
||||
display:
|
||||
name: Performance Statistics
|
||||
|
||||
variables:
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
display:
|
||||
name: Deployment_ID
|
||||
hidden: false
|
||||
name: Deployment_id
|
||||
allowAllValue: true
|
||||
allowMultiple: true
|
||||
defaultValue:
|
||||
- $__all
|
||||
sort: alphabetical-asc
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
labelName: model_name
|
||||
matchers:
|
||||
# Any one vllm metric that always carries model_name
|
||||
- vllm:generation_tokens_total{}
|
||||
|
||||
panels:
|
||||
"1":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# avg latency by model = sum(rate(sum)) / sum(rate(count))
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
|
||||
/
|
||||
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}}'
|
||||
|
||||
"2":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (Avg)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
|
||||
/
|
||||
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
||||
|
||||
"3":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (P50)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"4":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (P90)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"5":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (P99)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"6":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
|
||||
/
|
||||
sum by (model_name) (rate(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}}'
|
||||
|
||||
"7":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (Avg)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
|
||||
/
|
||||
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
||||
|
||||
"8":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (P50)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"9":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (P90)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"10":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (P99)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"11":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (Time per Output Token) over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
|
||||
/
|
||||
sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}}'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
seriesNameFormat: '{{model_name}} p50'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
seriesNameFormat: '{{model_name}} p90'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
seriesNameFormat: '{{model_name}} p99'
|
||||
|
||||
"12":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (Avg)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
|
||||
/
|
||||
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
||||
|
||||
"13":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (P50)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"14":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (P90)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"15":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (P99)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"16":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TPS (Tokens/sec) over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}} generation'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}} prompt'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# overall iteration tokens/sec if exposed
|
||||
query: >
|
||||
rate(vllm:iteration_tokens_total_count[$__interval])
|
||||
seriesNameFormat: 'iteration overall'
|
||||
|
||||
"17":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: KV Cache Usage (avg %)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
|
||||
query: >
|
||||
100 * avg(vllm:kv_cache_usage_perc)
|
||||
|
||||
"18":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Running Requests by Pod
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (pod) (vllm:num_requests_running)
|
||||
seriesNameFormat: '{{pod}}'
|
||||
|
||||
"19":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Waiting Requests by Pod
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (pod) (vllm:num_requests_waiting)
|
||||
seriesNameFormat: '{{pod}}'
|
||||
|
||||
"20":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Running Requests (sum)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: sum(vllm:num_requests_running)
|
||||
|
||||
"21":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Waiting Requests (sum)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: sum(vllm:num_requests_waiting)
|
||||
|
||||
layouts:
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: Overview
|
||||
items:
|
||||
- x: 0
|
||||
y: 0
|
||||
width: 6
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/17' } # KV cache %
|
||||
- x: 6
|
||||
y: 0
|
||||
width: 6
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/20' } # running sum
|
||||
- x: 12
|
||||
y: 0
|
||||
width: 6
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/21' } # waiting sum
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: E2E Latency
|
||||
items:
|
||||
- x: 0
|
||||
y: 1
|
||||
width: 10
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/1' }
|
||||
- x: 10
|
||||
y: 1
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/2' }
|
||||
- x: 17
|
||||
y: 1
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/3' }
|
||||
- x: 10
|
||||
y: 4
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/4' }
|
||||
- x: 17
|
||||
y: 4
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/5' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: TTFT
|
||||
items:
|
||||
- x: 0
|
||||
y: 8
|
||||
width: 10
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/6' }
|
||||
- x: 10
|
||||
y: 8
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/7' }
|
||||
- x: 17
|
||||
y: 8
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/8' }
|
||||
- x: 10
|
||||
y: 11
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/9' }
|
||||
- x: 17
|
||||
y: 11
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/10' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: ITL (Time per Output Token)
|
||||
items:
|
||||
- x: 0
|
||||
y: 15
|
||||
width: 10
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/11' }
|
||||
- x: 10
|
||||
y: 15
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/12' }
|
||||
- x: 17
|
||||
y: 15
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/13' }
|
||||
- x: 10
|
||||
y: 18
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/14' }
|
||||
- x: 17
|
||||
y: 18
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/15' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: TPS (Prompt / Generation / Iteration)
|
||||
items:
|
||||
- x: 0
|
||||
y: 22
|
||||
width: 14
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/16' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: Per-Pod Request State
|
||||
items:
|
||||
- x: 0
|
||||
y: 28
|
||||
width: 12
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/18' }
|
||||
- x: 12
|
||||
y: 28
|
||||
width: 12
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/19' }
|
||||
|
||||
392
examples/online_serving/dashboards/perses/query_statistics.yaml
Normal file
392
examples/online_serving/dashboards/perses/query_statistics.yaml
Normal file
@@ -0,0 +1,392 @@
|
||||
kind: PersesDashboard
|
||||
metadata:
|
||||
name: query-statistics
|
||||
createdAt: 0001-01-01T00:00:00Z
|
||||
updatedAt: 0001-01-01T00:00:00Z
|
||||
version: 0
|
||||
project: ""
|
||||
spec:
|
||||
display:
|
||||
name: Query Statistics_New
|
||||
|
||||
variables:
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
name: NS
|
||||
display: { name: Namespace }
|
||||
allowMultiple: false
|
||||
defaultValue: llm-d
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
labelName: namespace
|
||||
matchers:
|
||||
- up{service=~".*vllm.*"}
|
||||
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
name: SVC
|
||||
display: { name: Service }
|
||||
allowMultiple: false
|
||||
defaultValue: vllm-qwen2-0-5b-sim
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
labelName: service
|
||||
matchers:
|
||||
- up{namespace="$NS",service=~".*vllm.*"}
|
||||
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
name: MODEL
|
||||
display: { name: Model (real vLLM) }
|
||||
allowAllValue: true
|
||||
allowMultiple: true
|
||||
defaultValue: ["$__all"]
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
labelName: model_name
|
||||
matchers:
|
||||
- vllm:request_success_total{namespace="$NS",service="$SVC"}
|
||||
|
||||
panels:
|
||||
|
||||
# --- Core (works on Simulator & Real) ---
|
||||
core_running_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Running Requests (now) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum(vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_waiting_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Waiting Requests (now) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum(vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_kv_usage_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache Usage (0–1) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_running_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Running Over Time }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (service) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_waiting_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Waiting Over Time }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (service) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_targets_up:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Scrape Targets Up }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: count(up{namespace="$NS",service="$SVC"} == 1) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- KV Cache as Percent (works on Simulator & Real) ---
|
||||
core_kv_usage_pct_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache Usage (%) – now }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# multiply by 100 to present percentage; omit format.unit to avoid schema conflicts
|
||||
query: (avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_kv_usage_pct_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache Usage (%) – over time }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: (avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- Per-Pod breakdowns (works on Simulator & Real) ---
|
||||
per_pod_running_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Running by Pod }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (pod) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
per_pod_waiting_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Waiting by Pod }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (pod) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
per_pod_kv_pct_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache (%) by Pod }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty
|
||||
query: (avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- Real vLLM only (zeros on simulator) ---
|
||||
real_req_rate_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Request Rate (real vLLM) }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (model_name) (rate(vllm:request_success_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_p50:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: p50 Latency (real vLLM) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: histogram_quantile(0.50, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_p90:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: p90 Latency (real vLLM) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: histogram_quantile(0.90, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_p99:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: p99 Latency (real vLLM) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: histogram_quantile(0.99, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_input_tokens_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Input Tokens / sec (real vLLM) }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (model_name) (rate(vllm:prompt_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_output_tokens_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Output Tokens / sec (real vLLM) }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (model_name) (rate(vllm:generation_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
layouts:
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: Core (Sim & Real) }
|
||||
items:
|
||||
- { x: 0, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_running_now' } }
|
||||
- { x: 6, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_waiting_now' } }
|
||||
- { x: 12, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_now' } }
|
||||
- { x: 18, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_targets_up' } }
|
||||
- { x: 0, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_running_ts' } }
|
||||
- { x: 12, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_waiting_ts' } }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: KV Cache (%) }
|
||||
items:
|
||||
- { x: 0, y: 9, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_pct_now' } }
|
||||
- { x: 6, y: 9, width: 18, height: 6, content: { $ref: '#/spec/panels/core_kv_usage_pct_ts' } }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: Per-Pod breakdowns }
|
||||
items:
|
||||
- { x: 0, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_running_ts' } }
|
||||
- { x: 12, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_waiting_ts' } }
|
||||
- { x: 0, y: 21, width: 24, height: 6, content: { $ref: '#/spec/panels/per_pod_kv_pct_ts' } }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: Real vLLM only (shows 0 on simulator) }
|
||||
items:
|
||||
- { x: 0, y: 27, width: 12, height: 6, content: { $ref: '#/spec/panels/real_req_rate_ts' } }
|
||||
- { x: 12, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p50' } }
|
||||
- { x: 16, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p90' } }
|
||||
- { x: 20, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p99' } }
|
||||
- { x: 0, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_input_tokens_ts' } }
|
||||
- { x: 12, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_output_tokens_ts' } }
|
||||
|
||||
119
examples/online_serving/disaggregated_encoder/README.md
Normal file
119
examples/online_serving/disaggregated_encoder/README.md
Normal file
@@ -0,0 +1,119 @@
|
||||
# Disaggregated Encoder
|
||||
|
||||
These example scripts that demonstrate the disaggregated encoder (EPD) features of vLLM.
|
||||
|
||||
For a detailed explanation of the EPD features, please refer to the [Disaggregated Encoder Feature Documentation](../../../docs/features/disagg_encoder.md).
|
||||
|
||||
## Files
|
||||
|
||||
- `disagg_epd_proxy.py` - Proxy script that demonstrates the XeYpZd setup (X encode instances, Y prefill instances, Z decode instances). Currently stable for the 1e1p1d configuration.
|
||||
|
||||
- `disagg_1e1p1d_example.sh` - Sets up the 1e1p1d configuration, runs the VisionArena benchmark, and processes a single request with a local image.
|
||||
|
||||
- `disagg_1e1pd_example.sh` - Sets up the 1e1pd configuration, runs the VisionArena benchmark, and processes a single request with a local image.
|
||||
|
||||
### Custom Configuration
|
||||
|
||||
```bash
|
||||
# Use specific GPUs
|
||||
GPU_E=0 GPU_PD=1 GPU_P=1 GPU_D=2 bash disagg_1e1p1d_example.sh
|
||||
|
||||
# Use specific ports
|
||||
ENDPOINT_PORT=10001 bash disagg_1e1p1d_example.sh
|
||||
|
||||
# Use specific model
|
||||
MODEL="Qwen/Qwen2.5-VL-3B-Instruct" bash disagg_1e1p1d_example.sh
|
||||
|
||||
# Use specific storage path
|
||||
EC_SHARED_STORAGE_PATH="/tmp/my_ec_cache" bash disagg_1e1p1d_example.sh
|
||||
```
|
||||
|
||||
## Encoder Instances
|
||||
|
||||
Encoder engines should be launched with the following flags:
|
||||
|
||||
- `--enforce-eager` **(required)** – The current EPD implementation is only compatible with encoder instances running in this mode.
|
||||
|
||||
- `--no-enable-prefix-caching` **(required)** – Encoder instances do not consume KV cache; prefix caching is disabled to avoid conflicts with other features.
|
||||
|
||||
- `--max-num-batched-tokens=<large value>` **(default: 2048)** – This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager.
|
||||
|
||||
## Local media inputs
|
||||
|
||||
To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance:
|
||||
|
||||
```bash
|
||||
--allowed-local-media-path $MEDIA_PATH
|
||||
```
|
||||
|
||||
The vllm instances and `disagg_encoder_proxy` supports local URIs with ```{"url": "file://'"$MEDIA_PATH_FILENAME"'}``` as multimodal inputs. Each URI is passed unchanged from the `disagg_encoder_proxy` to the encoder instance so that the encoder can load the media locally.
|
||||
|
||||
## EC connector and KV transfer
|
||||
|
||||
The `ECExampleonnector` is used to store the encoder cache on local disk and facilitate transfer. To enable the encoder disaggregation feature, add the following configuration:
|
||||
|
||||
```bash
|
||||
# Add to encoder instance:
|
||||
--ec-transfer-config '{
|
||||
"ec_connector": "ECExampleConnector",
|
||||
"ec_role": "ec_producer",
|
||||
"ec_connector_extra_config": {
|
||||
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
|
||||
}
|
||||
}'
|
||||
|
||||
# Add to prefill/prefill+decode instance:
|
||||
--ec-transfer-config '{
|
||||
"ec_connector": "ECExampleConnector",
|
||||
"ec_role": "ec_consumer",
|
||||
"ec_connector_extra_config": {
|
||||
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
`$EC_SHARED_STORAGE_PATH` is the path where the EC connector temporarily stores the cache.
|
||||
|
||||
If you enable prefill instance (`--prefill-servers-urls` not disabled), you will need --kv-transfer-config to facilitate the PD disaggregation. Currently, we use the `NixlConnector` for this purpose. Refer to `tests/v1/kv_connector/nixl_integration` for more example codes on PD disaggregation with Nixl.
|
||||
|
||||
```bash
|
||||
# Add to prefill instance:
|
||||
--kv-transfer-config '{
|
||||
"kv_connector": "NixlConnector",
|
||||
"kv_role": "kv_producer"
|
||||
}'
|
||||
|
||||
# Add to decode instance:
|
||||
--kv-transfer-config '{
|
||||
"kv_connector": "NixlConnector",
|
||||
"kv_role": "kv_consumer"
|
||||
}'
|
||||
```
|
||||
|
||||
## Proxy Instance Flags (`disagg_epd_proxy.py`)
|
||||
|
||||
| Flag | Description |
|
||||
|------|-------------|
|
||||
| `--encode-servers-urls` | Comma-separated list of encoder endpoints. Every multimodal item extracted from the request is fanned out to one of these URLs in a round-robin fashion. |
|
||||
| `--prefill-servers-urls` | Comma-separated list of prefill endpoints. Set to `disable`, `none`, or `""` to skip the dedicated prefill phase and run E+PD (encoder + combined prefill/decode). |
|
||||
| `--decode-servers-urls` | Comma-separated list of decode endpoints. Non-stream and stream paths both round-robin over this list. |
|
||||
| `--host`, `--port` | Bind address for the proxy itself (defaults: `0.0.0.0:8000`). |
|
||||
|
||||
Example usage:
|
||||
For E + PD setup:
|
||||
|
||||
```bash
|
||||
$ python disagg_encoder_proxy.py \
|
||||
--encode-servers-urls "http://e1:8001,http://e2:8002" \
|
||||
--prefill-servers-urls "disable" \
|
||||
--decode-servers-urls "http://pd1:8003,http://pd2:8004"
|
||||
```
|
||||
|
||||
For E + P + D setup:
|
||||
|
||||
```bash
|
||||
$ python disagg_encoder_proxy.py \
|
||||
--encode-servers-urls "http://e1:8001,http://e2:8001" \
|
||||
--prefill-servers-urls "http://p1:8003,http://p2:8004" \
|
||||
--decode-servers-urls "http://d1:8005,http://d2:8006"
|
||||
```
|
||||
@@ -0,0 +1,221 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
declare -a PIDS=()
|
||||
|
||||
###############################################################################
|
||||
# Configuration -- override via env before running
|
||||
###############################################################################
|
||||
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
|
||||
LOG_PATH="${LOG_PATH:-./logs}"
|
||||
mkdir -p $LOG_PATH
|
||||
|
||||
ENCODE_PORT="${ENCODE_PORT:-19534}"
|
||||
PREFILL_PORT="${PREFILL_PORT:-19535}"
|
||||
DECODE_PORT="${DECODE_PORT:-19536}"
|
||||
PROXY_PORT="${PROXY_PORT:-10001}"
|
||||
|
||||
GPU_E="${GPU_E:-2}"
|
||||
GPU_P="${GPU_P:-2}"
|
||||
GPU_D="${GPU_D:-3}"
|
||||
|
||||
EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
|
||||
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout
|
||||
|
||||
NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark
|
||||
|
||||
export UCX_TLS=all
|
||||
export UCX_NET_DEVICES=all
|
||||
|
||||
###############################################################################
|
||||
# Helpers
|
||||
###############################################################################
|
||||
# Find the git repository root directory
|
||||
GIT_ROOT=$(git rev-parse --show-toplevel)
|
||||
|
||||
START_TIME=$(date +"%Y%m%d_%H%M%S")
|
||||
ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log
|
||||
P_LOG=$LOG_PATH/p_${START_TIME}.log
|
||||
D_LOG=$LOG_PATH/d_${START_TIME}.log
|
||||
PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
timeout "$TIMEOUT_SECONDS" bash -c "
|
||||
until curl -s localhost:$port/v1/chat/completions > /dev/null; do
|
||||
sleep 1
|
||||
done" && return 0 || return 1
|
||||
}
|
||||
|
||||
# Cleanup function
|
||||
cleanup() {
|
||||
echo "Stopping everything…"
|
||||
trap - INT TERM USR1 # prevent re-entrancy
|
||||
|
||||
# Kill all tracked PIDs
|
||||
for pid in "${PIDS[@]}"; do
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
echo "Killing process $pid"
|
||||
kill "$pid" 2>/dev/null
|
||||
fi
|
||||
done
|
||||
|
||||
# Wait a moment for graceful shutdown
|
||||
sleep 2
|
||||
|
||||
# Force kill any remaining processes
|
||||
for pid in "${PIDS[@]}"; do
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
echo "Force killing process $pid"
|
||||
kill -9 "$pid" 2>/dev/null
|
||||
fi
|
||||
done
|
||||
|
||||
# Kill the entire process group as backup
|
||||
kill -- -$$ 2>/dev/null
|
||||
|
||||
echo "All processes stopped."
|
||||
exit 0
|
||||
}
|
||||
|
||||
trap cleanup INT
|
||||
trap cleanup USR1
|
||||
trap cleanup TERM
|
||||
|
||||
# clear previous cache
|
||||
echo "remove previous ec cache folder"
|
||||
rm -rf $EC_SHARED_STORAGE_PATH
|
||||
|
||||
echo "make ec cache folder"
|
||||
mkdir -p $EC_SHARED_STORAGE_PATH
|
||||
|
||||
###############################################################################
|
||||
# Encoder worker
|
||||
###############################################################################
|
||||
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
|
||||
--gpu-memory-utilization 0.01 \
|
||||
--port "$ENCODE_PORT" \
|
||||
--enforce-eager \
|
||||
--enable-request-id-headers \
|
||||
--no-enable-prefix-caching \
|
||||
--max-num-batched-tokens 114688 \
|
||||
--max-num-seqs 128 \
|
||||
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
|
||||
--ec-transfer-config '{
|
||||
"ec_connector": "ECExampleConnector",
|
||||
"ec_role": "ec_producer",
|
||||
"ec_connector_extra_config": {
|
||||
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
|
||||
}
|
||||
}' \
|
||||
>"${ENC_LOG}" 2>&1 &
|
||||
|
||||
PIDS+=($!)
|
||||
|
||||
###############################################################################
|
||||
# Prefill worker
|
||||
###############################################################################
|
||||
CUDA_VISIBLE_DEVICES="$GPU_P" \
|
||||
UCX_NET_DEVICES=all \
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
|
||||
vllm serve "$MODEL" \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--port "$PREFILL_PORT" \
|
||||
--enforce-eager \
|
||||
--enable-request-id-headers \
|
||||
--max-num-seqs 128 \
|
||||
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
|
||||
--ec-transfer-config '{
|
||||
"ec_connector": "ECExampleConnector",
|
||||
"ec_role": "ec_consumer",
|
||||
"ec_connector_extra_config": {
|
||||
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
|
||||
}
|
||||
}' \
|
||||
--kv-transfer-config '{
|
||||
"kv_connector": "NixlConnector",
|
||||
"kv_role": "kv_producer"
|
||||
}' \
|
||||
>"${P_LOG}" 2>&1 &
|
||||
|
||||
PIDS+=($!)
|
||||
|
||||
###############################################################################
|
||||
# Decode worker
|
||||
###############################################################################
|
||||
CUDA_VISIBLE_DEVICES="$GPU_D" \
|
||||
UCX_NET_DEVICES=all \
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
|
||||
vllm serve "$MODEL" \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--port "$DECODE_PORT" \
|
||||
--enforce-eager \
|
||||
--enable-request-id-headers \
|
||||
--max-num-seqs 128 \
|
||||
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
|
||||
--kv-transfer-config '{
|
||||
"kv_connector": "NixlConnector",
|
||||
"kv_role": "kv_consumer"
|
||||
}' \
|
||||
>"${D_LOG}" 2>&1 &
|
||||
|
||||
PIDS+=($!)
|
||||
|
||||
# Wait for workers
|
||||
wait_for_server $ENCODE_PORT
|
||||
wait_for_server $PREFILL_PORT
|
||||
wait_for_server $DECODE_PORT
|
||||
|
||||
###############################################################################
|
||||
# Proxy
|
||||
###############################################################################
|
||||
python disagg_epd_proxy.py \
|
||||
--host "0.0.0.0" \
|
||||
--port "$PROXY_PORT" \
|
||||
--encode-servers-urls "http://localhost:$ENCODE_PORT" \
|
||||
--prefill-servers-urls "http://localhost:$PREFILL_PORT" \
|
||||
--decode-servers-urls "http://localhost:$DECODE_PORT" \
|
||||
>"${PROXY_LOG}" 2>&1 &
|
||||
|
||||
PIDS+=($!)
|
||||
|
||||
wait_for_server $PROXY_PORT
|
||||
echo "All services are up!"
|
||||
|
||||
###############################################################################
|
||||
# Benchmark
|
||||
###############################################################################
|
||||
echo "Running benchmark (stream)..."
|
||||
vllm bench serve \
|
||||
--model $MODEL \
|
||||
--backend openai-chat \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||
--seed 0 \
|
||||
--num-prompts $NUM_PROMPTS \
|
||||
--port $PROXY_PORT
|
||||
|
||||
PIDS+=($!)
|
||||
|
||||
###############################################################################
|
||||
# Single request with local image
|
||||
###############################################################################
|
||||
echo "Running single request with local image (non-stream)..."
|
||||
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "'${MODEL}'",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": [
|
||||
{"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
|
||||
{"type": "text", "text": "What is in this image?"}
|
||||
]}
|
||||
]
|
||||
}'
|
||||
|
||||
|
||||
# cleanup
|
||||
echo "cleanup..."
|
||||
cleanup
|
||||
@@ -0,0 +1,186 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
declare -a PIDS=()
|
||||
|
||||
###############################################################################
|
||||
# Configuration -- override via env before running
|
||||
###############################################################################
|
||||
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
|
||||
LOG_PATH="${LOG_PATH:-./logs}"
|
||||
mkdir -p $LOG_PATH
|
||||
|
||||
ENCODE_PORT="${ENCODE_PORT:-19534}"
|
||||
PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
|
||||
PROXY_PORT="${PROXY_PORT:-10001}"
|
||||
|
||||
GPU_E="${GPU_E:-0}"
|
||||
GPU_PD="${GPU_PD:-1}"
|
||||
|
||||
EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
|
||||
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout
|
||||
|
||||
NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark
|
||||
|
||||
###############################################################################
|
||||
# Helpers
|
||||
###############################################################################
|
||||
# Find the git repository root directory
|
||||
GIT_ROOT=$(git rev-parse --show-toplevel)
|
||||
|
||||
START_TIME=$(date +"%Y%m%d_%H%M%S")
|
||||
ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log
|
||||
PD_LOG=$LOG_PATH/pd_${START_TIME}.log
|
||||
PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
timeout "$TIMEOUT_SECONDS" bash -c "
|
||||
until curl -s localhost:$port/v1/chat/completions > /dev/null; do
|
||||
sleep 1
|
||||
done" && return 0 || return 1
|
||||
}
|
||||
|
||||
# Cleanup function
|
||||
cleanup() {
|
||||
echo "Stopping everything…"
|
||||
trap - INT TERM USR1 # prevent re-entrancy
|
||||
|
||||
# Kill all tracked PIDs
|
||||
for pid in "${PIDS[@]}"; do
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
echo "Killing process $pid"
|
||||
kill "$pid" 2>/dev/null
|
||||
fi
|
||||
done
|
||||
|
||||
# Wait a moment for graceful shutdown
|
||||
sleep 2
|
||||
|
||||
# Force kill any remaining processes
|
||||
for pid in "${PIDS[@]}"; do
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
echo "Force killing process $pid"
|
||||
kill -9 "$pid" 2>/dev/null
|
||||
fi
|
||||
done
|
||||
|
||||
# Kill the entire process group as backup
|
||||
kill -- -$$ 2>/dev/null
|
||||
|
||||
echo "All processes stopped."
|
||||
exit 0
|
||||
}
|
||||
|
||||
trap cleanup INT
|
||||
trap cleanup USR1
|
||||
trap cleanup TERM
|
||||
|
||||
# clear previous cache
|
||||
echo "remove previous ec cache folder"
|
||||
rm -rf $EC_SHARED_STORAGE_PATH
|
||||
|
||||
echo "make ec cache folder"
|
||||
mkdir -p $EC_SHARED_STORAGE_PATH
|
||||
|
||||
###############################################################################
|
||||
# Encoder worker
|
||||
###############################################################################
|
||||
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
|
||||
--gpu-memory-utilization 0.01 \
|
||||
--port "$ENCODE_PORT" \
|
||||
--enforce-eager \
|
||||
--enable-request-id-headers \
|
||||
--no-enable-prefix-caching \
|
||||
--max-num-batched-tokens 114688 \
|
||||
--max-num-seqs 128 \
|
||||
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
|
||||
--ec-transfer-config '{
|
||||
"ec_connector": "ECExampleConnector",
|
||||
"ec_role": "ec_producer",
|
||||
"ec_connector_extra_config": {
|
||||
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
|
||||
}
|
||||
}' \
|
||||
>"${ENC_LOG}" 2>&1 &
|
||||
|
||||
PIDS+=($!)
|
||||
|
||||
###############################################################################
|
||||
# Prefill+Decode worker
|
||||
###############################################################################
|
||||
CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--port "$PREFILL_DECODE_PORT" \
|
||||
--enforce-eager \
|
||||
--enable-request-id-headers \
|
||||
--max-num-seqs 128 \
|
||||
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
|
||||
--ec-transfer-config '{
|
||||
"ec_connector": "ECExampleConnector",
|
||||
"ec_role": "ec_consumer",
|
||||
"ec_connector_extra_config": {
|
||||
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
|
||||
}
|
||||
}' \
|
||||
>"${PD_LOG}" 2>&1 &
|
||||
|
||||
PIDS+=($!)
|
||||
|
||||
# Wait for workers
|
||||
wait_for_server $ENCODE_PORT
|
||||
wait_for_server $PREFILL_DECODE_PORT
|
||||
|
||||
###############################################################################
|
||||
# Proxy
|
||||
###############################################################################
|
||||
python disagg_epd_proxy.py \
|
||||
--host "0.0.0.0" \
|
||||
--port "$PROXY_PORT" \
|
||||
--encode-servers-urls "http://localhost:$ENCODE_PORT" \
|
||||
--prefill-servers-urls "disable" \
|
||||
--decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
|
||||
>"${PROXY_LOG}" 2>&1 &
|
||||
|
||||
PIDS+=($!)
|
||||
|
||||
wait_for_server $PROXY_PORT
|
||||
echo "All services are up!"
|
||||
|
||||
###############################################################################
|
||||
# Benchmark
|
||||
###############################################################################
|
||||
echo "Running benchmark (stream)..."
|
||||
vllm bench serve \
|
||||
--model $MODEL \
|
||||
--backend openai-chat \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||
--seed 0 \
|
||||
--num-prompts $NUM_PROMPTS \
|
||||
--port $PROXY_PORT
|
||||
|
||||
PIDS+=($!)
|
||||
|
||||
###############################################################################
|
||||
# Single request with local image
|
||||
###############################################################################
|
||||
echo "Running single request with local image (non-stream)..."
|
||||
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "'${MODEL}'",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": [
|
||||
{"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
|
||||
{"type": "text", "text": "What is in this image?"}
|
||||
]}
|
||||
]
|
||||
}'
|
||||
|
||||
|
||||
# cleanup
|
||||
echo "cleanup..."
|
||||
cleanup
|
||||
@@ -0,0 +1,606 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
disagg_encoder_proxy.py
|
||||
|
||||
Proxy that routes OpenAI-compatible “/v1/chat/completions” requests to two
|
||||
clusters:
|
||||
• encode (multimodal feature extraction)
|
||||
• decode (language-model inference)
|
||||
|
||||
For MM input we:
|
||||
1. Extract *every* image/audio item.
|
||||
2. Fire N concurrent requests to the encoder cluster
|
||||
(one request per item, with **all text removed**).
|
||||
3. Wait for all of them to succeed.
|
||||
4. Forward the *original* request to a decode server.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import uuid
|
||||
from collections.abc import AsyncIterator
|
||||
|
||||
import aiohttp
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
###############################################################################
|
||||
# FastAPI app & global state
|
||||
###############################################################################
|
||||
|
||||
# Root logging config for the proxy; DEBUG is intentionally verbose for demos.
logging.basicConfig(
    level=logging.DEBUG, format="%(asctime)s %(levelname)s: %(message)s"
)
logger = logging.getLogger("proxy")

app = FastAPI()
# Shared aiohttp sessions, created in on_startup() and closed in on_shutdown().
# `prefill_session` stays None when no prefill servers are configured.
encode_session: aiohttp.ClientSession | None = None
prefill_session: aiohttp.ClientSession | None = None
decode_session: aiohttp.ClientSession | None = None
|
||||
|
||||
###############################################################################
|
||||
# Utils
|
||||
###############################################################################
|
||||
|
||||
|
||||
# Multimodal content-part types that must be routed to the encode cluster.
MM_TYPES = {"image_url", "audio_url", "input_audio"}


def extract_mm_items(request_data: dict) -> list[dict]:
    """Collect every image/audio content part found anywhere in `messages`.

    Each returned dict is a raw OpenAI content part, e.g.
    ``{"type": "image_url", "image_url": {...}}``.
    """
    # Plain-string contents carry no MM parts; only list-form contents
    # are scanned.
    return [
        part
        for message in request_data.get("messages", [])
        if isinstance(message.get("content"), list)
        for part in message["content"]
        if part.get("type") in MM_TYPES
    ]
|
||||
|
||||
|
||||
async def fanout_encoder_primer(
    orig_request: dict,
    e_urls: list[str],
    req_id: str,
) -> None:
    """Prime the encode cluster for every multimodal item in the request.

    1. Build one request *per MM item* with all text removed.
    2. Send them concurrently to the encode cluster (round-robin over URLs).
    3. Raise HTTPException if any of them fails.

    Fix vs. original: responses are now always released in a ``finally``
    block, so pooled connections go back to `encode_session` — the original
    never released them, leaking a connection per MM item.
    """
    logger.info("[%s] Processing multimodal items...", req_id)

    mm_items = extract_mm_items(orig_request)
    if not mm_items:
        logger.info("[%s] No multimodal items, skipping encoder", req_id)
        return  # nothing to do

    logger.info("[%s] got %d multimodal items...", req_id, len(mm_items))

    tasks = []

    # Round-robin over encode servers to distribute load a bit
    url_cycle = (e_urls[i % len(e_urls)] for i in range(len(mm_items)))

    for idx, (item, target_url) in enumerate(zip(mm_items, url_cycle)):
        # Derive a *child* request id: <parent>:<index>:<random-short>
        child_req_id = f"{req_id}:{idx}:{uuid.uuid4().hex[:6]}"
        headers = {"x-request-id": child_req_id}

        encoder_req = {
            # You *may* need to keep additional fields
            "model": orig_request.get("model"),
            "messages": [
                {"role": "user", "content": [item]},
            ],
            # Only need 1 token so the server actually runs the encoder path
            "max_tokens": 1,
            "stream": False,
        }
        tasks.append(
            encode_session.post(
                f"{target_url}/v1/chat/completions",
                json=encoder_req,
                headers=headers,
            )
        )

    results = await asyncio.gather(*tasks, return_exceptions=True)

    try:
        # Fail fast if any sub-request failed
        for idx, r in enumerate(results):
            if isinstance(r, Exception):
                logger.error(
                    "[%s] Encoder request #%d raised exception: %s",
                    req_id,
                    idx,
                    r,
                    exc_info=r,
                )
                raise HTTPException(
                    status_code=502, detail=f"Encoder request failed: {str(r)}"
                )
            if r.status != 200:
                try:
                    detail = await r.text()
                except Exception:
                    detail = "<unable to read body>"
                logger.error(
                    "[%s] Encoder request #%d returned status %s: %s",
                    req_id,
                    idx,
                    r.status,
                    detail,
                )
                raise HTTPException(
                    status_code=r.status,
                    detail=f"Encoder request failed: {detail}",
                )
    finally:
        # Return every successfully-opened response's connection to the
        # pool, whether or not we are raising above.
        for r in results:
            if not isinstance(r, Exception):
                r.release()

    logger.info(
        "[%s] All %d encoder requests completed successfully", req_id, len(mm_items)
    )
|
||||
|
||||
|
||||
async def maybe_prefill(
    req_data: dict,
    p_url: str,
    req_id: str,
) -> dict:
    """Run the prefill-only stage when a prefill URL is configured.

    With `p_url` set, the request goes through the prefill server first and
    any returned `kv_transfer_params` (consumed by the nixl connector) are
    merged into the request that will later hit decode. Without a prefill
    server the request is returned untouched.
    """
    if not p_url:
        return req_data

    logger.info("[%s] Processing through prefill: %s", req_id, p_url)

    prefill_response = await process_prefill_stage(req_data, p_url, req_id)
    # for nixl connector to facilitate kv transfer...
    payload = await prefill_response.json()
    kv_transfer_params = payload.get("kv_transfer_params", {})
    if kv_transfer_params:
        req_data["kv_transfer_params"] = kv_transfer_params

    return req_data
|
||||
|
||||
|
||||
async def process_prefill_stage(
    req_data: dict,
    p_url: str,
    req_id: str,
) -> "aiohttp.ClientResponse":
    """Run the request through the prefill server and return its response.

    The request is forced to non-streaming, 1-token generation and annotated
    with kv-transfer bootstrap params so the prefill server performs only the
    KV-producing pass.

    Fixes vs. original:
    - `raise_for_status()` fired before the explicit `!= 200` check, so
      4xx/5xx upstream errors surfaced as a generic 500 instead of their
      real status; the status is now checked directly.
    - The inner `HTTPException` was caught by the broad `except Exception`
      and re-wrapped as a 500; it is now re-raised untouched.
    - The return annotation said `dict` but the function returns the
      aiohttp response object.

    Raises:
        HTTPException: with the upstream status for a non-200 prefill
            response, or 500 for transport-level failures.
    """
    logger.info("[%s] Sending prefill request to: %s", req_id, p_url)

    prefill_request = req_data.copy()
    prefill_request["kv_transfer_params"] = {
        "do_remote_decode": True,
        "do_remote_prefill": False,
        "remote_engine_id": None,
        "remote_block_ids": None,
        "remote_host": None,
        "remote_port": None,
    }
    prefill_request["stream"] = False
    prefill_request["max_tokens"] = 1
    if "max_completion_tokens" in prefill_request:
        prefill_request["max_completion_tokens"] = 1
    if "stream_options" in prefill_request:
        del prefill_request["stream_options"]

    headers = {"x-request-id": req_id}
    try:
        prefill_response = await prefill_session.post(
            f"{p_url}/v1/chat/completions", json=prefill_request, headers=headers
        )

        if prefill_response.status != 200:
            error_text = await prefill_response.text()
            logger.error(
                "[%s] Prefill request failed with status %d: %s",
                req_id,
                prefill_response.status,
                error_text,
            )
            raise HTTPException(
                status_code=prefill_response.status,
                detail={"error": "Prefill request failed", "message": error_text},
            )
        logger.info("[%s] Prefill request completed successfully", req_id)

        return prefill_response

    except HTTPException:
        # Already a well-formed proxy error; do not re-wrap it as a 500.
        raise
    except Exception as e:
        logger.error("Prefill processing failed: %s", str(e))
        raise HTTPException(
            status_code=500,
            detail={"error": "Prefill processing error", "message": str(e)},
        ) from e
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Middleware for request/response logging
|
||||
###############################################################################
|
||||
|
||||
|
||||
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Middleware that logs each request, its response status, and failures."""
    req_id = request.headers.get("x-request-id", str(uuid.uuid4()))
    client_host = request.client.host if request.client else "unknown"

    # Inbound line: method, path, and originating client.
    logger.info(
        ">>> [%s] %s %s from %s",
        req_id,
        request.method,
        request.url.path,
        client_host,
    )

    try:
        response = await call_next(request)
    except Exception as e:
        # Surface the failure in the log, then let FastAPI handle it.
        logger.exception(
            "!!! [%s] %s %s failed with error: %s",
            req_id,
            request.method,
            request.url.path,
            str(e),
        )
        raise

    # Outbound line with the final status code.
    logger.info(
        "<<< [%s] %s %s completed with status %d",
        req_id,
        request.method,
        request.url.path,
        response.status_code,
    )
    return response
|
||||
|
||||
|
||||
###############################################################################
|
||||
# FastAPI lifecycle
|
||||
###############################################################################
|
||||
|
||||
|
||||
@app.on_event("startup")
async def on_startup() -> None:
    """Create the shared aiohttp sessions for the encode/prefill/decode clusters.

    NOTE(review): `@app.on_event` is deprecated in recent FastAPI in favor of
    lifespan handlers; kept as-is here.
    """
    global encode_session, prefill_session, decode_session
    # Very generous total timeout: proxied generation requests can run long.
    timeout = aiohttp.ClientTimeout(total=100_000)
    # limit=0 -> unlimited pooled connections; keep-alive left enabled.
    connector = aiohttp.TCPConnector(limit=0, force_close=False)
    encode_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
    if app.state.p_urls:
        # only setup if prefill instance(s) exist
        prefill_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
    decode_session = aiohttp.ClientSession(timeout=timeout, connector=connector)
|
||||
|
||||
|
||||
@app.on_event("shutdown")
async def on_shutdown() -> None:
    """Close whichever shared aiohttp sessions were created at startup."""
    global encode_session, prefill_session, decode_session
    if encode_session:
        await encode_session.close()
    # prefill_session is None when prefill is disabled; guard each close.
    if prefill_session:
        await prefill_session.close()
    if decode_session:
        await decode_session.close()
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Core forwarding
|
||||
###############################################################################
|
||||
|
||||
|
||||
async def forward_non_stream(
    req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str
) -> dict:
    """Run encode -> (optional) prefill -> decode; return the decode JSON body."""
    try:
        # Stage 1: prime the encoder cluster when the request has MM input.
        await fanout_encoder_primer(req_data, e_urls, req_id)

        # Stage 2: optional prefill pass (may inject kv_transfer_params).
        req_data = await maybe_prefill(req_data, p_url, req_id)

        # Stage 3: final decode pass, returned verbatim to the caller.
        logger.info("[%s] Forwarding to decode: %s", req_id, d_url)

        async with decode_session.post(
            f"{d_url}/v1/chat/completions",
            json=req_data,
            headers={"x-request-id": req_id},
        ) as resp:
            resp.raise_for_status()
            return await resp.json()

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("[%s] Error in forward_non_stream: %s", req_id, str(e))
        raise HTTPException(status_code=500, detail=f"Proxy error: {str(e)}") from e
|
||||
|
||||
|
||||
async def forward_stream(
    req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str
) -> AsyncIterator[str]:
    """Yield the decode server's SSE stream after the encode/prefill stages.

    NOTE(review): once StreamingResponse has begun sending, an HTTPException
    raised from inside this generator cannot change the HTTP status the
    client already received — it only aborts the stream.
    """
    try:
        # Step 1: Process through Encoder instance (if has MM input)
        await fanout_encoder_primer(req_data, e_urls, req_id)

        # Step 2: Process through Prefill instance
        req_data = await maybe_prefill(req_data, p_url, req_id)

        # Step 3: Process through Decode instance
        logger.info("[%s] Starting streaming from decode: %s", req_id, d_url)
        headers = {"x-request-id": req_id}

        # Streaming response: relay raw SSE bytes in fixed-size chunks.
        async with decode_session.post(
            f"{d_url}/v1/chat/completions",
            json=req_data,
            headers=headers,
        ) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024):
                if chunk:
                    yield chunk.decode("utf-8", errors="ignore")

        logger.info("[%s] Streaming completed", req_id)

    except HTTPException:
        logger.exception("[%s] HTTPException in forward_stream", req_id)
        raise
    except Exception as e:
        logger.exception("[%s] Error in forward_stream: %s", req_id, str(e))
        raise HTTPException(
            status_code=500, detail=f"Proxy streaming error: {str(e)}"
        ) from e
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Public routes
|
||||
###############################################################################
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Main proxy endpoint: dispatch to the streaming or non-streaming path."""
    try:
        req_data = await request.json()
        req_id = request.headers.get("x-request-id", str(uuid.uuid4()))

        e_urls = app.state.e_urls  # full list — the encoder stage fans out
        p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
        d_url = random.choice(app.state.d_urls)

        if not req_data.get("stream", False):
            result = await forward_non_stream(req_data, req_id, e_urls, p_url, d_url)
            return JSONResponse(content=result)

        return StreamingResponse(
            forward_stream(req_data, req_id, e_urls, p_url, d_url),
            media_type="text/event-stream",
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error in chat_completions endpoint: %s", str(e))
        raise HTTPException(
            status_code=500, detail=f"Request processing error: {str(e)}"
        ) from e
|
||||
|
||||
|
||||
@app.get("/v1/models")
async def list_models():
    """Proxy /v1/models straight through to the first decode server."""
    url = f"{app.state.d_urls[0]}/v1/models"
    async with decode_session.get(url) as resp:
        resp.raise_for_status()
        return await resp.json()
|
||||
|
||||
|
||||
@app.get("/health")
async def health_check():
    """Report per-cluster health; 503 when any configured cluster is down."""

    async def cluster_status(urls):
        # An unconfigured cluster (e.g. no prefill servers) reports "empty".
        if not urls:
            return "empty"
        for url in urls:
            try:
                async with encode_session.get(f"{url}/health") as resp:
                    resp.raise_for_status()
            except Exception:
                return "unhealthy"
        return "healthy"

    e_status, p_status, d_status = await asyncio.gather(
        cluster_status(app.state.e_urls),
        cluster_status(app.state.p_urls),
        cluster_status(app.state.d_urls),
    )

    # "empty" clusters do not count against overall health.
    overall_healthy = "unhealthy" not in (e_status, p_status, d_status)

    return JSONResponse(
        {
            "proxy": "healthy",
            "encode_cluster": e_status,
            "prefill_cluster": p_status,
            "decode_cluster": d_status,
        },
        status_code=200 if overall_healthy else 503,
    )
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Simple profiler fan-out (unchanged except for sessions)
|
||||
###############################################################################
|
||||
|
||||
|
||||
async def _post_if_available(
    session: aiohttp.ClientSession,
    url: str,
    payload: dict,
    headers: dict,
) -> dict | None:
    """
    POST `payload` to `url`.

    Returns
    -------
    • The decoded JSON body on success (2xx)
    • None if the endpoint does not exist (404)
    • Raises for anything else (network errors propagate unchanged).

    Fixes vs. original: the 404 branch now releases the response so the
    pooled connection is returned, and the no-op ``except Exception: raise``
    tail has been removed (re-raising unchanged is the default behavior).
    """
    try:
        resp = await session.post(url, json=payload, headers=headers)
        if resp.status == 404:  # profiling disabled on that server
            logger.warning("Profiling endpoint missing on %s", url)
            # Return the unread connection to the pool before bailing out.
            resp.release()
            return None
        resp.raise_for_status()
        return await resp.json(content_type=None)
    except aiohttp.ClientResponseError as exc:
        # Pass 404 through the branch above, re-raise everything else
        if exc.status == 404:
            logger.warning("Profiling endpoint missing on %s", url)
            return None
        raise
|
||||
|
||||
|
||||
async def _profile_cmd(cmd: str, payload: dict, e_url: str, p_url: str, d_url: str):
    """Fan `<cmd>_profile` out to the encode/prefill/decode clusters, tolerating 404s."""
    headers = {"Authorization": f"Bearer {os.getenv('OPENAI_API_KEY', '')}"}

    def fire(session, base_url):
        # One profiler POST per cluster; _post_if_available maps 404 -> None.
        return _post_if_available(session, f"{base_url}/{cmd}_profile", payload, headers)

    encode_task = fire(encode_session, e_url)
    # No prefill cluster -> substitute an awaitable no-op so gather lines up.
    prefill_task = (
        fire(prefill_session, p_url) if p_url is not None else asyncio.sleep(0)
    )
    decode_task = fire(decode_session, d_url)

    encode_res, prefill_res, decode_res = await asyncio.gather(
        encode_task, prefill_task, decode_task
    )

    # If *all* clusters lack the route, surface an error to the caller.
    if encode_res is prefill_res is decode_res is None:
        raise HTTPException(
            status_code=503,
            detail="Profiling endpoints are disabled on all clusters",
        )

    return {
        "encode": encode_res,  # may be None
        "prefill": prefill_res,  # may be None
        "decode": decode_res,  # may be None
    }
|
||||
|
||||
|
||||
@app.post("/start_profile")
async def start_profile(request: Request):
    """Start profiling on one randomly-chosen server from each cluster."""
    body = await request.json()
    # TODO: handle multi urls properly
    p_urls = app.state.p_urls
    return await _profile_cmd(
        "start",
        body,
        random.choice(app.state.e_urls),
        random.choice(p_urls) if p_urls else None,
        random.choice(app.state.d_urls),
    )
|
||||
|
||||
|
||||
@app.post("/stop_profile")
async def stop_profile(request: Request):
    """Stop profiling on one randomly-chosen server from each cluster."""
    body = await request.json()
    # TODO: handle multi urls properly
    p_urls = app.state.p_urls
    return await _profile_cmd(
        "stop",
        body,
        random.choice(app.state.e_urls),
        random.choice(p_urls) if p_urls else None,
        random.choice(app.state.d_urls),
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--encode-servers-urls",
        required=True,
        help='Comma-separated encode URLs ("http://e1:8001,http://e2:8001")',
    )
    parser.add_argument(
        "--prefill-servers-urls",
        required=True,
        # Implicit string concatenation (no comma). The original passed a
        # 2-tuple here, which argparse would render incorrectly in --help.
        help=(
            'Comma-separated prefill URLs ("http://p1:8003,http://p2:8004") '
            'to enable E->P->D, set "disable" or "none" to enable E->PD'
        ),
    )
    parser.add_argument(
        "--decode-servers-urls",
        required=True,
        help='Comma-separated decode URLs ("http://d1:8005,http://d2:8006")',
    )

    args = parser.parse_args()
    # Parse comma-separated URL lists, dropping empty entries.
    app.state.e_urls = [
        u.strip() for u in args.encode_servers_urls.split(",") if u.strip()
    ]
    app.state.d_urls = [
        u.strip() for u in args.decode_servers_urls.split(",") if u.strip()
    ]
    # handle prefill instances
    if args.prefill_servers_urls.lower() in ("disable", "none", ""):
        app.state.p_urls = []
        logger.info(
            "Disaggregated prefill phase explicitly disabled by user. Running E + PD..."
        )
    else:
        app.state.p_urls = [
            u.strip() for u in args.prefill_servers_urls.split(",") if u.strip()
        ]
        logger.info("Disaggregated prefill phase is enabled. Running E + P + D...")

    logger.info("Proxy listening on %s:%s", args.host, args.port)
    logger.info("Encode servers: %s", app.state.e_urls)
    logger.info("Prefill instances %s", app.state.p_urls)
    logger.info("Decode servers: %s", app.state.d_urls)

    uvicorn.run(
        app,
        host=args.host,
        port=args.port,
        log_level="info",
        loop="uvloop",
        access_log=True,
    )
|
||||
125
examples/online_serving/disaggregated_prefill.sh
Normal file
125
examples/online_serving/disaggregated_prefill.sh
Normal file
@@ -0,0 +1,125 @@
|
||||
#!/bin/bash
# This file demonstrates the example usage of disaggregated prefilling
# We will launch 2 vllm instances (1 for prefill and 1 for decode),
# and then transfer the KV cache between them.

# -x: trace every command; -e: abort the demo on the first failure.
set -xe

echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1

# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}

# Trap the SIGINT signal (triggered by Ctrl+C) so cleanup always runs.
trap 'cleanup' INT
|
||||
# Cleanup function: tear down every python process started by this demo.
# NOTE(review): pgrep/pkill match ALL python processes on the host, not just
# the ones launched here — do not run this script on a shared machine.
cleanup() {
    echo "Caught Ctrl+C, cleaning up..."
    # Cleanup commands
    pgrep python | xargs kill -9
    pkill -f python
    echo "Cleanup complete. Exiting."
    exit 0
}
|
||||
|
||||
|
||||
# Default to loopback unless the caller exported a reachable host IP.
if [[ -z "${VLLM_HOST_IP:-}" ]]; then
    export VLLM_HOST_IP=127.0.0.1
    echo "Using default VLLM_HOST_IP=127.0.0.1 (override by exporting VLLM_HOST_IP before running this script)"
else
    echo "Using provided VLLM_HOST_IP=${VLLM_HOST_IP}"
fi

# install quart first -- required for disagg prefill proxy serve
if python3 -c "import quart" &> /dev/null; then
    echo "Quart is already installed."
else
    echo "Quart is not installed. Installing..."
    python3 -m pip install quart
fi
|
||||
|
||||
# a function that waits vLLM server to start:
# blocks until /v1/models on localhost:$1 answers; returns 0 on success,
# 1 if the 1200-second timeout expires first.
wait_for_server() {
    local port=$1
    timeout 1200 bash -c "
        until curl -i localhost:${port}/v1/models > /dev/null; do
            sleep 1
        done" && return 0 || return 1
}
|
||||
|
||||
|
||||
# You can also adjust --kv-ip and --kv-port for distributed inference.

# prefilling instance, which is the KV producer (kv_rank 0, P2pNcclConnector)
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
    --host 0.0.0.0 \
    --port 8100 \
    --max-model-len 100 \
    --gpu-memory-utilization 0.8 \
    --trust-remote-code \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' &

# decoding instance, which is the KV consumer (kv_rank 1)
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
    --host 0.0.0.0 \
    --port 8200 \
    --max-model-len 100 \
    --gpu-memory-utilization 0.8 \
    --trust-remote-code \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":"1e10","kv_port":"14580","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8200","send_type":"PUT_ASYNC"}}' &

# wait until prefill and decode instances are ready
wait_for_server 8100
wait_for_server 8200

# launch a proxy server that opens the service at port 8000
# the workflow of this proxy:
# - send the request to prefill vLLM instance (port 8100), change max_tokens
#   to 1
# - after the prefill vLLM finishes prefill, send the request to decode vLLM
#   instance
# NOTE: the usage of this API is subject to change --- in the future we will
# introduce "vllm connect" to connect between prefill and decode instances
python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py &
sleep 1

# serve two example requests
output1=$(curl -X POST -s http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"$MODEL_NAME"'",
        "prompt": "San Francisco is a",
        "max_tokens": 10,
        "temperature": 0
    }')

output2=$(curl -X POST -s http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"$MODEL_NAME"'",
        "prompt": "Santa Clara is a",
        "max_tokens": 10,
        "temperature": 0
    }')


# Cleanup commands
pgrep python | xargs kill -9
pkill -f python

echo ""

sleep 1

# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"

echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""
|
||||
8
examples/online_serving/disaggregated_serving/README.md
Normal file
8
examples/online_serving/disaggregated_serving/README.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# Disaggregated Serving
|
||||
|
||||
This example contains scripts that demonstrate the disaggregated serving features of vLLM.
|
||||
|
||||
## Files
|
||||
|
||||
- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances).
|
||||
- `kv_events.sh` - Demonstrates KV cache event publishing.
|
||||
@@ -0,0 +1,452 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file provides a disaggregated prefilling proxy demo to demonstrate an
|
||||
example usage of XpYd disaggregated prefilling.
|
||||
We can launch multiple vllm instances (2 for prefill and 2 for decode), and
|
||||
launch this proxy demo through:
|
||||
python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \
|
||||
--model $model_name \
|
||||
--prefill localhost:8100 localhost:8101 \
|
||||
--decode localhost:8200 localhost:8201 \
|
||||
--port 8000
|
||||
|
||||
Note: This demo will be removed once the PDController implemented in PR 15343
|
||||
(https://github.com/vllm-project/vllm/pull/15343) supports XpYd.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import ipaddress
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Callable
|
||||
|
||||
import aiohttp
|
||||
import requests
|
||||
import uvicorn
|
||||
from fastapi import APIRouter, Depends, FastAPI, Header, HTTPException, Request, status
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
||||
logger = logging.getLogger()
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
|
||||
class SchedulingPolicy(ABC):
    """Strategy interface for picking the next instance from a cycler."""

    @abstractmethod
    def schedule(self, cycler: itertools.cycle):
        """Return the next instance chosen from `cycler`."""
        raise NotImplementedError("Scheduling Proxy is not set.")
|
||||
|
||||
|
||||
class Proxy:
|
||||
    def __init__(
        self,
        prefill_instances: list[str],
        decode_instances: list[str],
        model: str,
        scheduling_policy: SchedulingPolicy,
        custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
        custom_create_chat_completion: Callable[[Request], StreamingResponse]
        | None = None,
    ):
        """Set up instance pools, round-robin cyclers, and API routes.

        Args:
            prefill_instances: "host:port" strings for prefill servers.
            decode_instances: "host:port" strings for decode servers.
            model: model id every instance must serve (checked when adding).
            scheduling_policy: strategy used to pick the next instance.
            custom_create_completion: optional override handler for
                /v1/completions.
            custom_create_chat_completion: optional override handler for
                /v1/chat/completions.
        """
        self.prefill_instances = prefill_instances
        self.decode_instances = decode_instances
        # Round-robin iterators over the two instance pools.
        self.prefill_cycler = itertools.cycle(prefill_instances)
        self.decode_cycler = itertools.cycle(decode_instances)
        self.model = model
        self.scheduling_policy = scheduling_policy
        self.custom_create_completion = custom_create_completion
        self.custom_create_chat_completion = custom_create_chat_completion
        self.router = APIRouter()
        self.setup_routes()
|
||||
|
||||
    def setup_routes(self):
        """Register completion, status, and admin routes on the router."""
        # Completion endpoints accept only application/json bodies; a custom
        # handler, when provided, replaces the default implementation.
        self.router.post(
            "/v1/completions", dependencies=[Depends(self.validate_json_request)]
        )(
            self.custom_create_completion
            if self.custom_create_completion
            else self.create_completion
        )
        self.router.post(
            "/v1/chat/completions", dependencies=[Depends(self.validate_json_request)]
        )(
            self.custom_create_chat_completion
            if self.custom_create_chat_completion
            else self.create_chat_completion
        )
        self.router.get("/status", response_class=JSONResponse)(self.get_status)
        # Adding instances at runtime requires the admin API key header.
        self.router.post(
            "/instances/add", dependencies=[Depends(self.api_key_authenticate)]
        )(self.add_instance_endpoint)
|
||||
|
||||
async def validate_json_request(self, raw_request: Request):
|
||||
content_type = raw_request.headers.get("content-type", "").lower()
|
||||
if content_type != "application/json":
|
||||
raise HTTPException(
|
||||
status_code=415,
|
||||
detail="Unsupported Media Type: Only 'application/json' is allowed",
|
||||
)
|
||||
|
||||
    def api_key_authenticate(self, x_api_key: str = Header(...)):
        """FastAPI dependency: require x-api-key to match the ADMIN_API_KEY env var.

        Raises:
            HTTPException: 500 when ADMIN_API_KEY is unset on the server,
                403 when the provided key does not match.
        """
        expected_api_key = os.environ.get("ADMIN_API_KEY")
        if not expected_api_key:
            logger.error("ADMIN_API_KEY is not set in the environment.")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Server configuration error.",
            )
        if x_api_key != expected_api_key:
            logger.warning("Unauthorized access attempt with API Key: %s", x_api_key)
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Forbidden: Invalid API Key.",
            )
|
||||
|
||||
async def validate_instance(self, instance: str) -> bool:
|
||||
url = f"http://{instance}/v1/models"
|
||||
try:
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as client:
|
||||
logger.info("Verifying %s ...", instance)
|
||||
async with client.get(url) as response:
|
||||
if response.status == 200:
|
||||
data = await response.json()
|
||||
if "data" in data and len(data["data"]) > 0:
|
||||
model_cur = data["data"][0].get("id", "")
|
||||
if model_cur == self.model:
|
||||
logger.info("Instance: %s could be added.", instance)
|
||||
return True
|
||||
else:
|
||||
logger.warning(
|
||||
"Mismatch model %s : %s != %s",
|
||||
instance,
|
||||
model_cur,
|
||||
self.model,
|
||||
)
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
except aiohttp.ClientError as e:
|
||||
logger.error(str(e))
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(str(e))
|
||||
return False
|
||||
|
||||
async def add_instance_endpoint(self, request: Request):
    """Register a new prefill or decode instance with the proxy.

    Expects a JSON body {"type": "prefill"|"decode", "instance": "host:port"}.
    The address is validated syntactically, then the node is probed via
    validate_instance() before being appended to the matching pool and its
    round-robin cycler rebuilt.

    Raises:
        HTTPException(400) for malformed input, duplicate registration, or
        failed validation; HTTPException(500) for unexpected errors.
    """
    try:
        data = await request.json()
        logger.warning(str(data))
        instance_type = data.get("type")
        instance = data.get("instance")
        if instance_type not in ["prefill", "decode"]:
            raise HTTPException(status_code=400, detail="Invalid instance type.")
        if not instance or ":" not in instance:
            raise HTTPException(status_code=400, detail="Invalid instance format.")

        # Bug fix: partition() never raises on extra colons (split() did,
        # turning "a:b:c" into a 500); the junk lands in port_str and fails
        # int() below with a proper 400.
        host, _, port_str = instance.partition(":")
        try:
            if host != "localhost":
                ipaddress.ip_address(host)
            port = int(port_str)
            if not (0 < port < 65536):
                raise HTTPException(status_code=400, detail="Invalid port number.")
        except HTTPException:
            # Bug fix: previously swallowed by `except Exception` below, so
            # the specific "Invalid port number." detail never surfaced.
            raise
        except Exception as e:
            raise HTTPException(
                status_code=400, detail="Invalid instance address."
            ) from e

        is_valid = await self.validate_instance(instance)
        if not is_valid:
            raise HTTPException(status_code=400, detail="Instance validation failed.")

        # Register with the right pool; duplicated prefill/decode branches
        # collapsed into one path plus a targeted cycler rebuild.
        pool = (
            self.prefill_instances
            if instance_type == "prefill"
            else self.decode_instances
        )
        if instance in pool:
            raise HTTPException(status_code=400, detail="Instance already exists.")
        pool.append(instance)
        if instance_type == "prefill":
            self.prefill_cycler = itertools.cycle(self.prefill_instances)
        else:
            self.decode_cycler = itertools.cycle(self.decode_instances)

        return JSONResponse(
            content={"message": f"Added {instance} to {instance_type}_instances."}
        )
    except HTTPException as http_exc:
        raise http_exc
    except Exception as e:
        logger.error("Error in add_instance_endpoint: %s", str(e))
        raise HTTPException(status_code=500, detail=str(e)) from e
async def forward_request(self, url, data, use_chunked=True):
    """Async generator: POST `data` as JSON to `url` and yield response bytes.

    2xx and 4xx upstream responses are streamed back (chunked or whole-body
    depending on `use_chunked`). Other statuses are logged and surfaced as an
    HTTPException. NOTE(review): that HTTPException is raised inside the try
    and therefore re-wrapped by the final `except Exception` as a 500 — this
    matches the original behavior and is preserved here.
    """
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
        try:
            async with session.post(
                url=url, json=data, headers=headers
            ) as response:
                success = 200 <= response.status < 300
                client_error = 400 <= response.status < 500
                if success or client_error:
                    if not use_chunked:
                        yield await response.read()
                    else:
                        async for chunk_bytes in response.content.iter_chunked(
                            1024
                        ):
                            yield chunk_bytes
                else:
                    error_content = await response.text()
                    try:
                        error_content = json.loads(error_content)
                    except json.JSONDecodeError:
                        pass  # keep the raw text when it is not JSON
                    logger.error(
                        "Request failed with status %s: %s",
                        response.status,
                        error_content,
                    )
                    raise HTTPException(
                        status_code=response.status,
                        detail=f"Request failed with status {response.status}: "
                        f"{error_content}",
                    )
        except aiohttp.ClientError as e:
            logger.error("ClientError occurred: %s", str(e))
            raise HTTPException(
                status_code=502,
                detail="Bad Gateway: Error communicating with upstream server.",
            ) from e
        except Exception as e:
            logger.error("Unexpected error: %s", str(e))
            raise HTTPException(status_code=500, detail=str(e)) from e
def schedule(self, cycler: itertools.cycle) -> str:
    """Pick the next instance address by delegating to the scheduling policy."""
    return self.scheduling_policy.schedule(cycler)
async def get_status(self):
    """Return a snapshot of the registered prefill/decode pools and counts."""
    return {
        "prefill_node_count": len(self.prefill_instances),
        "decode_node_count": len(self.decode_instances),
        "prefill_nodes": self.prefill_instances,
        "decode_nodes": self.decode_instances,
    }
async def create_completion(self, raw_request: Request):
    """Disaggregated /v1/completions handler.

    Runs a prefill pass (max_tokens forced to 1) on a prefill instance to
    populate the KV cache, then streams the real generation from a decode
    instance. An instance failing with an HTTPException is removed from its
    pool before the error propagates.
    """
    try:
        request = await raw_request.json()

        # Prefill-only pass: one token is enough to fill the KV cache.
        kv_prepare_request = request.copy()
        kv_prepare_request["max_tokens"] = 1

        prefill_instance = self.schedule(self.prefill_cycler)
        try:
            async for _ in self.forward_request(
                f"http://{prefill_instance}/v1/completions", kv_prepare_request
            ):
                continue
        except HTTPException:
            self.remove_instance_endpoint("prefill", prefill_instance)
            raise

        # Perform kv recv and decoding stage
        decode_instance = self.schedule(self.decode_cycler)
        try:
            generator = self.forward_request(
                f"http://{decode_instance}/v1/completions", request
            )
        except HTTPException:
            self.remove_instance_endpoint("decode", decode_instance)
            raise
        return StreamingResponse(generator)
    except HTTPException:
        # Bug fix: the blanket handler below used to swallow HTTPException
        # too, printing it and implicitly returning None to the client.
        raise
    except Exception:
        import sys

        exc_info = sys.exc_info()
        print("Error occurred in disagg proxy server")
        print(exc_info)
        # Bug fix: return an error stream (as create_chat_completion does)
        # instead of falling off the end and returning None.
        error_messages = [str(e) for e in exc_info if e]
        return StreamingResponse(
            content=iter(error_messages), media_type="text/event-stream"
        )
async def create_chat_completion(self, raw_request: Request):
    """Disaggregated /v1/chat/completions handler.

    Mirrors create_completion: a prefill pass with all token limits forced
    to 1 fills the KV cache, then the decode instance streams the response.
    Failing instances are evicted from their pool before the error is
    re-raised.
    """
    try:
        request = await raw_request.json()

        # add params to request: force the prefill pass to emit a single token
        kv_prepare_request = request.copy()
        kv_prepare_request["max_tokens"] = 1
        if "max_completion_tokens" in kv_prepare_request:
            kv_prepare_request["max_completion_tokens"] = 1

        # prefill stage
        prefill_instance = self.schedule(self.prefill_cycler)
        try:
            async for _ in self.forward_request(
                f"http://{prefill_instance}/v1/chat/completions", kv_prepare_request
            ):
                continue
        except HTTPException:
            self.remove_instance_endpoint("prefill", prefill_instance)
            raise

        # Perform kv recv and decoding stage
        decode_instance = self.schedule(self.decode_cycler)
        try:
            generator = self.forward_request(
                f"http://{decode_instance}/v1/chat/completions", request
            )
        except HTTPException:
            self.remove_instance_endpoint("decode", decode_instance)
            raise
        return StreamingResponse(content=generator)
    except HTTPException:
        # Bug fix: HTTP errors were previously converted into a 200
        # text/event-stream of error strings by the handler below.
        raise
    except Exception:
        # Bug fix: import sys locally (as create_completion does) — the
        # original referenced `sys` without a visible import in this scope.
        import sys

        exc_info = sys.exc_info()
        error_messages = [str(e) for e in exc_info if e]
        print("Error occurred in disagg proxy server")
        print(error_messages)
        return StreamingResponse(
            content=iter(error_messages), media_type="text/event-stream"
        )
def remove_instance_endpoint(self, instance_type, instance):
    """Remove an instance from its pool and rebuild that pool's cycler.

    Args:
        instance_type: "prefill" or "decode"; anything else is a no-op.
        instance: "host:port" address; unknown addresses are ignored.
    """
    if instance_type == "decode" and instance in self.decode_instances:
        self.decode_instances.remove(instance)
        self.decode_cycler = itertools.cycle(self.decode_instances)
    # Bug fix: this branch previously checked membership in (and rebuilt the
    # cycler from) self.decode_instances, so prefill nodes were never removed.
    if instance_type == "prefill" and instance in self.prefill_instances:
        self.prefill_instances.remove(instance)
        self.prefill_cycler = itertools.cycle(self.prefill_instances)
class RoundRobinSchedulingPolicy(SchedulingPolicy):
    """Scheduling policy that hands out instances in strict rotation.

    Removed the boilerplate __init__ that only called super().__init__();
    the inherited constructor behaves identically.
    """

    def schedule(self, cycler: itertools.cycle) -> str:
        """Return the next instance address from the shared cycler."""
        return next(cycler)
class ProxyServer:
    """Validates CLI configuration, builds the Proxy, and runs uvicorn."""

    def __init__(
        self,
        args: argparse.Namespace,
        scheduling_policy: SchedulingPolicy | None = None,
        create_completion: Callable[[Request], StreamingResponse] | None = None,
        create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
    ):
        """Validate `args` (raising ValueError on bad config) and build the
        Proxy with the given or default round-robin scheduling policy."""
        self.validate_parsed_serve_args(args)
        self.port = args.port
        self.proxy_instance = Proxy(
            prefill_instances=[] if args.prefill is None else args.prefill,
            decode_instances=[] if args.decode is None else args.decode,
            model=args.model,
            scheduling_policy=(
                scheduling_policy
                if scheduling_policy is not None
                else RoundRobinSchedulingPolicy()
            ),
            custom_create_completion=create_completion,
            custom_create_chat_completion=create_chat_completion,
        )

    def validate_parsed_serve_args(self, args: argparse.Namespace):
        """Ensure both node lists exist, are well-formed, and serve the model."""
        if not args.prefill:
            raise ValueError("Please specify at least one prefill node.")
        if not args.decode:
            raise ValueError("Please specify at least one decode node.")
        self.validate_instances(args.prefill)
        self.validate_instances(args.decode)
        self.verify_model_config(args.prefill, args.model)
        self.verify_model_config(args.decode, args.model)

    def validate_instances(self, instances: list):
        """Syntactically validate each "host:port" entry.

        Raises ValueError on a malformed entry; "localhost" is accepted
        without IP parsing.
        """
        for instance in instances:
            if len(instance.split(":")) != 2:
                raise ValueError(f"Invalid instance format: {instance}")
            host, port_str = instance.split(":")
            try:
                if host != "localhost":
                    ipaddress.ip_address(host)
                port = int(port_str)
                if not (0 < port < 65536):
                    raise ValueError(f"Invalid port number in instance: {instance}")
            except Exception as e:
                raise ValueError(f"Invalid instance {instance}: {str(e)}") from e

    def verify_model_config(self, instances: list, model: str) -> None:
        """Check each node's /v1/models reports the expected model.

        Only the path suffix after the last "/" is compared, so a local
        snapshot path and a hub id of the same model match.
        """
        model_suffix = model.split("/")[-1]
        for instance in instances:
            try:
                # Bug fix: requests.get had no timeout, so startup could hang
                # forever on an unresponsive node.
                response = requests.get(f"http://{instance}/v1/models", timeout=10)
                if response.status_code == 200:
                    model_cur = response.json()["data"][0]["id"]
                    model_cur_suffix = model_cur.split("/")[-1]
                    if model_cur_suffix != model_suffix:
                        raise ValueError(
                            f"{instance} serves a different model: "
                            f"{model_cur} != {model}"
                        )
                else:
                    raise ValueError(f"Cannot get model id from {instance}!")
            except requests.RequestException as e:
                raise ValueError(
                    f"Error communicating with {instance}: {str(e)}"
                ) from e

    def run_server(self):
        """Mount the proxy router on a fresh FastAPI app and serve it."""
        app = FastAPI()
        app.include_router(self.proxy_instance.router)
        config = uvicorn.Config(app, port=self.port, loop="uvloop")
        server = uvicorn.Server(config)
        server.run()
def parse_args():
    """Parse the proxy's command-line arguments.

    Returns an argparse.Namespace with `model` (required), optional
    `prefill`/`decode` node lists, and `port` (default 8000).
    """
    # Todo: allow more config
    parser = argparse.ArgumentParser("vLLM disaggregated proxy server.")
    parser.add_argument("--model", "-m", type=str, required=True, help="Model name")
    parser.add_argument(
        "--prefill",
        "-p",
        type=str,
        nargs="+",
        help="List of prefill node URLs (host:port)",
    )
    parser.add_argument(
        "--decode",
        "-d",
        type=str,
        nargs="+",
        help="List of decode node URLs (host:port)",
    )
    parser.add_argument("--port", type=int, default=8000, help="Server port number")
    return parser.parse_args()
if __name__ == "__main__":
    # Script entry point: parse CLI config and block in the uvicorn server.
    proxy_server = ProxyServer(args=parse_args())
    proxy_server.run_server()
86
examples/online_serving/disaggregated_serving/kv_events.sh
Normal file
86
examples/online_serving/disaggregated_serving/kv_events.sh
Normal file
@@ -0,0 +1,86 @@
|
||||
#!/bin/bash
# This file demonstrates the KV cache event publishing
# We will launch a vllm instances configured to publish KV cache
# events and launch a simple subscriber to log those events.

set -xe

echo "🚧🚧 Warning: The usage of KV cache events is experimental and subject to change 🚧🚧"
sleep 1

# Model can be overridden via HF_MODEL_NAME.
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT

# Cleanup function
cleanup() {
    echo "Caught Ctrl+C, cleaning up..."
    # Cleanup commands
    # NOTE(review): this kills *every* python process visible to the user,
    # not just the ones started by this demo — do not run on a shared host.
    pgrep python | xargs kill -9
    pkill -f python
    echo "Cleanup complete. Exiting."
    exit 0
}

export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

# a function that waits vLLM server to start
wait_for_server() {
    local port=$1
    # Poll the completions endpoint for up to 20 minutes.
    timeout 1200 bash -c "
        until curl -s localhost:${port}/v1/completions > /dev/null; do
            sleep 1
        done" && return 0 || return 1
}

# Launch the demo server with ZMQ KV-event publishing enabled.
vllm serve $MODEL_NAME \
    --port 8100 \
    --max-model-len 100 \
    --enforce-eager \
    --gpu-memory-utilization 0.8 \
    --trust-remote-code \
    --kv-events-config \
    '{"enable_kv_cache_events": true, "publisher": "zmq", "topic": "kv-events"}' &

wait_for_server 8100

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Start the event subscriber alongside the server.
python3 "$SCRIPT_DIR/kv_events_subscriber.py" &
sleep 1

# serve two example requests
output1=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 5-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')

output2=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 50-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')

# Cleanup commands
pkill -9 -u "$USER" -f python
pkill -9 -u "$USER" -f vllm

sleep 1

echo "Cleaned up"

# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"

echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""
@@ -0,0 +1,244 @@
|
||||
#!/bin/bash

# =============================================================================
# vLLM Disaggregated Serving Script - P2P NCCL XpYd Architecture
# =============================================================================
# This script demonstrates disaggregated prefill and decode serving using
# P2P NCCL communication. The architecture supports various XpYd configurations:
#
# - 1P3D: 1 Prefill server + 3 Decode servers (current default)
# - 3P1D: 3 Prefill servers + 1 Decode server
# - etc.
#
# Configuration can be customized via environment variables:
#   MODEL: Model to serve
#   PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
#   DECODE_GPUS: Comma-separated GPU IDs for decode servers
#   PREFILL_PORTS: Comma-separated ports for prefill servers
#   DECODE_PORTS: Comma-separated ports for decode servers
#   PROXY_PORT: Proxy server port used to setup XpYd connection.
#   TIMEOUT_SECONDS: Server startup timeout
# =============================================================================

# Configuration - can be overridden via environment variables
MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
PROXY_PORT=${PROXY_PORT:-30001}

# Default 1P3D configuration (1 Prefill + 3 Decode)
PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}

echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo ""
echo "Architecture Configuration:"
echo "  Model: $MODEL"
echo "  Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS"
echo "  Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
echo "  Proxy Port: $PROXY_PORT"
echo "  Timeout: ${TIMEOUT_SECONDS}s"
echo ""

# PIDs of every background process we launch, for bookkeeping.
PIDS=()

# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"

# Abort early if the proxy implementation is missing.
check_required_files() {
    local files=("disagg_proxy_p2p_nccl_xpyd.py")
    for file in "${files[@]}"; do
        if [[ ! -f "$file" ]]; then
            echo "Required file $file not found in $(pwd)"
            exit 1
        fi
    done
}

# Require a plausible Hugging Face token before downloading models.
check_hf_token() {
    if [ -z "$HF_TOKEN" ]; then
        echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
        echo "Example: export HF_TOKEN=your_token_here"
        exit 1
    fi
    if [[ "$HF_TOKEN" != hf_* ]]; then
        echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
        exit 1
    fi
    echo "HF_TOKEN is set and valid."
}

check_num_gpus() {
    # Check if the number of GPUs are >=2 via nvidia-smi
    num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
    if [ "$num_gpus" -lt 2 ]; then
        echo "You need at least 2 GPUs to run disaggregated prefill."
        exit 1
    else
        echo "Found $num_gpus GPUs."
    fi
}

# Fail fast if a required python package cannot be imported.
ensure_python_library_installed() {
    echo "Checking if $1 is installed..."
    if ! python3 -c "import $1" > /dev/null 2>&1; then
        echo "$1 is not installed. Please install it via pip install $1."
        exit 1
    else
        echo "$1 is installed."
    fi
}

cleanup() {
    echo "Stopping everything…"
    trap - INT TERM        # prevent re-entrancy
    pkill -9 -f "disagg_proxy_p2p_nccl_xpyd.py"
    kill -- -$$            # negative PID == "this whole process-group"
    wait                   # reap children so we don't leave zombies
    exit 0
}

# Poll an OpenAI-compatible endpoint until it answers or TIMEOUT_SECONDS pass.
wait_for_server() {
    local port=$1
    local timeout_seconds=$TIMEOUT_SECONDS
    local start_time=$(date +%s)

    echo "Waiting for server on port $port..."

    while true; do
        if curl -s "localhost:${port}/v1/completions" > /dev/null; then
            echo "Server on port $port is ready."
            return 0
        fi

        local now=$(date +%s)
        if (( now - start_time >= timeout_seconds )); then
            echo "Timeout waiting for server on port $port"
            return 1
        fi

        sleep 1
    done
}

main() {
    check_required_files
    check_hf_token
    check_num_gpus
    ensure_python_library_installed pandas
    ensure_python_library_installed datasets
    ensure_python_library_installed vllm
    ensure_python_library_installed quart

    trap cleanup INT
    trap cleanup USR1
    trap cleanup TERM

    echo "Launching disaggregated serving components..."
    echo "Please check the log files for detailed output:"
    echo "  - prefill*.log: Prefill server logs"
    echo "  - decode*.log: Decode server logs"
    echo "  - proxy.log: Proxy server log"

    # =============================================================================
    # Launch Proxy Server
    # =============================================================================
    echo ""
    echo "Starting proxy server on port $PROXY_PORT..."
    python3 disagg_proxy_p2p_nccl_xpyd.py &
    PIDS+=($!)

    # Parse GPU and port arrays
    IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS"
    IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS"
    IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS"
    IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"

    # =============================================================================
    # Launch Prefill Servers (X Producers)
    # =============================================================================
    echo ""
    echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."
    for i in "${!PREFILL_GPU_ARRAY[@]}"; do
        local gpu_id=${PREFILL_GPU_ARRAY[$i]}
        local port=${PREFILL_PORT_ARRAY[$i]}
        # Each prefill server gets its own KV transfer port starting at 21001.
        local kv_port=$((21001 + i))

        echo "  Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
            --enforce-eager \
            --host 0.0.0.0 \
            --port $port \
            --tensor-parallel-size 1 \
            --seed 1024 \
            --dtype float16 \
            --max-model-len 10000 \
            --max-num-batched-tokens 10000 \
            --max-num-seqs 256 \
            --trust-remote-code \
            --gpu-memory-utilization 0.9 \
            --kv-transfer-config \
            "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 &
        PIDS+=($!)
    done

    # =============================================================================
    # Launch Decode Servers (Y Decoders)
    # =============================================================================
    echo ""
    echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."
    for i in "${!DECODE_GPU_ARRAY[@]}"; do
        local gpu_id=${DECODE_GPU_ARRAY[$i]}
        local port=${DECODE_PORT_ARRAY[$i]}
        # Decode-side KV transfer ports start at 22001 to avoid collisions.
        local kv_port=$((22001 + i))

        echo "  Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
            --enforce-eager \
            --host 0.0.0.0 \
            --port $port \
            --tensor-parallel-size 1 \
            --seed 1024 \
            --dtype float16 \
            --max-model-len 10000 \
            --max-num-batched-tokens 10000 \
            --max-num-seqs 256 \
            --trust-remote-code \
            --gpu-memory-utilization 0.7 \
            --kv-transfer-config \
            "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 &
        PIDS+=($!)
    done

    # =============================================================================
    # Wait for All Servers to Start
    # =============================================================================
    echo ""
    echo "Waiting for all servers to start..."
    for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
        if ! wait_for_server $port; then
            echo "Failed to start server on port $port"
            cleanup
            exit 1
        fi
    done

    echo ""
    echo "All servers are up. Starting benchmark..."

    # =============================================================================
    # Run Benchmark
    # =============================================================================
    # NOTE(review): port 10001 is the proxy's HTTP port (hard-coded in
    # disagg_proxy_p2p_nccl_xpyd.py), not $PROXY_PORT — confirm if changed.
    cd ../../../benchmarks/
    vllm bench serve --port 10001 --seed $(date +%s) \
        --model $MODEL \
        --dataset-name random --random-input-len 7500 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log

    echo "Benchmarking done. Cleaning up..."

    cleanup
}

main
@@ -0,0 +1,190 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import socket
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from typing import Any
|
||||
|
||||
import aiohttp
|
||||
import msgpack
|
||||
import zmq
|
||||
from quart import Quart, make_response, request
|
||||
|
||||
# Round-robin request counter shared by both instance pools.
count = 0
# Registered workers keyed by HTTP address; values are
# (zmq_address, expiry_stamp) where expiry_stamp = registration time + TTL.
prefill_instances: dict[str, Any] = {}  # http_address: (zmq_address, stamp)
decode_instances: dict[str, Any] = {}  # http_address: (zmq_address, stamp)

# Guard concurrent access to the corresponding instance dict (listener
# thread registers, request handlers read).
prefill_cv = threading.Condition()
decode_cv = threading.Condition()

# Seconds a registration stays valid without a fresh heartbeat.
DEFAULT_PING_SECONDS = 5
||||
def _remove_oldest_instances(instances: dict[str, Any]) -> None:
|
||||
oldest_key = next(iter(instances), None)
|
||||
while oldest_key is not None:
|
||||
value = instances[oldest_key]
|
||||
if value[1] > time.time():
|
||||
break
|
||||
print(f"🔴Remove [HTTP:{oldest_key}, ZMQ:{value[0]}, stamp:{value[1]}]")
|
||||
instances.pop(oldest_key, None)
|
||||
oldest_key = next(iter(instances), None)
|
||||
|
||||
|
||||
def _listen_for_register(poller, router_socket):
    # Daemon-thread loop: consume registration/heartbeat messages from vLLM
    # workers on the ZMQ ROUTER socket and refresh the instance tables.
    while True:
        socks = dict(poller.poll())
        if router_socket in socks:
            remote_address, message = router_socket.recv_multipart()
            # data: {"type": "P", "http_address": "ip:port",
            # "zmq_address": "ip:port"}
            data = msgpack.loads(message)
            if data["type"] == "P":
                global prefill_instances
                global prefill_cv
                with prefill_cv:
                    # `node` is the previous entry (None on first contact).
                    node = prefill_instances.get(data["http_address"], None)
                    prefill_instances[data["http_address"]] = (
                        data["zmq_address"],
                        time.time() + DEFAULT_PING_SECONDS,
                    )
                    # Piggyback expiry sweep on every heartbeat.
                    _remove_oldest_instances(prefill_instances)

            elif data["type"] == "D":
                global decode_instances
                global decode_cv
                with decode_cv:
                    node = decode_instances.get(data["http_address"], None)
                    decode_instances[data["http_address"]] = (
                        data["zmq_address"],
                        time.time() + DEFAULT_PING_SECONDS,
                    )
                    _remove_oldest_instances(decode_instances)
            else:
                # NOTE(review): print() does not interpolate %s-style args —
                # this prints a tuple; looks like a logger call pasted here.
                print(
                    "Unexpected, Received message from %s, data: %s",
                    remote_address,
                    data,
                )
                # NOTE(review): returning kills the listener thread on the
                # first malformed message — confirm this is intended.
                return

            # Only announce nodes seen for the first time (node was None).
            if node is None:
                print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['zmq_address']}]")
||||
def start_service_discovery(hostname, port):
    """Bind a ZMQ ROUTER socket and start the registration listener thread.

    Falls back to the local hostname when `hostname` is empty. Returns the
    started daemon thread so the caller may join it.

    Raises:
        ValueError: if `port` is 0.
    """
    if not hostname:
        hostname = socket.gethostname()
    if port == 0:
        raise ValueError("Port cannot be 0")

    context = zmq.Context()
    router_socket = context.socket(zmq.ROUTER)
    router_socket.bind(f"tcp://{hostname}:{port}")

    poller = zmq.Poller()
    poller.register(router_socket, zmq.POLLIN)

    listener = threading.Thread(
        target=_listen_for_register, args=[poller, router_socket], daemon=True
    )
    listener.start()
    return listener
||||
# Generous total timeout (6 h) so long generations are not cut off mid-stream.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

# Quart app serving the OpenAI-compatible proxy endpoints below.
app = Quart(__name__)
||||
def random_uuid() -> str:
    """Return a fresh UUID4 as a 32-character lowercase hex string."""
    return uuid.uuid4().hex
async def forward_request(url, data, request_id):
    """Async generator: POST `data` to `url` and stream the response bytes.

    The X-Request-Id header carries the prefill/decode ZMQ routing info used
    by the P2P NCCL connector. Bug fix: removed a dead `if True:` whose else
    branch (whole-body read) was unreachable; streaming behavior unchanged.

    NOTE(review): non-200 upstream responses yield nothing — the caller gets
    an empty stream with no error signal; confirm this is intended.
    """
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
            "X-Request-Id": request_id,
        }
        async with session.post(url=url, json=data, headers=headers) as response:
            if response.status == 200:
                async for chunk_bytes in response.content.iter_chunked(1024):
                    yield chunk_bytes
||||
@app.route("/v1/completions", methods=["POST"])
@app.route("/v1/chat/completions", methods=["POST"])
async def handle_request():
    # Proxy one request through the disaggregated pipeline:
    # 1) a prefill node runs with all token limits forced to 1, which fills
    #    the KV cache and triggers the P2P NCCL transfer;
    # 2) a decode node then streams the real generation to the client.
    try:
        original_request_data = await request.get_json()

        prefill_request = original_request_data.copy()
        # change max_tokens = 1 to let it only do prefill
        prefill_request["max_tokens"] = 1
        if "max_completion_tokens" in prefill_request:
            prefill_request["max_completion_tokens"] = 1

        # Round-robin over the registered prefill pool under its lock.
        global count
        global prefill_instances
        global prefill_cv
        with prefill_cv:
            prefill_list = list(prefill_instances.items())
            # NOTE(review): raises ZeroDivisionError when no prefill node is
            # registered; it lands in the except below and the client gets an
            # empty response — confirm whether an HTTP error should be sent.
            prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
            prefill_zmq_addr = prefill_zmq_addr[0]

        global decode_instances
        global decode_cv
        with decode_cv:
            decode_list = list(decode_instances.items())
            decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
            decode_zmq_addr = decode_zmq_addr[0]

        print(
            f"handle_request count: {count}, [HTTP:{prefill_addr}, "
            f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
            f"ZMQ:{decode_zmq_addr}]"
        )
        count += 1

        # The request id encodes both ZMQ addresses so the P2P NCCL connector
        # can route the KV transfer between the chosen pair of nodes.
        request_id = (
            f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
            f"{decode_zmq_addr}_{random_uuid()}"
        )

        # finish prefill
        async for _ in forward_request(
            f"http://{prefill_addr}{request.path}", prefill_request, request_id
        ):
            continue

        # return decode
        generator = forward_request(
            f"http://{decode_addr}{request.path}", original_request_data, request_id
        )
        response = await make_response(generator)
        # Disable Quart's response timeout: streaming may run for a long time.
        response.timeout = None

        return response

    except Exception as e:
        import sys
        import traceback

        exc_info = sys.exc_info()
        print("Error occurred in disagg prefill proxy server")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))
||||
if __name__ == "__main__":
    # Start ZMQ service discovery on 30001, then block in the HTTP proxy on
    # 10001 (the port the benchmark script targets).
    t = start_service_discovery("0.0.0.0", 30001)
    app.run(host="0.0.0.0", port=10001)
    # Reached only after the Quart server exits; the listener is a daemon
    # thread, so this join does not prevent process shutdown via signals.
    t.join()
||||
57
examples/online_serving/elastic_ep/bench.sh
Normal file
57
examples/online_serving/elastic_ep/bench.sh
Normal file
@@ -0,0 +1,57 @@
|
||||
#!/bin/bash
# Benchmark an elastic-EP vLLM deployment with `vllm bench serve`.
# All settings below can be overridden via command-line flags.

MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
HOST="localhost"
PORT=8006
NUM_PROMPTS=20
REQUEST_RATE=5

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            MODEL_NAME="$2"
            shift 2
            ;;
        --local-model)
            # Convenience flag: swap in the pre-downloaded snapshot path.
            MODEL_NAME=$LOCAL_MODEL_PATH
            shift
            ;;
        --host)
            HOST="$2"
            shift 2
            ;;
        --port)
            PORT="$2"
            shift 2
            ;;
        --num-prompts)
            NUM_PROMPTS="$2"
            shift 2
            ;;
        --request-rate)
            REQUEST_RATE="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --model MODEL_NAME     Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)"
            echo "  --local-model          Use local model path (convenience option)"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use -h or --help for usage information"
            exit 1
            ;;
    esac
done

vllm bench serve \
    --model $MODEL_NAME \
    --host $HOST \
    --port $PORT \
    --num-prompts $NUM_PROMPTS \
    --request-rate $REQUEST_RATE
||||
53
examples/online_serving/elastic_ep/scale.py
Normal file
53
examples/online_serving/elastic_ep/scale.py
Normal file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def scale(host, port, new_dp_size):
    """POST a scale request to the server's /scale_elastic_ep endpoint.

    Args:
        host: API server hostname.
        port: API server port.
        new_dp_size: desired data-parallel size.

    Returns:
        True on HTTP 200, False on any other status or request failure.
    """
    url = f"http://{host}:{port}/scale_elastic_ep"
    payload = {"new_data_parallel_size": new_dp_size}
    headers = {"Content-Type": "application/json"}

    print(f"Sending scale request to {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")

    try:
        # Long timeout: scaling may involve worker restarts.
        response = requests.post(url, json=payload, headers=headers, timeout=300)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return False

    print(f"Status Code: {response.status_code}")
    print(f"Response: {response.text}")

    if response.status_code != 200:
        print("Scale up/down request failed!")
        return False
    print("Scale up/down request successful!")
    return True
||||
def main():
    """CLI entry point: parse options and issue a single scale request.

    Exits with status 0 when the request succeeds, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Test scale up/down functionality")
    parser.add_argument("--host", default="localhost", help="API server host")
    parser.add_argument("--port", type=int, default=8006, help="API server port")
    parser.add_argument(
        "--new-dp-size", type=int, default=2, help="New data parallel size"
    )
    opts = parser.parse_args()

    ok = scale(opts.host, opts.port, opts.new_dp_size)
    sys.exit(0 if ok else 1)


if __name__ == "__main__":
    main()
|
||||
71
examples/online_serving/elastic_ep/serve_deepseek_v2.sh
Normal file
71
examples/online_serving/elastic_ep/serve_deepseek_v2.sh
Normal file
@@ -0,0 +1,71 @@
|
||||
#!/bin/bash
# Serve DeepSeek-V2-Lite with vLLM using Ray-backed data parallelism,
# expert parallelism, and EPLB (expert-parallel load balancing).

HOST="0.0.0.0"
PORT=8006
DATA_PARALLEL_SIZE=4
REDUNDANT_EXPERTS=0
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"

# Parse command-line options.
while [[ $# -gt 0 ]]; do
    case $1 in
        --dp)
            DATA_PARALLEL_SIZE="$2"
            shift 2
            ;;
        --re)
            REDUNDANT_EXPERTS="$2"
            shift 2
            ;;
        --host)
            HOST="$2"
            shift 2
            ;;
        --port)
            PORT="$2"
            shift 2
            ;;
        --model)
            MODEL_NAME="$2"
            shift 2
            ;;
        --local-model)
            # Convenience switch: swap in the pre-downloaded snapshot path.
            MODEL_NAME=$LOCAL_MODEL_PATH
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --dp SIZE              Set data parallel size (default: 4)"
            echo "  --re SIZE              Set redundant experts (default: 0)"
            echo "  --host HOST            Set host address (default: 0.0.0.0)"
            echo "  --port PORT            Set port number (default: 8006)"
            echo "  --model MODEL_NAME     Set model name or path"
            # BUG FIX: --local-model was handled above but missing from the help text.
            echo "  --local-model          Use the predefined local model path"
            echo "  -h, --help             Show this help message"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use -h or --help for usage information"
            exit 1
            ;;
    esac
done

echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"

export RAY_DEDUP_LOGS=0
export VLLM_ALL2ALL_BACKEND="pplx"
export VLLM_USE_DEEP_GEMM=1

vllm serve $MODEL_NAME \
    --data-parallel-size $DATA_PARALLEL_SIZE \
    --data-parallel-size-local $DATA_PARALLEL_SIZE \
    --data-parallel-backend ray \
    --enforce-eager \
    --enable-expert-parallel \
    --enable-eplb \
    --num-redundant-experts $REDUNDANT_EXPERTS \
    --trust-remote-code \
    --host $HOST \
    --port $PORT
|
||||
112
examples/online_serving/gradio_openai_chatbot_webserver.py
Normal file
112
examples/online_serving/gradio_openai_chatbot_webserver.py
Normal file
@@ -0,0 +1,112 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Example for starting a Gradio OpenAI Chatbot Webserver
|
||||
Start vLLM API server:
|
||||
vllm serve meta-llama/Llama-2-7b-chat-hf
|
||||
|
||||
Start Gradio OpenAI Chatbot Webserver:
|
||||
python examples/online_serving/gradio_openai_chatbot_webserver.py \
|
||||
-m meta-llama/Llama-2-7b-chat-hf
|
||||
|
||||
Note that `pip install --upgrade gradio` is needed to run this example.
|
||||
More details: https://github.com/gradio-app/gradio
|
||||
|
||||
If your antivirus software blocks the download of frpc for gradio,
|
||||
you can install it manually by following these steps:
|
||||
|
||||
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
|
||||
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
|
||||
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
import gradio as gr
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
def predict(message, history, client, model_name, temp, stop_token_ids):
    """Send the chat history plus the new user message to vLLM.

    Streams the completion from the server, accumulates the deltas, and
    returns the full reply as a single string.
    """
    conversation = [{"role": "system", "content": "You are a great AI assistant."}]
    conversation.extend(history)
    conversation.append({"role": "user", "content": message})

    # Parse the comma-separated stop token IDs; empty string means none.
    if stop_token_ids:
        stop_ids = [int(tok.strip()) for tok in stop_token_ids.split(",")]
    else:
        stop_ids = []

    # Send request to OpenAI API (vLLM server)
    stream = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        temperature=temp,
        stream=True,
        extra_body={
            "repetition_penalty": 1,
            "stop_token_ids": stop_ids,
        },
    )

    # Accumulate streamed chunks into one complete reply.
    reply = ""
    for chunk in stream:
        reply += chunk.choices[0].delta.content or ""
    return reply
|
||||
|
||||
|
||||
def parse_args():
    """Build and parse the CLI options for the Gradio chatbot webserver."""
    p = argparse.ArgumentParser(
        description="Chatbot Interface with Customizable Parameters"
    )
    p.add_argument(
        "--model-url", type=str, default="http://localhost:8000/v1", help="Model URL"
    )
    p.add_argument(
        "-m", "--model", type=str, required=True, help="Model name for the chatbot"
    )
    p.add_argument(
        "--temp", type=float, default=0.8, help="Temperature for text generation"
    )
    p.add_argument(
        "--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs"
    )
    p.add_argument("--host", type=str, default=None)
    p.add_argument("--port", type=int, default=8001)
    return p.parse_args()
|
||||
|
||||
|
||||
def build_gradio_interface(client, model_name, temp, stop_token_ids):
    """Create a gr.ChatInterface whose callback proxies to `predict`."""

    def chat_predict(message, history):
        # Close over the client/model/sampling settings, since Gradio only
        # supplies (message, history) to the callback.
        return predict(message, history, client, model_name, temp, stop_token_ids)

    return gr.ChatInterface(
        fn=chat_predict,
        title="Chatbot Interface",
        description="A simple chatbot powered by vLLM",
    )
|
||||
|
||||
|
||||
def main():
    """Wire up the OpenAI client and launch the Gradio chatbot UI."""
    args = parse_args()

    # vLLM's OpenAI-compatible server ignores the key, but the client
    # library requires a non-empty value.
    client = OpenAI(api_key="EMPTY", base_url=args.model_url)

    interface = build_gradio_interface(
        client, args.model, args.temp, args.stop_token_ids
    )

    interface.queue().launch(
        server_name=args.host, server_port=args.port, share=True
    )


if __name__ == "__main__":
    main()
|
||||
75
examples/online_serving/gradio_webserver.py
Normal file
75
examples/online_serving/gradio_webserver.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Example for starting a Gradio Webserver
|
||||
Start vLLM API server:
|
||||
python -m vllm.entrypoints.api_server \
|
||||
--model meta-llama/Llama-2-7b-chat-hf
|
||||
|
||||
Start Webserver:
|
||||
python examples/online_serving/gradio_webserver.py
|
||||
|
||||
Note that `pip install --upgrade gradio` is needed to run this example.
|
||||
More details: https://github.com/gradio-app/gradio
|
||||
|
||||
If your antivirus software blocks the download of frpc for gradio,
|
||||
you can install it manually by following these steps:
|
||||
|
||||
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
|
||||
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
|
||||
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
|
||||
import gradio as gr
|
||||
import requests
|
||||
|
||||
|
||||
def http_bot(prompt):
    """Stream generated text for `prompt` from the vLLM demo API server.

    Yields the cumulative generated text after each received chunk.
    NOTE(review): relies on the module-level `args` assigned under __main__.
    """
    payload = {
        "prompt": prompt,
        "stream": True,
        "max_tokens": 128,
    }
    response = requests.post(
        args.model_url,
        headers={"User-Agent": "vLLM Client"},
        json=payload,
        stream=True,
    )

    for chunk in response.iter_lines(
        chunk_size=8192, decode_unicode=False, delimiter=b"\n"
    ):
        if not chunk:
            continue
        data = json.loads(chunk.decode("utf-8"))
        yield data["text"][0]
|
||||
|
||||
|
||||
def build_demo():
    """Build the Gradio UI: one input textbox streaming into one output box."""
    with gr.Blocks() as demo:
        gr.Markdown("# vLLM text completion demo\n")
        inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
        outputbox = gr.Textbox(
            label="Output", placeholder="Generated result from the model"
        )
        # http_bot is a generator, so the output box updates as tokens stream in.
        inputbox.submit(http_bot, [inputbox], [outputbox])
    return demo
|
||||
|
||||
|
||||
def parse_args():
    """Parse CLI options: Gradio bind host/port and the vLLM generate URL."""
    p = argparse.ArgumentParser()
    p.add_argument("--host", type=str, default=None)
    p.add_argument("--port", type=int, default=8001)
    p.add_argument(
        "--model-url", type=str, default="http://localhost:8000/generate"
    )
    return p.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Launch the Gradio demo on the configured host/port."""
    demo = build_demo()
    demo.queue().launch(server_name=args.host, server_port=args.port, share=True)


if __name__ == "__main__":
    # NOTE: `args` must stay module-global — http_bot reads args.model_url.
    args = parse_args()
    main(args)
|
||||
117
examples/online_serving/kv_events_subscriber.py
Normal file
117
examples/online_serving/kv_events_subscriber.py
Normal file
@@ -0,0 +1,117 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
import msgspec
|
||||
import zmq
|
||||
from msgspec.msgpack import Decoder
|
||||
|
||||
from vllm.v1.core.kv_cache_utils import ExternalBlockHash
|
||||
|
||||
|
||||
#
# Types copied from vllm.distributed.kv_events
#
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
    # Publication timestamp for the whole batch.
    ts: float
    # Events in publication order; narrowed to concrete variants below.
    events: list[Any]


class KVCacheEvent(
    msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True
):
    """Base class for all KV cache-related events"""


class BlockStored(KVCacheEvent):
    # Hashes of the stored blocks.
    block_hashes: list[ExternalBlockHash]
    # Hash of the parent block, or None when there is no parent.
    parent_block_hash: ExternalBlockHash | None
    # Token IDs covered by the stored blocks.
    token_ids: list[int]
    # Number of tokens per block.
    block_size: int
    # LoRA adapter ID, if any.
    lora_id: int | None
    # Storage medium identifier, if any.
    medium: str | None


class BlockRemoved(KVCacheEvent):
    # Hashes of the removed blocks.
    block_hashes: list[ExternalBlockHash]
    # Storage medium identifier, if any.
    medium: str | None


class AllBlocksCleared(KVCacheEvent):
    pass


class KVEventBatch(EventBatch):
    # Same batch container, with the event list narrowed to concrete types.
    events: list[BlockStored | BlockRemoved | AllBlocksCleared]
|
||||
|
||||
|
||||
def process_event(event_batch):
    """Print a one-line batch summary followed by each contained event."""
    print(f"Received event batch at {event_batch.ts}:")
    for ev in event_batch.events:
        print(f"  - {ev}")
|
||||
|
||||
|
||||
def main():
    """Subscribe to vLLM KV-cache events and replay any missed messages.

    Listens on the pub/sub socket (port 5557); when a sequence-number gap
    is detected, requests the missing range over the REQ/REP replay socket
    (port 5558) before processing the live message.
    """
    decoder = Decoder(type=KVEventBatch)
    last_seq = -1  # highest sequence number processed so far; -1 = none yet

    context = zmq.Context()

    # Set up the main subscription socket
    sub = context.socket(zmq.SUB)
    sub.connect("tcp://localhost:5557")
    topic = "kv-events"
    sub.setsockopt_string(zmq.SUBSCRIBE, topic)

    # Initialize replay socket
    replay = context.socket(zmq.REQ)
    replay.connect("tcp://localhost:5558")
    poller = zmq.Poller()
    poller.register(replay, zmq.POLLIN)

    print("Listening for KV cache events on topic:", topic)

    while True:
        try:
            if sub.poll(50):
                _, seq_bytes, payload = sub.recv_multipart()
                seq = int.from_bytes(seq_bytes, "big")

                if last_seq >= 0 and seq > last_seq + 1:
                    missed = seq - last_seq - 1
                    print(
                        f"Missed {missed} messages (last: {last_seq}, current: {seq})"
                    )

                    replay.send((last_seq + 1).to_bytes(8, "big"))

                    while poller.poll(timeout=200):
                        seq_bytes, replay_payload = replay.recv_multipart()
                        if not replay_payload:
                            # End of replay marker is sent as an empty frame
                            # for the payload
                            break

                        replay_seq = int.from_bytes(seq_bytes, "big")

                        if replay_seq > last_seq:
                            event_batch = decoder.decode(replay_payload)
                            process_event(event_batch)
                            last_seq = replay_seq
                            if replay_seq >= seq - 1:
                                break

                event_batch = decoder.decode(payload)
                process_event(event_batch)
                # BUG FIX: record the live message's sequence number; without
                # this, last_seq stays -1 forever and the gap-detection /
                # replay branch above can never fire.
                last_seq = seq

            # ... do other periodic work or check for shutdown ...

        except KeyboardInterrupt:
            print("Interrupted")
            break
        except Exception as e:
            print("Error decoding message:", e)


if __name__ == "__main__":
    main()
|
||||
119
examples/online_serving/multi-node-serving.sh
Normal file
119
examples/online_serving/multi-node-serving.sh
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/bin/bash
#
# Helper script to manually start or join a Ray cluster for online serving of vLLM models.
# This script is first executed on the head node, and then on each worker node with the IP address
# of the head node.
#
# Subcommands:
#   leader: Launches a Ray head node and blocks until the cluster reaches the expected size (head + workers).
#   worker: Starts a worker node that connects to an existing Ray head node.
#
# Example usage:
#   On the head node machine, start the Ray head node process and run a vLLM server.
#   ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \
#       vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2
#
#   On each worker node, start the Ray worker node process.
#   ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
#
# About Ray:
#   Ray is an open-source distributed execution framework that simplifies
#   distributed computing. Learn more:
#   https://ray.io/

subcommand=$1 # Either "leader" or "worker".
shift # Remove the subcommand from the argument list.

ray_port=6379 # Port used by the Ray head node.
ray_init_timeout=300 # Seconds to wait before timing out.
declare -a start_params # Parameters forwarded to the underlying 'ray start' command.

# Handle the worker subcommand.
case "$subcommand" in
  worker)
    ray_address=""
    # Options use --name=value form; anything unrecognized is forwarded
    # verbatim to 'ray start'.
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_address=*)
          ray_address="${1#*=}"
          ;;
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          start_params+=("$1")
      esac
      shift
    done

    if [ -z "$ray_address" ]; then
      echo "Error: Missing argument --ray_address"
      exit 1
    fi

    # Retry until the worker node connects to the head node or the timeout expires.
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
      if [ $? -eq 0 ]; then
        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
        exit 0
      fi
      echo "Waiting until the ray worker is active..."
      sleep 5s;
    done
    echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
    exit 1
    ;;

  # Handle the leader subcommand.
  leader)
    ray_cluster_size=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_cluster_size=*)
          ray_cluster_size="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          start_params+=("$1")
      esac
      shift
    done

    if [ -z "$ray_cluster_size" ]; then
      echo "Error: Missing argument --ray_cluster_size"
      exit 1
    fi

    # Start the Ray head node.
    ray start --head --port=$ray_port "${start_params[@]}"

    # Poll Ray until every worker node is active.
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      # Count alive nodes as seen by the head's Ray runtime.
      active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
      if [ $active_nodes -eq $ray_cluster_size ]; then
        echo "All ray workers are active and the ray cluster is initialized successfully."
        exit 0
      fi
      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
      sleep 5s;
    done

    echo "Waiting for all ray workers to be active timed out."
    exit 1
    ;;

  *)
    echo "unknown subcommand: $subcommand"
    exit 1
    ;;
esac
|
||||
87
examples/online_serving/multi_instance_data_parallel.py
Normal file
87
examples/online_serving/multi_instance_data_parallel.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import threading
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.metrics.loggers import AggregatedLoggingStatLogger
|
||||
|
||||
"""
|
||||
To run this example, run the following commands simultaneously with
|
||||
different CUDA_VISIBLE_DEVICES:
|
||||
python examples/online_serving/multi_instance_data_parallel.py
|
||||
|
||||
vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
|
||||
--data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
|
||||
--data-parallel-size-local 1 --enforce-eager --headless
|
||||
|
||||
Once both instances have completed the handshake, this example will
|
||||
send a request to the instance with DP rank 1.
|
||||
"""
|
||||
|
||||
|
||||
def _do_background_logging(engine, interval, stop_event):
|
||||
try:
|
||||
while not stop_event.is_set():
|
||||
asyncio.run(engine.do_log_stats())
|
||||
stop_event.wait(interval)
|
||||
except Exception as e:
|
||||
print(f"vLLM background logging shutdown: {e}")
|
||||
pass
|
||||
|
||||
|
||||
async def main():
    """Connect to a headless DP deployment and send requests to DP rank 1."""
    # Engine args must match the companion `vllm serve ... --headless`
    # command (same DP size, address, and RPC port) for the handshake
    # described in the module docstring to succeed.
    engine_args = AsyncEngineArgs(
        model="ibm-research/PowerMoE-3b",
        data_parallel_size=2,
        tensor_parallel_size=1,
        dtype="auto",
        max_model_len=2048,
        data_parallel_address="127.0.0.1",
        data_parallel_rpc_port=62300,
        data_parallel_size_local=1,
        enforce_eager=True,
        enable_log_requests=True,
        disable_custom_all_reduce=True,
    )

    engine_client = AsyncLLMEngine.from_engine_args(
        engine_args,
        # Example: Using aggregated logger
        stat_loggers=[AggregatedLoggingStatLogger],
    )
    # Flush stats periodically from a daemon thread while requests run.
    stop_logging_event = threading.Event()
    logging_thread = threading.Thread(
        target=_do_background_logging,
        args=(engine_client, 5, stop_logging_event),
        daemon=True,
    )
    logging_thread.start()
    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.9,
        max_tokens=100,
    )
    num_prompts = 10
    for i in range(num_prompts):
        prompt = "Who won the 2004 World Series?"
        # generate() streams partial outputs; keep only the final one.
        final_output: RequestOutput | None = None
        async for output in engine_client.generate(
            prompt=prompt,
            sampling_params=sampling_params,
            request_id=f"abcdef-{i}",
            data_parallel_rank=1,
        ):
            final_output = output
        if final_output:
            print(final_output.outputs[0].text)

    # Stop the logging thread before exiting.
    stop_logging_event.set()
    logging_thread.join()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
64
examples/online_serving/openai_chat_completion_client.py
Normal file
64
examples/online_serving/openai_chat_completion_client.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Example Python client for OpenAI Chat Completion using vLLM API server
|
||||
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
|
||||
vllm serve meta-llama/Llama-2-7b-chat-hf
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Who won the world series in 2020?"},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "The Los Angeles Dodgers won the World Series in 2020.",
|
||||
},
|
||||
{"role": "user", "content": "Where was it played?"},
|
||||
]
|
||||
|
||||
|
||||
def parse_args():
    """Parse the single CLI flag selecting streaming vs. blocking output."""
    p = argparse.ArgumentParser(description="Client for vLLM API server")
    p.add_argument("--stream", action="store_true", help="Enable streaming response")
    return p.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Run one chat completion against the local vLLM server and print it."""
    client = OpenAI(
        # The key is unused by vLLM but required by the client library.
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Use whichever model the server is actually serving.
    model = client.models.list().data[0].id

    # Chat Completion API
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
        stream=args.stream,
    )

    print("-" * 50)
    print("Chat completion results:")
    if args.stream:
        # Streaming mode yields incremental chunks.
        for chunk in chat_completion:
            print(chunk)
    else:
        print(chat_completion)
    print("-" * 50)


if __name__ == "__main__":
    args = parse_args()
    main(args)
|
||||
@@ -0,0 +1,353 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""An example showing how to use vLLM to serve multimodal models
|
||||
and run online serving with OpenAI client.
|
||||
|
||||
Launch the vLLM server with the following command:
|
||||
|
||||
(single image inference with Llava)
|
||||
vllm serve llava-hf/llava-1.5-7b-hf
|
||||
|
||||
(multi-image inference with Phi-3.5-vision-instruct)
|
||||
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
|
||||
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
|
||||
|
||||
(audio inference with Ultravox)
|
||||
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
|
||||
--max-model-len 4096 --trust-remote-code
|
||||
|
||||
run the script with
|
||||
python openai_chat_completion_client_for_multimodal.py --chat-type audio
|
||||
"""
|
||||
|
||||
import base64
|
||||
|
||||
import requests
|
||||
from openai import OpenAI
|
||||
from utils import get_first_model
|
||||
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
client = OpenAI(
|
||||
# defaults to os.environ.get("OPENAI_API_KEY")
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
headers = {"User-Agent": "vLLM Example Client"}
|
||||
|
||||
|
||||
def encode_base64_content_from_url(content_url: str) -> str:
    """Fetch `content_url` and return its body base64-encoded as text.

    Raises requests.HTTPError on a non-2xx response.
    """
    with requests.get(content_url, headers=headers) as response:
        response.raise_for_status()
        return base64.b64encode(response.content).decode("utf-8")
|
||||
|
||||
|
||||
# Text-only inference
|
||||
# Text-only inference
def run_text_only(model: str, max_completion_tokens: int) -> None:
    """Plain text chat completion with no multimodal inputs."""
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": "What's the capital of France?"}],
        model=model,
        max_completion_tokens=max_completion_tokens,
    )
    print("Chat completion output:\n", completion.choices[0].message.content)
|
||||
|
||||
|
||||
# Single-image input inference
|
||||
# Single-image input inference
def run_single_image(model: str, max_completion_tokens: int) -> None:
    """Single-image chat completion, first via URL then via base64 payload."""
    image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

    def ask(url: str) -> str:
        # One text part plus one image_url part; only the URL form varies.
        completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this image?"},
                        {"type": "image_url", "image_url": {"url": url}},
                    ],
                }
            ],
            model=model,
            max_completion_tokens=max_completion_tokens,
        )
        return completion.choices[0].message.content

    ## Use image url in the payload
    print("Chat completion output from image url:\n", ask(image_url))

    ## Use base64 encoded image in the payload
    image_base64 = encode_base64_content_from_url(image_url)
    print(
        "Chat completion output from base64 encoded image:",
        ask(f"data:image/jpeg;base64,{image_base64}"),
    )
|
||||
|
||||
|
||||
# Multi-image input inference
|
||||
# Multi-image input inference
def run_multi_image(model: str, max_completion_tokens: int) -> None:
    """Two-image chat completion (server must allow 2 images per prompt)."""
    image_urls = [
        "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
        "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg",
    ]

    content = [{"type": "text", "text": "What are the animals in these images?"}]
    content.extend(
        {"type": "image_url", "image_url": {"url": url}} for url in image_urls
    )

    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": content}],
        model=model,
        max_completion_tokens=max_completion_tokens,
    )
    print("Chat completion output:\n", completion.choices[0].message.content)
|
||||
|
||||
|
||||
# Video input inference
|
||||
# Video input inference
def run_video(model: str, max_completion_tokens: int) -> None:
    """Video chat completion, first via remote URL then via base64 data URL."""
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
    video_base64 = encode_base64_content_from_url(video_url)

    def ask(url: str) -> str:
        # One text part plus one video_url part; only the URL form varies.
        completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this video?"},
                        {"type": "video_url", "video_url": {"url": url}},
                    ],
                }
            ],
            model=model,
            max_completion_tokens=max_completion_tokens,
        )
        return completion.choices[0].message.content

    ## Use video url in the payload
    print("Chat completion output from video url:\n", ask(video_url))

    ## Use base64 encoded video in the payload
    print(
        "Chat completion output from base64 encoded video:\n",
        ask(f"data:video/mp4;base64,{video_base64}"),
    )
|
||||
|
||||
|
||||
# Audio input inference
|
||||
# Audio input inference
def run_audio(model: str, max_completion_tokens: int) -> None:
    """Audio chat completion via input_audio, audio_url, and base64 data URL."""
    from vllm.assets.audio import AudioAsset

    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)

    def ask(audio_part: dict) -> str:
        # One text part plus one audio part; only the audio schema varies.
        completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this audio?"},
                        audio_part,
                    ],
                }
            ],
            model=model,
            max_completion_tokens=max_completion_tokens,
        )
        return completion.choices[0].message.content

    # OpenAI-compatible schema (`input_audio`)
    result = ask(
        {
            "type": "input_audio",
            "input_audio": {
                # Any format supported by librosa is supported
                "data": audio_base64,
                "format": "wav",
            },
        }
    )
    print("Chat completion output from input audio:\n", result)

    # HTTP URL
    result = ask(
        {
            "type": "audio_url",
            "audio_url": {
                # Any format supported by librosa is supported
                "url": audio_url
            },
        }
    )
    print("Chat completion output from audio url:\n", result)

    # base64 URL
    result = ask(
        {
            "type": "audio_url",
            "audio_url": {
                # Any format supported by librosa is supported
                "url": f"data:audio/ogg;base64,{audio_base64}"
            },
        }
    )
    print("Chat completion output from base64 encoded audio:\n", result)
|
||||
|
||||
|
||||
def run_multi_audio(model: str, max_completion_tokens: int) -> None:
    """Batched inference over two different audio clips in one request."""
    from vllm.assets.audio import AudioAsset

    # Two different audios to showcase batched inference.
    clips = [
        encode_base64_content_from_url(AudioAsset("winning_call").url),
        encode_base64_content_from_url(AudioAsset("azacinto_foscolo").url),
    ]

    content = [{"type": "text", "text": "Are these two audios the same?"}]
    content.extend(
        {
            "type": "input_audio",
            "input_audio": {"data": clip, "format": "wav"},
        }
        for clip in clips
    )

    # OpenAI-compatible schema (`input_audio`)
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{"role": "user", "content": content}],
        model=model,
        max_completion_tokens=max_completion_tokens,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from input audio:\n", result)
|
||||
|
||||
|
||||
example_function_map = {
|
||||
"text-only": run_text_only,
|
||||
"single-image": run_single_image,
|
||||
"multi-image": run_multi_image,
|
||||
"multi-audio": run_multi_audio,
|
||||
"video": run_video,
|
||||
"audio": run_audio,
|
||||
}
|
||||
|
||||
|
||||
def parse_args():
    """Build the CLI parser for the multimodal demo and parse sys.argv."""
    parser = FlexibleArgumentParser(
        description="Demo on using OpenAI client for online serving with "
        "multimodal language models served with vLLM."
    )
    # Which conversation type to run; choices come from the dispatch table.
    parser.add_argument(
        "-c",
        "--chat-type",
        type=str,
        default="single-image",
        choices=list(example_function_map.keys()),
        help="Conversation type with multimodal data.",
    )
    # Generation budget per completion.
    parser.add_argument(
        "-n",
        "--max-completion-tokens",
        type=int,
        default=128,
        help="Maximum number of tokens to generate for each completion.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def main(args) -> None:
    """Look up the demo selected by --chat-type and run it."""
    handler = example_function_map[args.chat_type]
    handler(get_first_model(client), args.max_completion_tokens)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse CLI flags, then run the selected demo.
    args = parse_args()
    main(args)
|
||||
@@ -0,0 +1,195 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Set up this example by starting a vLLM OpenAI-compatible server with tool call
|
||||
options enabled. For example:
|
||||
|
||||
IMPORTANT: for mistral, you must use one of the provided mistral tool call
|
||||
templates, or your own - the model default doesn't work for tool calls with vLLM
|
||||
See the vLLM docs on OpenAI server & tool calling for more details.
|
||||
|
||||
vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
|
||||
--chat-template examples/tool_chat_template_mistral.jinja \
|
||||
--enable-auto-tool-choice --tool-call-parser mistral
|
||||
|
||||
OR
|
||||
vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B \
|
||||
--chat-template examples/tool_chat_template_hermes.jinja \
|
||||
--enable-auto-tool-choice --tool-call-parser hermes
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
properties = {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to find the weather for, e.g. 'San Francisco'",
|
||||
},
|
||||
"state": {
|
||||
"type": "string",
|
||||
"description": "the two-letter abbreviation for the state that the city is"
|
||||
" in, e.g. 'CA' which would mean 'California'",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
}
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": properties,
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "Hi! How are you doing today?"},
|
||||
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
|
||||
),
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def get_current_weather(city: str, state: str, unit: str):
    """Stub tool implementation: return a canned weather report.

    All arguments are ignored — the demo conversation always asks about
    Dallas, so a fixed string is enough to exercise the tool-calling flow.
    """
    return (
        "The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
        "partly cloudly, with highs in the 90's."
    )
|
||||
|
||||
|
||||
def handle_tool_calls_stream(
    client: OpenAI,
    messages: list[dict[str, str]],
    model: str,
    tools: list[dict[str, Any]],
) -> list[Any]:
    """Stream a tool-calling chat completion, echoing every delta.

    Returns the complete list of received stream chunks so the caller can
    reassemble tool-call arguments afterwards.
    """
    stream = client.chat.completions.create(
        messages=messages, model=model, tools=tools, stream=True
    )
    collected: list[Any] = []
    print("chunks: ")
    for event in stream:
        collected.append(event)
        delta = event.choices[0].delta
        # Tool-call deltas are printed one fragment at a time; any other
        # delta (role/content updates) is printed raw.
        print(delta.tool_calls[0] if delta.tool_calls else delta)
    return collected
|
||||
|
||||
|
||||
def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
    """Reassemble per-tool-call argument strings from recorded stream chunks.

    Walks the stream deltas in order, concatenating the argument fragments of
    each tool call (keyed by its stream ``index``) while logging ids, names,
    and the arguments of each completed call.
    """
    arguments: list[str] = []
    current_idx = -1
    print("arguments: ")
    for chunk in chunks:
        delta = chunk.choices[0].delta
        if not delta.tool_calls:
            continue
        fragment = delta.tool_calls[0]
        if fragment.index != current_idx:
            # A new tool call started: flush the previous call's arguments
            # and open a fresh accumulator slot for this index.
            if current_idx >= 0:
                print(f"streamed tool call arguments: {arguments[current_idx]}")
            current_idx = fragment.index
            arguments.append("")
        if fragment.id:
            print(f"streamed tool call id: {fragment.id} ")

        if fragment.function:
            if fragment.function.name:
                print(f"streamed tool call name: {fragment.function.name}")

            if fragment.function.arguments:
                arguments[current_idx] += fragment.function.arguments

    return arguments
|
||||
|
||||
|
||||
def main():
    """End-to-end tool-calling demo.

    Requests a completion (expected to contain tool calls), streams the same
    request to show incremental tool-call deltas, executes the requested tool
    locally, appends the tool result to the conversation, and asks the model
    for a final answer.
    """
    # Initialize OpenAI client
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Get available models and select one
    models = client.models.list()
    model = models.data[0].id

    chat_completion = client.chat.completions.create(
        messages=messages, model=model, tools=tools
    )

    print("-" * 70)
    print("Chat completion results:")
    print(chat_completion)
    print("-" * 70)

    # Stream tool calls
    chunks = handle_tool_calls_stream(client, messages, model, tools)
    print("-" * 70)

    # Handle arguments from streamed tool calls
    arguments = handle_tool_calls_arguments(chunks)

    if len(arguments):
        print(f"streamed tool call arguments: {arguments[-1]}\n")

    print("-" * 70)

    # Add tool call results to the conversation
    messages.append(
        {
            "role": "assistant",
            "tool_calls": chat_completion.choices[0].message.tool_calls,
            "reasoning": chat_completion.choices[0].message.reasoning,
        }
    )

    # Now, simulate a tool call
    available_tools = {"get_current_weather": get_current_weather}

    # Execute each tool call the model requested and feed the results back
    # as "tool" role messages keyed by tool_call_id.
    completion_tool_calls = chat_completion.choices[0].message.tool_calls
    for call in completion_tool_calls:
        tool_to_call = available_tools[call.function.name]
        args = json.loads(call.function.arguments)
        result = tool_to_call(**args)
        print("tool_to_call result: ", result)
        messages.append(
            {
                "role": "tool",
                "content": result,
                "tool_call_id": call.id,
                "name": call.function.name,
            }
        )

    # Final round: the model sees the tool output and answers in prose.
    chat_completion_2 = client.chat.completions.create(
        messages=messages, model=model, tools=tools, stream=False
    )
    print("Chat completion2 results:")
    print(chat_completion_2)
    print("-" * 70)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,130 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
To run this example, you can start the vLLM server
|
||||
without any specific flags:
|
||||
|
||||
```bash
|
||||
vllm serve unsloth/Llama-3.2-1B-Instruct \
|
||||
--structured-outputs-config.backend outlines
|
||||
```
|
||||
|
||||
This example demonstrates how to generate chat completions
|
||||
using the OpenAI Python client library.
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to find the weather for"
|
||||
", e.g. 'San Francisco'",
|
||||
},
|
||||
"state": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"the two-letter abbreviation for the state that the "
|
||||
"city is in, e.g. 'CA' which would mean 'California'"
|
||||
),
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_forecast",
|
||||
"description": "Get the weather forecast for a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"The city to get the forecast for, e.g. 'New York'"
|
||||
),
|
||||
},
|
||||
"state": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"The two-letter abbreviation for the state, e.g. 'NY'"
|
||||
),
|
||||
},
|
||||
"days": {
|
||||
"type": "integer",
|
||||
"description": "Number of days to get the forecast for (1-7)",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["city", "state", "days", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "Hi! How are you doing today?"},
|
||||
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Can you tell me what the current weather is in Dallas \
|
||||
and the forecast for the next 5 days, in fahrenheit?",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def main():
    """Demonstrate ``tool_choice="required"`` in streaming and blocking modes."""
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    served_model = client.models.list().data[0].id

    # Streaming request: tool-call deltas arrive incrementally.
    stream = client.chat.completions.create(
        messages=messages,
        model=served_model,
        tools=tools,
        tool_choice="required",
        stream=True,  # Enable streaming response
    )
    for event in stream:
        if event.choices and event.choices[0].delta.tool_calls:
            print(event.choices[0].delta.tool_calls)

    # Blocking request: the full tool-call list is returned at once.
    blocking = client.chat.completions.create(
        messages=messages, model=served_model, tools=tools, tool_choice="required"
    )

    print(blocking.choices[0].message.tool_calls)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,245 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
"""
|
||||
Set up this example by starting a vLLM OpenAI-compatible server with tool call
|
||||
options enabled for xLAM-2 models:
|
||||
|
||||
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
|
||||
|
||||
OR
|
||||
|
||||
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "empty"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
|
||||
# Define tool functions
|
||||
def get_weather(location: str, unit: str):
    """Stub weather tool: always reports 22 degrees for *location*."""
    report = f"Weather in {location} is 22 degrees {unit}."
    return report
|
||||
|
||||
|
||||
def calculate_expression(expression: str):
    """Evaluate a Python expression and describe the result.

    SECURITY NOTE: ``eval`` executes arbitrary code. This is tolerable only
    because the input comes from a local demo model — never feed it
    untrusted user input.
    """
    try:
        value = eval(expression)
    except Exception as exc:
        return f"Could not calculate {expression}: {exc}"
    return f"The result of {expression} is {value}"
|
||||
|
||||
|
||||
def translate_text(text: str, target_language: str):
    """Stub translation tool: returns a placeholder translation string."""
    placeholder = "[translated content]"
    return f"Translation of '{text}' to {target_language}: {placeholder}"
|
||||
|
||||
|
||||
# Define tools
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "City and state, e.g., 'San Francisco, CA'",
|
||||
},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||
},
|
||||
"required": ["location", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "calculate_expression",
|
||||
"description": "Calculate a mathematical expression",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"expression": {
|
||||
"type": "string",
|
||||
"description": "Mathematical expression to evaluate, needs to be a valid python expression",
|
||||
}
|
||||
},
|
||||
"required": ["expression"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "translate_text",
|
||||
"description": "Translate text to another language",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {"type": "string", "description": "Text to translate"},
|
||||
"target_language": {
|
||||
"type": "string",
|
||||
"description": "Target language for translation",
|
||||
},
|
||||
},
|
||||
"required": ["text", "target_language"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
# Map of function names to implementations
|
||||
tool_functions = {
|
||||
"get_weather": get_weather,
|
||||
"calculate_expression": calculate_expression,
|
||||
"translate_text": translate_text,
|
||||
}
|
||||
|
||||
|
||||
def process_response(response, tool_functions, original_query):
    """Process a non-streaming response with possible tool calls.

    Prints any text content, executes each requested tool via
    *tool_functions*, then sends one follow-up request containing the
    assistant tool calls plus all tool results and prints the final answer.

    NOTE(review): relies on the module-level ``client`` set up by main() —
    must be called after main() has initialized it.
    """

    print("\n--- Response Output ---")

    # Check if the response has content
    if response.choices[0].message.content:
        print(f"Content: {response.choices[0].message.content}")

    # Check if the response has tool calls
    if response.choices[0].message.tool_calls:
        print("--------------------------------")
        print(f"Tool calls: {response.choices[0].message.tool_calls}")
        print("--------------------------------")

        # Collect all tool calls and results before making follow-up request
        tool_results = []
        assistant_message = {"role": "assistant"}

        if response.choices[0].message.content:
            assistant_message["content"] = response.choices[0].message.content

        assistant_tool_calls = []

        # Process each tool call
        for tool_call in response.choices[0].message.tool_calls:
            function_name = tool_call.function.name
            function_args = tool_call.function.arguments
            function_id = tool_call.id

            print(f"Function called: {function_name}")
            print(f"Arguments: {function_args}")
            print(f"Function ID: {function_id}")

            # Execute the function
            try:
                # Parse the JSON arguments
                args = json.loads(function_args)

                # Call the function with the arguments
                function_result = tool_functions[function_name](**args)
                print(f"\n--- Function Result ---\n{function_result}\n")

                # Add tool call to assistant message
                assistant_tool_calls.append(
                    {
                        "id": function_id,
                        "type": "function",
                        "function": {"name": function_name, "arguments": function_args},
                    }
                )

                # Add tool result to tool_results
                tool_results.append(
                    {
                        "role": "tool",
                        "tool_call_id": function_id,
                        "content": function_result,
                    }
                )

            # A failed tool call is logged and skipped; the follow-up request
            # then simply omits that call's result.
            except Exception as e:
                print(f"Error executing function: {e}")

        # Add tool_calls to assistant message
        assistant_message["tool_calls"] = assistant_tool_calls

        # Create a follow-up message with all function results
        follow_up_messages = [
            {"role": "user", "content": original_query},
            assistant_message,
        ]

        # Add all tool results to the messages
        follow_up_messages.extend(tool_results)

        # Get completion with all tool results in a single follow-up
        follow_up_response = client.chat.completions.create(
            model=client.models.list().data[0].id,
            messages=follow_up_messages,
            stream=False,
        )

        print("\n--- Follow-up Response ---")
        print(follow_up_response.choices[0].message.content)
        print("--- End Follow-up ---\n")

    print("--- End Response ---\n")
|
||||
|
||||
|
||||
def run_test_case(query, test_name):
    """Run one non-streaming test case and report its wall-clock time."""
    rule = "=" * 50
    print(f"\n{rule}\nTEST CASE: {test_name}\n{rule}")
    print(f"Query: '{query}'")

    started = time.time()

    # Non-streaming request; the model may answer directly or emit tool calls.
    response = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=[{"role": "user", "content": query}],
        tools=tools,
        tool_choice="auto",
        stream=False,
    )

    # Execute any requested tool calls and print the follow-up answer.
    process_response(response, tool_functions, query)

    print(f"Test completed in {time.time() - started:.2f} seconds")
|
||||
|
||||
|
||||
def main():
    """Create the shared client and drive every demo test case."""
    # The helper functions reference ``client`` at module scope.
    global client
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    scenarios = [
        ("I want to know the weather in San Francisco", "Weather Information"),
        ("Calculate 25 * 17 + 31", "Math Calculation"),
        ("Translate 'Hello world' to Spanish", "Text Translation"),
        ("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
    ]

    for prompt, label in scenarios:
        run_test_case(prompt, label)
        time.sleep(1)  # Small delay between tests

    print("\nAll tests completed.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,273 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
"""
|
||||
Set up this example by starting a vLLM OpenAI-compatible server with tool call
|
||||
options enabled for xLAM-2 models:
|
||||
|
||||
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
|
||||
|
||||
OR
|
||||
|
||||
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
|
||||
|
||||
This example demonstrates streaming tool calls with xLAM models.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "empty"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
|
||||
# Define tool functions
|
||||
def get_weather(location: str, unit: str):
    """Fake weather lookup used as a demo tool implementation."""
    return "Weather in {} is 22 degrees {}.".format(location, unit)
|
||||
|
||||
|
||||
def calculate_expression(expression: str):
    """Evaluate a Python arithmetic expression and describe the outcome.

    SECURITY NOTE: ``eval`` runs arbitrary code — acceptable only in this
    local demo; never pass it untrusted input.
    """
    try:
        outcome = eval(expression)
        return f"The result of {expression} is {outcome}"
    except Exception as err:
        return f"Could not calculate {expression}: {err}"
|
||||
|
||||
|
||||
def translate_text(text: str, target_language: str):
    """Fake translation tool: emits a fixed placeholder 'translation'."""
    return "Translation of '" + text + "' to " + target_language + ": [translated content]"
|
||||
|
||||
|
||||
# Define tools
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "City and state, e.g., 'San Francisco, CA'",
|
||||
},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||
},
|
||||
"required": ["location", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "calculate_expression",
|
||||
"description": "Calculate a mathematical expression",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"expression": {
|
||||
"type": "string",
|
||||
"description": "Mathematical expression to evaluate, needs to be a valid Python expression",
|
||||
}
|
||||
},
|
||||
"required": ["expression"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "translate_text",
|
||||
"description": "Translate text to another language",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {"type": "string", "description": "Text to translate"},
|
||||
"target_language": {
|
||||
"type": "string",
|
||||
"description": "Target language for translation",
|
||||
},
|
||||
},
|
||||
"required": ["text", "target_language"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
# Map of function names to implementations
|
||||
tool_functions = {
|
||||
"get_weather": get_weather,
|
||||
"calculate_expression": calculate_expression,
|
||||
"translate_text": translate_text,
|
||||
}
|
||||
|
||||
|
||||
def process_stream(response, tool_functions, original_query):
    """Process a streaming response with possible tool calls.

    Accumulates tool-call name/argument fragments per tool-call id as chunks
    arrive, executes each completed tool via *tool_functions*, then streams
    a follow-up completion that includes the tool results.

    NOTE(review): relies on the module-level ``client`` set up by main().
    """
    # Track multiple tool calls
    tool_calls = {}  # Dictionary to store tool calls by ID

    # Fragments after the first chunk of a call carry no id, so remember the
    # most recently seen id and attribute fragments to it.
    current_id = None

    print("\n--- Stream Output ---")
    for chunk in response:
        # Handle tool calls in the stream
        if chunk.choices[0].delta.tool_calls:
            for tool_call_chunk in chunk.choices[0].delta.tool_calls:
                # Get the tool call ID
                if hasattr(tool_call_chunk, "id") and tool_call_chunk.id:
                    current_id = tool_call_chunk.id
                    if current_id not in tool_calls:
                        tool_calls[current_id] = {
                            "function_name": None,
                            "function_args": "",
                            "function_id": current_id,
                        }

                # Extract function information as it comes in chunks
                if (
                    hasattr(tool_call_chunk, "function")
                    and current_id
                    and current_id in tool_calls
                ):
                    if (
                        hasattr(tool_call_chunk.function, "name")
                        and tool_call_chunk.function.name
                    ):
                        tool_calls[current_id]["function_name"] = (
                            tool_call_chunk.function.name
                        )
                        print(f"Function called: {tool_call_chunk.function.name}")

                    if (
                        hasattr(tool_call_chunk.function, "arguments")
                        and tool_call_chunk.function.arguments
                    ):
                        tool_calls[current_id]["function_args"] += (
                            tool_call_chunk.function.arguments
                        )
                        print(f"Arguments chunk: {tool_call_chunk.function.arguments}")

        # Handle regular content in the stream
        elif chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")

    print("\n--- End Stream ---\n")

    # Execute each function call and build messages for follow-up
    follow_up_messages = [{"role": "user", "content": original_query}]

    for tool_id, tool_data in tool_calls.items():
        function_name = tool_data["function_name"]
        function_args = tool_data["function_args"]
        function_id = tool_data["function_id"]

        # Skip calls whose name or arguments never fully arrived.
        if function_name and function_args:
            try:
                # Parse the JSON arguments
                args = json.loads(function_args)

                # Call the function with the arguments
                function_result = tool_functions[function_name](**args)
                print(
                    f"\n--- Function Result ({function_name}) ---\n{function_result}\n"
                )

                # Add the assistant message with tool call
                follow_up_messages.append(
                    {
                        "role": "assistant",
                        "tool_calls": [
                            {
                                "id": function_id,
                                "type": "function",
                                "function": {
                                    "name": function_name,
                                    "arguments": function_args,
                                },
                            }
                        ],
                    }
                )

                # Add the tool message with function result
                follow_up_messages.append(
                    {
                        "role": "tool",
                        "tool_call_id": function_id,
                        "content": function_result,
                    }
                )

            # Failed tool calls are logged and omitted from the follow-up.
            except Exception as e:
                print(f"Error executing function: {e}")

    # Only send follow-up if we have results to process
    if len(follow_up_messages) > 1:
        # Create a follow-up message with all the function results
        follow_up_response = client.chat.completions.create(
            model=client.models.list().data[0].id,
            messages=follow_up_messages,
            stream=True,
        )

        print("\n--- Follow-up Response ---")
        for chunk in follow_up_response:
            if chunk.choices[0].delta.content:
                print(chunk.choices[0].delta.content, end="")
        print("\n--- End Follow-up ---\n")
|
||||
|
||||
|
||||
def run_test_case(query, test_name):
    """Stream one query through the model, execute tool calls, and time it."""
    banner = "=" * 50
    print(f"\n{banner}\nTEST CASE: {test_name}\n{banner}")
    print(f"Query: '{query}'")

    t0 = time.time()

    # Streaming request with automatic tool choice.
    stream = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=[{"role": "user", "content": query}],
        tools=tools,
        tool_choice="auto",
        stream=True,
    )

    # Consume the stream, executing any tool calls it produces.
    process_stream(stream, tool_functions, query)

    print(f"Test completed in {time.time() - t0:.2f} seconds")
|
||||
|
||||
|
||||
def main():
    """Initialize the shared client and run every streaming test case."""
    # run_test_case/process_stream read ``client`` from module scope.
    global client
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    cases = [
        ("I want to know the weather in San Francisco", "Weather Information"),
        ("Calculate 25 * 17 + 31", "Math Calculation"),
        ("Translate 'Hello world' to Spanish", "Text Translation"),
        ("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
    ]

    for question, title in cases:
        run_test_case(question, title)
        time.sleep(1)  # Small delay between tests

    print("\nAll tests completed.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,170 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
An example demonstrates how to use tool calling with reasoning models
|
||||
like QwQ-32B. The reasoning will not be parsed by the tool
|
||||
calling process; only the final output will be parsed.
|
||||
|
||||
To run this example, you need to start the vLLM server with both
|
||||
the reasoning parser and tool calling enabled.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/QwQ-32B \
|
||||
--reasoning-parser deepseek_r1 \
|
||||
--enable-auto-tool-choice --tool-call-parser hermes
|
||||
|
||||
```
|
||||
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
# Now, simulate a tool call
|
||||
def get_current_weather(city: str, state: str, unit: str):
    """Stub tool: return a canned Dallas weather report.

    Arguments are ignored; the demo conversation always asks about Dallas,
    so a fixed string is enough to exercise the tool-calling path.
    """
    return (
        "The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
        "partly cloudly, with highs in the 90's."
    )
|
||||
|
||||
|
||||
available_tools = {"get_current_weather": get_current_weather}
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
properties = {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to find the weather for, e.g. 'San Francisco'",
|
||||
},
|
||||
"state": {
|
||||
"type": "string",
|
||||
"description": "the two-letter abbreviation for the state that the city is"
|
||||
" in, e.g. 'CA' which would mean 'California'",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
}
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": properties,
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
messages = [
|
||||
{"role": "user", "content": "Hi! How are you doing today?"},
|
||||
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
|
||||
),
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def extract_reasoning_and_calls(chunks: list):
    """Fold streamed chunks into (reasoning, arguments, function_names).

    Concatenates the model's streamed reasoning text, and for each tool call
    (keyed by its stream index) accumulates the argument fragments and
    records the function name.

    Returns:
        A tuple ``(reasoning, arguments, function_names)`` where *arguments*
        and *function_names* are parallel lists indexed by tool-call index.
    """
    reasoning = ""
    tool_call_idx = -1
    arguments = []
    function_names = []
    for chunk in chunks:
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            tool_call = delta.tool_calls[0]
            if tool_call.index != tool_call_idx:
                # New tool call began: open fresh accumulator slots.
                tool_call_idx = tool_call.index
                arguments.append("")
                function_names.append("")

            if tool_call.function:
                if tool_call.function.name:
                    function_names[tool_call_idx] = tool_call.function.name

                if tool_call.function.arguments:
                    arguments[tool_call_idx] += tool_call.function.arguments
        else:
            # Bug fix: delta.reasoning can be present but None on chunks that
            # carry only role/content updates, which made `+=` raise
            # TypeError. Guard for None as well as for a missing attribute.
            if getattr(delta, "reasoning", None) is not None:
                reasoning += delta.reasoning
    return reasoning, arguments, function_names
|
||||
|
||||
|
||||
def main():
    """Exercise reasoning + tool calling in four modes.

    Runs the same conversation four ways — automatic function calling
    (blocking and streaming) and named function calling via ``tool_choice``
    (blocking and streaming) — printing the parsed reasoning, function name,
    and arguments for each.
    """
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    print("---------Full Generate With Automatic Function Calling-------------")
    tool_calls = client.chat.completions.create(
        messages=messages, model=model, tools=tools
    )
    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
    print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}")
    print(
        f"function arguments: "
        f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}"
    )

    print("----------Stream Generate With Automatic Function Calling-----------")
    tool_calls_stream = client.chat.completions.create(
        messages=messages, model=model, tools=tools, stream=True
    )

    # Materialize the stream so the helper can make a single pass over it.
    chunks = list(tool_calls_stream)

    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)

    print(f"reasoning: {reasoning}")
    print(f"function name: {function_names[0]}")
    print(f"function arguments: {arguments[0]}")

    print("----------Full Generate With Named Function Calling-----------------")
    # Named function calling: force the model to call get_current_weather.
    tool_calls = client.chat.completions.create(
        messages=messages,
        model=model,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
    )

    tool_call = tool_calls.choices[0].message.tool_calls[0].function
    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
    print(f"function name: {tool_call.name}")
    print(f"function arguments: {tool_call.arguments}")
    print("----------Stream Generate With Named Function Calling--------------")

    tool_calls_stream = client.chat.completions.create(
        messages=messages,
        model=model,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
        stream=True,
    )

    chunks = list(tool_calls_stream)

    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
    print(f"reasoning: {reasoning}")
    print(f"function name: {function_names[0]}")
    print(f"function arguments: {arguments[0]}")
    print("\n\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,65 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
An example shows how to generate chat completions from reasoning models
|
||||
like DeepSeekR1.
|
||||
|
||||
To run this example, you need to start the vLLM server
|
||||
with the reasoning parser:
|
||||
|
||||
```bash
|
||||
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
|
||||
--reasoning-parser deepseek_r1
|
||||
```
|
||||
|
||||
This example demonstrates how to generate chat completions from reasoning models
|
||||
using the OpenAI Python client library.
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
|
||||
def main():
    """Run a two-round chat against a vLLM reasoning model.

    Prints the model's reasoning trace and its final answer for each
    round; round 2 feeds round 1's answer back as conversation history.
    """
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Use the first (and typically only) model served by this vLLM instance.
    served_model = client.models.list().data[0].id

    def ask(history):
        # ruff: noqa: E501
        # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
        reply = client.chat.completions.create(model=served_model, messages=history)
        message = reply.choices[0].message
        return message.reasoning, message.content

    # Round 1
    history = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
    reasoning, content = ask(history)
    print("reasoning for Round 1:", reasoning)
    print("content for Round 1:", content)

    # Round 2: append the assistant's answer, then ask a follow-up.
    history.append({"role": "assistant", "content": content})
    history.append(
        {
            "role": "user",
            "content": "How many Rs are there in the word 'strawberry'?",
        }
    )
    reasoning, content = ask(history)
    print("reasoning for Round 2:", reasoning)
    print("content for Round 2:", content)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,73 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
An example shows how to generate chat completions from reasoning models
|
||||
like DeepSeekR1.
|
||||
|
||||
To run this example, you need to start the vLLM server with the reasoning
|
||||
parser:
|
||||
|
||||
```bash
|
||||
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
|
||||
--reasoning-parser deepseek_r1
|
||||
```
|
||||
|
||||
Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
|
||||
streaming chat completions feature.
|
||||
|
||||
The streaming chat completions feature allows you to receive chat completions
|
||||
in real-time as they are generated by the model. This is useful for scenarios
|
||||
where you want to display chat completions to the user as they are generated
|
||||
by the model.
|
||||
|
||||
Remember to check content and reasoning exist in `ChatCompletionChunk`,
|
||||
content may not exist leading to errors if you try to access it.
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
|
||||
|
||||
|
||||
def main():
    """Stream a chat completion from a reasoning model.

    Reasoning tokens are printed first (prefixed once with "reasoning:"),
    then answer tokens (prefixed once with "\\ncontent:"), all flushed
    immediately so output appears in real time.
    """
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    model = client.models.list().data[0].id

    # ruff: noqa: E501
    # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    stream = client.chat.completions.create(model=model, messages=messages, stream=True)

    print("client: Start streaming chat completions...")
    seen_reasoning = False
    seen_content = False

    for chunk in stream:
        delta = chunk.choices[0].delta
        # Attributes may be missing or empty strings; normalize both to None.
        reasoning = getattr(delta, "reasoning", None) or None
        content = getattr(delta, "content", None) or None

        if reasoning is not None:
            if not seen_reasoning:
                seen_reasoning = True
                print("reasoning:", end="", flush=True)
            print(reasoning, end="", flush=True)
        elif content is not None:
            if not seen_content:
                seen_content = True
                print("\ncontent:", end="", flush=True)
            print(content, end="", flush=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
53
examples/online_serving/openai_completion_client.py
Normal file
53
examples/online_serving/openai_completion_client.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
|
||||
def parse_args():
    """Parse command-line options for the completion client.

    Returns:
        argparse.Namespace with a boolean ``stream`` attribute
        (True when ``--stream`` was given).
    """
    cli = argparse.ArgumentParser(description="Client for vLLM API server")
    cli.add_argument(
        "--stream",
        action="store_true",
        help="Enable streaming response",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Request completions from a vLLM server and print the results.

    Args:
        args: Parsed CLI namespace; ``args.stream`` toggles streaming output.
    """
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    model = client.models.list().data[0].id

    # Completion API
    completion = client.completions.create(
        model=model,
        prompt="A robot may not injure a human being",
        echo=False,
        n=2,
        stream=args.stream,
        logprobs=3,
    )

    divider = "-" * 50
    print(divider)
    print("Completion results:")
    if args.stream:
        # In streaming mode the response is an iterator of chunks.
        for part in completion:
            print(part)
    else:
        print(completion)
    print(divider)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
44
examples/online_serving/openai_responses_client.py
Normal file
44
examples/online_serving/openai_responses_client.py
Normal file
@@ -0,0 +1,44 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Set up this example by starting a vLLM OpenAI-compatible server.
|
||||
Reasoning models can be used through the Responses API as seen here
|
||||
https://platform.openai.com/docs/api-reference/responses
|
||||
For example:
|
||||
vllm serve Qwen/Qwen3-8B --reasoning-parser qwen3
|
||||
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
input_messages = [{"role": "user", "content": "What model are you?"}]
|
||||
|
||||
|
||||
def main():
    """Query a reasoning model twice through the Responses API.

    The reasoning items from the first response are appended to the input
    of the second request so the model keeps its chain of thought.
    """
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="empty")
    model = "Qwen/Qwen3-8B"  # get_first_model(client)

    first = client.responses.create(
        model=model,
        input=input_messages,
    )

    # Carry over only the reasoning items from the first turn.
    input_messages.extend(
        item for item in first.output if item.type == "reasoning"
    )

    second = client.responses.create(
        model=model,
        input=input_messages,
    )
    print(second.output_text)
    # Example output:
    # I am Qwen, a large language model developed by Alibaba Cloud.
    # I am designed to assist with a wide range of tasks, including
    # answering questions, creating content, coding, and engaging in
    # conversations. I can help with various topics and provide
    # information or support in multiple languages. How can I assist you today?
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,184 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Example demonstrating MCP (Model Context Protocol) tools with the Responses API.
|
||||
|
||||
This example shows how to use MCP tools with different allowed_tools configurations:
|
||||
1. No filter (allows all tools from the MCP server)
|
||||
2. Wildcard "*" (explicitly allows all tools)
|
||||
3. Specific tool names (filters to only those tools)
|
||||
|
||||
Set up this example by starting a vLLM OpenAI-compatible server with MCP tools enabled.
|
||||
For example:
|
||||
vllm serve openai/gpt-oss-20b --enforce-eager --tool-server demo
|
||||
|
||||
Environment variables:
|
||||
- VLLM_ENABLE_RESPONSES_API_STORE=1
|
||||
- VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=code_interpreter,container
|
||||
- VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS=1
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
from utils import get_first_model
|
||||
|
||||
|
||||
def example_no_filter():
    """Example with no allowed_tools filter - allows all tools."""
    banner = "=" * 60
    print(banner)
    print("Example 1: No allowed_tools filter (allows all tools)")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    mcp_tool = {
        "type": "mcp",
        "server_label": "code_interpreter",
        "server_url": "http://localhost:8888",
        # No allowed_tools specified - all tools are available
    }
    response = client.responses.create(
        model=model,
        input="Execute this code: print('Hello from Python!')",
        instructions="Use the Python tool to execute code.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
|
||||
|
||||
|
||||
def example_wildcard():
    """Example with allowed_tools=['*'] - explicitly allows all tools."""
    banner = "=" * 60
    print(banner)
    print("Example 2: allowed_tools=['*'] (select all tools)")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    mcp_tool = {
        "type": "mcp",
        "server_label": "code_interpreter",
        "server_url": "http://localhost:8888",
        # Using "*" to explicitly allow all tools from this MCP server
        # This is equivalent to not specifying allowed_tools
        "allowed_tools": ["*"],
    }
    response = client.responses.create(
        model=model,
        input="Execute this code: print('Hello from Python with wildcard!')",
        instructions="Use the Python tool to execute code.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
|
||||
|
||||
|
||||
def example_specific_tools():
    """Example with specific allowed_tools list - filters available tools.

    Note: This example uses 'web_search_preview' (browser) which has multiple
    sub-tools: 'search', 'open', 'find'. The code_interpreter (python) doesn't
    have sub-tools, so filtering doesn't apply there.
    """
    banner = "=" * 60
    print(banner)
    print("Example 3: allowed_tools=['search'] (filter browser to specific tools)")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    mcp_tool = {
        "type": "mcp",
        "server_label": "web_search_preview",
        "server_url": "http://localhost:8888",
        # Browser has tools: 'search', 'open', 'find'
        # Only allow 'search' - blocks 'open' and 'find'
        "allowed_tools": ["search"],
    }
    response = client.responses.create(
        model=model,
        input="Search for 'Python programming tutorials'",
        instructions="Use the browser tool to search.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
|
||||
|
||||
|
||||
def example_object_format():
    """Example using object format for allowed_tools with browser tools."""
    banner = "=" * 60
    print(banner)
    print("Example 4: allowed_tools with object format")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    mcp_tool = {
        "type": "mcp",
        "server_label": "web_search_preview",
        "server_url": "http://localhost:8888",
        # Object format with tool_names field
        # Can also include read_only and other fields
        # Browser has tools: 'search', 'open', 'find'
        "allowed_tools": {
            "tool_names": [
                "search",
                "open",
            ],  # Allow search and open, block find
            "read_only": False,
        },
    }
    response = client.responses.create(
        model=model,
        input="Search for 'machine learning' and open the first result",
        instructions="Use the browser tool.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
|
||||
|
||||
|
||||
def main():
    """Run all examples."""
    banner = "=" * 60
    print("\n" + banner)
    print("MCP Tools with allowed_tools Examples")
    print(banner + "\n")

    # Run all examples in order.
    for example in (
        example_no_filter,
        example_wildcard,
        example_specific_tools,
        example_object_format,
    ):
        example()

    print(banner)
    print("Summary:")
    print("  - No filter or '*' → All tools available from server")
    print("  - Specific list → Only those sub-tools available")
    print("  - Object format → More control with tool_names field")
    print("")
    print("Note: allowed_tools filters SUB-TOOLS within an MCP server:")
    print("  - code_interpreter (python): No sub-tools to filter")
    print("  - web_search_preview (browser): Has 'search', 'open', 'find'")
    print(banner)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,83 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Set up this example by starting a vLLM OpenAI-compatible server with tool call
|
||||
options enabled.
|
||||
Reasoning models can be used through the Responses API as seen here
|
||||
https://platform.openai.com/docs/api-reference/responses
|
||||
For example:
|
||||
vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
|
||||
--structured-outputs-config.backend xgrammar \
|
||||
--enable-auto-tool-choice --tool-call-parser hermes
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from openai import OpenAI
|
||||
from utils import get_first_model
|
||||
|
||||
|
||||
def get_weather(latitude: float, longitude: float) -> str:
    """Return a canned weather report for the given coordinates.

    Stands in for a real external weather API in this example.
    """
    location = f"({latitude}, {longitude})"
    return f"Current temperature at {location} is 20°C."
|
||||
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description": "Get current temperature for provided coordinates in celsius.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {"type": "number"},
|
||||
"longitude": {"type": "number"},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
}
|
||||
]
|
||||
|
||||
input_messages = [
|
||||
{"role": "user", "content": "What's the weather like in Paris today?"}
|
||||
]
|
||||
|
||||
|
||||
def main():
    """Drive one round of tool calling through the Responses API.

    Sends the user question with tool_choice="required", executes the
    model's requested function call locally, appends the call and its
    output to the conversation, and prints the follow-up answer.
    """
    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)
    response = client.responses.create(
        model=model, input=input_messages, tools=tools, tool_choice="required"
    )

    tool_call = None
    for item in response.output:
        if item.type == "function_call":
            print("Function call:", item.name, item.arguments)
            tool_call = item
            call_args = json.loads(tool_call.arguments)
            result = get_weather(call_args["latitude"], call_args["longitude"])

    input_messages.append(tool_call)  # append model's function call message
    input_messages.append(
        {  # append result message
            "type": "function_call_output",
            "call_id": tool_call.call_id,
            "output": str(result),
        }
    )
    response_2 = client.responses.create(
        model=model,
        input=input_messages,
        tools=tools,
    )
    print(response_2.output_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
97
examples/online_serving/openai_transcription_client.py
Normal file
97
examples/online_serving/openai_transcription_client.py
Normal file
@@ -0,0 +1,97 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This script demonstrates how to use the vLLM API server to perform audio
|
||||
transcription with the `openai/whisper-large-v3` model.
|
||||
|
||||
Before running this script, you must start the vLLM server with the following command:
|
||||
|
||||
vllm serve openai/whisper-large-v3
|
||||
|
||||
Requirements:
|
||||
- vLLM with audio support
|
||||
- openai Python SDK
|
||||
- httpx for streaming support
|
||||
|
||||
The script performs:
|
||||
1. Synchronous transcription using OpenAI-compatible API.
|
||||
2. Streaming transcription using raw HTTP request to the vLLM server.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
from openai import AsyncOpenAI, OpenAI
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
def sync_openai(audio_path: str, client: OpenAI):
    """
    Perform synchronous transcription using OpenAI-compatible API.
    """
    # vLLM-specific sampling params not in the OpenAI API go via extra_body.
    extra = {"seed": 4419, "repetition_penalty": 1.3}
    with open(audio_path, "rb") as audio_file:
        result = client.audio.transcriptions.create(
            file=audio_file,
            model="openai/whisper-large-v3",
            language="en",
            response_format="json",
            temperature=0.0,
            extra_body=extra,
        )
    print("transcription result:", result.text)
|
||||
|
||||
|
||||
async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
    """
    Perform asynchronous transcription using OpenAI-compatible API,
    printing tokens as they arrive.
    """
    print("\ntranscription result:", end=" ")
    with open(audio_path, "rb") as audio_file:
        stream = await client.audio.transcriptions.create(
            file=audio_file,
            model="openai/whisper-large-v3",
            language="en",
            response_format="json",
            temperature=0.0,
            # Additional sampling params not provided by OpenAI API.
            extra_body={"seed": 420, "top_p": 0.6},
            stream=True,
        )
        async for event in stream:
            if not event.choices:
                continue
            # NOTE(review): dict-style access assumes the choice entries
            # behave like mappings — confirm against the SDK's stream
            # event type for transcriptions.
            piece = event.choices[0].get("delta", {}).get("content")
            print(piece, end="", flush=True)

    print()  # Final newline after stream ends
|
||||
|
||||
|
||||
def main():
    """Transcribe two bundled audio clips: one synchronously, then one
    through a streaming request."""
    mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
    winning_call = str(AudioAsset("winning_call").get_local_path())

    # Modify OpenAI's API key and API base to use vLLM's API server.
    api_key = "EMPTY"
    api_base = "http://localhost:8000/v1"

    sync_client = OpenAI(api_key=api_key, base_url=api_base)
    sync_openai(mary_had_lamb, sync_client)

    # Run the asynchronous function
    async_client = AsyncOpenAI(api_key=api_key, base_url=api_base)
    asyncio.run(stream_openai_response(winning_call, async_client))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
75
examples/online_serving/openai_translation_client.py
Normal file
75
examples/online_serving/openai_translation_client.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import httpx
|
||||
from openai import OpenAI
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
def sync_openai(audio_path: str, client: OpenAI):
    """Translate an audio file with a blocking request and print the text."""
    # vLLM-specific params (source language, seed, penalty) go via extra_body.
    extra = {"language": "it", "seed": 4419, "repetition_penalty": 1.3}
    with open(audio_path, "rb") as audio_file:
        result = client.audio.translations.create(
            file=audio_file,
            model="openai/whisper-large-v3",
            response_format="json",
            temperature=0.0,
            extra_body=extra,
        )
    print("translation result:", result.text)
|
||||
|
||||
|
||||
async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
    """
    Stream a translation from the vLLM server over raw HTTP (SSE).

    The OpenAI translation client does not support streaming, so this
    posts multipart form data directly and parses the ``data: `` lines.

    Args:
        audio_path: Path of the audio file to translate.
        base_url: Server base URL, e.g. "http://localhost:8000/v1".
        api_key: Bearer token sent in the Authorization header.
    """
    data = {
        "language": "it",
        "stream": True,
        "model": "openai/whisper-large-v3",
    }
    url = base_url + "/audio/translations"
    headers = {"Authorization": f"Bearer {api_key}"}
    print("translation result:", end=" ")
    # OpenAI translation API client does not support streaming.
    async with httpx.AsyncClient() as client:
        with open(audio_path, "rb") as f:
            async with client.stream(
                "POST", url, files={"file": f}, data=data, headers=headers
            ) as response:
                async for line in response.aiter_lines():
                    # Each line is a JSON object prefixed with 'data: '
                    if not line:
                        continue
                    if line.startswith("data: "):
                        line = line[len("data: ") :]
                    # Last chunk, stream ends
                    if line.strip() == "[DONE]":
                        break
                    # Parse the JSON response
                    chunk = json.loads(line)
                    # Extract and print the content. Skip chunks without
                    # delta content (e.g. the final/usage chunk) so we
                    # don't print a literal "None"; flush so tokens show
                    # up as they arrive rather than on buffer fill.
                    content = chunk["choices"][0].get("delta", {}).get("content")
                    if content:
                        print(content, end="", flush=True)
    print()  # final newline after the stream ends, matching sync output
|
||||
|
||||
|
||||
def main():
    """Translate the bundled Foscolo clip, first synchronously and then
    via the streaming HTTP endpoint."""
    foscolo = str(AudioAsset("azacinto_foscolo").get_local_path())

    # Modify OpenAI's API key and API base to use vLLM's API server.
    api_key = "EMPTY"
    api_base = "http://localhost:8000/v1"

    sync_openai(foscolo, OpenAI(api_key=api_key, base_url=api_base))
    # Run the asynchronous function
    asyncio.run(stream_openai_response(foscolo, api_base, api_key))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
94
examples/online_serving/opentelemetry/README.md
Normal file
94
examples/online_serving/opentelemetry/README.md
Normal file
@@ -0,0 +1,94 @@
|
||||
# Setup OpenTelemetry POC
|
||||
|
||||
1. Install OpenTelemetry packages:
|
||||
|
||||
```bash
|
||||
pip install \
|
||||
'opentelemetry-sdk>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-api>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'
|
||||
```
|
||||
|
||||
1. Start Jaeger in a docker container:
|
||||
|
||||
```bash
|
||||
# From: https://www.jaegertracing.io/docs/1.57/getting-started/
|
||||
docker run --rm --name jaeger \
|
||||
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
|
||||
-p 6831:6831/udp \
|
||||
-p 6832:6832/udp \
|
||||
-p 5778:5778 \
|
||||
-p 16686:16686 \
|
||||
-p 4317:4317 \
|
||||
-p 4318:4318 \
|
||||
-p 14250:14250 \
|
||||
-p 14268:14268 \
|
||||
-p 14269:14269 \
|
||||
-p 9411:9411 \
|
||||
jaegertracing/all-in-one:1.57
|
||||
```
|
||||
|
||||
1. In a new shell, export Jaeger IP:
|
||||
|
||||
```bash
|
||||
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
```
|
||||
|
||||
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
|
||||
|
||||
```bash
|
||||
export OTEL_SERVICE_NAME="vllm-server"
|
||||
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
|
||||
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
|
||||
```
|
||||
|
||||
1. In a new shell, send requests with trace context from a dummy client:
|
||||
|
||||
```bash
|
||||
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
|
||||
export OTEL_SERVICE_NAME="client-service"
|
||||
python dummy_client.py
|
||||
```
|
||||
|
||||
1. Open Jaeger webui: <http://localhost:16686/>
|
||||
|
||||
In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
|
||||

|
||||
|
||||
1. Clicking on a trace will show its spans and their tags. In this demo, each trace has 2 spans. One from the dummy client containing the prompt text and one from vLLM containing metadata about the request.
|
||||

|
||||
|
||||
## Exporter Protocol
|
||||
|
||||
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
|
||||
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
|
||||
|
||||
```bash
|
||||
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
|
||||
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
|
||||
```
|
||||
|
||||
## Instrumentation of FastAPI
|
||||
|
||||
OpenTelemetry allows automatic instrumentation of FastAPI.
|
||||
|
||||
1. Install the instrumentation library
|
||||
|
||||
```bash
|
||||
pip install opentelemetry-instrumentation-fastapi
|
||||
```
|
||||
|
||||
1. Run vLLM with `opentelemetry-instrument`
|
||||
|
||||
```bash
|
||||
opentelemetry-instrument vllm serve facebook/opt-125m
|
||||
```
|
||||
|
||||
1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
|
||||
|
||||

|
||||
34
examples/online_serving/opentelemetry/dummy_client.py
Normal file
34
examples/online_serving/opentelemetry/dummy_client.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import requests
|
||||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
|
||||
from opentelemetry.trace import SpanKind, set_tracer_provider
|
||||
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
|
||||
|
||||
# Export spans both to the OTLP endpoint (Jaeger) and to the console so
# the demo gives feedback even without a collector running.
trace_provider = TracerProvider()
set_tracer_provider(trace_provider)

trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))

tracer = trace_provider.get_tracer("dummy-client")

url = "http://localhost:8000/v1/completions"
with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
    prompt = "San Francisco is a"
    span.set_attribute("prompt", prompt)
    # Inject the current trace context into the HTTP headers so the
    # server-side vLLM span is linked to this client span in Jaeger.
    headers = {}
    TraceContextTextMapPropagator().inject(headers)
    payload = {
        "model": "facebook/opt-125m",
        "prompt": prompt,
        "max_tokens": 10,
        "n": 3,
        "use_beam_search": "true",
        "temperature": 0.0,
        # "stream": True,
    }
    response = requests.post(url, headers=headers, json=payload)
    # Fail loudly on HTTP errors and show the completion; previously the
    # response was silently discarded, which made the demo hard to debug.
    response.raise_for_status()
    print(response.json())
|
||||
57
examples/online_serving/prometheus_grafana/README.md
Normal file
57
examples/online_serving/prometheus_grafana/README.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# Prometheus and Grafana
|
||||
|
||||
This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites.
|
||||
|
||||
Install:
|
||||
|
||||
- [`docker`](https://docs.docker.com/engine/install/)
|
||||
- [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
|
||||
|
||||
## Launch
|
||||
|
||||
Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
|
||||
|
||||
```bash
|
||||
vllm serve mistralai/Mistral-7B-v0.1 \
|
||||
--max-model-len 2048
|
||||
```
|
||||
|
||||
Launch Prometheus and Grafana servers with `docker compose`:
|
||||
|
||||
```bash
|
||||
docker compose up
|
||||
```
|
||||
|
||||
Submit some sample requests to the server:
|
||||
|
||||
```bash
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
vllm bench serve \
|
||||
--model mistralai/Mistral-7B-v0.1 \
|
||||
--tokenizer mistralai/Mistral-7B-v0.1 \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--request-rate 3.0
|
||||
```
|
||||
|
||||
Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM.
|
||||
|
||||
## Grafana Dashboard
|
||||
|
||||
Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`).
|
||||
|
||||
### Add Prometheus Data Source
|
||||
|
||||
Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus.
|
||||
|
||||
On the Prometheus configuration page, add the `Prometheus Server URL` under `Connection`. In this setup, Grafana and Prometheus run in separate containers, but Docker creates a DNS name for each container, so you can simply use `http://prometheus:9090`.
|
||||
|
||||
Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
|
||||
|
||||
### Import Dashboard
|
||||
|
||||
Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:
|
||||
|
||||

|
||||
@@ -0,0 +1,19 @@
|
||||
# docker-compose.yaml
# NOTE(review): the `version` key is obsolete under the Compose
# Specification (Docker Compose v2 ignores it); kept for legacy
# docker-compose v1 compatibility.
version: "3"

services:
  # Prometheus scrapes vLLM's /metrics endpoint (see prometheus.yaml).
  prometheus:
    image: prom/prometheus:latest
    extra_hosts:
      - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine
    ports:
      - "9090:9090" # the default port used by Prometheus
    volumes:
      - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file

  # Grafana visualizes the metrics stored in Prometheus
  # (import grafana.json as the dashboard).
  grafana:
    image: grafana/grafana:latest
    depends_on:
      - prometheus
    ports:
      - "3000:3000" # the default port used by Grafana
|
||||
1527
examples/online_serving/prometheus_grafana/grafana.json
Normal file
1527
examples/online_serving/prometheus_grafana/grafana.json
Normal file
File diff suppressed because it is too large
Load Diff
10
examples/online_serving/prometheus_grafana/prometheus.yaml
Normal file
10
examples/online_serving/prometheus_grafana/prometheus.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
# prometheus.yaml
|
||||
global:
|
||||
scrape_interval: 5s
|
||||
evaluation_interval: 30s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: vllm
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'host.docker.internal:8000'
|
||||
@@ -0,0 +1,79 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
vLLM OpenAI-Compatible Client with Prompt Embeddings
|
||||
|
||||
This script demonstrates how to:
|
||||
1. Generate prompt embeddings using Hugging Face Transformers
|
||||
2. Encode them in base64 format
|
||||
3. Send them to a vLLM server via the OpenAI-compatible Completions API
|
||||
|
||||
Run the vLLM server first:
|
||||
vllm serve meta-llama/Llama-3.2-1B-Instruct \
|
||||
--runner generate \
|
||||
--max-model-len 4096 \
|
||||
--enable-prompt-embeds
|
||||
|
||||
Run the client:
|
||||
python examples/online_serving/prompt_embed_inference_with_openai_client.py
|
||||
|
||||
Model: meta-llama/Llama-3.2-1B-Instruct
|
||||
Note: This model is gated on Hugging Face Hub.
|
||||
You must request access to use it:
|
||||
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
|
||||
|
||||
Dependencies:
|
||||
- transformers
|
||||
- torch
|
||||
- openai
|
||||
"""
|
||||
|
||||
import transformers
|
||||
from openai import OpenAI
|
||||
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
|
||||
def main():
|
||||
client = OpenAI(
|
||||
api_key="EMPTY",
|
||||
base_url="http://localhost:8000/v1",
|
||||
)
|
||||
|
||||
model_name = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
# Transformers
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
|
||||
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
|
||||
|
||||
# Refer to the HuggingFace repo for the correct format to use
|
||||
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
|
||||
token_ids = tokenizer.apply_chat_template(
|
||||
chat, add_generation_prompt=True, return_tensors="pt"
|
||||
)
|
||||
|
||||
embedding_layer = transformers_model.get_input_embeddings()
|
||||
prompt_embeds = embedding_layer(token_ids).squeeze(0)
|
||||
|
||||
# Prompt embeddings
|
||||
encoded_embeds = tensor2base64(prompt_embeds)
|
||||
|
||||
completion = client.completions.create(
|
||||
model=model_name,
|
||||
# NOTE: The OpenAI client does not allow `None` as an input to
|
||||
# `prompt`. Use an empty string if you have no text prompts.
|
||||
prompt="",
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
# NOTE: The OpenAI client allows passing in extra JSON body via the
|
||||
# `extra_body` argument.
|
||||
extra_body={"prompt_embeds": encoded_embeds},
|
||||
)
|
||||
|
||||
print("-" * 30)
|
||||
print(completion.choices[0].text)
|
||||
print("-" * 30)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
55
examples/online_serving/ray_serve_deepseek.py
Normal file
55
examples/online_serving/ray_serve_deepseek.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Deploy DeepSeek R1 or V3 with Ray Serve LLM.
|
||||
|
||||
Ray Serve LLM is a scalable and production-grade model serving library built
|
||||
on the Ray distributed computing framework and first-class support for the vLLM engine.
|
||||
|
||||
Key features:
|
||||
- Automatic scaling, back-pressure, and load balancing across a Ray cluster.
|
||||
- Unified multi-node multi-model deployment.
|
||||
- Exposes an OpenAI-compatible HTTP API.
|
||||
- Multi-LoRA support with shared base models.
|
||||
|
||||
Run `python3 ray_serve_deepseek.py` to launch an endpoint.
|
||||
|
||||
Learn more in the official Ray Serve LLM documentation:
|
||||
https://docs.ray.io/en/latest/serve/llm/serving-llms.html
|
||||
"""
|
||||
|
||||
from ray import serve
|
||||
from ray.serve.llm import LLMConfig, build_openai_app
|
||||
|
||||
llm_config = LLMConfig(
|
||||
model_loading_config={
|
||||
"model_id": "deepseek",
|
||||
# Pre-downloading the model to local storage is recommended since
|
||||
# the model is large. Set model_source="/path/to/the/model".
|
||||
"model_source": "deepseek-ai/DeepSeek-R1",
|
||||
},
|
||||
deployment_config={
|
||||
"autoscaling_config": {
|
||||
"min_replicas": 1,
|
||||
"max_replicas": 1,
|
||||
}
|
||||
},
|
||||
# Set to the node's accelerator type.
|
||||
accelerator_type="H100",
|
||||
# Customize engine arguments as required (for example, vLLM engine kwargs).
|
||||
engine_kwargs={
|
||||
"tensor_parallel_size": 8,
|
||||
"pipeline_parallel_size": 2,
|
||||
"gpu_memory_utilization": 0.92,
|
||||
"dtype": "auto",
|
||||
"max_num_seqs": 40,
|
||||
"max_model_len": 16384,
|
||||
"enable_chunked_prefill": True,
|
||||
"enable_prefix_caching": True,
|
||||
"trust_remote_code": True,
|
||||
},
|
||||
)
|
||||
|
||||
# Deploy the application.
|
||||
llm_app = build_openai_app({"llm_configs": [llm_config]})
|
||||
serve.run(llm_app)
|
||||
@@ -0,0 +1,257 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Retrieval Augmented Generation (RAG) Implementation with Langchain
|
||||
==================================================================
|
||||
|
||||
This script demonstrates a RAG implementation using LangChain, Milvus
|
||||
and vLLM. RAG enhances LLM responses by retrieving relevant context
|
||||
from a document collection.
|
||||
|
||||
Features:
|
||||
- Web content loading and chunking
|
||||
- Vector storage with Milvus
|
||||
- Embedding generation with vLLM
|
||||
- Question answering with context
|
||||
|
||||
Prerequisites:
|
||||
1. Install dependencies:
|
||||
pip install -U vllm \
|
||||
langchain_milvus langchain_openai \
|
||||
langchain_community beautifulsoup4 \
|
||||
langchain-text-splitters
|
||||
|
||||
2. Start services:
|
||||
# Start embedding service (port 8000)
|
||||
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
|
||||
|
||||
# Start chat service (port 8001)
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||
|
||||
Usage:
|
||||
python retrieval_augmented_generation_with_langchain.py
|
||||
|
||||
Notes:
|
||||
- Ensure both vLLM services are running before executing
|
||||
- Default ports: 8000 (embedding), 8001 (chat)
|
||||
- First run may take time to download models
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from argparse import Namespace
|
||||
from typing import Any
|
||||
|
||||
from langchain_community.document_loaders import WebBaseLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import PromptTemplate
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
from langchain_milvus import Milvus
|
||||
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
|
||||
|
||||
def load_and_split_documents(config: dict[str, Any]):
|
||||
"""
|
||||
Load and split documents from web URL
|
||||
"""
|
||||
try:
|
||||
loader = WebBaseLoader(web_paths=(config["url"],))
|
||||
docs = loader.load()
|
||||
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=config["chunk_size"],
|
||||
chunk_overlap=config["chunk_overlap"],
|
||||
)
|
||||
return text_splitter.split_documents(docs)
|
||||
except Exception as e:
|
||||
print(f"Error loading document from {config['url']}: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
def init_vectorstore(config: dict[str, Any], documents: list[Document]):
|
||||
"""
|
||||
Initialize vector store with documents
|
||||
"""
|
||||
return Milvus.from_documents(
|
||||
documents=documents,
|
||||
embedding=OpenAIEmbeddings(
|
||||
model=config["embedding_model"],
|
||||
openai_api_key=config["vllm_api_key"],
|
||||
openai_api_base=config["vllm_embedding_endpoint"],
|
||||
),
|
||||
connection_args={"uri": config["uri"]},
|
||||
drop_old=True,
|
||||
)
|
||||
|
||||
|
||||
def init_llm(config: dict[str, Any]):
|
||||
"""
|
||||
Initialize llm
|
||||
"""
|
||||
return ChatOpenAI(
|
||||
model=config["chat_model"],
|
||||
openai_api_key=config["vllm_api_key"],
|
||||
openai_api_base=config["vllm_chat_endpoint"],
|
||||
)
|
||||
|
||||
|
||||
def get_qa_prompt():
|
||||
"""
|
||||
Get question answering prompt template
|
||||
"""
|
||||
template = """You are an assistant for question-answering tasks.
|
||||
Use the following pieces of retrieved context to answer the question.
|
||||
If you don't know the answer, just say that you don't know.
|
||||
Use three sentences maximum and keep the answer concise.
|
||||
Question: {question}
|
||||
Context: {context}
|
||||
Answer:
|
||||
"""
|
||||
return PromptTemplate.from_template(template)
|
||||
|
||||
|
||||
def format_docs(docs: list[Document]):
|
||||
"""
|
||||
Format documents for prompt
|
||||
"""
|
||||
return "\n\n".join(doc.page_content for doc in docs)
|
||||
|
||||
|
||||
def create_qa_chain(retriever: Any, llm: ChatOpenAI, prompt: PromptTemplate):
|
||||
"""
|
||||
Set up question answering chain
|
||||
"""
|
||||
return (
|
||||
{
|
||||
"context": retriever | format_docs,
|
||||
"question": RunnablePassthrough(),
|
||||
}
|
||||
| prompt
|
||||
| llm
|
||||
| StrOutputParser()
|
||||
)
|
||||
|
||||
|
||||
def get_parser() -> argparse.ArgumentParser:
|
||||
"""
|
||||
Parse command line arguments
|
||||
"""
|
||||
parser = argparse.ArgumentParser(description="RAG with vLLM and langchain")
|
||||
|
||||
# Add command line arguments
|
||||
parser.add_argument(
|
||||
"--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--vllm-embedding-endpoint",
|
||||
default="http://localhost:8000/v1",
|
||||
help="Base URL for embedding service",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--vllm-chat-endpoint",
|
||||
default="http://localhost:8001/v1",
|
||||
help="Base URL for chat service",
|
||||
)
|
||||
parser.add_argument("--uri", default="./milvus.db", help="URI for Milvus database")
|
||||
parser.add_argument(
|
||||
"--url",
|
||||
default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"),
|
||||
help="URL of the document to process",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--embedding-model",
|
||||
default="ssmits/Qwen2-7B-Instruct-embed-base",
|
||||
help="Model name for embeddings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c",
|
||||
"--chunk-size",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Chunk size for document splitting",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--chunk-overlap",
|
||||
type=int,
|
||||
default=200,
|
||||
help="Chunk overlap for document splitting",
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def init_config(args: Namespace):
|
||||
"""
|
||||
Initialize configuration settings from command line arguments
|
||||
"""
|
||||
|
||||
return {
|
||||
"vllm_api_key": args.vllm_api_key,
|
||||
"vllm_embedding_endpoint": args.vllm_embedding_endpoint,
|
||||
"vllm_chat_endpoint": args.vllm_chat_endpoint,
|
||||
"uri": args.uri,
|
||||
"embedding_model": args.embedding_model,
|
||||
"chat_model": args.chat_model,
|
||||
"url": args.url,
|
||||
"chunk_size": args.chunk_size,
|
||||
"chunk_overlap": args.chunk_overlap,
|
||||
"top_k": args.top_k,
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
# Parse command line arguments
|
||||
args = get_parser().parse_args()
|
||||
|
||||
# Initialize configuration
|
||||
config = init_config(args)
|
||||
|
||||
# Load and split documents
|
||||
documents = load_and_split_documents(config)
|
||||
|
||||
# Initialize vector store and retriever
|
||||
vectorstore = init_vectorstore(config, documents)
|
||||
retriever = vectorstore.as_retriever(search_kwargs={"k": config["top_k"]})
|
||||
|
||||
# Initialize llm and prompt
|
||||
llm = init_llm(config)
|
||||
prompt = get_qa_prompt()
|
||||
|
||||
# Set up QA chain
|
||||
qa_chain = create_qa_chain(retriever, llm, prompt)
|
||||
|
||||
# Interactive mode
|
||||
if args.interactive:
|
||||
print("\nWelcome to Interactive Q&A System!")
|
||||
print("Enter 'q' or 'quit' to exit.")
|
||||
|
||||
while True:
|
||||
question = input("\nPlease enter your question: ")
|
||||
if question.lower() in ["q", "quit"]:
|
||||
print("\nThank you for using! Goodbye!")
|
||||
break
|
||||
|
||||
output = qa_chain.invoke(question)
|
||||
print(output)
|
||||
else:
|
||||
# Default single question mode
|
||||
question = "How to install vLLM?"
|
||||
output = qa_chain.invoke(question)
|
||||
print("-" * 50)
|
||||
print(output)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,225 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
RAG (Retrieval Augmented Generation) Implementation with LlamaIndex
|
||||
================================================================
|
||||
|
||||
This script demonstrates a RAG system using:
|
||||
- LlamaIndex: For document indexing and retrieval
|
||||
- Milvus: As vector store backend
|
||||
- vLLM: For embedding and text generation
|
||||
|
||||
Features:
|
||||
1. Document Loading & Processing
|
||||
2. Embedding & Storage
|
||||
3. Query Processing
|
||||
|
||||
Requirements:
|
||||
1. Install dependencies:
|
||||
pip install llama-index llama-index-readers-web \
|
||||
llama-index-llms-openai-like \
|
||||
llama-index-embeddings-openai-like \
|
||||
llama-index-vector-stores-milvus \
|
||||
|
||||
2. Start services:
|
||||
# Start embedding service (port 8000)
|
||||
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
|
||||
|
||||
# Start chat service (port 8001)
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||
|
||||
Usage:
|
||||
python retrieval_augmented_generation_with_llamaindex.py
|
||||
|
||||
Notes:
|
||||
- Ensure both vLLM services are running before executing
|
||||
- Default ports: 8000 (embedding), 8001 (chat)
|
||||
- First run may take time to download models
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from argparse import Namespace
|
||||
from typing import Any
|
||||
|
||||
from llama_index.core import Settings, StorageContext, VectorStoreIndex
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
|
||||
from llama_index.llms.openai_like import OpenAILike
|
||||
from llama_index.readers.web import SimpleWebPageReader
|
||||
from llama_index.vector_stores.milvus import MilvusVectorStore
|
||||
|
||||
|
||||
def init_config(args: Namespace):
|
||||
"""Initialize configuration with command line arguments"""
|
||||
return {
|
||||
"url": args.url,
|
||||
"embedding_model": args.embedding_model,
|
||||
"chat_model": args.chat_model,
|
||||
"vllm_api_key": args.vllm_api_key,
|
||||
"embedding_endpoint": args.embedding_endpoint,
|
||||
"chat_endpoint": args.chat_endpoint,
|
||||
"db_path": args.db_path,
|
||||
"chunk_size": args.chunk_size,
|
||||
"chunk_overlap": args.chunk_overlap,
|
||||
"top_k": args.top_k,
|
||||
}
|
||||
|
||||
|
||||
def load_documents(url: str) -> list:
|
||||
"""Load and process web documents"""
|
||||
return SimpleWebPageReader(html_to_text=True).load_data([url])
|
||||
|
||||
|
||||
def setup_models(config: dict[str, Any]):
|
||||
"""Configure embedding and chat models"""
|
||||
Settings.embed_model = OpenAILikeEmbedding(
|
||||
api_base=config["embedding_endpoint"],
|
||||
api_key=config["vllm_api_key"],
|
||||
model_name=config["embedding_model"],
|
||||
)
|
||||
|
||||
Settings.llm = OpenAILike(
|
||||
model=config["chat_model"],
|
||||
api_key=config["vllm_api_key"],
|
||||
api_base=config["chat_endpoint"],
|
||||
context_window=128000,
|
||||
is_chat_model=True,
|
||||
is_function_calling_model=False,
|
||||
)
|
||||
|
||||
Settings.transformations = [
|
||||
SentenceSplitter(
|
||||
chunk_size=config["chunk_size"],
|
||||
chunk_overlap=config["chunk_overlap"],
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
def setup_vector_store(db_path: str) -> MilvusVectorStore:
|
||||
"""Initialize vector store"""
|
||||
sample_emb = Settings.embed_model.get_text_embedding("test")
|
||||
print(f"Embedding dimension: {len(sample_emb)}")
|
||||
return MilvusVectorStore(uri=db_path, dim=len(sample_emb), overwrite=True)
|
||||
|
||||
|
||||
def create_index(documents: list, vector_store: MilvusVectorStore):
|
||||
"""Create document index"""
|
||||
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
||||
return VectorStoreIndex.from_documents(
|
||||
documents,
|
||||
storage_context=storage_context,
|
||||
)
|
||||
|
||||
|
||||
def query_document(index: VectorStoreIndex, question: str, top_k: int):
|
||||
"""Query document with given question"""
|
||||
query_engine = index.as_query_engine(similarity_top_k=top_k)
|
||||
return query_engine.query(question)
|
||||
|
||||
|
||||
def get_parser() -> argparse.ArgumentParser:
|
||||
"""Parse command line arguments"""
|
||||
parser = argparse.ArgumentParser(description="RAG with vLLM and LlamaIndex")
|
||||
|
||||
# Add command line arguments
|
||||
parser.add_argument(
|
||||
"--url",
|
||||
default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"),
|
||||
help="URL of the document to process",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--embedding-model",
|
||||
default="ssmits/Qwen2-7B-Instruct-embed-base",
|
||||
help="Model name for embeddings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--embedding-endpoint",
|
||||
default="http://localhost:8000/v1",
|
||||
help="Base URL for embedding service",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chat-endpoint",
|
||||
default="http://localhost:8001/v1",
|
||||
help="Base URL for chat service",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--db-path", default="./milvus_demo.db", help="Path to Milvus database"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c",
|
||||
"--chunk-size",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Chunk size for document splitting",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--chunk-overlap",
|
||||
type=int,
|
||||
default=200,
|
||||
help="Chunk overlap for document splitting",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def main():
|
||||
# Parse command line arguments
|
||||
args = get_parser().parse_args()
|
||||
|
||||
# Initialize configuration
|
||||
config = init_config(args)
|
||||
|
||||
# Load documents
|
||||
documents = load_documents(config["url"])
|
||||
|
||||
# Setup models
|
||||
setup_models(config)
|
||||
|
||||
# Setup vector store
|
||||
vector_store = setup_vector_store(config["db_path"])
|
||||
|
||||
# Create index
|
||||
index = create_index(documents, vector_store)
|
||||
|
||||
if args.interactive:
|
||||
print("\nEntering interactive mode. Type 'quit' to exit.")
|
||||
while True:
|
||||
# Get user question
|
||||
question = input("\nEnter your question: ")
|
||||
|
||||
# Check for exit command
|
||||
if question.lower() in ["quit", "exit", "q"]:
|
||||
print("Exiting interactive mode...")
|
||||
break
|
||||
|
||||
# Get and print response
|
||||
print("\n" + "-" * 50)
|
||||
print("Response:\n")
|
||||
response = query_document(index, question, config["top_k"])
|
||||
print(response)
|
||||
print("-" * 50)
|
||||
else:
|
||||
# Single query mode
|
||||
question = "How to install vLLM?"
|
||||
response = query_document(index, question, config["top_k"])
|
||||
print("-" * 50)
|
||||
print("Response:\n")
|
||||
print(response)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
131
examples/online_serving/run_cluster.sh
Normal file
131
examples/online_serving/run_cluster.sh
Normal file
@@ -0,0 +1,131 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Launch a Ray cluster inside Docker for vLLM inference.
|
||||
#
|
||||
# This script can start either a head node or a worker node, depending on the
|
||||
# --head or --worker flag provided as the third positional argument.
|
||||
#
|
||||
# Usage:
|
||||
# 1. Designate one machine as the head node and execute:
|
||||
# bash run_cluster.sh \
|
||||
# vllm/vllm-openai \
|
||||
# <head_node_ip> \
|
||||
# --head \
|
||||
# /abs/path/to/huggingface/cache \
|
||||
# -e VLLM_HOST_IP=<head_node_ip>
|
||||
#
|
||||
# 2. On every worker machine, execute:
|
||||
# bash run_cluster.sh \
|
||||
# vllm/vllm-openai \
|
||||
# <head_node_ip> \
|
||||
# --worker \
|
||||
# /abs/path/to/huggingface/cache \
|
||||
# -e VLLM_HOST_IP=<worker_node_ip>
|
||||
#
|
||||
# Each worker requires a unique VLLM_HOST_IP value.
|
||||
# Keep each terminal session open. Closing a session stops the associated Ray
|
||||
# node and thereby shuts down the entire cluster.
|
||||
# Every machine must be reachable at the supplied IP address.
|
||||
#
|
||||
# The container is named "node-<random_suffix>". To open a shell inside
|
||||
# a container after launch, use:
|
||||
# docker exec -it node-<random_suffix> /bin/bash
|
||||
#
|
||||
# Then, you can execute vLLM commands on the Ray cluster as if it were a
|
||||
# single machine, e.g. vllm serve ...
|
||||
#
|
||||
# To stop the container, use:
|
||||
# docker stop node-<random_suffix>
|
||||
|
||||
# Check for minimum number of required arguments.
|
||||
if [ $# -lt 4 ]; then
|
||||
echo "Usage: $0 docker_image head_node_ip --head|--worker path_to_hf_home [additional_args...]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Extract the mandatory positional arguments and remove them from $@.
|
||||
DOCKER_IMAGE="$1"
|
||||
HEAD_NODE_ADDRESS="$2"
|
||||
NODE_TYPE="$3" # Should be --head or --worker.
|
||||
PATH_TO_HF_HOME="$4"
|
||||
shift 4
|
||||
|
||||
# Preserve any extra arguments so they can be forwarded to Docker.
|
||||
ADDITIONAL_ARGS=("$@")
|
||||
|
||||
# Validate the NODE_TYPE argument.
|
||||
if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
|
||||
echo "Error: Node type must be --head or --worker"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Extract VLLM_HOST_IP from ADDITIONAL_ARGS (e.g. "-e VLLM_HOST_IP=...").
|
||||
VLLM_HOST_IP=""
|
||||
for ((i = 0; i < ${#ADDITIONAL_ARGS[@]}; i++)); do
|
||||
arg="${ADDITIONAL_ARGS[$i]}"
|
||||
case "${arg}" in
|
||||
-e)
|
||||
next="${ADDITIONAL_ARGS[$((i + 1))]:-}"
|
||||
if [[ "${next}" == VLLM_HOST_IP=* ]]; then
|
||||
VLLM_HOST_IP="${next#VLLM_HOST_IP=}"
|
||||
break
|
||||
fi
|
||||
;;
|
||||
-eVLLM_HOST_IP=* | VLLM_HOST_IP=*)
|
||||
VLLM_HOST_IP="${arg#*=}"
|
||||
break
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# For the head node, HEAD_NODE_ADDRESS and VLLM_HOST_IP should be consistent.
|
||||
if [[ "${NODE_TYPE}" == "--head" && -n "${VLLM_HOST_IP}" ]]; then
|
||||
if [[ "${VLLM_HOST_IP}" != "${HEAD_NODE_ADDRESS}" ]]; then
|
||||
echo "Warning: VLLM_HOST_IP (${VLLM_HOST_IP}) differs from head_node_ip (${HEAD_NODE_ADDRESS})."
|
||||
echo "Using VLLM_HOST_IP as the head node address."
|
||||
HEAD_NODE_ADDRESS="${VLLM_HOST_IP}"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Generate a unique container name with random suffix.
|
||||
# Docker container names must be unique on each host.
|
||||
# The random suffix allows multiple Ray containers to run simultaneously on the same machine,
|
||||
# for example, on a multi-GPU machine.
|
||||
CONTAINER_NAME="node-${RANDOM}"
|
||||
|
||||
# Define a cleanup routine that removes the container when the script exits.
|
||||
# This prevents orphaned containers from accumulating if the script is interrupted.
|
||||
cleanup() {
|
||||
docker stop "${CONTAINER_NAME}"
|
||||
docker rm "${CONTAINER_NAME}"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# Build the Ray start command based on the node role.
|
||||
# The head node manages the cluster and accepts connections on port 6379,
|
||||
# while workers connect to the head's address.
|
||||
RAY_START_CMD="ray start --block"
|
||||
if [ "${NODE_TYPE}" == "--head" ]; then
|
||||
RAY_START_CMD+=" --head --node-ip-address=${HEAD_NODE_ADDRESS} --port=6379"
|
||||
else
|
||||
|
||||
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
|
||||
if [ -n "${VLLM_HOST_IP}" ]; then
|
||||
RAY_START_CMD+=" --node-ip-address=${VLLM_HOST_IP}"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Launch the container with the assembled parameters.
|
||||
# --network host: Allows Ray nodes to communicate directly via host networking
|
||||
# --shm-size 10.24g: Increases shared memory
|
||||
# --gpus all: Gives container access to all GPUs on the host
|
||||
# -v HF_HOME: Mounts HuggingFace cache to avoid re-downloading models
|
||||
docker run \
|
||||
--entrypoint /bin/bash \
|
||||
--network host \
|
||||
--name "${CONTAINER_NAME}" \
|
||||
--shm-size 10.24g \
|
||||
--gpus all \
|
||||
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
|
||||
"${ADDITIONAL_ARGS[@]}" \
|
||||
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
|
||||
24
examples/online_serving/sagemaker-entrypoint.sh
Normal file
24
examples/online_serving/sagemaker-entrypoint.sh
Normal file
@@ -0,0 +1,24 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Define the prefix for environment variables to look for
|
||||
PREFIX="SM_VLLM_"
|
||||
ARG_PREFIX="--"
|
||||
|
||||
# Initialize an array for storing the arguments
|
||||
# port 8080 required by sagemaker, https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response
|
||||
ARGS=(--port 8080)
|
||||
|
||||
# Loop through all environment variables
|
||||
while IFS='=' read -r key value; do
|
||||
# Remove the prefix from the key, convert to lowercase, and replace underscores with dashes
|
||||
arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
|
||||
|
||||
# Add the argument name and value to the ARGS array
|
||||
ARGS+=("${ARG_PREFIX}${arg_name}")
|
||||
if [ -n "$value" ]; then
|
||||
ARGS+=("$value")
|
||||
fi
|
||||
done < <(env | grep "^${PREFIX}")
|
||||
|
||||
# Pass the collected arguments to the main entrypoint
|
||||
exec vllm serve "${ARGS[@]}"
|
||||
311
examples/online_serving/streamlit_openai_chatbot_webserver.py
Normal file
311
examples/online_serving/streamlit_openai_chatbot_webserver.py
Normal file
@@ -0,0 +1,311 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
vLLM Chat Assistant - A Streamlit Web Interface
|
||||
|
||||
A streamlined chat interface that quickly integrates
|
||||
with vLLM API server.
|
||||
|
||||
Features:
|
||||
- Multiple chat sessions management
|
||||
- Streaming response display
|
||||
- Configurable API endpoint
|
||||
- Real-time chat history
|
||||
- Reasoning Display: Optional thinking process visualization
|
||||
|
||||
Requirements:
|
||||
pip install streamlit openai
|
||||
|
||||
Usage:
|
||||
# Start the app with default settings
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# Start with custom vLLM API endpoint
|
||||
VLLM_API_BASE="http://your-server:8000/v1" \
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# Enable debug mode
|
||||
streamlit run streamlit_openai_chatbot_webserver.py \
|
||||
--logger.level=debug
|
||||
"""
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
import streamlit as st
|
||||
from openai import OpenAI
|
||||
|
||||
# Get command line arguments from environment variables
|
||||
openai_api_key = os.getenv("VLLM_API_KEY", "EMPTY")
|
||||
openai_api_base = os.getenv("VLLM_API_BASE", "http://localhost:8000/v1")
|
||||
|
||||
# Initialize session states for managing chat sessions
|
||||
if "sessions" not in st.session_state:
|
||||
st.session_state.sessions = {}
|
||||
|
||||
if "current_session" not in st.session_state:
|
||||
st.session_state.current_session = None
|
||||
|
||||
if "messages" not in st.session_state:
|
||||
st.session_state.messages = []
|
||||
|
||||
if "active_session" not in st.session_state:
|
||||
st.session_state.active_session = None
|
||||
|
||||
# Add new session state for reasoning
|
||||
if "show_reasoning" not in st.session_state:
|
||||
st.session_state.show_reasoning = {}
|
||||
|
||||
# Initialize session state for API base URL
|
||||
if "api_base_url" not in st.session_state:
|
||||
st.session_state.api_base_url = openai_api_base
|
||||
|
||||
|
||||
def create_new_chat_session():
|
||||
"""Create a new chat session with timestamp as unique identifier.
|
||||
|
||||
This function initializes a new chat session by:
|
||||
1. Generating a timestamp-based session ID
|
||||
2. Creating an empty message list for the new session
|
||||
3. Setting the new session as both current and active session
|
||||
4. Resetting the messages list for the new session
|
||||
|
||||
Returns:
|
||||
None
|
||||
|
||||
Session State Updates:
|
||||
- sessions: Adds new empty message list with timestamp key
|
||||
- current_session: Sets to new session ID
|
||||
- active_session: Sets to new session ID
|
||||
- messages: Resets to empty list
|
||||
"""
|
||||
session_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
st.session_state.sessions[session_id] = []
|
||||
st.session_state.current_session = session_id
|
||||
st.session_state.active_session = session_id
|
||||
st.session_state.messages = []
|
||||
|
||||
|
||||
def switch_to_chat_session(session_id):
|
||||
"""Switch the active chat context to a different session.
|
||||
|
||||
Args:
|
||||
session_id (str): The timestamp ID of the session to switch to
|
||||
|
||||
This function handles chat session switching by:
|
||||
1. Setting the specified session as current
|
||||
2. Updating the active session marker
|
||||
3. Loading the messages history from the specified session
|
||||
|
||||
Session State Updates:
|
||||
- current_session: Updated to specified session_id
|
||||
- active_session: Updated to specified session_id
|
||||
- messages: Loaded from sessions[session_id]
|
||||
"""
|
||||
st.session_state.current_session = session_id
|
||||
st.session_state.active_session = session_id
|
||||
st.session_state.messages = st.session_state.sessions[session_id]
|
||||
|
||||
|
||||
def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None):
    """Generate and stream LLM response with optional reasoning process.

    Args:
        messages (list): List of conversation message dicts with 'role' and 'content'
        model (str): The model identifier to use for generation
        reason (bool): Whether to enable and display reasoning process
        content_ph (streamlit.empty): Placeholder for streaming response content
        reasoning_ph (streamlit.empty): Placeholder for streaming reasoning process

    Returns:
        tuple: (str, str)
            - First string contains the complete response text
            - Second string contains the complete reasoning text (if enabled)

    Raises:
        Nothing: API failures are caught, surfaced via ``st.error``, and
        returned as an ``"Error: ..."`` response string instead.

    Note:
        The function uses streamlit placeholders for live updates.
        When reason=True, the reasoning process appears above the response.
    """
    full_text = ""
    think_text = ""
    live_think = None
    # Build request parameters
    params = {"model": model, "messages": messages, "stream": True}
    if reason:
        # NOTE(review): enable_thinking is forwarded to the server's chat
        # template — presumably only honored by reasoning-capable models.
        params["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}

    try:
        response = client.chat.completions.create(**params)
        # Defensive: if the client ever hands back a plain string instead of a
        # stream, show it verbatim and return it with no reasoning text.
        if isinstance(response, str):
            if content_ph:
                content_ph.markdown(response)
            return response, ""

        # Prepare reasoning expander above content
        if reason and reasoning_ph:
            exp = reasoning_ph.expander("💭 Thinking Process (live)", expanded=True)
            live_think = exp.empty()

        # Stream chunks
        for chunk in response:
            delta = chunk.choices[0].delta
            # Stream reasoning first, so it always renders above the answer.
            if reason and hasattr(delta, "reasoning") and live_think:
                rc = delta.reasoning
                if rc:
                    think_text += rc
                    # "▌" acts as a blinking-cursor affordance while streaming.
                    live_think.markdown(think_text + "▌")
            # Then stream content
            if hasattr(delta, "content") and delta.content and content_ph:
                full_text += delta.content
                content_ph.markdown(full_text + "▌")

        # Finalize displays (drop the cursor): reasoning above, content below.
        if reason and live_think:
            live_think.markdown(think_text)
        if content_ph:
            content_ph.markdown(full_text)

        return full_text, think_text
    except Exception as e:
        st.error(f"Error details: {str(e)}")
        return f"Error: {str(e)}", ""
|
||||
|
||||
|
||||
# Sidebar - API Settings first
st.sidebar.title("API Settings")
new_api_base = st.sidebar.text_input(
    "API Base URL:", value=st.session_state.api_base_url
)
# Rerun immediately so the OpenAI client further down is rebuilt
# against the newly entered base URL.
if new_api_base != st.session_state.api_base_url:
    st.session_state.api_base_url = new_api_base
    st.rerun()

st.sidebar.divider()
|
||||
|
||||
# Sidebar - Session Management
st.sidebar.title("Chat Sessions")
if st.sidebar.button("New Session"):
    create_new_chat_session()


# Display all sessions in reverse chronological order
# (timestamp IDs sort lexicographically, so reverse=True puts newest first).
for session_id in sorted(st.session_state.sessions.keys(), reverse=True):
    # Mark the active session with a pinned button
    if session_id == st.session_state.active_session:
        st.sidebar.button(
            f"📍 {session_id}",
            key=session_id,
            type="primary",
            on_click=switch_to_chat_session,
            args=(session_id,),
        )
    else:
        st.sidebar.button(
            f"Session {session_id}",
            key=session_id,
            on_click=switch_to_chat_session,
            args=(session_id,),
        )
|
||||
|
||||
# Main interface
st.title("vLLM Chat Assistant")

# Initialize OpenAI client with API settings
# NOTE(review): `openai_api_key` is presumably defined earlier in this file.
client = OpenAI(api_key=openai_api_key, base_url=st.session_state.api_base_url)

# Get and display current model id — the demo assumes the server hosts
# exactly one model, so the first listed entry is used.
models = client.models.list()
model = models.data[0].id
st.markdown(f"**Model**: {model}")

# Initialize first session if none exists
if st.session_state.current_session is None:
    create_new_chat_session()
    st.session_state.active_session = st.session_state.current_session
|
||||
|
||||
# Update the chat history display section
for idx, msg in enumerate(st.session_state.messages):
    # Render user messages normally
    if msg["role"] == "user":
        with st.chat_message("user"):
            st.write(msg["content"])
    # Render assistant messages with reasoning above
    else:
        # If reasoning exists for this assistant message (keyed by its index
        # in the messages list), show it collapsed above the content.
        if idx in st.session_state.show_reasoning:
            with st.expander("💭 Thinking Process", expanded=False):
                st.markdown(st.session_state.show_reasoning[idx])
        with st.chat_message("assistant"):
            st.write(msg["content"])
|
||||
|
||||
|
||||
# Setup & Cache reasoning support check
@st.cache_data(show_spinner=False)
def server_supports_reasoning():
    """Probe the server once to learn whether the model emits reasoning.

    Sends a minimal non-streaming chat request and reports whether the
    reply carries a non-empty ``reasoning`` attribute. Cached via
    ``st.cache_data`` so the probe runs only once per session.

    Returns:
        bool: True if the model supports reasoning, False otherwise
    """
    probe = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Hi"}],
        stream=False,
    )
    reply = probe.choices[0].message
    # Missing attribute or empty string both mean "no reasoning support".
    return bool(getattr(reply, "reasoning", None))
|
||||
|
||||
|
||||
# Check support
supports_reasoning = server_supports_reasoning()

# Add reasoning toggle in sidebar if supported
reason = False  # Default to False
if supports_reasoning:
    reason = st.sidebar.checkbox("Enable Reasoning", value=False)
else:
    # Grayed-out note instead of a disabled checkbox.
    st.sidebar.markdown(
        "<span style='color:gray;'>Reasoning unavailable for this model.</span>",
        unsafe_allow_html=True,
    )
    # reason remains False
|
||||
|
||||
# Update the input handling section
if prompt := st.chat_input("Type your message here..."):
    # Save and display user message
    st.session_state.messages.append({"role": "user", "content": prompt})
    # Keep the per-session store pointing at the live message list.
    st.session_state.sessions[st.session_state.current_session] = (
        st.session_state.messages
    )
    with st.chat_message("user"):
        st.write(prompt)

    # Prepare LLM messages — strip any extra keys, keep only role/content.
    msgs = [
        {"role": m["role"], "content": m["content"]} for m in st.session_state.messages
    ]

    # Stream assistant response
    with st.chat_message("assistant"):
        # Placeholders: reasoning above, content below
        reason_ph = st.empty()
        content_ph = st.empty()
        full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph)
        # Index the reply will occupy — computed BEFORE the append below so
        # it matches the lookup key used by the history renderer.
        message_index = len(st.session_state.messages)
        # Save assistant reply
        st.session_state.messages.append({"role": "assistant", "content": full})
        # Persist reasoning in session state if any
        if reason and think:
            st.session_state.show_reasoning[message_index] = think
|
||||
58
examples/online_serving/structured_outputs/README.md
Normal file
58
examples/online_serving/structured_outputs/README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# Structured Outputs
|
||||
|
||||
This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server.
|
||||
It can run individual constraint types or all of them.
|
||||
It supports both streaming responses and concurrent non-streaming requests.
|
||||
|
||||
To use this example, you must start a vLLM server with any model of your choice.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-3B-Instruct
|
||||
```
|
||||
|
||||
To serve a reasoning model, you can use the following command:
|
||||
|
||||
```bash
|
||||
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
|
||||
--reasoning-parser deepseek_r1
|
||||
```
|
||||
|
||||
If you want to run this script standalone with `uv`, you can use the following:
|
||||
|
||||
```bash
|
||||
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
|
||||
structured-outputs
|
||||
```
|
||||
|
||||
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
|
||||
|
||||
!!! tip
|
||||
If vLLM is running remotely, then set `OPENAI_BASE_URL=<remote_url>` before running the script.
|
||||
|
||||
## Usage
|
||||
|
||||
Run all constraints, non-streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py
|
||||
```
|
||||
|
||||
Run all constraints, streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py --stream
|
||||
```
|
||||
|
||||
Run certain constraints, for example `structural_tag` and `regex`, streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py \
|
||||
--constraint structural_tag regex \
|
||||
--stream
|
||||
```
|
||||
|
||||
Run all constraints, with reasoning models and streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py --reasoning --stream
|
||||
```
|
||||
@@ -0,0 +1,8 @@
|
||||
[project]
|
||||
name = "examples-online-structured-outputs"
|
||||
requires-python = ">=3.10, <3.14"
|
||||
dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
|
||||
version = "0.0.0"
|
||||
|
||||
[project.scripts]
|
||||
structured-outputs = "structured_outputs:main"
|
||||
268
examples/online_serving/structured_outputs/structured_outputs.py
Normal file
268
examples/online_serving/structured_outputs/structured_outputs.py
Normal file
@@ -0,0 +1,268 @@
|
||||
# ruff: noqa: E501
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import asyncio
|
||||
import enum
|
||||
import os
|
||||
from typing import Any, Literal
|
||||
|
||||
import openai
|
||||
import pydantic
|
||||
from openai.types.chat import ChatCompletionChunk
|
||||
|
||||
# Names of the structured-output constraint modes this script can exercise;
# each key below in PARAMS must be one of these.
ConstraintsFormat = Literal[
    "choice",
    "regex",
    "json",
    "grammar",
    "structural_tag",
]
|
||||
|
||||
|
||||
async def print_stream_response(
    stream_response: openai.AsyncStream[ChatCompletionChunk],
    title: str,
    args: argparse.Namespace,
):
    """Drain a streaming chat completion and echo it to stdout.

    When ``args.reasoning`` is set and the server emits reasoning deltas,
    a "Reasoning:" section is printed live before the "Content:" section;
    each section header is printed exactly once, on first output.
    """
    print(f"\n\n{title} (Streaming):")

    reasoning_started = False
    content_started = False

    async for chunk in stream_response:
        delta = chunk.choices[0].delta

        reasoning_piece: str | None = getattr(delta, "reasoning", None)
        content_piece = delta.content

        # Reasoning is only surfaced when requested; otherwise it is dropped.
        if args.reasoning and reasoning_piece:
            if not reasoning_started:
                print(" Reasoning: ", end="")
                reasoning_started = True
            print(reasoning_piece, end="", flush=True)

        if content_piece:
            if not content_started:
                # Close the reasoning line before opening the content section.
                if reasoning_started:
                    print()
                print(" Content: ", end="")
                content_started = True
            print(content_piece, end="", flush=True)
    print()
|
||||
|
||||
|
||||
class CarType(str, enum.Enum):
    """Closed set of car body styles used by the JSON-schema demo."""

    SEDAN = "SEDAN"
    SUV = "SUV"
    TRUCK = "TRUCK"
    COUPE = "COUPE"
|
||||
|
||||
|
||||
class CarDescription(pydantic.BaseModel):
    """Pydantic model whose JSON schema drives the "json" constraint demo."""

    brand: str
    model: str
    car_type: CarType
|
||||
|
||||
|
||||
# One request template per constraint mode. Each value is a kwargs dict merged
# into client.chat.completions.create(...) by cli(); constraints are expressed
# either via "extra_body" (vLLM-specific structured_outputs) or the standard
# "response_format" field.
PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
    "choice": {
        "messages": [
            {
                "role": "user",
                "content": "Classify this sentiment: vLLM is wonderful!",
            }
        ],
        "extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
    },
    "regex": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
            }
        ],
        "extra_body": {
            "structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
        },
    },
    "json": {
        "messages": [
            {
                "role": "user",
                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "car-description",
                # Schema derived from the pydantic model above.
                "schema": CarDescription.model_json_schema(),
            },
        },
    },
    "grammar": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
            }
        ],
        "extra_body": {
            "structured_outputs": {
                # EBNF-style grammar constraining output to a fixed SQL shape.
                "grammar": """
root ::= select_statement

select_statement ::= "SELECT " column " from " table " where " condition

column ::= "col_1 " | "col_2 "

table ::= "table_1 " | "table_2 "

condition ::= column "= " number

number ::= "1 " | "2 "
""",
            }
        },
    },
    "structural_tag": {
        "messages": [
            {
                "role": "user",
                "content": """
You have access to the following function to retrieve the weather in a city:

{
    "name": "get_weather",
    "parameters": {
        "city": {
            "param_type": "string",
            "description": "The city to get the weather for",
            "required": True
        }
    }
}

If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
 argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>

Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query

You are a helpful assistant.

Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?""",
            },
        ],
        "response_format": {
            "type": "structural_tag",
            # Constrain only the spans that start with the trigger; the schema
            # applies to the JSON between begin and end tags.
            "structures": [
                {
                    "begin": "<function=get_weather>",
                    "schema": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                    "end": "</function>",
                }
            ],
            "triggers": ["<function="],
        },
    },
}
|
||||
|
||||
|
||||
async def cli():
    """Parse command-line options and run the selected constraint demos.

    Builds one chat-completion request per selected constraint, fires them
    all concurrently with ``asyncio.gather``, then prints each result —
    live-streamed when ``--stream`` is given, otherwise as a complete
    response (with reasoning, if ``--reasoning`` and available).
    """
    parser = argparse.ArgumentParser(
        description="Run OpenAI Chat Completion with various structured outputs capabilities",
    )
    _ = parser.add_argument(
        "--constraint",
        type=str,
        nargs="+",
        choices=[*list(PARAMS), "*"],
        default=["*"],
        help="Specify which constraint(s) to run.",
    )
    _ = parser.add_argument(
        "--stream",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable streaming output",
    )
    _ = parser.add_argument(
        "--reasoning",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable printing of reasoning traces if available.",
    )
    args = parser.parse_args()

    # "EMPTY" api_key: a local vLLM server does not authenticate by default.
    base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
    client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")
    # "*" selects every constraint; otherwise de-duplicate the user's list.
    constraints = list(PARAMS) if "*" in args.constraint else list(set(args.constraint))
    # Demo assumes a single-model server: use the first listed model.
    model = (await client.models.list()).data[0].id

    if args.stream:
        # Launch all streaming requests concurrently, then drain in order.
        results = await asyncio.gather(
            *[
                client.chat.completions.create(
                    model=model,
                    max_tokens=1024,
                    stream=True,
                    **PARAMS[name],
                )
                for name in constraints
            ]
        )
        for constraint, stream in zip(constraints, results):
            await print_stream_response(stream, constraint, args)
    else:
        results = await asyncio.gather(
            *[
                client.chat.completions.create(
                    model=model,
                    max_tokens=1024,
                    stream=False,
                    **PARAMS[name],
                )
                for name in constraints
            ]
        )
        for constraint, response in zip(constraints, results):
            print(f"\n\n{constraint}:")
            message = response.choices[0].message
            if args.reasoning and hasattr(message, "reasoning"):
                print(f" Reasoning: {message.reasoning or ''}")
            print(f" Content: {message.content!r}")
|
||||
|
||||
|
||||
def main():
    """Synchronous entry point (also exposed as the `structured-outputs` script)."""
    asyncio.run(cli())


if __name__ == "__main__":
    main()
|
||||
49
examples/online_serving/token_generation_client.py
Normal file
49
examples/online_serving/token_generation_client.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import httpx
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# Endpoint of the vLLM token-generation API on a locally running server.
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
# Placeholder credential; the demo server does not validate API keys.
DUMMY_API_KEY = "empty"
MODEL_NAME = "Qwen/Qwen3-0.6B"

transport = httpx.HTTPTransport()
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
# Generous 600 s timeout: generation may be slow on first request.
client = httpx.Client(
    transport=transport,
    base_url=GEN_ENDPOINT,
    timeout=600,
    headers=headers,
)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "How many countries are in the EU?"},
]
|
||||
|
||||
|
||||
def main(client):
    """Tokenize the chat prompt locally, request generation by token IDs,
    and decode the returned token IDs back to text.

    Args:
        client: An ``httpx.Client`` pre-configured for the generate endpoint.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Apply the model's chat template client-side to obtain raw token IDs.
    prompt_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    request_body = {
        "model": MODEL_NAME,
        "token_ids": prompt_ids,
        "sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False},
        "stream": False,
    }
    response = client.post(GEN_ENDPOINT, json=request_body)
    response.raise_for_status()
    payload = response.json()
    print(payload)
    separator = "-" * 50
    print(separator)
    print("Token generation results:")
    # detokenize=False means the server returns token IDs; decode them here.
    decoded = tokenizer.decode(payload["choices"][0]["token_ids"])
    print(decoded)
    print(separator)
|
||||
|
||||
|
||||
# Run the demo only when executed as a script.
if __name__ == "__main__":
    main(client)
|
||||
26
examples/online_serving/utils.py
Normal file
26
examples/online_serving/utils.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from openai import APIConnectionError, OpenAI
|
||||
from openai.pagination import SyncPage
|
||||
from openai.types.model import Model
|
||||
|
||||
|
||||
def get_first_model(client: OpenAI) -> str:
    """
    Get the first model from the vLLM server.

    Raises:
        RuntimeError: if the server cannot be reached or lists no models.
    """
    try:
        model_page: SyncPage[Model] = client.models.list()
    except APIConnectionError as e:
        # Wrap connection failures with actionable troubleshooting hints.
        raise RuntimeError(
            "Failed to get the list of models from the vLLM server at "
            f"{client.base_url} with API key {client.api_key}. Check\n"
            "1. the server is running\n"
            "2. the server URL is correct\n"
            "3. the API key is correct"
        ) from e

    if not model_page.data:
        raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")

    return model_page.data[0].id
|
||||
Reference in New Issue
Block a user