[docker] Distributed Serving with k8s Statefulset ( good example for DeepSeek-R1) (#3631)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io> Co-authored-by: Kebe <kebe.liu@daocloud.io>
This commit is contained in:
104
docker/k8s-sglang-distributed-sts.yaml
Normal file
104
docker/k8s-sglang-distributed-sts.yaml
Normal file
@@ -0,0 +1,104 @@
# Two Nodes Sglang example
# Runs sglang across 2 pods (tensor-parallel-size 16 over 8 GPUs per node);
# pod-index 0 acts as the distributed master (see the sglang-master-pod Service).
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: distributed-sglang
spec:
  replicas: 2 # number of nodes/pods to run distributed sglang
  selector:
    matchLabels:
      app: distributed-sglang
  serviceName: "" # no headless service; pods rendezvous via the sglang-master-pod Service and hostNetwork
  template:
    metadata:
      labels:
        app: distributed-sglang
    spec:
      containers:
      - name: sglang-container
        image: docker.io/lmsysorg/sglang:latest
        imagePullPolicy: Always # image may be replaced by official CI versioned image
        command:
        - /bin/bash
        - -c
        # please modify the sglang serving arguments below, as necessary.
        # NOTE: the --expert-parallel-size and --enable-ep-moe are for MoE model like DeepSeek-R1
        args:
        - |
          python3 -m sglang.launch_server \
          --model /llm-folder \
          --dist-init-addr sglang-master-pod:5000 \
          --tensor-parallel-size 16 \
          --nnodes 2 \
          --node-rank $POD_INDEX \
          --trust-remote-code \
          --host 0.0.0.0 \
          --port 8000 \
          --enable-metrics \
          --enable-ep-moe \
          --expert-parallel-size 16
        env:
        - name: POD_INDEX # reflects the node-rank
          valueFrom:
            fieldRef:
              apiVersion: v1
              # requires the PodIndexLabel feature (Kubernetes >= 1.28) -- TODO confirm cluster version
              fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
        - name: NCCL_DEBUG
          value: INFO
        resources:
          limits:
            nvidia.com/gpu: "8"
          # extended resources (nvidia.com/gpu) must have requests equal to limits;
          # the original left `requests:` empty, which is a null value, not a request
          requests:
            nvidia.com/gpu: "8"
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - mountPath: /llm-folder
          name: llm
        securityContext:
          privileged: true # to leverage RDMA/InfiniBand device, co-work with HostNetwork=true
      hostNetwork: true
      volumes:
      # in-memory shared memory for NCCL/torch (mounted at /dev/shm)
      - emptyDir:
          medium: Memory
          sizeLimit: 10Gi
        name: dshm
      - hostPath:
          path: /llm-folder # replace with PVC or hostPath with your model weights
          type: DirectoryOrCreate
        name: llm
      #- persistentVolumeClaim:
      #    claimName: llm-pvc
      #  name: llm
---
# internal rendezvous service: resolves "sglang-master-pod" to the pod with
# index 0, backing the `--dist-init-addr sglang-master-pod:5000` argument above
apiVersion: v1
kind: Service
metadata:
  name: sglang-master-pod
spec:
  type: ClusterIP
  selector:
    app: distributed-sglang
    # selects only the rank-0 pod; the pod-index label requires Kubernetes >= 1.28 -- TODO confirm
    apps.kubernetes.io/pod-index: "0"
  ports:
  - name: dist-port
    port: 5000
    targetPort: 5000
---
# the serving service: exposes the HTTP API (port 8000, matching --port 8000)
# and the metrics endpoint on the rank-0 pod via NodePort
apiVersion: v1
kind: Service
metadata:
  name: sglang-serving-on-master
spec:
  type: NodePort
  selector:
    app: distributed-sglang
    apps.kubernetes.io/pod-index: "0"
  ports:
  - name: serving
    port: 8000
    targetPort: 8000
  # NOTE(review): 8080 assumed to be the sglang metrics port (--enable-metrics) -- confirm
  - name: metrics
    port: 8080
    targetPort: 8080
@@ -98,7 +98,21 @@ drun v0.4.3.post4-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --in
|
||||
2. Execute the command `docker compose up -d` in your terminal.
|
||||
</details>
|
||||
|
||||
## Method 5: Run on Kubernetes or Clouds with SkyPilot
|
||||
## Method 5: Using Kubernetes
|
||||
|
||||
<details>
|
||||
<summary>More</summary>
|
||||
|
||||
1. Option 1: For single node serving (typically when the model size fits into GPUs on one node)
|
||||
Execute the command `kubectl apply -f docker/k8s-sglang-service.yaml` to create a k8s Deployment and Service, using llama-31-8b as an example.
|
||||
|
||||
2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`)
|
||||
Modify the LLM model path and arguments as necessary, then execute the command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml` to create a two-node k8s StatefulSet and its serving Service.
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
## Method 6: Run on Kubernetes or Clouds with SkyPilot
|
||||
|
||||
<details>
|
||||
<summary>More</summary>
|
||||
|
||||
Reference in New Issue
Block a user