[docker] Distributed Serving with k8s Statefulset ( good example for DeepSeek-R1) (#3631)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io> Co-authored-by: Kebe <kebe.liu@daocloud.io>
This commit is contained in:
104
docker/k8s-sglang-distributed-sts.yaml
Normal file
104
docker/k8s-sglang-distributed-sts.yaml
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
---
# Two-node SGLang example: one StatefulSet pod per node, rank derived
# from the pod index, rendezvous via the `sglang-master-pod` Service.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: distributed-sglang
spec:
  replicas: 2  # number of nodes/pods to run distributed sglang
  selector:
    matchLabels:
      app: distributed-sglang
  # No governing headless service is used; rank 0 is reached through the
  # `sglang-master-pod` ClusterIP Service defined below.
  serviceName: ""
  template:
    metadata:
      labels:
        app: distributed-sglang
    spec:
      containers:
        - name: sglang-container
          image: docker.io/lmsysorg/sglang:latest
          imagePullPolicy: Always  # image may be replaced by official CI versioned image
          command:
            - /bin/bash
            - -c
          # please modify the sglang serving arguments below, as necessary.
          # NOTE: --expert-parallel-size and --enable-ep-moe are for MoE models like DeepSeek-R1
          args:
            - |
              python3 -m sglang.launch_server \
              --model /llm-folder \
              --dist-init-addr sglang-master-pod:5000 \
              --tensor-parallel-size 16 \
              --nnodes 2 \
              --node-rank $POD_INDEX \
              --trust-remote-code \
              --host 0.0.0.0 \
              --port 8000 \
              --enable-metrics \
              --enable-ep-moe \
              --expert-parallel-size 16
          env:
            # POD_INDEX reflects the node-rank; the pod-index label is set by
            # Kubernetes >= 1.28 on StatefulSet pods.
            - name: POD_INDEX
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
            - name: NCCL_DEBUG
              value: INFO
          resources:
            limits:
              nvidia.com/gpu: "8"
            # for extended resources such as GPUs, requests must equal limits
            requests:
              nvidia.com/gpu: "8"
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: /llm-folder
              name: llm
          securityContext:
            privileged: true  # to leverage RDMA/InfiniBand devices, co-works with hostNetwork: true
      hostNetwork: true
      volumes:
        - emptyDir:
            medium: Memory
            sizeLimit: 10Gi
          name: dshm
        - hostPath:
            path: /llm-folder  # replace with PVC or hostPath with your model weights
            type: DirectoryOrCreate
          name: llm
        # - persistentVolumeClaim:
        #     claimName: llm-pvc
        #   name: llm
---
# internal rendezvous service: routes the distributed-init port to the
# rank-0 (pod-index "0") pod of the StatefulSet
apiVersion: v1
kind: Service
metadata:
  name: sglang-master-pod
spec:
  type: ClusterIP
  selector:
    app: distributed-sglang
    # the pod-index label requires Kubernetes >= 1.28
    apps.kubernetes.io/pod-index: "0"
  ports:
    - name: dist-port
      port: 5000
      targetPort: 5000
---
# the serving service: exposes the HTTP API (8000) and metrics (8080)
# of the rank-0 pod outside the cluster via NodePort
apiVersion: v1
kind: Service
metadata:
  name: sglang-serving-on-master
spec:
  type: NodePort
  selector:
    app: distributed-sglang
    apps.kubernetes.io/pod-index: "0"
  ports:
    - name: serving
      port: 8000
      targetPort: 8000
    - name: metrics
      port: 8080
      targetPort: 8080
@@ -98,7 +98,21 @@ drun v0.4.3.post4-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --in
|
|||||||
2. Execute the command `docker compose up -d` in your terminal.
|
2. Execute the command `docker compose up -d` in your terminal.
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
## Method 5: Run on Kubernetes or Clouds with SkyPilot
|
## Method 5: Using Kubernetes
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>More</summary>
|
||||||
|
|
||||||
|
1. Option 1: For single-node serving (typically when the model fits into the GPUs of one node)
|
||||||
|
Execute the command `kubectl apply -f docker/k8s-sglang-service.yaml` to create a k8s Deployment and Service, with llama-31-8b as an example.
|
||||||
|
|
||||||
|
2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`)
|
||||||
|
Modify the LLM model path and arguments as necessary, then execute the command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml` to create a two-node k8s StatefulSet and its serving service.
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Method 6: Run on Kubernetes or Clouds with SkyPilot
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>More</summary>
|
<summary>More</summary>
|
||||||
|
|||||||
Reference in New Issue
Block a user