From 0e90ae628a07499936295d19793cee102ddfea8e Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Sun, 9 Mar 2025 15:41:20 +0800 Subject: [PATCH] [docker] Distributed Serving with k8s Statefulset ( good example for DeepSeek-R1) (#3631) Signed-off-by: Peter Pan Co-authored-by: Kebe --- docker/k8s-sglang-distributed-sts.yaml | 104 +++++++++++++++++++++++++ docs/start/install.md | 16 +++- 2 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 docker/k8s-sglang-distributed-sts.yaml diff --git a/docker/k8s-sglang-distributed-sts.yaml b/docker/k8s-sglang-distributed-sts.yaml new file mode 100644 index 000000000..6b81d9b14 --- /dev/null +++ b/docker/k8s-sglang-distributed-sts.yaml @@ -0,0 +1,104 @@ +# Two Nodes Sglang example + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: distributed-sglang +spec: + replicas: 2 # number of nodes/pods to run distributed sglang + selector: + matchLabels: + app: distributed-sglang + serviceName: "" + template: + metadata: + labels: + app: distributed-sglang + spec: + containers: + - name: sglang-container + image: docker.io/lmsysorg/sglang:latest + imagePullPolicy: Always # image may be replaced by official CI versioned image + command: + - /bin/bash + - -c + # please modify the sglang serving arguments below, as necessary. 
+ # NOTE: the --expert-parallel-size and --enable-ep-moe are for MoE model like DeepSeek-R1 + args: + - | + python3 -m sglang.launch_server \ + --model /llm-folder \ + --dist-init-addr sglang-master-pod:5000 \ + --tensor-parallel-size 16 \ + --nnodes 2 \ + --node-rank $POD_INDEX \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port 8000 \ + --enable-metrics \ + --enable-ep-moe \ + --expert-parallel-size 16 + env: + - name: POD_INDEX # reflects the node-rank + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.labels['apps.kubernetes.io/pod-index'] + - name: NCCL_DEBUG + value: INFO + resources: + limits: + nvidia.com/gpu: "8" + requests: + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /llm-folder + name: llm + securityContext: + privileged: true # to leverage RDMA/InfiniBand device, co-work with HostNetwork=true + hostNetwork: true + volumes: + - emptyDir: + medium: Memory + sizeLimit: 10Gi + name: dshm + - hostPath: + path: /llm-folder # replace with PVC or hostPath with your model weights + type: DirectoryOrCreate + name: llm + #- persistentVolumeClaim: + # claimName: llm-pvc + # name: llm +--- +apiVersion: v1 +kind: Service +metadata: + name: sglang-master-pod +spec: + type: ClusterIP + selector: + app: distributed-sglang + apps.kubernetes.io/pod-index: "0" + ports: + - name: dist-port + port: 5000 + targetPort: 5000 +--- +# the serving service +apiVersion: v1 +kind: Service +metadata: + name: sglang-serving-on-master +spec: + type: NodePort + selector: + app: distributed-sglang + apps.kubernetes.io/pod-index: "0" + ports: + - name: serving + port: 8000 + targetPort: 8000 + - name: metrics + port: 8080 + targetPort: 8080 diff --git a/docs/start/install.md b/docs/start/install.md index fe460e044..f7234c0a6 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -98,7 +98,21 @@ drun v0.4.3.post4-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --in 2. Execute the command `docker compose up -d` in your terminal. 
-## Method 5: Run on Kubernetes or Clouds with SkyPilot +## Method 5: Using Kubernetes + +<details> +<summary>More</summary> + +1. Option 1: For single-node serving (typically when the model size fits into the GPUs on one node) + Execute command `kubectl apply -f docker/k8s-sglang-service.yaml`, to create a k8s deployment and service, with llama-31-8b as an example. + +2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`) + Modify the LLM model path and arguments as necessary, then execute command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml`, to create a two-node k8s StatefulSet and the serving service. +</details>
+ + + +## Method 6: Run on Kubernetes or Clouds with SkyPilot
<details> <summary>More</summary>