diff --git a/docker/k8s-sglang-distributed-sts.yaml b/docker/k8s-sglang-distributed-sts.yaml
new file mode 100644
index 000000000..6b81d9b14
--- /dev/null
+++ b/docker/k8s-sglang-distributed-sts.yaml
@@ -0,0 +1,104 @@
+# Two-node SGLang serving example
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+ name: distributed-sglang
+spec:
+ replicas: 2 # number of nodes/pods to run distributed sglang
+ selector:
+ matchLabels:
+ app: distributed-sglang
+ serviceName: ""
+ template:
+ metadata:
+ labels:
+ app: distributed-sglang
+ spec:
+ containers:
+ - name: sglang-container
+ image: docker.io/lmsysorg/sglang:latest
+        imagePullPolicy: Always # may be replaced with an official versioned image instead of latest
+ command:
+ - /bin/bash
+ - -c
+        # Modify the sglang serving arguments below as necessary.
+        # NOTE: --expert-parallel-size and --enable-ep-moe apply only to MoE models such as DeepSeek-R1
+ args:
+ - |
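+          # NOTE: 2 nodes x 8 GPUs each gives --tensor-parallel-size 16; adjust these values to your cluster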
+ python3 -m sglang.launch_server \
+ --model /llm-folder \
+ --dist-init-addr sglang-master-pod:5000 \
+ --tensor-parallel-size 16 \
+ --nnodes 2 \
+ --node-rank $POD_INDEX \
+ --trust-remote-code \
+ --host 0.0.0.0 \
+ --port 8000 \
+ --enable-metrics \
+ --enable-ep-moe \
+ --expert-parallel-size 16
+ env:
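+        # the apps.kubernetes.io/pod-index label requires Kubernetes v1.28+ (PodIndexLabel feature, enabled by default)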
+ - name: POD_INDEX # reflects the node-rank
+ valueFrom:
+ fieldRef:
+ apiVersion: v1
+ fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
+ - name: NCCL_DEBUG
+ value: INFO
+        resources:
+          limits:
+            nvidia.com/gpu: "8"
+          requests:
+            nvidia.com/gpu: "8"
+ volumeMounts:
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /llm-folder
+ name: llm
+ securityContext:
+          privileged: true # required to access RDMA/InfiniBand devices; works together with hostNetwork: true
+ hostNetwork: true
+ volumes:
+ - emptyDir:
+ medium: Memory
+ sizeLimit: 10Gi
+ name: dshm
+ - hostPath:
+          path: /llm-folder # replace with a hostPath (or the PVC below) containing your model weights
+ type: DirectoryOrCreate
+ name: llm
+ #- persistentVolumeClaim:
+ # claimName: llm-pvc
+ # name: llm
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: sglang-master-pod
+spec:
+ type: ClusterIP
+ selector:
+ app: distributed-sglang
+ apps.kubernetes.io/pod-index: "0"
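+  # the pod-index selector pins this Service to the first pod (rank 0), giving
+  # --dist-init-addr a stable address at sglang-master-pod:5000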
+ ports:
+ - name: dist-port
+ port: 5000
+ targetPort: 5000
+---
+# the serving service
+apiVersion: v1
+kind: Service
+metadata:
+ name: sglang-serving-on-master
+spec:
+ type: NodePort
+ selector:
+ app: distributed-sglang
+ apps.kubernetes.io/pod-index: "0"
+ ports:
+ - name: serving
+ port: 8000
+ targetPort: 8000
+ - name: metrics
+ port: 8080
+ targetPort: 8080
diff --git a/docs/start/install.md b/docs/start/install.md
index fe460e044..f7234c0a6 100644
--- a/docs/start/install.md
+++ b/docs/start/install.md
@@ -98,7 +98,21 @@ drun v0.4.3.post4-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --in
2. Execute the command `docker compose up -d` in your terminal.
-## Method 5: Run on Kubernetes or Clouds with SkyPilot
+## Method 5: Using Kubernetes
+
+More
+
+1. Option 1: For single-node serving (typically when the model fits into the GPUs on one node).
+   Execute the command `kubectl apply -f docker/k8s-sglang-service.yaml` to create a Kubernetes Deployment and Service, using llama-31-8b as the example model.
+
+2. Option 2: For multi-node serving (usually when a large model, such as `DeepSeek-R1`, requires more than one GPU node).
+   Modify the LLM model path and serving arguments as necessary, then execute the command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml` to create a two-node Kubernetes StatefulSet and its serving Service. A quick verification sketch follows this list.
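+
+A quick way to verify the multi-node deployment (a minimal sketch: the label and Service name below come from `docker/k8s-sglang-distributed-sts.yaml`, and the request assumes the server has finished loading the model):
+
+```bash
+# watch until both pods are Running and Ready
+kubectl get pods -l app=distributed-sglang -w
+
+# forward the serving Service of the rank-0 pod to localhost
+kubectl port-forward service/sglang-serving-on-master 8000:8000
+
+# in another terminal, list the served model via the OpenAI-compatible API
+curl http://localhost:8000/v1/models
+```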
+
+## Method 6: Run on Kubernetes or Clouds with SkyPilot
More