[docker] Distributed Serving with k8s Statefulset ( good example for DeepSeek-R1) (#3631)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io> Co-authored-by: Kebe <kebe.liu@daocloud.io>
This commit is contained in:
104
docker/k8s-sglang-distributed-sts.yaml
Normal file
104
docker/k8s-sglang-distributed-sts.yaml
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
---
# Two-node SGLang example: one StatefulSet pod per node, rank derived
# from the pod index, rendezvous via the `sglang-master-pod` Service.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: distributed-sglang
spec:
  replicas: 2  # number of nodes/pods to run distributed sglang
  selector:
    matchLabels:
      app: distributed-sglang
  # No governing headless service is used; rank 0 is reached through the
  # `sglang-master-pod` ClusterIP Service defined below.
  serviceName: ""
  template:
    metadata:
      labels:
        app: distributed-sglang
    spec:
      containers:
        - name: sglang-container
          image: docker.io/lmsysorg/sglang:latest
          imagePullPolicy: Always  # image may be replaced by official CI versioned image
          command:
            - /bin/bash
            - -c
          # please modify the sglang serving arguments below, as necessary.
          # NOTE: --expert-parallel-size and --enable-ep-moe are for MoE models like DeepSeek-R1
          args:
            - |
              python3 -m sglang.launch_server \
              --model /llm-folder \
              --dist-init-addr sglang-master-pod:5000 \
              --tensor-parallel-size 16 \
              --nnodes 2 \
              --node-rank $POD_INDEX \
              --trust-remote-code \
              --host 0.0.0.0 \
              --port 8000 \
              --enable-metrics \
              --enable-ep-moe \
              --expert-parallel-size 16
          env:
            # POD_INDEX reflects the node-rank; the pod-index label is set by
            # Kubernetes >= 1.28 on StatefulSet pods.
            - name: POD_INDEX
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
            - name: NCCL_DEBUG
              value: INFO
          resources:
            limits:
              nvidia.com/gpu: "8"
            # for extended resources such as GPUs, requests must equal limits
            requests:
              nvidia.com/gpu: "8"
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: /llm-folder
              name: llm
          securityContext:
            privileged: true  # to leverage RDMA/InfiniBand devices, co-works with hostNetwork: true
      hostNetwork: true
      volumes:
        - emptyDir:
            medium: Memory
            sizeLimit: 10Gi
          name: dshm
        - hostPath:
            path: /llm-folder  # replace with PVC or hostPath with your model weights
            type: DirectoryOrCreate
          name: llm
        # - persistentVolumeClaim:
        #     claimName: llm-pvc
        #   name: llm
---
# internal rendezvous service: routes the distributed-init port to the
# rank-0 (pod-index "0") pod of the StatefulSet
apiVersion: v1
kind: Service
metadata:
  name: sglang-master-pod
spec:
  type: ClusterIP
  selector:
    app: distributed-sglang
    # the pod-index label requires Kubernetes >= 1.28
    apps.kubernetes.io/pod-index: "0"
  ports:
    - name: dist-port
      port: 5000
      targetPort: 5000
---
# the serving service: exposes the HTTP API (8000) and metrics (8080)
# of the rank-0 pod outside the cluster via NodePort
apiVersion: v1
kind: Service
metadata:
  name: sglang-serving-on-master
spec:
  type: NodePort
  selector:
    app: distributed-sglang
    apps.kubernetes.io/pod-index: "0"
  ports:
    - name: serving
      port: 8000
      targetPort: 8000
    - name: metrics
      port: 8080
      targetPort: 8080
@@ -98,7 +98,21 @@ drun v0.4.3.post4-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --in
|
|||||||
2. Execute the command `docker compose up -d` in your terminal.
|
2. Execute the command `docker compose up -d` in your terminal.
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
## Method 5: Run on Kubernetes or Clouds with SkyPilot
|
## Method 5: Using Kubernetes
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>More</summary>
|
||||||
|
|
||||||
|
1. Option 1: For single-node serving (typically when the model fits into the GPUs of one node)
|
||||||
|
Execute the command `kubectl apply -f docker/k8s-sglang-service.yaml` to create a k8s Deployment and Service, with llama-31-8b as an example.
|
||||||
|
|
||||||
|
2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`)
|
||||||
|
Modify the LLM model path and arguments as necessary, then execute the command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml` to create a two-node k8s StatefulSet and its serving service.
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Method 6: Run on Kubernetes or Clouds with SkyPilot
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>More</summary>
|
<summary>More</summary>
|
||||||
|
|||||||
Reference in New Issue
Block a user