From 0e90ae628a07499936295d19793cee102ddfea8e Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Sun, 9 Mar 2025 15:41:20 +0800 Subject: [PATCH] [docker] Distributed Serving with k8s Statefulset ( good example for DeepSeek-R1) (#3631) Signed-off-by: Peter Pan Co-authored-by: Kebe --- docker/k8s-sglang-distributed-sts.yaml | 104 +++++++++++++++++++++++++ docs/start/install.md | 16 +++- 2 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 docker/k8s-sglang-distributed-sts.yaml diff --git a/docker/k8s-sglang-distributed-sts.yaml b/docker/k8s-sglang-distributed-sts.yaml new file mode 100644 index 000000000..6b81d9b14 --- /dev/null +++ b/docker/k8s-sglang-distributed-sts.yaml @@ -0,0 +1,104 @@ +# Two Nodes Sglang example + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: distributed-sglang +spec: + replicas: 2 # number of nodes/pods to run distributed sglang + selector: + matchLabels: + app: distributed-sglang + serviceName: "" + template: + metadata: + labels: + app: distributed-sglang + spec: + containers: + - name: sglang-container + image: docker.io/lmsysorg/sglang:latest + imagePullPolicy: Always # image may be replaced by official CI versioned image + command: + - /bin/bash + - -c + # please modify the sglang serving arguments below, as necessary. 
+ # NOTE: the --expert-parallel-size and --enable-ep-moe are for MoE model like DeepSeek-R1 + args: + - | + python3 -m sglang.launch_server \ + --model /llm-folder \ + --dist-init-addr sglang-master-pod:5000 \ + --tensor-parallel-size 16 \ + --nnodes 2 \ + --node-rank $POD_INDEX \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port 8000 \ + --enable-metrics \ + --enable-ep-moe \ + --expert-parallel-size 16 + env: + - name: POD_INDEX # reflects the node-rank + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.labels['apps.kubernetes.io/pod-index'] + - name: NCCL_DEBUG + value: INFO + resources: + limits: + nvidia.com/gpu: "8" + requests: + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /llm-folder + name: llm + securityContext: + privileged: true # to leverage RDMA/InfiniBand device, co-work with HostNetwork=true + hostNetwork: true + volumes: + - emptyDir: + medium: Memory + sizeLimit: 10Gi + name: dshm + - hostPath: + path: /llm-folder # replace with PVC or hostPath with your model weights + type: DirectoryOrCreate + name: llm + #- persistentVolumeClaim: + # claimName: llm-pvc + # name: llm +--- +apiVersion: v1 +kind: Service +metadata: + name: sglang-master-pod +spec: + type: ClusterIP + selector: + app: distributed-sglang + apps.kubernetes.io/pod-index: "0" + ports: + - name: dist-port + port: 5000 + targetPort: 5000 +--- +# the serving service +apiVersion: v1 +kind: Service +metadata: + name: sglang-serving-on-master +spec: + type: NodePort + selector: + app: distributed-sglang + apps.kubernetes.io/pod-index: "0" + ports: + - name: serving + port: 8000 + targetPort: 8000 + - name: metrics + port: 8080 + targetPort: 8080 diff --git a/docs/start/install.md b/docs/start/install.md index fe460e044..f7234c0a6 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -98,7 +98,21 @@ drun v0.4.3.post4-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --in 2. Execute the command `docker compose up -d` in your terminal. 
-## Method 5: Run on Kubernetes or Clouds with SkyPilot +## Method 5: Using Kubernetes + +<details> +<summary>More</summary> + +1. Option 1: For single-node serving (typically when the model size fits into the GPUs on one node) + Execute command `kubectl apply -f docker/k8s-sglang-service.yaml`, to create a k8s deployment and service, with llama-31-8b as an example. + +2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`) + Modify the LLM model path and arguments as necessary, then execute command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml`, to create a two-node k8s StatefulSet and the serving service. +</details>
+ + + +## Method 6: Run on Kubernetes or Clouds with SkyPilot
<details> <summary>More</summary>