diff --git a/docs/references/advanced_deploy.rst b/docs/references/advanced_deploy.rst index 06a10021e..bbc3668c5 100644 --- a/docs/references/advanced_deploy.rst +++ b/docs/references/advanced_deploy.rst @@ -5,3 +5,4 @@ Multi-Node Deployment multi_node.md deploy_on_k8s.md + disaggregation/lws_pd_deploy.md diff --git a/docs/references/disaggregation/lws-examples/d-svc.yaml b/docs/references/disaggregation/lws-examples/d-svc.yaml new file mode 100644 index 000000000..27f98009e --- /dev/null +++ b/docs/references/disaggregation/lws-examples/d-svc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-decode-main +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/docs/references/disaggregation/lws-examples/d.yaml b/docs/references/disaggregation/lws-examples/d.yaml new file mode 100644 index 000000000..376af741a --- /dev/null +++ b/docs/references/disaggregation/lws-examples/d.yaml @@ -0,0 +1,292 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-decode-main +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + labels: + role: leader + spec: + containers: + - command: + - python3 + - -m + - sglang.launch_server + - --port + - "30000" + - --host + - "0.0.0.0" + - --model-path + - /work/models + - --chunked-prefill-size + - "262144" + - --page-size + - "64" + - --enable-dp-attention + - --enable-dp-lm-head + - --dp-size + - "16" + - --enable-deepep-moe + - --deepep-mode + - low_latency + - --disaggregation-mode + - decode + - --mem-fraction-static + - "0.849" + - --context-length + - "32768" + - --disaggregation-ib-device + - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3" + - --cuda-graph-max-bs + - "64" + - --max-running-requests + - "2048" + - --tp-size + - "16" # Size of Tensor Parallelism + - --dist-init-addr + - $(LWS_LEADER_ADDRESS):20102 + - --nnodes + 
- $(LWS_GROUP_SIZE) + - --node-rank + - $(LWS_WORKER_INDEX) + - --trust-remote-code + - --ep-num-redundant-experts + - "32" + - --moe-dense-tp-size + - "1" + env: + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + - name: NCCL_IB_TC + value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: NCCL_IB_SL + value: "5" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: lmsysorg/sglang:latest + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 30 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + securityContext: + capabilities: + add: + - IPC_LOCK + privileged: true + volumeMounts: + - mountPath: /root/.cache + name: sgl-cache + - mountPath: /dev/shm + name: dshm + - mountPath: /work/models + name: model + - mountPath: /dev/infiniband + name: ib + - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs + name: cf + dnsPolicy: ClusterFirstWithHostNet + hostIPC: true + hostNetwork: true + nodeSelector: + # should modify according your deployment env + pd: "yes" + tolerations: + # should modify according your deployment env + - key: bopd + operator: Exists + - key: node-role + operator: Exists + volumes: + - hostPath: + path: /data1/sgl_cache1 + type: DirectoryOrCreate + name: sgl-cache + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: 
/data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528 + name: model + - hostPath: + path: /dev/infiniband + name: ib + - hostPath: + path: /data1/maas_hosted_models/models/fused_moe_triton/configs + name: cf + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: {} + spec: + containers: + - command: + - python3 + - -m + - sglang.launch_server + - --model-path + - /work/models + - --chunked-prefill-size + - "262144" + - --page-size + - "64" + - --enable-dp-attention + - --enable-dp-lm-head + - --dp-size + - "16" + - --enable-deepep-moe + - --deepep-mode + - low_latency + - --disaggregation-mode + - decode + - --mem-fraction-static + - "0.849" + - --context-length + - "32768" + - --disaggregation-ib-device + - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3" + - --cuda-graph-max-bs + - "64" + - --max-running-requests + - "2048" + - --tp-size + - "16" # Size of Tensor Parallelism + - --dist-init-addr + - $(LWS_LEADER_ADDRESS):20102 + - --nnodes + - $(LWS_GROUP_SIZE) + - --node-rank + - $(LWS_WORKER_INDEX) + - --trust-remote-code + - --ep-num-redundant-experts + - "32" + - --moe-dense-tp-size + - "1" + env: + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + - name: NCCL_IB_TC + value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: NCCL_IB_SL + value: "5" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: 
lmsysorg/sglang:latest + name: sglang-worker + ports: + - containerPort: 30001 + resources: + limits: + nvidia.com/gpu: "8" + securityContext: + capabilities: + add: + - IPC_LOCK + privileged: true + volumeMounts: + - mountPath: /root/.cache + name: sgl-cache + - mountPath: /dev/shm + name: dshm + - mountPath: /work/models + name: model + - mountPath: /dev/infiniband + name: ib + - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs + name: cf + dnsPolicy: ClusterFirstWithHostNet + hostIPC: true + hostNetwork: true + nodeSelector: + # should modify according your deployment env + pd: "yes" + tolerations: + # should modify according your deployment env + - key: bopd + operator: Exists + - key: node-role + operator: Exists + volumes: + - hostPath: + path: /data1/sgl_cache1 + type: DirectoryOrCreate + name: sgl-cache + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /dev/infiniband + name: ib + - hostPath: + # modify according to you deployment env + path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528 + name: model + - hostPath: + # modify according to you deployment env + path: /data1/maas_hosted_models/models/fused_moe_triton/configs + name: cf + networkConfig: + subdomainPolicy: Shared + replicas: 1 + rolloutStrategy: + rollingUpdateConfiguration: + maxSurge: 0 + maxUnavailable: 1 + type: RollingUpdate + startupPolicy: LeaderCreated diff --git a/docs/references/disaggregation/lws-examples/lb.yaml b/docs/references/disaggregation/lws-examples/lb.yaml new file mode 100644 index 000000000..da7861584 --- /dev/null +++ b/docs/references/disaggregation/lws-examples/lb.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: deepseekr10528-lb-main + labels: + app: deepseekr10528-lb +spec: + replicas: 1 + selector: + matchLabels: + app: deepseekr10528-lb + template: + metadata: + labels: + app: deepseekr10528-lb + spec: + nodeSelector: + bo: "yes" + tolerations: + - key: bopd + 
operator: Exists + - key: node-role + operator: Exists + containers: + - name: sgl-minilb + image: lmsysorg/sglang:latest + command: + - python + - -m + - sglang.srt.disaggregation.mini_lb + - --prefill + - http://deepseekr10528-prefill-main:30000 + - --decode + - http://deepseekr10528-decode-main:30000 + - --host + - 0.0.0.0 + - --port + - "8000" + ports: + - containerPort: 8000 + +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-lb-service +spec: + type: NodePort # NodePort is easy to test, you can also specify `ClusterIP` + selector: + app: deepseekr10528-lb + ports: + - protocol: TCP + port: 8000 # Service Port(In-Cluster) + targetPort: 8000 # Exposed Container + nodePort: 30800 diff --git a/docs/references/disaggregation/lws-examples/p-svc.yaml b/docs/references/disaggregation/lws-examples/p-svc.yaml new file mode 100644 index 000000000..6826a13df --- /dev/null +++ b/docs/references/disaggregation/lws-examples/p-svc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-prefill-main +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/docs/references/disaggregation/lws-examples/p.yaml b/docs/references/disaggregation/lws-examples/p.yaml new file mode 100644 index 000000000..6a463dc69 --- /dev/null +++ b/docs/references/disaggregation/lws-examples/p.yaml @@ -0,0 +1,306 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-prefill-main +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + labels: + role: leader + spec: + containers: + - command: + - python3 + - -m + - sglang.launch_server + - --port + - "30000" + - --host + - "0.0.0.0" + - --model-path + - /work/models + - --disaggregation-ib-device + # should modify according your rdma env + - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 + - --chunked-prefill-size + - "524288" + - 
--max-prefill-tokens + - "32768" + - --page-size + - "64" + - --ep-dispatch-algorithm + - dynamic + - --eplb-algorithm + - deepseek + - --enable-dp-lm-head + - --enable-dp-attention + - --dp-size + - "16" + - --disable-radix-cache + - --enable-deepep-moe + - --deepep-mode + - normal + - --disaggregation-mode + - prefill + - --mem-fraction-static + - "0.7" + - --context-length + - "32768" + - --tp + - "16" + - --dist-init-addr + - $(LWS_LEADER_ADDRESS):20102 + - --nnodes + - $(LWS_GROUP_SIZE) + - --node-rank + - $(LWS_WORKER_INDEX) + - --trust-remote-code + - --ep-num-redundant-experts + - "32" + - --moe-dense-tp-size + - "1" + - --max-running-requests + - "1024" + env: + - name: NVSHMEM_HCA_PE_MAPPING + # should modify according your rdma env + value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_IB_TC + value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "false" + - name: NCCL_IB_SL + value: "5" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: lmsysorg/sglang:latest + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 30 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + securityContext: + capabilities: + add: + - IPC_LOCK + privileged: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /work/models + name: model + - mountPath: /dev/infiniband + name: ib + - mountPath: 
/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs + name: cf + - mountPath: /root/.cache + name: sgl-cache + dnsPolicy: ClusterFirstWithHostNet + hostIPC: true + hostNetwork: true + nodeSelector: + # should modify according your deployment env + pd: "yes" + tolerations: + # should modify according your deployment env + - key: bopd + operator: Exists + - key: node-role + operator: Exists + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528 + name: model + - hostPath: + path: /dev/infiniband + name: ib + - hostPath: + path: /data1/maas_hosted_models/models/fused_moe_triton/configs + name: cf + - hostPath: + path: /data1/sgl_cache + type: DirectoryOrCreate + name: sgl-cache + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: {} + spec: + containers: + - command: + - python3 + - -m + - sglang.launch_server + - --model-path + - /work/models + - --disaggregation-ib-device + # should modify according your rdma env + - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 + - --chunked-prefill-size + - "524288" + - --max-prefill-tokens + - "32768" + - --page-size + - "64" + - --ep-dispatch-algorithm + - dynamic + - --eplb-algorithm + - deepseek + # - --deepep-config + # - /home/aiges/tuned/tuned_8sms.json + # can be tuned using deepep test scripts + - --enable-dp-lm-head + - --enable-dp-attention + - --dp-size + - "16" + - --disable-radix-cache + - --enable-deepep-moe + - --deepep-mode + - normal + - --disaggregation-mode + - prefill + - --mem-fraction-static + - "0.7" + - --context-length + - "32768" + - --tp + - "16" + - --dist-init-addr + - $(LWS_LEADER_ADDRESS):20102 + - --nnodes + - $(LWS_GROUP_SIZE) + - --node-rank + - $(LWS_WORKER_INDEX) + - --trust-remote-code + - --ep-num-redundant-experts + - "32" + - --moe-dense-tp-size + - "1" + - --max-running-requests + - "1024" + env: + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + 
- name: NVSHMEM_HCA_PE_MAPPING + # should modify according your rdma env + value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "8" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD + value: "0" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_IB_TC + value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: NCCL_IB_SL + value: "5" + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: lmsysorg/sglang:latest + name: sglang-worker + ports: + - containerPort: 30001 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "8" + securityContext: + capabilities: + add: + - IPC_LOCK + privileged: true + volumeMounts: + - mountPath: /root/.cache + name: sgl-cache + - mountPath: /dev/shm + name: dshm + - mountPath: /work/models + name: model + - mountPath: /dev/infiniband + name: ib + - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs + name: cf + dnsPolicy: ClusterFirstWithHostNet + hostIPC: true + hostNetwork: true + nodeSelector: + # should modify according your deployment env + pd: "yes" + tolerations: + # should modify according your deployment env + - key: bopd + operator: Exists + - key: node-role + operator: Exists + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /dev/infiniband + name: ib + - hostPath: + # modify according to you deployment env + path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528 + name: 
model + - hostPath: + # modify according to your deployment env + path: /data1/maas_hosted_models/models/fused_moe_triton/configs + name: cf + - hostPath: + # modify according to your deployment env + path: /data1/sgl_cache + type: DirectoryOrCreate + name: sgl-cache diff --git a/docs/references/disaggregation/lws_pd_deploy.md b/docs/references/disaggregation/lws_pd_deploy.md new file mode 100644 index 000000000..bca7c6a28 --- /dev/null +++ b/docs/references/disaggregation/lws_pd_deploy.md @@ -0,0 +1,786 @@ +# LWS Based PD Deploy + +## 0. Prerequisites + +1. Kubernetes >= 1.26 +2. LeaderWorkerSet (LWS) installed on the Kubernetes cluster. + +## 1. Image Preparation + +`lmsysorg/sglang:deepep` + +## 2. Deployment Manifest Files + +***Notice: We will package all deployment files into Helm Chart format in the near future. Interested community members are welcome to contact us to contribute.*** + +### Prefill + +Prefill manifest file: [prefill.yaml](lws-examples/p.yaml) + +*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment.* + +```yaml +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-prefill-main +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + labels: + role: leader + spec: + containers: + - command: + - python3 + - -m + - sglang.launch_server + - --port + - "30000" + - --host + - "0.0.0.0" + - --model-path + - /work/models + - --disaggregation-ib-device + # modify according to your RDMA env + - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 + - --chunked-prefill-size + - "524288" + - --max-prefill-tokens + - "32768" + - --page-size + - "64" + # - --init-expert-location + # - /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json + - --ep-dispatch-algorithm + - dynamic + - --eplb-algorithm + - deepseek + # - --deepep-config + # - /home/aiges/tuned/tuned_8sms.json + - --enable-dp-lm-head + - --enable-dp-attention + - --dp-size + - "16" + - 
--disable-radix-cache + - --enable-deepep-moe + - --deepep-mode + - normal + - --disaggregation-mode + - prefill + - --mem-fraction-static + - "0.7" + - --context-length + - "32768" + - --tp + - "16" + - --dist-init-addr + - $(LWS_LEADER_ADDRESS):20102 + - --nnodes + - $(LWS_GROUP_SIZE) + - --node-rank + - $(LWS_WORKER_INDEX) + - --trust-remote-code + - --ep-num-redundant-experts + - "32" + - --moe-dense-tp-size + - "1" + - --max-running-requests + - "1024" + env: +# - name: NVSHMEM_HCA_PE_MAPPING +# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2" +# - name: NVSHMEM_HCA_LIST +# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_IB_TC + value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "false" + - name: NCCL_IB_SL + value: "5" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: lmsysorg/sglang:deepep + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 30 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + securityContext: + capabilities: + add: + - IPC_LOCK + privileged: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /work/models + name: model + - mountPath: /dev/infiniband + name: ib + - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs + name: cf + - mountPath: /root/.cache + name: sgl-cache + dnsPolicy: ClusterFirstWithHostNet + hostIPC: true + hostNetwork: true + 
nodeSelector: + pd: "yes" + tolerations: + - key: pd + operator: Exists + - key: node-role + operator: Exists + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + # modify according to you deployment env + path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528 + name: model + - hostPath: + path: /dev/infiniband + name: ib + - hostPath: + # modify according to you deployment env + path: /data1/maas_hosted_models/models/fused_moe_triton/configs + name: cf + - hostPath: + # modify according to you deployment env + path: /data1/sgl_cache + type: DirectoryOrCreate + name: sgl-cache + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: {} + spec: + containers: + - command: + - python3 + - -m + - sglang.launch_server + - --model-path + - /work/models + - --disaggregation-ib-device + - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 + - --chunked-prefill-size + - "524288" + - --max-prefill-tokens + - "32768" + - --page-size + - "64" + #- --init-expert-location + #- /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json + - --ep-dispatch-algorithm + - dynamic + - --eplb-algorithm + - deepseek +# - --deepep-config +# - /home/aiges/tuned/tuned_8sms.json + - --enable-dp-lm-head + - --enable-dp-attention + - --dp-size + - "16" + - --disable-radix-cache + - --enable-deepep-moe + - --deepep-mode + - normal + - --disaggregation-mode + - prefill + - --mem-fraction-static + - "0.7" + - --context-length + - "32768" + - --tp + - "16" + - --dist-init-addr + - $(LWS_LEADER_ADDRESS):20102 + - --nnodes + - $(LWS_GROUP_SIZE) + - --node-rank + - $(LWS_WORKER_INDEX) + - --trust-remote-code + - --ep-num-redundant-experts + - "32" + - --moe-dense-tp-size + - "1" + - --max-running-requests + - "1024" + env: + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGLANG_HACK_DEEPEP_NUM_SMS + value: "8" + - name: SGLANG_HACK_DEEPEP_NEW_MODE + value: "0" +# - name: NVSHMEM_HCA_PE_MAPPING +# value: 
"mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2" +# - name: NVSHMEM_HCA_LIST +# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "8" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD + value: "0" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_IB_TC + value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: NCCL_IB_SL + value: "5" + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: lmsysorg/sglang:deepep + name: sglang-worker + ports: + - containerPort: 30001 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "8" + securityContext: + capabilities: + add: + - IPC_LOCK + privileged: true + volumeMounts: + + - mountPath: /root/.cache + name: sgl-cache + - mountPath: /dev/shm + name: dshm + - mountPath: /work/models + name: model + - mountPath: /dev/infiniband + name: ib + - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs + name: cf + dnsPolicy: ClusterFirstWithHostNet + hostIPC: true + hostNetwork: true + nodeSelector: + pd: "yes" + tolerations: + - key: pd + operator: Exists + - key: node-role + operator: Exists + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /dev/infiniband + name: ib + - hostPath: + path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528 + name: model + - hostPath: + path: /data1/maas_hosted_models/models/fused_moe_triton/configs + name: cf + - hostPath: + path: 
/data1/sgl_cache + type: DirectoryOrCreate + name: sgl-cache + +``` + +### Decode + +Decode node deployment manifest file [decode.yaml](lws-examples/d.yaml) + +*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment* + +```yaml +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-decode-main +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + labels: + role: leader + spec: + containers: + - command: + - python3 + - -m + - sglang.launch_server + - --port + - "30000" + - --host + - "0.0.0.0" + - --model-path + - /work/models + - --chunked-prefill-size + - "262144" + - --page-size + - "64" + - --enable-dp-attention + - --enable-dp-lm-head + - --dp-size + - "16" + - --enable-deepep-moe + - --deepep-mode + - low_latency + - --disaggregation-mode + - decode + - --mem-fraction-static + - "0.849" + - --context-length + - "32768" + - --disaggregation-ib-device + - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3" + - --cuda-graph-max-bs + - "64" + - --max-running-requests + - "2048" + - --tp-size + - "16" # Size of Tensor Parallelism + - --dist-init-addr + - $(LWS_LEADER_ADDRESS):20102 + - --nnodes + - $(LWS_GROUP_SIZE) + - --node-rank + - $(LWS_WORKER_INDEX) + - --trust-remote-code + - --ep-num-redundant-experts + - "32" + - --moe-dense-tp-size + - "1" + env: + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + - name: NCCL_IB_TC + value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: NCCL_IB_SL + value: "5" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: 
^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: lmsysorg/sglang:deepep + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 30 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + securityContext: + capabilities: + add: + - IPC_LOCK + privileged: true + volumeMounts: + - mountPath: /root/.cache + name: sgl-cache + - mountPath: /dev/shm + name: dshm + - mountPath: /work/models + name: model + - mountPath: /dev/infiniband + name: ib + - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs + name: cf + dnsPolicy: ClusterFirstWithHostNet + hostIPC: true + hostNetwork: true + nodeSelector: + pd: "yes" + tolerations: + - key: pd + operator: Exists + - key: node-role + operator: Exists + volumes: + - hostPath: + path: /data1/sgl_cache1 + type: DirectoryOrCreate + name: sgl-cache + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528 + name: model + - hostPath: + path: /dev/infiniband + name: ib + - hostPath: + path: /data1/maas_hosted_models/models/fused_moe_triton/configs + name: cf + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: {} + spec: + containers: + - command: + - python3 + - -m + - sglang.launch_server + - --model-path + - /work/models + - --chunked-prefill-size + - "262144" + - --page-size + - "64" + - --enable-dp-attention + - --enable-dp-lm-head + #- --enable-two-batch-overlap + - --dp-size + - "16" + - --enable-deepep-moe + - --deepep-mode + - low_latency + - --disaggregation-mode + - decode + - --mem-fraction-static + - "0.849" + - --context-length + - "32768" + - --disaggregation-ib-device + # should modify according your rdma env + - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3" + - --cuda-graph-max-bs + - "64" + - 
--max-running-requests + - "2048" + - --tp-size + - "16" # Size of Tensor Parallelism + - --dist-init-addr + - $(LWS_LEADER_ADDRESS):20102 + - --nnodes + - $(LWS_GROUP_SIZE) + - --node-rank + - $(LWS_WORKER_INDEX) + - --trust-remote-code + - --ep-num-redundant-experts + - "32" + - --moe-dense-tp-size + - "1" + env: + - name: SGLANG_HACK_DEEPEP_NUM_SMS + value: "24" + - name: SGLANG_HACK_DEEPEP_NEW_MODE + value: "0" + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + - name: NCCL_IB_TC + value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: NCCL_IB_SL + value: "5" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: lmsysorg/sglang:deepep + name: sglang-worker + ports: + - containerPort: 30001 + resources: + limits: + nvidia.com/gpu: "8" + securityContext: + capabilities: + add: + - IPC_LOCK + privileged: true + volumeMounts: + - mountPath: /root/.cache + name: sgl-cache + - mountPath: /dev/shm + name: dshm + - mountPath: /work/models + name: model + - mountPath: /dev/infiniband + name: ib + - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs + name: cf + dnsPolicy: ClusterFirstWithHostNet + hostIPC: true + hostNetwork: true + nodeSelector: + pd: "yes" + tolerations: + - key: pd + operator: Exists + - key: node-role + operator: Exists + volumes: + - hostPath: + path: /data1/sgl_cache1 + type: DirectoryOrCreate + name: sgl-cache + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /dev/infiniband + name: 
ib + - hostPath: + # modify according to your deployment env + path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528 + name: model + - hostPath: + # modify according to your deployment env + path: /data1/maas_hosted_models/models/fused_moe_triton/configs + name: cf + networkConfig: + subdomainPolicy: Shared + replicas: 1 + rolloutStrategy: + rollingUpdateConfiguration: + maxSurge: 0 + maxUnavailable: 1 + type: RollingUpdate + startupPolicy: LeaderCreated +``` + +Apply the two manifests separately: + +```bash +kubectl apply -f p.yaml +kubectl apply -f d.yaml +``` + +At this point, we have completed the deployment of the 1P1D SGLang engine part. + +To allow users to directly experience the model API, we still need a load balancer to handle the sequential calls between prefill and decode. Different companies implement LBs differently, and the community will also officially release a new LB component written in Rust in the near future. + +Currently, we use a static K8s Service + minilb approach to implement model API calls. 
+ +### Creating Service for Prefill and Decode + +#### Create prefill k8s service +[p-svc.yaml](lws-examples/p-svc.yaml) +```yaml +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-prefill-main +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 +``` +Execute `kubectl apply -f p-svc.yaml` + +#### Create decode k8s service +[d-svc.yaml](lws-examples/d-svc.yaml) +```yaml +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-decode-main +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 +``` +Execute `kubectl apply -f d-svc.yaml` + +#### Deploy minilb and lb service +[lb.yaml](lws-examples/lb.yaml) +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: deepseekr10528-lb-main + labels: + app: deepseekr10528-lb +spec: + replicas: 1 + selector: + matchLabels: + app: deepseekr10528-lb + template: + metadata: + labels: + app: deepseekr10528-lb + spec: + nodeSelector: + pd: "yes" + tolerations: + - key: pd + operator: Exists + - key: node-role + operator: Exists + containers: + - name: sgl-minilb + image: lmsysorg/sglang:deepep + command: + - python + - -m + - sglang.srt.disaggregation.mini_lb + - --prefill + - http://deepseekr10528-prefill-main:30000 + - --decode + - http://deepseekr10528-decode-main:30000 + - --host + - 0.0.0.0 + - --port + - "8000" + ports: + - containerPort: 8000 +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-lb-service +spec: + type: NodePort + selector: + app: deepseekr10528-lb + ports: + - protocol: TCP + port: 8000 # Service Port(In-Cluster) + targetPort: 8000 # Exposed Container + nodePort: 30800 +``` +Execute `kubectl apply -f lb.yaml` + +After waiting for all model deployments to succeed, you will get the following output: + +```bash +[root@ecs-001]# kubectl get po 
+deepseekr10528-decode-main-0 1/1 Running 0 74m +deepseekr10528-decode-main-0-1 1/1 Running 0 74m +deepseekr10528-lb-main-9c5dbfc57-6lcbd 1/1 Running 0 22m +deepseekr10528-prefill-main-0 1/1 Running 0 74m +deepseekr10528-prefill-main-0-1 1/1 Running 0 74m +[root@ecs-cbm-x1-pd-cpu-001 main_doc]# kubectl get svc |grep dee +deepseekr10528-decode-main ClusterIP None 97m +deepseekr10528-lb-service NodePort 172.16.242.169 8000:30800/TCP 22m +deepseekr10528-prefill-main ClusterIP None 97m +``` + +At this point, the model API can be reached through NodePort 30800 on the IP address of any cluster node: + +```bash +[root@ecs-001]# curl -X POST "http://{nodeIP}:30800/v1/chat/completions" \ +> -H "Content-Type: application/json" \ +> -H "Authorization: Bearer None" \ +> -d '{ +> "rid":"ccccdd", +> "model": "r1", +> "messages": [ +> {"role": "system", "content": "0: You are a helpful AI assistant"}, +> {"role": "user", "content": "你是谁?."} +> ], +> "max_tokens":221 +> }' +{"id":"ccccdd","object":"chat.completion","created":1750252498,"model":"qwen2","choices":[{"index":0,"message":{"role":"assistant","content":"\n嗯,用户问了一个很基础的自我介绍问题"你是谁?"。这可能是第一次互动时的常规开场白,也可能是想确认我的身份和功能范围。\n\n用户没有提供任何背景信息,语气简洁中性。这种场景下新用户的可能性较高,需要给出清晰友好的自我介绍,同时突出实用价值来降低陌生感。\n\n考虑到中文用户,应该用简体中文回复。重点要说明三点:身份归属(深度求索)、功能定位(AI助手)、服务范围(学习/工作/生活)。结尾用开放性问题引导对话很关键——既能了解需求,又能避免让用户面对空白输入框时不知所措。\n\n用波浪线结尾可以软化语气,那个笑脸表情😊刚好能中和AI的机械感。不过要控制表情符号数量,避免显得轻浮。\n\n你好呀!我是你的AI助手,由深度求索公司(DeepSeek)开发的语言模型,名字叫 **DeepSeek-R1**。你可以把我当成一个知识丰富、随叫随到的小帮手~😊\n\n我的任务就是陪你聊天、解答问题、","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"length","matched_stop":null}],"usage":{"prompt_tokens":14,"total_tokens":235,"completion_tokens":221,"prompt_tokens_details":null}} + +``` +## FAQ + +1. The current deployment startup parameters may not be fully compatible with all RDMA scenarios. Different RDMA NCCL-related environment configurations may be needed in different network environments. + +2. Some preset, optimized configurations for EPLB are not used here. 
You can adjust them as needed by following [issue #6017](https://github.com/sgl-project/sglang/issues/6017).