Refactor the docs (#9031)
This commit is contained in:
337
docs/references/multi_node_deployment/deploy_on_k8s.md
Normal file
337
docs/references/multi_node_deployment/deploy_on_k8s.md
Normal file
@@ -0,0 +1,337 @@
|
||||
# Deploy On Kubernetes
|
||||
|
||||
This document is for deploying a RoCE network-based SGLang two-node inference service on a Kubernetes (K8S) cluster.
|
||||
|
||||
[LeaderWorkerSet (LWS)](https://github.com/kubernetes-sigs/lws) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. A major use case is for multi-host/multi-node distributed inference.
|
||||
|
||||
SGLang can also be deployed with LWS on Kubernetes for distributed model serving.
|
||||
|
||||
Please see this guide for more details on deploying SGLang on Kubernetes using LWS.
|
||||
|
||||
Here we take the deployment of DeepSeek-R1 as an example.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. At least two Kubernetes nodes are required, each being an H20 system equipped with eight GPUs (16 GPUs in total, matching the tensor-parallel size of 16 used below).
|
||||
|
||||
2. Make sure your K8S cluster has LWS correctly installed. If it hasn't been set up yet, please follow the [installation instructions](https://github.com/kubernetes-sigs/lws/blob/main/site/content/en/docs/installation/_index.md). **Note:** For LWS versions ≤0.5.x, you must use the Downward API to obtain `LWS_WORKER_INDEX`, as native support for this feature was introduced in v0.6.0.
|
||||
|
||||
## Basic example
|
||||
|
||||
For the basic example documentation, refer to [Deploy Distributed Inference Service with SGLang and LWS on GPUs](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/sglang).
|
||||
|
||||
However, that document only covers the basic NCCL socket mode.
|
||||
|
||||
In this section, we’ll make some simple modifications to adapt the setup to the RDMA scenario.
|
||||
|
||||
## RDMA RoCE case
|
||||
|
||||
* Check your env:
|
||||
|
||||
```bash
|
||||
[root@node1 ~]# ibstatus
|
||||
Infiniband device 'mlx5_bond_0' port 1 status:
|
||||
default gid: fe80:0000:0000:0000:0225:9dff:fe64:c79a
|
||||
base lid: 0x0
|
||||
sm lid: 0x0
|
||||
state: 4: ACTIVE
|
||||
phys state: 5: LinkUp
|
||||
rate: 200 Gb/sec (2X NDR)
|
||||
link_layer: Ethernet
|
||||
|
||||
Infiniband device 'mlx5_bond_1' port 1 status:
|
||||
default gid: fe80:0000:0000:0000:0225:9dff:fe6e:c3ec
|
||||
base lid: 0x0
|
||||
sm lid: 0x0
|
||||
state: 4: ACTIVE
|
||||
phys state: 5: LinkUp
|
||||
rate: 200 Gb/sec (2X NDR)
|
||||
link_layer: Ethernet
|
||||
|
||||
Infiniband device 'mlx5_bond_2' port 1 status:
|
||||
default gid: fe80:0000:0000:0000:0225:9dff:fe73:0dd7
|
||||
base lid: 0x0
|
||||
sm lid: 0x0
|
||||
state: 4: ACTIVE
|
||||
phys state: 5: LinkUp
|
||||
rate: 200 Gb/sec (2X NDR)
|
||||
link_layer: Ethernet
|
||||
|
||||
Infiniband device 'mlx5_bond_3' port 1 status:
|
||||
default gid: fe80:0000:0000:0000:0225:9dff:fe36:f7ff
|
||||
base lid: 0x0
|
||||
sm lid: 0x0
|
||||
state: 4: ACTIVE
|
||||
phys state: 5: LinkUp
|
||||
rate: 200 Gb/sec (2X NDR)
|
||||
link_layer: Ethernet
|
||||
```
|
||||
|
||||
* Prepare the `lws.yaml` file for deploying on k8s.
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: sglang
|
||||
spec:
|
||||
replicas: 1
|
||||
leaderWorkerTemplate:
|
||||
size: 2
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostNetwork: true
|
||||
hostIPC: true
|
||||
containers:
|
||||
- name: sglang-leader
|
||||
image: sglang:latest
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NCCL_IB_GID_INDEX
|
||||
value: "3"
|
||||
command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --mem-fraction-static
|
||||
- "0.93"
|
||||
- --torch-compile-max-bs
|
||||
- "8"
|
||||
- --max-running-requests
|
||||
- "20"
|
||||
- --tp
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20000
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --port
|
||||
- "40000"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
ports:
|
||||
- containerPort: 40000
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 40000
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- name: model
|
||||
mountPath: /work/models
|
||||
- name: ib
|
||||
mountPath: /dev/infiniband
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: model
|
||||
hostPath:
|
||||
path: '< your models dir >' # modify it according to your models directory
|
||||
- name: ib
|
||||
hostPath:
|
||||
path: /dev/infiniband
|
||||
workerTemplate:
|
||||
spec:
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostNetwork: true
|
||||
hostIPC: true
|
||||
containers:
|
||||
- name: sglang-worker
|
||||
image: sglang:latest
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NCCL_IB_GID_INDEX
|
||||
value: "3"
|
||||
command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --mem-fraction-static
|
||||
- "0.93"
|
||||
- --torch-compile-max-bs
|
||||
- "8"
|
||||
- --max-running-requests
|
||||
- "20"
|
||||
- --tp
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20000
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- name: model
|
||||
mountPath: /work/models
|
||||
- name: ib
|
||||
mountPath: /dev/infiniband
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: ib
|
||||
hostPath:
|
||||
path: /dev/infiniband
|
||||
- name: model
|
||||
hostPath:
|
||||
path: /data1/models/deepseek_v3_moe
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: sglang-leader
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: sglang
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 40000
|
||||
targetPort: 40000
|
||||
|
||||
```
|
||||
|
||||
* Then deploy it with `kubectl apply -f lws.yaml`. Running `kubectl get pods` will show output like this:
|
||||
|
||||
```text
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
sglang-0 0/1 Running 0 9s
|
||||
sglang-0-1 1/1 Running 0 9s
|
||||
```
|
||||
|
||||
Wait for the sglang leader (`sglang-0`) status to change to 1/1, which indicates it is `Ready`.
|
||||
|
||||
You can use the command `kubectl logs -f sglang-0` to view the logs of the leader node.
|
||||
|
||||
Once successful, you should see output like this:
|
||||
|
||||
```text
|
||||
[2025-02-17 05:27:24 TP1] Capture cuda graph end. Time elapsed: 84.89 s
|
||||
[2025-02-17 05:27:24 TP6] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP0] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP7] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP3] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP2] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP4] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP1] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP5] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24] INFO: Started server process [1]
|
||||
[2025-02-17 05:27:24] INFO: Waiting for application startup.
|
||||
[2025-02-17 05:27:24] INFO: Application startup complete.
|
||||
[2025-02-17 05:27:24] INFO: Uvicorn running on http://0.0.0.0:40000 (Press CTRL+C to quit)
|
||||
[2025-02-17 05:27:25] INFO: 127.0.0.1:48908 - "GET /get_model_info HTTP/1.1" 200 OK
|
||||
[2025-02-17 05:27:25 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
|
||||
[2025-02-17 05:27:32] INFO: 127.0.0.1:48924 - "POST /generate HTTP/1.1" 200 OK
|
||||
[2025-02-17 05:27:32] The server is fired up and ready to roll!
|
||||
```
|
||||
|
||||
If it doesn’t start up successfully, follow the steps below to diagnose the remaining issues.
|
||||
|
||||
### Debug
|
||||
|
||||
* Set `NCCL_DEBUG=TRACE` to check if it is a NCCL communication problem.
|
||||
|
||||
This should resolve most NCCL-related issues.
|
||||
|
||||
***Notice: If you find that NCCL_DEBUG=TRACE is not effective in the container environment, but the process is stuck or you encounter hard-to-diagnose issues, try switching to a different container image. Some images may not handle standard error output properly.***
|
||||
|
||||
#### RoCE scenario
|
||||
|
||||
* Please make sure that RDMA devices are available in the cluster environment.
|
||||
* Please make sure that the nodes in the cluster have Mellanox NICs with RoCE. In this example, we use Mellanox ConnectX 5 model NICs, and the proper OFED driver has been installed. If not, please refer to the document [Install OFED Driver](https://docs.nvidia.com/networking/display/mlnxofedv461000/installing+mellanox+ofed) to install the driver.
|
||||
* Check your env:
|
||||
|
||||
```shell
|
||||
$ lspci -nn | grep Eth | grep Mellanox
|
||||
0000:7f:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0000:7f:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0000:c7:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0000:c7:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0001:08:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0001:08:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0001:a2:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0001:a2:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
```
|
||||
|
||||
* Check the OFED driver:
|
||||
|
||||
```shell
|
||||
ofed_info -s
|
||||
OFED-internal-23.07-0.5.0:
|
||||
```
|
||||
|
||||
* Show RDMA link status and check IB devices:
|
||||
|
||||
```shell
|
||||
$ rdma link show
|
||||
8/1: mlx5_bond_0/1: state ACTIVE physical_state LINK_UP netdev reth0
|
||||
9/1: mlx5_bond_1/1: state ACTIVE physical_state LINK_UP netdev reth2
|
||||
10/1: mlx5_bond_2/1: state ACTIVE physical_state LINK_UP netdev reth4
|
||||
11/1: mlx5_bond_3/1: state ACTIVE physical_state LINK_UP netdev reth6
|
||||
|
||||
$ ibdev2netdev
|
||||
8/1: mlx5_bond_0/1: state ACTIVE physical_state LINK_UP netdev reth0
|
||||
9/1: mlx5_bond_1/1: state ACTIVE physical_state LINK_UP netdev reth2
|
||||
10/1: mlx5_bond_2/1: state ACTIVE physical_state LINK_UP netdev reth4
|
||||
11/1: mlx5_bond_3/1: state ACTIVE physical_state LINK_UP netdev reth6
|
||||
```
|
||||
|
||||
* Test RoCE network speed on the host:
|
||||
|
||||
```shell
|
||||
yum install qperf
|
||||
# for server:
|
||||
execute qperf
|
||||
# for client
|
||||
qperf -t 60 -cm1 <server_ip> rc_rdma_write_bw
|
||||
```
|
||||
|
||||
* Check that RDMA devices are accessible inside your container:
|
||||
|
||||
```shell
|
||||
# ibv_devices
|
||||
# ibv_devinfo
|
||||
```
|
||||
|
||||
## Keys to success
|
||||
|
||||
* In the YAML configuration above, pay attention to the NCCL environment variable. For older versions of NCCL, you should check the NCCL_IB_GID_INDEX environment setting.
|
||||
* NCCL_SOCKET_IFNAME is also crucial, but in a containerized environment, this typically isn’t an issue.
|
||||
* In some cases, it’s necessary to configure GLOO_SOCKET_IFNAME correctly.
|
||||
* NCCL_DEBUG is essential for troubleshooting, but I've found that sometimes it doesn't show error logs within containers. This could be related to the Docker image you're using. You may want to try switching images if needed.
|
||||
* Avoid using Docker images based on Ubuntu 18.04, as they tend to have compatibility issues.
|
||||
|
||||
## Remaining issues
|
||||
|
||||
* In Kubernetes, Docker, or Containerd environments, we use hostNetwork to prevent performance degradation.
|
||||
* We utilize privileged mode, which isn’t secure. Additionally, in containerized environments, full GPU isolation cannot be achieved.
|
||||
|
||||
## TODO
|
||||
|
||||
* Integrate with [k8s-rdma-shared-dev-plugin](https://github.com/Mellanox/k8s-rdma-shared-dev-plugin).
|
||||
@@ -0,0 +1,12 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
290
docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml
Normal file
290
docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml
Normal file
@@ -0,0 +1,290 @@
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# modify according to your deployment environment
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# modify according to your deployment environment
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to your deployment environment
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
networkConfig:
|
||||
subdomainPolicy: Shared
|
||||
replicas: 1
|
||||
rolloutStrategy:
|
||||
rollingUpdateConfiguration:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
type: RollingUpdate
|
||||
startupPolicy: LeaderCreated
|
||||
@@ -0,0 +1,55 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: deepseekr10528-lb-main
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: deepseekr10528-lb
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
nodeSelector:
|
||||
bo: "yes"
|
||||
tolerations:
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
containers:
|
||||
- name: sgl-minilb
|
||||
image: lmsysorg/sglang:latest
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
- sglang.srt.disaggregation.mini_lb
|
||||
- --prefill
|
||||
- http://deepseekr10528-prefill-main:30000
|
||||
- --decode
|
||||
- http://deepseekr10528-decode-main:30000
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --port
|
||||
- "8000"
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-lb-service
|
||||
spec:
|
||||
type: NodePort # NodePort is easy to test, you can also specify `ClusterIP`
|
||||
selector:
|
||||
app: deepseekr10528-lb
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000 # Service Port(In-Cluster)
|
||||
targetPort: 8000 # Exposed Container
|
||||
nodePort: 30800
|
||||
@@ -0,0 +1,12 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
304
docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml
Normal file
304
docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml
Normal file
@@ -0,0 +1,304 @@
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
# modify according to your RDMA environment
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
# should modify according your rdma env
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "false"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
# should modify according your rdma env
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
# - --deepep-config
|
||||
# - /home/aiges/tuned/tuned_8sms.json
|
||||
# can be tuned using deepep test scripts
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
# should modify according your rdma env
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "8"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
|
||||
value: "0"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
protocol: TCP
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
782
docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md
Normal file
782
docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md
Normal file
@@ -0,0 +1,782 @@
|
||||
# LWS Based PD Deploy
|
||||
|
||||
## 0. Prerequisites
|
||||
|
||||
1. k8s >=1.26
|
||||
2. lws installed on k8s.
|
||||
|
||||
## 1. Image Preparation
|
||||
|
||||
`lmsysorg/sglang:deepep`
|
||||
|
||||
## 2. Deployment Manifest Files
|
||||
|
||||
***Notice: We will package all deployment files into Helm Chart format in the near future. Interested community members can contact us to contribute***
|
||||
|
||||
### Prefill
|
||||
|
||||
Prefill manifest file [prefill.yaml](lws-examples/p.yaml)
|
||||
|
||||
*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
# should be modified according to your RDMA environment
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
# - --init-expert-location
|
||||
# - /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
# - --deepep-config
|
||||
# - /home/aiges/tuned/tuned_8sms.json
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
# - name: NVSHMEM_HCA_PE_MAPPING
|
||||
# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
# - name: NVSHMEM_HCA_LIST
|
||||
# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "false"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
# modify according to your deployment environment
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to your deployment environment
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
# modify according to your deployment environment
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
#- --init-expert-location
|
||||
#- /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
# - --deepep-config
|
||||
# - /home/aiges/tuned/tuned_8sms.json
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: SGLANG_HACK_DEEPEP_NUM_SMS
|
||||
value: "8"
|
||||
- name: SGLANG_HACK_DEEPEP_NEW_MODE
|
||||
value: "0"
|
||||
# - name: NVSHMEM_HCA_PE_MAPPING
|
||||
# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
# - name: NVSHMEM_HCA_LIST
|
||||
# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "8"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
|
||||
value: "0"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
protocol: TCP
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
|
||||
```
|
||||
|
||||
### Decode
|
||||
|
||||
Decode node deployment manifest file [decode.yaml](lws-examples/d.yaml)
|
||||
|
||||
*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
#- --enable-two-batch-overlap
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
# should be modified according to your RDMA environment
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: SGLANG_HACK_DEEPEP_NUM_SMS
|
||||
value: "24"
|
||||
- name: SGLANG_HACK_DEEPEP_NEW_MODE
|
||||
value: "0"
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to your deployment environment
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
# modify according to your deployment environment
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
networkConfig:
|
||||
subdomainPolicy: Shared
|
||||
replicas: 1
|
||||
rolloutStrategy:
|
||||
rollingUpdateConfiguration:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
type: RollingUpdate
|
||||
startupPolicy: LeaderCreated
|
||||
```
|
||||
|
||||
Execute separately:
|
||||
|
||||
```bash
|
||||
kubectl apply -f p.yaml
|
||||
kubectl apply -f d.yaml
|
||||
```
|
||||
|
||||
At this point, we have completed the deployment of the 1P1D SGLang engine part.
|
||||
|
||||
To allow our users to directly experience the model API, we still need a load balancer to handle sequential calls between prefill and decode. Different companies implement LBs differently, and the community will also officially release a new LB component written in Rust in the near future.
|
||||
|
||||
Currently, we use a static K8S service + minilb approach to implement model API calls.
|
||||
|
||||
### Creating Service for Prefill and Decode
|
||||
|
||||
#### Create prefill k8s service
|
||||
[p-svc.yaml](lws-examples/p-svc.yaml)
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
```
|
||||
Execute `kubectl apply -f p-svc.yaml`
|
||||
|
||||
#### Create decode k8s service
|
||||
[d-svc.yaml](lws-examples/d-svc.yaml)
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
```
|
||||
Execute `kubectl apply -f d-svc.yaml`
|
||||
|
||||
#### Deploy minilb and lb service
|
||||
[lb.yaml](lws-examples/lb.yaml)
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: deepseekr10528-lb-main
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: deepseekr10528-lb
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
containers:
|
||||
- name: sgl-minilb
|
||||
image: lmsysorg/sglang:deepep
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
- sglang.srt.disaggregation.mini_lb
|
||||
- --prefill
|
||||
- http://deepseekr10528-prefill-main:30000
|
||||
- --decode
|
||||
- http://deepseekr10528-decode-main:30000
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --port
|
||||
- "8000"
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-lb-service
|
||||
spec:
|
||||
type: NodePort
|
||||
selector:
|
||||
app: deepseekr10528-lb
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000 # Service Port(In-Cluster)
|
||||
targetPort: 8000 # Exposed Container
|
||||
nodePort: 30800
|
||||
```
|
||||
Execute `kubectl apply -f lb.yaml`
|
||||
|
||||
After waiting for all model deployments to succeed, you will get the following output:
|
||||
|
||||
```bash
|
||||
[root@ecs-001]# kubectl get po
|
||||
deepseekr10528-decode-main-0 1/1 Running 0 74m
|
||||
deepseekr10528-decode-main-0-1 1/1 Running 0 74m
|
||||
deepseekr10528-lb-main-9c5dbfc57-6lcbd 1/1 Running 0 22m
|
||||
deepseekr10528-prefill-main-0 1/1 Running 0 74m
|
||||
deepseekr10528-prefill-main-0-1 1/1 Running 0 74m
|
||||
[root@ecs-cbm-x1-pd-cpu-001 main_doc]# kubectl get svc |grep dee
|
||||
deepseekr10528-decode-main ClusterIP None <none> <none> 97m
|
||||
deepseekr10528-lb-service NodePort 172.16.242.169 <none> 8000:30800/TCP 22m
|
||||
deepseekr10528-prefill-main ClusterIP None <none> <none> 97m
|
||||
```
|
||||
|
||||
At this point, access the service through any node's IP address on NodePort 30800:
|
||||
|
||||
```bash
|
||||
[root@ecs-001]# curl -X POST "http://{nodePort}:30800/v1/chat/completions" \
|
||||
> -H "Content-Type: application/json" \
|
||||
> -H "Authorization: Bearer None" \
|
||||
> -d '{
|
||||
> "rid":"ccccdd",
|
||||
> "model": "r1",
|
||||
> "messages": [
|
||||
> {"role": "system", "content": "0: You are a helpful AI assistant"},
|
||||
> {"role": "user", "content": "你是谁?."}
|
||||
> ],
|
||||
> "max_tokens":221
|
||||
> }'
|
||||
{"id":"ccccdd","object":"chat.completion","created":1750252498,"model":"qwen2","choices":[{"index":0,"message":{"role":"assistant","content":"<think>\n嗯,用户问了一个很基础的自我介绍问题"你是谁?"。这可能是第一次互动时的常规开场白,也可能是想确认我的身份和功能范围。\n\n用户没有提供任何背景信息,语气简洁中性。这种场景下新用户的可能性较高,需要给出清晰友好的自我介绍,同时突出实用价值来降低陌生感。\n\n考虑到中文用户,应该用简体中文回复。重点要说明三点:身份归属(深度求索)、功能定位(AI助手)、服务范围(学习/工作/生活)。结尾用开放性问题引导对话很关键——既能了解需求,又能避免让用户面对空白输入框时不知所措。\n\n用波浪线结尾可以软化语气,那个笑脸表情😊刚好能中和AI的机械感。不过要控制表情符号数量,避免显得轻浮。\n</think>\n你好呀!我是你的AI助手,由深度求索公司(DeepSeek)开发的语言模型,名字叫 **DeepSeek-R1**。你可以把我当成一个知识丰富、随叫随到的小帮手~😊\n\n我的任务就是陪你聊天、解答问题、","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"length","matched_stop":null}],"usage":{"prompt_tokens":14,"total_tokens":235,"completion_tokens":221,"prompt_tokens_details":null}}
|
||||
|
||||
```
|
||||
## FAQ
|
||||
|
||||
1. The current deployment startup parameters may not be fully compatible with all RDMA scenarios. Different RDMA NCCL-related environment configurations may be needed in different network environments.
|
||||
|
||||
2. Some preset, optimized configurations for EPLB are not used here. You can adjust them according to [6017](https://github.com/sgl-project/sglang/issues/6017) as needed.
|
||||
90
docs/references/multi_node_deployment/multi_node.md
Normal file
90
docs/references/multi_node_deployment/multi_node.md
Normal file
@@ -0,0 +1,90 @@
|
||||
# Multi-Node Deployment
|
||||
|
||||
## Llama 3.1 405B
|
||||
|
||||
**Run 405B (fp16) on Two Nodes**
|
||||
|
||||
```bash
|
||||
# replace 172.16.4.52:20000 with your own node ip address and port of the first node
|
||||
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --dist-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0
|
||||
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --dist-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1
|
||||
```
|
||||
|
||||
Note that LLama 405B (fp8) can also be launched on a single node.
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
|
||||
```
|
||||
|
||||
## DeepSeek V3/R1
|
||||
|
||||
Please refer to [DeepSeek documents for reference](https://docs.sglang.ai/references/deepseek.html#running-examples-on-multi-node).
|
||||
|
||||
## Multi-Node Inference on SLURM
|
||||
|
||||
This example showcases how to serve SGLang server across multiple nodes by SLURM. Submit the following job to the SLURM cluster.
|
||||
|
||||
```
|
||||
#!/bin/bash -l
|
||||
|
||||
#SBATCH -o SLURM_Logs/%x_%j_master.out
|
||||
#SBATCH -e SLURM_Logs/%x_%j_master.err
|
||||
#SBATCH -D ./
|
||||
#SBATCH -J Llama-405B-Online-Inference-TP16-SGL
|
||||
|
||||
#SBATCH --nodes=2
|
||||
#SBATCH --ntasks=2
|
||||
#SBATCH --ntasks-per-node=1 # Ensure 1 task per node
|
||||
#SBATCH --cpus-per-task=18
|
||||
#SBATCH --mem=224GB
|
||||
#SBATCH --partition="lmsys.org"
|
||||
#SBATCH --gres=gpu:8
|
||||
#SBATCH --time=12:00:00
|
||||
|
||||
echo "[INFO] Activating environment on node $SLURM_PROCID"
|
||||
if ! source ENV_FOLDER/bin/activate; then
|
||||
echo "[ERROR] Failed to activate environment" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Define parameters
|
||||
model=MODEL_PATH
|
||||
tp_size=16
|
||||
|
||||
echo "[INFO] Running inference"
|
||||
echo "[INFO] Model: $model"
|
||||
echo "[INFO] TP Size: $tp_size"
|
||||
|
||||
# Set NCCL initialization address using the hostname of the head node
|
||||
HEAD_NODE=$(scontrol show hostname "$SLURM_NODELIST" | head -n 1)
|
||||
NCCL_INIT_ADDR="${HEAD_NODE}:8000"
|
||||
echo "[INFO] NCCL_INIT_ADDR: $NCCL_INIT_ADDR"
|
||||
|
||||
# Launch the model server on each node using SLURM
|
||||
srun --ntasks=2 --nodes=2 --output="SLURM_Logs/%x_%j_node$SLURM_NODEID.out" \
|
||||
--error="SLURM_Logs/%x_%j_node$SLURM_NODEID.err" \
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path "$model" \
|
||||
--grammar-backend "xgrammar" \
|
||||
--tp "$tp_size" \
|
||||
--dist-init-addr "$NCCL_INIT_ADDR" \
|
||||
--nnodes 2 \
|
||||
--node-rank "$SLURM_NODEID" &
|
||||
|
||||
# Wait for the NCCL server to be ready on port 30000
|
||||
while ! nc -z "$HEAD_NODE" 30000; do
|
||||
sleep 1
|
||||
echo "[INFO] Waiting for $HEAD_NODE:30000 to accept connections"
|
||||
done
|
||||
|
||||
echo "[INFO] $HEAD_NODE:30000 is ready to accept connections"
|
||||
|
||||
# Keep the script running until the SLURM job times out
|
||||
wait
|
||||
```
|
||||
|
||||
Then, you can test the server by sending requests following other [documents](https://docs.sglang.ai/backend/openai_api_completions.html).
|
||||
|
||||
Thanks to [aflah02](https://github.com/aflah02) for providing the example, based on his [blog post](https://aflah02.substack.com/p/multi-node-llm-inference-with-sglang).
|
||||
13
docs/references/multi_node_deployment/multi_node_index.rst
Normal file
13
docs/references/multi_node_deployment/multi_node_index.rst
Normal file
@@ -0,0 +1,13 @@
|
||||
Multi-Node Deployment
|
||||
=====================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Multi-Node Deployment
|
||||
|
||||
multi_node.md
|
||||
deploy_on_k8s.md
|
||||
lws_pd/lws_pd_deploy.md
|
||||
|
||||
- `Deploying DeepSeek with PD Disaggregation and Large-Scale Expert Parallelism on 96 H100 GPUs <https://lmsys.org/blog/2025-05-05-large-scale-ep/>`_
|
||||
- `Deploying Kimi K2 with PD Disaggregation and Large-Scale Expert Parallelism on 128 H200 GPUs <https://lmsys.org/blog/2025-07-20-k2-large-scale-ep/>`_
|
||||
Reference in New Issue
Block a user