[doc] update lws doc for pd (#7318)
This commit is contained in:
@@ -5,3 +5,4 @@ Multi-Node Deployment
|
||||
|
||||
multi_node.md
|
||||
deploy_on_k8s.md
|
||||
disaggregation/lws_pd_deploy.md
|
||||
|
||||
12
docs/references/disaggregation/lws-examples/d-svc.yaml
Normal file
12
docs/references/disaggregation/lws-examples/d-svc.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
292
docs/references/disaggregation/lws-examples/d.yaml
Normal file
292
docs/references/disaggregation/lws-examples/d.yaml
Normal file
@@ -0,0 +1,292 @@
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --enable-deepep-moe
|
||||
- --deepep-mode
|
||||
- low_latency
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --enable-deepep-moe
|
||||
- --deepep-mode
|
||||
- low_latency
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
networkConfig:
|
||||
subdomainPolicy: Shared
|
||||
replicas: 1
|
||||
rolloutStrategy:
|
||||
rollingUpdateConfiguration:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
type: RollingUpdate
|
||||
startupPolicy: LeaderCreated
|
||||
55
docs/references/disaggregation/lws-examples/lb.yaml
Normal file
55
docs/references/disaggregation/lws-examples/lb.yaml
Normal file
@@ -0,0 +1,55 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: deepseekr10528-lb-main
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: deepseekr10528-lb
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
nodeSelector:
|
||||
bo: "yes"
|
||||
tolerations:
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
containers:
|
||||
- name: sgl-minilb
|
||||
image: lmsysorg/sglang:latest
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
- sglang.srt.disaggregation.mini_lb
|
||||
- --prefill
|
||||
- http://deepseekr10528-prefill-main:30000
|
||||
- --decode
|
||||
- http://deepseekr10528-decode-main:30000
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --port
|
||||
- "8000"
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-lb-service
|
||||
spec:
|
||||
type: NodePort # NodePort is easy to test, you can also specify `ClusterIP`
|
||||
selector:
|
||||
app: deepseekr10528-lb
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000 # Service Port(In-Cluster)
|
||||
targetPort: 8000 # Exposed Container
|
||||
nodePort: 30800
|
||||
12
docs/references/disaggregation/lws-examples/p-svc.yaml
Normal file
12
docs/references/disaggregation/lws-examples/p-svc.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
306
docs/references/disaggregation/lws-examples/p.yaml
Normal file
306
docs/references/disaggregation/lws-examples/p.yaml
Normal file
@@ -0,0 +1,306 @@
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
# should modify according your rdma env
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --enable-deepep-moe
|
||||
- --deepep-mode
|
||||
- normal
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
# should modify according your rdma env
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "false"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
# should modify according your rdma env
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
# - --deepep-config
|
||||
# - /home/aiges/tuned/tuned_8sms.json
|
||||
# can be tuned using deepep test scripts
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --enable-deepep-moe
|
||||
- --deepep-mode
|
||||
- normal
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
# should modify according your rdma env
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "8"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
|
||||
value: "0"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
protocol: TCP
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
786
docs/references/disaggregation/lws_pd_deploy.md
Normal file
786
docs/references/disaggregation/lws_pd_deploy.md
Normal file
@@ -0,0 +1,786 @@
|
||||
# LWS Based PD Deploy
|
||||
|
||||
## 0. Prerequisites
|
||||
|
||||
1. k8s >=1.26
|
||||
2. lws installed on k8s.
|
||||
|
||||
## 1. Image Preparation
|
||||
|
||||
`lmsysorg/sglang:deepep`
|
||||
|
||||
## 2. Deployment Manifest Files
|
||||
|
||||
***Notice: We will package all deployment files into Helm Chart format in the near future. Interested community members can contact us to contribute***
|
||||
|
||||
### Prefill
|
||||
|
||||
Prefill manifest file [prefill.yaml](lws-examples/p.yaml)
|
||||
|
||||
*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
# should modify according your rdma env
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
# - --init-expert-location
|
||||
# - /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
# - --deepep-config
|
||||
# - /home/aiges/tuned/tuned_8sms.json
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --enable-deepep-moe
|
||||
- --deepep-mode
|
||||
- normal
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
# - name: NVSHMEM_HCA_PE_MAPPING
|
||||
# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
# - name: NVSHMEM_HCA_LIST
|
||||
# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "false"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
#- --init-expert-location
|
||||
#- /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
# - --deepep-config
|
||||
# - /home/aiges/tuned/tuned_8sms.json
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --enable-deepep-moe
|
||||
- --deepep-mode
|
||||
- normal
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: SGLANG_HACK_DEEPEP_NUM_SMS
|
||||
value: "8"
|
||||
- name: SGLANG_HACK_DEEPEP_NEW_MODE
|
||||
value: "0"
|
||||
# - name: NVSHMEM_HCA_PE_MAPPING
|
||||
# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
# - name: NVSHMEM_HCA_LIST
|
||||
# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "8"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
|
||||
value: "0"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
protocol: TCP
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
|
||||
```
|
||||
|
||||
### Decode
|
||||
|
||||
Decode node deployment manifest file [decode.yaml](lws-examples/d.yaml)
|
||||
|
||||
*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --enable-deepep-moe
|
||||
- --deepep-mode
|
||||
- low_latency
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
#- --enable-two-batch-overlap
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --enable-deepep-moe
|
||||
- --deepep-mode
|
||||
- low_latency
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
# should modify according your rdma env
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: SGLANG_HACK_DEEPEP_NUM_SMS
|
||||
value: "24"
|
||||
- name: SGLANG_HACK_DEEPEP_NEW_MODE
|
||||
value: "0"
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
# modify according to you deployment env
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
networkConfig:
|
||||
subdomainPolicy: Shared
|
||||
replicas: 1
|
||||
rolloutStrategy:
|
||||
rollingUpdateConfiguration:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
type: RollingUpdate
|
||||
startupPolicy: LeaderCreated
|
||||
```
|
||||
|
||||
Execute separately:
|
||||
|
||||
```bash
|
||||
kubectl apply -f p.yaml
|
||||
kubectl apply -f d.yaml
|
||||
```
|
||||
|
||||
At this point, we have completed the deployment of the 1P1D SGlang engine part.
|
||||
|
||||
To allow our users to directly experience the model API, we still need a load balancer to handle sequential calls between prefill and decode. Different companies implement LBs differently, and the community will also officially release a new LB component written in Rust in the near future.
|
||||
|
||||
Currently, we use a static K8S service + minilb approach to implement model API calls.
|
||||
|
||||
### Creating Service for Prefill and Decode
|
||||
|
||||
#### Create prefill k8s service
|
||||
[p-svc.yaml](lws-examples/p-svc.yaml)
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
```
|
||||
Execute `kubectl apply -f p-svc.yaml`
|
||||
|
||||
#### Create decode k8s service
|
||||
[d-svc.yaml](lws-examples/d-svc.yaml)
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
```
|
||||
Execute `kubectl apply -f d-svc.yaml`
|
||||
|
||||
#### Deploy minilb and lb service
|
||||
[lb.yaml](lws-examples/lb.yaml)
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: deepseekr10528-lb-main
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: deepseekr10528-lb
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
containers:
|
||||
- name: sgl-minilb
|
||||
image: lmsysorg/sglang:deepep
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
- sglang.srt.disaggregation.mini_lb
|
||||
- --prefill
|
||||
- http://deepseekr10528-prefill-main:30000
|
||||
- --decode
|
||||
- http://deepseekr10528-decode-main:30000
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --port
|
||||
- "8000"
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-lb-service
|
||||
spec:
|
||||
type: NodePort
|
||||
selector:
|
||||
app: deepseekr10528-lb
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000 # Service Port(In-Cluster)
|
||||
targetPort: 8000 # Exposed Container
|
||||
nodePort: 30800
|
||||
```
|
||||
Execute `kubectl apply -f lb.yaml`
|
||||
|
||||
After waiting for all model deployments to succeed, you will get the following output:
|
||||
|
||||
```bash
|
||||
[root@ecs-001]# kubectl get po
|
||||
deepseekr10528-decode-main-0 1/1 Running 0 74m
|
||||
deepseekr10528-decode-main-0-1 1/1 Running 0 74m
|
||||
deepseekr10528-lb-main-9c5dbfc57-6lcbd 1/1 Running 0 22m
|
||||
deepseekr10528-prefill-main-0 1/1 Running 0 74m
|
||||
deepseekr10528-prefill-main-0-1 1/1 Running 0 74m
|
||||
[root@ecs-cbm-x1-pd-cpu-001 main_doc]# kubectl get svc |grep dee
|
||||
deepseekr10528-decode-main ClusterIP None <none> <none> 97m
|
||||
deepseekr10528-lb-service NodePort 172.16.242.169 <none> 8000:30800/TCP 22m
|
||||
deepseekr10528-prefill-main ClusterIP None <none> <none> 97m
|
||||
```
|
||||
|
||||
At this point, select a nodePort:30800 to access:
|
||||
|
||||
```bash
|
||||
[root@ecs-001]# curl -X POST "http://{nodePort}:30800/v1/chat/completions" \
|
||||
> -H "Content-Type: application/json" \
|
||||
> -H "Authorization: Bearer None" \
|
||||
> -d '{
|
||||
> "rid":"ccccdd",
|
||||
> "model": "r1",
|
||||
> "messages": [
|
||||
> {"role": "system", "content": "0: You are a helpful AI assistant"},
|
||||
> {"role": "user", "content": "你是谁?."}
|
||||
> ],
|
||||
> "max_tokens":221
|
||||
> }'
|
||||
{"id":"ccccdd","object":"chat.completion","created":1750252498,"model":"qwen2","choices":[{"index":0,"message":{"role":"assistant","content":"<think>\n嗯,用户问了一个很基础的自我介绍问题"你是谁?"。这可能是第一次互动时的常规开场白,也可能是想确认我的身份和功能范围。\n\n用户没有提供任何背景信息,语气简洁中性。这种场景下新用户的可能性较高,需要给出清晰友好的自我介绍,同时突出实用价值来降低陌生感。\n\n考虑到中文用户,应该用简体中文回复。重点要说明三点:身份归属(深度求索)、功能定位(AI助手)、服务范围(学习/工作/生活)。结尾用开放性问题引导对话很关键——既能了解需求,又能避免让用户面对空白输入框时不知所措。\n\n用波浪线结尾可以软化语气,那个笑脸表情😊刚好能中和AI的机械感。不过要控制表情符号数量,避免显得轻浮。\n</think>\n你好呀!我是你的AI助手,由深度求索公司(DeepSeek)开发的语言模型,名字叫 **DeepSeek-R1**。你可以把我当成一个知识丰富、随叫随到的小帮手~😊\n\n我的任务就是陪你聊天、解答问题、","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"length","matched_stop":null}],"usage":{"prompt_tokens":14,"total_tokens":235,"completion_tokens":221,"prompt_tokens_details":null}}
|
||||
|
||||
```
|
||||
## FAQ
|
||||
|
||||
1. The current deployment startup parameters may not be fully compatible with all RDMA scenarios. Different RDMA NCCL-related environment configurations may be needed in different network environments.
|
||||
|
||||
2. Some preset, optimized configurations for EPLB are not used here. You can adjust them according to [6017](https://github.com/sgl-project/sglang/issues/6017) as needed.
|
||||
Reference in New Issue
Block a user