diff --git a/docs/source/tutorials/DeepSeek-V3.2.md b/docs/source/tutorials/DeepSeek-V3.2.md
index a20154d2..e375053e 100644
--- a/docs/source/tutorials/DeepSeek-V3.2.md
+++ b/docs/source/tutorials/DeepSeek-V3.2.md
@@ -126,6 +126,264 @@ If you want to deploy multi-node environment, you need to set up environment on
 In this tutorial, we suppose you downloaded the model weight to `/root/.cache/`. Feel free to change it to your own path.
 :::
+### Single-node Deployment
+
+- The quantized model `DeepSeek-V3.2-W8A8` can be deployed on a single Atlas 800 A3 (64G × 16).
+
+Run the following script to start the online inference service:
+
+```shell
+export HCCL_OP_EXPANSION_MODE="AIV"
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=10
+export VLLM_USE_V1=1
+export HCCL_BUFFSIZE=200
+export VLLM_ASCEND_ENABLE_MLAPO=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+
+vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
+--host 0.0.0.0 \
+--port 8000 \
+--data-parallel-size 2 \
+--tensor-parallel-size 8 \
+--quantization ascend \
+--seed 1024 \
+--served-model-name deepseek_v3_2 \
+--enable-expert-parallel \
+--max-num-seqs 16 \
+--max-model-len 8192 \
+--max-num-batched-tokens 4096 \
+--trust-remote-code \
+--no-enable-prefix-caching \
+--gpu-memory-utilization 0.92 \
+--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+--speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
+```
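+
+Once the server is up, you can send a request to verify the deployment. Below is a minimal sketch, assuming the server is reachable at `localhost:8000` and using the served model name `deepseek_v3_2` configured above:
+
+```shell
+# Query vLLM's OpenAI-compatible chat completions endpoint
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "deepseek_v3_2",
+    "messages": [{"role": "user", "content": "Hello!"}],
+    "max_tokens": 64
+  }'
+```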
+
+### Multi-node Deployment
+
+- `DeepSeek-V3.2-W8A8` requires at least 2 nodes: either 2 Atlas 800 A3 (64G × 16) or 2 Atlas 800 A2 (64G × 8).
+
+Run the following scripts on the two nodes respectively.
+
+:::::{tab-set}
+:sync-group: install
+
+::::{tab-item} A3 series
+:sync: A3
+
+**Node0**
+
+```{code-block} bash
+   :substitutions:
+# nic_name is the network interface name corresponding to local_ip on the current node;
+# both can be obtained via ifconfig
+nic_name="xxx"
+local_ip="xxx"
+
+# node0_ip must match the local_ip set on node0 (the master node)
+node0_ip="xxxx"
+
+export HCCL_OP_EXPANSION_MODE="AIV"
+
+export HCCL_IF_IP=$local_ip
+export GLOO_SOCKET_IFNAME=$nic_name
+export TP_SOCKET_IFNAME=$nic_name
+export HCCL_SOCKET_IFNAME=$nic_name
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=10
+export VLLM_USE_V1=1
+export HCCL_BUFFSIZE=200
+export VLLM_ASCEND_ENABLE_MLAPO=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+
+vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
+--host 0.0.0.0 \
+--port 8077 \
+--data-parallel-size 2 \
+--data-parallel-size-local 1 \
+--data-parallel-address $node0_ip \
+--data-parallel-rpc-port 12890 \
+--tensor-parallel-size 16 \
+--quantization ascend \
+--seed 1024 \
+--served-model-name deepseek_v3_2 \
+--enable-expert-parallel \
+--max-num-seqs 16 \
+--max-model-len 8192 \
+--max-num-batched-tokens 4096 \
+--trust-remote-code \
+--no-enable-prefix-caching \
+--gpu-memory-utilization 0.92 \
+--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+--speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
+```
+
+**Node1**
+
+```{code-block} bash
+   :substitutions:
+# nic_name is the network interface name corresponding to local_ip on the current node;
+# both can be obtained via ifconfig
+nic_name="xxx"
+local_ip="xxx"
+
+# node0_ip must match the local_ip set on node0 (the master node)
+node0_ip="xxxx"
+
+export HCCL_OP_EXPANSION_MODE="AIV"
+
+export HCCL_IF_IP=$local_ip
+export GLOO_SOCKET_IFNAME=$nic_name
+export TP_SOCKET_IFNAME=$nic_name
+export HCCL_SOCKET_IFNAME=$nic_name
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=10
+export VLLM_USE_V1=1
+export HCCL_BUFFSIZE=200
+export VLLM_ASCEND_ENABLE_MLAPO=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+
+vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
+--host 0.0.0.0 \
+--port 8077 \
+--headless \
+--data-parallel-size 2 \
+--data-parallel-size-local 1 \
+--data-parallel-start-rank 1 \
+--data-parallel-address $node0_ip \
+--data-parallel-rpc-port 12890 \
+--tensor-parallel-size 16 \
+--quantization ascend \
+--seed 1024 \
+--served-model-name deepseek_v3_2 \
+--enable-expert-parallel \
+--max-num-seqs 16 \
+--max-model-len 8192 \
+--max-num-batched-tokens 4096 \
+--trust-remote-code \
+--no-enable-prefix-caching \
+--gpu-memory-utilization 0.92 \
+--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+--speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
+```
+
+::::
+::::{tab-item} A2 series
+:sync: A2
+
+**Node0**
+
+```{code-block} bash
+   :substitutions:
+# nic_name is the network interface name corresponding to local_ip on the current node;
+# both can be obtained via ifconfig
+nic_name="xxx"
+local_ip="xxx"
+
+# node0_ip must match the local_ip set on node0 (the master node)
+node0_ip="xxxx"
+
+export HCCL_OP_EXPANSION_MODE="AIV"
+
+export HCCL_IF_IP=$local_ip
+export GLOO_SOCKET_IFNAME=$nic_name
+export TP_SOCKET_IFNAME=$nic_name
+export HCCL_SOCKET_IFNAME=$nic_name
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export VLLM_USE_V1=1
+export HCCL_BUFFSIZE=200
+export VLLM_ASCEND_ENABLE_MLAPO=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=0
+export HCCL_CONNECT_TIMEOUT=120
+export HCCL_INTRA_PCIE_ENABLE=1
+export HCCL_INTRA_ROCE_ENABLE=0
+
+vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
+--host 0.0.0.0 \
+--port 8077 \
+--data-parallel-size 2 \
+--data-parallel-size-local 1 \
+--data-parallel-address $node0_ip \
+--data-parallel-rpc-port 13389 \
+--tensor-parallel-size 8 \
+--quantization ascend \
+--seed 1024 \
+--served-model-name deepseek_v3_2 \
+--enable-expert-parallel \
+--max-num-seqs 16 \
+--max-model-len 8192 \
+--max-num-batched-tokens 4096 \
+--trust-remote-code \
+--no-enable-prefix-caching \
+--gpu-memory-utilization 0.92 \
+--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48]}' \
+--speculative-config '{"num_speculative_tokens": 2, "method": "deepseek_mtp"}'
+```
+
+**Node1**
+
+```{code-block} bash
+   :substitutions:
+# nic_name is the network interface name corresponding to local_ip on the current node;
+# both can be obtained via ifconfig
+nic_name="xxx"
+local_ip="xxx"
+
+# node0_ip must match the local_ip set on node0 (the master node)
+node0_ip="xxxx"
+
+export HCCL_OP_EXPANSION_MODE="AIV"
+
+export HCCL_IF_IP=$local_ip
+export GLOO_SOCKET_IFNAME=$nic_name
+export TP_SOCKET_IFNAME=$nic_name
+export HCCL_SOCKET_IFNAME=$nic_name
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export VLLM_USE_V1=1
+export HCCL_BUFFSIZE=200
+export VLLM_ASCEND_ENABLE_MLAPO=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=0
+export HCCL_CONNECT_TIMEOUT=120
+export HCCL_INTRA_PCIE_ENABLE=1
+export HCCL_INTRA_ROCE_ENABLE=0
+
+vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
+--host 0.0.0.0 \
+--port 8077 \
+--headless \
+--data-parallel-size 2 \
+--data-parallel-size-local 1 \
+--data-parallel-start-rank 1 \
+--data-parallel-address $node0_ip \
+--data-parallel-rpc-port 13389 \
+--tensor-parallel-size 8 \
+--quantization ascend \
+--seed 1024 \
+--served-model-name deepseek_v3_2 \
+--enable-expert-parallel \
+--max-num-seqs 16 \
+--max-model-len 8192 \
+--max-num-batched-tokens 4096 \
+--trust-remote-code \
+--no-enable-prefix-caching \
+--gpu-memory-utilization 0.92 \
+--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48]}' \
+--speculative-config '{"num_speculative_tokens": 2, "method": "deepseek_mtp"}'
+```
+
+::::
+:::::
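+
+With both nodes running, the OpenAI-compatible API is exposed only by node0; node1 runs with `--headless` and hosts engine workers without an API server. Below is a minimal sketch for verifying the multi-node deployment, assuming node0 is reachable at `$node0_ip` on port 8077:
+
+```shell
+# Send a completion request to node0, which fronts the data-parallel group
+curl http://${node0_ip}:8077/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "deepseek_v3_2",
+    "prompt": "The future of AI is",
+    "max_tokens": 32
+  }'
+```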
+
 ### Prefill-Decode Disaggregation
 
 We'd like to show the deployment guide of `DeepSeek-V3.2` on multi-node environment with 1P1D for better performance.