[Doc] Modify DeepSeek-R1/V3.1 documentation (#5426)
### What this PR does / why we need it? Modifies the DeepSeek-R1/V3.1 documentation, mainly updating the MTP size (the number of speculative tokens) and some other configs. Signed-off-by: GDzhu01 <809721801@qq.com>
This commit is contained in:
@@ -88,12 +88,14 @@ local_ip="xxxx"
|
||||
|
||||
# AIV
|
||||
export HCCL_OP_EXPANSION_MODE="AIV"
|
||||
|
||||
export HCCL_IF_IP=$local_ip
|
||||
export GLOO_SOCKET_IFNAME=$nic_name
|
||||
export TP_SOCKET_IFNAME=$nic_name
|
||||
export HCCL_SOCKET_IFNAME=$nic_name
|
||||
export VLLM_ASCEND_ENABLE_MLAPO=1
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export VLLM_USE_MODELSCOPE=True
|
||||
|
||||
vllm serve vllm-ascend/DeepSeek-R1-W8A8 \
|
||||
@@ -143,6 +145,9 @@ export HCCL_IF_IP=$local_ip
|
||||
export GLOO_SOCKET_IFNAME=$nic_name
|
||||
export TP_SOCKET_IFNAME=$nic_name
|
||||
export HCCL_SOCKET_IFNAME=$nic_name
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=1
|
||||
export HCCL_BUFFSIZE=200
|
||||
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export VLLM_ASCEND_ENABLE_MLAPO=1
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
@@ -167,8 +172,8 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \
|
||||
--max-model-len 16384 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.94 \
|
||||
--speculative-config '{"num_speculative_tokens":1,"method":"mtp"}' \
|
||||
--gpu-memory-utilization 0.92 \
|
||||
--speculative-config '{"num_speculative_tokens":3,"method":"mtp"}' \
|
||||
--compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||
```
|
||||
|
||||
@@ -187,6 +192,9 @@ export HCCL_IF_IP=$local_ip
|
||||
export GLOO_SOCKET_IFNAME=$nic_name
|
||||
export TP_SOCKET_IFNAME=$nic_name
|
||||
export HCCL_SOCKET_IFNAME=$nic_name
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=1
|
||||
export HCCL_BUFFSIZE=200
|
||||
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export VLLM_ASCEND_ENABLE_MLAPO=1
|
||||
export VLLM_ASCEND_BALANCE_SCHEDULING=1
|
||||
@@ -213,8 +221,8 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \
|
||||
--max-model-len 16384 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.94 \
|
||||
--speculative-config '{"num_speculative_tokens":1,"method":"mtp"}' \
|
||||
--gpu-memory-utilization 0.92 \
|
||||
--speculative-config '{"num_speculative_tokens":3,"method":"mtp"}' \
|
||||
--compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||
```
|
||||
|
||||
|
||||
@@ -166,8 +166,7 @@ export GLOO_SOCKET_IFNAME=$nic_name
|
||||
export TP_SOCKET_IFNAME=$nic_name
|
||||
export HCCL_SOCKET_IFNAME=$nic_name
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=10
|
||||
export VLLM_USE_V1=1
|
||||
export OMP_NUM_THREADS=1
|
||||
export HCCL_BUFFSIZE=200
|
||||
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export VLLM_ASCEND_ENABLE_MLAPO=1
|
||||
@@ -220,7 +219,7 @@ export GLOO_SOCKET_IFNAME=$nic_name
|
||||
export TP_SOCKET_IFNAME=$nic_name
|
||||
export HCCL_SOCKET_IFNAME=$nic_name
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=10
|
||||
export OMP_NUM_THREADS=1
|
||||
export HCCL_BUFFSIZE=200
|
||||
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export VLLM_ASCEND_ENABLE_MLAPO=1
|
||||
@@ -249,7 +248,7 @@ vllm serve /weights/DeepSeek-V3.1-w8a8-mtp-QuaRot \
|
||||
--trust-remote-code \
|
||||
--no-enable-prefix-caching \
|
||||
--gpu-memory-utilization 0.92 \
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method": "mtp"}' \
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method": "mtp"}' \
|
||||
--compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user