diff --git a/docs/source/tutorials/DeepSeek-R1.md b/docs/source/tutorials/DeepSeek-R1.md index cda22a70..ecc42c35 100644 --- a/docs/source/tutorials/DeepSeek-R1.md +++ b/docs/source/tutorials/DeepSeek-R1.md @@ -88,12 +88,14 @@ local_ip="xxxx" # AIV export HCCL_OP_EXPANSION_MODE="AIV" + export HCCL_IF_IP=$local_ip export GLOO_SOCKET_IFNAME=$nic_name export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name export VLLM_ASCEND_ENABLE_MLAPO=1 export VLLM_ASCEND_BALANCE_SCHEDULING=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export VLLM_USE_MODELSCOPE=True vllm serve vllm-ascend/DeepSeek-R1-W8A8 \ @@ -143,6 +145,9 @@ export HCCL_IF_IP=$local_ip export GLOO_SOCKET_IFNAME=$nic_name export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=1 +export HCCL_BUFFSIZE=200 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export VLLM_ASCEND_ENABLE_MLAPO=1 export VLLM_ASCEND_BALANCE_SCHEDULING=1 @@ -167,8 +172,8 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \ --max-model-len 16384 \ --max-num-batched-tokens 4096 \ --trust-remote-code \ - --gpu-memory-utilization 0.94 \ - --speculative-config '{"num_speculative_tokens":1,"method":"mtp"}' \ + --gpu-memory-utilization 0.92 \ + --speculative-config '{"num_speculative_tokens":3,"method":"mtp"}' \ --compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}' ``` @@ -187,6 +192,9 @@ export HCCL_IF_IP=$local_ip export GLOO_SOCKET_IFNAME=$nic_name export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=1 +export HCCL_BUFFSIZE=200 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export VLLM_ASCEND_ENABLE_MLAPO=1 export VLLM_ASCEND_BALANCE_SCHEDULING=1 @@ -213,8 +221,8 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \ --max-model-len 16384 \ --max-num-batched-tokens 4096 \ --trust-remote-code \ - --gpu-memory-utilization 0.94 \ - --speculative-config '{"num_speculative_tokens":1,"method":"mtp"}' \ + --gpu-memory-utilization 0.92 \ + --speculative-config '{"num_speculative_tokens":3,"method":"mtp"}' \ --compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}' ``` diff --git a/docs/source/tutorials/DeepSeek-V3.1.md b/docs/source/tutorials/DeepSeek-V3.1.md index a65a0d99..6e1c3490 100644 --- a/docs/source/tutorials/DeepSeek-V3.1.md +++ b/docs/source/tutorials/DeepSeek-V3.1.md @@ -166,8 +166,7 @@ export GLOO_SOCKET_IFNAME=$nic_name export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name export OMP_PROC_BIND=false -export OMP_NUM_THREADS=10 -export VLLM_USE_V1=1 +export OMP_NUM_THREADS=1 export HCCL_BUFFSIZE=200 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export VLLM_ASCEND_ENABLE_MLAPO=1 @@ -220,7 +219,7 @@ export GLOO_SOCKET_IFNAME=$nic_name export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name export OMP_PROC_BIND=false -export OMP_NUM_THREADS=10 +export OMP_NUM_THREADS=1 export HCCL_BUFFSIZE=200 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export VLLM_ASCEND_ENABLE_MLAPO=1 @@ -249,7 +248,7 @@ vllm serve /weights/DeepSeek-V3.1-w8a8-mtp-QuaRot \ --trust-remote-code \ --no-enable-prefix-caching \ --gpu-memory-utilization 0.92 \ ---speculative-config '{"num_speculative_tokens": 1, "method": "mtp"}' \ +--speculative-config '{"num_speculative_tokens": 3, "method": "mtp"}' \ --compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}' ```